{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9184952978056424, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 4.789229106903076, "epoch": 0.003918495297805642, "grad_norm": 17.125, "learning_rate": 2e-06, "loss": 14.3537, "mean_token_accuracy": 0.0, "num_tokens": 9174.0, "step": 5 }, { "entropy": 4.8115012645721436, "epoch": 0.007836990595611285, "grad_norm": 19.625, "learning_rate": 4.5e-06, "loss": 14.2452, "mean_token_accuracy": 0.0, "num_tokens": 17790.0, "step": 10 }, { "entropy": 4.899150800704956, "epoch": 0.011755485893416929, "grad_norm": 24.25, "learning_rate": 7e-06, "loss": 13.9044, "mean_token_accuracy": 0.0, "num_tokens": 25850.0, "step": 15 }, { "entropy": 5.367580604553223, "epoch": 0.01567398119122257, "grad_norm": 32.5, "learning_rate": 9.5e-06, "loss": 13.1444, "mean_token_accuracy": 0.0, "num_tokens": 35194.0, "step": 20 }, { "entropy": 8.583788537979126, "epoch": 0.019592476489028215, "grad_norm": 7.5, "learning_rate": 1.2e-05, "loss": 11.3911, "mean_token_accuracy": 0.00023256096756085753, "num_tokens": 44218.0, "step": 25 }, { "entropy": 10.630141735076904, "epoch": 0.023510971786833857, "grad_norm": 3.234375, "learning_rate": 1.4500000000000002e-05, "loss": 10.7102, "mean_token_accuracy": 0.014597209030762314, "num_tokens": 53397.0, "step": 30 }, { "entropy": 10.695956897735595, "epoch": 0.0274294670846395, "grad_norm": 3.0, "learning_rate": 1.7000000000000003e-05, "loss": 10.4664, "mean_token_accuracy": 0.01781447734683752, "num_tokens": 62749.0, "step": 35 }, { "entropy": 10.673796558380127, "epoch": 0.03134796238244514, "grad_norm": 2.421875, "learning_rate": 1.95e-05, "loss": 10.1632, "mean_token_accuracy": 0.0182854525744915, "num_tokens": 71721.0, "step": 40 }, { "entropy": 10.631663513183593, "epoch": 0.03526645768025078, "grad_norm": 2.421875, "learning_rate": 2.2e-05, "loss": 9.8792, "mean_token_accuracy": 0.03653257880359888, "num_tokens": 79844.0, "step": 45 }, { "entropy": 10.549837970733643, "epoch": 0.03918495297805643, "grad_norm": 1.953125, "learning_rate": 2.4500000000000003e-05, "loss": 9.7665, "mean_token_accuracy": 0.04605128690600395, "num_tokens": 88866.0, "step": 50 }, { "entropy": 10.509830379486084, "epoch": 0.04310344827586207, "grad_norm": 1.84375, "learning_rate": 2.7e-05, "loss": 9.6605, "mean_token_accuracy": 0.044031094387173654, "num_tokens": 97918.0, "step": 55 }, { "entropy": 10.524475193023681, "epoch": 0.047021943573667714, "grad_norm": 1.875, "learning_rate": 2.95e-05, "loss": 9.5619, "mean_token_accuracy": 0.04601282589137554, "num_tokens": 107043.0, "step": 60 }, { "entropy": 10.512676334381103, "epoch": 0.050940438871473356, "grad_norm": 2.03125, "learning_rate": 3.2e-05, "loss": 9.4987, "mean_token_accuracy": 0.04643600396811962, "num_tokens": 116000.0, "step": 65 }, { "entropy": 10.477371215820312, "epoch": 0.054858934169279, "grad_norm": 1.8984375, "learning_rate": 3.4500000000000005e-05, "loss": 9.4179, "mean_token_accuracy": 0.04148977212607861, "num_tokens": 124559.0, "step": 70 }, { "entropy": 10.470399188995362, "epoch": 0.05877742946708464, "grad_norm": 1.828125, "learning_rate": 3.7e-05, "loss": 9.2945, "mean_token_accuracy": 0.04952896051108837, "num_tokens": 132868.0, "step": 75 }, { "entropy": 10.453096103668212, "epoch": 0.06269592476489028, "grad_norm": 1.796875, "learning_rate": 3.95e-05, "loss": 9.2848, "mean_token_accuracy": 0.05274602882564068, "num_tokens": 141286.0, "step": 80 }, { "entropy": 10.431593227386475, "epoch": 0.06661442006269593, "grad_norm": 1.71875, "learning_rate": 4.2000000000000004e-05, "loss": 9.1405, "mean_token_accuracy": 0.05872356928884983, "num_tokens": 150406.0, "step": 85 }, { "entropy": 10.36865291595459, "epoch": 0.07053291536050156, "grad_norm": 1.84375, "learning_rate": 4.45e-05, "loss": 9.0678, "mean_token_accuracy": 0.059385529905557635, "num_tokens": 158770.0, "step": 90 }, { "entropy": 10.264866065979003, "epoch": 0.07445141065830721, "grad_norm": 2.046875, "learning_rate": 4.7000000000000004e-05, "loss": 8.9633, "mean_token_accuracy": 0.06288341507315635, "num_tokens": 167763.0, "step": 95 }, { "entropy": 10.183263969421386, "epoch": 0.07836990595611286, "grad_norm": 1.6796875, "learning_rate": 4.9500000000000004e-05, "loss": 8.819, "mean_token_accuracy": 0.0607046652585268, "num_tokens": 177306.0, "step": 100 }, { "entropy": 10.144334697723389, "epoch": 0.0822884012539185, "grad_norm": 1.5078125, "learning_rate": 5.2e-05, "loss": 8.7349, "mean_token_accuracy": 0.06028640605509281, "num_tokens": 186014.0, "step": 105 }, { "entropy": 10.06214361190796, "epoch": 0.08620689655172414, "grad_norm": 1.390625, "learning_rate": 5.45e-05, "loss": 8.5758, "mean_token_accuracy": 0.06410923898220063, "num_tokens": 194122.0, "step": 110 }, { "entropy": 9.952830028533935, "epoch": 0.09012539184952978, "grad_norm": 1.4765625, "learning_rate": 5.7e-05, "loss": 8.4698, "mean_token_accuracy": 0.05936008468270302, "num_tokens": 203097.0, "step": 115 }, { "entropy": 9.820581531524658, "epoch": 0.09404388714733543, "grad_norm": 1.4375, "learning_rate": 5.9499999999999996e-05, "loss": 8.3405, "mean_token_accuracy": 0.06221077479422092, "num_tokens": 211413.0, "step": 120 }, { "entropy": 9.732498264312744, "epoch": 0.09796238244514106, "grad_norm": 1.1484375, "learning_rate": 6.2e-05, "loss": 8.2718, "mean_token_accuracy": 0.061625415459275246, "num_tokens": 220550.0, "step": 125 }, { "entropy": 9.504752349853515, "epoch": 0.10188087774294671, "grad_norm": 1.15625, "learning_rate": 6.450000000000001e-05, "loss": 8.0995, "mean_token_accuracy": 0.0649514563381672, "num_tokens": 229197.0, "step": 130 }, { "entropy": 9.307702922821045, "epoch": 0.10579937304075235, "grad_norm": 1.09375, "learning_rate": 6.7e-05, "loss": 8.0979, "mean_token_accuracy": 0.05685936994850636, "num_tokens": 238479.0, "step": 135 }, { "entropy": 9.162922954559326, "epoch": 0.109717868338558, "grad_norm": 1.375, "learning_rate": 6.950000000000001e-05, "loss": 7.9442, "mean_token_accuracy": 0.059861503541469574, "num_tokens": 246318.0, "step": 140 }, { "entropy": 8.96123743057251, "epoch": 0.11363636363636363, "grad_norm": 0.98828125, "learning_rate": 7.2e-05, "loss": 7.9513, "mean_token_accuracy": 0.05959276556968689, "num_tokens": 254783.0, "step": 145 }, { "entropy": 8.760778617858886, "epoch": 0.11755485893416928, "grad_norm": 1.1171875, "learning_rate": 7.45e-05, "loss": 7.7945, "mean_token_accuracy": 0.06369670145213605, "num_tokens": 263416.0, "step": 150 }, { "entropy": 8.68117027282715, "epoch": 0.12147335423197492, "grad_norm": 1.0703125, "learning_rate": 7.7e-05, "loss": 7.8147, "mean_token_accuracy": 0.0631796333938837, "num_tokens": 271930.0, "step": 155 }, { "entropy": 8.476777839660645, "epoch": 0.12539184952978055, "grad_norm": 0.78515625, "learning_rate": 7.950000000000001e-05, "loss": 7.7159, "mean_token_accuracy": 0.06549291461706161, "num_tokens": 280546.0, "step": 160 }, { "entropy": 8.308262157440186, "epoch": 0.12931034482758622, "grad_norm": 0.9296875, "learning_rate": 8.2e-05, "loss": 7.7078, "mean_token_accuracy": 0.06602046675980092, "num_tokens": 288813.0, "step": 165 }, { "entropy": 8.279962158203125, "epoch": 0.13322884012539185, "grad_norm": 0.97265625, "learning_rate": 8.450000000000001e-05, "loss": 7.6847, "mean_token_accuracy": 0.06443305909633637, "num_tokens": 297966.0, "step": 170 }, { "entropy": 8.152728843688966, "epoch": 0.1371473354231975, "grad_norm": 0.8671875, "learning_rate": 8.7e-05, "loss": 7.7467, "mean_token_accuracy": 0.06189337000250816, "num_tokens": 307135.0, "step": 175 }, { "entropy": 8.145699501037598, "epoch": 0.14106583072100312, "grad_norm": 0.890625, "learning_rate": 8.95e-05, "loss": 7.6581, "mean_token_accuracy": 0.06488074697554111, "num_tokens": 315546.0, "step": 180 }, { "entropy": 8.149786376953125, "epoch": 0.14498432601880878, "grad_norm": 0.89453125, "learning_rate": 9.2e-05, "loss": 7.6538, "mean_token_accuracy": 0.06405953019857406, "num_tokens": 323930.0, "step": 185 }, { "entropy": 7.983444690704346, "epoch": 0.14890282131661442, "grad_norm": 1.03125, "learning_rate": 9.45e-05, "loss": 7.5166, "mean_token_accuracy": 0.07129846066236496, "num_tokens": 332419.0, "step": 190 }, { "entropy": 7.974339866638184, "epoch": 0.15282131661442006, "grad_norm": 1.0234375, "learning_rate": 9.7e-05, "loss": 7.6157, "mean_token_accuracy": 0.06940566822886467, "num_tokens": 341362.0, "step": 195 }, { "entropy": 7.973450374603272, "epoch": 0.15673981191222572, "grad_norm": 1.625, "learning_rate": 9.95e-05, "loss": 7.5198, "mean_token_accuracy": 0.07214542552828788, "num_tokens": 349395.0, "step": 200 }, { "entropy": 7.946638202667236, "epoch": 0.16065830721003135, "grad_norm": 1.2890625, "learning_rate": 0.000102, "loss": 7.545, "mean_token_accuracy": 0.06696730926632881, "num_tokens": 358413.0, "step": 205 }, { "entropy": 7.863241577148438, "epoch": 0.164576802507837, "grad_norm": 1.2578125, "learning_rate": 0.00010449999999999999, "loss": 7.6316, "mean_token_accuracy": 0.06982938721776008, "num_tokens": 366489.0, "step": 210 }, { "entropy": 7.940403842926026, "epoch": 0.16849529780564262, "grad_norm": 1.1015625, "learning_rate": 0.000107, "loss": 7.5023, "mean_token_accuracy": 0.06740010716021061, "num_tokens": 375335.0, "step": 215 }, { "entropy": 7.874079847335816, "epoch": 0.1724137931034483, "grad_norm": 1.1015625, "learning_rate": 0.0001095, "loss": 7.5555, "mean_token_accuracy": 0.07188675999641418, "num_tokens": 384276.0, "step": 220 }, { "entropy": 7.911562585830689, "epoch": 0.17633228840125392, "grad_norm": 0.99609375, "learning_rate": 0.000112, "loss": 7.5929, "mean_token_accuracy": 0.06714313849806786, "num_tokens": 393571.0, "step": 225 }, { "entropy": 7.920306205749512, "epoch": 0.18025078369905956, "grad_norm": 1.2734375, "learning_rate": 0.0001145, "loss": 7.5193, "mean_token_accuracy": 0.07089398205280303, "num_tokens": 401865.0, "step": 230 }, { "entropy": 7.848536252975464, "epoch": 0.1841692789968652, "grad_norm": 0.91015625, "learning_rate": 0.00011700000000000001, "loss": 7.4905, "mean_token_accuracy": 0.07226377129554748, "num_tokens": 410518.0, "step": 235 }, { "entropy": 7.869985485076905, "epoch": 0.18808777429467086, "grad_norm": 0.97265625, "learning_rate": 0.00011949999999999999, "loss": 7.4997, "mean_token_accuracy": 0.07303371652960777, "num_tokens": 419769.0, "step": 240 }, { "entropy": 7.837644481658936, "epoch": 0.1920062695924765, "grad_norm": 0.95703125, "learning_rate": 0.000122, "loss": 7.437, "mean_token_accuracy": 0.0742616519331932, "num_tokens": 428204.0, "step": 245 }, { "entropy": 7.897941255569458, "epoch": 0.19592476489028213, "grad_norm": 1.1171875, "learning_rate": 0.0001245, "loss": 7.5267, "mean_token_accuracy": 0.06978406608104706, "num_tokens": 436594.0, "step": 250 }, { "entropy": 7.80897855758667, "epoch": 0.19984326018808776, "grad_norm": 0.96875, "learning_rate": 0.000127, "loss": 7.3256, "mean_token_accuracy": 0.07486266531050205, "num_tokens": 444645.0, "step": 255 }, { "entropy": 7.8133704662323, "epoch": 0.20376175548589343, "grad_norm": 1.234375, "learning_rate": 0.0001295, "loss": 7.3726, "mean_token_accuracy": 0.07738698273897171, "num_tokens": 453016.0, "step": 260 }, { "entropy": 7.736161422729492, "epoch": 0.20768025078369906, "grad_norm": 1.0546875, "learning_rate": 0.000132, "loss": 7.4507, "mean_token_accuracy": 0.06978621035814285, "num_tokens": 462116.0, "step": 265 }, { "entropy": 7.664476203918457, "epoch": 0.2115987460815047, "grad_norm": 1.1875, "learning_rate": 0.00013450000000000002, "loss": 7.3961, "mean_token_accuracy": 0.07115238644182682, "num_tokens": 470807.0, "step": 270 }, { "entropy": 7.7677568912506105, "epoch": 0.21551724137931033, "grad_norm": 1.0703125, "learning_rate": 0.00013700000000000002, "loss": 7.4818, "mean_token_accuracy": 0.07023664973676205, "num_tokens": 479592.0, "step": 275 }, { "entropy": 7.912611389160157, "epoch": 0.219435736677116, "grad_norm": 1.1484375, "learning_rate": 0.0001395, "loss": 7.4068, "mean_token_accuracy": 0.07160313390195369, "num_tokens": 488107.0, "step": 280 }, { "entropy": 7.755217599868774, "epoch": 0.22335423197492163, "grad_norm": 1.1796875, "learning_rate": 0.00014199999999999998, "loss": 7.4212, "mean_token_accuracy": 0.07538670524954796, "num_tokens": 496775.0, "step": 285 }, { "entropy": 7.762103033065796, "epoch": 0.22727272727272727, "grad_norm": 1.2421875, "learning_rate": 0.0001445, "loss": 7.4447, "mean_token_accuracy": 0.07036296911537647, "num_tokens": 505415.0, "step": 290 }, { "entropy": 7.757038116455078, "epoch": 0.23119122257053293, "grad_norm": 1.015625, "learning_rate": 0.000147, "loss": 7.4344, "mean_token_accuracy": 0.074312524497509, "num_tokens": 514447.0, "step": 295 }, { "entropy": 7.7855620861053465, "epoch": 0.23510971786833856, "grad_norm": 1.0234375, "learning_rate": 0.0001495, "loss": 7.4189, "mean_token_accuracy": 0.07684484757483005, "num_tokens": 522998.0, "step": 300 }, { "entropy": 7.774819326400757, "epoch": 0.2390282131661442, "grad_norm": 1.1015625, "learning_rate": 0.000152, "loss": 7.3794, "mean_token_accuracy": 0.07512850686907768, "num_tokens": 531542.0, "step": 305 }, { "entropy": 7.7300177097320555, "epoch": 0.24294670846394983, "grad_norm": 0.9921875, "learning_rate": 0.00015450000000000001, "loss": 7.3247, "mean_token_accuracy": 0.07261879369616508, "num_tokens": 540143.0, "step": 310 }, { "entropy": 7.773348236083985, "epoch": 0.2468652037617555, "grad_norm": 1.0390625, "learning_rate": 0.000157, "loss": 7.4017, "mean_token_accuracy": 0.07950926274061203, "num_tokens": 549156.0, "step": 315 }, { "entropy": 7.7285737037658695, "epoch": 0.2507836990595611, "grad_norm": 1.03125, "learning_rate": 0.0001595, "loss": 7.2557, "mean_token_accuracy": 0.07481630519032478, "num_tokens": 557522.0, "step": 320 }, { "entropy": 7.654256391525268, "epoch": 0.2547021943573668, "grad_norm": 1.109375, "learning_rate": 0.000162, "loss": 7.331, "mean_token_accuracy": 0.07940150126814842, "num_tokens": 566650.0, "step": 325 }, { "entropy": 7.672131299972534, "epoch": 0.25862068965517243, "grad_norm": 1.1328125, "learning_rate": 0.00016450000000000001, "loss": 7.2864, "mean_token_accuracy": 0.07869702018797398, "num_tokens": 576219.0, "step": 330 }, { "entropy": 7.713160848617553, "epoch": 0.26253918495297807, "grad_norm": 1.3984375, "learning_rate": 0.00016700000000000002, "loss": 7.374, "mean_token_accuracy": 0.07567794360220433, "num_tokens": 584304.0, "step": 335 }, { "entropy": 7.634349060058594, "epoch": 0.2664576802507837, "grad_norm": 1.2109375, "learning_rate": 0.00016950000000000003, "loss": 7.3003, "mean_token_accuracy": 0.07832697704434395, "num_tokens": 593163.0, "step": 340 }, { "entropy": 7.6303457736969, "epoch": 0.27037617554858934, "grad_norm": 1.21875, "learning_rate": 0.00017199999999999998, "loss": 7.2164, "mean_token_accuracy": 0.07754571028053761, "num_tokens": 602077.0, "step": 345 }, { "entropy": 7.6355628490448, "epoch": 0.274294670846395, "grad_norm": 1.59375, "learning_rate": 0.00017449999999999999, "loss": 7.3284, "mean_token_accuracy": 0.08122679404914379, "num_tokens": 610009.0, "step": 350 }, { "entropy": 7.685596513748169, "epoch": 0.2782131661442006, "grad_norm": 1.1015625, "learning_rate": 0.000177, "loss": 7.362, "mean_token_accuracy": 0.07597106769680977, "num_tokens": 619282.0, "step": 355 }, { "entropy": 7.619720935821533, "epoch": 0.28213166144200624, "grad_norm": 1.1328125, "learning_rate": 0.0001795, "loss": 7.2893, "mean_token_accuracy": 0.08018167689442635, "num_tokens": 628138.0, "step": 360 }, { "entropy": 7.6796112060546875, "epoch": 0.28605015673981193, "grad_norm": 1.1875, "learning_rate": 0.000182, "loss": 7.1745, "mean_token_accuracy": 0.08669476807117463, "num_tokens": 637021.0, "step": 365 }, { "entropy": 7.619413709640503, "epoch": 0.28996865203761757, "grad_norm": 1.453125, "learning_rate": 0.0001845, "loss": 7.3161, "mean_token_accuracy": 0.07477690353989601, "num_tokens": 646703.0, "step": 370 }, { "entropy": 7.623689222335815, "epoch": 0.2938871473354232, "grad_norm": 1.1796875, "learning_rate": 0.000187, "loss": 7.1981, "mean_token_accuracy": 0.08060777708888053, "num_tokens": 655616.0, "step": 375 }, { "entropy": 7.556978368759156, "epoch": 0.29780564263322884, "grad_norm": 1.3515625, "learning_rate": 0.0001895, "loss": 7.1994, "mean_token_accuracy": 0.08719867020845413, "num_tokens": 663783.0, "step": 380 }, { "entropy": 7.5845866203308105, "epoch": 0.3017241379310345, "grad_norm": 1.2734375, "learning_rate": 0.000192, "loss": 7.2094, "mean_token_accuracy": 0.08289245739579201, "num_tokens": 671855.0, "step": 385 }, { "entropy": 7.527840566635132, "epoch": 0.3056426332288401, "grad_norm": 1.34375, "learning_rate": 0.0001945, "loss": 7.2219, "mean_token_accuracy": 0.07747755497694016, "num_tokens": 680981.0, "step": 390 }, { "entropy": 7.6136561870574955, "epoch": 0.30956112852664575, "grad_norm": 1.328125, "learning_rate": 0.00019700000000000002, "loss": 7.1491, "mean_token_accuracy": 0.08336339518427849, "num_tokens": 689294.0, "step": 395 }, { "entropy": 7.469813442230224, "epoch": 0.31347962382445144, "grad_norm": 1.0625, "learning_rate": 0.00019950000000000002, "loss": 7.1035, "mean_token_accuracy": 0.08146922513842583, "num_tokens": 697703.0, "step": 400 }, { "entropy": 7.550826740264893, "epoch": 0.31739811912225707, "grad_norm": 0.98046875, "learning_rate": 0.000202, "loss": 7.2372, "mean_token_accuracy": 0.08058681413531303, "num_tokens": 706792.0, "step": 405 }, { "entropy": 7.606830406188965, "epoch": 0.3213166144200627, "grad_norm": 1.0546875, "learning_rate": 0.00020449999999999998, "loss": 7.1473, "mean_token_accuracy": 0.08346155509352685, "num_tokens": 715864.0, "step": 410 }, { "entropy": 7.3859583854675295, "epoch": 0.32523510971786834, "grad_norm": 1.5078125, "learning_rate": 0.000207, "loss": 7.1975, "mean_token_accuracy": 0.0853593334555626, "num_tokens": 723921.0, "step": 415 }, { "entropy": 7.5406107902526855, "epoch": 0.329153605015674, "grad_norm": 1.2578125, "learning_rate": 0.0002095, "loss": 7.2071, "mean_token_accuracy": 0.08046000376343727, "num_tokens": 732797.0, "step": 420 }, { "entropy": 7.510403490066528, "epoch": 0.3330721003134796, "grad_norm": 1.3515625, "learning_rate": 0.000212, "loss": 7.0654, "mean_token_accuracy": 0.0873202033340931, "num_tokens": 741248.0, "step": 425 }, { "entropy": 7.501159954071045, "epoch": 0.33699059561128525, "grad_norm": 1.1875, "learning_rate": 0.0002145, "loss": 7.1615, "mean_token_accuracy": 0.08190247714519501, "num_tokens": 749766.0, "step": 430 }, { "entropy": 7.408373832702637, "epoch": 0.3409090909090909, "grad_norm": 1.34375, "learning_rate": 0.00021700000000000002, "loss": 7.1268, "mean_token_accuracy": 0.08113668784499169, "num_tokens": 758695.0, "step": 435 }, { "entropy": 7.44956521987915, "epoch": 0.3448275862068966, "grad_norm": 1.234375, "learning_rate": 0.0002195, "loss": 7.1248, "mean_token_accuracy": 0.08192591443657875, "num_tokens": 767624.0, "step": 440 }, { "entropy": 7.422909212112427, "epoch": 0.3487460815047022, "grad_norm": 1.140625, "learning_rate": 0.000222, "loss": 7.117, "mean_token_accuracy": 0.0853099413216114, "num_tokens": 776616.0, "step": 445 }, { "entropy": 7.365292644500732, "epoch": 0.35266457680250785, "grad_norm": 1.203125, "learning_rate": 0.0002245, "loss": 7.1317, "mean_token_accuracy": 0.08413158729672432, "num_tokens": 786147.0, "step": 450 }, { "entropy": 7.536469745635986, "epoch": 0.3565830721003135, "grad_norm": 1.0078125, "learning_rate": 0.00022700000000000002, "loss": 7.2317, "mean_token_accuracy": 0.08228531405329705, "num_tokens": 795213.0, "step": 455 }, { "entropy": 7.461417722702026, "epoch": 0.3605015673981191, "grad_norm": 1.28125, "learning_rate": 0.00022950000000000002, "loss": 7.0349, "mean_token_accuracy": 0.09094136133790016, "num_tokens": 803118.0, "step": 460 }, { "entropy": 7.444038438796997, "epoch": 0.36442006269592475, "grad_norm": 1.1328125, "learning_rate": 0.00023200000000000003, "loss": 7.0219, "mean_token_accuracy": 0.09442275986075402, "num_tokens": 811358.0, "step": 465 }, { "entropy": 7.324700260162354, "epoch": 0.3683385579937304, "grad_norm": 1.515625, "learning_rate": 0.00023449999999999998, "loss": 7.0256, "mean_token_accuracy": 0.08778790757060051, "num_tokens": 819653.0, "step": 470 }, { "entropy": 7.2960240840911865, "epoch": 0.3722570532915361, "grad_norm": 1.34375, "learning_rate": 0.000237, "loss": 7.0511, "mean_token_accuracy": 0.08624262139201164, "num_tokens": 828462.0, "step": 475 }, { "entropy": 7.437795686721802, "epoch": 0.3761755485893417, "grad_norm": 1.09375, "learning_rate": 0.0002395, "loss": 7.1429, "mean_token_accuracy": 0.0912679947912693, "num_tokens": 836204.0, "step": 480 }, { "entropy": 7.2959794998168945, "epoch": 0.38009404388714735, "grad_norm": 1.3359375, "learning_rate": 0.000242, "loss": 7.0169, "mean_token_accuracy": 0.09246607050299645, "num_tokens": 845032.0, "step": 485 }, { "entropy": 7.4119359970092775, "epoch": 0.384012539184953, "grad_norm": 1.515625, "learning_rate": 0.0002445, "loss": 7.0308, "mean_token_accuracy": 0.08805579245090485, "num_tokens": 853324.0, "step": 490 }, { "entropy": 7.404975366592407, "epoch": 0.3879310344827586, "grad_norm": 1.3203125, "learning_rate": 0.000247, "loss": 6.992, "mean_token_accuracy": 0.1035026639699936, "num_tokens": 861640.0, "step": 495 }, { "entropy": 7.385119247436523, "epoch": 0.39184952978056425, "grad_norm": 1.5234375, "learning_rate": 0.0002495, "loss": 7.1744, "mean_token_accuracy": 0.082430200278759, "num_tokens": 870758.0, "step": 500 }, { "epoch": 0.39184952978056425, "eval_entropy": 7.167502074278603, "eval_loss": 7.156619548797607, "eval_mean_token_accuracy": 0.08891707594152684, "eval_num_tokens": 870758.0, "eval_runtime": 2.8546, "eval_samples_per_second": 1444.004, "eval_steps_per_second": 180.763, "step": 500 }, { "entropy": 7.3540655136108395, "epoch": 0.3957680250783699, "grad_norm": 1.265625, "learning_rate": 0.000252, "loss": 7.0906, "mean_token_accuracy": 0.0840302512049675, "num_tokens": 879968.0, "step": 505 }, { "entropy": 7.318256664276123, "epoch": 0.3996865203761755, "grad_norm": 1.1640625, "learning_rate": 0.0002545, "loss": 7.0471, "mean_token_accuracy": 0.08631112575531005, "num_tokens": 888804.0, "step": 510 }, { "entropy": 7.259663057327271, "epoch": 0.4036050156739812, "grad_norm": 1.765625, "learning_rate": 0.000257, "loss": 7.0624, "mean_token_accuracy": 0.08012468516826629, "num_tokens": 898593.0, "step": 515 }, { "entropy": 7.2833295345306395, "epoch": 0.40752351097178685, "grad_norm": 1.21875, "learning_rate": 0.0002595, "loss": 7.0799, "mean_token_accuracy": 0.08342506065964699, "num_tokens": 906873.0, "step": 520 }, { "entropy": 7.3285657405853275, "epoch": 0.4114420062695925, "grad_norm": 1.0390625, "learning_rate": 0.000262, "loss": 7.0359, "mean_token_accuracy": 0.0934828281402588, "num_tokens": 915080.0, "step": 525 }, { "entropy": 7.422465038299561, "epoch": 0.4153605015673981, "grad_norm": 1.2265625, "learning_rate": 0.00026450000000000003, "loss": 7.0396, "mean_token_accuracy": 0.08701496720314025, "num_tokens": 923741.0, "step": 530 }, { "entropy": 7.216967153549194, "epoch": 0.41927899686520376, "grad_norm": 1.09375, "learning_rate": 0.00026700000000000004, "loss": 6.9459, "mean_token_accuracy": 0.0904500350356102, "num_tokens": 932455.0, "step": 535 }, { "entropy": 7.235203742980957, "epoch": 0.4231974921630094, "grad_norm": 1.1796875, "learning_rate": 0.00026950000000000005, "loss": 6.9492, "mean_token_accuracy": 0.09165697544813156, "num_tokens": 941064.0, "step": 540 }, { "entropy": 7.23210301399231, "epoch": 0.427115987460815, "grad_norm": 1.078125, "learning_rate": 0.00027200000000000005, "loss": 6.9795, "mean_token_accuracy": 0.0916426420211792, "num_tokens": 950261.0, "step": 545 }, { "entropy": 7.305574369430542, "epoch": 0.43103448275862066, "grad_norm": 1.1796875, "learning_rate": 0.0002745, "loss": 7.0461, "mean_token_accuracy": 0.0905070275068283, "num_tokens": 959151.0, "step": 550 }, { "entropy": 7.299721527099609, "epoch": 0.43495297805642635, "grad_norm": 1.03125, "learning_rate": 0.000277, "loss": 7.0677, "mean_token_accuracy": 0.09062978066504002, "num_tokens": 968441.0, "step": 555 }, { "entropy": 7.134230327606201, "epoch": 0.438871473354232, "grad_norm": 0.96484375, "learning_rate": 0.0002795, "loss": 6.9371, "mean_token_accuracy": 0.09018276557326317, "num_tokens": 977058.0, "step": 560 }, { "entropy": 7.334470558166504, "epoch": 0.4427899686520376, "grad_norm": 1.234375, "learning_rate": 0.00028199999999999997, "loss": 6.9519, "mean_token_accuracy": 0.08950636759400368, "num_tokens": 986531.0, "step": 565 }, { "entropy": 7.123916816711426, "epoch": 0.44670846394984326, "grad_norm": 1.109375, "learning_rate": 0.0002845, "loss": 6.8108, "mean_token_accuracy": 0.09824811816215515, "num_tokens": 995081.0, "step": 570 }, { "entropy": 7.103513240814209, "epoch": 0.4506269592476489, "grad_norm": 1.390625, "learning_rate": 0.000287, "loss": 6.8802, "mean_token_accuracy": 0.09345417022705078, "num_tokens": 1003459.0, "step": 575 }, { "entropy": 7.101778936386109, "epoch": 0.45454545454545453, "grad_norm": 1.2734375, "learning_rate": 0.0002895, "loss": 6.9239, "mean_token_accuracy": 0.09018066227436065, "num_tokens": 1012420.0, "step": 580 }, { "entropy": 7.261321687698365, "epoch": 0.45846394984326017, "grad_norm": 1.5078125, "learning_rate": 0.000292, "loss": 6.9954, "mean_token_accuracy": 0.09160361662507058, "num_tokens": 1021198.0, "step": 585 }, { "entropy": 7.202180290222168, "epoch": 0.46238244514106586, "grad_norm": 1.3515625, "learning_rate": 0.0002945, "loss": 6.9694, "mean_token_accuracy": 0.09484207406640052, "num_tokens": 1030023.0, "step": 590 }, { "entropy": 7.164184045791626, "epoch": 0.4663009404388715, "grad_norm": 1.21875, "learning_rate": 0.000297, "loss": 6.8295, "mean_token_accuracy": 0.0953464850783348, "num_tokens": 1038824.0, "step": 595 }, { "entropy": 7.216883039474487, "epoch": 0.4702194357366771, "grad_norm": 1.109375, "learning_rate": 0.0002995, "loss": 7.0882, "mean_token_accuracy": 0.09343515783548355, "num_tokens": 1048207.0, "step": 600 }, { "entropy": 7.304975414276123, "epoch": 0.47413793103448276, "grad_norm": 1.109375, "learning_rate": 0.000302, "loss": 7.0559, "mean_token_accuracy": 0.08962106555700303, "num_tokens": 1057354.0, "step": 605 }, { "entropy": 7.198044776916504, "epoch": 0.4780564263322884, "grad_norm": 1.1484375, "learning_rate": 0.0003045, "loss": 6.9893, "mean_token_accuracy": 0.09330410435795784, "num_tokens": 1066825.0, "step": 610 }, { "entropy": 7.137618494033814, "epoch": 0.48197492163009403, "grad_norm": 1.2109375, "learning_rate": 0.000307, "loss": 6.9155, "mean_token_accuracy": 0.08996602892875671, "num_tokens": 1075752.0, "step": 615 }, { "entropy": 7.007559776306152, "epoch": 0.48589341692789967, "grad_norm": 1.109375, "learning_rate": 0.0003095, "loss": 6.8323, "mean_token_accuracy": 0.09399376884102821, "num_tokens": 1084826.0, "step": 620 }, { "entropy": 7.223559427261352, "epoch": 0.4898119122257053, "grad_norm": 1.2109375, "learning_rate": 0.000312, "loss": 6.9666, "mean_token_accuracy": 0.09359999522566795, "num_tokens": 1093425.0, "step": 625 }, { "entropy": 7.108446407318115, "epoch": 0.493730407523511, "grad_norm": 1.078125, "learning_rate": 0.0003145, "loss": 6.9828, "mean_token_accuracy": 0.089838757365942, "num_tokens": 1102619.0, "step": 630 }, { "entropy": 7.145089435577392, "epoch": 0.49764890282131663, "grad_norm": 1.109375, "learning_rate": 0.000317, "loss": 6.8601, "mean_token_accuracy": 0.09595804288983345, "num_tokens": 1111337.0, "step": 635 }, { "entropy": 7.005008172988892, "epoch": 0.5015673981191222, "grad_norm": 1.1484375, "learning_rate": 0.0003195, "loss": 6.7035, "mean_token_accuracy": 0.0990886114537716, "num_tokens": 1119381.0, "step": 640 }, { "entropy": 6.96934700012207, "epoch": 0.5054858934169278, "grad_norm": 1.4375, "learning_rate": 0.000322, "loss": 6.7597, "mean_token_accuracy": 0.09665322229266167, "num_tokens": 1127825.0, "step": 645 }, { "entropy": 7.11965799331665, "epoch": 0.5094043887147336, "grad_norm": 1.1875, "learning_rate": 0.00032450000000000003, "loss": 6.8203, "mean_token_accuracy": 0.09057366773486138, "num_tokens": 1136750.0, "step": 650 }, { "entropy": 7.017684459686279, "epoch": 0.5133228840125392, "grad_norm": 1.1875, "learning_rate": 0.00032700000000000003, "loss": 6.8499, "mean_token_accuracy": 0.09456580057740212, "num_tokens": 1145739.0, "step": 655 }, { "entropy": 7.034306764602661, "epoch": 0.5172413793103449, "grad_norm": 1.203125, "learning_rate": 0.00032950000000000004, "loss": 6.8973, "mean_token_accuracy": 0.09757498279213905, "num_tokens": 1154415.0, "step": 660 }, { "entropy": 7.052440977096557, "epoch": 0.5211598746081505, "grad_norm": 1.1640625, "learning_rate": 0.00033200000000000005, "loss": 6.7751, "mean_token_accuracy": 0.09611302688717842, "num_tokens": 1162480.0, "step": 665 }, { "entropy": 6.977913856506348, "epoch": 0.5250783699059561, "grad_norm": 1.15625, "learning_rate": 0.00033450000000000005, "loss": 6.8559, "mean_token_accuracy": 0.09223495721817017, "num_tokens": 1170774.0, "step": 670 }, { "entropy": 7.030881881713867, "epoch": 0.5289968652037618, "grad_norm": 1.046875, "learning_rate": 0.000337, "loss": 6.7924, "mean_token_accuracy": 0.09683787003159523, "num_tokens": 1179629.0, "step": 675 }, { "entropy": 7.053485012054443, "epoch": 0.5329153605015674, "grad_norm": 1.1171875, "learning_rate": 0.0003395, "loss": 6.938, "mean_token_accuracy": 0.09229604452848435, "num_tokens": 1189111.0, "step": 680 }, { "entropy": 7.108199834823608, "epoch": 0.536833855799373, "grad_norm": 1.1171875, "learning_rate": 0.000342, "loss": 6.9495, "mean_token_accuracy": 0.09128881692886352, "num_tokens": 1198827.0, "step": 685 }, { "entropy": 7.092297840118408, "epoch": 0.5407523510971787, "grad_norm": 1.109375, "learning_rate": 0.00034449999999999997, "loss": 6.8482, "mean_token_accuracy": 0.09970205947756768, "num_tokens": 1207089.0, "step": 690 }, { "entropy": 7.057426071166992, "epoch": 0.5446708463949843, "grad_norm": 1.1875, "learning_rate": 0.000347, "loss": 6.8435, "mean_token_accuracy": 0.08852889537811279, "num_tokens": 1216509.0, "step": 695 }, { "entropy": 6.900876426696778, "epoch": 0.54858934169279, "grad_norm": 1.0625, "learning_rate": 0.0003495, "loss": 6.6756, "mean_token_accuracy": 0.10320580378174782, "num_tokens": 1225684.0, "step": 700 }, { "entropy": 7.005167055130005, "epoch": 0.5525078369905956, "grad_norm": 1.2265625, "learning_rate": 0.000352, "loss": 6.7057, "mean_token_accuracy": 0.10480915755033493, "num_tokens": 1233675.0, "step": 705 }, { "entropy": 6.892497873306274, "epoch": 0.5564263322884012, "grad_norm": 1.3046875, "learning_rate": 0.0003545, "loss": 6.7091, "mean_token_accuracy": 0.10067695155739784, "num_tokens": 1242147.0, "step": 710 }, { "entropy": 6.934285736083984, "epoch": 0.5603448275862069, "grad_norm": 1.0234375, "learning_rate": 0.000357, "loss": 6.8249, "mean_token_accuracy": 0.09867035746574401, "num_tokens": 1251127.0, "step": 715 }, { "entropy": 6.978139781951905, "epoch": 0.5642633228840125, "grad_norm": 1.0703125, "learning_rate": 0.0003595, "loss": 6.7775, "mean_token_accuracy": 0.09740801975131035, "num_tokens": 1260589.0, "step": 720 }, { "entropy": 7.05702314376831, "epoch": 0.5681818181818182, "grad_norm": 1.2578125, "learning_rate": 0.000362, "loss": 6.8522, "mean_token_accuracy": 0.09754758477210998, "num_tokens": 1269281.0, "step": 725 }, { "entropy": 6.951777076721191, "epoch": 0.5721003134796239, "grad_norm": 1.0625, "learning_rate": 0.0003645, "loss": 6.7117, "mean_token_accuracy": 0.09818840324878693, "num_tokens": 1278033.0, "step": 730 }, { "entropy": 7.001934242248535, "epoch": 0.5760188087774295, "grad_norm": 1.171875, "learning_rate": 0.000367, "loss": 6.7808, "mean_token_accuracy": 0.10354246944189072, "num_tokens": 1285847.0, "step": 735 }, { "entropy": 6.932463216781616, "epoch": 0.5799373040752351, "grad_norm": 1.1015625, "learning_rate": 0.0003695, "loss": 6.7726, "mean_token_accuracy": 0.09513568431138993, "num_tokens": 1294770.0, "step": 740 }, { "entropy": 7.036710739135742, "epoch": 0.5838557993730408, "grad_norm": 1.234375, "learning_rate": 0.000372, "loss": 6.9512, "mean_token_accuracy": 0.0940048098564148, "num_tokens": 1304282.0, "step": 745 }, { "entropy": 6.997774505615235, "epoch": 0.5877742946708464, "grad_norm": 1.1015625, "learning_rate": 0.0003745, "loss": 6.7923, "mean_token_accuracy": 0.09595935121178627, "num_tokens": 1313218.0, "step": 750 }, { "entropy": 6.966546869277954, "epoch": 0.591692789968652, "grad_norm": 1.0859375, "learning_rate": 0.000377, "loss": 6.754, "mean_token_accuracy": 0.0985157236456871, "num_tokens": 1322093.0, "step": 755 }, { "entropy": 6.83478045463562, "epoch": 0.5956112852664577, "grad_norm": 1.125, "learning_rate": 0.0003795, "loss": 6.7857, "mean_token_accuracy": 0.09971616193652152, "num_tokens": 1330718.0, "step": 760 }, { "entropy": 6.954968690872192, "epoch": 0.5995297805642633, "grad_norm": 1.2265625, "learning_rate": 0.000382, "loss": 6.6457, "mean_token_accuracy": 0.10431547313928605, "num_tokens": 1339672.0, "step": 765 }, { "entropy": 6.861670970916748, "epoch": 0.603448275862069, "grad_norm": 1.1796875, "learning_rate": 0.0003845, "loss": 6.7001, "mean_token_accuracy": 0.10293109342455864, "num_tokens": 1348587.0, "step": 770 }, { "entropy": 6.8885609149932865, "epoch": 0.6073667711598746, "grad_norm": 1.109375, "learning_rate": 0.00038700000000000003, "loss": 6.6826, "mean_token_accuracy": 0.10689334198832512, "num_tokens": 1357597.0, "step": 775 }, { "entropy": 6.883357572555542, "epoch": 0.6112852664576802, "grad_norm": 1.1640625, "learning_rate": 0.00038950000000000003, "loss": 6.7744, "mean_token_accuracy": 0.101459039747715, "num_tokens": 1366246.0, "step": 780 }, { "entropy": 6.89829683303833, "epoch": 0.6152037617554859, "grad_norm": 1.1796875, "learning_rate": 0.00039200000000000004, "loss": 6.6695, "mean_token_accuracy": 0.10532263070344924, "num_tokens": 1374664.0, "step": 785 }, { "entropy": 6.8325498580932615, "epoch": 0.6191222570532915, "grad_norm": 1.3046875, "learning_rate": 0.00039450000000000005, "loss": 6.6889, "mean_token_accuracy": 0.10108358785510063, "num_tokens": 1382765.0, "step": 790 }, { "entropy": 6.7815876483917235, "epoch": 0.6230407523510971, "grad_norm": 1.1484375, "learning_rate": 0.00039700000000000005, "loss": 6.6442, "mean_token_accuracy": 0.10871021300554276, "num_tokens": 1391708.0, "step": 795 }, { "entropy": 6.786308908462525, "epoch": 0.6269592476489029, "grad_norm": 1.171875, "learning_rate": 0.0003995, "loss": 6.814, "mean_token_accuracy": 0.09444232732057571, "num_tokens": 1401373.0, "step": 800 }, { "entropy": 6.926651859283448, "epoch": 0.6308777429467085, "grad_norm": 1.109375, "learning_rate": 0.000402, "loss": 6.6124, "mean_token_accuracy": 0.10876154825091362, "num_tokens": 1409702.0, "step": 805 }, { "entropy": 6.87204418182373, "epoch": 0.6347962382445141, "grad_norm": 1.078125, "learning_rate": 0.0004045, "loss": 6.8084, "mean_token_accuracy": 0.10008606985211373, "num_tokens": 1419596.0, "step": 810 }, { "entropy": 6.807423734664917, "epoch": 0.6387147335423198, "grad_norm": 1.28125, "learning_rate": 0.00040699999999999997, "loss": 6.7461, "mean_token_accuracy": 0.10076582729816437, "num_tokens": 1428437.0, "step": 815 }, { "entropy": 6.9204614639282225, "epoch": 0.6426332288401254, "grad_norm": 1.125, "learning_rate": 0.0004095, "loss": 6.6547, "mean_token_accuracy": 0.10060450211167335, "num_tokens": 1436764.0, "step": 820 }, { "entropy": 6.812792015075684, "epoch": 0.646551724137931, "grad_norm": 1.1640625, "learning_rate": 0.000412, "loss": 6.7007, "mean_token_accuracy": 0.10875345095992088, "num_tokens": 1445755.0, "step": 825 }, { "entropy": 6.902109909057617, "epoch": 0.6504702194357367, "grad_norm": 1.2890625, "learning_rate": 0.0004145, "loss": 6.7627, "mean_token_accuracy": 0.10212339907884598, "num_tokens": 1454550.0, "step": 830 }, { "entropy": 6.757165718078613, "epoch": 0.6543887147335423, "grad_norm": 1.3125, "learning_rate": 0.000417, "loss": 6.6529, "mean_token_accuracy": 0.1076908752322197, "num_tokens": 1463039.0, "step": 835 }, { "entropy": 6.920673799514771, "epoch": 0.658307210031348, "grad_norm": 1.1328125, "learning_rate": 0.0004195, "loss": 6.7125, "mean_token_accuracy": 0.10059169679880142, "num_tokens": 1472029.0, "step": 840 }, { "entropy": 6.742834091186523, "epoch": 0.6622257053291536, "grad_norm": 1.109375, "learning_rate": 0.000422, "loss": 6.723, "mean_token_accuracy": 0.10695556625723839, "num_tokens": 1481588.0, "step": 845 }, { "entropy": 6.7755883693695065, "epoch": 0.6661442006269592, "grad_norm": 1.2265625, "learning_rate": 0.0004245, "loss": 6.6131, "mean_token_accuracy": 0.1035246841609478, "num_tokens": 1489915.0, "step": 850 }, { "entropy": 6.87349271774292, "epoch": 0.6700626959247649, "grad_norm": 1.03125, "learning_rate": 0.000427, "loss": 6.7211, "mean_token_accuracy": 0.10070185288786888, "num_tokens": 1497920.0, "step": 855 }, { "entropy": 6.742019605636597, "epoch": 0.6739811912225705, "grad_norm": 1.1640625, "learning_rate": 0.0004295, "loss": 6.5571, "mean_token_accuracy": 0.10514650270342826, "num_tokens": 1506965.0, "step": 860 }, { "entropy": 6.755401372909546, "epoch": 0.6778996865203761, "grad_norm": 1.078125, "learning_rate": 0.000432, "loss": 6.6626, "mean_token_accuracy": 0.10886923670768738, "num_tokens": 1516028.0, "step": 865 }, { "entropy": 6.824488735198974, "epoch": 0.6818181818181818, "grad_norm": 1.1875, "learning_rate": 0.0004345, "loss": 6.6802, "mean_token_accuracy": 0.10832962691783905, "num_tokens": 1524579.0, "step": 870 }, { "entropy": 6.729337167739868, "epoch": 0.6857366771159875, "grad_norm": 1.015625, "learning_rate": 0.000437, "loss": 6.605, "mean_token_accuracy": 0.1062053769826889, "num_tokens": 1534285.0, "step": 875 }, { "entropy": 6.7189655780792235, "epoch": 0.6896551724137931, "grad_norm": 1.0546875, "learning_rate": 0.0004395, "loss": 6.5898, "mean_token_accuracy": 0.10706395953893662, "num_tokens": 1543701.0, "step": 880 }, { "entropy": 6.891758966445923, "epoch": 0.6935736677115988, "grad_norm": 0.96875, "learning_rate": 0.000442, "loss": 6.8576, "mean_token_accuracy": 0.09462928622961045, "num_tokens": 1552715.0, "step": 885 }, { "entropy": 6.6106942653656, "epoch": 0.6974921630094044, "grad_norm": 1.09375, "learning_rate": 0.0004445, "loss": 6.4728, "mean_token_accuracy": 0.1057778999209404, "num_tokens": 1561523.0, "step": 890 }, { "entropy": 6.685323572158813, "epoch": 0.70141065830721, "grad_norm": 1.21875, "learning_rate": 0.000447, "loss": 6.6995, "mean_token_accuracy": 0.10680059865117073, "num_tokens": 1570057.0, "step": 895 }, { "entropy": 6.810699081420898, "epoch": 0.7053291536050157, "grad_norm": 1.078125, "learning_rate": 0.00044950000000000003, "loss": 6.5487, "mean_token_accuracy": 0.10731169655919075, "num_tokens": 1578795.0, "step": 900 }, { "entropy": 6.618034505844117, "epoch": 0.7092476489028213, "grad_norm": 1.15625, "learning_rate": 0.00045200000000000004, "loss": 6.627, "mean_token_accuracy": 0.10445040464401245, "num_tokens": 1588108.0, "step": 905 }, { "entropy": 6.834760522842407, "epoch": 0.713166144200627, "grad_norm": 1.1796875, "learning_rate": 0.00045450000000000004, "loss": 6.5897, "mean_token_accuracy": 0.10842868015170097, "num_tokens": 1596545.0, "step": 910 }, { "entropy": 6.617375135421753, "epoch": 0.7170846394984326, "grad_norm": 1.0546875, "learning_rate": 0.00045700000000000005, "loss": 6.5774, "mean_token_accuracy": 0.11101481318473816, "num_tokens": 1605098.0, "step": 915 }, { "entropy": 6.7286604881286625, "epoch": 0.7210031347962382, "grad_norm": 1.1015625, "learning_rate": 0.00045950000000000006, "loss": 6.5928, "mean_token_accuracy": 0.11244359910488129, "num_tokens": 1613760.0, "step": 920 }, { "entropy": 6.685993957519531, "epoch": 0.7249216300940439, "grad_norm": 1.2578125, "learning_rate": 0.000462, "loss": 6.6311, "mean_token_accuracy": 0.10528192594647408, "num_tokens": 1622345.0, "step": 925 }, { "entropy": 6.825484371185302, "epoch": 0.7288401253918495, "grad_norm": 1.203125, "learning_rate": 0.0004645, "loss": 6.6198, "mean_token_accuracy": 0.10753775164484977, "num_tokens": 1630706.0, "step": 930 }, { "entropy": 6.736838388442993, "epoch": 0.7327586206896551, "grad_norm": 1.2265625, "learning_rate": 0.000467, "loss": 6.6851, "mean_token_accuracy": 0.10634701699018478, "num_tokens": 1639880.0, "step": 935 }, { "entropy": 6.709425592422486, "epoch": 0.7366771159874608, "grad_norm": 1.0859375, "learning_rate": 0.0004695, "loss": 6.5189, "mean_token_accuracy": 0.10119672417640686, "num_tokens": 1648325.0, "step": 940 }, { "entropy": 6.547716999053955, "epoch": 0.7405956112852664, "grad_norm": 1.2109375, "learning_rate": 0.000472, "loss": 6.4928, "mean_token_accuracy": 0.11216101795434952, "num_tokens": 1656951.0, "step": 945 }, { "entropy": 6.6757955074310305, "epoch": 0.7445141065830722, "grad_norm": 1.15625, "learning_rate": 0.0004745, "loss": 6.6264, "mean_token_accuracy": 0.1065959431231022, "num_tokens": 1665832.0, "step": 950 }, { "entropy": 6.653597450256347, "epoch": 0.7484326018808778, "grad_norm": 1.203125, "learning_rate": 0.000477, "loss": 6.5458, "mean_token_accuracy": 0.10681739151477813, "num_tokens": 1675116.0, "step": 955 }, { "entropy": 6.55257477760315, "epoch": 0.7523510971786834, "grad_norm": 1.1015625, "learning_rate": 0.0004795, "loss": 6.5317, "mean_token_accuracy": 0.11514699757099152, "num_tokens": 1684195.0, "step": 960 }, { "entropy": 6.689774370193481, "epoch": 0.7562695924764891, "grad_norm": 1.046875, "learning_rate": 0.000482, "loss": 6.4895, "mean_token_accuracy": 0.11039396822452545, "num_tokens": 1693025.0, "step": 965 }, { "entropy": 6.481554937362671, "epoch": 0.7601880877742947, "grad_norm": 1.109375, "learning_rate": 0.0004845, "loss": 6.4836, "mean_token_accuracy": 0.10764000788331032, "num_tokens": 1701210.0, "step": 970 }, { "entropy": 6.6965916633605955, "epoch": 0.7641065830721003, "grad_norm": 1.140625, "learning_rate": 0.000487, "loss": 6.5847, "mean_token_accuracy": 0.10125251486897469, "num_tokens": 1711026.0, "step": 975 }, { "entropy": 6.65680046081543, "epoch": 0.768025078369906, "grad_norm": 1.171875, "learning_rate": 0.0004895, "loss": 6.5874, "mean_token_accuracy": 0.10824618190526962, "num_tokens": 1719441.0, "step": 980 }, { "entropy": 6.599896430969238, "epoch": 0.7719435736677116, "grad_norm": 1.09375, "learning_rate": 0.000492, "loss": 6.5849, "mean_token_accuracy": 0.10633337944746017, "num_tokens": 1728300.0, "step": 985 }, { "entropy": 6.620725393295288, "epoch": 0.7758620689655172, "grad_norm": 1.234375, "learning_rate": 0.0004945, "loss": 6.4769, "mean_token_accuracy": 0.11418513432145119, "num_tokens": 1736364.0, "step": 990 }, { "entropy": 6.466502714157104, "epoch": 0.7797805642633229, "grad_norm": 1.03125, "learning_rate": 0.000497, "loss": 6.3224, "mean_token_accuracy": 0.1263233445584774, "num_tokens": 1745043.0, "step": 995 }, { "entropy": 6.654029464721679, "epoch": 0.7836990595611285, "grad_norm": 1.171875, "learning_rate": 0.0004995, "loss": 6.4667, "mean_token_accuracy": 0.11487501338124276, "num_tokens": 1754008.0, "step": 1000 }, { "epoch": 0.7836990595611285, "eval_entropy": 6.3669652088668, "eval_loss": 6.630230903625488, "eval_mean_token_accuracy": 0.11402813254227472, "eval_num_tokens": 1754008.0, "eval_runtime": 2.8378, "eval_samples_per_second": 1452.509, "eval_steps_per_second": 181.828, "step": 1000 }, { "entropy": 6.444351482391357, "epoch": 0.7876175548589341, "grad_norm": 1.0, "learning_rate": 0.0004999998713243189, "loss": 6.5346, "mean_token_accuracy": 0.11141253411769866, "num_tokens": 1763592.0, "step": 1005 }, { "entropy": 6.584927940368653, "epoch": 0.7915360501567398, "grad_norm": 1.1796875, "learning_rate": 0.0004999993485796164, "loss": 6.403, "mean_token_accuracy": 0.1155676744878292, "num_tokens": 1773111.0, "step": 1010 }, { "entropy": 6.580813217163086, "epoch": 0.7954545454545454, "grad_norm": 1.265625, "learning_rate": 0.0004999984237245962, "loss": 6.4624, "mean_token_accuracy": 0.11948382258415222, "num_tokens": 1781230.0, "step": 1015 }, { "entropy": 6.740645933151245, "epoch": 0.799373040752351, "grad_norm": 1.3828125, "learning_rate": 0.0004999970967609109, "loss": 6.6906, "mean_token_accuracy": 0.1132524773478508, "num_tokens": 1790324.0, "step": 1020 }, { "entropy": 6.513845157623291, "epoch": 0.8032915360501567, "grad_norm": 1.265625, "learning_rate": 0.0004999953676909322, "loss": 6.5348, "mean_token_accuracy": 0.1044821061193943, "num_tokens": 1798849.0, "step": 1025 }, { "entropy": 6.588596630096435, "epoch": 0.8072100313479624, "grad_norm": 1.3671875, "learning_rate": 0.0004999932365177502, "loss": 6.4097, "mean_token_accuracy": 0.11618589833378792, "num_tokens": 1808019.0, "step": 1030 }, { "entropy": 6.624749708175659, "epoch": 0.8111285266457681, "grad_norm": 1.1953125, "learning_rate": 0.0004999907032451735, "loss": 6.5302, "mean_token_accuracy": 0.11330178454518318, "num_tokens": 1816550.0, "step": 1035 }, { "entropy": 6.507697629928589, "epoch": 0.8150470219435737, "grad_norm": 1.109375, "learning_rate": 0.0004999877678777296, "loss": 6.4044, "mean_token_accuracy": 0.12338297590613365, "num_tokens": 1825238.0, "step": 1040 }, { "entropy": 6.4800478458404545, "epoch": 0.8189655172413793, "grad_norm": 1.046875, "learning_rate": 0.0004999844304206645, "loss": 6.3892, "mean_token_accuracy": 0.11694491282105446, "num_tokens": 1833645.0, "step": 1045 }, { "entropy": 6.564522838592529, "epoch": 0.822884012539185, "grad_norm": 1.0390625, "learning_rate": 0.0004999806908799428, "loss": 6.4637, "mean_token_accuracy": 0.11064697802066803, "num_tokens": 1842892.0, "step": 1050 }, { "entropy": 6.528485631942749, "epoch": 0.8268025078369906, "grad_norm": 1.1875, "learning_rate": 0.0004999765492622474, "loss": 6.5046, "mean_token_accuracy": 0.11715267226099968, "num_tokens": 1851400.0, "step": 1055 }, { "entropy": 6.596149349212647, "epoch": 0.8307210031347962, "grad_norm": 1.078125, "learning_rate": 0.0004999720055749804, "loss": 6.4464, "mean_token_accuracy": 0.11808413341641426, "num_tokens": 1860088.0, "step": 1060 }, { "entropy": 6.574128770828247, "epoch": 0.8346394984326019, "grad_norm": 1.1484375, "learning_rate": 0.0004999670598262619, "loss": 6.4971, "mean_token_accuracy": 0.11734354719519616, "num_tokens": 1868558.0, "step": 1065 }, { "entropy": 6.427276229858398, "epoch": 0.8385579937304075, "grad_norm": 1.2109375, "learning_rate": 0.0004999617120249308, "loss": 6.4897, "mean_token_accuracy": 0.11183837950229644, "num_tokens": 1877624.0, "step": 1070 }, { "entropy": 6.671892261505127, "epoch": 0.8424764890282131, "grad_norm": 1.1875, "learning_rate": 0.0004999559621805445, "loss": 6.5975, "mean_token_accuracy": 0.11244333311915397, "num_tokens": 1886505.0, "step": 1075 }, { "entropy": 6.452223205566407, "epoch": 0.8463949843260188, "grad_norm": 1.1640625, "learning_rate": 0.0004999498103033788, "loss": 6.3456, "mean_token_accuracy": 0.12142532989382744, "num_tokens": 1895701.0, "step": 1080 }, { "entropy": 6.390986537933349, "epoch": 0.8503134796238244, "grad_norm": 1.0859375, "learning_rate": 0.0004999432564044284, "loss": 6.3008, "mean_token_accuracy": 0.12294157966971397, "num_tokens": 1904640.0, "step": 1085 }, { "entropy": 6.614175415039062, "epoch": 0.85423197492163, "grad_norm": 1.1171875, "learning_rate": 0.0004999363004954058, "loss": 6.5357, "mean_token_accuracy": 0.11326182112097741, "num_tokens": 1914216.0, "step": 1090 }, { "entropy": 6.48394021987915, "epoch": 0.8581504702194357, "grad_norm": 1.1171875, "learning_rate": 0.0004999289425887425, "loss": 6.4573, "mean_token_accuracy": 0.11912907212972641, "num_tokens": 1923280.0, "step": 1095 }, { "entropy": 6.611083984375, "epoch": 0.8620689655172413, "grad_norm": 1.1171875, "learning_rate": 0.0004999211826975884, "loss": 6.4764, "mean_token_accuracy": 0.11999231576919556, "num_tokens": 1931403.0, "step": 1100 }, { "entropy": 6.5545590877532955, "epoch": 0.8659874608150471, "grad_norm": 1.0234375, "learning_rate": 0.0004999130208358114, "loss": 6.4882, "mean_token_accuracy": 0.11854184418916702, "num_tokens": 1940774.0, "step": 1105 }, { "entropy": 6.561131238937378, "epoch": 0.8699059561128527, "grad_norm": 1.1640625, "learning_rate": 0.0004999044570179983, "loss": 6.5687, "mean_token_accuracy": 0.11162992268800735, "num_tokens": 1950045.0, "step": 1110 }, { "entropy": 6.606174612045288, "epoch": 0.8738244514106583, "grad_norm": 1.140625, "learning_rate": 0.0004998954912594541, "loss": 6.4405, "mean_token_accuracy": 0.11669689863920212, "num_tokens": 1959303.0, "step": 1115 }, { "entropy": 6.492517614364624, "epoch": 0.877742946708464, "grad_norm": 1.1171875, "learning_rate": 0.0004998861235762018, "loss": 6.424, "mean_token_accuracy": 0.1151531957089901, "num_tokens": 1968583.0, "step": 1120 }, { "entropy": 6.590066766738891, "epoch": 0.8816614420062696, "grad_norm": 1.0546875, "learning_rate": 0.0004998763539849832, "loss": 6.5317, "mean_token_accuracy": 0.11314806714653969, "num_tokens": 1977489.0, "step": 1125 }, { "entropy": 6.413249540328979, "epoch": 0.8855799373040752, "grad_norm": 1.109375, "learning_rate": 0.0004998661825032579, "loss": 6.306, "mean_token_accuracy": 0.1186840571463108, "num_tokens": 1985308.0, "step": 1130 }, { "entropy": 6.476549482345581, "epoch": 0.8894984326018809, "grad_norm": 1.1796875, "learning_rate": 0.0004998556091492043, "loss": 6.4658, "mean_token_accuracy": 0.11185984387993812, "num_tokens": 1993930.0, "step": 1135 }, { "entropy": 6.523940944671631, "epoch": 0.8934169278996865, "grad_norm": 1.046875, "learning_rate": 0.0004998446339417184, "loss": 6.4676, "mean_token_accuracy": 0.10710446015000344, "num_tokens": 2003802.0, "step": 1140 }, { "entropy": 6.495583724975586, "epoch": 0.8973354231974922, "grad_norm": 1.1015625, "learning_rate": 0.0004998332569004149, "loss": 6.3809, "mean_token_accuracy": 0.11923198625445366, "num_tokens": 2011957.0, "step": 1145 }, { "entropy": 6.457823705673218, "epoch": 0.9012539184952978, "grad_norm": 1.09375, "learning_rate": 0.0004998214780456263, "loss": 6.3813, "mean_token_accuracy": 0.12053523734211921, "num_tokens": 2020741.0, "step": 1150 }, { "entropy": 6.47773175239563, "epoch": 0.9051724137931034, "grad_norm": 1.078125, "learning_rate": 0.0004998092973984033, "loss": 6.4479, "mean_token_accuracy": 0.11438319608569145, "num_tokens": 2029804.0, "step": 1155 }, { "entropy": 6.543193197250366, "epoch": 0.9090909090909091, "grad_norm": 1.0546875, "learning_rate": 0.0004997967149805147, "loss": 6.5013, "mean_token_accuracy": 0.116908498108387, "num_tokens": 2038732.0, "step": 1160 }, { "entropy": 6.49971113204956, "epoch": 0.9130094043887147, "grad_norm": 1.1875, "learning_rate": 0.0004997837308144474, "loss": 6.4002, "mean_token_accuracy": 0.12262723073363305, "num_tokens": 2046362.0, "step": 1165 }, { "entropy": 6.484364652633667, "epoch": 0.9169278996865203, "grad_norm": 1.0625, "learning_rate": 0.0004997703449234062, "loss": 6.3721, "mean_token_accuracy": 0.11172138154506683, "num_tokens": 2056018.0, "step": 1170 }, { "entropy": 6.384702682495117, "epoch": 0.920846394984326, "grad_norm": 1.0546875, "learning_rate": 0.0004997565573313139, "loss": 6.3547, "mean_token_accuracy": 0.11563765183091164, "num_tokens": 2065077.0, "step": 1175 }, { "entropy": 6.540685033798217, "epoch": 0.9247648902821317, "grad_norm": 1.078125, "learning_rate": 0.0004997423680628109, "loss": 6.3473, "mean_token_accuracy": 0.1140012837946415, "num_tokens": 2073405.0, "step": 1180 }, { "entropy": 6.3114136219024655, "epoch": 0.9286833855799373, "grad_norm": 1.1484375, "learning_rate": 0.000499727777143256, "loss": 6.3174, "mean_token_accuracy": 0.11831804886460304, "num_tokens": 2081891.0, "step": 1185 }, { "entropy": 6.565793228149414, "epoch": 0.932601880877743, "grad_norm": 1.1796875, "learning_rate": 0.0004997127845987255, "loss": 6.3823, "mean_token_accuracy": 0.11906470507383346, "num_tokens": 2090732.0, "step": 1190 }, { "entropy": 6.347681045532227, "epoch": 0.9365203761755486, "grad_norm": 1.046875, "learning_rate": 0.0004996973904560135, "loss": 6.306, "mean_token_accuracy": 0.12649127021431922, "num_tokens": 2099352.0, "step": 1195 }, { "entropy": 6.5376229763031, "epoch": 0.9404388714733543, "grad_norm": 1.0703125, "learning_rate": 0.0004996815947426317, "loss": 6.346, "mean_token_accuracy": 0.1162750244140625, "num_tokens": 2107365.0, "step": 1200 }, { "entropy": 6.353217506408692, "epoch": 0.9443573667711599, "grad_norm": 1.0703125, "learning_rate": 0.0004996653974868097, "loss": 6.218, "mean_token_accuracy": 0.1250108003616333, "num_tokens": 2116192.0, "step": 1205 }, { "entropy": 6.376781034469604, "epoch": 0.9482758620689655, "grad_norm": 1.078125, "learning_rate": 0.0004996487987174946, "loss": 6.3167, "mean_token_accuracy": 0.12126539349555969, "num_tokens": 2124410.0, "step": 1210 }, { "entropy": 6.452775669097901, "epoch": 0.9521943573667712, "grad_norm": 1.1328125, "learning_rate": 0.0004996317984643512, "loss": 6.3483, "mean_token_accuracy": 0.11980568394064903, "num_tokens": 2132799.0, "step": 1215 }, { "entropy": 6.459938097000122, "epoch": 0.9561128526645768, "grad_norm": 1.0703125, "learning_rate": 0.0004996143967577615, "loss": 6.4935, "mean_token_accuracy": 0.11375040411949158, "num_tokens": 2141589.0, "step": 1220 }, { "entropy": 6.466125345230102, "epoch": 0.9600313479623824, "grad_norm": 1.109375, "learning_rate": 0.0004995965936288254, "loss": 6.3714, "mean_token_accuracy": 0.1200237862765789, "num_tokens": 2150704.0, "step": 1225 }, { "entropy": 6.399578809738159, "epoch": 0.9639498432601881, "grad_norm": 1.1015625, "learning_rate": 0.0004995783891093597, "loss": 6.2958, "mean_token_accuracy": 0.12307887598872184, "num_tokens": 2159587.0, "step": 1230 }, { "entropy": 6.330127573013305, "epoch": 0.9678683385579937, "grad_norm": 1.0703125, "learning_rate": 0.000499559783231899, "loss": 6.2928, "mean_token_accuracy": 0.11578266024589538, "num_tokens": 2169071.0, "step": 1235 }, { "entropy": 6.421966409683227, "epoch": 0.9717868338557993, "grad_norm": 1.0390625, "learning_rate": 0.0004995407760296952, "loss": 6.3688, "mean_token_accuracy": 0.11582615077495576, "num_tokens": 2178248.0, "step": 1240 }, { "entropy": 6.425446796417236, "epoch": 0.975705329153605, "grad_norm": 1.1015625, "learning_rate": 0.0004995213675367169, "loss": 6.2732, "mean_token_accuracy": 0.11606336981058121, "num_tokens": 2187520.0, "step": 1245 }, { "entropy": 6.3828057765960695, "epoch": 0.9796238244514106, "grad_norm": 0.96484375, "learning_rate": 0.0004995015577876504, "loss": 6.3415, "mean_token_accuracy": 0.11500431299209594, "num_tokens": 2196861.0, "step": 1250 }, { "entropy": 6.361384725570678, "epoch": 0.9835423197492164, "grad_norm": 1.1015625, "learning_rate": 0.000499481346817899, "loss": 6.2502, "mean_token_accuracy": 0.12506693974137306, "num_tokens": 2205523.0, "step": 1255 }, { "entropy": 6.345028877258301, "epoch": 0.987460815047022, "grad_norm": 1.109375, "learning_rate": 0.0004994607346635829, "loss": 6.215, "mean_token_accuracy": 0.12786355316638948, "num_tokens": 2214291.0, "step": 1260 }, { "entropy": 6.440300559997558, "epoch": 0.9913793103448276, "grad_norm": 1.1171875, "learning_rate": 0.0004994397213615394, "loss": 6.3295, "mean_token_accuracy": 0.11995508670806884, "num_tokens": 2223093.0, "step": 1265 }, { "entropy": 6.406629800796509, "epoch": 0.9952978056426333, "grad_norm": 1.1015625, "learning_rate": 0.0004994183069493225, "loss": 6.2832, "mean_token_accuracy": 0.1210085429251194, "num_tokens": 2231551.0, "step": 1270 }, { "entropy": 6.222655868530273, "epoch": 0.9992163009404389, "grad_norm": 1.0546875, "learning_rate": 0.0004993964914652034, "loss": 6.2829, "mean_token_accuracy": 0.1268168218433857, "num_tokens": 2240650.0, "step": 1275 }, { "entropy": 6.511307573318481, "epoch": 1.0031347962382444, "grad_norm": 1.0, "learning_rate": 0.0004993742749481699, "loss": 6.2046, "mean_token_accuracy": 0.12171700075268746, "num_tokens": 2249392.0, "step": 1280 }, { "entropy": 6.226445293426513, "epoch": 1.0070532915360502, "grad_norm": 1.0859375, "learning_rate": 0.0004993516574379265, "loss": 6.0221, "mean_token_accuracy": 0.12889871895313262, "num_tokens": 2258100.0, "step": 1285 }, { "entropy": 6.256803798675537, "epoch": 1.0109717868338557, "grad_norm": 1.0703125, "learning_rate": 0.0004993286389748943, "loss": 6.0869, "mean_token_accuracy": 0.13081810474395753, "num_tokens": 2266661.0, "step": 1290 }, { "entropy": 6.348307514190674, "epoch": 1.0148902821316614, "grad_norm": 1.078125, "learning_rate": 0.0004993052196002112, "loss": 6.1897, "mean_token_accuracy": 0.12489164769649505, "num_tokens": 2275475.0, "step": 1295 }, { "entropy": 6.266294240951538, "epoch": 1.0188087774294672, "grad_norm": 1.046875, "learning_rate": 0.0004992813993557312, "loss": 6.1038, "mean_token_accuracy": 0.12707522958517076, "num_tokens": 2284077.0, "step": 1300 }, { "entropy": 6.261200428009033, "epoch": 1.0227272727272727, "grad_norm": 1.046875, "learning_rate": 0.000499257178284025, "loss": 6.1127, "mean_token_accuracy": 0.12442725002765656, "num_tokens": 2293029.0, "step": 1305 }, { "entropy": 6.262345981597901, "epoch": 1.0266457680250785, "grad_norm": 1.171875, "learning_rate": 0.0004992325564283797, "loss": 6.0351, "mean_token_accuracy": 0.13578465953469276, "num_tokens": 2301574.0, "step": 1310 }, { "entropy": 6.176527118682861, "epoch": 1.030564263322884, "grad_norm": 1.0859375, "learning_rate": 0.0004992075338327984, "loss": 6.0354, "mean_token_accuracy": 0.13138693422079087, "num_tokens": 2310366.0, "step": 1315 }, { "entropy": 6.3460509300231935, "epoch": 1.0344827586206897, "grad_norm": 1.0859375, "learning_rate": 0.0004991821105420006, "loss": 6.1592, "mean_token_accuracy": 0.1265966959297657, "num_tokens": 2318943.0, "step": 1320 }, { "entropy": 6.320439672470092, "epoch": 1.0384012539184952, "grad_norm": 1.1328125, "learning_rate": 0.0004991562866014219, "loss": 6.1486, "mean_token_accuracy": 0.12730259895324708, "num_tokens": 2327173.0, "step": 1325 }, { "entropy": 6.314588499069214, "epoch": 1.042319749216301, "grad_norm": 1.1484375, "learning_rate": 0.0004991300620572138, "loss": 6.1095, "mean_token_accuracy": 0.1259642593562603, "num_tokens": 2336194.0, "step": 1330 }, { "entropy": 6.226988649368286, "epoch": 1.0462382445141065, "grad_norm": 1.078125, "learning_rate": 0.0004991034369562438, "loss": 6.1824, "mean_token_accuracy": 0.13008867129683493, "num_tokens": 2344721.0, "step": 1335 }, { "entropy": 6.249088525772095, "epoch": 1.0501567398119123, "grad_norm": 1.1015625, "learning_rate": 0.0004990764113460953, "loss": 6.1052, "mean_token_accuracy": 0.1238970473408699, "num_tokens": 2353406.0, "step": 1340 }, { "entropy": 6.400324296951294, "epoch": 1.0540752351097178, "grad_norm": 1.0234375, "learning_rate": 0.0004990489852750675, "loss": 6.2738, "mean_token_accuracy": 0.11888742819428444, "num_tokens": 2361896.0, "step": 1345 }, { "entropy": 6.248968696594238, "epoch": 1.0579937304075235, "grad_norm": 0.98046875, "learning_rate": 0.0004990211587921751, "loss": 6.1014, "mean_token_accuracy": 0.1269259825348854, "num_tokens": 2371861.0, "step": 1350 }, { "entropy": 6.221920919418335, "epoch": 1.061912225705329, "grad_norm": 1.125, "learning_rate": 0.0004989929319471487, "loss": 6.0119, "mean_token_accuracy": 0.13410131856799126, "num_tokens": 2381555.0, "step": 1355 }, { "entropy": 6.21340970993042, "epoch": 1.0658307210031348, "grad_norm": 1.078125, "learning_rate": 0.0004989643047904341, "loss": 6.019, "mean_token_accuracy": 0.13659467175602913, "num_tokens": 2389851.0, "step": 1360 }, { "entropy": 6.126182651519775, "epoch": 1.0697492163009406, "grad_norm": 1.1171875, "learning_rate": 0.0004989352773731928, "loss": 5.9411, "mean_token_accuracy": 0.13557685017585755, "num_tokens": 2397764.0, "step": 1365 }, { "entropy": 6.2651125431060795, "epoch": 1.073667711598746, "grad_norm": 1.125, "learning_rate": 0.0004989058497473014, "loss": 6.1303, "mean_token_accuracy": 0.13204967901110648, "num_tokens": 2406688.0, "step": 1370 }, { "entropy": 6.271961975097656, "epoch": 1.0775862068965518, "grad_norm": 1.125, "learning_rate": 0.0004988760219653518, "loss": 6.1871, "mean_token_accuracy": 0.12600790411233903, "num_tokens": 2415475.0, "step": 1375 }, { "entropy": 6.261941576004029, "epoch": 1.0815047021943573, "grad_norm": 1.0625, "learning_rate": 0.0004988457940806513, "loss": 6.1287, "mean_token_accuracy": 0.1264335960149765, "num_tokens": 2425719.0, "step": 1380 }, { "entropy": 6.207798719406128, "epoch": 1.085423197492163, "grad_norm": 1.171875, "learning_rate": 0.0004988151661472218, "loss": 6.0311, "mean_token_accuracy": 0.13111473768949508, "num_tokens": 2435138.0, "step": 1385 }, { "entropy": 6.299766397476196, "epoch": 1.0893416927899686, "grad_norm": 1.234375, "learning_rate": 0.0004987841382198006, "loss": 6.1646, "mean_token_accuracy": 0.1282002493739128, "num_tokens": 2443976.0, "step": 1390 }, { "entropy": 6.123603677749633, "epoch": 1.0932601880877744, "grad_norm": 1.046875, "learning_rate": 0.0004987527103538394, "loss": 6.0608, "mean_token_accuracy": 0.1322481580078602, "num_tokens": 2453287.0, "step": 1395 }, { "entropy": 6.353779220581055, "epoch": 1.09717868338558, "grad_norm": 1.15625, "learning_rate": 0.000498720882605505, "loss": 6.1542, "mean_token_accuracy": 0.12652794942259787, "num_tokens": 2462054.0, "step": 1400 }, { "entropy": 6.106056785583496, "epoch": 1.1010971786833856, "grad_norm": 1.1640625, "learning_rate": 0.0004986886550316788, "loss": 6.0653, "mean_token_accuracy": 0.13403623849153518, "num_tokens": 2470333.0, "step": 1405 }, { "entropy": 6.214111185073852, "epoch": 1.1050156739811912, "grad_norm": 1.078125, "learning_rate": 0.0004986560276899565, "loss": 6.0972, "mean_token_accuracy": 0.1270233377814293, "num_tokens": 2479621.0, "step": 1410 }, { "entropy": 6.246433115005493, "epoch": 1.108934169278997, "grad_norm": 1.1484375, "learning_rate": 0.0004986230006386488, "loss": 6.0655, "mean_token_accuracy": 0.12891049459576606, "num_tokens": 2488020.0, "step": 1415 }, { "entropy": 6.274629163742065, "epoch": 1.1128526645768024, "grad_norm": 1.0390625, "learning_rate": 0.0004985895739367802, "loss": 6.1085, "mean_token_accuracy": 0.12846623882651328, "num_tokens": 2497779.0, "step": 1420 }, { "entropy": 6.158310031890869, "epoch": 1.1167711598746082, "grad_norm": 1.1328125, "learning_rate": 0.0004985557476440895, "loss": 6.0415, "mean_token_accuracy": 0.12804254293441772, "num_tokens": 2506791.0, "step": 1425 }, { "entropy": 6.2934564590454105, "epoch": 1.1206896551724137, "grad_norm": 1.1328125, "learning_rate": 0.00049852152182103, "loss": 6.1559, "mean_token_accuracy": 0.1281921535730362, "num_tokens": 2515576.0, "step": 1430 }, { "entropy": 6.240061712265015, "epoch": 1.1246081504702194, "grad_norm": 1.203125, "learning_rate": 0.0004984868965287686, "loss": 6.0674, "mean_token_accuracy": 0.1283339627087116, "num_tokens": 2523933.0, "step": 1435 }, { "entropy": 6.1820862770080565, "epoch": 1.1285266457680252, "grad_norm": 1.2421875, "learning_rate": 0.0004984518718291864, "loss": 6.0056, "mean_token_accuracy": 0.12331884428858757, "num_tokens": 2532453.0, "step": 1440 }, { "entropy": 6.157241773605347, "epoch": 1.1324451410658307, "grad_norm": 1.0859375, "learning_rate": 0.0004984164477848783, "loss": 6.064, "mean_token_accuracy": 0.12599845975637436, "num_tokens": 2541149.0, "step": 1445 }, { "entropy": 6.237585258483887, "epoch": 1.1363636363636362, "grad_norm": 1.1171875, "learning_rate": 0.0004983806244591528, "loss": 6.0312, "mean_token_accuracy": 0.13384631648659706, "num_tokens": 2550302.0, "step": 1450 }, { "entropy": 6.050166320800781, "epoch": 1.140282131661442, "grad_norm": 1.140625, "learning_rate": 0.0004983444019160318, "loss": 5.8966, "mean_token_accuracy": 0.13734075129032136, "num_tokens": 2558507.0, "step": 1455 }, { "entropy": 6.258242225646972, "epoch": 1.1442006269592477, "grad_norm": 1.234375, "learning_rate": 0.0004983077802202511, "loss": 6.1788, "mean_token_accuracy": 0.1290736488997936, "num_tokens": 2567487.0, "step": 1460 }, { "entropy": 6.212359237670898, "epoch": 1.1481191222570533, "grad_norm": 1.09375, "learning_rate": 0.0004982707594372595, "loss": 6.0304, "mean_token_accuracy": 0.12699214443564416, "num_tokens": 2576232.0, "step": 1465 }, { "entropy": 6.1208006858825685, "epoch": 1.152037617554859, "grad_norm": 1.109375, "learning_rate": 0.000498233339633219, "loss": 6.0329, "mean_token_accuracy": 0.13209858015179635, "num_tokens": 2585081.0, "step": 1470 }, { "entropy": 6.143232536315918, "epoch": 1.1559561128526645, "grad_norm": 1.09375, "learning_rate": 0.000498195520875005, "loss": 6.0115, "mean_token_accuracy": 0.12682496458292009, "num_tokens": 2593890.0, "step": 1475 }, { "entropy": 6.200338554382324, "epoch": 1.1598746081504703, "grad_norm": 1.1875, "learning_rate": 0.0004981573032302059, "loss": 6.0958, "mean_token_accuracy": 0.1289630651473999, "num_tokens": 2602011.0, "step": 1480 }, { "entropy": 6.149346542358399, "epoch": 1.1637931034482758, "grad_norm": 1.1015625, "learning_rate": 0.0004981186867671225, "loss": 5.9939, "mean_token_accuracy": 0.1361051805317402, "num_tokens": 2610366.0, "step": 1485 }, { "entropy": 6.113060522079468, "epoch": 1.1677115987460815, "grad_norm": 1.171875, "learning_rate": 0.0004980796715547691, "loss": 6.0004, "mean_token_accuracy": 0.13451212123036385, "num_tokens": 2619737.0, "step": 1490 }, { "entropy": 6.229964303970337, "epoch": 1.171630094043887, "grad_norm": 1.0625, "learning_rate": 0.0004980402576628717, "loss": 6.0233, "mean_token_accuracy": 0.13166360333561897, "num_tokens": 2628176.0, "step": 1495 }, { "entropy": 6.0950154781341555, "epoch": 1.1755485893416928, "grad_norm": 1.1875, "learning_rate": 0.0004980004451618697, "loss": 6.0384, "mean_token_accuracy": 0.13299553468823433, "num_tokens": 2636195.0, "step": 1500 }, { "epoch": 1.1755485893416928, "eval_entropy": 6.165283305700435, "eval_loss": 6.313858985900879, "eval_mean_token_accuracy": 0.1281498552105108, "eval_num_tokens": 2636195.0, "eval_runtime": 2.8434, "eval_samples_per_second": 1449.67, "eval_steps_per_second": 181.472, "step": 1500 }, { "entropy": 6.294966459274292, "epoch": 1.1794670846394983, "grad_norm": 1.0625, "learning_rate": 0.0004979602341229144, "loss": 6.1167, "mean_token_accuracy": 0.13073519617319107, "num_tokens": 2645035.0, "step": 1505 }, { "entropy": 6.177581834793091, "epoch": 1.183385579937304, "grad_norm": 1.15625, "learning_rate": 0.0004979196246178694, "loss": 6.0716, "mean_token_accuracy": 0.13222582265734673, "num_tokens": 2653239.0, "step": 1510 }, { "entropy": 6.1044244289398195, "epoch": 1.1873040752351098, "grad_norm": 1.203125, "learning_rate": 0.0004978786167193105, "loss": 6.0998, "mean_token_accuracy": 0.13036856576800346, "num_tokens": 2662532.0, "step": 1515 }, { "entropy": 6.214647531509399, "epoch": 1.1912225705329154, "grad_norm": 1.078125, "learning_rate": 0.0004978372105005254, "loss": 6.0841, "mean_token_accuracy": 0.12801975160837173, "num_tokens": 2671867.0, "step": 1520 }, { "entropy": 6.091090679168701, "epoch": 1.1951410658307209, "grad_norm": 1.171875, "learning_rate": 0.0004977954060355139, "loss": 6.0188, "mean_token_accuracy": 0.1304849162697792, "num_tokens": 2680693.0, "step": 1525 }, { "entropy": 6.281388568878174, "epoch": 1.1990595611285266, "grad_norm": 1.1015625, "learning_rate": 0.0004977532033989871, "loss": 6.0883, "mean_token_accuracy": 0.1320122368633747, "num_tokens": 2689228.0, "step": 1530 }, { "entropy": 6.080573844909668, "epoch": 1.2029780564263324, "grad_norm": 1.0625, "learning_rate": 0.0004977106026663681, "loss": 5.9666, "mean_token_accuracy": 0.12833520472049714, "num_tokens": 2698483.0, "step": 1535 }, { "entropy": 6.26170859336853, "epoch": 1.206896551724138, "grad_norm": 1.125, "learning_rate": 0.0004976676039137914, "loss": 6.1056, "mean_token_accuracy": 0.126655612885952, "num_tokens": 2706920.0, "step": 1540 }, { "entropy": 6.0963341236114506, "epoch": 1.2108150470219436, "grad_norm": 0.9921875, "learning_rate": 0.0004976242072181026, "loss": 5.9939, "mean_token_accuracy": 0.13806139603257178, "num_tokens": 2715954.0, "step": 1545 }, { "entropy": 6.142190265655517, "epoch": 1.2147335423197492, "grad_norm": 1.0859375, "learning_rate": 0.0004975804126568587, "loss": 6.0112, "mean_token_accuracy": 0.13544101044535636, "num_tokens": 2724890.0, "step": 1550 }, { "entropy": 6.178206443786621, "epoch": 1.218652037617555, "grad_norm": 1.15625, "learning_rate": 0.0004975362203083277, "loss": 6.037, "mean_token_accuracy": 0.12968128994107248, "num_tokens": 2733688.0, "step": 1555 }, { "entropy": 6.206477451324463, "epoch": 1.2225705329153604, "grad_norm": 1.140625, "learning_rate": 0.0004974916302514886, "loss": 6.0428, "mean_token_accuracy": 0.12749146521091462, "num_tokens": 2743364.0, "step": 1560 }, { "entropy": 6.040554809570312, "epoch": 1.2264890282131662, "grad_norm": 1.015625, "learning_rate": 0.0004974466425660307, "loss": 6.1001, "mean_token_accuracy": 0.1249497301876545, "num_tokens": 2754276.0, "step": 1565 }, { "entropy": 6.225854349136353, "epoch": 1.2304075235109717, "grad_norm": 1.09375, "learning_rate": 0.0004974012573323545, "loss": 5.883, "mean_token_accuracy": 0.14270371645689012, "num_tokens": 2762680.0, "step": 1570 }, { "entropy": 5.993971490859986, "epoch": 1.2343260188087775, "grad_norm": 1.0703125, "learning_rate": 0.0004973554746315709, "loss": 5.9893, "mean_token_accuracy": 0.14108432978391647, "num_tokens": 2771161.0, "step": 1575 }, { "entropy": 6.177520227432251, "epoch": 1.238244514106583, "grad_norm": 1.1328125, "learning_rate": 0.0004973092945455009, "loss": 5.9578, "mean_token_accuracy": 0.14068967550992967, "num_tokens": 2779321.0, "step": 1580 }, { "entropy": 6.1567606925964355, "epoch": 1.2421630094043887, "grad_norm": 1.0625, "learning_rate": 0.0004972627171566757, "loss": 6.1195, "mean_token_accuracy": 0.13284459933638573, "num_tokens": 2788425.0, "step": 1585 }, { "entropy": 6.097169017791748, "epoch": 1.2460815047021945, "grad_norm": 1.046875, "learning_rate": 0.0004972157425483368, "loss": 5.9674, "mean_token_accuracy": 0.1364126294851303, "num_tokens": 2796747.0, "step": 1590 }, { "entropy": 6.055204153060913, "epoch": 1.25, "grad_norm": 1.0234375, "learning_rate": 0.0004971683708044353, "loss": 6.0172, "mean_token_accuracy": 0.1358514852821827, "num_tokens": 2805335.0, "step": 1595 }, { "entropy": 6.133439636230468, "epoch": 1.2539184952978055, "grad_norm": 1.1328125, "learning_rate": 0.0004971206020096323, "loss": 5.9043, "mean_token_accuracy": 0.13950337022542952, "num_tokens": 2813350.0, "step": 1600 }, { "entropy": 6.165687799453735, "epoch": 1.2578369905956113, "grad_norm": 1.078125, "learning_rate": 0.0004970724362492984, "loss": 6.0934, "mean_token_accuracy": 0.13864696100354196, "num_tokens": 2821922.0, "step": 1605 }, { "entropy": 6.101494073867798, "epoch": 1.261755485893417, "grad_norm": 1.1484375, "learning_rate": 0.0004970238736095135, "loss": 5.9931, "mean_token_accuracy": 0.1329360119998455, "num_tokens": 2830832.0, "step": 1610 }, { "entropy": 6.160137414932251, "epoch": 1.2656739811912225, "grad_norm": 1.0625, "learning_rate": 0.0004969749141770671, "loss": 6.0053, "mean_token_accuracy": 0.13711865544319152, "num_tokens": 2839699.0, "step": 1615 }, { "entropy": 6.093396949768066, "epoch": 1.2695924764890283, "grad_norm": 1.1953125, "learning_rate": 0.0004969255580394575, "loss": 6.0811, "mean_token_accuracy": 0.13654499500989914, "num_tokens": 2848293.0, "step": 1620 }, { "entropy": 6.201265144348144, "epoch": 1.2735109717868338, "grad_norm": 1.15625, "learning_rate": 0.000496875805284892, "loss": 6.0327, "mean_token_accuracy": 0.13467409685254098, "num_tokens": 2856468.0, "step": 1625 }, { "entropy": 6.086582183837891, "epoch": 1.2774294670846396, "grad_norm": 1.015625, "learning_rate": 0.0004968256560022871, "loss": 5.9502, "mean_token_accuracy": 0.1351684033870697, "num_tokens": 2865366.0, "step": 1630 }, { "entropy": 6.079522609710693, "epoch": 1.281347962382445, "grad_norm": 1.15625, "learning_rate": 0.0004967751102812676, "loss": 6.0118, "mean_token_accuracy": 0.13846911042928695, "num_tokens": 2873520.0, "step": 1635 }, { "entropy": 6.133967542648316, "epoch": 1.2852664576802508, "grad_norm": 1.1015625, "learning_rate": 0.0004967241682121669, "loss": 6.0771, "mean_token_accuracy": 0.13166362345218657, "num_tokens": 2883324.0, "step": 1640 }, { "entropy": 6.170493459701538, "epoch": 1.2891849529780564, "grad_norm": 1.15625, "learning_rate": 0.0004966728298860267, "loss": 6.0178, "mean_token_accuracy": 0.13498894423246383, "num_tokens": 2892200.0, "step": 1645 }, { "entropy": 6.1070939064025875, "epoch": 1.293103448275862, "grad_norm": 1.1640625, "learning_rate": 0.0004966210953945969, "loss": 5.9858, "mean_token_accuracy": 0.13817497938871384, "num_tokens": 2900597.0, "step": 1650 }, { "entropy": 6.192414903640747, "epoch": 1.2970219435736676, "grad_norm": 1.1484375, "learning_rate": 0.0004965689648303355, "loss": 6.0118, "mean_token_accuracy": 0.13011416494846345, "num_tokens": 2909869.0, "step": 1655 }, { "entropy": 6.000342845916748, "epoch": 1.3009404388714734, "grad_norm": 1.1953125, "learning_rate": 0.0004965164382864083, "loss": 6.0278, "mean_token_accuracy": 0.1339656464755535, "num_tokens": 2919166.0, "step": 1660 }, { "entropy": 6.232674837112427, "epoch": 1.3048589341692791, "grad_norm": 1.03125, "learning_rate": 0.0004964635158566886, "loss": 6.0383, "mean_token_accuracy": 0.13142292499542235, "num_tokens": 2927884.0, "step": 1665 }, { "entropy": 6.099806451797486, "epoch": 1.3087774294670846, "grad_norm": 1.1171875, "learning_rate": 0.0004964101976357574, "loss": 5.9802, "mean_token_accuracy": 0.14010540917515754, "num_tokens": 2936737.0, "step": 1670 }, { "entropy": 6.117660140991211, "epoch": 1.3126959247648902, "grad_norm": 1.234375, "learning_rate": 0.000496356483718903, "loss": 6.0705, "mean_token_accuracy": 0.13366047590970992, "num_tokens": 2944203.0, "step": 1675 }, { "entropy": 6.167582654953003, "epoch": 1.316614420062696, "grad_norm": 1.171875, "learning_rate": 0.0004963023742021211, "loss": 6.0739, "mean_token_accuracy": 0.1311576023697853, "num_tokens": 2952639.0, "step": 1680 }, { "entropy": 6.16412582397461, "epoch": 1.3205329153605017, "grad_norm": 1.09375, "learning_rate": 0.000496247869182114, "loss": 6.0037, "mean_token_accuracy": 0.1342972233891487, "num_tokens": 2961482.0, "step": 1685 }, { "entropy": 6.074027061462402, "epoch": 1.3244514106583072, "grad_norm": 1.1328125, "learning_rate": 0.0004961929687562909, "loss": 6.0286, "mean_token_accuracy": 0.1309918761253357, "num_tokens": 2970927.0, "step": 1690 }, { "entropy": 6.046236276626587, "epoch": 1.328369905956113, "grad_norm": 1.078125, "learning_rate": 0.0004961376730227681, "loss": 5.9461, "mean_token_accuracy": 0.1402522951364517, "num_tokens": 2979329.0, "step": 1695 }, { "entropy": 6.147798299789429, "epoch": 1.3322884012539185, "grad_norm": 1.140625, "learning_rate": 0.0004960819820803675, "loss": 5.9328, "mean_token_accuracy": 0.14073051139712334, "num_tokens": 2986894.0, "step": 1700 }, { "entropy": 6.0277563571929935, "epoch": 1.3362068965517242, "grad_norm": 1.0703125, "learning_rate": 0.0004960258960286185, "loss": 5.9188, "mean_token_accuracy": 0.1406124599277973, "num_tokens": 2994721.0, "step": 1705 }, { "entropy": 6.188552713394165, "epoch": 1.3401253918495297, "grad_norm": 1.0859375, "learning_rate": 0.0004959694149677553, "loss": 6.0716, "mean_token_accuracy": 0.12887891680002211, "num_tokens": 3003165.0, "step": 1710 }, { "entropy": 6.130622291564942, "epoch": 1.3440438871473355, "grad_norm": 1.0078125, "learning_rate": 0.0004959125389987193, "loss": 6.0519, "mean_token_accuracy": 0.12692190557718278, "num_tokens": 3012178.0, "step": 1715 }, { "entropy": 6.024265241622925, "epoch": 1.347962382445141, "grad_norm": 1.03125, "learning_rate": 0.0004958552682231567, "loss": 5.957, "mean_token_accuracy": 0.1394257813692093, "num_tokens": 3021685.0, "step": 1720 }, { "entropy": 6.13150839805603, "epoch": 1.3518808777429467, "grad_norm": 1.078125, "learning_rate": 0.0004957976027434199, "loss": 6.0119, "mean_token_accuracy": 0.12856094017624856, "num_tokens": 3030667.0, "step": 1725 }, { "entropy": 6.137048673629761, "epoch": 1.3557993730407523, "grad_norm": 1.109375, "learning_rate": 0.0004957395426625663, "loss": 6.064, "mean_token_accuracy": 0.13227416425943375, "num_tokens": 3039501.0, "step": 1730 }, { "entropy": 6.055467891693115, "epoch": 1.359717868338558, "grad_norm": 1.09375, "learning_rate": 0.0004956810880843587, "loss": 5.9946, "mean_token_accuracy": 0.13389588594436647, "num_tokens": 3048547.0, "step": 1735 }, { "entropy": 6.150904512405395, "epoch": 1.3636363636363638, "grad_norm": 1.125, "learning_rate": 0.000495622239113265, "loss": 5.9482, "mean_token_accuracy": 0.13820023238658904, "num_tokens": 3056521.0, "step": 1740 }, { "entropy": 6.032876205444336, "epoch": 1.3675548589341693, "grad_norm": 1.0390625, "learning_rate": 0.0004955629958544577, "loss": 5.9428, "mean_token_accuracy": 0.1336909145116806, "num_tokens": 3066007.0, "step": 1745 }, { "entropy": 6.174657106399536, "epoch": 1.3714733542319748, "grad_norm": 1.0625, "learning_rate": 0.0004955033584138143, "loss": 6.1364, "mean_token_accuracy": 0.12483339086174965, "num_tokens": 3075111.0, "step": 1750 }, { "entropy": 6.001473474502563, "epoch": 1.3753918495297806, "grad_norm": 1.03125, "learning_rate": 0.0004954433268979164, "loss": 5.93, "mean_token_accuracy": 0.15043148621916771, "num_tokens": 3083766.0, "step": 1755 }, { "entropy": 6.056762075424194, "epoch": 1.3793103448275863, "grad_norm": 1.0703125, "learning_rate": 0.0004953829014140502, "loss": 6.0034, "mean_token_accuracy": 0.13454223051667213, "num_tokens": 3092103.0, "step": 1760 }, { "entropy": 6.0849854946136475, "epoch": 1.3832288401253918, "grad_norm": 1.171875, "learning_rate": 0.0004953220820702057, "loss": 6.0065, "mean_token_accuracy": 0.13198206946253777, "num_tokens": 3101286.0, "step": 1765 }, { "entropy": 6.222050523757934, "epoch": 1.3871473354231976, "grad_norm": 1.125, "learning_rate": 0.0004952608689750771, "loss": 6.024, "mean_token_accuracy": 0.13246920630335807, "num_tokens": 3110554.0, "step": 1770 }, { "entropy": 5.981888675689698, "epoch": 1.391065830721003, "grad_norm": 1.078125, "learning_rate": 0.0004951992622380619, "loss": 5.8669, "mean_token_accuracy": 0.14516761153936386, "num_tokens": 3119265.0, "step": 1775 }, { "entropy": 6.022592401504516, "epoch": 1.3949843260188088, "grad_norm": 1.171875, "learning_rate": 0.0004951372619692615, "loss": 5.9176, "mean_token_accuracy": 0.13864269405603408, "num_tokens": 3127962.0, "step": 1780 }, { "entropy": 6.138263750076294, "epoch": 1.3989028213166144, "grad_norm": 1.0859375, "learning_rate": 0.0004950748682794804, "loss": 6.0343, "mean_token_accuracy": 0.13279442861676216, "num_tokens": 3137265.0, "step": 1785 }, { "entropy": 5.982170152664184, "epoch": 1.40282131661442, "grad_norm": 1.1484375, "learning_rate": 0.0004950120812802262, "loss": 5.8679, "mean_token_accuracy": 0.13771192952990532, "num_tokens": 3146458.0, "step": 1790 }, { "entropy": 6.121512508392334, "epoch": 1.4067398119122256, "grad_norm": 1.109375, "learning_rate": 0.0004949489010837095, "loss": 6.0051, "mean_token_accuracy": 0.13202869817614554, "num_tokens": 3155557.0, "step": 1795 }, { "entropy": 6.031117582321167, "epoch": 1.4106583072100314, "grad_norm": 1.0703125, "learning_rate": 0.0004948853278028436, "loss": 5.8921, "mean_token_accuracy": 0.1407647594809532, "num_tokens": 3163478.0, "step": 1800 }, { "entropy": 6.06480302810669, "epoch": 1.414576802507837, "grad_norm": 1.171875, "learning_rate": 0.000494821361551244, "loss": 5.9734, "mean_token_accuracy": 0.13325524404644967, "num_tokens": 3172632.0, "step": 1805 }, { "entropy": 6.034041261672973, "epoch": 1.4184952978056427, "grad_norm": 1.078125, "learning_rate": 0.0004947570024432291, "loss": 6.0159, "mean_token_accuracy": 0.14027554914355278, "num_tokens": 3181325.0, "step": 1810 }, { "entropy": 6.120739078521728, "epoch": 1.4224137931034484, "grad_norm": 1.109375, "learning_rate": 0.0004946922505938189, "loss": 6.058, "mean_token_accuracy": 0.140015921741724, "num_tokens": 3189783.0, "step": 1815 }, { "entropy": 6.078346109390258, "epoch": 1.426332288401254, "grad_norm": 1.046875, "learning_rate": 0.0004946271061187354, "loss": 5.9115, "mean_token_accuracy": 0.1306297406554222, "num_tokens": 3198605.0, "step": 1820 }, { "entropy": 6.07373309135437, "epoch": 1.4302507836990594, "grad_norm": 1.0625, "learning_rate": 0.0004945615691344025, "loss": 6.0025, "mean_token_accuracy": 0.12800363823771477, "num_tokens": 3208061.0, "step": 1825 }, { "entropy": 6.044371461868286, "epoch": 1.4341692789968652, "grad_norm": 1.171875, "learning_rate": 0.0004944956397579453, "loss": 5.9286, "mean_token_accuracy": 0.1459761567413807, "num_tokens": 3216093.0, "step": 1830 }, { "entropy": 6.0173381805419925, "epoch": 1.438087774294671, "grad_norm": 1.0546875, "learning_rate": 0.0004944293181071902, "loss": 5.9027, "mean_token_accuracy": 0.14357266277074815, "num_tokens": 3224446.0, "step": 1835 }, { "entropy": 6.0938514232635494, "epoch": 1.4420062695924765, "grad_norm": 1.109375, "learning_rate": 0.0004943626043006649, "loss": 6.0177, "mean_token_accuracy": 0.135706390440464, "num_tokens": 3233398.0, "step": 1840 }, { "entropy": 6.068123865127563, "epoch": 1.4459247648902822, "grad_norm": 1.0546875, "learning_rate": 0.000494295498457598, "loss": 6.0156, "mean_token_accuracy": 0.13109127432107925, "num_tokens": 3243391.0, "step": 1845 }, { "entropy": 6.0765832424163815, "epoch": 1.4498432601880877, "grad_norm": 1.0859375, "learning_rate": 0.0004942280006979179, "loss": 5.96, "mean_token_accuracy": 0.13749199137091636, "num_tokens": 3251954.0, "step": 1850 }, { "entropy": 6.0438025951385494, "epoch": 1.4537617554858935, "grad_norm": 1.015625, "learning_rate": 0.0004941601111422546, "loss": 5.9851, "mean_token_accuracy": 0.1373910054564476, "num_tokens": 3261121.0, "step": 1855 }, { "entropy": 5.976307153701782, "epoch": 1.457680250783699, "grad_norm": 1.109375, "learning_rate": 0.0004940918299119375, "loss": 5.9028, "mean_token_accuracy": 0.1375894144177437, "num_tokens": 3270179.0, "step": 1860 }, { "entropy": 5.898191833496094, "epoch": 1.4615987460815048, "grad_norm": 1.0234375, "learning_rate": 0.0004940231571289962, "loss": 5.8893, "mean_token_accuracy": 0.14039459228515624, "num_tokens": 3279749.0, "step": 1865 }, { "entropy": 6.1549577713012695, "epoch": 1.4655172413793103, "grad_norm": 1.0859375, "learning_rate": 0.0004939540929161603, "loss": 5.9246, "mean_token_accuracy": 0.13927078545093535, "num_tokens": 3288458.0, "step": 1870 }, { "entropy": 6.00097017288208, "epoch": 1.469435736677116, "grad_norm": 1.140625, "learning_rate": 0.0004938846373968586, "loss": 5.8991, "mean_token_accuracy": 0.14272075444459914, "num_tokens": 3297231.0, "step": 1875 }, { "entropy": 6.032529544830322, "epoch": 1.4733542319749215, "grad_norm": 1.0703125, "learning_rate": 0.0004938147906952194, "loss": 6.0329, "mean_token_accuracy": 0.13757839426398277, "num_tokens": 3306915.0, "step": 1880 }, { "entropy": 6.079828405380249, "epoch": 1.4772727272727273, "grad_norm": 1.0703125, "learning_rate": 0.00049374455293607, "loss": 5.9958, "mean_token_accuracy": 0.13423861265182496, "num_tokens": 3315985.0, "step": 1885 }, { "entropy": 6.122959566116333, "epoch": 1.481191222570533, "grad_norm": 1.09375, "learning_rate": 0.0004936739242449369, "loss": 5.9895, "mean_token_accuracy": 0.13724515214562416, "num_tokens": 3324924.0, "step": 1890 }, { "entropy": 5.991518545150757, "epoch": 1.4851097178683386, "grad_norm": 1.1640625, "learning_rate": 0.0004936029047480447, "loss": 5.9801, "mean_token_accuracy": 0.13726715967059136, "num_tokens": 3333464.0, "step": 1895 }, { "entropy": 6.0822971820831295, "epoch": 1.489028213166144, "grad_norm": 1.1015625, "learning_rate": 0.0004935314945723171, "loss": 5.8806, "mean_token_accuracy": 0.14093400090932845, "num_tokens": 3342441.0, "step": 1900 }, { "entropy": 5.940945720672607, "epoch": 1.4929467084639498, "grad_norm": 1.15625, "learning_rate": 0.0004934596938453754, "loss": 5.9352, "mean_token_accuracy": 0.1342185415327549, "num_tokens": 3351338.0, "step": 1905 }, { "entropy": 6.092441558837891, "epoch": 1.4968652037617556, "grad_norm": 1.0546875, "learning_rate": 0.0004933875026955391, "loss": 5.9069, "mean_token_accuracy": 0.1339087277650833, "num_tokens": 3360356.0, "step": 1910 }, { "entropy": 6.088913869857788, "epoch": 1.500783699059561, "grad_norm": 1.2109375, "learning_rate": 0.0004933149212518258, "loss": 5.988, "mean_token_accuracy": 0.13323872461915015, "num_tokens": 3369485.0, "step": 1915 }, { "entropy": 6.02648138999939, "epoch": 1.5047021943573666, "grad_norm": 1.09375, "learning_rate": 0.0004932419496439501, "loss": 5.9519, "mean_token_accuracy": 0.14375862777233123, "num_tokens": 3378355.0, "step": 1920 }, { "entropy": 5.99138445854187, "epoch": 1.5086206896551724, "grad_norm": 1.109375, "learning_rate": 0.000493168588002324, "loss": 5.8649, "mean_token_accuracy": 0.14118586033582686, "num_tokens": 3386729.0, "step": 1925 }, { "entropy": 6.119981384277343, "epoch": 1.5125391849529781, "grad_norm": 1.1171875, "learning_rate": 0.0004930948364580569, "loss": 6.088, "mean_token_accuracy": 0.1284665696322918, "num_tokens": 3395822.0, "step": 1930 }, { "entropy": 6.04303822517395, "epoch": 1.5164576802507836, "grad_norm": 1.1953125, "learning_rate": 0.0004930206951429546, "loss": 5.99, "mean_token_accuracy": 0.14219039529561997, "num_tokens": 3404671.0, "step": 1935 }, { "entropy": 6.036664438247681, "epoch": 1.5203761755485894, "grad_norm": 1.1015625, "learning_rate": 0.0004929461641895197, "loss": 5.9301, "mean_token_accuracy": 0.13862462490797042, "num_tokens": 3413289.0, "step": 1940 }, { "entropy": 6.010977125167846, "epoch": 1.5242946708463951, "grad_norm": 1.15625, "learning_rate": 0.000492871243730951, "loss": 5.9675, "mean_token_accuracy": 0.1358363598585129, "num_tokens": 3421960.0, "step": 1945 }, { "entropy": 6.0582923412323, "epoch": 1.5282131661442007, "grad_norm": 1.140625, "learning_rate": 0.0004927959339011437, "loss": 5.9564, "mean_token_accuracy": 0.13902525305747987, "num_tokens": 3431958.0, "step": 1950 }, { "entropy": 5.846984767913819, "epoch": 1.5321316614420062, "grad_norm": 1.0703125, "learning_rate": 0.0004927202348346885, "loss": 5.7801, "mean_token_accuracy": 0.14725423008203506, "num_tokens": 3441438.0, "step": 1955 }, { "entropy": 6.118220949172974, "epoch": 1.536050156739812, "grad_norm": 1.0703125, "learning_rate": 0.000492644146666872, "loss": 5.9105, "mean_token_accuracy": 0.14317999482154847, "num_tokens": 3450176.0, "step": 1960 }, { "entropy": 5.923649597167969, "epoch": 1.5399686520376177, "grad_norm": 1.1640625, "learning_rate": 0.0004925676695336761, "loss": 5.8946, "mean_token_accuracy": 0.14172052592039108, "num_tokens": 3458685.0, "step": 1965 }, { "entropy": 6.046487474441529, "epoch": 1.5438871473354232, "grad_norm": 1.0859375, "learning_rate": 0.0004924908035717777, "loss": 6.0445, "mean_token_accuracy": 0.13102332279086112, "num_tokens": 3468362.0, "step": 1970 }, { "entropy": 6.12939486503601, "epoch": 1.5478056426332287, "grad_norm": 1.140625, "learning_rate": 0.000492413548918549, "loss": 5.9607, "mean_token_accuracy": 0.13970244973897933, "num_tokens": 3476791.0, "step": 1975 }, { "entropy": 5.925748825073242, "epoch": 1.5517241379310345, "grad_norm": 1.1796875, "learning_rate": 0.0004923359057120563, "loss": 5.8781, "mean_token_accuracy": 0.13300700336694718, "num_tokens": 3486486.0, "step": 1980 }, { "entropy": 5.951593446731567, "epoch": 1.5556426332288402, "grad_norm": 1.0234375, "learning_rate": 0.0004922578740910608, "loss": 5.8334, "mean_token_accuracy": 0.14717195332050323, "num_tokens": 3496145.0, "step": 1985 }, { "entropy": 5.967483854293823, "epoch": 1.5595611285266457, "grad_norm": 1.1484375, "learning_rate": 0.0004921794541950177, "loss": 5.8214, "mean_token_accuracy": 0.14525392055511474, "num_tokens": 3504644.0, "step": 1990 }, { "entropy": 5.994989633560181, "epoch": 1.5634796238244513, "grad_norm": 1.1171875, "learning_rate": 0.0004921006461640758, "loss": 5.8562, "mean_token_accuracy": 0.13699238896369934, "num_tokens": 3513126.0, "step": 1995 }, { "entropy": 5.833573675155639, "epoch": 1.567398119122257, "grad_norm": 1.21875, "learning_rate": 0.000492021450139078, "loss": 5.7981, "mean_token_accuracy": 0.14626943692564964, "num_tokens": 3521505.0, "step": 2000 }, { "epoch": 1.567398119122257, "eval_entropy": 5.888265516406806, "eval_loss": 6.116030216217041, "eval_mean_token_accuracy": 0.13753249434855327, "eval_num_tokens": 3521505.0, "eval_runtime": 2.8299, "eval_samples_per_second": 1456.593, "eval_steps_per_second": 182.339, "step": 2000 }, { "entropy": 6.132147789001465, "epoch": 1.5713166144200628, "grad_norm": 1.0703125, "learning_rate": 0.0004919418662615605, "loss": 6.0583, "mean_token_accuracy": 0.1335877738893032, "num_tokens": 3530008.0, "step": 2005 }, { "entropy": 6.083332824707031, "epoch": 1.5752351097178683, "grad_norm": 1.078125, "learning_rate": 0.0004918618946737525, "loss": 5.9278, "mean_token_accuracy": 0.13504896759986879, "num_tokens": 3538778.0, "step": 2010 }, { "entropy": 5.979771709442138, "epoch": 1.579153605015674, "grad_norm": 1.1875, "learning_rate": 0.0004917815355185762, "loss": 5.9164, "mean_token_accuracy": 0.13633493483066558, "num_tokens": 3547108.0, "step": 2015 }, { "entropy": 5.988805103302002, "epoch": 1.5830721003134798, "grad_norm": 1.125, "learning_rate": 0.0004917007889396464, "loss": 5.9513, "mean_token_accuracy": 0.13188610523939132, "num_tokens": 3556094.0, "step": 2020 }, { "entropy": 5.985982990264892, "epoch": 1.5869905956112853, "grad_norm": 1.1875, "learning_rate": 0.0004916196550812706, "loss": 5.8151, "mean_token_accuracy": 0.14568774104118348, "num_tokens": 3564222.0, "step": 2025 }, { "entropy": 5.818160581588745, "epoch": 1.5909090909090908, "grad_norm": 1.171875, "learning_rate": 0.0004915381340884477, "loss": 5.827, "mean_token_accuracy": 0.14547126069664956, "num_tokens": 3573123.0, "step": 2030 }, { "entropy": 6.050127172470093, "epoch": 1.5948275862068966, "grad_norm": 1.1875, "learning_rate": 0.0004914562261068693, "loss": 5.8571, "mean_token_accuracy": 0.14239953309297562, "num_tokens": 3581462.0, "step": 2035 }, { "entropy": 5.88056173324585, "epoch": 1.5987460815047023, "grad_norm": 1.15625, "learning_rate": 0.0004913739312829181, "loss": 5.8744, "mean_token_accuracy": 0.14017535969614983, "num_tokens": 3590718.0, "step": 2040 }, { "entropy": 6.000182294845581, "epoch": 1.6026645768025078, "grad_norm": 1.1796875, "learning_rate": 0.0004912912497636683, "loss": 5.8899, "mean_token_accuracy": 0.13969212546944618, "num_tokens": 3599890.0, "step": 2045 }, { "entropy": 6.039797258377075, "epoch": 1.6065830721003134, "grad_norm": 1.1328125, "learning_rate": 0.0004912081816968853, "loss": 5.9815, "mean_token_accuracy": 0.14007024392485617, "num_tokens": 3607927.0, "step": 2050 }, { "entropy": 6.0202032089233395, "epoch": 1.6105015673981191, "grad_norm": 1.140625, "learning_rate": 0.000491124727231025, "loss": 5.838, "mean_token_accuracy": 0.14422516226768495, "num_tokens": 3615935.0, "step": 2055 }, { "entropy": 5.8747539043426515, "epoch": 1.6144200626959249, "grad_norm": 1.1171875, "learning_rate": 0.0004910408865152343, "loss": 5.8654, "mean_token_accuracy": 0.1397896021604538, "num_tokens": 3624475.0, "step": 2060 }, { "entropy": 6.097391796112061, "epoch": 1.6183385579937304, "grad_norm": 1.046875, "learning_rate": 0.0004909566596993498, "loss": 6.072, "mean_token_accuracy": 0.131089448928833, "num_tokens": 3633995.0, "step": 2065 }, { "entropy": 6.067581701278686, "epoch": 1.622257053291536, "grad_norm": 1.1328125, "learning_rate": 0.0004908720469338988, "loss": 5.9294, "mean_token_accuracy": 0.14296501129865646, "num_tokens": 3643121.0, "step": 2070 }, { "entropy": 5.959115791320801, "epoch": 1.6261755485893417, "grad_norm": 1.1484375, "learning_rate": 0.0004907870483700979, "loss": 5.8621, "mean_token_accuracy": 0.14175319969654082, "num_tokens": 3652285.0, "step": 2075 }, { "entropy": 5.929019594192505, "epoch": 1.6300940438871474, "grad_norm": 1.109375, "learning_rate": 0.0004907016641598534, "loss": 5.926, "mean_token_accuracy": 0.13432129770517348, "num_tokens": 3661441.0, "step": 2080 }, { "entropy": 6.0357630252838135, "epoch": 1.634012539184953, "grad_norm": 1.15625, "learning_rate": 0.0004906158944557607, "loss": 5.8105, "mean_token_accuracy": 0.14453670606017113, "num_tokens": 3669364.0, "step": 2085 }, { "entropy": 5.9998321533203125, "epoch": 1.6379310344827587, "grad_norm": 1.1015625, "learning_rate": 0.000490529739411104, "loss": 5.9024, "mean_token_accuracy": 0.13794894218444825, "num_tokens": 3677848.0, "step": 2090 }, { "entropy": 5.93680510520935, "epoch": 1.6418495297805644, "grad_norm": 1.1171875, "learning_rate": 0.0004904431991798565, "loss": 5.8138, "mean_token_accuracy": 0.14087132290005683, "num_tokens": 3686617.0, "step": 2095 }, { "entropy": 5.955365371704102, "epoch": 1.64576802507837, "grad_norm": 1.1328125, "learning_rate": 0.0004903562739166797, "loss": 5.8044, "mean_token_accuracy": 0.1422215446829796, "num_tokens": 3695106.0, "step": 2100 }, { "entropy": 5.7903650283813475, "epoch": 1.6496865203761755, "grad_norm": 1.0859375, "learning_rate": 0.0004902689637769229, "loss": 5.7518, "mean_token_accuracy": 0.15124305188655854, "num_tokens": 3703167.0, "step": 2105 }, { "entropy": 5.9747395515441895, "epoch": 1.6536050156739812, "grad_norm": 1.0859375, "learning_rate": 0.0004901812689166237, "loss": 5.8666, "mean_token_accuracy": 0.13829359784722328, "num_tokens": 3711597.0, "step": 2110 }, { "entropy": 5.945003080368042, "epoch": 1.657523510971787, "grad_norm": 1.109375, "learning_rate": 0.0004900931894925069, "loss": 5.9488, "mean_token_accuracy": 0.14057869464159012, "num_tokens": 3720277.0, "step": 2115 }, { "entropy": 5.949292469024658, "epoch": 1.6614420062695925, "grad_norm": 1.0859375, "learning_rate": 0.0004900047256619849, "loss": 5.921, "mean_token_accuracy": 0.13772802650928498, "num_tokens": 3729832.0, "step": 2120 }, { "entropy": 5.9761933326721195, "epoch": 1.665360501567398, "grad_norm": 1.1640625, "learning_rate": 0.0004899158775831566, "loss": 5.8632, "mean_token_accuracy": 0.14447186067700385, "num_tokens": 3738390.0, "step": 2125 }, { "entropy": 5.933046817779541, "epoch": 1.6692789968652038, "grad_norm": 1.1015625, "learning_rate": 0.0004898266454148081, "loss": 5.9185, "mean_token_accuracy": 0.14356547445058823, "num_tokens": 3747046.0, "step": 2130 }, { "entropy": 5.945548963546753, "epoch": 1.6731974921630095, "grad_norm": 1.0546875, "learning_rate": 0.0004897370293164119, "loss": 5.8896, "mean_token_accuracy": 0.13832552805542947, "num_tokens": 3755818.0, "step": 2135 }, { "entropy": 6.093547344207764, "epoch": 1.677115987460815, "grad_norm": 1.1484375, "learning_rate": 0.0004896470294481262, "loss": 5.9861, "mean_token_accuracy": 0.13742954656481743, "num_tokens": 3764123.0, "step": 2140 }, { "entropy": 5.950912809371948, "epoch": 1.6810344827586206, "grad_norm": 1.0390625, "learning_rate": 0.0004895566459707954, "loss": 6.035, "mean_token_accuracy": 0.12903214767575263, "num_tokens": 3773105.0, "step": 2145 }, { "entropy": 6.076629734039306, "epoch": 1.6849529780564263, "grad_norm": 1.1953125, "learning_rate": 0.0004894658790459498, "loss": 5.8805, "mean_token_accuracy": 0.14274725392460824, "num_tokens": 3781417.0, "step": 2150 }, { "entropy": 6.058791875839233, "epoch": 1.688871473354232, "grad_norm": 1.1328125, "learning_rate": 0.0004893747288358041, "loss": 5.9792, "mean_token_accuracy": 0.13869686052203178, "num_tokens": 3789657.0, "step": 2155 }, { "entropy": 5.9904515743255615, "epoch": 1.6927899686520376, "grad_norm": 1.125, "learning_rate": 0.000489283195503259, "loss": 5.9875, "mean_token_accuracy": 0.13768337592482566, "num_tokens": 3798700.0, "step": 2160 }, { "entropy": 5.9594886302948, "epoch": 1.6967084639498433, "grad_norm": 1.1640625, "learning_rate": 0.000489191279211899, "loss": 5.8479, "mean_token_accuracy": 0.14359756112098693, "num_tokens": 3807299.0, "step": 2165 }, { "entropy": 6.006121349334717, "epoch": 1.700626959247649, "grad_norm": 1.125, "learning_rate": 0.0004890989801259935, "loss": 5.9385, "mean_token_accuracy": 0.14270951524376868, "num_tokens": 3816292.0, "step": 2170 }, { "entropy": 6.049522542953492, "epoch": 1.7045454545454546, "grad_norm": 1.09375, "learning_rate": 0.000489006298410496, "loss": 5.8532, "mean_token_accuracy": 0.1399511620402336, "num_tokens": 3825070.0, "step": 2175 }, { "entropy": 5.959496307373047, "epoch": 1.70846394984326, "grad_norm": 1.1875, "learning_rate": 0.0004889132342310438, "loss": 5.8247, "mean_token_accuracy": 0.14473102912306784, "num_tokens": 3834016.0, "step": 2180 }, { "entropy": 5.956418132781982, "epoch": 1.7123824451410659, "grad_norm": 1.1640625, "learning_rate": 0.0004888197877539577, "loss": 5.8849, "mean_token_accuracy": 0.14025031253695489, "num_tokens": 3842172.0, "step": 2185 }, { "entropy": 5.971714496612549, "epoch": 1.7163009404388716, "grad_norm": 1.1640625, "learning_rate": 0.0004887259591462417, "loss": 5.798, "mean_token_accuracy": 0.14462782070040703, "num_tokens": 3851109.0, "step": 2190 }, { "entropy": 5.923008251190185, "epoch": 1.7202194357366771, "grad_norm": 1.125, "learning_rate": 0.0004886317485755825, "loss": 5.924, "mean_token_accuracy": 0.14367973506450654, "num_tokens": 3859656.0, "step": 2195 }, { "entropy": 5.933929538726806, "epoch": 1.7241379310344827, "grad_norm": 1.078125, "learning_rate": 0.0004885371562103498, "loss": 5.7677, "mean_token_accuracy": 0.14872414171695708, "num_tokens": 3868698.0, "step": 2200 }, { "entropy": 5.8945310592651365, "epoch": 1.7280564263322884, "grad_norm": 1.2109375, "learning_rate": 0.0004884421822195957, "loss": 5.8545, "mean_token_accuracy": 0.1411299616098404, "num_tokens": 3877474.0, "step": 2205 }, { "entropy": 5.9916675090789795, "epoch": 1.7319749216300941, "grad_norm": 1.1953125, "learning_rate": 0.0004883468267730538, "loss": 5.8228, "mean_token_accuracy": 0.1443895533680916, "num_tokens": 3886328.0, "step": 2210 }, { "entropy": 5.906041145324707, "epoch": 1.7358934169278997, "grad_norm": 1.1171875, "learning_rate": 0.00048825109004114006, "loss": 5.8058, "mean_token_accuracy": 0.1487313315272331, "num_tokens": 3894424.0, "step": 2215 }, { "entropy": 5.978779983520508, "epoch": 1.7398119122257052, "grad_norm": 1.046875, "learning_rate": 0.0004881549721949513, "loss": 5.8897, "mean_token_accuracy": 0.13909043669700621, "num_tokens": 3903746.0, "step": 2220 }, { "entropy": 5.851698732376098, "epoch": 1.743730407523511, "grad_norm": 1.1640625, "learning_rate": 0.0004880584734062658, "loss": 5.8724, "mean_token_accuracy": 0.14629912972450257, "num_tokens": 3912033.0, "step": 2225 }, { "entropy": 5.883364534378051, "epoch": 1.7476489028213167, "grad_norm": 1.1875, "learning_rate": 0.0004879615938475425, "loss": 5.7739, "mean_token_accuracy": 0.14114395081996917, "num_tokens": 3920388.0, "step": 2230 }, { "entropy": 5.934035730361939, "epoch": 1.7515673981191222, "grad_norm": 1.1484375, "learning_rate": 0.0004878643336919209, "loss": 5.9772, "mean_token_accuracy": 0.139875166118145, "num_tokens": 3929926.0, "step": 2235 }, { "entropy": 6.131916475296021, "epoch": 1.7554858934169277, "grad_norm": 1.15625, "learning_rate": 0.0004877666931132206, "loss": 5.9601, "mean_token_accuracy": 0.13168897181749345, "num_tokens": 3938696.0, "step": 2240 }, { "entropy": 5.873245096206665, "epoch": 1.7594043887147337, "grad_norm": 1.125, "learning_rate": 0.0004876686722859413, "loss": 5.7899, "mean_token_accuracy": 0.1513090804219246, "num_tokens": 3947537.0, "step": 2245 }, { "entropy": 5.992959451675415, "epoch": 1.7633228840125392, "grad_norm": 1.109375, "learning_rate": 0.000487570271385262, "loss": 5.7993, "mean_token_accuracy": 0.14130587950348855, "num_tokens": 3956539.0, "step": 2250 }, { "entropy": 5.880547904968262, "epoch": 1.7672413793103448, "grad_norm": 1.09375, "learning_rate": 0.0004874714905870411, "loss": 5.7667, "mean_token_accuracy": 0.14465454295277597, "num_tokens": 3964927.0, "step": 2255 }, { "entropy": 5.923993968963623, "epoch": 1.7711598746081505, "grad_norm": 1.1015625, "learning_rate": 0.0004873723300678159, "loss": 5.9062, "mean_token_accuracy": 0.1402622014284134, "num_tokens": 3973565.0, "step": 2260 }, { "entropy": 5.974027013778686, "epoch": 1.7750783699059562, "grad_norm": 1.1796875, "learning_rate": 0.00048727279000480226, "loss": 5.9283, "mean_token_accuracy": 0.14056170955300332, "num_tokens": 3982683.0, "step": 2265 }, { "entropy": 6.065245008468628, "epoch": 1.7789968652037618, "grad_norm": 1.0625, "learning_rate": 0.00048717287057589454, "loss": 5.8374, "mean_token_accuracy": 0.14206577837467194, "num_tokens": 3991462.0, "step": 2270 }, { "entropy": 5.725625848770141, "epoch": 1.7829153605015673, "grad_norm": 1.1640625, "learning_rate": 0.0004870725719596648, "loss": 5.6081, "mean_token_accuracy": 0.15651399940252303, "num_tokens": 3999821.0, "step": 2275 }, { "entropy": 5.943557548522949, "epoch": 1.786833855799373, "grad_norm": 1.078125, "learning_rate": 0.0004869718943353631, "loss": 5.8541, "mean_token_accuracy": 0.14041255488991738, "num_tokens": 4008655.0, "step": 2280 }, { "entropy": 5.972759962081909, "epoch": 1.7907523510971788, "grad_norm": 1.1328125, "learning_rate": 0.00048687083788291656, "loss": 5.8939, "mean_token_accuracy": 0.14818500876426696, "num_tokens": 4017317.0, "step": 2285 }, { "entropy": 5.872368335723877, "epoch": 1.7946708463949843, "grad_norm": 1.015625, "learning_rate": 0.00048676940278292953, "loss": 5.7282, "mean_token_accuracy": 0.14405218958854676, "num_tokens": 4025407.0, "step": 2290 }, { "entropy": 5.908810663223266, "epoch": 1.7985893416927898, "grad_norm": 1.0703125, "learning_rate": 0.00048666758921668286, "loss": 5.7369, "mean_token_accuracy": 0.14384883195161818, "num_tokens": 4034310.0, "step": 2295 }, { "entropy": 5.879950332641601, "epoch": 1.8025078369905956, "grad_norm": 1.171875, "learning_rate": 0.00048656539736613403, "loss": 5.8969, "mean_token_accuracy": 0.1422630712389946, "num_tokens": 4042326.0, "step": 2300 }, { "entropy": 5.854609394073487, "epoch": 1.8064263322884013, "grad_norm": 1.125, "learning_rate": 0.0004864628274139164, "loss": 5.7595, "mean_token_accuracy": 0.1504887729883194, "num_tokens": 4051248.0, "step": 2305 }, { "entropy": 5.929189538955688, "epoch": 1.8103448275862069, "grad_norm": 1.1484375, "learning_rate": 0.0004863598795433391, "loss": 5.8763, "mean_token_accuracy": 0.1436061829328537, "num_tokens": 4060227.0, "step": 2310 }, { "entropy": 6.035856866836548, "epoch": 1.8142633228840124, "grad_norm": 1.1328125, "learning_rate": 0.00048625655393838666, "loss": 5.9747, "mean_token_accuracy": 0.1343390792608261, "num_tokens": 4069700.0, "step": 2315 }, { "entropy": 5.9489781856536865, "epoch": 1.8181818181818183, "grad_norm": 1.078125, "learning_rate": 0.0004861528507837186, "loss": 5.8364, "mean_token_accuracy": 0.14606306403875352, "num_tokens": 4079363.0, "step": 2320 }, { "entropy": 6.038274621963501, "epoch": 1.8221003134796239, "grad_norm": 1.0703125, "learning_rate": 0.0004860487702646695, "loss": 5.9136, "mean_token_accuracy": 0.14080933034420012, "num_tokens": 4087944.0, "step": 2325 }, { "entropy": 5.765271520614624, "epoch": 1.8260188087774294, "grad_norm": 1.3125, "learning_rate": 0.0004859443125672479, "loss": 5.6074, "mean_token_accuracy": 0.16549111306667327, "num_tokens": 4096431.0, "step": 2330 }, { "entropy": 5.862937021255493, "epoch": 1.8299373040752351, "grad_norm": 1.1875, "learning_rate": 0.0004858394778781368, "loss": 5.7839, "mean_token_accuracy": 0.1452955462038517, "num_tokens": 4104894.0, "step": 2335 }, { "entropy": 5.929635095596313, "epoch": 1.8338557993730409, "grad_norm": 1.1796875, "learning_rate": 0.0004857342663846927, "loss": 5.8308, "mean_token_accuracy": 0.1402455188333988, "num_tokens": 4112878.0, "step": 2340 }, { "entropy": 5.906289291381836, "epoch": 1.8377742946708464, "grad_norm": 1.171875, "learning_rate": 0.0004856286782749456, "loss": 5.839, "mean_token_accuracy": 0.14992391616106032, "num_tokens": 4122501.0, "step": 2345 }, { "entropy": 5.893653440475464, "epoch": 1.841692789968652, "grad_norm": 1.1171875, "learning_rate": 0.0004855227137375986, "loss": 5.8018, "mean_token_accuracy": 0.13767838329076768, "num_tokens": 4131301.0, "step": 2350 }, { "entropy": 5.921551322937011, "epoch": 1.8456112852664577, "grad_norm": 1.09375, "learning_rate": 0.0004854163729620275, "loss": 5.8349, "mean_token_accuracy": 0.14852394610643388, "num_tokens": 4140076.0, "step": 2355 }, { "entropy": 5.864505481719971, "epoch": 1.8495297805642634, "grad_norm": 1.078125, "learning_rate": 0.0004853096561382805, "loss": 5.7663, "mean_token_accuracy": 0.15288633555173875, "num_tokens": 4148728.0, "step": 2360 }, { "entropy": 5.914716482162476, "epoch": 1.853448275862069, "grad_norm": 1.125, "learning_rate": 0.0004852025634570779, "loss": 5.9037, "mean_token_accuracy": 0.14291643872857093, "num_tokens": 4157520.0, "step": 2365 }, { "entropy": 5.905571317672729, "epoch": 1.8573667711598745, "grad_norm": 1.1484375, "learning_rate": 0.0004850950951098116, "loss": 5.7141, "mean_token_accuracy": 0.15238296538591384, "num_tokens": 4165547.0, "step": 2370 }, { "entropy": 5.928301191329956, "epoch": 1.8612852664576802, "grad_norm": 1.0859375, "learning_rate": 0.0004849872512885451, "loss": 5.8628, "mean_token_accuracy": 0.1383912220597267, "num_tokens": 4174656.0, "step": 2375 }, { "entropy": 5.904009199142456, "epoch": 1.865203761755486, "grad_norm": 1.078125, "learning_rate": 0.0004848790321860127, "loss": 5.9161, "mean_token_accuracy": 0.1413537159562111, "num_tokens": 4183452.0, "step": 2380 }, { "entropy": 5.909180545806885, "epoch": 1.8691222570532915, "grad_norm": 1.0703125, "learning_rate": 0.00048477043799561946, "loss": 5.7966, "mean_token_accuracy": 0.14063763692975045, "num_tokens": 4192306.0, "step": 2385 }, { "entropy": 5.8980663299560545, "epoch": 1.873040752351097, "grad_norm": 1.1796875, "learning_rate": 0.0004846614689114409, "loss": 5.8643, "mean_token_accuracy": 0.14282563477754592, "num_tokens": 4201201.0, "step": 2390 }, { "entropy": 5.890587997436524, "epoch": 1.876959247648903, "grad_norm": 1.2265625, "learning_rate": 0.0004845521251282223, "loss": 5.7437, "mean_token_accuracy": 0.15185199975967406, "num_tokens": 4209374.0, "step": 2395 }, { "entropy": 5.809947061538696, "epoch": 1.8808777429467085, "grad_norm": 1.1953125, "learning_rate": 0.0004844424068413789, "loss": 5.7215, "mean_token_accuracy": 0.1469734065234661, "num_tokens": 4217258.0, "step": 2400 }, { "entropy": 5.848538589477539, "epoch": 1.884796238244514, "grad_norm": 1.0625, "learning_rate": 0.00048433231424699504, "loss": 5.7551, "mean_token_accuracy": 0.1480185203254223, "num_tokens": 4226484.0, "step": 2405 }, { "entropy": 5.924857568740845, "epoch": 1.8887147335423198, "grad_norm": 1.0703125, "learning_rate": 0.00048422184754182384, "loss": 5.7889, "mean_token_accuracy": 0.14311063811182975, "num_tokens": 4235546.0, "step": 2410 }, { "entropy": 5.950683832168579, "epoch": 1.8926332288401255, "grad_norm": 1.1484375, "learning_rate": 0.0004841110069232875, "loss": 5.9386, "mean_token_accuracy": 0.1451379805803299, "num_tokens": 4244549.0, "step": 2415 }, { "entropy": 5.900457239151001, "epoch": 1.896551724137931, "grad_norm": 1.1640625, "learning_rate": 0.00048399979258947597, "loss": 5.8331, "mean_token_accuracy": 0.1464390404522419, "num_tokens": 4252918.0, "step": 2420 }, { "entropy": 5.9453812599182125, "epoch": 1.9004702194357366, "grad_norm": 1.09375, "learning_rate": 0.0004838882047391474, "loss": 5.8327, "mean_token_accuracy": 0.14703057184815407, "num_tokens": 4261674.0, "step": 2425 }, { "entropy": 5.875091028213501, "epoch": 1.9043887147335423, "grad_norm": 1.171875, "learning_rate": 0.00048377624357172724, "loss": 5.7425, "mean_token_accuracy": 0.14618095010519028, "num_tokens": 4269986.0, "step": 2430 }, { "entropy": 5.890668630599976, "epoch": 1.908307210031348, "grad_norm": 1.15625, "learning_rate": 0.00048366390928730843, "loss": 5.8377, "mean_token_accuracy": 0.14733434692025185, "num_tokens": 4278131.0, "step": 2435 }, { "entropy": 5.966364622116089, "epoch": 1.9122257053291536, "grad_norm": 1.4453125, "learning_rate": 0.0004835512020866504, "loss": 5.8243, "mean_token_accuracy": 0.14588867947459222, "num_tokens": 4287214.0, "step": 2440 }, { "entropy": 5.893796443939209, "epoch": 1.9161442006269591, "grad_norm": 1.140625, "learning_rate": 0.00048343812217117925, "loss": 5.7504, "mean_token_accuracy": 0.1461929127573967, "num_tokens": 4295949.0, "step": 2445 }, { "entropy": 5.786278104782104, "epoch": 1.9200626959247649, "grad_norm": 1.1015625, "learning_rate": 0.00048332466974298723, "loss": 5.7543, "mean_token_accuracy": 0.1465558797121048, "num_tokens": 4305014.0, "step": 2450 }, { "entropy": 5.851295328140258, "epoch": 1.9239811912225706, "grad_norm": 1.1484375, "learning_rate": 0.00048321084500483203, "loss": 5.7265, "mean_token_accuracy": 0.14898128807544708, "num_tokens": 4313560.0, "step": 2455 }, { "entropy": 5.87207818031311, "epoch": 1.9278996865203761, "grad_norm": 1.0859375, "learning_rate": 0.000483096648160137, "loss": 5.8056, "mean_token_accuracy": 0.1454322248697281, "num_tokens": 4323280.0, "step": 2460 }, { "entropy": 5.872972869873047, "epoch": 1.9318181818181817, "grad_norm": 1.1875, "learning_rate": 0.00048298207941299047, "loss": 5.861, "mean_token_accuracy": 0.14596157446503638, "num_tokens": 4331757.0, "step": 2465 }, { "entropy": 5.8036915302276615, "epoch": 1.9357366771159876, "grad_norm": 1.1015625, "learning_rate": 0.00048286713896814536, "loss": 5.7876, "mean_token_accuracy": 0.14330902695655823, "num_tokens": 4341453.0, "step": 2470 }, { "entropy": 6.028292417526245, "epoch": 1.9396551724137931, "grad_norm": 1.1484375, "learning_rate": 0.00048275182703101877, "loss": 5.7431, "mean_token_accuracy": 0.14791915863752364, "num_tokens": 4349095.0, "step": 2475 }, { "entropy": 5.750636386871338, "epoch": 1.9435736677115987, "grad_norm": 1.0703125, "learning_rate": 0.00048263614380769193, "loss": 5.8404, "mean_token_accuracy": 0.14387739300727845, "num_tokens": 4357913.0, "step": 2480 }, { "entropy": 5.927936792373657, "epoch": 1.9474921630094044, "grad_norm": 1.15625, "learning_rate": 0.00048252008950490957, "loss": 5.8083, "mean_token_accuracy": 0.14284604787826538, "num_tokens": 4366586.0, "step": 2485 }, { "entropy": 5.8729980945587155, "epoch": 1.9514106583072102, "grad_norm": 1.1484375, "learning_rate": 0.00048240366433007935, "loss": 5.7307, "mean_token_accuracy": 0.15130855292081832, "num_tokens": 4375393.0, "step": 2490 }, { "entropy": 5.848481798171997, "epoch": 1.9553291536050157, "grad_norm": 1.109375, "learning_rate": 0.00048228686849127213, "loss": 5.8159, "mean_token_accuracy": 0.15051234513521194, "num_tokens": 4383747.0, "step": 2495 }, { "entropy": 5.987670612335205, "epoch": 1.9592476489028212, "grad_norm": 1.0859375, "learning_rate": 0.0004821697021972209, "loss": 5.7804, "mean_token_accuracy": 0.14112372547388077, "num_tokens": 4392085.0, "step": 2500 }, { "epoch": 1.9592476489028212, "eval_entropy": 5.819716327874235, "eval_loss": 5.9731364250183105, "eval_mean_token_accuracy": 0.14396101977791667, "eval_num_tokens": 4392085.0, "eval_runtime": 2.8283, "eval_samples_per_second": 1457.403, "eval_steps_per_second": 182.441, "step": 2500 }, { "entropy": 5.882195997238159, "epoch": 1.963166144200627, "grad_norm": 1.2265625, "learning_rate": 0.0004820521656573208, "loss": 5.8043, "mean_token_accuracy": 0.14567812159657478, "num_tokens": 4400843.0, "step": 2505 }, { "entropy": 5.773218107223511, "epoch": 1.9670846394984327, "grad_norm": 1.15625, "learning_rate": 0.0004819342590816288, "loss": 5.7471, "mean_token_accuracy": 0.14761917144060135, "num_tokens": 4409126.0, "step": 2510 }, { "entropy": 5.89848108291626, "epoch": 1.9710031347962382, "grad_norm": 1.1015625, "learning_rate": 0.0004818159826808631, "loss": 5.9097, "mean_token_accuracy": 0.1417311027646065, "num_tokens": 4418401.0, "step": 2515 }, { "entropy": 5.897497129440308, "epoch": 1.9749216300940438, "grad_norm": 1.0703125, "learning_rate": 0.0004816973366664026, "loss": 5.811, "mean_token_accuracy": 0.1456060990691185, "num_tokens": 4428141.0, "step": 2520 }, { "entropy": 5.986608266830444, "epoch": 1.9788401253918495, "grad_norm": 1.078125, "learning_rate": 0.0004815783212502871, "loss": 5.828, "mean_token_accuracy": 0.1428292214870453, "num_tokens": 4437158.0, "step": 2525 }, { "entropy": 5.895078134536743, "epoch": 1.9827586206896552, "grad_norm": 1.234375, "learning_rate": 0.00048145893664521645, "loss": 5.8973, "mean_token_accuracy": 0.13774282485246658, "num_tokens": 4445901.0, "step": 2530 }, { "entropy": 5.878189134597778, "epoch": 1.9866771159874608, "grad_norm": 1.1875, "learning_rate": 0.00048133918306455023, "loss": 5.7054, "mean_token_accuracy": 0.15397633910179137, "num_tokens": 4454200.0, "step": 2535 }, { "entropy": 5.8978513240814205, "epoch": 1.9905956112852663, "grad_norm": 1.1171875, "learning_rate": 0.0004812190607223075, "loss": 5.8244, "mean_token_accuracy": 0.1423790991306305, "num_tokens": 4463484.0, "step": 2540 }, { "entropy": 5.743094491958618, "epoch": 1.9945141065830723, "grad_norm": 1.109375, "learning_rate": 0.00048109856983316655, "loss": 5.6436, "mean_token_accuracy": 0.1538312703371048, "num_tokens": 4472659.0, "step": 2545 }, { "entropy": 5.894764852523804, "epoch": 1.9984326018808778, "grad_norm": 1.1328125, "learning_rate": 0.000480977710612464, "loss": 5.7289, "mean_token_accuracy": 0.1462229423224926, "num_tokens": 4481355.0, "step": 2550 }, { "entropy": 5.841469097137451, "epoch": 2.0023510971786833, "grad_norm": 1.0703125, "learning_rate": 0.0004808564832761948, "loss": 5.5879, "mean_token_accuracy": 0.1482195809483528, "num_tokens": 4490005.0, "step": 2555 }, { "entropy": 5.781640338897705, "epoch": 2.006269592476489, "grad_norm": 1.0625, "learning_rate": 0.0004807348880410119, "loss": 5.3827, "mean_token_accuracy": 0.1616608127951622, "num_tokens": 4498645.0, "step": 2560 }, { "entropy": 5.733411645889282, "epoch": 2.010188087774295, "grad_norm": 1.0703125, "learning_rate": 0.0004806129251242258, "loss": 5.5173, "mean_token_accuracy": 0.15624455660581588, "num_tokens": 4509675.0, "step": 2565 }, { "entropy": 5.799925708770752, "epoch": 2.0141065830721003, "grad_norm": 1.0625, "learning_rate": 0.00048049059474380393, "loss": 5.3307, "mean_token_accuracy": 0.1625298425555229, "num_tokens": 4518463.0, "step": 2570 }, { "entropy": 5.857757616043091, "epoch": 2.018025078369906, "grad_norm": 1.1328125, "learning_rate": 0.00048036789711837047, "loss": 5.4816, "mean_token_accuracy": 0.15710628032684326, "num_tokens": 4527120.0, "step": 2575 }, { "entropy": 5.79098744392395, "epoch": 2.0219435736677114, "grad_norm": 1.1484375, "learning_rate": 0.00048024483246720607, "loss": 5.5141, "mean_token_accuracy": 0.1521046057343483, "num_tokens": 4535962.0, "step": 2580 }, { "entropy": 5.795133399963379, "epoch": 2.0258620689655173, "grad_norm": 1.0390625, "learning_rate": 0.0004801214010102472, "loss": 5.4748, "mean_token_accuracy": 0.15003894567489623, "num_tokens": 4545233.0, "step": 2585 }, { "entropy": 5.720596790313721, "epoch": 2.029780564263323, "grad_norm": 1.03125, "learning_rate": 0.0004799976029680858, "loss": 5.351, "mean_token_accuracy": 0.1570914052426815, "num_tokens": 4554658.0, "step": 2590 }, { "entropy": 5.793403244018554, "epoch": 2.0336990595611284, "grad_norm": 1.125, "learning_rate": 0.0004798734385619691, "loss": 5.5496, "mean_token_accuracy": 0.1535790517926216, "num_tokens": 4563964.0, "step": 2595 }, { "entropy": 5.807430982589722, "epoch": 2.0376175548589344, "grad_norm": 1.0625, "learning_rate": 0.000479748908013799, "loss": 5.4134, "mean_token_accuracy": 0.16276097446680068, "num_tokens": 4572865.0, "step": 2600 }, { "entropy": 5.672877264022827, "epoch": 2.04153605015674, "grad_norm": 1.125, "learning_rate": 0.0004796240115461319, "loss": 5.3736, "mean_token_accuracy": 0.17069609314203263, "num_tokens": 4581021.0, "step": 2605 }, { "entropy": 5.710117959976197, "epoch": 2.0454545454545454, "grad_norm": 1.0078125, "learning_rate": 0.0004794987493821779, "loss": 5.4687, "mean_token_accuracy": 0.15729401111602784, "num_tokens": 4590779.0, "step": 2610 }, { "entropy": 5.794814777374268, "epoch": 2.049373040752351, "grad_norm": 1.046875, "learning_rate": 0.00047937312174580084, "loss": 5.4478, "mean_token_accuracy": 0.1574981316924095, "num_tokens": 4599831.0, "step": 2615 }, { "entropy": 5.857023143768311, "epoch": 2.053291536050157, "grad_norm": 1.25, "learning_rate": 0.0004792471288615177, "loss": 5.5945, "mean_token_accuracy": 0.1500529244542122, "num_tokens": 4609264.0, "step": 2620 }, { "entropy": 5.8015649795532225, "epoch": 2.0572100313479624, "grad_norm": 1.0625, "learning_rate": 0.0004791207709544981, "loss": 5.5107, "mean_token_accuracy": 0.15906523615121843, "num_tokens": 4617885.0, "step": 2625 }, { "entropy": 5.6719653606414795, "epoch": 2.061128526645768, "grad_norm": 1.125, "learning_rate": 0.00047899404825056424, "loss": 5.4279, "mean_token_accuracy": 0.15619982779026031, "num_tokens": 4626536.0, "step": 2630 }, { "entropy": 5.770504665374756, "epoch": 2.0650470219435735, "grad_norm": 1.1796875, "learning_rate": 0.0004788669609761901, "loss": 5.5118, "mean_token_accuracy": 0.1544651284813881, "num_tokens": 4635563.0, "step": 2635 }, { "entropy": 5.720648860931396, "epoch": 2.0689655172413794, "grad_norm": 1.1328125, "learning_rate": 0.00047873950935850107, "loss": 5.4414, "mean_token_accuracy": 0.1552010580897331, "num_tokens": 4644822.0, "step": 2640 }, { "entropy": 5.75403094291687, "epoch": 2.072884012539185, "grad_norm": 1.234375, "learning_rate": 0.0004786116936252742, "loss": 5.4081, "mean_token_accuracy": 0.1630728781223297, "num_tokens": 4652709.0, "step": 2645 }, { "entropy": 5.651195621490478, "epoch": 2.0768025078369905, "grad_norm": 1.15625, "learning_rate": 0.0004784835140049367, "loss": 5.447, "mean_token_accuracy": 0.15986524671316146, "num_tokens": 4662014.0, "step": 2650 }, { "entropy": 5.7658521175384525, "epoch": 2.080721003134796, "grad_norm": 1.1875, "learning_rate": 0.0004783549707265663, "loss": 5.4033, "mean_token_accuracy": 0.15730245560407638, "num_tokens": 4670618.0, "step": 2655 }, { "entropy": 5.7565501689910885, "epoch": 2.084639498432602, "grad_norm": 1.1171875, "learning_rate": 0.00047822606401989084, "loss": 5.5384, "mean_token_accuracy": 0.14618516638875007, "num_tokens": 4679937.0, "step": 2660 }, { "entropy": 5.747328805923462, "epoch": 2.0885579937304075, "grad_norm": 1.1875, "learning_rate": 0.0004780967941152873, "loss": 5.3716, "mean_token_accuracy": 0.16848112791776657, "num_tokens": 4688006.0, "step": 2665 }, { "entropy": 5.598170948028565, "epoch": 2.092476489028213, "grad_norm": 1.09375, "learning_rate": 0.0004779671612437822, "loss": 5.3212, "mean_token_accuracy": 0.16888306885957718, "num_tokens": 4697106.0, "step": 2670 }, { "entropy": 5.709634304046631, "epoch": 2.096394984326019, "grad_norm": 1.171875, "learning_rate": 0.00047783716563705035, "loss": 5.4699, "mean_token_accuracy": 0.15956653356552125, "num_tokens": 4705582.0, "step": 2675 }, { "entropy": 5.789704513549805, "epoch": 2.1003134796238245, "grad_norm": 1.171875, "learning_rate": 0.000477706807527415, "loss": 5.4032, "mean_token_accuracy": 0.1633769229054451, "num_tokens": 4713640.0, "step": 2680 }, { "entropy": 5.7358156681060795, "epoch": 2.10423197492163, "grad_norm": 1.09375, "learning_rate": 0.0004775760871478472, "loss": 5.5193, "mean_token_accuracy": 0.1518564686179161, "num_tokens": 4722729.0, "step": 2685 }, { "entropy": 5.710189342498779, "epoch": 2.1081504702194356, "grad_norm": 1.2109375, "learning_rate": 0.00047744500473196564, "loss": 5.4226, "mean_token_accuracy": 0.16306858509778976, "num_tokens": 4732099.0, "step": 2690 }, { "entropy": 5.778943872451782, "epoch": 2.1120689655172415, "grad_norm": 1.2265625, "learning_rate": 0.00047731356051403556, "loss": 5.4259, "mean_token_accuracy": 0.15491524189710618, "num_tokens": 4741766.0, "step": 2695 }, { "entropy": 5.550606107711792, "epoch": 2.115987460815047, "grad_norm": 1.046875, "learning_rate": 0.0004771817547289693, "loss": 5.3501, "mean_token_accuracy": 0.16072812229394912, "num_tokens": 4750890.0, "step": 2700 }, { "entropy": 5.691572904586792, "epoch": 2.1199059561128526, "grad_norm": 1.171875, "learning_rate": 0.0004770495876123251, "loss": 5.4713, "mean_token_accuracy": 0.15760626047849655, "num_tokens": 4760063.0, "step": 2705 }, { "entropy": 5.7919927597045895, "epoch": 2.123824451410658, "grad_norm": 1.1640625, "learning_rate": 0.0004769170594003071, "loss": 5.4737, "mean_token_accuracy": 0.16406295895576478, "num_tokens": 4768928.0, "step": 2710 }, { "entropy": 5.65070915222168, "epoch": 2.127742946708464, "grad_norm": 1.109375, "learning_rate": 0.00047678417032976457, "loss": 5.4532, "mean_token_accuracy": 0.15579911768436433, "num_tokens": 4778129.0, "step": 2715 }, { "entropy": 5.69934196472168, "epoch": 2.1316614420062696, "grad_norm": 1.078125, "learning_rate": 0.0004766509206381919, "loss": 5.4479, "mean_token_accuracy": 0.1599138170480728, "num_tokens": 4786759.0, "step": 2720 }, { "entropy": 5.6193219184875485, "epoch": 2.135579937304075, "grad_norm": 1.1484375, "learning_rate": 0.0004765173105637279, "loss": 5.4547, "mean_token_accuracy": 0.16395413279533386, "num_tokens": 4795355.0, "step": 2725 }, { "entropy": 5.696455669403076, "epoch": 2.139498432601881, "grad_norm": 1.1328125, "learning_rate": 0.00047638334034515547, "loss": 5.3964, "mean_token_accuracy": 0.15683644711971284, "num_tokens": 4804030.0, "step": 2730 }, { "entropy": 5.580397653579712, "epoch": 2.1434169278996866, "grad_norm": 1.09375, "learning_rate": 0.00047624901022190106, "loss": 5.3523, "mean_token_accuracy": 0.17931961715221406, "num_tokens": 4812779.0, "step": 2735 }, { "entropy": 5.7006062984466555, "epoch": 2.147335423197492, "grad_norm": 1.15625, "learning_rate": 0.00047611432043403437, "loss": 5.4629, "mean_token_accuracy": 0.16019529104232788, "num_tokens": 4822292.0, "step": 2740 }, { "entropy": 5.834177780151367, "epoch": 2.1512539184952977, "grad_norm": 1.2421875, "learning_rate": 0.0004759792712222679, "loss": 5.535, "mean_token_accuracy": 0.15739217698574065, "num_tokens": 4830524.0, "step": 2745 }, { "entropy": 5.8059389114379885, "epoch": 2.1551724137931036, "grad_norm": 1.171875, "learning_rate": 0.0004758438628279565, "loss": 5.5164, "mean_token_accuracy": 0.15928612202405928, "num_tokens": 4839328.0, "step": 2750 }, { "entropy": 5.638897466659546, "epoch": 2.159090909090909, "grad_norm": 1.109375, "learning_rate": 0.00047570809549309697, "loss": 5.3591, "mean_token_accuracy": 0.1710207626223564, "num_tokens": 4847711.0, "step": 2755 }, { "entropy": 5.774710035324096, "epoch": 2.1630094043887147, "grad_norm": 1.484375, "learning_rate": 0.0004755719694603275, "loss": 5.549, "mean_token_accuracy": 0.15182463973760604, "num_tokens": 4856706.0, "step": 2760 }, { "entropy": 5.728883123397827, "epoch": 2.16692789968652, "grad_norm": 1.1953125, "learning_rate": 0.0004754354849729274, "loss": 5.5423, "mean_token_accuracy": 0.15759230852127076, "num_tokens": 4865250.0, "step": 2765 }, { "entropy": 5.702935409545899, "epoch": 2.170846394984326, "grad_norm": 1.2109375, "learning_rate": 0.00047529864227481653, "loss": 5.4631, "mean_token_accuracy": 0.15977989733219147, "num_tokens": 4873826.0, "step": 2770 }, { "entropy": 5.781952810287476, "epoch": 2.1747648902821317, "grad_norm": 1.140625, "learning_rate": 0.000475161441610555, "loss": 5.44, "mean_token_accuracy": 0.16142217814922333, "num_tokens": 4882631.0, "step": 2775 }, { "entropy": 5.665818929672241, "epoch": 2.1786833855799372, "grad_norm": 1.1875, "learning_rate": 0.0004750238832253427, "loss": 5.4775, "mean_token_accuracy": 0.15787807181477548, "num_tokens": 4891007.0, "step": 2780 }, { "entropy": 5.697315788269043, "epoch": 2.1826018808777428, "grad_norm": 1.1875, "learning_rate": 0.0004748859673650187, "loss": 5.386, "mean_token_accuracy": 0.16155248284339904, "num_tokens": 4899692.0, "step": 2785 }, { "entropy": 5.658601570129394, "epoch": 2.1865203761755487, "grad_norm": 1.1015625, "learning_rate": 0.00047474769427606115, "loss": 5.5095, "mean_token_accuracy": 0.15856993645429612, "num_tokens": 4909354.0, "step": 2790 }, { "entropy": 5.6818701267242435, "epoch": 2.1904388714733543, "grad_norm": 1.1328125, "learning_rate": 0.0004746090642055863, "loss": 5.4671, "mean_token_accuracy": 0.1562755212187767, "num_tokens": 4918400.0, "step": 2795 }, { "entropy": 5.739003992080688, "epoch": 2.19435736677116, "grad_norm": 1.15625, "learning_rate": 0.00047447007740134857, "loss": 5.5467, "mean_token_accuracy": 0.14892319440841675, "num_tokens": 4926852.0, "step": 2800 }, { "entropy": 5.7248610496521, "epoch": 2.1982758620689653, "grad_norm": 1.1015625, "learning_rate": 0.00047433073411174, "loss": 5.4509, "mean_token_accuracy": 0.15806010216474534, "num_tokens": 4935717.0, "step": 2805 }, { "entropy": 5.591716861724853, "epoch": 2.2021943573667713, "grad_norm": 1.0625, "learning_rate": 0.0004741910345857896, "loss": 5.3813, "mean_token_accuracy": 0.16058638840913772, "num_tokens": 4945389.0, "step": 2810 }, { "entropy": 5.752324533462525, "epoch": 2.206112852664577, "grad_norm": 1.2578125, "learning_rate": 0.00047405097907316315, "loss": 5.5201, "mean_token_accuracy": 0.1630863979458809, "num_tokens": 4953670.0, "step": 2815 }, { "entropy": 5.657805633544922, "epoch": 2.2100313479623823, "grad_norm": 1.109375, "learning_rate": 0.0004739105678241625, "loss": 5.5289, "mean_token_accuracy": 0.15918288826942445, "num_tokens": 4962383.0, "step": 2820 }, { "entropy": 5.662711334228516, "epoch": 2.2139498432601883, "grad_norm": 1.2265625, "learning_rate": 0.0004737698010897253, "loss": 5.3295, "mean_token_accuracy": 0.16085091829299927, "num_tokens": 4970672.0, "step": 2825 }, { "entropy": 5.757535028457641, "epoch": 2.217868338557994, "grad_norm": 1.1796875, "learning_rate": 0.0004736286791214245, "loss": 5.5162, "mean_token_accuracy": 0.15798387676477432, "num_tokens": 4979303.0, "step": 2830 }, { "entropy": 5.72713942527771, "epoch": 2.2217868338557993, "grad_norm": 1.15625, "learning_rate": 0.00047348720217146807, "loss": 5.3865, "mean_token_accuracy": 0.1653503268957138, "num_tokens": 4988322.0, "step": 2835 }, { "entropy": 5.582199764251709, "epoch": 2.225705329153605, "grad_norm": 1.1953125, "learning_rate": 0.00047334537049269806, "loss": 5.4754, "mean_token_accuracy": 0.15811690539121628, "num_tokens": 4997250.0, "step": 2840 }, { "entropy": 5.754626846313476, "epoch": 2.229623824451411, "grad_norm": 1.2578125, "learning_rate": 0.0004732031843385909, "loss": 5.5191, "mean_token_accuracy": 0.151338791847229, "num_tokens": 5005459.0, "step": 2845 }, { "entropy": 5.570264291763306, "epoch": 2.2335423197492164, "grad_norm": 1.1875, "learning_rate": 0.0004730606439632562, "loss": 5.2819, "mean_token_accuracy": 0.17884395718574525, "num_tokens": 5013704.0, "step": 2850 }, { "entropy": 5.602528190612793, "epoch": 2.237460815047022, "grad_norm": 1.21875, "learning_rate": 0.0004729177496214367, "loss": 5.3976, "mean_token_accuracy": 0.16089559048414231, "num_tokens": 5022545.0, "step": 2855 }, { "entropy": 5.619579839706421, "epoch": 2.2413793103448274, "grad_norm": 1.1484375, "learning_rate": 0.00047277450156850767, "loss": 5.4026, "mean_token_accuracy": 0.16599346250295638, "num_tokens": 5031133.0, "step": 2860 }, { "entropy": 5.6934120655059814, "epoch": 2.2452978056426334, "grad_norm": 1.21875, "learning_rate": 0.0004726309000604768, "loss": 5.4422, "mean_token_accuracy": 0.15853513032197952, "num_tokens": 5039424.0, "step": 2865 }, { "entropy": 5.7841479778289795, "epoch": 2.249216300940439, "grad_norm": 1.1640625, "learning_rate": 0.0004724869453539832, "loss": 5.6437, "mean_token_accuracy": 0.15349650308489798, "num_tokens": 5048868.0, "step": 2870 }, { "entropy": 5.743923711776733, "epoch": 2.2531347962382444, "grad_norm": 1.125, "learning_rate": 0.0004723426377062972, "loss": 5.4742, "mean_token_accuracy": 0.15738968551158905, "num_tokens": 5057532.0, "step": 2875 }, { "entropy": 5.752024412155151, "epoch": 2.2570532915360504, "grad_norm": 1.1328125, "learning_rate": 0.00047219797737532, "loss": 5.4875, "mean_token_accuracy": 0.15550234615802766, "num_tokens": 5066088.0, "step": 2880 }, { "entropy": 5.5935587882995605, "epoch": 2.260971786833856, "grad_norm": 1.1953125, "learning_rate": 0.00047205296461958314, "loss": 5.3623, "mean_token_accuracy": 0.1605392187833786, "num_tokens": 5074968.0, "step": 2885 }, { "entropy": 5.607887029647827, "epoch": 2.2648902821316614, "grad_norm": 1.1328125, "learning_rate": 0.00047190759969824785, "loss": 5.4266, "mean_token_accuracy": 0.1619516670703888, "num_tokens": 5083605.0, "step": 2890 }, { "entropy": 5.665104103088379, "epoch": 2.268808777429467, "grad_norm": 1.140625, "learning_rate": 0.00047176188287110485, "loss": 5.4267, "mean_token_accuracy": 0.16131291091442107, "num_tokens": 5092182.0, "step": 2895 }, { "entropy": 5.784565830230713, "epoch": 2.2727272727272725, "grad_norm": 1.125, "learning_rate": 0.0004716158143985737, "loss": 5.5997, "mean_token_accuracy": 0.15428409725427628, "num_tokens": 5100910.0, "step": 2900 }, { "entropy": 5.705980825424194, "epoch": 2.2766457680250785, "grad_norm": 1.1875, "learning_rate": 0.00047146939454170245, "loss": 5.4627, "mean_token_accuracy": 0.16400493532419205, "num_tokens": 5109760.0, "step": 2905 }, { "entropy": 5.720902061462402, "epoch": 2.280564263322884, "grad_norm": 1.203125, "learning_rate": 0.0004713226235621672, "loss": 5.4957, "mean_token_accuracy": 0.15792311280965804, "num_tokens": 5119193.0, "step": 2910 }, { "entropy": 5.61166467666626, "epoch": 2.2844827586206895, "grad_norm": 1.171875, "learning_rate": 0.0004711755017222714, "loss": 5.4122, "mean_token_accuracy": 0.1576576665043831, "num_tokens": 5127480.0, "step": 2915 }, { "entropy": 5.743918991088867, "epoch": 2.2884012539184955, "grad_norm": 1.2578125, "learning_rate": 0.00047102802928494563, "loss": 5.5857, "mean_token_accuracy": 0.15232446938753127, "num_tokens": 5136648.0, "step": 2920 }, { "entropy": 5.688281393051147, "epoch": 2.292319749216301, "grad_norm": 1.1953125, "learning_rate": 0.0004708802065137471, "loss": 5.3771, "mean_token_accuracy": 0.16786309629678725, "num_tokens": 5144708.0, "step": 2925 }, { "entropy": 5.643393802642822, "epoch": 2.2962382445141065, "grad_norm": 1.1328125, "learning_rate": 0.0004707320336728591, "loss": 5.5237, "mean_token_accuracy": 0.15052489414811135, "num_tokens": 5153388.0, "step": 2930 }, { "entropy": 5.797506046295166, "epoch": 2.300156739811912, "grad_norm": 1.2265625, "learning_rate": 0.0004705835110270904, "loss": 5.4824, "mean_token_accuracy": 0.15724072754383087, "num_tokens": 5161874.0, "step": 2935 }, { "entropy": 5.700078773498535, "epoch": 2.304075235109718, "grad_norm": 1.140625, "learning_rate": 0.00047043463884187517, "loss": 5.4791, "mean_token_accuracy": 0.16156153082847596, "num_tokens": 5170284.0, "step": 2940 }, { "entropy": 5.583881044387818, "epoch": 2.3079937304075235, "grad_norm": 1.1640625, "learning_rate": 0.00047028541738327207, "loss": 5.44, "mean_token_accuracy": 0.15695979446172714, "num_tokens": 5179547.0, "step": 2945 }, { "entropy": 5.691956996917725, "epoch": 2.311912225705329, "grad_norm": 1.28125, "learning_rate": 0.0004701358469179641, "loss": 5.4472, "mean_token_accuracy": 0.16403508335351943, "num_tokens": 5188256.0, "step": 2950 }, { "entropy": 5.756633996963501, "epoch": 2.3158307210031346, "grad_norm": 1.2265625, "learning_rate": 0.0004699859277132578, "loss": 5.5902, "mean_token_accuracy": 0.16007334217429162, "num_tokens": 5197464.0, "step": 2955 }, { "entropy": 5.734570837020874, "epoch": 2.3197492163009406, "grad_norm": 1.1796875, "learning_rate": 0.00046983566003708336, "loss": 5.3792, "mean_token_accuracy": 0.16429649144411088, "num_tokens": 5205989.0, "step": 2960 }, { "entropy": 5.587563323974609, "epoch": 2.323667711598746, "grad_norm": 1.2734375, "learning_rate": 0.00046968504415799325, "loss": 5.4346, "mean_token_accuracy": 0.1603007957339287, "num_tokens": 5214490.0, "step": 2965 }, { "entropy": 5.64924201965332, "epoch": 2.3275862068965516, "grad_norm": 1.109375, "learning_rate": 0.0004695340803451625, "loss": 5.464, "mean_token_accuracy": 0.16044287830591203, "num_tokens": 5223335.0, "step": 2970 }, { "entropy": 5.629110479354859, "epoch": 2.3315047021943576, "grad_norm": 1.125, "learning_rate": 0.0004693827688683879, "loss": 5.4147, "mean_token_accuracy": 0.15993678867816924, "num_tokens": 5231665.0, "step": 2975 }, { "entropy": 5.594243144989013, "epoch": 2.335423197492163, "grad_norm": 1.265625, "learning_rate": 0.0004692311099980878, "loss": 5.4005, "mean_token_accuracy": 0.16494683176279068, "num_tokens": 5239810.0, "step": 2980 }, { "entropy": 5.5794525146484375, "epoch": 2.3393416927899686, "grad_norm": 1.2109375, "learning_rate": 0.00046907910400530097, "loss": 5.369, "mean_token_accuracy": 0.16893674433231354, "num_tokens": 5247525.0, "step": 2985 }, { "entropy": 5.594454145431518, "epoch": 2.343260188087774, "grad_norm": 1.15625, "learning_rate": 0.0004689267511616868, "loss": 5.4189, "mean_token_accuracy": 0.15772538781166076, "num_tokens": 5255528.0, "step": 2990 }, { "entropy": 5.617967176437378, "epoch": 2.34717868338558, "grad_norm": 1.1640625, "learning_rate": 0.00046877405173952465, "loss": 5.3633, "mean_token_accuracy": 0.17105703949928283, "num_tokens": 5263944.0, "step": 2995 }, { "entropy": 5.657676124572754, "epoch": 2.3510971786833856, "grad_norm": 1.2421875, "learning_rate": 0.000468621006011713, "loss": 5.4912, "mean_token_accuracy": 0.15778652876615523, "num_tokens": 5271724.0, "step": 3000 }, { "epoch": 2.3510971786833856, "eval_entropy": 5.565460778021997, "eval_loss": 5.932183742523193, "eval_mean_token_accuracy": 0.14874126017815614, "eval_num_tokens": 5271724.0, "eval_runtime": 2.835, "eval_samples_per_second": 1453.963, "eval_steps_per_second": 182.01, "step": 3000 }, { "entropy": 5.714021825790406, "epoch": 2.355015673981191, "grad_norm": 1.171875, "learning_rate": 0.00046846761425176943, "loss": 5.5009, "mean_token_accuracy": 0.16061849147081375, "num_tokens": 5281199.0, "step": 3005 }, { "entropy": 5.739114904403687, "epoch": 2.3589341692789967, "grad_norm": 1.1875, "learning_rate": 0.0004683138767338299, "loss": 5.5238, "mean_token_accuracy": 0.15723237693309783, "num_tokens": 5289782.0, "step": 3010 }, { "entropy": 5.584011459350586, "epoch": 2.3628526645768027, "grad_norm": 1.2421875, "learning_rate": 0.0004681597937326483, "loss": 5.3349, "mean_token_accuracy": 0.16592346727848054, "num_tokens": 5297922.0, "step": 3015 }, { "entropy": 5.6475341796875, "epoch": 2.366771159874608, "grad_norm": 1.2109375, "learning_rate": 0.0004680053655235959, "loss": 5.4798, "mean_token_accuracy": 0.16429754048585893, "num_tokens": 5306178.0, "step": 3020 }, { "entropy": 5.601258897781372, "epoch": 2.3706896551724137, "grad_norm": 1.265625, "learning_rate": 0.0004678505923826609, "loss": 5.451, "mean_token_accuracy": 0.16447941958904266, "num_tokens": 5314936.0, "step": 3025 }, { "entropy": 5.6725733280181885, "epoch": 2.3746081504702197, "grad_norm": 1.171875, "learning_rate": 0.00046769547458644817, "loss": 5.4771, "mean_token_accuracy": 0.1562672033905983, "num_tokens": 5323549.0, "step": 3030 }, { "entropy": 5.661346340179444, "epoch": 2.378526645768025, "grad_norm": 1.1171875, "learning_rate": 0.0004675400124121782, "loss": 5.4679, "mean_token_accuracy": 0.15456231087446212, "num_tokens": 5332310.0, "step": 3035 }, { "entropy": 5.744946050643921, "epoch": 2.3824451410658307, "grad_norm": 1.1328125, "learning_rate": 0.00046738420613768716, "loss": 5.5901, "mean_token_accuracy": 0.15420550107955933, "num_tokens": 5340757.0, "step": 3040 }, { "entropy": 5.799703979492188, "epoch": 2.3863636363636362, "grad_norm": 1.1875, "learning_rate": 0.00046722805604142614, "loss": 5.573, "mean_token_accuracy": 0.1541631817817688, "num_tokens": 5349433.0, "step": 3045 }, { "entropy": 5.661766195297242, "epoch": 2.3902821316614418, "grad_norm": 1.1171875, "learning_rate": 0.00046707156240246076, "loss": 5.5251, "mean_token_accuracy": 0.15046581178903579, "num_tokens": 5358943.0, "step": 3050 }, { "entropy": 5.66755428314209, "epoch": 2.3942006269592477, "grad_norm": 1.2421875, "learning_rate": 0.00046691472550047027, "loss": 5.4398, "mean_token_accuracy": 0.16168949156999587, "num_tokens": 5367782.0, "step": 3055 }, { "entropy": 5.573693132400512, "epoch": 2.3981191222570533, "grad_norm": 1.2109375, "learning_rate": 0.00046675754561574783, "loss": 5.3238, "mean_token_accuracy": 0.17014608532190323, "num_tokens": 5376186.0, "step": 3060 }, { "entropy": 5.694865655899048, "epoch": 2.402037617554859, "grad_norm": 1.203125, "learning_rate": 0.00046660002302919933, "loss": 5.5838, "mean_token_accuracy": 0.15487318187952043, "num_tokens": 5384888.0, "step": 3065 }, { "entropy": 5.747969770431519, "epoch": 2.4059561128526648, "grad_norm": 1.2109375, "learning_rate": 0.0004664421580223433, "loss": 5.4618, "mean_token_accuracy": 0.1624412640929222, "num_tokens": 5393394.0, "step": 3070 }, { "entropy": 5.710650634765625, "epoch": 2.4098746081504703, "grad_norm": 1.15625, "learning_rate": 0.00046628395087730995, "loss": 5.5272, "mean_token_accuracy": 0.15911675691604615, "num_tokens": 5402901.0, "step": 3075 }, { "entropy": 5.690752744674683, "epoch": 2.413793103448276, "grad_norm": 1.125, "learning_rate": 0.0004661254018768411, "loss": 5.5223, "mean_token_accuracy": 0.15797928348183632, "num_tokens": 5412704.0, "step": 3080 }, { "entropy": 5.631660890579224, "epoch": 2.4177115987460813, "grad_norm": 1.2109375, "learning_rate": 0.0004659665113042897, "loss": 5.4885, "mean_token_accuracy": 0.16516524255275727, "num_tokens": 5420863.0, "step": 3085 }, { "entropy": 5.601195001602173, "epoch": 2.4216300940438873, "grad_norm": 1.234375, "learning_rate": 0.0004658072794436187, "loss": 5.4378, "mean_token_accuracy": 0.1611546367406845, "num_tokens": 5430002.0, "step": 3090 }, { "entropy": 5.615525960922241, "epoch": 2.425548589341693, "grad_norm": 1.2578125, "learning_rate": 0.00046564770657940146, "loss": 5.4165, "mean_token_accuracy": 0.16358508318662643, "num_tokens": 5438424.0, "step": 3095 }, { "entropy": 5.588921403884887, "epoch": 2.4294670846394983, "grad_norm": 1.1171875, "learning_rate": 0.0004654877929968205, "loss": 5.4199, "mean_token_accuracy": 0.16612940281629562, "num_tokens": 5446970.0, "step": 3100 }, { "entropy": 5.691753959655761, "epoch": 2.433385579937304, "grad_norm": 1.109375, "learning_rate": 0.0004653275389816673, "loss": 5.5083, "mean_token_accuracy": 0.15316883698105813, "num_tokens": 5456689.0, "step": 3105 }, { "entropy": 5.600510358810425, "epoch": 2.43730407523511, "grad_norm": 1.09375, "learning_rate": 0.00046516694482034174, "loss": 5.3974, "mean_token_accuracy": 0.16697040051221848, "num_tokens": 5465269.0, "step": 3110 }, { "entropy": 5.6311359882354735, "epoch": 2.4412225705329154, "grad_norm": 1.1484375, "learning_rate": 0.00046500601079985164, "loss": 5.4653, "mean_token_accuracy": 0.1607479929924011, "num_tokens": 5473930.0, "step": 3115 }, { "entropy": 5.596141195297241, "epoch": 2.445141065830721, "grad_norm": 1.1953125, "learning_rate": 0.0004648447372078123, "loss": 5.4947, "mean_token_accuracy": 0.1638297162950039, "num_tokens": 5482885.0, "step": 3120 }, { "entropy": 5.646732997894287, "epoch": 2.449059561128527, "grad_norm": 1.2265625, "learning_rate": 0.0004646831243324457, "loss": 5.4329, "mean_token_accuracy": 0.16114894300699234, "num_tokens": 5491489.0, "step": 3125 }, { "entropy": 5.697061824798584, "epoch": 2.4529780564263324, "grad_norm": 1.15625, "learning_rate": 0.0004645211724625802, "loss": 5.4942, "mean_token_accuracy": 0.1590850308537483, "num_tokens": 5499694.0, "step": 3130 }, { "entropy": 5.564854335784912, "epoch": 2.456896551724138, "grad_norm": 1.140625, "learning_rate": 0.00046435888188765015, "loss": 5.3795, "mean_token_accuracy": 0.16652424037456512, "num_tokens": 5508037.0, "step": 3135 }, { "entropy": 5.583059644699096, "epoch": 2.4608150470219434, "grad_norm": 1.1484375, "learning_rate": 0.0004641962528976951, "loss": 5.3679, "mean_token_accuracy": 0.16881446093320845, "num_tokens": 5516644.0, "step": 3140 }, { "entropy": 5.611056900024414, "epoch": 2.4647335423197494, "grad_norm": 1.2109375, "learning_rate": 0.0004640332857833593, "loss": 5.4417, "mean_token_accuracy": 0.15885722637176514, "num_tokens": 5524789.0, "step": 3145 }, { "entropy": 5.682217168807983, "epoch": 2.468652037617555, "grad_norm": 1.171875, "learning_rate": 0.00046386998083589156, "loss": 5.4746, "mean_token_accuracy": 0.16199354231357574, "num_tokens": 5533739.0, "step": 3150 }, { "entropy": 5.724471616744995, "epoch": 2.4725705329153604, "grad_norm": 1.2734375, "learning_rate": 0.0004637063383471442, "loss": 5.4943, "mean_token_accuracy": 0.16090431064367294, "num_tokens": 5542735.0, "step": 3155 }, { "entropy": 5.5542542934417725, "epoch": 2.476489028213166, "grad_norm": 1.1875, "learning_rate": 0.00046354235860957287, "loss": 5.3446, "mean_token_accuracy": 0.17048244625329972, "num_tokens": 5551272.0, "step": 3160 }, { "entropy": 5.644335508346558, "epoch": 2.480407523510972, "grad_norm": 1.2109375, "learning_rate": 0.0004633780419162361, "loss": 5.5349, "mean_token_accuracy": 0.1576820582151413, "num_tokens": 5559868.0, "step": 3165 }, { "entropy": 5.637001800537109, "epoch": 2.4843260188087775, "grad_norm": 1.1484375, "learning_rate": 0.00046321338856079435, "loss": 5.3278, "mean_token_accuracy": 0.17181121557950974, "num_tokens": 5568276.0, "step": 3170 }, { "entropy": 5.593462800979614, "epoch": 2.488244514106583, "grad_norm": 1.15625, "learning_rate": 0.00046304839883750987, "loss": 5.4098, "mean_token_accuracy": 0.1587411031126976, "num_tokens": 5576462.0, "step": 3175 }, { "entropy": 5.67059473991394, "epoch": 2.492163009404389, "grad_norm": 1.1796875, "learning_rate": 0.000462883073041246, "loss": 5.5546, "mean_token_accuracy": 0.15721471160650252, "num_tokens": 5586044.0, "step": 3180 }, { "entropy": 5.487815952301025, "epoch": 2.4960815047021945, "grad_norm": 1.1875, "learning_rate": 0.0004627174114674669, "loss": 5.3507, "mean_token_accuracy": 0.16307419240474702, "num_tokens": 5594626.0, "step": 3185 }, { "entropy": 5.688052940368652, "epoch": 2.5, "grad_norm": 1.1796875, "learning_rate": 0.0004625514144122365, "loss": 5.4132, "mean_token_accuracy": 0.15807681083679198, "num_tokens": 5603268.0, "step": 3190 }, { "entropy": 5.582400751113892, "epoch": 2.5039184952978055, "grad_norm": 1.3125, "learning_rate": 0.0004623850821722185, "loss": 5.3728, "mean_token_accuracy": 0.16239736527204512, "num_tokens": 5612053.0, "step": 3195 }, { "entropy": 5.606302261352539, "epoch": 2.507836990595611, "grad_norm": 1.140625, "learning_rate": 0.0004622184150446756, "loss": 5.6454, "mean_token_accuracy": 0.14613621309399605, "num_tokens": 5621297.0, "step": 3200 }, { "entropy": 5.737848901748658, "epoch": 2.511755485893417, "grad_norm": 1.1640625, "learning_rate": 0.00046205141332746904, "loss": 5.4563, "mean_token_accuracy": 0.16175559759140015, "num_tokens": 5629683.0, "step": 3205 }, { "entropy": 5.516789770126342, "epoch": 2.5156739811912225, "grad_norm": 1.140625, "learning_rate": 0.00046188407731905787, "loss": 5.3899, "mean_token_accuracy": 0.16603742241859437, "num_tokens": 5638707.0, "step": 3210 }, { "entropy": 5.679071044921875, "epoch": 2.519592476489028, "grad_norm": 1.1171875, "learning_rate": 0.0004617164073184987, "loss": 5.4352, "mean_token_accuracy": 0.16149932891130447, "num_tokens": 5648241.0, "step": 3215 }, { "entropy": 5.612945175170898, "epoch": 2.523510971786834, "grad_norm": 1.21875, "learning_rate": 0.00046154840362544496, "loss": 5.4819, "mean_token_accuracy": 0.15746894627809524, "num_tokens": 5656743.0, "step": 3220 }, { "entropy": 5.615722131729126, "epoch": 2.5274294670846396, "grad_norm": 1.2109375, "learning_rate": 0.0004613800665401466, "loss": 5.4624, "mean_token_accuracy": 0.1578985258936882, "num_tokens": 5666418.0, "step": 3225 }, { "entropy": 5.603916788101197, "epoch": 2.531347962382445, "grad_norm": 1.171875, "learning_rate": 0.0004612113963634493, "loss": 5.4203, "mean_token_accuracy": 0.15906696319580077, "num_tokens": 5675404.0, "step": 3230 }, { "entropy": 5.617366218566895, "epoch": 2.535266457680251, "grad_norm": 1.1484375, "learning_rate": 0.0004610423933967938, "loss": 5.4933, "mean_token_accuracy": 0.15284438133239747, "num_tokens": 5684699.0, "step": 3235 }, { "entropy": 5.690285110473633, "epoch": 2.5391849529780566, "grad_norm": 1.15625, "learning_rate": 0.000460873057942216, "loss": 5.4887, "mean_token_accuracy": 0.16056734919548035, "num_tokens": 5693219.0, "step": 3240 }, { "entropy": 5.702360773086548, "epoch": 2.543103448275862, "grad_norm": 1.1875, "learning_rate": 0.0004607033903023458, "loss": 5.522, "mean_token_accuracy": 0.15981431901454926, "num_tokens": 5702005.0, "step": 3245 }, { "entropy": 5.733270788192749, "epoch": 2.5470219435736676, "grad_norm": 1.1484375, "learning_rate": 0.00046053339078040674, "loss": 5.5047, "mean_token_accuracy": 0.15839407444000245, "num_tokens": 5710715.0, "step": 3250 }, { "entropy": 5.598942708969116, "epoch": 2.550940438871473, "grad_norm": 1.1953125, "learning_rate": 0.0004603630596802155, "loss": 5.4215, "mean_token_accuracy": 0.16150805950164795, "num_tokens": 5719444.0, "step": 3255 }, { "entropy": 5.517152309417725, "epoch": 2.554858934169279, "grad_norm": 1.3125, "learning_rate": 0.0004601923973061814, "loss": 5.4324, "mean_token_accuracy": 0.15545494109392166, "num_tokens": 5727755.0, "step": 3260 }, { "entropy": 5.615048742294311, "epoch": 2.5587774294670846, "grad_norm": 1.1328125, "learning_rate": 0.00046002140396330575, "loss": 5.335, "mean_token_accuracy": 0.1671610489487648, "num_tokens": 5736933.0, "step": 3265 }, { "entropy": 5.673175382614136, "epoch": 2.56269592476489, "grad_norm": 1.140625, "learning_rate": 0.00045985007995718154, "loss": 5.517, "mean_token_accuracy": 0.1642877921462059, "num_tokens": 5745831.0, "step": 3270 }, { "entropy": 5.678241300582886, "epoch": 2.566614420062696, "grad_norm": 1.15625, "learning_rate": 0.0004596784255939923, "loss": 5.5531, "mean_token_accuracy": 0.15819532945752143, "num_tokens": 5755419.0, "step": 3275 }, { "entropy": 5.631372880935669, "epoch": 2.5705329153605017, "grad_norm": 1.1953125, "learning_rate": 0.0004595064411805123, "loss": 5.4329, "mean_token_accuracy": 0.16395663022994994, "num_tokens": 5764008.0, "step": 3280 }, { "entropy": 5.574886322021484, "epoch": 2.574451410658307, "grad_norm": 1.265625, "learning_rate": 0.0004593341270241057, "loss": 5.3526, "mean_token_accuracy": 0.1645262286067009, "num_tokens": 5772129.0, "step": 3285 }, { "entropy": 5.605494165420533, "epoch": 2.5783699059561127, "grad_norm": 1.1171875, "learning_rate": 0.0004591614834327257, "loss": 5.4728, "mean_token_accuracy": 0.1581420123577118, "num_tokens": 5781079.0, "step": 3290 }, { "entropy": 5.610899257659912, "epoch": 2.5822884012539182, "grad_norm": 1.140625, "learning_rate": 0.00045898851071491444, "loss": 5.4462, "mean_token_accuracy": 0.16424137055873872, "num_tokens": 5790444.0, "step": 3295 }, { "entropy": 5.533226203918457, "epoch": 2.586206896551724, "grad_norm": 1.2109375, "learning_rate": 0.0004588152091798022, "loss": 5.3204, "mean_token_accuracy": 0.16239900141954422, "num_tokens": 5799828.0, "step": 3300 }, { "entropy": 5.688413190841675, "epoch": 2.5901253918495297, "grad_norm": 1.2578125, "learning_rate": 0.0004586415791371069, "loss": 5.6048, "mean_token_accuracy": 0.14757455736398697, "num_tokens": 5808765.0, "step": 3305 }, { "entropy": 5.726306390762329, "epoch": 2.5940438871473352, "grad_norm": 1.140625, "learning_rate": 0.0004584676208971336, "loss": 5.3726, "mean_token_accuracy": 0.16757129430770873, "num_tokens": 5817281.0, "step": 3310 }, { "entropy": 5.520856428146362, "epoch": 2.597962382445141, "grad_norm": 1.1875, "learning_rate": 0.00045829333477077384, "loss": 5.4159, "mean_token_accuracy": 0.17432797700166702, "num_tokens": 5825741.0, "step": 3315 }, { "entropy": 5.61427526473999, "epoch": 2.6018808777429467, "grad_norm": 1.3203125, "learning_rate": 0.0004581187210695053, "loss": 5.3821, "mean_token_accuracy": 0.1676635965704918, "num_tokens": 5834466.0, "step": 3320 }, { "entropy": 5.560508775711059, "epoch": 2.6057993730407523, "grad_norm": 1.203125, "learning_rate": 0.000457943780105391, "loss": 5.4323, "mean_token_accuracy": 0.16290040761232377, "num_tokens": 5843463.0, "step": 3325 }, { "entropy": 5.613198900222779, "epoch": 2.6097178683385582, "grad_norm": 1.2109375, "learning_rate": 0.00045776851219107856, "loss": 5.4575, "mean_token_accuracy": 0.15541307330131532, "num_tokens": 5851885.0, "step": 3330 }, { "entropy": 5.691914033889771, "epoch": 2.6136363636363638, "grad_norm": 1.0546875, "learning_rate": 0.00045759291763980035, "loss": 5.4538, "mean_token_accuracy": 0.16642256081104279, "num_tokens": 5861537.0, "step": 3335 }, { "entropy": 5.5215497493743895, "epoch": 2.6175548589341693, "grad_norm": 1.078125, "learning_rate": 0.00045741699676537227, "loss": 5.3842, "mean_token_accuracy": 0.16373006626963615, "num_tokens": 5870332.0, "step": 3340 }, { "entropy": 5.634155559539795, "epoch": 2.621473354231975, "grad_norm": 1.1640625, "learning_rate": 0.00045724074988219343, "loss": 5.5963, "mean_token_accuracy": 0.15622956901788712, "num_tokens": 5879240.0, "step": 3345 }, { "entropy": 5.525441694259643, "epoch": 2.6253918495297803, "grad_norm": 1.1015625, "learning_rate": 0.00045706417730524565, "loss": 5.3479, "mean_token_accuracy": 0.1612783044576645, "num_tokens": 5887961.0, "step": 3350 }, { "entropy": 5.667619323730468, "epoch": 2.6293103448275863, "grad_norm": 1.15625, "learning_rate": 0.0004568872793500927, "loss": 5.5637, "mean_token_accuracy": 0.1625274196267128, "num_tokens": 5896551.0, "step": 3355 }, { "entropy": 5.583553075790405, "epoch": 2.633228840125392, "grad_norm": 1.09375, "learning_rate": 0.00045671005633287986, "loss": 5.3704, "mean_token_accuracy": 0.16467083990573883, "num_tokens": 5905485.0, "step": 3360 }, { "entropy": 5.567110204696656, "epoch": 2.6371473354231973, "grad_norm": 1.1953125, "learning_rate": 0.0004565325085703336, "loss": 5.4011, "mean_token_accuracy": 0.16050159335136413, "num_tokens": 5915166.0, "step": 3365 }, { "entropy": 5.640669679641723, "epoch": 2.6410658307210033, "grad_norm": 1.1640625, "learning_rate": 0.0004563546363797602, "loss": 5.4344, "mean_token_accuracy": 0.16531543731689452, "num_tokens": 5923782.0, "step": 3370 }, { "entropy": 5.621229076385498, "epoch": 2.644984326018809, "grad_norm": 1.2265625, "learning_rate": 0.0004561764400790465, "loss": 5.528, "mean_token_accuracy": 0.15729653984308242, "num_tokens": 5931904.0, "step": 3375 }, { "entropy": 5.6186549186706545, "epoch": 2.6489028213166144, "grad_norm": 1.234375, "learning_rate": 0.00045599791998665796, "loss": 5.4374, "mean_token_accuracy": 0.16541447937488557, "num_tokens": 5940067.0, "step": 3380 }, { "entropy": 5.616342639923095, "epoch": 2.6528213166144203, "grad_norm": 1.1328125, "learning_rate": 0.0004558190764216389, "loss": 5.4176, "mean_token_accuracy": 0.16869635432958602, "num_tokens": 5948663.0, "step": 3385 }, { "entropy": 5.684710073471069, "epoch": 2.656739811912226, "grad_norm": 1.2109375, "learning_rate": 0.0004556399097036119, "loss": 5.4253, "mean_token_accuracy": 0.1626061663031578, "num_tokens": 5957224.0, "step": 3390 }, { "entropy": 5.622564649581909, "epoch": 2.6606583072100314, "grad_norm": 1.1796875, "learning_rate": 0.0004554604201527768, "loss": 5.5362, "mean_token_accuracy": 0.1558627665042877, "num_tokens": 5965604.0, "step": 3395 }, { "entropy": 5.571040296554566, "epoch": 2.664576802507837, "grad_norm": 1.296875, "learning_rate": 0.00045528060808991075, "loss": 5.3626, "mean_token_accuracy": 0.16696271449327468, "num_tokens": 5974309.0, "step": 3400 }, { "entropy": 5.546795654296875, "epoch": 2.6684952978056424, "grad_norm": 1.1796875, "learning_rate": 0.0004551004738363669, "loss": 5.3774, "mean_token_accuracy": 0.1614482581615448, "num_tokens": 5982315.0, "step": 3405 }, { "entropy": 5.631612110137939, "epoch": 2.6724137931034484, "grad_norm": 1.2578125, "learning_rate": 0.00045492001771407434, "loss": 5.3955, "mean_token_accuracy": 0.17008334398269653, "num_tokens": 5990515.0, "step": 3410 }, { "entropy": 5.632847738265991, "epoch": 2.676332288401254, "grad_norm": 1.1484375, "learning_rate": 0.0004547392400455374, "loss": 5.5513, "mean_token_accuracy": 0.1599658966064453, "num_tokens": 6000431.0, "step": 3415 }, { "entropy": 5.612500715255737, "epoch": 2.6802507836990594, "grad_norm": 1.2890625, "learning_rate": 0.0004545581411538353, "loss": 5.4756, "mean_token_accuracy": 0.15511866062879562, "num_tokens": 6009022.0, "step": 3420 }, { "entropy": 5.65234489440918, "epoch": 2.6841692789968654, "grad_norm": 1.0859375, "learning_rate": 0.00045437672136262083, "loss": 5.4005, "mean_token_accuracy": 0.16104743182659148, "num_tokens": 6019330.0, "step": 3425 }, { "entropy": 5.538012981414795, "epoch": 2.688087774294671, "grad_norm": 1.1171875, "learning_rate": 0.0004541949809961208, "loss": 5.4257, "mean_token_accuracy": 0.17339180707931517, "num_tokens": 6027492.0, "step": 3430 }, { "entropy": 5.658903741836548, "epoch": 2.6920062695924765, "grad_norm": 1.2109375, "learning_rate": 0.0004540129203791346, "loss": 5.4545, "mean_token_accuracy": 0.16489011645317078, "num_tokens": 6036467.0, "step": 3435 }, { "entropy": 5.572162246704101, "epoch": 2.695924764890282, "grad_norm": 1.125, "learning_rate": 0.00045383053983703413, "loss": 5.407, "mean_token_accuracy": 0.16315893530845643, "num_tokens": 6045711.0, "step": 3440 }, { "entropy": 5.554269123077392, "epoch": 2.6998432601880875, "grad_norm": 1.15625, "learning_rate": 0.00045364783969576296, "loss": 5.4705, "mean_token_accuracy": 0.16008178889751434, "num_tokens": 6054626.0, "step": 3445 }, { "entropy": 5.646736717224121, "epoch": 2.7037617554858935, "grad_norm": 1.1484375, "learning_rate": 0.00045346482028183583, "loss": 5.534, "mean_token_accuracy": 0.1618146926164627, "num_tokens": 6064118.0, "step": 3450 }, { "entropy": 5.637236022949219, "epoch": 2.707680250783699, "grad_norm": 1.171875, "learning_rate": 0.00045328148192233823, "loss": 5.4373, "mean_token_accuracy": 0.16148429214954377, "num_tokens": 6072723.0, "step": 3455 }, { "entropy": 5.633076858520508, "epoch": 2.7115987460815045, "grad_norm": 1.1328125, "learning_rate": 0.0004530978249449254, "loss": 5.4056, "mean_token_accuracy": 0.17186853736639024, "num_tokens": 6081535.0, "step": 3460 }, { "entropy": 5.590723657608033, "epoch": 2.7155172413793105, "grad_norm": 1.1796875, "learning_rate": 0.0004529138496778222, "loss": 5.3805, "mean_token_accuracy": 0.16959122717380523, "num_tokens": 6090524.0, "step": 3465 }, { "entropy": 5.596432876586914, "epoch": 2.719435736677116, "grad_norm": 1.1484375, "learning_rate": 0.0004527295564498222, "loss": 5.4465, "mean_token_accuracy": 0.16445804834365846, "num_tokens": 6099870.0, "step": 3470 }, { "entropy": 5.6235284328460695, "epoch": 2.7233542319749215, "grad_norm": 1.2265625, "learning_rate": 0.0004525449455902874, "loss": 5.4235, "mean_token_accuracy": 0.16751915067434311, "num_tokens": 6107760.0, "step": 3475 }, { "entropy": 5.6068034172058105, "epoch": 2.7272727272727275, "grad_norm": 1.0859375, "learning_rate": 0.0004523600174291473, "loss": 5.4883, "mean_token_accuracy": 0.16328197419643403, "num_tokens": 6117135.0, "step": 3480 }, { "entropy": 5.559489727020264, "epoch": 2.731191222570533, "grad_norm": 1.21875, "learning_rate": 0.0004521747722968985, "loss": 5.4378, "mean_token_accuracy": 0.1646498441696167, "num_tokens": 6125711.0, "step": 3485 }, { "entropy": 5.658491802215576, "epoch": 2.7351097178683386, "grad_norm": 1.171875, "learning_rate": 0.00045198921052460396, "loss": 5.5131, "mean_token_accuracy": 0.15748890489339828, "num_tokens": 6134623.0, "step": 3490 }, { "entropy": 5.7147058010101315, "epoch": 2.739028213166144, "grad_norm": 1.234375, "learning_rate": 0.0004518033324438928, "loss": 5.4871, "mean_token_accuracy": 0.15890766084194183, "num_tokens": 6143228.0, "step": 3495 }, { "entropy": 5.552452516555786, "epoch": 2.7429467084639496, "grad_norm": 1.1875, "learning_rate": 0.0004516171383869593, "loss": 5.469, "mean_token_accuracy": 0.1633103907108307, "num_tokens": 6152021.0, "step": 3500 }, { "epoch": 2.7429467084639496, "eval_entropy": 5.414383830026138, "eval_loss": 5.8682355880737305, "eval_mean_token_accuracy": 0.15274352232326371, "eval_num_tokens": 6152021.0, "eval_runtime": 2.8317, "eval_samples_per_second": 1455.64, "eval_steps_per_second": 182.22, "step": 3500 }, { "entropy": 5.643088293075562, "epoch": 2.7468652037617556, "grad_norm": 1.1640625, "learning_rate": 0.00045143062868656234, "loss": 5.5211, "mean_token_accuracy": 0.1622963473200798, "num_tokens": 6161389.0, "step": 3505 }, { "entropy": 5.6994085788726805, "epoch": 2.750783699059561, "grad_norm": 1.140625, "learning_rate": 0.000451243803676025, "loss": 5.4581, "mean_token_accuracy": 0.16562668979167938, "num_tokens": 6170381.0, "step": 3510 }, { "entropy": 5.587498950958252, "epoch": 2.7547021943573666, "grad_norm": 1.140625, "learning_rate": 0.00045105666368923397, "loss": 5.4058, "mean_token_accuracy": 0.1639431193470955, "num_tokens": 6179071.0, "step": 3515 }, { "entropy": 5.541202402114868, "epoch": 2.7586206896551726, "grad_norm": 1.171875, "learning_rate": 0.00045086920906063866, "loss": 5.4143, "mean_token_accuracy": 0.1619719982147217, "num_tokens": 6187896.0, "step": 3520 }, { "entropy": 5.555576372146606, "epoch": 2.762539184952978, "grad_norm": 1.171875, "learning_rate": 0.00045068144012525095, "loss": 5.3908, "mean_token_accuracy": 0.17135699987411498, "num_tokens": 6196328.0, "step": 3525 }, { "entropy": 5.511662530899048, "epoch": 2.7664576802507836, "grad_norm": 1.1796875, "learning_rate": 0.00045049335721864426, "loss": 5.4228, "mean_token_accuracy": 0.15722364485263823, "num_tokens": 6204703.0, "step": 3530 }, { "entropy": 5.598114967346191, "epoch": 2.7703761755485896, "grad_norm": 1.1640625, "learning_rate": 0.00045030496067695336, "loss": 5.4198, "mean_token_accuracy": 0.16125878989696502, "num_tokens": 6213496.0, "step": 3535 }, { "entropy": 5.602514410018921, "epoch": 2.774294670846395, "grad_norm": 1.171875, "learning_rate": 0.0004501162508368733, "loss": 5.3708, "mean_token_accuracy": 0.16854795217514038, "num_tokens": 6222112.0, "step": 3540 }, { "entropy": 5.5321849346160885, "epoch": 2.7782131661442007, "grad_norm": 1.1953125, "learning_rate": 0.0004499272280356594, "loss": 5.4907, "mean_token_accuracy": 0.1533934846520424, "num_tokens": 6230288.0, "step": 3545 }, { "entropy": 5.587235784530639, "epoch": 2.782131661442006, "grad_norm": 1.1796875, "learning_rate": 0.0004497378926111257, "loss": 5.345, "mean_token_accuracy": 0.17369745969772338, "num_tokens": 6238608.0, "step": 3550 }, { "entropy": 5.627431440353393, "epoch": 2.7860501567398117, "grad_norm": 1.1875, "learning_rate": 0.0004495482449016456, "loss": 5.4798, "mean_token_accuracy": 0.16244604885578157, "num_tokens": 6247902.0, "step": 3555 }, { "entropy": 5.574715185165405, "epoch": 2.7899686520376177, "grad_norm": 1.1015625, "learning_rate": 0.0004493582852461501, "loss": 5.4332, "mean_token_accuracy": 0.16872989386320114, "num_tokens": 6257433.0, "step": 3560 }, { "entropy": 5.572787284851074, "epoch": 2.793887147335423, "grad_norm": 1.1328125, "learning_rate": 0.0004491680139841281, "loss": 5.4158, "mean_token_accuracy": 0.16313964426517485, "num_tokens": 6266017.0, "step": 3565 }, { "entropy": 5.630651569366455, "epoch": 2.7978056426332287, "grad_norm": 1.28125, "learning_rate": 0.000448977431455625, "loss": 5.5203, "mean_token_accuracy": 0.15141590163111687, "num_tokens": 6274759.0, "step": 3570 }, { "entropy": 5.641405630111694, "epoch": 2.8017241379310347, "grad_norm": 1.234375, "learning_rate": 0.00044878653800124285, "loss": 5.3845, "mean_token_accuracy": 0.16816764771938325, "num_tokens": 6283699.0, "step": 3575 }, { "entropy": 5.567658567428589, "epoch": 2.80564263322884, "grad_norm": 1.1796875, "learning_rate": 0.0004485953339621391, "loss": 5.4174, "mean_token_accuracy": 0.16217992901802064, "num_tokens": 6292562.0, "step": 3580 }, { "entropy": 5.533685970306396, "epoch": 2.8095611285266457, "grad_norm": 1.1640625, "learning_rate": 0.0004484038196800265, "loss": 5.3776, "mean_token_accuracy": 0.1700371041893959, "num_tokens": 6300928.0, "step": 3585 }, { "entropy": 5.546323251724243, "epoch": 2.8134796238244513, "grad_norm": 1.0859375, "learning_rate": 0.0004482119954971719, "loss": 5.3093, "mean_token_accuracy": 0.16769690960645675, "num_tokens": 6309994.0, "step": 3590 }, { "entropy": 5.619518089294433, "epoch": 2.817398119122257, "grad_norm": 1.234375, "learning_rate": 0.00044801986175639635, "loss": 5.4345, "mean_token_accuracy": 0.16344505101442336, "num_tokens": 6318856.0, "step": 3595 }, { "entropy": 5.5285991668701175, "epoch": 2.8213166144200628, "grad_norm": 1.203125, "learning_rate": 0.0004478274188010741, "loss": 5.4324, "mean_token_accuracy": 0.1600890651345253, "num_tokens": 6327274.0, "step": 3600 }, { "entropy": 5.677964973449707, "epoch": 2.8252351097178683, "grad_norm": 1.1484375, "learning_rate": 0.00044763466697513173, "loss": 5.491, "mean_token_accuracy": 0.17462221533060074, "num_tokens": 6335957.0, "step": 3605 }, { "entropy": 5.668510293960571, "epoch": 2.829153605015674, "grad_norm": 1.171875, "learning_rate": 0.00044744160662304805, "loss": 5.4987, "mean_token_accuracy": 0.16850878596305846, "num_tokens": 6344673.0, "step": 3610 }, { "entropy": 5.561325788497925, "epoch": 2.83307210031348, "grad_norm": 1.21875, "learning_rate": 0.00044724823808985325, "loss": 5.3639, "mean_token_accuracy": 0.1669614925980568, "num_tokens": 6353085.0, "step": 3615 }, { "entropy": 5.608777952194214, "epoch": 2.8369905956112853, "grad_norm": 1.09375, "learning_rate": 0.0004470545617211283, "loss": 5.4259, "mean_token_accuracy": 0.16076537147164344, "num_tokens": 6362107.0, "step": 3620 }, { "entropy": 5.584958410263061, "epoch": 2.840909090909091, "grad_norm": 1.15625, "learning_rate": 0.00044686057786300423, "loss": 5.3999, "mean_token_accuracy": 0.16815428733825682, "num_tokens": 6370381.0, "step": 3625 }, { "entropy": 5.557851696014405, "epoch": 2.844827586206897, "grad_norm": 1.1796875, "learning_rate": 0.00044666628686216154, "loss": 5.4355, "mean_token_accuracy": 0.16155474483966828, "num_tokens": 6378790.0, "step": 3630 }, { "entropy": 5.510302448272705, "epoch": 2.8487460815047023, "grad_norm": 1.234375, "learning_rate": 0.00044647168906583, "loss": 5.4135, "mean_token_accuracy": 0.16343684494495392, "num_tokens": 6387997.0, "step": 3635 }, { "entropy": 5.608069324493409, "epoch": 2.852664576802508, "grad_norm": 1.125, "learning_rate": 0.00044627678482178716, "loss": 5.4346, "mean_token_accuracy": 0.16666958928108216, "num_tokens": 6397458.0, "step": 3640 }, { "entropy": 5.595707035064697, "epoch": 2.8565830721003134, "grad_norm": 1.1953125, "learning_rate": 0.0004460815744783587, "loss": 5.3743, "mean_token_accuracy": 0.16629649698734283, "num_tokens": 6405481.0, "step": 3645 }, { "entropy": 5.604511451721192, "epoch": 2.860501567398119, "grad_norm": 1.25, "learning_rate": 0.000445886058384417, "loss": 5.4924, "mean_token_accuracy": 0.15888008326292039, "num_tokens": 6414087.0, "step": 3650 }, { "entropy": 5.607482194900513, "epoch": 2.864420062695925, "grad_norm": 1.2265625, "learning_rate": 0.0004456902368893811, "loss": 5.3191, "mean_token_accuracy": 0.1759709596633911, "num_tokens": 6421934.0, "step": 3655 }, { "entropy": 5.54454607963562, "epoch": 2.8683385579937304, "grad_norm": 1.2421875, "learning_rate": 0.0004454941103432158, "loss": 5.3414, "mean_token_accuracy": 0.1674615979194641, "num_tokens": 6431028.0, "step": 3660 }, { "entropy": 5.46548752784729, "epoch": 2.872257053291536, "grad_norm": 1.1015625, "learning_rate": 0.00044529767909643093, "loss": 5.2974, "mean_token_accuracy": 0.17255107015371324, "num_tokens": 6439913.0, "step": 3665 }, { "entropy": 5.61080904006958, "epoch": 2.876175548589342, "grad_norm": 1.234375, "learning_rate": 0.0004451009435000811, "loss": 5.3955, "mean_token_accuracy": 0.16260457783937454, "num_tokens": 6449161.0, "step": 3670 }, { "entropy": 5.580524539947509, "epoch": 2.8800940438871474, "grad_norm": 1.25, "learning_rate": 0.0004449039039057647, "loss": 5.5008, "mean_token_accuracy": 0.16669165194034577, "num_tokens": 6457834.0, "step": 3675 }, { "entropy": 5.509667015075683, "epoch": 2.884012539184953, "grad_norm": 1.21875, "learning_rate": 0.00044470656066562336, "loss": 5.3213, "mean_token_accuracy": 0.17255610674619676, "num_tokens": 6465879.0, "step": 3680 }, { "entropy": 5.502434110641479, "epoch": 2.887931034482759, "grad_norm": 1.1875, "learning_rate": 0.0004445089141323415, "loss": 5.3251, "mean_token_accuracy": 0.16640148162841797, "num_tokens": 6474801.0, "step": 3685 }, { "entropy": 5.540452337265014, "epoch": 2.8918495297805644, "grad_norm": 1.25, "learning_rate": 0.00044431096465914554, "loss": 5.3657, "mean_token_accuracy": 0.1723470151424408, "num_tokens": 6483532.0, "step": 3690 }, { "entropy": 5.532334518432617, "epoch": 2.89576802507837, "grad_norm": 1.2265625, "learning_rate": 0.00044411271259980315, "loss": 5.3465, "mean_token_accuracy": 0.1612808346748352, "num_tokens": 6491372.0, "step": 3695 }, { "entropy": 5.501596736907959, "epoch": 2.8996865203761755, "grad_norm": 1.203125, "learning_rate": 0.0004439141583086231, "loss": 5.3612, "mean_token_accuracy": 0.16393718719482422, "num_tokens": 6500001.0, "step": 3700 }, { "entropy": 5.669638776779175, "epoch": 2.903605015673981, "grad_norm": 1.2890625, "learning_rate": 0.00044371530214045395, "loss": 5.4632, "mean_token_accuracy": 0.16313758194446565, "num_tokens": 6507750.0, "step": 3705 }, { "entropy": 5.489681100845337, "epoch": 2.907523510971787, "grad_norm": 1.25, "learning_rate": 0.00044351614445068413, "loss": 5.3241, "mean_token_accuracy": 0.16466565132141114, "num_tokens": 6517888.0, "step": 3710 }, { "entropy": 5.614689445495605, "epoch": 2.9114420062695925, "grad_norm": 1.171875, "learning_rate": 0.00044331668559524043, "loss": 5.4777, "mean_token_accuracy": 0.1595204085111618, "num_tokens": 6526573.0, "step": 3715 }, { "entropy": 5.553948974609375, "epoch": 2.915360501567398, "grad_norm": 1.109375, "learning_rate": 0.0004431169259305883, "loss": 5.3968, "mean_token_accuracy": 0.16814989745616912, "num_tokens": 6536544.0, "step": 3720 }, { "entropy": 5.6320888042449955, "epoch": 2.919278996865204, "grad_norm": 1.1171875, "learning_rate": 0.0004429168658137306, "loss": 5.4218, "mean_token_accuracy": 0.16275950148701668, "num_tokens": 6544982.0, "step": 3725 }, { "entropy": 5.551886749267578, "epoch": 2.9231974921630095, "grad_norm": 1.15625, "learning_rate": 0.00044271650560220746, "loss": 5.4456, "mean_token_accuracy": 0.16412553191184998, "num_tokens": 6553664.0, "step": 3730 }, { "entropy": 5.588349199295044, "epoch": 2.927115987460815, "grad_norm": 1.2265625, "learning_rate": 0.00044251584565409464, "loss": 5.4212, "mean_token_accuracy": 0.16144543141126633, "num_tokens": 6562234.0, "step": 3735 }, { "entropy": 5.522047853469848, "epoch": 2.9310344827586206, "grad_norm": 1.2421875, "learning_rate": 0.0004423148863280044, "loss": 5.3825, "mean_token_accuracy": 0.16582272052764893, "num_tokens": 6570707.0, "step": 3740 }, { "entropy": 5.623364686965942, "epoch": 2.934952978056426, "grad_norm": 1.1171875, "learning_rate": 0.00044211362798308334, "loss": 5.4298, "mean_token_accuracy": 0.16516808271408082, "num_tokens": 6580054.0, "step": 3745 }, { "entropy": 5.5762903690338135, "epoch": 2.938871473354232, "grad_norm": 1.203125, "learning_rate": 0.0004419120709790129, "loss": 5.487, "mean_token_accuracy": 0.16166716068983078, "num_tokens": 6588585.0, "step": 3750 }, { "entropy": 5.612081909179688, "epoch": 2.9427899686520376, "grad_norm": 1.1796875, "learning_rate": 0.00044171021567600814, "loss": 5.4734, "mean_token_accuracy": 0.16496185213327408, "num_tokens": 6597290.0, "step": 3755 }, { "entropy": 5.599429178237915, "epoch": 2.946708463949843, "grad_norm": 1.2421875, "learning_rate": 0.00044150806243481715, "loss": 5.3989, "mean_token_accuracy": 0.1622632399201393, "num_tokens": 6604966.0, "step": 3760 }, { "entropy": 5.605414915084839, "epoch": 2.950626959247649, "grad_norm": 1.203125, "learning_rate": 0.0004413056116167206, "loss": 5.3836, "mean_token_accuracy": 0.16811488717794418, "num_tokens": 6613588.0, "step": 3765 }, { "entropy": 5.694739866256714, "epoch": 2.9545454545454546, "grad_norm": 1.140625, "learning_rate": 0.0004411028635835309, "loss": 5.6062, "mean_token_accuracy": 0.14811502546072006, "num_tokens": 6622907.0, "step": 3770 }, { "entropy": 5.5383378028869625, "epoch": 2.95846394984326, "grad_norm": 1.1640625, "learning_rate": 0.0004408998186975917, "loss": 5.3947, "mean_token_accuracy": 0.16675213277339934, "num_tokens": 6632074.0, "step": 3775 }, { "entropy": 5.624499273300171, "epoch": 2.962382445141066, "grad_norm": 1.2109375, "learning_rate": 0.00044069647732177696, "loss": 5.4263, "mean_token_accuracy": 0.16085358709096909, "num_tokens": 6640595.0, "step": 3780 }, { "entropy": 5.641710186004639, "epoch": 2.9663009404388716, "grad_norm": 1.21875, "learning_rate": 0.00044049283981949103, "loss": 5.4727, "mean_token_accuracy": 0.15902462005615234, "num_tokens": 6649861.0, "step": 3785 }, { "entropy": 5.660275220870972, "epoch": 2.970219435736677, "grad_norm": 1.15625, "learning_rate": 0.0004402889065546667, "loss": 5.4867, "mean_token_accuracy": 0.15586813762784005, "num_tokens": 6659157.0, "step": 3790 }, { "entropy": 5.599219799041748, "epoch": 2.9741379310344827, "grad_norm": 1.265625, "learning_rate": 0.00044008467789176625, "loss": 5.4518, "mean_token_accuracy": 0.16005596220493318, "num_tokens": 6667979.0, "step": 3795 }, { "entropy": 5.590052700042724, "epoch": 2.978056426332288, "grad_norm": 1.125, "learning_rate": 0.0004398801541957791, "loss": 5.4034, "mean_token_accuracy": 0.16107274889945983, "num_tokens": 6677239.0, "step": 3800 }, { "entropy": 5.590973901748657, "epoch": 2.981974921630094, "grad_norm": 1.140625, "learning_rate": 0.0004396753358322223, "loss": 5.5295, "mean_token_accuracy": 0.15927643179893494, "num_tokens": 6687317.0, "step": 3805 }, { "entropy": 5.551632881164551, "epoch": 2.9858934169278997, "grad_norm": 1.15625, "learning_rate": 0.0004394702231671396, "loss": 5.413, "mean_token_accuracy": 0.161776627600193, "num_tokens": 6695732.0, "step": 3810 }, { "entropy": 5.546460056304932, "epoch": 2.989811912225705, "grad_norm": 1.1953125, "learning_rate": 0.0004392648165671004, "loss": 5.3254, "mean_token_accuracy": 0.17594451904296876, "num_tokens": 6704272.0, "step": 3815 }, { "entropy": 5.552908229827881, "epoch": 2.993730407523511, "grad_norm": 1.1875, "learning_rate": 0.0004390591163991998, "loss": 5.3706, "mean_token_accuracy": 0.16397657990455627, "num_tokens": 6713355.0, "step": 3820 }, { "entropy": 5.587871837615967, "epoch": 2.9976489028213167, "grad_norm": 1.2265625, "learning_rate": 0.00043885312303105725, "loss": 5.4658, "mean_token_accuracy": 0.1653267815709114, "num_tokens": 6721952.0, "step": 3825 }, { "entropy": 5.543058776855469, "epoch": 3.001567398119122, "grad_norm": 1.0703125, "learning_rate": 0.0004386468368308163, "loss": 5.2734, "mean_token_accuracy": 0.17160003930330275, "num_tokens": 6730582.0, "step": 3830 }, { "entropy": 5.568611288070679, "epoch": 3.0054858934169277, "grad_norm": 1.125, "learning_rate": 0.0004384402581671438, "loss": 5.0631, "mean_token_accuracy": 0.17696447372436525, "num_tokens": 6739370.0, "step": 3835 }, { "entropy": 5.526371908187866, "epoch": 3.0094043887147337, "grad_norm": 1.1640625, "learning_rate": 0.0004382333874092295, "loss": 5.0431, "mean_token_accuracy": 0.18205696493387222, "num_tokens": 6748282.0, "step": 3840 }, { "entropy": 5.6190471172332765, "epoch": 3.0133228840125392, "grad_norm": 1.1328125, "learning_rate": 0.00043802622492678466, "loss": 5.1684, "mean_token_accuracy": 0.1718669578433037, "num_tokens": 6757622.0, "step": 3845 }, { "entropy": 5.560507535934448, "epoch": 3.0172413793103448, "grad_norm": 1.2265625, "learning_rate": 0.0004378187710900426, "loss": 5.1056, "mean_token_accuracy": 0.17592935264110565, "num_tokens": 6765780.0, "step": 3850 }, { "entropy": 5.544382572174072, "epoch": 3.0211598746081503, "grad_norm": 1.2421875, "learning_rate": 0.00043761102626975674, "loss": 5.1099, "mean_token_accuracy": 0.17691410183906556, "num_tokens": 6774343.0, "step": 3855 }, { "entropy": 5.529460048675537, "epoch": 3.0250783699059562, "grad_norm": 1.15625, "learning_rate": 0.0004374029908372007, "loss": 5.1041, "mean_token_accuracy": 0.1808921843767166, "num_tokens": 6783765.0, "step": 3860 }, { "entropy": 5.6187409400939945, "epoch": 3.0289968652037618, "grad_norm": 1.28125, "learning_rate": 0.00043719466516416774, "loss": 5.166, "mean_token_accuracy": 0.17624239325523378, "num_tokens": 6792102.0, "step": 3865 }, { "entropy": 5.602795743942261, "epoch": 3.0329153605015673, "grad_norm": 1.1015625, "learning_rate": 0.00043698604962296946, "loss": 5.09, "mean_token_accuracy": 0.17620307356119155, "num_tokens": 6801435.0, "step": 3870 }, { "entropy": 5.456713485717773, "epoch": 3.0368338557993733, "grad_norm": 1.125, "learning_rate": 0.00043677714458643566, "loss": 5.1205, "mean_token_accuracy": 0.1735727608203888, "num_tokens": 6810409.0, "step": 3875 }, { "entropy": 5.4551129817962645, "epoch": 3.040752351097179, "grad_norm": 1.2109375, "learning_rate": 0.00043656795042791357, "loss": 5.0066, "mean_token_accuracy": 0.18492254316806794, "num_tokens": 6818752.0, "step": 3880 }, { "entropy": 5.5124578952789305, "epoch": 3.0446708463949843, "grad_norm": 1.21875, "learning_rate": 0.0004363584675212671, "loss": 5.0905, "mean_token_accuracy": 0.17838650196790695, "num_tokens": 6827880.0, "step": 3885 }, { "entropy": 5.454521226882934, "epoch": 3.04858934169279, "grad_norm": 1.1953125, "learning_rate": 0.0004361486962408761, "loss": 5.0865, "mean_token_accuracy": 0.18407093435525895, "num_tokens": 6835865.0, "step": 3890 }, { "entropy": 5.49459433555603, "epoch": 3.052507836990596, "grad_norm": 1.25, "learning_rate": 0.0004359386369616359, "loss": 5.0443, "mean_token_accuracy": 0.1849384769797325, "num_tokens": 6843851.0, "step": 3895 }, { "entropy": 5.471823406219483, "epoch": 3.0564263322884013, "grad_norm": 1.2109375, "learning_rate": 0.0004357282900589565, "loss": 5.0965, "mean_token_accuracy": 0.1811446502804756, "num_tokens": 6852649.0, "step": 3900 }, { "entropy": 5.47971544265747, "epoch": 3.060344827586207, "grad_norm": 1.1328125, "learning_rate": 0.00043551765590876183, "loss": 5.0536, "mean_token_accuracy": 0.18287423402070999, "num_tokens": 6861607.0, "step": 3905 }, { "entropy": 5.440199613571167, "epoch": 3.0642633228840124, "grad_norm": 1.25, "learning_rate": 0.0004353067348874894, "loss": 5.0757, "mean_token_accuracy": 0.1747647225856781, "num_tokens": 6870650.0, "step": 3910 }, { "entropy": 5.555355358123779, "epoch": 3.0681818181818183, "grad_norm": 1.21875, "learning_rate": 0.00043509552737208923, "loss": 5.1226, "mean_token_accuracy": 0.167980919778347, "num_tokens": 6879820.0, "step": 3915 }, { "entropy": 5.418352794647217, "epoch": 3.072100313479624, "grad_norm": 1.1796875, "learning_rate": 0.0004348840337400233, "loss": 5.0409, "mean_token_accuracy": 0.18314133137464522, "num_tokens": 6888329.0, "step": 3920 }, { "entropy": 5.443490362167358, "epoch": 3.0760188087774294, "grad_norm": 1.21875, "learning_rate": 0.00043467225436926517, "loss": 5.1256, "mean_token_accuracy": 0.17643692940473557, "num_tokens": 6897635.0, "step": 3925 }, { "entropy": 5.520525932312012, "epoch": 3.079937304075235, "grad_norm": 1.1796875, "learning_rate": 0.0004344601896382988, "loss": 5.1636, "mean_token_accuracy": 0.18288996070623398, "num_tokens": 6907275.0, "step": 3930 }, { "entropy": 5.523024606704712, "epoch": 3.083855799373041, "grad_norm": 1.2890625, "learning_rate": 0.00043424783992611837, "loss": 5.1091, "mean_token_accuracy": 0.17494795471429825, "num_tokens": 6916377.0, "step": 3935 }, { "entropy": 5.4875256538391115, "epoch": 3.0877742946708464, "grad_norm": 1.25, "learning_rate": 0.00043403520561222705, "loss": 5.0059, "mean_token_accuracy": 0.19060440957546235, "num_tokens": 6924355.0, "step": 3940 }, { "entropy": 5.420040082931519, "epoch": 3.091692789968652, "grad_norm": 1.28125, "learning_rate": 0.0004338222870766371, "loss": 5.1104, "mean_token_accuracy": 0.1827986016869545, "num_tokens": 6933328.0, "step": 3945 }, { "entropy": 5.550303268432617, "epoch": 3.0956112852664575, "grad_norm": 1.1484375, "learning_rate": 0.00043360908469986827, "loss": 5.1158, "mean_token_accuracy": 0.1787843018770218, "num_tokens": 6942189.0, "step": 3950 }, { "entropy": 5.462327671051026, "epoch": 3.0995297805642634, "grad_norm": 1.2890625, "learning_rate": 0.0004333955988629478, "loss": 5.076, "mean_token_accuracy": 0.1802074134349823, "num_tokens": 6951188.0, "step": 3955 }, { "entropy": 5.432707262039185, "epoch": 3.103448275862069, "grad_norm": 1.265625, "learning_rate": 0.00043318182994740945, "loss": 5.0336, "mean_token_accuracy": 0.18666609823703767, "num_tokens": 6958718.0, "step": 3960 }, { "entropy": 5.511406707763672, "epoch": 3.1073667711598745, "grad_norm": 1.1953125, "learning_rate": 0.0004329677783352931, "loss": 5.1119, "mean_token_accuracy": 0.17655452340841293, "num_tokens": 6968040.0, "step": 3965 }, { "entropy": 5.499683237075805, "epoch": 3.1112852664576804, "grad_norm": 1.2578125, "learning_rate": 0.0004327534444091436, "loss": 5.1113, "mean_token_accuracy": 0.1770559012889862, "num_tokens": 6976977.0, "step": 3970 }, { "entropy": 5.445043706893921, "epoch": 3.115203761755486, "grad_norm": 1.2734375, "learning_rate": 0.00043253882855201037, "loss": 5.0948, "mean_token_accuracy": 0.18210539519786834, "num_tokens": 6985859.0, "step": 3975 }, { "entropy": 5.35231466293335, "epoch": 3.1191222570532915, "grad_norm": 1.1875, "learning_rate": 0.00043232393114744683, "loss": 4.9988, "mean_token_accuracy": 0.19316509366035461, "num_tokens": 6994031.0, "step": 3980 }, { "entropy": 5.412227296829224, "epoch": 3.123040752351097, "grad_norm": 1.15625, "learning_rate": 0.0004321087525795095, "loss": 5.0556, "mean_token_accuracy": 0.18158914446830748, "num_tokens": 7002527.0, "step": 3985 }, { "entropy": 5.5304759502410885, "epoch": 3.126959247648903, "grad_norm": 1.1640625, "learning_rate": 0.0004318932932327573, "loss": 5.1739, "mean_token_accuracy": 0.1824020892381668, "num_tokens": 7011983.0, "step": 3990 }, { "entropy": 5.5312965393066404, "epoch": 3.1308777429467085, "grad_norm": 1.2734375, "learning_rate": 0.000431677553492251, "loss": 5.119, "mean_token_accuracy": 0.1740437164902687, "num_tokens": 7020819.0, "step": 3995 }, { "entropy": 5.410348653793335, "epoch": 3.134796238244514, "grad_norm": 1.2578125, "learning_rate": 0.00043146153374355256, "loss": 5.006, "mean_token_accuracy": 0.18658973425626754, "num_tokens": 7029257.0, "step": 4000 }, { "epoch": 3.134796238244514, "eval_entropy": 5.3644778922546745, "eval_loss": 5.828073978424072, "eval_mean_token_accuracy": 0.15688188456345437, "eval_num_tokens": 7029257.0, "eval_runtime": 2.8339, "eval_samples_per_second": 1454.534, "eval_steps_per_second": 182.081, "step": 4000 }, { "entropy": 5.479976797103882, "epoch": 3.1387147335423196, "grad_norm": 1.203125, "learning_rate": 0.00043124523437272427, "loss": 5.1671, "mean_token_accuracy": 0.1771426811814308, "num_tokens": 7038942.0, "step": 4005 }, { "entropy": 5.525044679641724, "epoch": 3.1426332288401255, "grad_norm": 1.2734375, "learning_rate": 0.0004310286557663282, "loss": 5.1505, "mean_token_accuracy": 0.17958650290966033, "num_tokens": 7048074.0, "step": 4010 }, { "entropy": 5.52232346534729, "epoch": 3.146551724137931, "grad_norm": 1.2265625, "learning_rate": 0.0004308117983114254, "loss": 5.0553, "mean_token_accuracy": 0.18663895428180693, "num_tokens": 7056887.0, "step": 4015 }, { "entropy": 5.435352230072022, "epoch": 3.1504702194357366, "grad_norm": 1.2734375, "learning_rate": 0.0004305946623955754, "loss": 5.0523, "mean_token_accuracy": 0.19024786949157715, "num_tokens": 7065462.0, "step": 4020 }, { "entropy": 5.471118497848511, "epoch": 3.1543887147335425, "grad_norm": 1.28125, "learning_rate": 0.00043037724840683516, "loss": 5.1474, "mean_token_accuracy": 0.17960819303989412, "num_tokens": 7074188.0, "step": 4025 }, { "entropy": 5.477698230743409, "epoch": 3.158307210031348, "grad_norm": 1.3359375, "learning_rate": 0.00043015955673375876, "loss": 5.122, "mean_token_accuracy": 0.18004380017518998, "num_tokens": 7082282.0, "step": 4030 }, { "entropy": 5.50840573310852, "epoch": 3.1622257053291536, "grad_norm": 1.234375, "learning_rate": 0.0004299415877653966, "loss": 5.1723, "mean_token_accuracy": 0.17248133569955826, "num_tokens": 7090631.0, "step": 4035 }, { "entropy": 5.473033285140991, "epoch": 3.166144200626959, "grad_norm": 1.2109375, "learning_rate": 0.0004297233418912945, "loss": 5.1086, "mean_token_accuracy": 0.17797058075666428, "num_tokens": 7099546.0, "step": 4040 }, { "entropy": 5.474239778518677, "epoch": 3.170062695924765, "grad_norm": 1.2109375, "learning_rate": 0.0004295048195014932, "loss": 5.1465, "mean_token_accuracy": 0.1752048373222351, "num_tokens": 7109423.0, "step": 4045 }, { "entropy": 5.412190675735474, "epoch": 3.1739811912225706, "grad_norm": 1.40625, "learning_rate": 0.0004292860209865277, "loss": 5.0146, "mean_token_accuracy": 0.185761658847332, "num_tokens": 7117717.0, "step": 4050 }, { "entropy": 5.462385177612305, "epoch": 3.177899686520376, "grad_norm": 1.1640625, "learning_rate": 0.0004290669467374263, "loss": 5.2407, "mean_token_accuracy": 0.1639290541410446, "num_tokens": 7126573.0, "step": 4055 }, { "entropy": 5.47260046005249, "epoch": 3.1818181818181817, "grad_norm": 1.3203125, "learning_rate": 0.00042884759714571037, "loss": 5.1361, "mean_token_accuracy": 0.17528676837682725, "num_tokens": 7134471.0, "step": 4060 }, { "entropy": 5.362086915969849, "epoch": 3.1857366771159876, "grad_norm": 1.25, "learning_rate": 0.0004286279726033932, "loss": 5.0691, "mean_token_accuracy": 0.18219161480665208, "num_tokens": 7143092.0, "step": 4065 }, { "entropy": 5.519996929168701, "epoch": 3.189655172413793, "grad_norm": 1.265625, "learning_rate": 0.00042840807350297933, "loss": 5.1474, "mean_token_accuracy": 0.17893998175859452, "num_tokens": 7152137.0, "step": 4070 }, { "entropy": 5.500849103927612, "epoch": 3.1935736677115987, "grad_norm": 1.3359375, "learning_rate": 0.00042818790023746407, "loss": 5.1012, "mean_token_accuracy": 0.17794661372900009, "num_tokens": 7160037.0, "step": 4075 }, { "entropy": 5.409151077270508, "epoch": 3.197492163009404, "grad_norm": 1.2421875, "learning_rate": 0.00042796745320033296, "loss": 5.0934, "mean_token_accuracy": 0.17724834829568864, "num_tokens": 7168354.0, "step": 4080 }, { "entropy": 5.4781488418579105, "epoch": 3.20141065830721, "grad_norm": 1.234375, "learning_rate": 0.00042774673278556043, "loss": 5.1611, "mean_token_accuracy": 0.1761411026120186, "num_tokens": 7176757.0, "step": 4085 }, { "entropy": 5.479331970214844, "epoch": 3.2053291536050157, "grad_norm": 1.21875, "learning_rate": 0.0004275257393876097, "loss": 5.1466, "mean_token_accuracy": 0.1715711236000061, "num_tokens": 7185489.0, "step": 4090 }, { "entropy": 5.523379135131836, "epoch": 3.209247648902821, "grad_norm": 1.1875, "learning_rate": 0.0004273044734014318, "loss": 5.0853, "mean_token_accuracy": 0.1794649213552475, "num_tokens": 7194431.0, "step": 4095 }, { "entropy": 5.3604803562164305, "epoch": 3.2131661442006267, "grad_norm": 1.2734375, "learning_rate": 0.00042708293522246486, "loss": 5.0738, "mean_token_accuracy": 0.17980273962020873, "num_tokens": 7203074.0, "step": 4100 }, { "entropy": 5.456855535507202, "epoch": 3.2170846394984327, "grad_norm": 1.2890625, "learning_rate": 0.0004268611252466337, "loss": 5.1036, "mean_token_accuracy": 0.17986776679754257, "num_tokens": 7211692.0, "step": 4105 }, { "entropy": 5.413537836074829, "epoch": 3.2210031347962382, "grad_norm": 1.28125, "learning_rate": 0.0004266390438703486, "loss": 5.093, "mean_token_accuracy": 0.17688279300928117, "num_tokens": 7220285.0, "step": 4110 }, { "entropy": 5.509257078170776, "epoch": 3.2249216300940438, "grad_norm": 1.2578125, "learning_rate": 0.00042641669149050493, "loss": 5.2339, "mean_token_accuracy": 0.17110467851161956, "num_tokens": 7230007.0, "step": 4115 }, { "entropy": 5.488560152053833, "epoch": 3.2288401253918497, "grad_norm": 1.25, "learning_rate": 0.0004261940685044825, "loss": 5.0905, "mean_token_accuracy": 0.17679268568754197, "num_tokens": 7238520.0, "step": 4120 }, { "entropy": 5.3976846694946286, "epoch": 3.2327586206896552, "grad_norm": 1.2265625, "learning_rate": 0.00042597117531014474, "loss": 5.0265, "mean_token_accuracy": 0.18420617133378983, "num_tokens": 7247228.0, "step": 4125 }, { "entropy": 5.532719135284424, "epoch": 3.2366771159874608, "grad_norm": 1.234375, "learning_rate": 0.0004257480123058378, "loss": 5.2695, "mean_token_accuracy": 0.1700225442647934, "num_tokens": 7255782.0, "step": 4130 }, { "entropy": 5.508197259902954, "epoch": 3.2405956112852663, "grad_norm": 1.171875, "learning_rate": 0.00042552457989039036, "loss": 5.1426, "mean_token_accuracy": 0.1676323667168617, "num_tokens": 7264946.0, "step": 4135 }, { "entropy": 5.349348306655884, "epoch": 3.2445141065830723, "grad_norm": 1.21875, "learning_rate": 0.00042530087846311213, "loss": 4.9496, "mean_token_accuracy": 0.193049119412899, "num_tokens": 7273647.0, "step": 4140 }, { "entropy": 5.459439086914062, "epoch": 3.248432601880878, "grad_norm": 1.296875, "learning_rate": 0.00042507690842379396, "loss": 5.1177, "mean_token_accuracy": 0.17937442511320115, "num_tokens": 7283393.0, "step": 4145 }, { "entropy": 5.53988881111145, "epoch": 3.2523510971786833, "grad_norm": 1.265625, "learning_rate": 0.00042485267017270664, "loss": 5.2106, "mean_token_accuracy": 0.17449098229408264, "num_tokens": 7292368.0, "step": 4150 }, { "entropy": 5.410909795761109, "epoch": 3.256269592476489, "grad_norm": 1.25, "learning_rate": 0.00042462816411060025, "loss": 5.1283, "mean_token_accuracy": 0.18129791021347047, "num_tokens": 7301505.0, "step": 4155 }, { "entropy": 5.447917032241821, "epoch": 3.260188087774295, "grad_norm": 1.171875, "learning_rate": 0.0004244033906387035, "loss": 5.0887, "mean_token_accuracy": 0.18481171876192093, "num_tokens": 7310011.0, "step": 4160 }, { "entropy": 5.489086627960205, "epoch": 3.2641065830721003, "grad_norm": 1.1484375, "learning_rate": 0.0004241783501587231, "loss": 5.132, "mean_token_accuracy": 0.18020764291286467, "num_tokens": 7319295.0, "step": 4165 }, { "entropy": 5.476145839691162, "epoch": 3.268025078369906, "grad_norm": 1.296875, "learning_rate": 0.00042395304307284284, "loss": 5.1608, "mean_token_accuracy": 0.1762930765748024, "num_tokens": 7328390.0, "step": 4170 }, { "entropy": 5.530626583099365, "epoch": 3.271943573667712, "grad_norm": 1.2578125, "learning_rate": 0.0004237274697837229, "loss": 5.1891, "mean_token_accuracy": 0.1730980709195137, "num_tokens": 7337062.0, "step": 4175 }, { "entropy": 5.411122417449951, "epoch": 3.2758620689655173, "grad_norm": 1.2109375, "learning_rate": 0.0004235016306944996, "loss": 5.1639, "mean_token_accuracy": 0.17733618319034578, "num_tokens": 7346364.0, "step": 4180 }, { "entropy": 5.473628377914428, "epoch": 3.279780564263323, "grad_norm": 1.265625, "learning_rate": 0.0004232755262087837, "loss": 5.1232, "mean_token_accuracy": 0.17577927559614182, "num_tokens": 7354896.0, "step": 4185 }, { "entropy": 5.492278003692627, "epoch": 3.2836990595611284, "grad_norm": 1.2109375, "learning_rate": 0.00042304915673066083, "loss": 5.156, "mean_token_accuracy": 0.18079642653465272, "num_tokens": 7363591.0, "step": 4190 }, { "entropy": 5.46770281791687, "epoch": 3.287617554858934, "grad_norm": 1.265625, "learning_rate": 0.00042282252266468985, "loss": 5.1541, "mean_token_accuracy": 0.1765994980931282, "num_tokens": 7372891.0, "step": 4195 }, { "entropy": 5.420037746429443, "epoch": 3.29153605015674, "grad_norm": 1.25, "learning_rate": 0.0004225956244159025, "loss": 5.0849, "mean_token_accuracy": 0.18195036351680755, "num_tokens": 7381945.0, "step": 4200 }, { "entropy": 5.56228461265564, "epoch": 3.2954545454545454, "grad_norm": 1.2421875, "learning_rate": 0.0004223684623898029, "loss": 5.2202, "mean_token_accuracy": 0.17319831550121306, "num_tokens": 7391449.0, "step": 4205 }, { "entropy": 5.467171859741211, "epoch": 3.299373040752351, "grad_norm": 1.265625, "learning_rate": 0.0004221410369923662, "loss": 5.0742, "mean_token_accuracy": 0.18290328085422516, "num_tokens": 7399762.0, "step": 4210 }, { "entropy": 5.36794114112854, "epoch": 3.303291536050157, "grad_norm": 1.2578125, "learning_rate": 0.00042191334863003873, "loss": 4.9833, "mean_token_accuracy": 0.18501525074243547, "num_tokens": 7408711.0, "step": 4215 }, { "entropy": 5.31069974899292, "epoch": 3.3072100313479624, "grad_norm": 1.2265625, "learning_rate": 0.0004216853977097363, "loss": 5.0883, "mean_token_accuracy": 0.1858847111463547, "num_tokens": 7416868.0, "step": 4220 }, { "entropy": 5.442585515975952, "epoch": 3.311128526645768, "grad_norm": 1.25, "learning_rate": 0.0004214571846388442, "loss": 5.194, "mean_token_accuracy": 0.1712857499718666, "num_tokens": 7425687.0, "step": 4225 }, { "entropy": 5.51009168624878, "epoch": 3.3150470219435735, "grad_norm": 1.2421875, "learning_rate": 0.0004212287098252164, "loss": 5.1079, "mean_token_accuracy": 0.1798792377114296, "num_tokens": 7433714.0, "step": 4230 }, { "entropy": 5.418067598342896, "epoch": 3.3189655172413794, "grad_norm": 1.1796875, "learning_rate": 0.0004209999736771742, "loss": 5.1424, "mean_token_accuracy": 0.18602844029664994, "num_tokens": 7442765.0, "step": 4235 }, { "entropy": 5.437410736083985, "epoch": 3.322884012539185, "grad_norm": 1.2734375, "learning_rate": 0.0004207709766035063, "loss": 5.1322, "mean_token_accuracy": 0.1810468316078186, "num_tokens": 7450672.0, "step": 4240 }, { "entropy": 5.498105001449585, "epoch": 3.3268025078369905, "grad_norm": 1.25, "learning_rate": 0.0004205417190134674, "loss": 5.191, "mean_token_accuracy": 0.17484226375818251, "num_tokens": 7458758.0, "step": 4245 }, { "entropy": 5.497838830947876, "epoch": 3.330721003134796, "grad_norm": 1.234375, "learning_rate": 0.0004203122013167783, "loss": 5.1422, "mean_token_accuracy": 0.173189277946949, "num_tokens": 7467527.0, "step": 4250 }, { "entropy": 5.4545204639434814, "epoch": 3.334639498432602, "grad_norm": 1.2578125, "learning_rate": 0.00042008242392362413, "loss": 5.1395, "mean_token_accuracy": 0.18133477866649628, "num_tokens": 7476659.0, "step": 4255 }, { "entropy": 5.405594730377198, "epoch": 3.3385579937304075, "grad_norm": 1.2265625, "learning_rate": 0.00041985238724465433, "loss": 5.0524, "mean_token_accuracy": 0.18593618124723435, "num_tokens": 7485813.0, "step": 4260 }, { "entropy": 5.499361944198609, "epoch": 3.342476489028213, "grad_norm": 1.1953125, "learning_rate": 0.00041962209169098193, "loss": 5.1782, "mean_token_accuracy": 0.17913033664226533, "num_tokens": 7494842.0, "step": 4265 }, { "entropy": 5.469057226181031, "epoch": 3.346394984326019, "grad_norm": 1.3125, "learning_rate": 0.0004193915376741823, "loss": 5.193, "mean_token_accuracy": 0.1746675878763199, "num_tokens": 7503633.0, "step": 4270 }, { "entropy": 5.455647945404053, "epoch": 3.3503134796238245, "grad_norm": 1.21875, "learning_rate": 0.0004191607256062928, "loss": 5.1075, "mean_token_accuracy": 0.180385085940361, "num_tokens": 7512759.0, "step": 4275 }, { "entropy": 5.456186103820801, "epoch": 3.35423197492163, "grad_norm": 1.28125, "learning_rate": 0.0004189296558998121, "loss": 5.2025, "mean_token_accuracy": 0.17833039313554763, "num_tokens": 7522284.0, "step": 4280 }, { "entropy": 5.457691764831543, "epoch": 3.3581504702194356, "grad_norm": 1.25, "learning_rate": 0.0004186983289676992, "loss": 5.116, "mean_token_accuracy": 0.18076436668634416, "num_tokens": 7531286.0, "step": 4285 }, { "entropy": 5.451044416427612, "epoch": 3.3620689655172415, "grad_norm": 1.125, "learning_rate": 0.00041846674522337296, "loss": 5.0976, "mean_token_accuracy": 0.17392572164535522, "num_tokens": 7540450.0, "step": 4290 }, { "entropy": 5.3857035636901855, "epoch": 3.365987460815047, "grad_norm": 1.2578125, "learning_rate": 0.00041823490508071076, "loss": 5.0542, "mean_token_accuracy": 0.18191471099853515, "num_tokens": 7549386.0, "step": 4295 }, { "entropy": 5.5002960681915285, "epoch": 3.3699059561128526, "grad_norm": 1.15625, "learning_rate": 0.000418002808954049, "loss": 5.1349, "mean_token_accuracy": 0.17532447278499602, "num_tokens": 7558236.0, "step": 4300 }, { "entropy": 5.335977363586426, "epoch": 3.373824451410658, "grad_norm": 1.203125, "learning_rate": 0.00041777045725818057, "loss": 5.0178, "mean_token_accuracy": 0.18048462569713591, "num_tokens": 7566984.0, "step": 4305 }, { "entropy": 5.358128404617309, "epoch": 3.377742946708464, "grad_norm": 1.2109375, "learning_rate": 0.000417537850408356, "loss": 5.1446, "mean_token_accuracy": 0.1744469001889229, "num_tokens": 7576018.0, "step": 4310 }, { "entropy": 5.489906167984008, "epoch": 3.3816614420062696, "grad_norm": 1.2578125, "learning_rate": 0.0004173049888202814, "loss": 5.15, "mean_token_accuracy": 0.18125383108854293, "num_tokens": 7585294.0, "step": 4315 }, { "entropy": 5.441188955307007, "epoch": 3.385579937304075, "grad_norm": 1.140625, "learning_rate": 0.0004170718729101179, "loss": 5.1048, "mean_token_accuracy": 0.18156641870737075, "num_tokens": 7594634.0, "step": 4320 }, { "entropy": 5.387170600891113, "epoch": 3.389498432601881, "grad_norm": 1.3046875, "learning_rate": 0.00041683850309448187, "loss": 5.1674, "mean_token_accuracy": 0.1728657752275467, "num_tokens": 7603153.0, "step": 4325 }, { "entropy": 5.456656408309937, "epoch": 3.3934169278996866, "grad_norm": 1.1875, "learning_rate": 0.00041660487979044264, "loss": 5.1014, "mean_token_accuracy": 0.1805425301194191, "num_tokens": 7612027.0, "step": 4330 }, { "entropy": 5.453139019012451, "epoch": 3.397335423197492, "grad_norm": 1.3046875, "learning_rate": 0.0004163710034155231, "loss": 5.096, "mean_token_accuracy": 0.17648906409740447, "num_tokens": 7620725.0, "step": 4335 }, { "entropy": 5.363250875473023, "epoch": 3.4012539184952977, "grad_norm": 1.1875, "learning_rate": 0.0004161368743876982, "loss": 5.0648, "mean_token_accuracy": 0.18540093004703523, "num_tokens": 7628847.0, "step": 4340 }, { "entropy": 5.432746839523316, "epoch": 3.405172413793103, "grad_norm": 1.1875, "learning_rate": 0.0004159024931253945, "loss": 5.1639, "mean_token_accuracy": 0.1767767533659935, "num_tokens": 7637668.0, "step": 4345 }, { "entropy": 5.462332487106323, "epoch": 3.409090909090909, "grad_norm": 1.15625, "learning_rate": 0.00041566786004748943, "loss": 5.2048, "mean_token_accuracy": 0.1763555735349655, "num_tokens": 7646545.0, "step": 4350 }, { "entropy": 5.489386320114136, "epoch": 3.4130094043887147, "grad_norm": 1.2890625, "learning_rate": 0.00041543297557331015, "loss": 5.1754, "mean_token_accuracy": 0.1723347634077072, "num_tokens": 7654967.0, "step": 4355 }, { "entropy": 5.40674934387207, "epoch": 3.41692789968652, "grad_norm": 1.203125, "learning_rate": 0.0004151978401226335, "loss": 5.124, "mean_token_accuracy": 0.1786526545882225, "num_tokens": 7663718.0, "step": 4360 }, { "entropy": 5.47047872543335, "epoch": 3.420846394984326, "grad_norm": 1.28125, "learning_rate": 0.00041496245411568435, "loss": 5.1677, "mean_token_accuracy": 0.17421529591083526, "num_tokens": 7672430.0, "step": 4365 }, { "entropy": 5.442583322525024, "epoch": 3.4247648902821317, "grad_norm": 1.234375, "learning_rate": 0.0004147268179731359, "loss": 5.1351, "mean_token_accuracy": 0.1748214393854141, "num_tokens": 7681270.0, "step": 4370 }, { "entropy": 5.454329872131348, "epoch": 3.4286833855799372, "grad_norm": 1.25, "learning_rate": 0.00041449093211610815, "loss": 5.1524, "mean_token_accuracy": 0.17915753722190858, "num_tokens": 7690124.0, "step": 4375 }, { "entropy": 5.512417268753052, "epoch": 3.4326018808777428, "grad_norm": 1.1875, "learning_rate": 0.00041425479696616734, "loss": 5.1595, "mean_token_accuracy": 0.1722505882382393, "num_tokens": 7699511.0, "step": 4380 }, { "entropy": 5.426167726516724, "epoch": 3.4365203761755487, "grad_norm": 1.2109375, "learning_rate": 0.0004140184129453253, "loss": 5.116, "mean_token_accuracy": 0.18224495351314546, "num_tokens": 7708796.0, "step": 4385 }, { "entropy": 5.44820647239685, "epoch": 3.4404388714733543, "grad_norm": 1.3046875, "learning_rate": 0.00041378178047603845, "loss": 5.1091, "mean_token_accuracy": 0.18672927916049958, "num_tokens": 7717446.0, "step": 4390 }, { "entropy": 5.462598896026611, "epoch": 3.44435736677116, "grad_norm": 1.2578125, "learning_rate": 0.0004135448999812074, "loss": 5.1568, "mean_token_accuracy": 0.17604882270097733, "num_tokens": 7726814.0, "step": 4395 }, { "entropy": 5.400063228607178, "epoch": 3.4482758620689653, "grad_norm": 1.1640625, "learning_rate": 0.0004133077718841763, "loss": 5.0632, "mean_token_accuracy": 0.18461630046367644, "num_tokens": 7735895.0, "step": 4400 }, { "entropy": 5.38959813117981, "epoch": 3.4521943573667713, "grad_norm": 1.3203125, "learning_rate": 0.00041307039660873113, "loss": 5.1216, "mean_token_accuracy": 0.18026788532733917, "num_tokens": 7744160.0, "step": 4405 }, { "entropy": 5.483940744400025, "epoch": 3.456112852664577, "grad_norm": 1.28125, "learning_rate": 0.0004128327745791002, "loss": 5.2246, "mean_token_accuracy": 0.1668292060494423, "num_tokens": 7752844.0, "step": 4410 }, { "entropy": 5.458703708648682, "epoch": 3.4600313479623823, "grad_norm": 1.21875, "learning_rate": 0.0004125949062199526, "loss": 5.1852, "mean_token_accuracy": 0.17616474032402038, "num_tokens": 7762327.0, "step": 4415 }, { "entropy": 5.402634143829346, "epoch": 3.4639498432601883, "grad_norm": 1.2578125, "learning_rate": 0.00041235679195639766, "loss": 5.0954, "mean_token_accuracy": 0.1764765590429306, "num_tokens": 7770813.0, "step": 4420 }, { "entropy": 5.373932027816773, "epoch": 3.467868338557994, "grad_norm": 1.25, "learning_rate": 0.00041211843221398406, "loss": 5.1366, "mean_token_accuracy": 0.1720389500260353, "num_tokens": 7779357.0, "step": 4425 }, { "entropy": 5.491668367385865, "epoch": 3.4717868338557993, "grad_norm": 1.3359375, "learning_rate": 0.0004118798274186994, "loss": 5.1477, "mean_token_accuracy": 0.17953715473413467, "num_tokens": 7787302.0, "step": 4430 }, { "entropy": 5.41265459060669, "epoch": 3.475705329153605, "grad_norm": 1.2890625, "learning_rate": 0.0004116409779969691, "loss": 5.1765, "mean_token_accuracy": 0.17347506135702134, "num_tokens": 7795651.0, "step": 4435 }, { "entropy": 5.396276617050171, "epoch": 3.479623824451411, "grad_norm": 1.40625, "learning_rate": 0.00041140188437565586, "loss": 5.1345, "mean_token_accuracy": 0.18256539553403855, "num_tokens": 7804644.0, "step": 4440 }, { "entropy": 5.536772632598877, "epoch": 3.4835423197492164, "grad_norm": 1.2578125, "learning_rate": 0.00041116254698205873, "loss": 5.1149, "mean_token_accuracy": 0.1830981507897377, "num_tokens": 7812479.0, "step": 4445 }, { "entropy": 5.399745845794678, "epoch": 3.487460815047022, "grad_norm": 1.234375, "learning_rate": 0.00041092296624391244, "loss": 5.1163, "mean_token_accuracy": 0.179618901014328, "num_tokens": 7820678.0, "step": 4450 }, { "entropy": 5.3498913764953615, "epoch": 3.4913793103448274, "grad_norm": 1.2421875, "learning_rate": 0.0004106831425893865, "loss": 5.1459, "mean_token_accuracy": 0.18171471655368804, "num_tokens": 7829063.0, "step": 4455 }, { "entropy": 5.520697450637817, "epoch": 3.4952978056426334, "grad_norm": 1.265625, "learning_rate": 0.0004104430764470849, "loss": 5.18, "mean_token_accuracy": 0.17809778749942778, "num_tokens": 7837274.0, "step": 4460 }, { "entropy": 5.494675731658935, "epoch": 3.499216300940439, "grad_norm": 1.171875, "learning_rate": 0.0004102027682460445, "loss": 5.0874, "mean_token_accuracy": 0.1875218093395233, "num_tokens": 7845379.0, "step": 4465 }, { "entropy": 5.400750637054443, "epoch": 3.5031347962382444, "grad_norm": 1.359375, "learning_rate": 0.0004099622184157353, "loss": 5.1269, "mean_token_accuracy": 0.1800209864974022, "num_tokens": 7853760.0, "step": 4470 }, { "entropy": 5.526426029205322, "epoch": 3.5070532915360504, "grad_norm": 1.203125, "learning_rate": 0.0004097214273860586, "loss": 5.183, "mean_token_accuracy": 0.17379164099693298, "num_tokens": 7862026.0, "step": 4475 }, { "entropy": 5.451271629333496, "epoch": 3.510971786833856, "grad_norm": 1.265625, "learning_rate": 0.0004094803955873471, "loss": 5.1363, "mean_token_accuracy": 0.1838302046060562, "num_tokens": 7870369.0, "step": 4480 }, { "entropy": 5.4413388729095455, "epoch": 3.5148902821316614, "grad_norm": 1.234375, "learning_rate": 0.0004092391234503638, "loss": 5.1357, "mean_token_accuracy": 0.18130728155374526, "num_tokens": 7878524.0, "step": 4485 }, { "entropy": 5.452888298034668, "epoch": 3.518808777429467, "grad_norm": 1.3046875, "learning_rate": 0.00040899761140630094, "loss": 5.1942, "mean_token_accuracy": 0.16720112562179565, "num_tokens": 7887369.0, "step": 4490 }, { "entropy": 5.4835591316223145, "epoch": 3.5227272727272725, "grad_norm": 1.21875, "learning_rate": 0.00040875585988677985, "loss": 5.1658, "mean_token_accuracy": 0.1724832221865654, "num_tokens": 7896112.0, "step": 4495 }, { "entropy": 5.361825084686279, "epoch": 3.5266457680250785, "grad_norm": 1.1796875, "learning_rate": 0.0004085138693238497, "loss": 5.0118, "mean_token_accuracy": 0.18901925683021545, "num_tokens": 7904580.0, "step": 4500 }, { "epoch": 3.5266457680250785, "eval_entropy": 5.337222361749457, "eval_loss": 5.80082368850708, "eval_mean_token_accuracy": 0.1584607647197653, "eval_num_tokens": 7904580.0, "eval_runtime": 3.0275, "eval_samples_per_second": 1361.513, "eval_steps_per_second": 170.437, "step": 4500 }, { "entropy": 5.480594873428345, "epoch": 3.530564263322884, "grad_norm": 1.234375, "learning_rate": 0.0004082716401499867, "loss": 5.1681, "mean_token_accuracy": 0.17277304977178573, "num_tokens": 7913755.0, "step": 4505 }, { "entropy": 5.470868158340454, "epoch": 3.5344827586206895, "grad_norm": 1.296875, "learning_rate": 0.00040802917279809383, "loss": 5.1477, "mean_token_accuracy": 0.17918196320533752, "num_tokens": 7922385.0, "step": 4510 }, { "entropy": 5.417375326156616, "epoch": 3.5384012539184955, "grad_norm": 1.1640625, "learning_rate": 0.00040778646770149953, "loss": 5.1731, "mean_token_accuracy": 0.16962890774011613, "num_tokens": 7931634.0, "step": 4515 }, { "entropy": 5.406470584869385, "epoch": 3.542319749216301, "grad_norm": 1.21875, "learning_rate": 0.00040754352529395716, "loss": 5.0869, "mean_token_accuracy": 0.17542838752269746, "num_tokens": 7939866.0, "step": 4520 }, { "entropy": 5.434367752075195, "epoch": 3.5462382445141065, "grad_norm": 1.125, "learning_rate": 0.00040730034600964415, "loss": 5.2075, "mean_token_accuracy": 0.17026106566190718, "num_tokens": 7948961.0, "step": 4525 }, { "entropy": 5.289756202697754, "epoch": 3.5501567398119125, "grad_norm": 1.09375, "learning_rate": 0.0004070569302831613, "loss": 5.0367, "mean_token_accuracy": 0.17900677770376205, "num_tokens": 7958526.0, "step": 4530 }, { "entropy": 5.443406343460083, "epoch": 3.554075235109718, "grad_norm": 1.2265625, "learning_rate": 0.000406813278549532, "loss": 5.1621, "mean_token_accuracy": 0.17983901798725127, "num_tokens": 7966887.0, "step": 4535 }, { "entropy": 5.476217126846313, "epoch": 3.5579937304075235, "grad_norm": 1.359375, "learning_rate": 0.00040656939124420144, "loss": 5.1251, "mean_token_accuracy": 0.17843578457832338, "num_tokens": 7974929.0, "step": 4540 }, { "entropy": 5.374083375930786, "epoch": 3.561912225705329, "grad_norm": 1.2734375, "learning_rate": 0.0004063252688030358, "loss": 5.0863, "mean_token_accuracy": 0.18091104477643966, "num_tokens": 7983821.0, "step": 4545 }, { "entropy": 5.38077392578125, "epoch": 3.5658307210031346, "grad_norm": 1.1875, "learning_rate": 0.0004060809116623213, "loss": 5.1152, "mean_token_accuracy": 0.17550334483385086, "num_tokens": 7992109.0, "step": 4550 }, { "entropy": 5.365566873550415, "epoch": 3.5697492163009406, "grad_norm": 1.171875, "learning_rate": 0.000405836320258764, "loss": 5.1301, "mean_token_accuracy": 0.1887578547000885, "num_tokens": 8001098.0, "step": 4555 }, { "entropy": 5.484101867675781, "epoch": 3.573667711598746, "grad_norm": 1.2265625, "learning_rate": 0.0004055914950294882, "loss": 5.1373, "mean_token_accuracy": 0.17732277810573577, "num_tokens": 8009559.0, "step": 4560 }, { "entropy": 5.372460269927979, "epoch": 3.5775862068965516, "grad_norm": 1.203125, "learning_rate": 0.00040534643641203645, "loss": 5.059, "mean_token_accuracy": 0.18330815881490709, "num_tokens": 8018021.0, "step": 4565 }, { "entropy": 5.381331825256348, "epoch": 3.5815047021943576, "grad_norm": 1.3125, "learning_rate": 0.0004051011448443681, "loss": 5.0922, "mean_token_accuracy": 0.17764217853546144, "num_tokens": 8026258.0, "step": 4570 }, { "entropy": 5.434953689575195, "epoch": 3.585423197492163, "grad_norm": 1.4453125, "learning_rate": 0.000404855620764859, "loss": 5.1622, "mean_token_accuracy": 0.18100216686725618, "num_tokens": 8036761.0, "step": 4575 }, { "entropy": 5.496177244186401, "epoch": 3.5893416927899686, "grad_norm": 1.3203125, "learning_rate": 0.0004046098646123006, "loss": 5.1788, "mean_token_accuracy": 0.1807200565934181, "num_tokens": 8045115.0, "step": 4580 }, { "entropy": 5.508572769165039, "epoch": 3.593260188087774, "grad_norm": 1.328125, "learning_rate": 0.00040436387682589876, "loss": 5.1826, "mean_token_accuracy": 0.1770282730460167, "num_tokens": 8054334.0, "step": 4585 }, { "entropy": 5.375803804397583, "epoch": 3.5971786833855797, "grad_norm": 1.1796875, "learning_rate": 0.0004041176578452737, "loss": 5.1423, "mean_token_accuracy": 0.1803617998957634, "num_tokens": 8063165.0, "step": 4590 }, { "entropy": 5.411370325088501, "epoch": 3.6010971786833856, "grad_norm": 1.21875, "learning_rate": 0.0004038712081104587, "loss": 5.1227, "mean_token_accuracy": 0.17998642325401307, "num_tokens": 8072498.0, "step": 4595 }, { "entropy": 5.437928962707519, "epoch": 3.605015673981191, "grad_norm": 1.2265625, "learning_rate": 0.00040362452806189927, "loss": 5.1442, "mean_token_accuracy": 0.17676037400960923, "num_tokens": 8081455.0, "step": 4600 }, { "entropy": 5.356879949569702, "epoch": 3.6089341692789967, "grad_norm": 1.25, "learning_rate": 0.0004033776181404527, "loss": 5.1483, "mean_token_accuracy": 0.17423220127820968, "num_tokens": 8090992.0, "step": 4605 }, { "entropy": 5.477758693695068, "epoch": 3.6128526645768027, "grad_norm": 1.15625, "learning_rate": 0.00040313047878738704, "loss": 5.1429, "mean_token_accuracy": 0.17423719316720962, "num_tokens": 8099976.0, "step": 4610 }, { "entropy": 5.43692717552185, "epoch": 3.616771159874608, "grad_norm": 1.2578125, "learning_rate": 0.0004028831104443805, "loss": 5.0531, "mean_token_accuracy": 0.19072302281856537, "num_tokens": 8108275.0, "step": 4615 }, { "entropy": 5.37940616607666, "epoch": 3.6206896551724137, "grad_norm": 1.3125, "learning_rate": 0.0004026355135535202, "loss": 5.1237, "mean_token_accuracy": 0.18099311292171477, "num_tokens": 8117441.0, "step": 4620 }, { "entropy": 5.383903551101684, "epoch": 3.6246081504702197, "grad_norm": 1.296875, "learning_rate": 0.00040238768855730214, "loss": 5.1077, "mean_token_accuracy": 0.17485350966453553, "num_tokens": 8125709.0, "step": 4625 }, { "entropy": 5.4046632766723635, "epoch": 3.628526645768025, "grad_norm": 1.203125, "learning_rate": 0.00040213963589862963, "loss": 5.1292, "mean_token_accuracy": 0.1794295147061348, "num_tokens": 8134418.0, "step": 4630 }, { "entropy": 5.482091903686523, "epoch": 3.6324451410658307, "grad_norm": 1.203125, "learning_rate": 0.0004018913560208131, "loss": 5.1953, "mean_token_accuracy": 0.17151551097631454, "num_tokens": 8143729.0, "step": 4635 }, { "entropy": 5.426027917861939, "epoch": 3.6363636363636362, "grad_norm": 1.1640625, "learning_rate": 0.0004016428493675689, "loss": 5.0717, "mean_token_accuracy": 0.1761728420853615, "num_tokens": 8152613.0, "step": 4640 }, { "entropy": 5.364865112304687, "epoch": 3.6402821316614418, "grad_norm": 1.1640625, "learning_rate": 0.0004013941163830187, "loss": 5.0649, "mean_token_accuracy": 0.18139948844909667, "num_tokens": 8160862.0, "step": 4645 }, { "entropy": 5.3869531631469725, "epoch": 3.6442006269592477, "grad_norm": 1.2421875, "learning_rate": 0.0004011451575116887, "loss": 5.1554, "mean_token_accuracy": 0.1723189875483513, "num_tokens": 8169369.0, "step": 4650 }, { "entropy": 5.4002196311950685, "epoch": 3.6481191222570533, "grad_norm": 1.2265625, "learning_rate": 0.0004008959731985087, "loss": 5.0635, "mean_token_accuracy": 0.18389806002378464, "num_tokens": 8178152.0, "step": 4655 }, { "entropy": 5.342556095123291, "epoch": 3.652037617554859, "grad_norm": 1.2578125, "learning_rate": 0.00040064656388881157, "loss": 5.0368, "mean_token_accuracy": 0.18386815786361693, "num_tokens": 8186690.0, "step": 4660 }, { "entropy": 5.345205497741699, "epoch": 3.6559561128526648, "grad_norm": 1.203125, "learning_rate": 0.0004003969300283321, "loss": 5.0211, "mean_token_accuracy": 0.1854146108031273, "num_tokens": 8194976.0, "step": 4665 }, { "entropy": 5.428677892684936, "epoch": 3.6598746081504703, "grad_norm": 1.25, "learning_rate": 0.00040014707206320653, "loss": 5.1304, "mean_token_accuracy": 0.1809231385588646, "num_tokens": 8202964.0, "step": 4670 }, { "entropy": 5.375498723983765, "epoch": 3.663793103448276, "grad_norm": 1.1953125, "learning_rate": 0.00039989699043997153, "loss": 5.1371, "mean_token_accuracy": 0.1780979588627815, "num_tokens": 8213084.0, "step": 4675 }, { "entropy": 5.407328462600708, "epoch": 3.6677115987460818, "grad_norm": 1.2890625, "learning_rate": 0.00039964668560556356, "loss": 5.0542, "mean_token_accuracy": 0.18738225400447844, "num_tokens": 8221645.0, "step": 4680 }, { "entropy": 5.374656867980957, "epoch": 3.6716300940438873, "grad_norm": 1.34375, "learning_rate": 0.00039939615800731784, "loss": 5.0422, "mean_token_accuracy": 0.18308537155389787, "num_tokens": 8230195.0, "step": 4685 }, { "entropy": 5.379098272323608, "epoch": 3.675548589341693, "grad_norm": 1.234375, "learning_rate": 0.00039914540809296795, "loss": 5.1367, "mean_token_accuracy": 0.17565943598747252, "num_tokens": 8239608.0, "step": 4690 }, { "entropy": 5.494941234588623, "epoch": 3.6794670846394983, "grad_norm": 1.1796875, "learning_rate": 0.0003988944363106445, "loss": 5.1561, "mean_token_accuracy": 0.18333661705255508, "num_tokens": 8247993.0, "step": 4695 }, { "entropy": 5.368853759765625, "epoch": 3.683385579937304, "grad_norm": 1.296875, "learning_rate": 0.0003986432431088749, "loss": 5.143, "mean_token_accuracy": 0.1748058944940567, "num_tokens": 8257279.0, "step": 4700 }, { "entropy": 5.482876777648926, "epoch": 3.68730407523511, "grad_norm": 1.3671875, "learning_rate": 0.000398391828936582, "loss": 5.1776, "mean_token_accuracy": 0.1722080945968628, "num_tokens": 8265166.0, "step": 4705 }, { "entropy": 5.340915870666504, "epoch": 3.6912225705329154, "grad_norm": 1.265625, "learning_rate": 0.0003981401942430838, "loss": 4.9655, "mean_token_accuracy": 0.18662159740924836, "num_tokens": 8273281.0, "step": 4710 }, { "entropy": 5.375476741790772, "epoch": 3.695141065830721, "grad_norm": 1.2421875, "learning_rate": 0.00039788833947809217, "loss": 5.1727, "mean_token_accuracy": 0.17165548503398895, "num_tokens": 8282432.0, "step": 4715 }, { "entropy": 5.347471475601196, "epoch": 3.699059561128527, "grad_norm": 1.2890625, "learning_rate": 0.0003976362650917125, "loss": 4.9954, "mean_token_accuracy": 0.19145373702049256, "num_tokens": 8290909.0, "step": 4720 }, { "entropy": 5.368044900894165, "epoch": 3.7029780564263324, "grad_norm": 1.28125, "learning_rate": 0.00039738397153444264, "loss": 5.1106, "mean_token_accuracy": 0.18162070959806442, "num_tokens": 8299581.0, "step": 4725 }, { "entropy": 5.46605544090271, "epoch": 3.706896551724138, "grad_norm": 1.2109375, "learning_rate": 0.0003971314592571719, "loss": 5.1308, "mean_token_accuracy": 0.1756805568933487, "num_tokens": 8308184.0, "step": 4730 }, { "entropy": 5.377258920669556, "epoch": 3.7108150470219434, "grad_norm": 1.28125, "learning_rate": 0.0003968787287111809, "loss": 5.1026, "mean_token_accuracy": 0.18265192806720734, "num_tokens": 8317054.0, "step": 4735 }, { "entropy": 5.343409299850464, "epoch": 3.714733542319749, "grad_norm": 1.25, "learning_rate": 0.00039662578034814, "loss": 5.1292, "mean_token_accuracy": 0.18576941788196563, "num_tokens": 8325961.0, "step": 4740 }, { "entropy": 5.552810716629028, "epoch": 3.718652037617555, "grad_norm": 1.2578125, "learning_rate": 0.00039637261462010886, "loss": 5.2267, "mean_token_accuracy": 0.17132796049118043, "num_tokens": 8334401.0, "step": 4745 }, { "entropy": 5.4945728302001955, "epoch": 3.7225705329153604, "grad_norm": 1.2734375, "learning_rate": 0.0003961192319795358, "loss": 5.1954, "mean_token_accuracy": 0.1732421785593033, "num_tokens": 8343786.0, "step": 4750 }, { "entropy": 5.4864644527435305, "epoch": 3.726489028213166, "grad_norm": 1.3125, "learning_rate": 0.0003958656328792565, "loss": 5.17, "mean_token_accuracy": 0.1812288358807564, "num_tokens": 8352286.0, "step": 4755 }, { "entropy": 5.339477300643921, "epoch": 3.730407523510972, "grad_norm": 1.1953125, "learning_rate": 0.00039561181777249396, "loss": 5.0962, "mean_token_accuracy": 0.18505542427301408, "num_tokens": 8361274.0, "step": 4760 }, { "entropy": 5.387203741073608, "epoch": 3.7343260188087775, "grad_norm": 1.21875, "learning_rate": 0.00039535778711285676, "loss": 5.1225, "mean_token_accuracy": 0.18194636851549148, "num_tokens": 8369881.0, "step": 4765 }, { "entropy": 5.474074840545654, "epoch": 3.738244514106583, "grad_norm": 1.296875, "learning_rate": 0.0003951035413543388, "loss": 5.1981, "mean_token_accuracy": 0.16484691351652145, "num_tokens": 8378234.0, "step": 4770 }, { "entropy": 5.408027410507202, "epoch": 3.742163009404389, "grad_norm": 1.2578125, "learning_rate": 0.00039484908095131874, "loss": 5.1144, "mean_token_accuracy": 0.18274004459381105, "num_tokens": 8387023.0, "step": 4775 }, { "entropy": 5.36602144241333, "epoch": 3.7460815047021945, "grad_norm": 1.2109375, "learning_rate": 0.0003945944063585582, "loss": 5.0755, "mean_token_accuracy": 0.1848648577928543, "num_tokens": 8396110.0, "step": 4780 }, { "entropy": 5.341309881210327, "epoch": 3.75, "grad_norm": 1.140625, "learning_rate": 0.00039433951803120225, "loss": 5.0402, "mean_token_accuracy": 0.1882338985800743, "num_tokens": 8405219.0, "step": 4785 }, { "entropy": 5.499637079238892, "epoch": 3.7539184952978055, "grad_norm": 1.234375, "learning_rate": 0.00039408441642477764, "loss": 5.2427, "mean_token_accuracy": 0.1696087598800659, "num_tokens": 8413996.0, "step": 4790 }, { "entropy": 5.383800268173218, "epoch": 3.757836990595611, "grad_norm": 1.2265625, "learning_rate": 0.0003938291019951922, "loss": 4.9861, "mean_token_accuracy": 0.19548303335905076, "num_tokens": 8421931.0, "step": 4795 }, { "entropy": 5.373687696456909, "epoch": 3.761755485893417, "grad_norm": 1.2578125, "learning_rate": 0.0003935735751987344, "loss": 5.0839, "mean_token_accuracy": 0.1746331587433815, "num_tokens": 8430090.0, "step": 4800 }, { "entropy": 5.327823877334595, "epoch": 3.7656739811912225, "grad_norm": 1.2421875, "learning_rate": 0.00039331783649207175, "loss": 5.0766, "mean_token_accuracy": 0.17887697219848633, "num_tokens": 8439311.0, "step": 4805 }, { "entropy": 5.389744234085083, "epoch": 3.769592476489028, "grad_norm": 1.2109375, "learning_rate": 0.00039306188633225097, "loss": 5.1405, "mean_token_accuracy": 0.1806069329380989, "num_tokens": 8448952.0, "step": 4810 }, { "entropy": 5.394601345062256, "epoch": 3.773510971786834, "grad_norm": 1.2734375, "learning_rate": 0.0003928057251766965, "loss": 5.0759, "mean_token_accuracy": 0.1838828906416893, "num_tokens": 8457177.0, "step": 4815 }, { "entropy": 5.355936098098755, "epoch": 3.7774294670846396, "grad_norm": 1.296875, "learning_rate": 0.00039254935348320984, "loss": 5.1241, "mean_token_accuracy": 0.18029553443193436, "num_tokens": 8465844.0, "step": 4820 }, { "entropy": 5.447889137268066, "epoch": 3.781347962382445, "grad_norm": 1.3125, "learning_rate": 0.00039229277170996885, "loss": 5.2206, "mean_token_accuracy": 0.17226959466934205, "num_tokens": 8475198.0, "step": 4825 }, { "entropy": 5.474216604232788, "epoch": 3.785266457680251, "grad_norm": 1.25, "learning_rate": 0.0003920359803155266, "loss": 5.0949, "mean_token_accuracy": 0.182598714530468, "num_tokens": 8484050.0, "step": 4830 }, { "entropy": 5.325715017318726, "epoch": 3.7891849529780566, "grad_norm": 1.296875, "learning_rate": 0.00039177897975881115, "loss": 5.0895, "mean_token_accuracy": 0.1856340140104294, "num_tokens": 8491825.0, "step": 4835 }, { "entropy": 5.339238119125366, "epoch": 3.793103448275862, "grad_norm": 1.1640625, "learning_rate": 0.0003915217704991239, "loss": 5.0818, "mean_token_accuracy": 0.18506858348846436, "num_tokens": 8500651.0, "step": 4840 }, { "entropy": 5.439371633529663, "epoch": 3.7970219435736676, "grad_norm": 1.2734375, "learning_rate": 0.0003912643529961397, "loss": 5.0825, "mean_token_accuracy": 0.1822042629122734, "num_tokens": 8508464.0, "step": 4845 }, { "entropy": 5.415101051330566, "epoch": 3.800940438871473, "grad_norm": 1.234375, "learning_rate": 0.0003910067277099053, "loss": 5.1316, "mean_token_accuracy": 0.18691204339265824, "num_tokens": 8517676.0, "step": 4850 }, { "entropy": 5.458886194229126, "epoch": 3.804858934169279, "grad_norm": 1.1875, "learning_rate": 0.00039074889510083894, "loss": 5.1881, "mean_token_accuracy": 0.1722204566001892, "num_tokens": 8526926.0, "step": 4855 }, { "entropy": 5.37487382888794, "epoch": 3.8087774294670846, "grad_norm": 1.296875, "learning_rate": 0.0003904908556297293, "loss": 5.0596, "mean_token_accuracy": 0.1804742068052292, "num_tokens": 8535814.0, "step": 4860 }, { "entropy": 5.458344507217407, "epoch": 3.81269592476489, "grad_norm": 1.2578125, "learning_rate": 0.0003902326097577345, "loss": 5.1719, "mean_token_accuracy": 0.17721525430679322, "num_tokens": 8544814.0, "step": 4865 }, { "entropy": 5.417445516586303, "epoch": 3.816614420062696, "grad_norm": 1.21875, "learning_rate": 0.00038997415794638206, "loss": 5.1667, "mean_token_accuracy": 0.17812894880771638, "num_tokens": 8553455.0, "step": 4870 }, { "entropy": 5.3424866676330565, "epoch": 3.8205329153605017, "grad_norm": 1.265625, "learning_rate": 0.0003897155006575672, "loss": 5.0781, "mean_token_accuracy": 0.1781342074275017, "num_tokens": 8562048.0, "step": 4875 }, { "entropy": 5.407690954208374, "epoch": 3.824451410658307, "grad_norm": 1.2265625, "learning_rate": 0.00038945663835355247, "loss": 5.1219, "mean_token_accuracy": 0.1807398185133934, "num_tokens": 8571855.0, "step": 4880 }, { "entropy": 5.394909906387329, "epoch": 3.8283699059561127, "grad_norm": 1.1953125, "learning_rate": 0.00038919757149696665, "loss": 5.1129, "mean_token_accuracy": 0.17730758637189864, "num_tokens": 8580425.0, "step": 4885 }, { "entropy": 5.411207771301269, "epoch": 3.8322884012539182, "grad_norm": 1.1640625, "learning_rate": 0.00038893830055080437, "loss": 5.1428, "mean_token_accuracy": 0.18344798386096955, "num_tokens": 8589751.0, "step": 4890 }, { "entropy": 5.435813570022583, "epoch": 3.836206896551724, "grad_norm": 1.28125, "learning_rate": 0.0003886788259784248, "loss": 5.1656, "mean_token_accuracy": 0.17248253524303436, "num_tokens": 8598077.0, "step": 4895 }, { "entropy": 5.494762659072876, "epoch": 3.8401253918495297, "grad_norm": 1.28125, "learning_rate": 0.00038841914824355093, "loss": 5.1758, "mean_token_accuracy": 0.17547922879457473, "num_tokens": 8606691.0, "step": 4900 }, { "entropy": 5.3925032138824465, "epoch": 3.8440438871473352, "grad_norm": 1.203125, "learning_rate": 0.00038815926781026914, "loss": 5.1202, "mean_token_accuracy": 0.17767349481582642, "num_tokens": 8615477.0, "step": 4905 }, { "entropy": 5.366964483261109, "epoch": 3.847962382445141, "grad_norm": 1.2265625, "learning_rate": 0.0003878991851430279, "loss": 5.1388, "mean_token_accuracy": 0.18286672681570054, "num_tokens": 8623982.0, "step": 4910 }, { "entropy": 5.405296802520752, "epoch": 3.8518808777429467, "grad_norm": 1.25, "learning_rate": 0.0003876389007066371, "loss": 5.1606, "mean_token_accuracy": 0.17795273512601853, "num_tokens": 8633017.0, "step": 4915 }, { "entropy": 5.375383281707764, "epoch": 3.8557993730407523, "grad_norm": 1.1875, "learning_rate": 0.0003873784149662672, "loss": 5.1049, "mean_token_accuracy": 0.18141486793756484, "num_tokens": 8642299.0, "step": 4920 }, { "entropy": 5.348470163345337, "epoch": 3.8597178683385582, "grad_norm": 1.1875, "learning_rate": 0.0003871177283874484, "loss": 5.0299, "mean_token_accuracy": 0.18119459301233293, "num_tokens": 8651008.0, "step": 4925 }, { "entropy": 5.368545627593994, "epoch": 3.8636363636363638, "grad_norm": 1.1875, "learning_rate": 0.00038685684143606995, "loss": 5.123, "mean_token_accuracy": 0.17978052347898482, "num_tokens": 8660103.0, "step": 4930 }, { "entropy": 5.316098833084107, "epoch": 3.8675548589341693, "grad_norm": 1.3203125, "learning_rate": 0.0003865957545783791, "loss": 5.076, "mean_token_accuracy": 0.1843046337366104, "num_tokens": 8668646.0, "step": 4935 }, { "entropy": 5.400100469589233, "epoch": 3.871473354231975, "grad_norm": 1.1796875, "learning_rate": 0.00038633446828098046, "loss": 5.2193, "mean_token_accuracy": 0.17444987893104552, "num_tokens": 8678190.0, "step": 4940 }, { "entropy": 5.516893720626831, "epoch": 3.8753918495297803, "grad_norm": 1.203125, "learning_rate": 0.000386072983010835, "loss": 5.2352, "mean_token_accuracy": 0.1684794768691063, "num_tokens": 8687336.0, "step": 4945 }, { "entropy": 5.461493587493896, "epoch": 3.8793103448275863, "grad_norm": 1.1640625, "learning_rate": 0.00038581129923525914, "loss": 5.1245, "mean_token_accuracy": 0.18254335820674897, "num_tokens": 8695939.0, "step": 4950 }, { "entropy": 5.378901958465576, "epoch": 3.883228840125392, "grad_norm": 1.2265625, "learning_rate": 0.00038554941742192445, "loss": 5.166, "mean_token_accuracy": 0.18020853847265245, "num_tokens": 8704828.0, "step": 4955 }, { "entropy": 5.441786289215088, "epoch": 3.8871473354231973, "grad_norm": 1.1796875, "learning_rate": 0.0003852873380388561, "loss": 5.0914, "mean_token_accuracy": 0.19002858847379683, "num_tokens": 8714427.0, "step": 4960 }, { "entropy": 5.476487874984741, "epoch": 3.8910658307210033, "grad_norm": 1.171875, "learning_rate": 0.0003850250615544323, "loss": 5.1655, "mean_token_accuracy": 0.18157008439302444, "num_tokens": 8723737.0, "step": 4965 }, { "entropy": 5.34540548324585, "epoch": 3.894984326018809, "grad_norm": 1.34375, "learning_rate": 0.00038476258843738386, "loss": 5.1234, "mean_token_accuracy": 0.18455416560173035, "num_tokens": 8733149.0, "step": 4970 }, { "entropy": 5.338756895065307, "epoch": 3.8989028213166144, "grad_norm": 1.2421875, "learning_rate": 0.00038449991915679273, "loss": 5.0862, "mean_token_accuracy": 0.18745338916778564, "num_tokens": 8742000.0, "step": 4975 }, { "entropy": 5.400898027420044, "epoch": 3.9028213166144203, "grad_norm": 1.125, "learning_rate": 0.0003842370541820915, "loss": 5.1547, "mean_token_accuracy": 0.1786161720752716, "num_tokens": 8751069.0, "step": 4980 }, { "entropy": 5.414169979095459, "epoch": 3.906739811912226, "grad_norm": 1.1484375, "learning_rate": 0.00038397399398306243, "loss": 5.0647, "mean_token_accuracy": 0.18002667427062988, "num_tokens": 8760092.0, "step": 4985 }, { "entropy": 5.392027568817139, "epoch": 3.9106583072100314, "grad_norm": 1.2421875, "learning_rate": 0.00038371073902983684, "loss": 5.0915, "mean_token_accuracy": 0.18434868305921553, "num_tokens": 8768039.0, "step": 4990 }, { "entropy": 5.399258708953857, "epoch": 3.914576802507837, "grad_norm": 1.28125, "learning_rate": 0.000383447289792894, "loss": 5.1307, "mean_token_accuracy": 0.17574300169944762, "num_tokens": 8776630.0, "step": 4995 }, { "entropy": 5.417212390899659, "epoch": 3.9184952978056424, "grad_norm": 1.1875, "learning_rate": 0.00038318364674306036, "loss": 5.081, "mean_token_accuracy": 0.18248913884162904, "num_tokens": 8785272.0, "step": 5000 }, { "epoch": 3.9184952978056424, "eval_entropy": 5.3187168975209085, "eval_loss": 5.745798110961914, "eval_mean_token_accuracy": 0.16200539206927136, "eval_num_tokens": 8785272.0, "eval_runtime": 2.843, "eval_samples_per_second": 1449.89, "eval_steps_per_second": 181.5, "step": 5000 } ], "logging_steps": 5, "max_steps": 12750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1846248909056e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }