{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.4582132564841497, "eval_steps": 3000, "global_step": 36000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 4.81198468208313, "epoch": 0.0004803073967339097, "grad_norm": 15.3125, "learning_rate": 2e-06, "loss": 14.3995, "mean_token_accuracy": 0.0, "num_tokens": 10855.0, "step": 5 }, { "entropy": 4.828950214385986, "epoch": 0.0009606147934678194, "grad_norm": 16.0, "learning_rate": 4.5e-06, "loss": 14.4568, "mean_token_accuracy": 6.361323175951838e-05, "num_tokens": 24110.0, "step": 10 }, { "entropy": 4.885565328598022, "epoch": 0.001440922190201729, "grad_norm": 18.375, "learning_rate": 7e-06, "loss": 14.1468, "mean_token_accuracy": 0.0, "num_tokens": 35984.0, "step": 15 }, { "entropy": 5.113980484008789, "epoch": 0.0019212295869356388, "grad_norm": 25.5, "learning_rate": 9.5e-06, "loss": 13.5274, "mean_token_accuracy": 0.0, "num_tokens": 48152.0, "step": 20 }, { "entropy": 7.0846137523651125, "epoch": 0.0024015369836695487, "grad_norm": 18.875, "learning_rate": 1.2e-05, "loss": 11.983, "mean_token_accuracy": 5.9031875571236016e-05, "num_tokens": 59810.0, "step": 25 }, { "entropy": 10.311653995513916, "epoch": 0.002881844380403458, "grad_norm": 3.25, "learning_rate": 1.4500000000000002e-05, "loss": 10.8966, "mean_token_accuracy": 0.0035814862465485932, "num_tokens": 70852.0, "step": 30 }, { "entropy": 10.698549842834472, "epoch": 0.0033621517771373678, "grad_norm": 3.453125, "learning_rate": 1.7000000000000003e-05, "loss": 10.681, "mean_token_accuracy": 0.012990868836641311, "num_tokens": 83378.0, "step": 35 }, { "entropy": 10.70135440826416, "epoch": 0.0038424591738712775, "grad_norm": 2.890625, "learning_rate": 1.95e-05, "loss": 10.3702, "mean_token_accuracy": 0.015855902433395387, "num_tokens": 95505.0, "step": 40 }, { "entropy": 10.669420051574708, "epoch": 0.004322766570605188, "grad_norm": 2.609375, "learning_rate": 2.2e-05, "loss": 10.0399, "mean_token_accuracy": 0.019150405284017326, "num_tokens": 106812.0, "step": 45 }, { "entropy": 10.626140022277832, "epoch": 0.004803073967339097, "grad_norm": 2.171875, "learning_rate": 2.4500000000000003e-05, "loss": 9.8531, "mean_token_accuracy": 0.030371082201600074, "num_tokens": 118572.0, "step": 50 }, { "entropy": 10.630718421936034, "epoch": 0.005283381364073006, "grad_norm": 2.140625, "learning_rate": 2.7e-05, "loss": 9.7085, "mean_token_accuracy": 0.02918087989091873, "num_tokens": 130051.0, "step": 55 }, { "entropy": 10.632691478729248, "epoch": 0.005763688760806916, "grad_norm": 2.109375, "learning_rate": 2.95e-05, "loss": 9.6316, "mean_token_accuracy": 0.033551334962248804, "num_tokens": 141920.0, "step": 60 }, { "entropy": 10.621756076812744, "epoch": 0.006243996157540826, "grad_norm": 1.953125, "learning_rate": 3.2e-05, "loss": 9.4968, "mean_token_accuracy": 0.03377603869885206, "num_tokens": 152706.0, "step": 65 }, { "entropy": 10.59926996231079, "epoch": 0.0067243035542747355, "grad_norm": 2.0, "learning_rate": 3.4500000000000005e-05, "loss": 9.4671, "mean_token_accuracy": 0.030284658074378967, "num_tokens": 165253.0, "step": 70 }, { "entropy": 10.586241340637207, "epoch": 0.007204610951008645, "grad_norm": 1.9921875, "learning_rate": 3.7e-05, "loss": 9.3528, "mean_token_accuracy": 0.03066213186830282, "num_tokens": 176708.0, "step": 75 }, { "entropy": 10.572576808929444, "epoch": 0.007684918347742555, "grad_norm": 1.9921875, "learning_rate": 3.95e-05, "loss": 9.3119, "mean_token_accuracy": 0.02979854876175523, "num_tokens": 188240.0, "step": 80 }, { "entropy": 10.554954528808594, "epoch": 0.008165225744476465, "grad_norm": 1.96875, "learning_rate": 4.2000000000000004e-05, "loss": 9.1145, "mean_token_accuracy": 0.03125303704291582, "num_tokens": 198355.0, "step": 85 }, { "entropy": 10.53057928085327, "epoch": 0.008645533141210375, "grad_norm": 1.8515625, "learning_rate": 4.45e-05, "loss": 9.0646, "mean_token_accuracy": 0.02982727512717247, "num_tokens": 209497.0, "step": 90 }, { "entropy": 10.494773197174073, "epoch": 0.009125840537944284, "grad_norm": 1.9609375, "learning_rate": 4.7000000000000004e-05, "loss": 8.9936, "mean_token_accuracy": 0.02780488096177578, "num_tokens": 220859.0, "step": 95 }, { "entropy": 10.448780918121338, "epoch": 0.009606147934678195, "grad_norm": 1.78125, "learning_rate": 4.9500000000000004e-05, "loss": 8.9232, "mean_token_accuracy": 0.030998879671096803, "num_tokens": 231550.0, "step": 100 }, { "entropy": 10.376792049407959, "epoch": 0.010086455331412104, "grad_norm": 1.65625, "learning_rate": 5.2e-05, "loss": 8.7452, "mean_token_accuracy": 0.030790003202855586, "num_tokens": 244210.0, "step": 105 }, { "entropy": 10.282748031616212, "epoch": 0.010566762728146013, "grad_norm": 1.6953125, "learning_rate": 5.45e-05, "loss": 8.6175, "mean_token_accuracy": 0.040817446634173395, "num_tokens": 255745.0, "step": 110 }, { "entropy": 10.166150856018067, "epoch": 0.011047070124879923, "grad_norm": 1.4609375, "learning_rate": 5.7e-05, "loss": 8.5074, "mean_token_accuracy": 0.0365377115085721, "num_tokens": 266180.0, "step": 115 }, { "entropy": 10.028709888458252, "epoch": 0.011527377521613832, "grad_norm": 1.4140625, "learning_rate": 5.9499999999999996e-05, "loss": 8.3681, "mean_token_accuracy": 0.03765994198620319, "num_tokens": 277736.0, "step": 120 }, { "entropy": 9.827960968017578, "epoch": 0.012007684918347743, "grad_norm": 1.2734375, "learning_rate": 6.2e-05, "loss": 8.2429, "mean_token_accuracy": 0.035723325610160825, "num_tokens": 289069.0, "step": 125 }, { "entropy": 9.59237585067749, "epoch": 0.012487992315081652, "grad_norm": 1.1796875, "learning_rate": 6.450000000000001e-05, "loss": 8.0891, "mean_token_accuracy": 0.04738196656107903, "num_tokens": 300240.0, "step": 130 }, { "entropy": 9.368733978271484, "epoch": 0.012968299711815562, "grad_norm": 1.09375, "learning_rate": 6.7e-05, "loss": 8.0332, "mean_token_accuracy": 0.04018798861652613, "num_tokens": 311698.0, "step": 135 }, { "entropy": 9.110132884979247, "epoch": 0.013448607108549471, "grad_norm": 0.95703125, "learning_rate": 6.950000000000001e-05, "loss": 7.9056, "mean_token_accuracy": 0.0432288508862257, "num_tokens": 322844.0, "step": 140 }, { "entropy": 8.820003223419189, "epoch": 0.013928914505283382, "grad_norm": 0.98046875, "learning_rate": 7.2e-05, "loss": 7.8235, "mean_token_accuracy": 0.045638217404484746, "num_tokens": 335092.0, "step": 145 }, { "entropy": 8.585826587677001, "epoch": 0.01440922190201729, "grad_norm": 0.8359375, "learning_rate": 7.45e-05, "loss": 7.7332, "mean_token_accuracy": 0.04667803719639778, "num_tokens": 347033.0, "step": 150 }, { "entropy": 8.385289859771728, "epoch": 0.014889529298751201, "grad_norm": 0.9921875, "learning_rate": 7.7e-05, "loss": 7.6524, "mean_token_accuracy": 0.05755673125386238, "num_tokens": 358696.0, "step": 155 }, { "entropy": 8.231111812591553, "epoch": 0.01536983669548511, "grad_norm": 0.875, "learning_rate": 7.950000000000001e-05, "loss": 7.6369, "mean_token_accuracy": 0.05747554413974285, "num_tokens": 369390.0, "step": 160 }, { "entropy": 8.13049030303955, "epoch": 0.01585014409221902, "grad_norm": 0.921875, "learning_rate": 8.2e-05, "loss": 7.573, "mean_token_accuracy": 0.058345531672239305, "num_tokens": 380540.0, "step": 165 }, { "entropy": 8.037137985229492, "epoch": 0.01633045148895293, "grad_norm": 1.4375, "learning_rate": 8.450000000000001e-05, "loss": 7.5672, "mean_token_accuracy": 0.05862935781478882, "num_tokens": 391243.0, "step": 170 }, { "entropy": 7.971378183364868, "epoch": 0.01681075888568684, "grad_norm": 1.1328125, "learning_rate": 8.7e-05, "loss": 7.5403, "mean_token_accuracy": 0.06493047513067722, "num_tokens": 403336.0, "step": 175 }, { "entropy": 7.996695470809937, "epoch": 0.01729106628242075, "grad_norm": 1.7890625, "learning_rate": 8.95e-05, "loss": 7.4714, "mean_token_accuracy": 0.06883232817053794, "num_tokens": 413886.0, "step": 180 }, { "entropy": 7.944087362289428, "epoch": 0.01777137367915466, "grad_norm": 1.28125, "learning_rate": 9.2e-05, "loss": 7.5072, "mean_token_accuracy": 0.07003857865929604, "num_tokens": 425277.0, "step": 185 }, { "entropy": 7.903090763092041, "epoch": 0.01825168107588857, "grad_norm": 1.1484375, "learning_rate": 9.45e-05, "loss": 7.5901, "mean_token_accuracy": 0.07094852812588215, "num_tokens": 436868.0, "step": 190 }, { "entropy": 7.9524956226348875, "epoch": 0.018731988472622477, "grad_norm": 1.3671875, "learning_rate": 9.7e-05, "loss": 7.3956, "mean_token_accuracy": 0.0713607795536518, "num_tokens": 448349.0, "step": 195 }, { "entropy": 7.893163013458252, "epoch": 0.01921229586935639, "grad_norm": 1.078125, "learning_rate": 9.95e-05, "loss": 7.398, "mean_token_accuracy": 0.07450502514839172, "num_tokens": 459447.0, "step": 200 }, { "entropy": 7.827638578414917, "epoch": 0.0196926032660903, "grad_norm": 1.09375, "learning_rate": 0.000102, "loss": 7.3545, "mean_token_accuracy": 0.07836289256811142, "num_tokens": 470734.0, "step": 205 }, { "entropy": 7.920483875274658, "epoch": 0.020172910662824207, "grad_norm": 1.2890625, "learning_rate": 0.00010449999999999999, "loss": 7.3929, "mean_token_accuracy": 0.07436848841607571, "num_tokens": 482015.0, "step": 210 }, { "entropy": 7.829608154296875, "epoch": 0.020653218059558116, "grad_norm": 1.09375, "learning_rate": 0.000107, "loss": 7.3388, "mean_token_accuracy": 0.0812894694507122, "num_tokens": 493339.0, "step": 215 }, { "entropy": 7.832039451599121, "epoch": 0.021133525456292025, "grad_norm": 1.09375, "learning_rate": 0.0001095, "loss": 7.2806, "mean_token_accuracy": 0.08215347118675709, "num_tokens": 504924.0, "step": 220 }, { "entropy": 7.841120386123658, "epoch": 0.021613832853025938, "grad_norm": 1.3828125, "learning_rate": 0.000112, "loss": 7.2586, "mean_token_accuracy": 0.07783420942723751, "num_tokens": 516603.0, "step": 225 }, { "entropy": 7.667848110198975, "epoch": 0.022094140249759846, "grad_norm": 1.234375, "learning_rate": 0.0001145, "loss": 7.1767, "mean_token_accuracy": 0.0903685748577118, "num_tokens": 528347.0, "step": 230 }, { "entropy": 7.665532779693604, "epoch": 0.022574447646493755, "grad_norm": 1.453125, "learning_rate": 0.00011700000000000001, "loss": 7.2657, "mean_token_accuracy": 0.08881851136684418, "num_tokens": 539328.0, "step": 235 }, { "entropy": 7.787159252166748, "epoch": 0.023054755043227664, "grad_norm": 1.375, "learning_rate": 0.00011949999999999999, "loss": 7.2264, "mean_token_accuracy": 0.09179538786411286, "num_tokens": 549297.0, "step": 240 }, { "entropy": 7.68054313659668, "epoch": 0.023535062439961577, "grad_norm": 1.40625, "learning_rate": 0.000122, "loss": 7.1925, "mean_token_accuracy": 0.0870781309902668, "num_tokens": 560306.0, "step": 245 }, { "entropy": 7.722461795806884, "epoch": 0.024015369836695485, "grad_norm": 3.09375, "learning_rate": 0.0001245, "loss": 7.2601, "mean_token_accuracy": 0.08716249391436577, "num_tokens": 571972.0, "step": 250 }, { "entropy": 7.669500827789307, "epoch": 0.024495677233429394, "grad_norm": 1.125, "learning_rate": 0.000127, "loss": 7.1479, "mean_token_accuracy": 0.09271593019366264, "num_tokens": 582962.0, "step": 255 }, { "entropy": 7.6647216796875, "epoch": 0.024975984630163303, "grad_norm": 0.9296875, "learning_rate": 0.0001295, "loss": 7.1214, "mean_token_accuracy": 0.09072922170162201, "num_tokens": 597193.0, "step": 260 }, { "entropy": 7.66283483505249, "epoch": 0.025456292026897216, "grad_norm": 1.21875, "learning_rate": 0.000132, "loss": 7.1819, "mean_token_accuracy": 0.09304547160863877, "num_tokens": 608982.0, "step": 265 }, { "entropy": 7.661752843856812, "epoch": 0.025936599423631124, "grad_norm": 1.25, "learning_rate": 0.00013450000000000002, "loss": 7.2188, "mean_token_accuracy": 0.08966975659132004, "num_tokens": 619953.0, "step": 270 }, { "entropy": 7.643835210800171, "epoch": 0.026416906820365033, "grad_norm": 1.25, "learning_rate": 0.00013700000000000002, "loss": 7.1751, "mean_token_accuracy": 0.09371341913938522, "num_tokens": 631039.0, "step": 275 }, { "entropy": 7.632717418670654, "epoch": 0.026897214217098942, "grad_norm": 1.1328125, "learning_rate": 0.0001395, "loss": 7.1656, "mean_token_accuracy": 0.09481634944677353, "num_tokens": 642656.0, "step": 280 }, { "entropy": 7.468483591079712, "epoch": 0.027377521613832854, "grad_norm": 1.46875, "learning_rate": 0.00014199999999999998, "loss": 7.0285, "mean_token_accuracy": 0.10727941244840622, "num_tokens": 653748.0, "step": 285 }, { "entropy": 7.516920471191407, "epoch": 0.027857829010566763, "grad_norm": 1.171875, "learning_rate": 0.0001445, "loss": 7.0029, "mean_token_accuracy": 0.09661566317081452, "num_tokens": 665618.0, "step": 290 }, { "entropy": 7.486124277114868, "epoch": 0.028338136407300672, "grad_norm": 1.0625, "learning_rate": 0.000147, "loss": 7.0287, "mean_token_accuracy": 0.09913064762949944, "num_tokens": 677329.0, "step": 295 }, { "entropy": 7.49315767288208, "epoch": 0.02881844380403458, "grad_norm": 1.7109375, "learning_rate": 0.0001495, "loss": 6.9864, "mean_token_accuracy": 0.1033214770257473, "num_tokens": 688278.0, "step": 300 }, { "entropy": 7.431641435623169, "epoch": 0.029298751200768493, "grad_norm": 1.96875, "learning_rate": 0.000152, "loss": 7.046, "mean_token_accuracy": 0.10180941373109817, "num_tokens": 700739.0, "step": 305 }, { "entropy": 7.378959465026855, "epoch": 0.029779058597502402, "grad_norm": 2.0625, "learning_rate": 0.00015450000000000001, "loss": 6.9858, "mean_token_accuracy": 0.104751455783844, "num_tokens": 712527.0, "step": 310 }, { "entropy": 7.4179362773895265, "epoch": 0.03025936599423631, "grad_norm": 1.390625, "learning_rate": 0.000157, "loss": 7.0113, "mean_token_accuracy": 0.09946026802062988, "num_tokens": 724514.0, "step": 315 }, { "entropy": 7.464642429351807, "epoch": 0.03073967339097022, "grad_norm": 1.3125, "learning_rate": 0.0001595, "loss": 6.958, "mean_token_accuracy": 0.10636739879846573, "num_tokens": 735679.0, "step": 320 }, { "entropy": 7.379268789291382, "epoch": 0.03121998078770413, "grad_norm": 1.234375, "learning_rate": 0.000162, "loss": 6.9502, "mean_token_accuracy": 0.10707954466342925, "num_tokens": 747896.0, "step": 325 }, { "entropy": 7.4328147888183596, "epoch": 0.03170028818443804, "grad_norm": 1.1953125, "learning_rate": 0.00016450000000000001, "loss": 7.0008, "mean_token_accuracy": 0.10451544597744941, "num_tokens": 759081.0, "step": 330 }, { "entropy": 7.373377466201783, "epoch": 0.03218059558117195, "grad_norm": 1.6640625, "learning_rate": 0.00016700000000000002, "loss": 6.9349, "mean_token_accuracy": 0.10051383301615716, "num_tokens": 770459.0, "step": 335 }, { "entropy": 7.3182484149932865, "epoch": 0.03266090297790586, "grad_norm": 2.25, "learning_rate": 0.00016950000000000003, "loss": 6.9097, "mean_token_accuracy": 0.10436427593231201, "num_tokens": 783960.0, "step": 340 }, { "entropy": 7.2723020076751705, "epoch": 0.03314121037463977, "grad_norm": 1.34375, "learning_rate": 0.00017199999999999998, "loss": 6.9998, "mean_token_accuracy": 0.1017355315387249, "num_tokens": 795425.0, "step": 345 }, { "entropy": 7.288401937484741, "epoch": 0.03362151777137368, "grad_norm": 1.5625, "learning_rate": 0.00017449999999999999, "loss": 6.9466, "mean_token_accuracy": 0.1032905712723732, "num_tokens": 807536.0, "step": 350 }, { "entropy": 7.429675006866455, "epoch": 0.034101825168107586, "grad_norm": 1.25, "learning_rate": 0.000177, "loss": 6.9955, "mean_token_accuracy": 0.09869879111647606, "num_tokens": 818801.0, "step": 355 }, { "entropy": 7.303883075714111, "epoch": 0.0345821325648415, "grad_norm": 1.2578125, "learning_rate": 0.0001795, "loss": 6.8664, "mean_token_accuracy": 0.1042160525918007, "num_tokens": 831497.0, "step": 360 }, { "entropy": 7.275684547424317, "epoch": 0.03506243996157541, "grad_norm": 1.1328125, "learning_rate": 0.000182, "loss": 6.8349, "mean_token_accuracy": 0.10631057769060134, "num_tokens": 842491.0, "step": 365 }, { "entropy": 7.303065443038941, "epoch": 0.03554274735830932, "grad_norm": 1.328125, "learning_rate": 0.0001845, "loss": 6.9059, "mean_token_accuracy": 0.09917943850159645, "num_tokens": 854560.0, "step": 370 }, { "entropy": 7.275861215591431, "epoch": 0.03602305475504323, "grad_norm": 1.3515625, "learning_rate": 0.000187, "loss": 6.8151, "mean_token_accuracy": 0.11120132729411125, "num_tokens": 866688.0, "step": 375 }, { "entropy": 7.233143997192383, "epoch": 0.03650336215177714, "grad_norm": 1.65625, "learning_rate": 0.0001895, "loss": 6.9205, "mean_token_accuracy": 0.09971508085727691, "num_tokens": 879484.0, "step": 380 }, { "entropy": 7.290747499465942, "epoch": 0.036983669548511046, "grad_norm": 1.2109375, "learning_rate": 0.000192, "loss": 6.9039, "mean_token_accuracy": 0.10731675177812576, "num_tokens": 890807.0, "step": 385 }, { "entropy": 7.2609399318695065, "epoch": 0.037463976945244955, "grad_norm": 1.828125, "learning_rate": 0.0001945, "loss": 6.854, "mean_token_accuracy": 0.10835549905896187, "num_tokens": 901759.0, "step": 390 }, { "entropy": 7.174216985702515, "epoch": 0.037944284341978864, "grad_norm": 1.28125, "learning_rate": 0.00019700000000000002, "loss": 6.7707, "mean_token_accuracy": 0.1162538155913353, "num_tokens": 912212.0, "step": 395 }, { "entropy": 7.264402294158936, "epoch": 0.03842459173871278, "grad_norm": 1.171875, "learning_rate": 0.00019950000000000002, "loss": 6.8764, "mean_token_accuracy": 0.10775518119335174, "num_tokens": 923947.0, "step": 400 }, { "entropy": 7.194364166259765, "epoch": 0.03890489913544669, "grad_norm": 1.5703125, "learning_rate": 0.000202, "loss": 6.8149, "mean_token_accuracy": 0.1155998706817627, "num_tokens": 935732.0, "step": 405 }, { "entropy": 7.094007158279419, "epoch": 0.0393852065321806, "grad_norm": 1.5390625, "learning_rate": 0.00020449999999999998, "loss": 6.7534, "mean_token_accuracy": 0.11219719424843788, "num_tokens": 948261.0, "step": 410 }, { "entropy": 7.198687505722046, "epoch": 0.039865513928914506, "grad_norm": 1.5390625, "learning_rate": 0.000207, "loss": 6.8682, "mean_token_accuracy": 0.11036199703812599, "num_tokens": 959574.0, "step": 415 }, { "entropy": 7.14764518737793, "epoch": 0.040345821325648415, "grad_norm": 1.2109375, "learning_rate": 0.0002095, "loss": 6.9302, "mean_token_accuracy": 0.10567210242152214, "num_tokens": 970329.0, "step": 420 }, { "entropy": 7.284962558746338, "epoch": 0.040826128722382324, "grad_norm": 1.5078125, "learning_rate": 0.000212, "loss": 6.7852, "mean_token_accuracy": 0.11808342635631561, "num_tokens": 982037.0, "step": 425 }, { "entropy": 6.99963059425354, "epoch": 0.04130643611911623, "grad_norm": 1.15625, "learning_rate": 0.0002145, "loss": 6.7507, "mean_token_accuracy": 0.1121592566370964, "num_tokens": 994612.0, "step": 430 }, { "entropy": 7.1772722721099855, "epoch": 0.04178674351585014, "grad_norm": 1.203125, "learning_rate": 0.00021700000000000002, "loss": 6.8563, "mean_token_accuracy": 0.11890432462096215, "num_tokens": 1005960.0, "step": 435 }, { "entropy": 7.119032526016236, "epoch": 0.04226705091258405, "grad_norm": 1.234375, "learning_rate": 0.0002195, "loss": 6.726, "mean_token_accuracy": 0.11254842653870582, "num_tokens": 1017618.0, "step": 440 }, { "entropy": 7.120699787139893, "epoch": 0.042747358309317966, "grad_norm": 1.5234375, "learning_rate": 0.000222, "loss": 6.7617, "mean_token_accuracy": 0.11123086810112, "num_tokens": 1029307.0, "step": 445 }, { "entropy": 7.10453462600708, "epoch": 0.043227665706051875, "grad_norm": 1.21875, "learning_rate": 0.0002245, "loss": 6.7794, "mean_token_accuracy": 0.11213452070951462, "num_tokens": 1042027.0, "step": 450 }, { "entropy": 7.109935092926025, "epoch": 0.043707973102785784, "grad_norm": 1.1171875, "learning_rate": 0.00022700000000000002, "loss": 6.7726, "mean_token_accuracy": 0.11005142331123352, "num_tokens": 1053125.0, "step": 455 }, { "entropy": 7.093224906921387, "epoch": 0.04418828049951969, "grad_norm": 1.578125, "learning_rate": 0.00022950000000000002, "loss": 6.7646, "mean_token_accuracy": 0.11863623559474945, "num_tokens": 1064908.0, "step": 460 }, { "entropy": 7.0393500328063965, "epoch": 0.0446685878962536, "grad_norm": 1.1796875, "learning_rate": 0.00023200000000000003, "loss": 6.6415, "mean_token_accuracy": 0.12022090703248978, "num_tokens": 1076328.0, "step": 465 }, { "entropy": 7.159615230560303, "epoch": 0.04514889529298751, "grad_norm": 1.3203125, "learning_rate": 0.00023449999999999998, "loss": 6.8668, "mean_token_accuracy": 0.10638144612312317, "num_tokens": 1088469.0, "step": 470 }, { "entropy": 6.9358738422393795, "epoch": 0.04562920268972142, "grad_norm": 1.375, "learning_rate": 0.000237, "loss": 6.6608, "mean_token_accuracy": 0.11796007007360458, "num_tokens": 1099408.0, "step": 475 }, { "entropy": 6.921041584014892, "epoch": 0.04610951008645533, "grad_norm": 1.1484375, "learning_rate": 0.0002395, "loss": 6.596, "mean_token_accuracy": 0.12084084451198578, "num_tokens": 1111101.0, "step": 480 }, { "entropy": 6.980242967605591, "epoch": 0.046589817483189244, "grad_norm": 1.375, "learning_rate": 0.000242, "loss": 6.6189, "mean_token_accuracy": 0.11961494460701942, "num_tokens": 1122877.0, "step": 485 }, { "entropy": 6.998215103149414, "epoch": 0.04707012487992315, "grad_norm": 1.2890625, "learning_rate": 0.0002445, "loss": 6.7183, "mean_token_accuracy": 0.1069619596004486, "num_tokens": 1133956.0, "step": 490 }, { "entropy": 6.955817556381225, "epoch": 0.04755043227665706, "grad_norm": 1.265625, "learning_rate": 0.000247, "loss": 6.6106, "mean_token_accuracy": 0.12115221694111825, "num_tokens": 1146101.0, "step": 495 }, { "entropy": 6.991823005676269, "epoch": 0.04803073967339097, "grad_norm": 1.4453125, "learning_rate": 0.0002495, "loss": 6.704, "mean_token_accuracy": 0.1240153320133686, "num_tokens": 1157432.0, "step": 500 }, { "entropy": 6.995119285583496, "epoch": 0.04851104707012488, "grad_norm": 1.1875, "learning_rate": 0.000252, "loss": 6.6931, "mean_token_accuracy": 0.12121785953640937, "num_tokens": 1167601.0, "step": 505 }, { "entropy": 6.925166416168213, "epoch": 0.04899135446685879, "grad_norm": 1.2265625, "learning_rate": 0.0002545, "loss": 6.5948, "mean_token_accuracy": 0.11933866590261459, "num_tokens": 1178818.0, "step": 510 }, { "entropy": 7.102405261993408, "epoch": 0.0494716618635927, "grad_norm": 1.234375, "learning_rate": 0.000257, "loss": 6.8296, "mean_token_accuracy": 0.11879347264766693, "num_tokens": 1189977.0, "step": 515 }, { "entropy": 6.896050024032593, "epoch": 0.049951969260326606, "grad_norm": 1.09375, "learning_rate": 0.0002595, "loss": 6.6543, "mean_token_accuracy": 0.12233106046915054, "num_tokens": 1201039.0, "step": 520 }, { "entropy": 7.007365083694458, "epoch": 0.05043227665706052, "grad_norm": 1.1953125, "learning_rate": 0.000262, "loss": 6.6791, "mean_token_accuracy": 0.12215208187699318, "num_tokens": 1212573.0, "step": 525 }, { "entropy": 7.002063369750976, "epoch": 0.05091258405379443, "grad_norm": 1.1171875, "learning_rate": 0.00026450000000000003, "loss": 6.6208, "mean_token_accuracy": 0.1271028608083725, "num_tokens": 1223382.0, "step": 530 }, { "entropy": 6.9438478469848635, "epoch": 0.05139289145052834, "grad_norm": 1.0078125, "learning_rate": 0.00026700000000000004, "loss": 6.6969, "mean_token_accuracy": 0.12958464100956918, "num_tokens": 1236501.0, "step": 535 }, { "entropy": 6.931712675094604, "epoch": 0.05187319884726225, "grad_norm": 1.203125, "learning_rate": 0.00026950000000000005, "loss": 6.687, "mean_token_accuracy": 0.12256318107247352, "num_tokens": 1246798.0, "step": 540 }, { "entropy": 6.9002622127532955, "epoch": 0.05235350624399616, "grad_norm": 1.40625, "learning_rate": 0.00027200000000000005, "loss": 6.6164, "mean_token_accuracy": 0.12228193208575248, "num_tokens": 1258182.0, "step": 545 }, { "entropy": 6.873838090896607, "epoch": 0.052833813640730067, "grad_norm": 1.5625, "learning_rate": 0.0002745, "loss": 6.5781, "mean_token_accuracy": 0.11714496314525605, "num_tokens": 1270273.0, "step": 550 }, { "entropy": 6.869143629074097, "epoch": 0.053314121037463975, "grad_norm": 1.4296875, "learning_rate": 0.000277, "loss": 6.6336, "mean_token_accuracy": 0.11991709843277931, "num_tokens": 1281136.0, "step": 555 }, { "entropy": 6.914445209503174, "epoch": 0.053794428434197884, "grad_norm": 1.109375, "learning_rate": 0.0002795, "loss": 6.6257, "mean_token_accuracy": 0.12010404467582703, "num_tokens": 1294488.0, "step": 560 }, { "entropy": 6.732436418533325, "epoch": 0.05427473583093179, "grad_norm": 1.296875, "learning_rate": 0.00028199999999999997, "loss": 6.5262, "mean_token_accuracy": 0.12693093419075013, "num_tokens": 1304113.0, "step": 565 }, { "entropy": 6.927071809768677, "epoch": 0.05475504322766571, "grad_norm": 1.2890625, "learning_rate": 0.0002845, "loss": 6.5843, "mean_token_accuracy": 0.12877818644046785, "num_tokens": 1315417.0, "step": 570 }, { "entropy": 6.783261919021607, "epoch": 0.05523535062439962, "grad_norm": 1.34375, "learning_rate": 0.000287, "loss": 6.5521, "mean_token_accuracy": 0.1234595388174057, "num_tokens": 1328084.0, "step": 575 }, { "entropy": 6.8645414352417, "epoch": 0.05571565802113353, "grad_norm": 1.1328125, "learning_rate": 0.0002895, "loss": 6.6982, "mean_token_accuracy": 0.1229254849255085, "num_tokens": 1338696.0, "step": 580 }, { "entropy": 6.887264966964722, "epoch": 0.056195965417867436, "grad_norm": 1.1328125, "learning_rate": 0.000292, "loss": 6.6333, "mean_token_accuracy": 0.12206205278635025, "num_tokens": 1350240.0, "step": 585 }, { "entropy": 6.901881551742553, "epoch": 0.056676272814601344, "grad_norm": 1.390625, "learning_rate": 0.0002945, "loss": 6.5792, "mean_token_accuracy": 0.12374859303236008, "num_tokens": 1361720.0, "step": 590 }, { "entropy": 6.646714115142823, "epoch": 0.05715658021133525, "grad_norm": 1.453125, "learning_rate": 0.000297, "loss": 6.5831, "mean_token_accuracy": 0.12852583453059196, "num_tokens": 1373286.0, "step": 595 }, { "entropy": 6.89121675491333, "epoch": 0.05763688760806916, "grad_norm": 1.5390625, "learning_rate": 0.0002995, "loss": 6.5332, "mean_token_accuracy": 0.12378557696938515, "num_tokens": 1384274.0, "step": 600 }, { "entropy": 6.707057476043701, "epoch": 0.05811719500480307, "grad_norm": 1.2734375, "learning_rate": 0.000302, "loss": 6.5674, "mean_token_accuracy": 0.1248041570186615, "num_tokens": 1395355.0, "step": 605 }, { "entropy": 6.787681436538696, "epoch": 0.05859750240153699, "grad_norm": 1.59375, "learning_rate": 0.0003045, "loss": 6.5071, "mean_token_accuracy": 0.1337241604924202, "num_tokens": 1406664.0, "step": 610 }, { "entropy": 6.907395648956299, "epoch": 0.059077809798270896, "grad_norm": 1.1953125, "learning_rate": 0.000307, "loss": 6.6562, "mean_token_accuracy": 0.12113718539476395, "num_tokens": 1418450.0, "step": 615 }, { "entropy": 6.8045419216156, "epoch": 0.059558117195004805, "grad_norm": 1.1640625, "learning_rate": 0.0003095, "loss": 6.5466, "mean_token_accuracy": 0.12454390972852707, "num_tokens": 1430048.0, "step": 620 }, { "entropy": 6.808126592636109, "epoch": 0.060038424591738714, "grad_norm": 1.5703125, "learning_rate": 0.000312, "loss": 6.5911, "mean_token_accuracy": 0.12378140687942504, "num_tokens": 1441820.0, "step": 625 }, { "entropy": 6.753187370300293, "epoch": 0.06051873198847262, "grad_norm": 1.2109375, "learning_rate": 0.0003145, "loss": 6.445, "mean_token_accuracy": 0.13010460510849953, "num_tokens": 1453209.0, "step": 630 }, { "entropy": 6.6527941703796385, "epoch": 0.06099903938520653, "grad_norm": 1.3515625, "learning_rate": 0.000317, "loss": 6.4598, "mean_token_accuracy": 0.12725651860237122, "num_tokens": 1465423.0, "step": 635 }, { "entropy": 6.711978006362915, "epoch": 0.06147934678194044, "grad_norm": 1.234375, "learning_rate": 0.0003195, "loss": 6.4541, "mean_token_accuracy": 0.13069155365228652, "num_tokens": 1476575.0, "step": 640 }, { "entropy": 6.659121417999268, "epoch": 0.06195965417867435, "grad_norm": 1.734375, "learning_rate": 0.000322, "loss": 6.4109, "mean_token_accuracy": 0.12579366862773894, "num_tokens": 1486932.0, "step": 645 }, { "entropy": 6.691300868988037, "epoch": 0.06243996157540826, "grad_norm": 1.140625, "learning_rate": 0.00032450000000000003, "loss": 6.4399, "mean_token_accuracy": 0.12854820042848586, "num_tokens": 1498494.0, "step": 650 }, { "entropy": 6.7037928104400635, "epoch": 0.06292026897214217, "grad_norm": 1.2109375, "learning_rate": 0.00032700000000000003, "loss": 6.4936, "mean_token_accuracy": 0.12374913021922111, "num_tokens": 1509937.0, "step": 655 }, { "entropy": 6.782931184768676, "epoch": 0.06340057636887608, "grad_norm": 1.3125, "learning_rate": 0.00032950000000000004, "loss": 6.5147, "mean_token_accuracy": 0.13380258977413179, "num_tokens": 1519823.0, "step": 660 }, { "entropy": 6.726450872421265, "epoch": 0.06388088376560999, "grad_norm": 1.2890625, "learning_rate": 0.00033200000000000005, "loss": 6.5528, "mean_token_accuracy": 0.12575417309999465, "num_tokens": 1529943.0, "step": 665 }, { "entropy": 6.611954069137573, "epoch": 0.0643611911623439, "grad_norm": 1.2578125, "learning_rate": 0.00033450000000000005, "loss": 6.3767, "mean_token_accuracy": 0.13369367122650147, "num_tokens": 1540618.0, "step": 670 }, { "entropy": 6.685780334472656, "epoch": 0.06484149855907781, "grad_norm": 1.3515625, "learning_rate": 0.000337, "loss": 6.5048, "mean_token_accuracy": 0.1227756217122078, "num_tokens": 1553208.0, "step": 675 }, { "entropy": 6.6764894962310795, "epoch": 0.06532180595581172, "grad_norm": 1.3359375, "learning_rate": 0.0003395, "loss": 6.4589, "mean_token_accuracy": 0.1339925467967987, "num_tokens": 1563975.0, "step": 680 }, { "entropy": 6.717716455459595, "epoch": 0.06580211335254563, "grad_norm": 1.28125, "learning_rate": 0.000342, "loss": 6.5252, "mean_token_accuracy": 0.12458744868636132, "num_tokens": 1575998.0, "step": 685 }, { "entropy": 6.6251349449157715, "epoch": 0.06628242074927954, "grad_norm": 1.125, "learning_rate": 0.00034449999999999997, "loss": 6.3994, "mean_token_accuracy": 0.13568611592054367, "num_tokens": 1586041.0, "step": 690 }, { "entropy": 6.637330770492554, "epoch": 0.06676272814601344, "grad_norm": 1.4375, "learning_rate": 0.000347, "loss": 6.4796, "mean_token_accuracy": 0.12872253656387328, "num_tokens": 1597531.0, "step": 695 }, { "entropy": 6.617096710205078, "epoch": 0.06724303554274735, "grad_norm": 1.3828125, "learning_rate": 0.0003495, "loss": 6.4549, "mean_token_accuracy": 0.12859696000814438, "num_tokens": 1609255.0, "step": 700 }, { "entropy": 6.640483236312866, "epoch": 0.06772334293948126, "grad_norm": 1.265625, "learning_rate": 0.000352, "loss": 6.439, "mean_token_accuracy": 0.13394341096282006, "num_tokens": 1621098.0, "step": 705 }, { "entropy": 6.601499080657959, "epoch": 0.06820365033621517, "grad_norm": 1.2578125, "learning_rate": 0.0003545, "loss": 6.3504, "mean_token_accuracy": 0.14078185856342315, "num_tokens": 1631941.0, "step": 710 }, { "entropy": 6.551211166381836, "epoch": 0.0686839577329491, "grad_norm": 1.1484375, "learning_rate": 0.000357, "loss": 6.3471, "mean_token_accuracy": 0.13648251742124556, "num_tokens": 1643117.0, "step": 715 }, { "entropy": 6.5161905765533445, "epoch": 0.069164265129683, "grad_norm": 1.40625, "learning_rate": 0.0003595, "loss": 6.3952, "mean_token_accuracy": 0.13429828062653543, "num_tokens": 1653595.0, "step": 720 }, { "entropy": 6.614610481262207, "epoch": 0.06964457252641691, "grad_norm": 1.2734375, "learning_rate": 0.000362, "loss": 6.4168, "mean_token_accuracy": 0.13274685442447662, "num_tokens": 1664495.0, "step": 725 }, { "entropy": 6.5094832420349125, "epoch": 0.07012487992315082, "grad_norm": 1.1328125, "learning_rate": 0.0003645, "loss": 6.4047, "mean_token_accuracy": 0.136563728004694, "num_tokens": 1674923.0, "step": 730 }, { "entropy": 6.602942371368409, "epoch": 0.07060518731988473, "grad_norm": 1.234375, "learning_rate": 0.000367, "loss": 6.3045, "mean_token_accuracy": 0.13681301474571228, "num_tokens": 1685904.0, "step": 735 }, { "entropy": 6.596617603302002, "epoch": 0.07108549471661864, "grad_norm": 1.046875, "learning_rate": 0.0003695, "loss": 6.5324, "mean_token_accuracy": 0.12432878389954567, "num_tokens": 1699133.0, "step": 740 }, { "entropy": 6.504991292953491, "epoch": 0.07156580211335255, "grad_norm": 1.1796875, "learning_rate": 0.000372, "loss": 6.342, "mean_token_accuracy": 0.13271907046437265, "num_tokens": 1711559.0, "step": 745 }, { "entropy": 6.592547464370727, "epoch": 0.07204610951008646, "grad_norm": 1.15625, "learning_rate": 0.0003745, "loss": 6.2575, "mean_token_accuracy": 0.14460937380790712, "num_tokens": 1722526.0, "step": 750 }, { "entropy": 6.4313709259033205, "epoch": 0.07252641690682037, "grad_norm": 1.1015625, "learning_rate": 0.000377, "loss": 6.3265, "mean_token_accuracy": 0.1398925192654133, "num_tokens": 1734261.0, "step": 755 }, { "entropy": 6.5256377220153805, "epoch": 0.07300672430355427, "grad_norm": 1.203125, "learning_rate": 0.0003795, "loss": 6.3105, "mean_token_accuracy": 0.14366703033447265, "num_tokens": 1745151.0, "step": 760 }, { "entropy": 6.631883907318115, "epoch": 0.07348703170028818, "grad_norm": 1.3203125, "learning_rate": 0.000382, "loss": 6.4547, "mean_token_accuracy": 0.1341322012245655, "num_tokens": 1755463.0, "step": 765 }, { "entropy": 6.584089756011963, "epoch": 0.07396733909702209, "grad_norm": 1.3203125, "learning_rate": 0.0003845, "loss": 6.4178, "mean_token_accuracy": 0.1315837398171425, "num_tokens": 1767717.0, "step": 770 }, { "entropy": 6.3859930515289305, "epoch": 0.074447646493756, "grad_norm": 1.296875, "learning_rate": 0.00038700000000000003, "loss": 6.2619, "mean_token_accuracy": 0.14160886630415917, "num_tokens": 1779115.0, "step": 775 }, { "entropy": 6.3998737812042235, "epoch": 0.07492795389048991, "grad_norm": 1.3359375, "learning_rate": 0.00038950000000000003, "loss": 6.213, "mean_token_accuracy": 0.1398429863154888, "num_tokens": 1789644.0, "step": 780 }, { "entropy": 6.540688323974609, "epoch": 0.07540826128722382, "grad_norm": 1.140625, "learning_rate": 0.00039200000000000004, "loss": 6.4251, "mean_token_accuracy": 0.13578777611255646, "num_tokens": 1800606.0, "step": 785 }, { "entropy": 6.513448238372803, "epoch": 0.07588856868395773, "grad_norm": 1.1484375, "learning_rate": 0.00039450000000000005, "loss": 6.4264, "mean_token_accuracy": 0.12942690253257752, "num_tokens": 1812168.0, "step": 790 }, { "entropy": 6.5457319736480715, "epoch": 0.07636887608069164, "grad_norm": 1.2109375, "learning_rate": 0.00039700000000000005, "loss": 6.3796, "mean_token_accuracy": 0.1303087830543518, "num_tokens": 1823830.0, "step": 795 }, { "entropy": 6.495282316207886, "epoch": 0.07684918347742556, "grad_norm": 1.15625, "learning_rate": 0.0003995, "loss": 6.3456, "mean_token_accuracy": 0.13957973942160606, "num_tokens": 1835611.0, "step": 800 }, { "entropy": 6.467644214630127, "epoch": 0.07732949087415947, "grad_norm": 1.15625, "learning_rate": 0.000402, "loss": 6.4127, "mean_token_accuracy": 0.1334280975162983, "num_tokens": 1847036.0, "step": 805 }, { "entropy": 6.464094591140747, "epoch": 0.07780979827089338, "grad_norm": 1.296875, "learning_rate": 0.0004045, "loss": 6.3528, "mean_token_accuracy": 0.13223012760281563, "num_tokens": 1857476.0, "step": 810 }, { "entropy": 6.50727949142456, "epoch": 0.07829010566762729, "grad_norm": 1.1328125, "learning_rate": 0.00040699999999999997, "loss": 6.3773, "mean_token_accuracy": 0.1352442115545273, "num_tokens": 1869073.0, "step": 815 }, { "entropy": 6.384515810012817, "epoch": 0.0787704130643612, "grad_norm": 1.078125, "learning_rate": 0.0004095, "loss": 6.2486, "mean_token_accuracy": 0.14026699736714363, "num_tokens": 1880439.0, "step": 820 }, { "entropy": 6.561717510223389, "epoch": 0.0792507204610951, "grad_norm": 1.328125, "learning_rate": 0.000412, "loss": 6.4116, "mean_token_accuracy": 0.134783523529768, "num_tokens": 1891600.0, "step": 825 }, { "entropy": 6.414502573013306, "epoch": 0.07973102785782901, "grad_norm": 1.328125, "learning_rate": 0.0004145, "loss": 6.3783, "mean_token_accuracy": 0.13531816452741624, "num_tokens": 1903126.0, "step": 830 }, { "entropy": 6.5730548858642575, "epoch": 0.08021133525456292, "grad_norm": 1.296875, "learning_rate": 0.000417, "loss": 6.3467, "mean_token_accuracy": 0.14032403156161308, "num_tokens": 1913913.0, "step": 835 }, { "entropy": 6.344644355773926, "epoch": 0.08069164265129683, "grad_norm": 1.28125, "learning_rate": 0.0004195, "loss": 6.2684, "mean_token_accuracy": 0.1382530964910984, "num_tokens": 1924961.0, "step": 840 }, { "entropy": 6.523792457580567, "epoch": 0.08117195004803074, "grad_norm": 1.2421875, "learning_rate": 0.000422, "loss": 6.3612, "mean_token_accuracy": 0.12942377403378486, "num_tokens": 1936773.0, "step": 845 }, { "entropy": 6.355926513671875, "epoch": 0.08165225744476465, "grad_norm": 1.34375, "learning_rate": 0.0004245, "loss": 6.2783, "mean_token_accuracy": 0.13875910267233849, "num_tokens": 1948190.0, "step": 850 }, { "entropy": 6.331581449508667, "epoch": 0.08213256484149856, "grad_norm": 1.09375, "learning_rate": 0.000427, "loss": 6.2694, "mean_token_accuracy": 0.14160780385136604, "num_tokens": 1960038.0, "step": 855 }, { "entropy": 6.557125091552734, "epoch": 0.08261287223823247, "grad_norm": 1.2890625, "learning_rate": 0.0004295, "loss": 6.3489, "mean_token_accuracy": 0.14002878665924073, "num_tokens": 1970535.0, "step": 860 }, { "entropy": 6.411432456970215, "epoch": 0.08309317963496637, "grad_norm": 1.34375, "learning_rate": 0.000432, "loss": 6.3226, "mean_token_accuracy": 0.13546231836080552, "num_tokens": 1981386.0, "step": 865 }, { "entropy": 6.337710332870484, "epoch": 0.08357348703170028, "grad_norm": 1.1171875, "learning_rate": 0.0004345, "loss": 6.2428, "mean_token_accuracy": 0.1426716774702072, "num_tokens": 1993196.0, "step": 870 }, { "entropy": 6.432919025421143, "epoch": 0.08405379442843419, "grad_norm": 1.0859375, "learning_rate": 0.000437, "loss": 6.2741, "mean_token_accuracy": 0.14658503904938697, "num_tokens": 2004756.0, "step": 875 }, { "entropy": 6.315603113174438, "epoch": 0.0845341018251681, "grad_norm": 1.25, "learning_rate": 0.0004395, "loss": 6.2347, "mean_token_accuracy": 0.14145326390862464, "num_tokens": 2016020.0, "step": 880 }, { "entropy": 6.380750274658203, "epoch": 0.08501440922190202, "grad_norm": 1.2265625, "learning_rate": 0.000442, "loss": 6.2819, "mean_token_accuracy": 0.14082487300038338, "num_tokens": 2027747.0, "step": 885 }, { "entropy": 6.4264098644256595, "epoch": 0.08549471661863593, "grad_norm": 1.3984375, "learning_rate": 0.0004445, "loss": 6.2553, "mean_token_accuracy": 0.13818828240036965, "num_tokens": 2038841.0, "step": 890 }, { "entropy": 6.385887289047242, "epoch": 0.08597502401536984, "grad_norm": 1.046875, "learning_rate": 0.000447, "loss": 6.3043, "mean_token_accuracy": 0.13402576446533204, "num_tokens": 2049905.0, "step": 895 }, { "entropy": 6.424469089508056, "epoch": 0.08645533141210375, "grad_norm": 1.234375, "learning_rate": 0.00044950000000000003, "loss": 6.3803, "mean_token_accuracy": 0.13485484719276428, "num_tokens": 2062492.0, "step": 900 }, { "entropy": 6.387258577346802, "epoch": 0.08693563880883766, "grad_norm": 1.21875, "learning_rate": 0.00045200000000000004, "loss": 6.31, "mean_token_accuracy": 0.1353304862976074, "num_tokens": 2073840.0, "step": 905 }, { "entropy": 6.3580629348754885, "epoch": 0.08741594620557157, "grad_norm": 1.328125, "learning_rate": 0.00045450000000000004, "loss": 6.221, "mean_token_accuracy": 0.14060378223657607, "num_tokens": 2085720.0, "step": 910 }, { "entropy": 6.353258228302002, "epoch": 0.08789625360230548, "grad_norm": 1.0703125, "learning_rate": 0.00045700000000000005, "loss": 6.3039, "mean_token_accuracy": 0.1413162462413311, "num_tokens": 2096649.0, "step": 915 }, { "entropy": 6.436611890792847, "epoch": 0.08837656099903939, "grad_norm": 0.99609375, "learning_rate": 0.00045950000000000006, "loss": 6.3061, "mean_token_accuracy": 0.14285610914230346, "num_tokens": 2109030.0, "step": 920 }, { "entropy": 6.35608320236206, "epoch": 0.0888568683957733, "grad_norm": 1.15625, "learning_rate": 0.000462, "loss": 6.2113, "mean_token_accuracy": 0.14488047659397124, "num_tokens": 2121384.0, "step": 925 }, { "entropy": 6.269479846954345, "epoch": 0.0893371757925072, "grad_norm": 1.1796875, "learning_rate": 0.0004645, "loss": 6.1635, "mean_token_accuracy": 0.147640460729599, "num_tokens": 2131377.0, "step": 930 }, { "entropy": 6.344134902954101, "epoch": 0.08981748318924111, "grad_norm": 1.1484375, "learning_rate": 0.000467, "loss": 6.3531, "mean_token_accuracy": 0.1383367098867893, "num_tokens": 2142364.0, "step": 935 }, { "entropy": 6.356987571716308, "epoch": 0.09029779058597502, "grad_norm": 1.171875, "learning_rate": 0.0004695, "loss": 6.2296, "mean_token_accuracy": 0.14149210676550866, "num_tokens": 2153040.0, "step": 940 }, { "entropy": 6.35843825340271, "epoch": 0.09077809798270893, "grad_norm": 1.015625, "learning_rate": 0.000472, "loss": 6.2728, "mean_token_accuracy": 0.14314480721950532, "num_tokens": 2165571.0, "step": 945 }, { "entropy": 6.3020600318908695, "epoch": 0.09125840537944284, "grad_norm": 1.0546875, "learning_rate": 0.0004745, "loss": 6.2423, "mean_token_accuracy": 0.14072795882821082, "num_tokens": 2177241.0, "step": 950 }, { "entropy": 6.329180097579956, "epoch": 0.09173871277617675, "grad_norm": 1.2109375, "learning_rate": 0.000477, "loss": 6.2801, "mean_token_accuracy": 0.1361616224050522, "num_tokens": 2187475.0, "step": 955 }, { "entropy": 6.315436792373657, "epoch": 0.09221902017291066, "grad_norm": 1.0390625, "learning_rate": 0.0004795, "loss": 6.3087, "mean_token_accuracy": 0.14151085540652275, "num_tokens": 2198185.0, "step": 960 }, { "entropy": 6.303459358215332, "epoch": 0.09269932756964457, "grad_norm": 1.1640625, "learning_rate": 0.000482, "loss": 6.2346, "mean_token_accuracy": 0.14740882739424704, "num_tokens": 2210404.0, "step": 965 }, { "entropy": 6.370419549942016, "epoch": 0.09317963496637849, "grad_norm": 1.2109375, "learning_rate": 0.0004845, "loss": 6.2262, "mean_token_accuracy": 0.144054813683033, "num_tokens": 2222188.0, "step": 970 }, { "entropy": 6.290718269348145, "epoch": 0.0936599423631124, "grad_norm": 1.109375, "learning_rate": 0.000487, "loss": 6.2775, "mean_token_accuracy": 0.1421047918498516, "num_tokens": 2233418.0, "step": 975 }, { "entropy": 6.352431869506836, "epoch": 0.0941402497598463, "grad_norm": 1.125, "learning_rate": 0.0004895, "loss": 6.2415, "mean_token_accuracy": 0.14807373881340027, "num_tokens": 2245053.0, "step": 980 }, { "entropy": 6.250268840789795, "epoch": 0.09462055715658022, "grad_norm": 1.1328125, "learning_rate": 0.000492, "loss": 6.2715, "mean_token_accuracy": 0.14363499581813813, "num_tokens": 2256375.0, "step": 985 }, { "entropy": 6.225133609771729, "epoch": 0.09510086455331412, "grad_norm": 1.2265625, "learning_rate": 0.0004945, "loss": 6.1142, "mean_token_accuracy": 0.1477846160531044, "num_tokens": 2267074.0, "step": 990 }, { "entropy": 6.191523456573487, "epoch": 0.09558117195004803, "grad_norm": 1.15625, "learning_rate": 0.000497, "loss": 6.1547, "mean_token_accuracy": 0.14838184416294098, "num_tokens": 2277168.0, "step": 995 }, { "entropy": 6.25091781616211, "epoch": 0.09606147934678194, "grad_norm": 1.046875, "learning_rate": 0.0004995, "loss": 6.1381, "mean_token_accuracy": 0.14807945489883423, "num_tokens": 2288178.0, "step": 1000 }, { "entropy": 6.215264129638672, "epoch": 0.09654178674351585, "grad_norm": 1.1484375, "learning_rate": 0.0004999999983283737, "loss": 6.1686, "mean_token_accuracy": 0.1440332628786564, "num_tokens": 2299765.0, "step": 1005 }, { "entropy": 6.3124645233154295, "epoch": 0.09702209414024976, "grad_norm": 1.15625, "learning_rate": 0.0004999999915373924, "loss": 6.2644, "mean_token_accuracy": 0.13689299449324607, "num_tokens": 2312047.0, "step": 1010 }, { "entropy": 6.30297064781189, "epoch": 0.09750240153698367, "grad_norm": 1.1484375, "learning_rate": 0.0004999999795225793, "loss": 6.2563, "mean_token_accuracy": 0.1363622300326824, "num_tokens": 2324118.0, "step": 1015 }, { "entropy": 6.299112796783447, "epoch": 0.09798270893371758, "grad_norm": 1.203125, "learning_rate": 0.0004999999622839347, "loss": 6.2494, "mean_token_accuracy": 0.14326749965548516, "num_tokens": 2335171.0, "step": 1020 }, { "entropy": 6.283253812789917, "epoch": 0.09846301633045149, "grad_norm": 1.078125, "learning_rate": 0.0004999999398214593, "loss": 6.1501, "mean_token_accuracy": 0.14212532341480255, "num_tokens": 2346338.0, "step": 1025 }, { "entropy": 6.212884902954102, "epoch": 0.0989433237271854, "grad_norm": 1.1875, "learning_rate": 0.0004999999121351532, "loss": 6.1934, "mean_token_accuracy": 0.14963782876729964, "num_tokens": 2357185.0, "step": 1030 }, { "entropy": 6.190281915664673, "epoch": 0.0994236311239193, "grad_norm": 1.109375, "learning_rate": 0.0004999998792250173, "loss": 6.1183, "mean_token_accuracy": 0.15685753300786018, "num_tokens": 2368494.0, "step": 1035 }, { "entropy": 6.289627552032471, "epoch": 0.09990393852065321, "grad_norm": 1.15625, "learning_rate": 0.0004999998410910524, "loss": 6.3364, "mean_token_accuracy": 0.13329742476344109, "num_tokens": 2380800.0, "step": 1040 }, { "entropy": 6.3118733882904055, "epoch": 0.10038424591738712, "grad_norm": 1.0859375, "learning_rate": 0.0004999997977332592, "loss": 6.2551, "mean_token_accuracy": 0.13934137374162675, "num_tokens": 2391753.0, "step": 1045 }, { "entropy": 6.178606843948364, "epoch": 0.10086455331412104, "grad_norm": 1.0390625, "learning_rate": 0.0004999997491516389, "loss": 6.1391, "mean_token_accuracy": 0.1400229126214981, "num_tokens": 2403324.0, "step": 1050 }, { "entropy": 6.235824918746948, "epoch": 0.10134486071085495, "grad_norm": 1.15625, "learning_rate": 0.0004999996953461925, "loss": 6.2482, "mean_token_accuracy": 0.13423383459448815, "num_tokens": 2414873.0, "step": 1055 }, { "entropy": 6.138184642791748, "epoch": 0.10182516810758886, "grad_norm": 1.0, "learning_rate": 0.0004999996363169212, "loss": 6.0208, "mean_token_accuracy": 0.15671658217906953, "num_tokens": 2425308.0, "step": 1060 }, { "entropy": 6.144180011749268, "epoch": 0.10230547550432277, "grad_norm": 1.109375, "learning_rate": 0.0004999995720638266, "loss": 6.0654, "mean_token_accuracy": 0.1525282308459282, "num_tokens": 2436835.0, "step": 1065 }, { "entropy": 6.183439445495606, "epoch": 0.10278578290105668, "grad_norm": 1.140625, "learning_rate": 0.00049999950258691, "loss": 6.1921, "mean_token_accuracy": 0.1451313279569149, "num_tokens": 2446798.0, "step": 1070 }, { "entropy": 6.123720979690551, "epoch": 0.10326609029779059, "grad_norm": 1.15625, "learning_rate": 0.0004999994278861731, "loss": 6.0747, "mean_token_accuracy": 0.15084402859210969, "num_tokens": 2457308.0, "step": 1075 }, { "entropy": 6.215669107437134, "epoch": 0.1037463976945245, "grad_norm": 1.0390625, "learning_rate": 0.0004999993479616175, "loss": 6.1309, "mean_token_accuracy": 0.13830516785383223, "num_tokens": 2468917.0, "step": 1080 }, { "entropy": 6.227848720550537, "epoch": 0.1042267050912584, "grad_norm": 1.09375, "learning_rate": 0.0004999992628132451, "loss": 6.1529, "mean_token_accuracy": 0.14558819606900214, "num_tokens": 2481363.0, "step": 1085 }, { "entropy": 6.175233983993531, "epoch": 0.10470701248799232, "grad_norm": 1.046875, "learning_rate": 0.0004999991724410582, "loss": 6.1551, "mean_token_accuracy": 0.14347582682967186, "num_tokens": 2493082.0, "step": 1090 }, { "entropy": 6.150361251831055, "epoch": 0.10518731988472622, "grad_norm": 1.0703125, "learning_rate": 0.0004999990768450583, "loss": 6.106, "mean_token_accuracy": 0.1499667778611183, "num_tokens": 2503849.0, "step": 1095 }, { "entropy": 6.225272464752197, "epoch": 0.10566762728146013, "grad_norm": 1.140625, "learning_rate": 0.0004999989760252482, "loss": 6.1511, "mean_token_accuracy": 0.14817013815045357, "num_tokens": 2514528.0, "step": 1100 }, { "entropy": 6.097928714752197, "epoch": 0.10614793467819404, "grad_norm": 1.2578125, "learning_rate": 0.0004999988699816299, "loss": 6.1427, "mean_token_accuracy": 0.14771459847688675, "num_tokens": 2524971.0, "step": 1105 }, { "entropy": 6.153327941894531, "epoch": 0.10662824207492795, "grad_norm": 1.03125, "learning_rate": 0.0004999987587142058, "loss": 6.057, "mean_token_accuracy": 0.14452041387557985, "num_tokens": 2535674.0, "step": 1110 }, { "entropy": 6.2696786403656, "epoch": 0.10710854947166186, "grad_norm": 1.09375, "learning_rate": 0.0004999986422229789, "loss": 6.2903, "mean_token_accuracy": 0.13996392711997033, "num_tokens": 2547108.0, "step": 1115 }, { "entropy": 6.155757236480713, "epoch": 0.10758885686839577, "grad_norm": 1.015625, "learning_rate": 0.0004999985205079514, "loss": 6.1047, "mean_token_accuracy": 0.1451355442404747, "num_tokens": 2559474.0, "step": 1120 }, { "entropy": 6.012842035293579, "epoch": 0.10806916426512968, "grad_norm": 1.03125, "learning_rate": 0.0004999983935691265, "loss": 5.9441, "mean_token_accuracy": 0.16244944632053376, "num_tokens": 2571264.0, "step": 1125 }, { "entropy": 6.159362649917602, "epoch": 0.10854947166186359, "grad_norm": 1.03125, "learning_rate": 0.000499998261406507, "loss": 6.1208, "mean_token_accuracy": 0.1507526934146881, "num_tokens": 2583731.0, "step": 1130 }, { "entropy": 6.268857860565186, "epoch": 0.10902977905859751, "grad_norm": 1.25, "learning_rate": 0.0004999981240200958, "loss": 6.1607, "mean_token_accuracy": 0.14638862013816833, "num_tokens": 2595497.0, "step": 1135 }, { "entropy": 6.053813219070435, "epoch": 0.10951008645533142, "grad_norm": 0.9921875, "learning_rate": 0.0004999979814098966, "loss": 6.1148, "mean_token_accuracy": 0.1516471363604069, "num_tokens": 2607358.0, "step": 1140 }, { "entropy": 6.1449603080749515, "epoch": 0.10999039385206533, "grad_norm": 1.109375, "learning_rate": 0.0004999978335759121, "loss": 6.0354, "mean_token_accuracy": 0.15392047837376593, "num_tokens": 2618936.0, "step": 1145 }, { "entropy": 6.154958772659302, "epoch": 0.11047070124879924, "grad_norm": 1.0703125, "learning_rate": 0.0004999976805181461, "loss": 6.1981, "mean_token_accuracy": 0.14167412593960763, "num_tokens": 2631840.0, "step": 1150 }, { "entropy": 6.140295743942261, "epoch": 0.11095100864553314, "grad_norm": 1.109375, "learning_rate": 0.000499997522236602, "loss": 6.1443, "mean_token_accuracy": 0.15361175835132598, "num_tokens": 2642412.0, "step": 1155 }, { "entropy": 6.160842370986939, "epoch": 0.11143131604226705, "grad_norm": 0.97265625, "learning_rate": 0.0004999973587312837, "loss": 6.1067, "mean_token_accuracy": 0.14919153451919556, "num_tokens": 2653890.0, "step": 1160 }, { "entropy": 6.146590614318848, "epoch": 0.11191162343900096, "grad_norm": 1.1015625, "learning_rate": 0.0004999971900021947, "loss": 6.163, "mean_token_accuracy": 0.15273661985993386, "num_tokens": 2664888.0, "step": 1165 }, { "entropy": 6.159024953842163, "epoch": 0.11239193083573487, "grad_norm": 1.0078125, "learning_rate": 0.0004999970160493391, "loss": 6.0579, "mean_token_accuracy": 0.14569913148880004, "num_tokens": 2675550.0, "step": 1170 }, { "entropy": 6.02392611503601, "epoch": 0.11287223823246878, "grad_norm": 1.015625, "learning_rate": 0.0004999968368727209, "loss": 6.0724, "mean_token_accuracy": 0.15466973930597305, "num_tokens": 2688022.0, "step": 1175 }, { "entropy": 6.1862691879272464, "epoch": 0.11335254562920269, "grad_norm": 0.94921875, "learning_rate": 0.0004999966524723442, "loss": 6.0632, "mean_token_accuracy": 0.14964798092842102, "num_tokens": 2698737.0, "step": 1180 }, { "entropy": 6.077165365219116, "epoch": 0.1138328530259366, "grad_norm": 0.98046875, "learning_rate": 0.0004999964628482135, "loss": 6.0344, "mean_token_accuracy": 0.15742302685976028, "num_tokens": 2709844.0, "step": 1185 }, { "entropy": 6.127112817764282, "epoch": 0.1143131604226705, "grad_norm": 1.0, "learning_rate": 0.0004999962680003328, "loss": 6.1035, "mean_token_accuracy": 0.1519095703959465, "num_tokens": 2720273.0, "step": 1190 }, { "entropy": 6.1255943775177, "epoch": 0.11479346781940442, "grad_norm": 1.078125, "learning_rate": 0.000499996067928707, "loss": 6.1124, "mean_token_accuracy": 0.14679019302129745, "num_tokens": 2731354.0, "step": 1195 }, { "entropy": 6.127178192138672, "epoch": 0.11527377521613832, "grad_norm": 1.0703125, "learning_rate": 0.0004999958626333406, "loss": 6.1052, "mean_token_accuracy": 0.1527300015091896, "num_tokens": 2742966.0, "step": 1200 }, { "entropy": 6.03611798286438, "epoch": 0.11575408261287223, "grad_norm": 1.0703125, "learning_rate": 0.0004999956521142383, "loss": 6.009, "mean_token_accuracy": 0.1586822062730789, "num_tokens": 2755010.0, "step": 1205 }, { "entropy": 6.0991308212280275, "epoch": 0.11623439000960614, "grad_norm": 1.03125, "learning_rate": 0.0004999954363714051, "loss": 6.0361, "mean_token_accuracy": 0.14981242269277573, "num_tokens": 2766176.0, "step": 1210 }, { "entropy": 6.185801792144775, "epoch": 0.11671469740634005, "grad_norm": 1.015625, "learning_rate": 0.0004999952154048459, "loss": 6.1829, "mean_token_accuracy": 0.15044604614377022, "num_tokens": 2777861.0, "step": 1215 }, { "entropy": 6.021704149246216, "epoch": 0.11719500480307397, "grad_norm": 1.0234375, "learning_rate": 0.000499994989214566, "loss": 5.9954, "mean_token_accuracy": 0.1536705419421196, "num_tokens": 2788725.0, "step": 1220 }, { "entropy": 6.0181561470031735, "epoch": 0.11767531219980788, "grad_norm": 0.98046875, "learning_rate": 0.0004999947578005705, "loss": 6.0312, "mean_token_accuracy": 0.15193646997213364, "num_tokens": 2801613.0, "step": 1225 }, { "entropy": 6.218272686004639, "epoch": 0.11815561959654179, "grad_norm": 0.98828125, "learning_rate": 0.0004999945211628648, "loss": 6.0986, "mean_token_accuracy": 0.1493365317583084, "num_tokens": 2812474.0, "step": 1230 }, { "entropy": 5.971197032928467, "epoch": 0.1186359269932757, "grad_norm": 1.03125, "learning_rate": 0.0004999942793014544, "loss": 6.0103, "mean_token_accuracy": 0.15563429594039918, "num_tokens": 2823178.0, "step": 1235 }, { "entropy": 6.045905733108521, "epoch": 0.11911623439000961, "grad_norm": 0.9375, "learning_rate": 0.000499994032216345, "loss": 6.0211, "mean_token_accuracy": 0.15064174830913543, "num_tokens": 2836486.0, "step": 1240 }, { "entropy": 6.107371759414673, "epoch": 0.11959654178674352, "grad_norm": 1.0703125, "learning_rate": 0.0004999937799075422, "loss": 6.0746, "mean_token_accuracy": 0.1570821538567543, "num_tokens": 2847902.0, "step": 1245 }, { "entropy": 5.903108596801758, "epoch": 0.12007684918347743, "grad_norm": 0.99609375, "learning_rate": 0.000499993522375052, "loss": 5.9739, "mean_token_accuracy": 0.15461545437574387, "num_tokens": 2859991.0, "step": 1250 }, { "entropy": 6.248143100738526, "epoch": 0.12055715658021134, "grad_norm": 1.0546875, "learning_rate": 0.0004999932596188802, "loss": 6.1545, "mean_token_accuracy": 0.14593613222241403, "num_tokens": 2870269.0, "step": 1255 }, { "entropy": 6.034249687194825, "epoch": 0.12103746397694524, "grad_norm": 1.1171875, "learning_rate": 0.0004999929916390331, "loss": 6.0279, "mean_token_accuracy": 0.14597706943750383, "num_tokens": 2882191.0, "step": 1260 }, { "entropy": 5.966269588470459, "epoch": 0.12151777137367915, "grad_norm": 0.99609375, "learning_rate": 0.0004999927184355169, "loss": 6.0372, "mean_token_accuracy": 0.14836430177092552, "num_tokens": 2892775.0, "step": 1265 }, { "entropy": 6.147925519943238, "epoch": 0.12199807877041306, "grad_norm": 1.0234375, "learning_rate": 0.0004999924400083377, "loss": 6.0247, "mean_token_accuracy": 0.15831544399261474, "num_tokens": 2904750.0, "step": 1270 }, { "entropy": 6.081568050384521, "epoch": 0.12247838616714697, "grad_norm": 1.0078125, "learning_rate": 0.0004999921563575022, "loss": 6.0988, "mean_token_accuracy": 0.14920950308442116, "num_tokens": 2916150.0, "step": 1275 }, { "entropy": 6.07696213722229, "epoch": 0.12295869356388088, "grad_norm": 1.09375, "learning_rate": 0.0004999918674830169, "loss": 6.0644, "mean_token_accuracy": 0.1496642827987671, "num_tokens": 2928452.0, "step": 1280 }, { "entropy": 6.035782670974731, "epoch": 0.12343900096061479, "grad_norm": 1.0703125, "learning_rate": 0.0004999915733848886, "loss": 6.0442, "mean_token_accuracy": 0.1454036220908165, "num_tokens": 2940577.0, "step": 1285 }, { "entropy": 6.022758436203003, "epoch": 0.1239193083573487, "grad_norm": 1.0390625, "learning_rate": 0.000499991274063124, "loss": 6.0283, "mean_token_accuracy": 0.15150520876049994, "num_tokens": 2952302.0, "step": 1290 }, { "entropy": 6.0645428657531735, "epoch": 0.12439961575408261, "grad_norm": 1.1328125, "learning_rate": 0.0004999909695177301, "loss": 6.0669, "mean_token_accuracy": 0.15440516471862792, "num_tokens": 2964611.0, "step": 1295 }, { "entropy": 6.0961566925048825, "epoch": 0.12487992315081652, "grad_norm": 1.0078125, "learning_rate": 0.000499990659748714, "loss": 6.05, "mean_token_accuracy": 0.15006925463676452, "num_tokens": 2975668.0, "step": 1300 }, { "entropy": 6.146146440505982, "epoch": 0.12536023054755044, "grad_norm": 1.015625, "learning_rate": 0.0004999903447560828, "loss": 6.1198, "mean_token_accuracy": 0.14781473577022552, "num_tokens": 2987303.0, "step": 1305 }, { "entropy": 6.117984342575073, "epoch": 0.12584053794428435, "grad_norm": 0.9453125, "learning_rate": 0.0004999900245398439, "loss": 6.0166, "mean_token_accuracy": 0.16036698669195176, "num_tokens": 3000400.0, "step": 1310 }, { "entropy": 6.010946893692017, "epoch": 0.12632084534101826, "grad_norm": 1.0625, "learning_rate": 0.0004999896991000047, "loss": 5.9477, "mean_token_accuracy": 0.1495976448059082, "num_tokens": 3012336.0, "step": 1315 }, { "entropy": 6.054377698898316, "epoch": 0.12680115273775217, "grad_norm": 1.046875, "learning_rate": 0.0004999893684365729, "loss": 6.0047, "mean_token_accuracy": 0.15137309059500695, "num_tokens": 3023004.0, "step": 1320 }, { "entropy": 6.044629859924316, "epoch": 0.12728146013448607, "grad_norm": 0.984375, "learning_rate": 0.0004999890325495559, "loss": 6.0922, "mean_token_accuracy": 0.147823116928339, "num_tokens": 3035147.0, "step": 1325 }, { "entropy": 6.072157478332519, "epoch": 0.12776176753121998, "grad_norm": 1.0078125, "learning_rate": 0.0004999886914389617, "loss": 5.9177, "mean_token_accuracy": 0.1551705077290535, "num_tokens": 3045611.0, "step": 1330 }, { "entropy": 5.916638660430908, "epoch": 0.1282420749279539, "grad_norm": 0.92578125, "learning_rate": 0.0004999883451047981, "loss": 5.9296, "mean_token_accuracy": 0.1561925306916237, "num_tokens": 3056420.0, "step": 1335 }, { "entropy": 5.977782440185547, "epoch": 0.1287223823246878, "grad_norm": 0.9765625, "learning_rate": 0.0004999879935470733, "loss": 5.9227, "mean_token_accuracy": 0.15750788599252702, "num_tokens": 3068770.0, "step": 1340 }, { "entropy": 6.05616979598999, "epoch": 0.1292026897214217, "grad_norm": 1.015625, "learning_rate": 0.0004999876367657954, "loss": 6.0521, "mean_token_accuracy": 0.14580482840538025, "num_tokens": 3080806.0, "step": 1345 }, { "entropy": 6.143747854232788, "epoch": 0.12968299711815562, "grad_norm": 1.015625, "learning_rate": 0.0004999872747609725, "loss": 6.0742, "mean_token_accuracy": 0.1484417587518692, "num_tokens": 3091769.0, "step": 1350 }, { "entropy": 5.9879156112670895, "epoch": 0.13016330451488953, "grad_norm": 1.171875, "learning_rate": 0.0004999869075326132, "loss": 5.9938, "mean_token_accuracy": 0.15191702395677567, "num_tokens": 3103121.0, "step": 1355 }, { "entropy": 6.010816240310669, "epoch": 0.13064361191162344, "grad_norm": 0.890625, "learning_rate": 0.000499986535080726, "loss": 5.9724, "mean_token_accuracy": 0.16233935654163362, "num_tokens": 3115606.0, "step": 1360 }, { "entropy": 6.026129817962646, "epoch": 0.13112391930835735, "grad_norm": 0.94921875, "learning_rate": 0.0004999861574053196, "loss": 5.8723, "mean_token_accuracy": 0.16096271872520446, "num_tokens": 3127961.0, "step": 1365 }, { "entropy": 5.87260947227478, "epoch": 0.13160422670509125, "grad_norm": 1.015625, "learning_rate": 0.0004999857745064027, "loss": 5.8905, "mean_token_accuracy": 0.15895691215991975, "num_tokens": 3138316.0, "step": 1370 }, { "entropy": 5.953699588775635, "epoch": 0.13208453410182516, "grad_norm": 0.9296875, "learning_rate": 0.000499985386383984, "loss": 5.8671, "mean_token_accuracy": 0.15866711735725403, "num_tokens": 3150818.0, "step": 1375 }, { "entropy": 6.006815195083618, "epoch": 0.13256484149855907, "grad_norm": 1.1015625, "learning_rate": 0.0004999849930380729, "loss": 6.0195, "mean_token_accuracy": 0.1508159779012203, "num_tokens": 3162066.0, "step": 1380 }, { "entropy": 5.941660642623901, "epoch": 0.13304514889529298, "grad_norm": 1.09375, "learning_rate": 0.0004999845944686781, "loss": 5.9924, "mean_token_accuracy": 0.1508888617157936, "num_tokens": 3172209.0, "step": 1385 }, { "entropy": 5.954594707489013, "epoch": 0.1335254562920269, "grad_norm": 0.98046875, "learning_rate": 0.0004999841906758093, "loss": 5.8218, "mean_token_accuracy": 0.1675858825445175, "num_tokens": 3183248.0, "step": 1390 }, { "entropy": 5.94215030670166, "epoch": 0.1340057636887608, "grad_norm": 1.0625, "learning_rate": 0.0004999837816594757, "loss": 5.9139, "mean_token_accuracy": 0.15847276002168656, "num_tokens": 3194748.0, "step": 1395 }, { "entropy": 5.930553770065307, "epoch": 0.1344860710854947, "grad_norm": 1.0625, "learning_rate": 0.0004999833674196865, "loss": 5.8849, "mean_token_accuracy": 0.16950529664754868, "num_tokens": 3205669.0, "step": 1400 }, { "entropy": 5.932918214797974, "epoch": 0.13496637848222862, "grad_norm": 1.0234375, "learning_rate": 0.0004999829479564518, "loss": 5.9807, "mean_token_accuracy": 0.14995542094111441, "num_tokens": 3216035.0, "step": 1405 }, { "entropy": 6.064324188232422, "epoch": 0.13544668587896252, "grad_norm": 1.109375, "learning_rate": 0.000499982523269781, "loss": 5.9647, "mean_token_accuracy": 0.15931690335273743, "num_tokens": 3227192.0, "step": 1410 }, { "entropy": 5.975619888305664, "epoch": 0.13592699327569643, "grad_norm": 0.9609375, "learning_rate": 0.0004999820933596842, "loss": 5.9871, "mean_token_accuracy": 0.15620121210813523, "num_tokens": 3240237.0, "step": 1415 }, { "entropy": 5.962911701202392, "epoch": 0.13640730067243034, "grad_norm": 1.0234375, "learning_rate": 0.000499981658226171, "loss": 5.8734, "mean_token_accuracy": 0.16469697579741477, "num_tokens": 3251963.0, "step": 1420 }, { "entropy": 5.908741474151611, "epoch": 0.13688760806916425, "grad_norm": 1.0078125, "learning_rate": 0.000499981217869252, "loss": 5.9953, "mean_token_accuracy": 0.15814436972141266, "num_tokens": 3263101.0, "step": 1425 }, { "entropy": 5.985613679885864, "epoch": 0.1373679154658982, "grad_norm": 1.1015625, "learning_rate": 0.000499980772288937, "loss": 5.8679, "mean_token_accuracy": 0.16649020761251448, "num_tokens": 3275100.0, "step": 1430 }, { "entropy": 5.945235109329223, "epoch": 0.1378482228626321, "grad_norm": 0.9140625, "learning_rate": 0.0004999803214852367, "loss": 5.9638, "mean_token_accuracy": 0.15565589517354966, "num_tokens": 3287025.0, "step": 1435 }, { "entropy": 6.04934253692627, "epoch": 0.138328530259366, "grad_norm": 0.91796875, "learning_rate": 0.0004999798654581613, "loss": 5.9662, "mean_token_accuracy": 0.15883919447660447, "num_tokens": 3299867.0, "step": 1440 }, { "entropy": 5.918570852279663, "epoch": 0.13880883765609991, "grad_norm": 1.046875, "learning_rate": 0.0004999794042077214, "loss": 5.9038, "mean_token_accuracy": 0.16191874593496322, "num_tokens": 3311183.0, "step": 1445 }, { "entropy": 5.952925539016723, "epoch": 0.13928914505283382, "grad_norm": 1.1484375, "learning_rate": 0.0004999789377339279, "loss": 5.9687, "mean_token_accuracy": 0.15641413480043412, "num_tokens": 3322247.0, "step": 1450 }, { "entropy": 5.962415742874145, "epoch": 0.13976945244956773, "grad_norm": 1.03125, "learning_rate": 0.0004999784660367915, "loss": 5.8826, "mean_token_accuracy": 0.1588966131210327, "num_tokens": 3333369.0, "step": 1455 }, { "entropy": 5.904612874984741, "epoch": 0.14024975984630164, "grad_norm": 1.0546875, "learning_rate": 0.0004999779891163231, "loss": 5.9113, "mean_token_accuracy": 0.16011089235544204, "num_tokens": 3345876.0, "step": 1460 }, { "entropy": 5.91278772354126, "epoch": 0.14073006724303555, "grad_norm": 1.0234375, "learning_rate": 0.0004999775069725339, "loss": 5.8124, "mean_token_accuracy": 0.1629629462957382, "num_tokens": 3357323.0, "step": 1465 }, { "entropy": 5.912459039688111, "epoch": 0.14121037463976946, "grad_norm": 1.109375, "learning_rate": 0.000499977019605435, "loss": 5.897, "mean_token_accuracy": 0.15947655588388443, "num_tokens": 3367689.0, "step": 1470 }, { "entropy": 5.844752836227417, "epoch": 0.14169068203650337, "grad_norm": 0.9921875, "learning_rate": 0.0004999765270150378, "loss": 5.8568, "mean_token_accuracy": 0.15955205261707306, "num_tokens": 3379472.0, "step": 1475 }, { "entropy": 5.996302938461303, "epoch": 0.14217098943323728, "grad_norm": 1.015625, "learning_rate": 0.0004999760292013536, "loss": 5.8922, "mean_token_accuracy": 0.15859662368893623, "num_tokens": 3390929.0, "step": 1480 }, { "entropy": 5.99014687538147, "epoch": 0.14265129682997119, "grad_norm": 1.0625, "learning_rate": 0.0004999755261643941, "loss": 5.8976, "mean_token_accuracy": 0.16287715286016463, "num_tokens": 3401242.0, "step": 1485 }, { "entropy": 5.869934892654419, "epoch": 0.1431316042267051, "grad_norm": 1.0859375, "learning_rate": 0.0004999750179041709, "loss": 5.8878, "mean_token_accuracy": 0.16124220937490463, "num_tokens": 3411169.0, "step": 1490 }, { "entropy": 5.874157810211182, "epoch": 0.143611911623439, "grad_norm": 1.0859375, "learning_rate": 0.0004999745044206959, "loss": 5.7279, "mean_token_accuracy": 0.16647156924009324, "num_tokens": 3423265.0, "step": 1495 }, { "entropy": 5.832660913467407, "epoch": 0.1440922190201729, "grad_norm": 0.96484375, "learning_rate": 0.0004999739857139809, "loss": 5.8347, "mean_token_accuracy": 0.16908216327428818, "num_tokens": 3434793.0, "step": 1500 }, { "entropy": 5.757522106170654, "epoch": 0.14457252641690682, "grad_norm": 0.98828125, "learning_rate": 0.000499973461784038, "loss": 5.7679, "mean_token_accuracy": 0.17928926199674605, "num_tokens": 3445732.0, "step": 1505 }, { "entropy": 5.942258501052857, "epoch": 0.14505283381364073, "grad_norm": 0.98046875, "learning_rate": 0.0004999729326308792, "loss": 5.9516, "mean_token_accuracy": 0.15832037180662156, "num_tokens": 3457090.0, "step": 1510 }, { "entropy": 5.99946174621582, "epoch": 0.14553314121037464, "grad_norm": 1.1484375, "learning_rate": 0.000499972398254517, "loss": 5.9388, "mean_token_accuracy": 0.15340567082166673, "num_tokens": 3468087.0, "step": 1515 }, { "entropy": 5.941799163818359, "epoch": 0.14601344860710855, "grad_norm": 1.0625, "learning_rate": 0.000499971858654964, "loss": 5.8778, "mean_token_accuracy": 0.1609287366271019, "num_tokens": 3478820.0, "step": 1520 }, { "entropy": 5.859274196624756, "epoch": 0.14649375600384246, "grad_norm": 0.97265625, "learning_rate": 0.0004999713138322321, "loss": 5.9021, "mean_token_accuracy": 0.15754427909851074, "num_tokens": 3489878.0, "step": 1525 }, { "entropy": 5.942076396942139, "epoch": 0.14697406340057637, "grad_norm": 1.03125, "learning_rate": 0.0004999707637863346, "loss": 5.8905, "mean_token_accuracy": 0.1585473045706749, "num_tokens": 3500944.0, "step": 1530 }, { "entropy": 5.8406360149383545, "epoch": 0.14745437079731027, "grad_norm": 1.078125, "learning_rate": 0.0004999702085172838, "loss": 5.8719, "mean_token_accuracy": 0.16607238352298737, "num_tokens": 3511383.0, "step": 1535 }, { "entropy": 5.969763612747192, "epoch": 0.14793467819404418, "grad_norm": 0.9609375, "learning_rate": 0.0004999696480250929, "loss": 5.963, "mean_token_accuracy": 0.15430965945124625, "num_tokens": 3523300.0, "step": 1540 }, { "entropy": 5.970634698867798, "epoch": 0.1484149855907781, "grad_norm": 1.1953125, "learning_rate": 0.0004999690823097747, "loss": 5.8799, "mean_token_accuracy": 0.1521039791405201, "num_tokens": 3534371.0, "step": 1545 }, { "entropy": 5.841155576705932, "epoch": 0.148895292987512, "grad_norm": 1.1171875, "learning_rate": 0.0004999685113713426, "loss": 5.8552, "mean_token_accuracy": 0.16120514869689942, "num_tokens": 3544847.0, "step": 1550 }, { "entropy": 5.92685284614563, "epoch": 0.1493756003842459, "grad_norm": 0.9765625, "learning_rate": 0.0004999679352098096, "loss": 5.8223, "mean_token_accuracy": 0.16645588725805283, "num_tokens": 3555859.0, "step": 1555 }, { "entropy": 5.8343531608581545, "epoch": 0.14985590778097982, "grad_norm": 0.9375, "learning_rate": 0.0004999673538251891, "loss": 5.8389, "mean_token_accuracy": 0.15894080251455306, "num_tokens": 3568283.0, "step": 1560 }, { "entropy": 5.834793663024902, "epoch": 0.15033621517771373, "grad_norm": 0.9609375, "learning_rate": 0.0004999667672174947, "loss": 5.917, "mean_token_accuracy": 0.1583700641989708, "num_tokens": 3581442.0, "step": 1565 }, { "entropy": 6.0175745487213135, "epoch": 0.15081652257444764, "grad_norm": 1.015625, "learning_rate": 0.00049996617538674, "loss": 5.9571, "mean_token_accuracy": 0.15496992468833923, "num_tokens": 3594055.0, "step": 1570 }, { "entropy": 5.962413930892945, "epoch": 0.15129682997118155, "grad_norm": 1.0, "learning_rate": 0.0004999655783329386, "loss": 5.9187, "mean_token_accuracy": 0.15283605754375457, "num_tokens": 3605952.0, "step": 1575 }, { "entropy": 5.910793209075928, "epoch": 0.15177713736791545, "grad_norm": 0.98828125, "learning_rate": 0.0004999649760561046, "loss": 5.9577, "mean_token_accuracy": 0.158383572101593, "num_tokens": 3618544.0, "step": 1580 }, { "entropy": 5.908201408386231, "epoch": 0.15225744476464936, "grad_norm": 1.03125, "learning_rate": 0.0004999643685562519, "loss": 5.8929, "mean_token_accuracy": 0.16440413743257523, "num_tokens": 3630445.0, "step": 1585 }, { "entropy": 5.935053777694702, "epoch": 0.15273775216138327, "grad_norm": 1.0234375, "learning_rate": 0.0004999637558333945, "loss": 5.8797, "mean_token_accuracy": 0.16155748218297958, "num_tokens": 3642516.0, "step": 1590 }, { "entropy": 5.843541431427002, "epoch": 0.15321805955811718, "grad_norm": 1.046875, "learning_rate": 0.0004999631378875467, "loss": 5.8175, "mean_token_accuracy": 0.16581382006406784, "num_tokens": 3654425.0, "step": 1595 }, { "entropy": 5.805763053894043, "epoch": 0.15369836695485112, "grad_norm": 0.9765625, "learning_rate": 0.0004999625147187228, "loss": 5.8228, "mean_token_accuracy": 0.16464165300130845, "num_tokens": 3666521.0, "step": 1600 }, { "entropy": 6.019205856323242, "epoch": 0.15417867435158503, "grad_norm": 0.94140625, "learning_rate": 0.0004999618863269373, "loss": 5.8806, "mean_token_accuracy": 0.15575164407491685, "num_tokens": 3679121.0, "step": 1605 }, { "entropy": 5.91282377243042, "epoch": 0.15465898174831894, "grad_norm": 1.015625, "learning_rate": 0.0004999612527122049, "loss": 5.8941, "mean_token_accuracy": 0.15461272597312928, "num_tokens": 3691095.0, "step": 1610 }, { "entropy": 5.826972103118896, "epoch": 0.15513928914505284, "grad_norm": 0.87109375, "learning_rate": 0.0004999606138745402, "loss": 5.8562, "mean_token_accuracy": 0.16407538801431656, "num_tokens": 3703426.0, "step": 1615 }, { "entropy": 5.967412042617798, "epoch": 0.15561959654178675, "grad_norm": 1.0, "learning_rate": 0.0004999599698139581, "loss": 5.9309, "mean_token_accuracy": 0.1637990355491638, "num_tokens": 3715429.0, "step": 1620 }, { "entropy": 5.932253503799439, "epoch": 0.15609990393852066, "grad_norm": 1.03125, "learning_rate": 0.0004999593205304734, "loss": 5.909, "mean_token_accuracy": 0.15584128946065903, "num_tokens": 3726327.0, "step": 1625 }, { "entropy": 5.9037374496459964, "epoch": 0.15658021133525457, "grad_norm": 1.0, "learning_rate": 0.0004999586660241012, "loss": 5.8582, "mean_token_accuracy": 0.1553866222500801, "num_tokens": 3736818.0, "step": 1630 }, { "entropy": 5.929326868057251, "epoch": 0.15706051873198848, "grad_norm": 0.9921875, "learning_rate": 0.0004999580062948569, "loss": 5.8583, "mean_token_accuracy": 0.16254822611808778, "num_tokens": 3747776.0, "step": 1635 }, { "entropy": 5.7625970363616945, "epoch": 0.1575408261287224, "grad_norm": 1.0, "learning_rate": 0.0004999573413427556, "loss": 5.7301, "mean_token_accuracy": 0.164338056743145, "num_tokens": 3758990.0, "step": 1640 }, { "entropy": 5.8398857593536375, "epoch": 0.1580211335254563, "grad_norm": 0.9609375, "learning_rate": 0.0004999566711678128, "loss": 5.7961, "mean_token_accuracy": 0.1605479434132576, "num_tokens": 3769686.0, "step": 1645 }, { "entropy": 5.867894649505615, "epoch": 0.1585014409221902, "grad_norm": 0.92578125, "learning_rate": 0.0004999559957700442, "loss": 5.8554, "mean_token_accuracy": 0.16354380249977113, "num_tokens": 3781815.0, "step": 1650 }, { "entropy": 5.88207426071167, "epoch": 0.15898174831892412, "grad_norm": 0.99609375, "learning_rate": 0.0004999553151494653, "loss": 5.9139, "mean_token_accuracy": 0.15942219495773316, "num_tokens": 3793392.0, "step": 1655 }, { "entropy": 5.860579538345337, "epoch": 0.15946205571565802, "grad_norm": 1.015625, "learning_rate": 0.0004999546293060919, "loss": 5.8298, "mean_token_accuracy": 0.16041782200336457, "num_tokens": 3804974.0, "step": 1660 }, { "entropy": 5.799793004989624, "epoch": 0.15994236311239193, "grad_norm": 0.953125, "learning_rate": 0.00049995393823994, "loss": 5.7028, "mean_token_accuracy": 0.17192372530698777, "num_tokens": 3817166.0, "step": 1665 }, { "entropy": 5.849306297302246, "epoch": 0.16042267050912584, "grad_norm": 1.03125, "learning_rate": 0.0004999532419510255, "loss": 5.8307, "mean_token_accuracy": 0.1580624461174011, "num_tokens": 3828151.0, "step": 1670 }, { "entropy": 5.847281789779663, "epoch": 0.16090297790585975, "grad_norm": 0.97265625, "learning_rate": 0.000499952540439365, "loss": 5.8283, "mean_token_accuracy": 0.16032543033361435, "num_tokens": 3839439.0, "step": 1675 }, { "entropy": 5.906755828857422, "epoch": 0.16138328530259366, "grad_norm": 0.95703125, "learning_rate": 0.0004999518337049743, "loss": 5.8813, "mean_token_accuracy": 0.15963228195905685, "num_tokens": 3851694.0, "step": 1680 }, { "entropy": 5.831542205810547, "epoch": 0.16186359269932757, "grad_norm": 0.91015625, "learning_rate": 0.00049995112174787, "loss": 5.8589, "mean_token_accuracy": 0.15917099863290787, "num_tokens": 3863593.0, "step": 1685 }, { "entropy": 5.811672306060791, "epoch": 0.16234390009606148, "grad_norm": 0.95703125, "learning_rate": 0.0004999504045680687, "loss": 5.7935, "mean_token_accuracy": 0.1701650395989418, "num_tokens": 3874588.0, "step": 1690 }, { "entropy": 5.894420862197876, "epoch": 0.1628242074927954, "grad_norm": 1.046875, "learning_rate": 0.0004999496821655869, "loss": 5.8753, "mean_token_accuracy": 0.16022350043058395, "num_tokens": 3884662.0, "step": 1695 }, { "entropy": 5.956241655349731, "epoch": 0.1633045148895293, "grad_norm": 0.890625, "learning_rate": 0.0004999489545404414, "loss": 5.9739, "mean_token_accuracy": 0.15092033073306083, "num_tokens": 3896569.0, "step": 1700 }, { "entropy": 5.943658018112183, "epoch": 0.1637848222862632, "grad_norm": 0.8984375, "learning_rate": 0.0004999482216926493, "loss": 5.8162, "mean_token_accuracy": 0.1632000833749771, "num_tokens": 3907691.0, "step": 1705 }, { "entropy": 5.843317651748658, "epoch": 0.1642651296829971, "grad_norm": 1.09375, "learning_rate": 0.0004999474836222273, "loss": 5.83, "mean_token_accuracy": 0.1665841408073902, "num_tokens": 3918794.0, "step": 1710 }, { "entropy": 5.834485340118408, "epoch": 0.16474543707973102, "grad_norm": 0.94140625, "learning_rate": 0.0004999467403291928, "loss": 5.8301, "mean_token_accuracy": 0.1692491739988327, "num_tokens": 3929773.0, "step": 1715 }, { "entropy": 5.874946594238281, "epoch": 0.16522574447646493, "grad_norm": 1.0625, "learning_rate": 0.0004999459918135628, "loss": 5.8498, "mean_token_accuracy": 0.16062923073768615, "num_tokens": 3940264.0, "step": 1720 }, { "entropy": 5.791439247131348, "epoch": 0.16570605187319884, "grad_norm": 1.0078125, "learning_rate": 0.000499945238075355, "loss": 5.7456, "mean_token_accuracy": 0.1693306788802147, "num_tokens": 3951500.0, "step": 1725 }, { "entropy": 5.851829910278321, "epoch": 0.16618635926993275, "grad_norm": 1.046875, "learning_rate": 0.0004999444791145865, "loss": 5.8145, "mean_token_accuracy": 0.16588351577520372, "num_tokens": 3963580.0, "step": 1730 }, { "entropy": 5.804158353805542, "epoch": 0.16666666666666666, "grad_norm": 0.9375, "learning_rate": 0.0004999437149312754, "loss": 5.7585, "mean_token_accuracy": 0.17176578491926192, "num_tokens": 3975994.0, "step": 1735 }, { "entropy": 5.836318635940552, "epoch": 0.16714697406340057, "grad_norm": 1.015625, "learning_rate": 0.000499942945525439, "loss": 5.7658, "mean_token_accuracy": 0.15896687656641006, "num_tokens": 3987897.0, "step": 1740 }, { "entropy": 5.888211059570312, "epoch": 0.16762728146013448, "grad_norm": 1.03125, "learning_rate": 0.0004999421708970954, "loss": 5.93, "mean_token_accuracy": 0.15537445321679116, "num_tokens": 3999829.0, "step": 1745 }, { "entropy": 5.7658594131469725, "epoch": 0.16810758885686838, "grad_norm": 0.9765625, "learning_rate": 0.0004999413910462625, "loss": 5.7591, "mean_token_accuracy": 0.16620118021965027, "num_tokens": 4010882.0, "step": 1750 }, { "entropy": 5.861884737014771, "epoch": 0.1685878962536023, "grad_norm": 0.9453125, "learning_rate": 0.0004999406059729586, "loss": 5.7469, "mean_token_accuracy": 0.17034892737865448, "num_tokens": 4021423.0, "step": 1755 }, { "entropy": 5.888075494766236, "epoch": 0.1690682036503362, "grad_norm": 0.921875, "learning_rate": 0.0004999398156772016, "loss": 5.8931, "mean_token_accuracy": 0.15374189764261245, "num_tokens": 4033590.0, "step": 1760 }, { "entropy": 5.721970653533935, "epoch": 0.16954851104707014, "grad_norm": 1.078125, "learning_rate": 0.00049993902015901, "loss": 5.7562, "mean_token_accuracy": 0.16655992865562438, "num_tokens": 4043978.0, "step": 1765 }, { "entropy": 5.931190156936646, "epoch": 0.17002881844380405, "grad_norm": 1.0703125, "learning_rate": 0.0004999382194184023, "loss": 5.8756, "mean_token_accuracy": 0.16273052543401717, "num_tokens": 4054513.0, "step": 1770 }, { "entropy": 5.857993745803833, "epoch": 0.17050912584053796, "grad_norm": 0.9375, "learning_rate": 0.0004999374134553972, "loss": 5.8367, "mean_token_accuracy": 0.16276317089796066, "num_tokens": 4066019.0, "step": 1775 }, { "entropy": 5.841061735153199, "epoch": 0.17098943323727187, "grad_norm": 0.93359375, "learning_rate": 0.0004999366022700131, "loss": 5.7935, "mean_token_accuracy": 0.1673088401556015, "num_tokens": 4077688.0, "step": 1780 }, { "entropy": 5.860415935516357, "epoch": 0.17146974063400577, "grad_norm": 0.9765625, "learning_rate": 0.0004999357858622691, "loss": 5.8573, "mean_token_accuracy": 0.1664716601371765, "num_tokens": 4089803.0, "step": 1785 }, { "entropy": 5.8289069652557375, "epoch": 0.17195004803073968, "grad_norm": 0.91796875, "learning_rate": 0.0004999349642321842, "loss": 5.8073, "mean_token_accuracy": 0.16912547051906585, "num_tokens": 4101969.0, "step": 1790 }, { "entropy": 5.799117517471314, "epoch": 0.1724303554274736, "grad_norm": 0.99609375, "learning_rate": 0.0004999341373797772, "loss": 5.7955, "mean_token_accuracy": 0.15957102179527283, "num_tokens": 4113567.0, "step": 1795 }, { "entropy": 5.814974451065064, "epoch": 0.1729106628242075, "grad_norm": 0.9921875, "learning_rate": 0.0004999333053050675, "loss": 5.7575, "mean_token_accuracy": 0.1691056177020073, "num_tokens": 4125191.0, "step": 1800 }, { "entropy": 5.827954626083374, "epoch": 0.1733909702209414, "grad_norm": 1.140625, "learning_rate": 0.0004999324680080744, "loss": 5.8004, "mean_token_accuracy": 0.16687883883714677, "num_tokens": 4135050.0, "step": 1805 }, { "entropy": 5.842863750457764, "epoch": 0.17387127761767532, "grad_norm": 0.93359375, "learning_rate": 0.0004999316254888172, "loss": 5.8736, "mean_token_accuracy": 0.1648238182067871, "num_tokens": 4146874.0, "step": 1810 }, { "entropy": 5.857775688171387, "epoch": 0.17435158501440923, "grad_norm": 0.93359375, "learning_rate": 0.0004999307777473157, "loss": 5.7974, "mean_token_accuracy": 0.16151650995016098, "num_tokens": 4158118.0, "step": 1815 }, { "entropy": 5.818978691101075, "epoch": 0.17483189241114314, "grad_norm": 1.171875, "learning_rate": 0.0004999299247835893, "loss": 5.7561, "mean_token_accuracy": 0.17479462176561356, "num_tokens": 4169035.0, "step": 1820 }, { "entropy": 5.738432455062866, "epoch": 0.17531219980787704, "grad_norm": 1.03125, "learning_rate": 0.000499929066597658, "loss": 5.745, "mean_token_accuracy": 0.17148349434137344, "num_tokens": 4180314.0, "step": 1825 }, { "entropy": 5.883955717086792, "epoch": 0.17579250720461095, "grad_norm": 1.046875, "learning_rate": 0.0004999282031895418, "loss": 5.8239, "mean_token_accuracy": 0.16614590883255004, "num_tokens": 4192238.0, "step": 1830 }, { "entropy": 5.769097232818604, "epoch": 0.17627281460134486, "grad_norm": 1.0078125, "learning_rate": 0.0004999273345592604, "loss": 5.756, "mean_token_accuracy": 0.16652164459228516, "num_tokens": 4203346.0, "step": 1835 }, { "entropy": 5.811061954498291, "epoch": 0.17675312199807877, "grad_norm": 0.96875, "learning_rate": 0.0004999264607068343, "loss": 5.8159, "mean_token_accuracy": 0.17016567289829254, "num_tokens": 4213763.0, "step": 1840 }, { "entropy": 5.781940555572509, "epoch": 0.17723342939481268, "grad_norm": 0.90234375, "learning_rate": 0.0004999255816322837, "loss": 5.7699, "mean_token_accuracy": 0.16876950412988662, "num_tokens": 4225553.0, "step": 1845 }, { "entropy": 5.857665061950684, "epoch": 0.1777137367915466, "grad_norm": 0.99609375, "learning_rate": 0.000499924697335629, "loss": 5.702, "mean_token_accuracy": 0.17350574135780333, "num_tokens": 4236058.0, "step": 1850 }, { "entropy": 5.640166330337524, "epoch": 0.1781940441882805, "grad_norm": 0.92578125, "learning_rate": 0.0004999238078168906, "loss": 5.7763, "mean_token_accuracy": 0.17054813206195832, "num_tokens": 4248299.0, "step": 1855 }, { "entropy": 5.8273721694946286, "epoch": 0.1786743515850144, "grad_norm": 0.94921875, "learning_rate": 0.0004999229130760894, "loss": 5.7052, "mean_token_accuracy": 0.17111807465553283, "num_tokens": 4259704.0, "step": 1860 }, { "entropy": 5.691127586364746, "epoch": 0.17915465898174832, "grad_norm": 1.0, "learning_rate": 0.000499922013113246, "loss": 5.587, "mean_token_accuracy": 0.18398697525262833, "num_tokens": 4270480.0, "step": 1865 }, { "entropy": 5.780127954483032, "epoch": 0.17963496637848222, "grad_norm": 1.0234375, "learning_rate": 0.0004999211079283814, "loss": 5.8538, "mean_token_accuracy": 0.16719998568296432, "num_tokens": 4282104.0, "step": 1870 }, { "entropy": 5.849603605270386, "epoch": 0.18011527377521613, "grad_norm": 0.93359375, "learning_rate": 0.0004999201975215164, "loss": 5.8172, "mean_token_accuracy": 0.16666848957538605, "num_tokens": 4294251.0, "step": 1875 }, { "entropy": 5.757232236862182, "epoch": 0.18059558117195004, "grad_norm": 0.95703125, "learning_rate": 0.0004999192818926725, "loss": 5.7017, "mean_token_accuracy": 0.16847867369651795, "num_tokens": 4305569.0, "step": 1880 }, { "entropy": 5.859993028640747, "epoch": 0.18107588856868395, "grad_norm": 1.09375, "learning_rate": 0.0004999183610418706, "loss": 5.8283, "mean_token_accuracy": 0.16413767859339715, "num_tokens": 4317845.0, "step": 1885 }, { "entropy": 5.76594557762146, "epoch": 0.18155619596541786, "grad_norm": 0.90625, "learning_rate": 0.0004999174349691322, "loss": 5.6959, "mean_token_accuracy": 0.17179392874240876, "num_tokens": 4329987.0, "step": 1890 }, { "entropy": 5.697657203674316, "epoch": 0.18203650336215177, "grad_norm": 0.88671875, "learning_rate": 0.0004999165036744788, "loss": 5.7257, "mean_token_accuracy": 0.16847490072250365, "num_tokens": 4341628.0, "step": 1895 }, { "entropy": 5.861244201660156, "epoch": 0.18251681075888568, "grad_norm": 1.046875, "learning_rate": 0.0004999155671579322, "loss": 5.7851, "mean_token_accuracy": 0.1615397110581398, "num_tokens": 4352379.0, "step": 1900 }, { "entropy": 5.6849024295806885, "epoch": 0.1829971181556196, "grad_norm": 1.0234375, "learning_rate": 0.000499914625419514, "loss": 5.7181, "mean_token_accuracy": 0.171738800406456, "num_tokens": 4364800.0, "step": 1905 }, { "entropy": 5.776795959472656, "epoch": 0.1834774255523535, "grad_norm": 1.0859375, "learning_rate": 0.0004999136784592459, "loss": 5.7315, "mean_token_accuracy": 0.16872817426919937, "num_tokens": 4376048.0, "step": 1910 }, { "entropy": 5.730347061157227, "epoch": 0.1839577329490874, "grad_norm": 0.921875, "learning_rate": 0.0004999127262771502, "loss": 5.7297, "mean_token_accuracy": 0.16825871765613556, "num_tokens": 4388072.0, "step": 1915 }, { "entropy": 5.872533082962036, "epoch": 0.1844380403458213, "grad_norm": 1.046875, "learning_rate": 0.0004999117688732487, "loss": 5.8226, "mean_token_accuracy": 0.16391085535287858, "num_tokens": 4399843.0, "step": 1920 }, { "entropy": 5.713910245895386, "epoch": 0.18491834774255522, "grad_norm": 1.015625, "learning_rate": 0.0004999108062475638, "loss": 5.6757, "mean_token_accuracy": 0.17384760677814484, "num_tokens": 4411373.0, "step": 1925 }, { "entropy": 5.716005563735962, "epoch": 0.18539865513928913, "grad_norm": 1.03125, "learning_rate": 0.000499909838400118, "loss": 5.6614, "mean_token_accuracy": 0.173922398686409, "num_tokens": 4421857.0, "step": 1930 }, { "entropy": 5.820113229751587, "epoch": 0.18587896253602307, "grad_norm": 1.0078125, "learning_rate": 0.0004999088653309334, "loss": 5.7618, "mean_token_accuracy": 0.1711716189980507, "num_tokens": 4432728.0, "step": 1935 }, { "entropy": 5.708466053009033, "epoch": 0.18635926993275698, "grad_norm": 0.9375, "learning_rate": 0.0004999078870400329, "loss": 5.693, "mean_token_accuracy": 0.17283684760332108, "num_tokens": 4444683.0, "step": 1940 }, { "entropy": 5.8614743709564205, "epoch": 0.18683957732949089, "grad_norm": 0.953125, "learning_rate": 0.0004999069035274391, "loss": 5.8215, "mean_token_accuracy": 0.16018551886081694, "num_tokens": 4456961.0, "step": 1945 }, { "entropy": 5.694478511810303, "epoch": 0.1873198847262248, "grad_norm": 0.9140625, "learning_rate": 0.0004999059147931747, "loss": 5.665, "mean_token_accuracy": 0.1762719616293907, "num_tokens": 4468424.0, "step": 1950 }, { "entropy": 5.791493558883667, "epoch": 0.1878001921229587, "grad_norm": 0.94921875, "learning_rate": 0.0004999049208372629, "loss": 5.8694, "mean_token_accuracy": 0.15364666059613227, "num_tokens": 4479813.0, "step": 1955 }, { "entropy": 5.952554082870483, "epoch": 0.1882804995196926, "grad_norm": 1.03125, "learning_rate": 0.0004999039216597267, "loss": 5.862, "mean_token_accuracy": 0.16733278185129166, "num_tokens": 4491172.0, "step": 1960 }, { "entropy": 5.706536293029785, "epoch": 0.18876080691642652, "grad_norm": 0.92578125, "learning_rate": 0.0004999029172605892, "loss": 5.7439, "mean_token_accuracy": 0.1704375624656677, "num_tokens": 4503063.0, "step": 1965 }, { "entropy": 5.889812326431274, "epoch": 0.18924111431316043, "grad_norm": 0.91796875, "learning_rate": 0.0004999019076398738, "loss": 5.8177, "mean_token_accuracy": 0.15313875377178193, "num_tokens": 4514188.0, "step": 1970 }, { "entropy": 5.822384834289551, "epoch": 0.18972142170989434, "grad_norm": 0.95703125, "learning_rate": 0.000499900892797604, "loss": 5.7258, "mean_token_accuracy": 0.17310872822999954, "num_tokens": 4525293.0, "step": 1975 }, { "entropy": 5.80044903755188, "epoch": 0.19020172910662825, "grad_norm": 1.046875, "learning_rate": 0.0004998998727338031, "loss": 5.8139, "mean_token_accuracy": 0.1692732721567154, "num_tokens": 4536589.0, "step": 1980 }, { "entropy": 5.689789342880249, "epoch": 0.19068203650336216, "grad_norm": 0.98828125, "learning_rate": 0.0004998988474484952, "loss": 5.5648, "mean_token_accuracy": 0.19031796902418135, "num_tokens": 4547594.0, "step": 1985 }, { "entropy": 5.717133808135986, "epoch": 0.19116234390009607, "grad_norm": 0.90625, "learning_rate": 0.0004998978169417038, "loss": 5.78, "mean_token_accuracy": 0.1743384450674057, "num_tokens": 4559850.0, "step": 1990 }, { "entropy": 5.791743421554566, "epoch": 0.19164265129682997, "grad_norm": 1.0546875, "learning_rate": 0.0004998967812134529, "loss": 5.7138, "mean_token_accuracy": 0.17110339552164078, "num_tokens": 4570727.0, "step": 1995 }, { "entropy": 5.610540056228638, "epoch": 0.19212295869356388, "grad_norm": 0.99609375, "learning_rate": 0.0004998957402637664, "loss": 5.6542, "mean_token_accuracy": 0.17157155871391297, "num_tokens": 4582248.0, "step": 2000 }, { "entropy": 5.801579093933105, "epoch": 0.1926032660902978, "grad_norm": 1.1484375, "learning_rate": 0.0004998946940926687, "loss": 5.6973, "mean_token_accuracy": 0.17121600955724717, "num_tokens": 4592604.0, "step": 2005 }, { "entropy": 5.661766576766968, "epoch": 0.1930835734870317, "grad_norm": 1.015625, "learning_rate": 0.000499893642700184, "loss": 5.7182, "mean_token_accuracy": 0.17020188719034196, "num_tokens": 4604398.0, "step": 2010 }, { "entropy": 5.790825366973877, "epoch": 0.1935638808837656, "grad_norm": 0.921875, "learning_rate": 0.0004998925860863368, "loss": 5.7931, "mean_token_accuracy": 0.1685462474822998, "num_tokens": 4616434.0, "step": 2015 }, { "entropy": 5.820285224914551, "epoch": 0.19404418828049952, "grad_norm": 0.9296875, "learning_rate": 0.0004998915242511516, "loss": 5.7541, "mean_token_accuracy": 0.17625110745429992, "num_tokens": 4627577.0, "step": 2020 }, { "entropy": 5.7781401634216305, "epoch": 0.19452449567723343, "grad_norm": 1.0390625, "learning_rate": 0.0004998904571946528, "loss": 5.817, "mean_token_accuracy": 0.16743545606732368, "num_tokens": 4639698.0, "step": 2025 }, { "entropy": 5.838766145706177, "epoch": 0.19500480307396734, "grad_norm": 0.99609375, "learning_rate": 0.0004998893849168655, "loss": 5.8269, "mean_token_accuracy": 0.16433341503143312, "num_tokens": 4650643.0, "step": 2030 }, { "entropy": 5.762656116485596, "epoch": 0.19548511047070125, "grad_norm": 0.93359375, "learning_rate": 0.0004998883074178144, "loss": 5.7427, "mean_token_accuracy": 0.16878412663936615, "num_tokens": 4662897.0, "step": 2035 }, { "entropy": 5.818380117416382, "epoch": 0.19596541786743515, "grad_norm": 0.98828125, "learning_rate": 0.0004998872246975247, "loss": 5.8217, "mean_token_accuracy": 0.1706990644335747, "num_tokens": 4673701.0, "step": 2040 }, { "entropy": 5.910197305679321, "epoch": 0.19644572526416906, "grad_norm": 0.97265625, "learning_rate": 0.0004998861367560213, "loss": 5.7826, "mean_token_accuracy": 0.16689348816871644, "num_tokens": 4685873.0, "step": 2045 }, { "entropy": 5.714930677413941, "epoch": 0.19692603266090297, "grad_norm": 0.97265625, "learning_rate": 0.0004998850435933296, "loss": 5.6724, "mean_token_accuracy": 0.17364383190870286, "num_tokens": 4697179.0, "step": 2050 }, { "entropy": 5.752671766281128, "epoch": 0.19740634005763688, "grad_norm": 1.0390625, "learning_rate": 0.0004998839452094749, "loss": 5.7084, "mean_token_accuracy": 0.17288116365671158, "num_tokens": 4707752.0, "step": 2055 }, { "entropy": 5.625265073776245, "epoch": 0.1978866474543708, "grad_norm": 1.03125, "learning_rate": 0.0004998828416044829, "loss": 5.58, "mean_token_accuracy": 0.17766032367944717, "num_tokens": 4718413.0, "step": 2060 }, { "entropy": 5.750666522979737, "epoch": 0.1983669548511047, "grad_norm": 1.03125, "learning_rate": 0.000499881732778379, "loss": 5.7696, "mean_token_accuracy": 0.16185117661952972, "num_tokens": 4730033.0, "step": 2065 }, { "entropy": 5.668474435806274, "epoch": 0.1988472622478386, "grad_norm": 0.91015625, "learning_rate": 0.000499880618731189, "loss": 5.6346, "mean_token_accuracy": 0.17201206237077712, "num_tokens": 4742084.0, "step": 2070 }, { "entropy": 5.801948118209839, "epoch": 0.19932756964457252, "grad_norm": 0.98046875, "learning_rate": 0.0004998794994629388, "loss": 5.8485, "mean_token_accuracy": 0.16415513008832933, "num_tokens": 4753885.0, "step": 2075 }, { "entropy": 5.755141353607177, "epoch": 0.19980787704130643, "grad_norm": 1.0, "learning_rate": 0.0004998783749736545, "loss": 5.6852, "mean_token_accuracy": 0.17273288518190383, "num_tokens": 4765686.0, "step": 2080 }, { "entropy": 5.7318039894104, "epoch": 0.20028818443804033, "grad_norm": 0.96875, "learning_rate": 0.0004998772452633619, "loss": 5.7343, "mean_token_accuracy": 0.1667577311396599, "num_tokens": 4777157.0, "step": 2085 }, { "entropy": 5.734004545211792, "epoch": 0.20076849183477424, "grad_norm": 1.0078125, "learning_rate": 0.0004998761103320876, "loss": 5.6803, "mean_token_accuracy": 0.17569620162248611, "num_tokens": 4788583.0, "step": 2090 }, { "entropy": 5.81385350227356, "epoch": 0.20124879923150815, "grad_norm": 0.94140625, "learning_rate": 0.0004998749701798577, "loss": 5.795, "mean_token_accuracy": 0.164644692838192, "num_tokens": 4800749.0, "step": 2095 }, { "entropy": 5.652225208282471, "epoch": 0.2017291066282421, "grad_norm": 0.96875, "learning_rate": 0.0004998738248066986, "loss": 5.7001, "mean_token_accuracy": 0.17118856757879258, "num_tokens": 4812488.0, "step": 2100 }, { "entropy": 5.816308832168579, "epoch": 0.202209414024976, "grad_norm": 1.0859375, "learning_rate": 0.0004998726742126372, "loss": 5.6902, "mean_token_accuracy": 0.17228334546089172, "num_tokens": 4823495.0, "step": 2105 }, { "entropy": 5.622010517120361, "epoch": 0.2026897214217099, "grad_norm": 1.046875, "learning_rate": 0.0004998715183976999, "loss": 5.726, "mean_token_accuracy": 0.16997579634189605, "num_tokens": 4834450.0, "step": 2110 }, { "entropy": 5.763468551635742, "epoch": 0.20317002881844382, "grad_norm": 0.91796875, "learning_rate": 0.0004998703573619137, "loss": 5.6443, "mean_token_accuracy": 0.18120874017477034, "num_tokens": 4846826.0, "step": 2115 }, { "entropy": 5.804740762710571, "epoch": 0.20365033621517772, "grad_norm": 0.9296875, "learning_rate": 0.0004998691911053056, "loss": 5.8366, "mean_token_accuracy": 0.15913107842206956, "num_tokens": 4859668.0, "step": 2120 }, { "entropy": 5.727064418792724, "epoch": 0.20413064361191163, "grad_norm": 1.0546875, "learning_rate": 0.0004998680196279026, "loss": 5.7049, "mean_token_accuracy": 0.17213667631149293, "num_tokens": 4871727.0, "step": 2125 }, { "entropy": 5.794467830657959, "epoch": 0.20461095100864554, "grad_norm": 1.015625, "learning_rate": 0.0004998668429297319, "loss": 5.7674, "mean_token_accuracy": 0.17240212336182595, "num_tokens": 4882191.0, "step": 2130 }, { "entropy": 5.760322952270508, "epoch": 0.20509125840537945, "grad_norm": 1.078125, "learning_rate": 0.0004998656610108208, "loss": 5.6971, "mean_token_accuracy": 0.1685373991727829, "num_tokens": 4892416.0, "step": 2135 }, { "entropy": 5.694274854660034, "epoch": 0.20557156580211336, "grad_norm": 1.03125, "learning_rate": 0.0004998644738711969, "loss": 5.6674, "mean_token_accuracy": 0.1685459852218628, "num_tokens": 4903572.0, "step": 2140 }, { "entropy": 5.810105037689209, "epoch": 0.20605187319884727, "grad_norm": 0.875, "learning_rate": 0.0004998632815108874, "loss": 5.763, "mean_token_accuracy": 0.16395961344242097, "num_tokens": 4915417.0, "step": 2145 }, { "entropy": 5.73304591178894, "epoch": 0.20653218059558118, "grad_norm": 1.0234375, "learning_rate": 0.0004998620839299203, "loss": 5.6495, "mean_token_accuracy": 0.17259960770606994, "num_tokens": 4926943.0, "step": 2150 }, { "entropy": 5.6710865020751955, "epoch": 0.2070124879923151, "grad_norm": 1.0234375, "learning_rate": 0.0004998608811283233, "loss": 5.6095, "mean_token_accuracy": 0.17803010493516921, "num_tokens": 4937724.0, "step": 2155 }, { "entropy": 5.7808784484863285, "epoch": 0.207492795389049, "grad_norm": 0.984375, "learning_rate": 0.0004998596731061244, "loss": 5.7756, "mean_token_accuracy": 0.16368448734283447, "num_tokens": 4949970.0, "step": 2160 }, { "entropy": 5.784394645690918, "epoch": 0.2079731027857829, "grad_norm": 0.9765625, "learning_rate": 0.0004998584598633516, "loss": 5.774, "mean_token_accuracy": 0.16977567672729493, "num_tokens": 4961389.0, "step": 2165 }, { "entropy": 5.7822630405426025, "epoch": 0.2084534101825168, "grad_norm": 1.015625, "learning_rate": 0.0004998572414000329, "loss": 5.82, "mean_token_accuracy": 0.16696709543466567, "num_tokens": 4973888.0, "step": 2170 }, { "entropy": 5.75656681060791, "epoch": 0.20893371757925072, "grad_norm": 1.03125, "learning_rate": 0.0004998560177161969, "loss": 5.7667, "mean_token_accuracy": 0.1604086473584175, "num_tokens": 4985423.0, "step": 2175 }, { "entropy": 5.70469822883606, "epoch": 0.20941402497598463, "grad_norm": 0.93359375, "learning_rate": 0.0004998547888118718, "loss": 5.726, "mean_token_accuracy": 0.16619897931814193, "num_tokens": 4997711.0, "step": 2180 }, { "entropy": 5.7725687503814695, "epoch": 0.20989433237271854, "grad_norm": 0.97265625, "learning_rate": 0.0004998535546870862, "loss": 5.7454, "mean_token_accuracy": 0.1679087519645691, "num_tokens": 5009633.0, "step": 2185 }, { "entropy": 5.739374876022339, "epoch": 0.21037463976945245, "grad_norm": 1.0234375, "learning_rate": 0.0004998523153418687, "loss": 5.6759, "mean_token_accuracy": 0.17375072985887527, "num_tokens": 5021523.0, "step": 2190 }, { "entropy": 5.785361337661743, "epoch": 0.21085494716618636, "grad_norm": 1.0078125, "learning_rate": 0.0004998510707762481, "loss": 5.7695, "mean_token_accuracy": 0.1699072614312172, "num_tokens": 5033513.0, "step": 2195 }, { "entropy": 5.7873194217681885, "epoch": 0.21133525456292027, "grad_norm": 0.90625, "learning_rate": 0.0004998498209902533, "loss": 5.7758, "mean_token_accuracy": 0.16922611892223358, "num_tokens": 5047055.0, "step": 2200 }, { "entropy": 5.707646226882934, "epoch": 0.21181556195965417, "grad_norm": 1.0703125, "learning_rate": 0.0004998485659839134, "loss": 5.6497, "mean_token_accuracy": 0.17682456970214844, "num_tokens": 5057613.0, "step": 2205 }, { "entropy": 5.753945970535279, "epoch": 0.21229586935638808, "grad_norm": 1.015625, "learning_rate": 0.0004998473057572575, "loss": 5.7615, "mean_token_accuracy": 0.16833806186914443, "num_tokens": 5068886.0, "step": 2210 }, { "entropy": 5.742906093597412, "epoch": 0.212776176753122, "grad_norm": 1.0546875, "learning_rate": 0.0004998460403103146, "loss": 5.7494, "mean_token_accuracy": 0.16465574279427528, "num_tokens": 5079978.0, "step": 2215 }, { "entropy": 5.736083173751831, "epoch": 0.2132564841498559, "grad_norm": 1.078125, "learning_rate": 0.0004998447696431146, "loss": 5.7159, "mean_token_accuracy": 0.17075446248054504, "num_tokens": 5091021.0, "step": 2220 }, { "entropy": 5.6740076541900635, "epoch": 0.2137367915465898, "grad_norm": 1.046875, "learning_rate": 0.0004998434937556865, "loss": 5.5988, "mean_token_accuracy": 0.181574647128582, "num_tokens": 5101483.0, "step": 2225 }, { "entropy": 5.708674907684326, "epoch": 0.21421709894332372, "grad_norm": 0.98828125, "learning_rate": 0.0004998422126480602, "loss": 5.7447, "mean_token_accuracy": 0.16306292563676833, "num_tokens": 5113116.0, "step": 2230 }, { "entropy": 5.82704176902771, "epoch": 0.21469740634005763, "grad_norm": 1.0703125, "learning_rate": 0.0004998409263202653, "loss": 5.6819, "mean_token_accuracy": 0.1686948984861374, "num_tokens": 5124824.0, "step": 2235 }, { "entropy": 5.589908075332642, "epoch": 0.21517771373679154, "grad_norm": 1.0, "learning_rate": 0.0004998396347723318, "loss": 5.6335, "mean_token_accuracy": 0.16587817817926406, "num_tokens": 5137567.0, "step": 2240 }, { "entropy": 5.72907018661499, "epoch": 0.21565802113352545, "grad_norm": 0.94921875, "learning_rate": 0.0004998383380042895, "loss": 5.6846, "mean_token_accuracy": 0.16729460805654525, "num_tokens": 5149016.0, "step": 2245 }, { "entropy": 5.6214783668518065, "epoch": 0.21613832853025935, "grad_norm": 1.0390625, "learning_rate": 0.0004998370360161688, "loss": 5.5788, "mean_token_accuracy": 0.17212725132703782, "num_tokens": 5160356.0, "step": 2250 }, { "entropy": 5.79612250328064, "epoch": 0.21661863592699326, "grad_norm": 0.97265625, "learning_rate": 0.0004998357288079996, "loss": 5.7818, "mean_token_accuracy": 0.16184753328561782, "num_tokens": 5172100.0, "step": 2255 }, { "entropy": 5.740008592605591, "epoch": 0.21709894332372717, "grad_norm": 1.046875, "learning_rate": 0.0004998344163798125, "loss": 5.7405, "mean_token_accuracy": 0.16320510655641557, "num_tokens": 5183984.0, "step": 2260 }, { "entropy": 5.707123565673828, "epoch": 0.21757925072046108, "grad_norm": 0.93359375, "learning_rate": 0.0004998330987316379, "loss": 5.7153, "mean_token_accuracy": 0.167342671751976, "num_tokens": 5195853.0, "step": 2265 }, { "entropy": 5.6320737361907955, "epoch": 0.21805955811719502, "grad_norm": 0.99609375, "learning_rate": 0.0004998317758635062, "loss": 5.5593, "mean_token_accuracy": 0.17451774328947067, "num_tokens": 5206995.0, "step": 2270 }, { "entropy": 5.515458297729492, "epoch": 0.21853986551392893, "grad_norm": 0.99609375, "learning_rate": 0.0004998304477754484, "loss": 5.5989, "mean_token_accuracy": 0.17679600268602372, "num_tokens": 5219291.0, "step": 2275 }, { "entropy": 5.740645408630371, "epoch": 0.21902017291066284, "grad_norm": 1.046875, "learning_rate": 0.0004998291144674952, "loss": 5.6885, "mean_token_accuracy": 0.17223394364118577, "num_tokens": 5230856.0, "step": 2280 }, { "entropy": 5.601490020751953, "epoch": 0.21950048030739674, "grad_norm": 0.98046875, "learning_rate": 0.0004998277759396776, "loss": 5.5333, "mean_token_accuracy": 0.1814967930316925, "num_tokens": 5242871.0, "step": 2285 }, { "entropy": 5.656805944442749, "epoch": 0.21998078770413065, "grad_norm": 1.0078125, "learning_rate": 0.0004998264321920265, "loss": 5.64, "mean_token_accuracy": 0.17801354676485062, "num_tokens": 5253835.0, "step": 2290 }, { "entropy": 5.676252794265747, "epoch": 0.22046109510086456, "grad_norm": 0.890625, "learning_rate": 0.0004998250832245734, "loss": 5.6181, "mean_token_accuracy": 0.17702293545007705, "num_tokens": 5266195.0, "step": 2295 }, { "entropy": 5.641697740554809, "epoch": 0.22094140249759847, "grad_norm": 1.0390625, "learning_rate": 0.0004998237290373494, "loss": 5.6002, "mean_token_accuracy": 0.1801271617412567, "num_tokens": 5277499.0, "step": 2300 }, { "entropy": 5.739913368225098, "epoch": 0.22142170989433238, "grad_norm": 0.96875, "learning_rate": 0.000499822369630386, "loss": 5.7231, "mean_token_accuracy": 0.1597047820687294, "num_tokens": 5288622.0, "step": 2305 }, { "entropy": 5.738846015930176, "epoch": 0.2219020172910663, "grad_norm": 1.1484375, "learning_rate": 0.0004998210050037148, "loss": 5.7816, "mean_token_accuracy": 0.16195343434810638, "num_tokens": 5299664.0, "step": 2310 }, { "entropy": 5.717037725448608, "epoch": 0.2223823246878002, "grad_norm": 0.99609375, "learning_rate": 0.0004998196351573674, "loss": 5.6552, "mean_token_accuracy": 0.17402878403663635, "num_tokens": 5311627.0, "step": 2315 }, { "entropy": 5.5637411117553714, "epoch": 0.2228626320845341, "grad_norm": 0.98046875, "learning_rate": 0.0004998182600913757, "loss": 5.5627, "mean_token_accuracy": 0.17947529554367064, "num_tokens": 5323000.0, "step": 2320 }, { "entropy": 5.704880237579346, "epoch": 0.22334293948126802, "grad_norm": 0.98828125, "learning_rate": 0.0004998168798057715, "loss": 5.5992, "mean_token_accuracy": 0.18110302537679673, "num_tokens": 5333811.0, "step": 2325 }, { "entropy": 5.615099573135376, "epoch": 0.22382324687800192, "grad_norm": 1.015625, "learning_rate": 0.000499815494300587, "loss": 5.5991, "mean_token_accuracy": 0.17574110478162766, "num_tokens": 5344762.0, "step": 2330 }, { "entropy": 5.721481513977051, "epoch": 0.22430355427473583, "grad_norm": 1.0625, "learning_rate": 0.0004998141035758542, "loss": 5.6195, "mean_token_accuracy": 0.17343118488788606, "num_tokens": 5356112.0, "step": 2335 }, { "entropy": 5.655849504470825, "epoch": 0.22478386167146974, "grad_norm": 1.171875, "learning_rate": 0.0004998127076316054, "loss": 5.7311, "mean_token_accuracy": 0.17190437763929367, "num_tokens": 5367339.0, "step": 2340 }, { "entropy": 5.674526071548462, "epoch": 0.22526416906820365, "grad_norm": 0.99609375, "learning_rate": 0.0004998113064678734, "loss": 5.6665, "mean_token_accuracy": 0.17564141601324082, "num_tokens": 5378627.0, "step": 2345 }, { "entropy": 5.726110649108887, "epoch": 0.22574447646493756, "grad_norm": 1.0234375, "learning_rate": 0.0004998099000846901, "loss": 5.7012, "mean_token_accuracy": 0.1681268870830536, "num_tokens": 5390209.0, "step": 2350 }, { "entropy": 5.734390020370483, "epoch": 0.22622478386167147, "grad_norm": 1.1640625, "learning_rate": 0.0004998084884820887, "loss": 5.6833, "mean_token_accuracy": 0.17136491537094117, "num_tokens": 5401578.0, "step": 2355 }, { "entropy": 5.615032052993774, "epoch": 0.22670509125840538, "grad_norm": 1.0, "learning_rate": 0.0004998070716601016, "loss": 5.5881, "mean_token_accuracy": 0.17977205514907837, "num_tokens": 5413831.0, "step": 2360 }, { "entropy": 5.722073316574097, "epoch": 0.2271853986551393, "grad_norm": 1.0390625, "learning_rate": 0.0004998056496187618, "loss": 5.6496, "mean_token_accuracy": 0.1711253985762596, "num_tokens": 5425430.0, "step": 2365 }, { "entropy": 5.49839334487915, "epoch": 0.2276657060518732, "grad_norm": 1.0, "learning_rate": 0.0004998042223581025, "loss": 5.4985, "mean_token_accuracy": 0.1870403528213501, "num_tokens": 5435353.0, "step": 2370 }, { "entropy": 5.7514622688293455, "epoch": 0.2281460134486071, "grad_norm": 0.97265625, "learning_rate": 0.0004998027898781565, "loss": 5.6991, "mean_token_accuracy": 0.17083023190498353, "num_tokens": 5446925.0, "step": 2375 }, { "entropy": 5.589994049072265, "epoch": 0.228626320845341, "grad_norm": 1.046875, "learning_rate": 0.0004998013521789574, "loss": 5.5899, "mean_token_accuracy": 0.1772562175989151, "num_tokens": 5456613.0, "step": 2380 }, { "entropy": 5.697564649581909, "epoch": 0.22910662824207492, "grad_norm": 1.1015625, "learning_rate": 0.0004997999092605384, "loss": 5.6209, "mean_token_accuracy": 0.17314212173223495, "num_tokens": 5467790.0, "step": 2385 }, { "entropy": 5.672542333602905, "epoch": 0.22958693563880883, "grad_norm": 1.0, "learning_rate": 0.000499798461122933, "loss": 5.6065, "mean_token_accuracy": 0.17598363608121873, "num_tokens": 5479166.0, "step": 2390 }, { "entropy": 5.594286203384399, "epoch": 0.23006724303554274, "grad_norm": 0.99609375, "learning_rate": 0.0004997970077661748, "loss": 5.5932, "mean_token_accuracy": 0.18340873271226882, "num_tokens": 5490186.0, "step": 2395 }, { "entropy": 5.690382814407348, "epoch": 0.23054755043227665, "grad_norm": 1.03125, "learning_rate": 0.0004997955491902977, "loss": 5.5575, "mean_token_accuracy": 0.1718940794467926, "num_tokens": 5500416.0, "step": 2400 }, { "entropy": 5.582558584213257, "epoch": 0.23102785782901056, "grad_norm": 1.0390625, "learning_rate": 0.0004997940853953354, "loss": 5.6489, "mean_token_accuracy": 0.17370383739471434, "num_tokens": 5512189.0, "step": 2405 }, { "entropy": 5.628128719329834, "epoch": 0.23150816522574447, "grad_norm": 0.96484375, "learning_rate": 0.000499792616381322, "loss": 5.5142, "mean_token_accuracy": 0.1828036591410637, "num_tokens": 5523631.0, "step": 2410 }, { "entropy": 5.609222555160523, "epoch": 0.23198847262247838, "grad_norm": 0.96875, "learning_rate": 0.0004997911421482914, "loss": 5.5763, "mean_token_accuracy": 0.1823565348982811, "num_tokens": 5535637.0, "step": 2415 }, { "entropy": 5.639013814926147, "epoch": 0.23246878001921228, "grad_norm": 1.0078125, "learning_rate": 0.000499789662696278, "loss": 5.5869, "mean_token_accuracy": 0.18035637438297272, "num_tokens": 5546470.0, "step": 2420 }, { "entropy": 5.694498586654663, "epoch": 0.2329490874159462, "grad_norm": 0.95703125, "learning_rate": 0.0004997881780253162, "loss": 5.7456, "mean_token_accuracy": 0.1703657627105713, "num_tokens": 5558633.0, "step": 2425 }, { "entropy": 5.6558629989624025, "epoch": 0.2334293948126801, "grad_norm": 0.875, "learning_rate": 0.0004997866881354403, "loss": 5.6547, "mean_token_accuracy": 0.17033104449510575, "num_tokens": 5570427.0, "step": 2430 }, { "entropy": 5.6951744556427, "epoch": 0.23390970220941404, "grad_norm": 0.9765625, "learning_rate": 0.000499785193026685, "loss": 5.6383, "mean_token_accuracy": 0.17484120875597, "num_tokens": 5580991.0, "step": 2435 }, { "entropy": 5.701549911499024, "epoch": 0.23439000960614795, "grad_norm": 1.03125, "learning_rate": 0.0004997836926990851, "loss": 5.6816, "mean_token_accuracy": 0.17114701271057128, "num_tokens": 5592777.0, "step": 2440 }, { "entropy": 5.602617788314819, "epoch": 0.23487031700288186, "grad_norm": 1.015625, "learning_rate": 0.0004997821871526752, "loss": 5.5874, "mean_token_accuracy": 0.17974285781383514, "num_tokens": 5603326.0, "step": 2445 }, { "entropy": 5.631419324874878, "epoch": 0.23535062439961577, "grad_norm": 1.1171875, "learning_rate": 0.0004997806763874905, "loss": 5.5697, "mean_token_accuracy": 0.1791187435388565, "num_tokens": 5614504.0, "step": 2450 }, { "entropy": 5.617094326019287, "epoch": 0.23583093179634967, "grad_norm": 0.98046875, "learning_rate": 0.0004997791604035659, "loss": 5.6264, "mean_token_accuracy": 0.17776354700326918, "num_tokens": 5625150.0, "step": 2455 }, { "entropy": 5.6507199764251705, "epoch": 0.23631123919308358, "grad_norm": 0.9921875, "learning_rate": 0.0004997776392009366, "loss": 5.6458, "mean_token_accuracy": 0.169050732254982, "num_tokens": 5636815.0, "step": 2460 }, { "entropy": 5.706958866119384, "epoch": 0.2367915465898175, "grad_norm": 0.9453125, "learning_rate": 0.0004997761127796381, "loss": 5.6366, "mean_token_accuracy": 0.17092559188604356, "num_tokens": 5648272.0, "step": 2465 }, { "entropy": 5.628375577926636, "epoch": 0.2372718539865514, "grad_norm": 1.0078125, "learning_rate": 0.0004997745811397056, "loss": 5.5463, "mean_token_accuracy": 0.17801680713891982, "num_tokens": 5659227.0, "step": 2470 }, { "entropy": 5.6414820671081545, "epoch": 0.2377521613832853, "grad_norm": 1.046875, "learning_rate": 0.0004997730442811748, "loss": 5.6796, "mean_token_accuracy": 0.17399391829967498, "num_tokens": 5670411.0, "step": 2475 }, { "entropy": 5.5770539283752445, "epoch": 0.23823246878001922, "grad_norm": 1.0859375, "learning_rate": 0.0004997715022040814, "loss": 5.5182, "mean_token_accuracy": 0.1782184734940529, "num_tokens": 5681570.0, "step": 2480 }, { "entropy": 5.523485231399536, "epoch": 0.23871277617675313, "grad_norm": 0.984375, "learning_rate": 0.000499769954908461, "loss": 5.5022, "mean_token_accuracy": 0.1887900114059448, "num_tokens": 5693021.0, "step": 2485 }, { "entropy": 5.659896421432495, "epoch": 0.23919308357348704, "grad_norm": 0.9609375, "learning_rate": 0.0004997684023943498, "loss": 5.5883, "mean_token_accuracy": 0.17428779155015944, "num_tokens": 5704043.0, "step": 2490 }, { "entropy": 5.5805792808532715, "epoch": 0.23967339097022095, "grad_norm": 0.99609375, "learning_rate": 0.0004997668446617837, "loss": 5.6675, "mean_token_accuracy": 0.16685750484466552, "num_tokens": 5715735.0, "step": 2495 }, { "entropy": 5.760880804061889, "epoch": 0.24015369836695485, "grad_norm": 1.0625, "learning_rate": 0.0004997652817107989, "loss": 5.6294, "mean_token_accuracy": 0.17232899218797684, "num_tokens": 5725778.0, "step": 2500 }, { "entropy": 5.601306343078614, "epoch": 0.24063400576368876, "grad_norm": 1.0390625, "learning_rate": 0.0004997637135414315, "loss": 5.6628, "mean_token_accuracy": 0.17220552116632462, "num_tokens": 5737224.0, "step": 2505 }, { "entropy": 5.779234981536865, "epoch": 0.24111431316042267, "grad_norm": 0.9609375, "learning_rate": 0.0004997621401537183, "loss": 5.6855, "mean_token_accuracy": 0.17120948135852815, "num_tokens": 5749226.0, "step": 2510 }, { "entropy": 5.6741156578063965, "epoch": 0.24159462055715658, "grad_norm": 1.1015625, "learning_rate": 0.0004997605615476955, "loss": 5.6578, "mean_token_accuracy": 0.17114464193582535, "num_tokens": 5760282.0, "step": 2515 }, { "entropy": 5.539696168899536, "epoch": 0.2420749279538905, "grad_norm": 0.94921875, "learning_rate": 0.0004997589777234, "loss": 5.5633, "mean_token_accuracy": 0.181555312871933, "num_tokens": 5771756.0, "step": 2520 }, { "entropy": 5.650804233551026, "epoch": 0.2425552353506244, "grad_norm": 1.078125, "learning_rate": 0.0004997573886808684, "loss": 5.5835, "mean_token_accuracy": 0.16679947078227997, "num_tokens": 5783237.0, "step": 2525 }, { "entropy": 5.646309852600098, "epoch": 0.2430355427473583, "grad_norm": 1.0078125, "learning_rate": 0.0004997557944201375, "loss": 5.6814, "mean_token_accuracy": 0.17147036045789718, "num_tokens": 5794825.0, "step": 2530 }, { "entropy": 5.675209999084473, "epoch": 0.24351585014409222, "grad_norm": 1.0390625, "learning_rate": 0.0004997541949412445, "loss": 5.5712, "mean_token_accuracy": 0.18625136017799376, "num_tokens": 5805578.0, "step": 2535 }, { "entropy": 5.649836206436158, "epoch": 0.24399615754082613, "grad_norm": 0.984375, "learning_rate": 0.0004997525902442266, "loss": 5.6738, "mean_token_accuracy": 0.16476511359214782, "num_tokens": 5818201.0, "step": 2540 }, { "entropy": 5.602812147140503, "epoch": 0.24447646493756003, "grad_norm": 0.9296875, "learning_rate": 0.0004997509803291207, "loss": 5.5959, "mean_token_accuracy": 0.17587143927812576, "num_tokens": 5830319.0, "step": 2545 }, { "entropy": 5.5824614524841305, "epoch": 0.24495677233429394, "grad_norm": 1.0234375, "learning_rate": 0.0004997493651959647, "loss": 5.5428, "mean_token_accuracy": 0.17996817231178283, "num_tokens": 5840638.0, "step": 2550 }, { "entropy": 5.66239709854126, "epoch": 0.24543707973102785, "grad_norm": 0.90625, "learning_rate": 0.0004997477448447955, "loss": 5.5773, "mean_token_accuracy": 0.17367178648710252, "num_tokens": 5852472.0, "step": 2555 }, { "entropy": 5.678495073318482, "epoch": 0.24591738712776176, "grad_norm": 0.9921875, "learning_rate": 0.0004997461192756512, "loss": 5.6133, "mean_token_accuracy": 0.170744089782238, "num_tokens": 5863455.0, "step": 2560 }, { "entropy": 5.512450170516968, "epoch": 0.24639769452449567, "grad_norm": 1.1171875, "learning_rate": 0.0004997444884885694, "loss": 5.5251, "mean_token_accuracy": 0.17817995101213455, "num_tokens": 5873141.0, "step": 2565 }, { "entropy": 5.603986024856567, "epoch": 0.24687800192122958, "grad_norm": 1.0859375, "learning_rate": 0.0004997428524835879, "loss": 5.6316, "mean_token_accuracy": 0.17475323528051376, "num_tokens": 5884363.0, "step": 2570 }, { "entropy": 5.740997219085694, "epoch": 0.2473583093179635, "grad_norm": 0.96484375, "learning_rate": 0.0004997412112607446, "loss": 5.6721, "mean_token_accuracy": 0.17148932665586472, "num_tokens": 5895856.0, "step": 2575 }, { "entropy": 5.542859792709351, "epoch": 0.2478386167146974, "grad_norm": 1.046875, "learning_rate": 0.0004997395648200778, "loss": 5.4922, "mean_token_accuracy": 0.17950474172830583, "num_tokens": 5906657.0, "step": 2580 }, { "entropy": 5.600370979309082, "epoch": 0.2483189241114313, "grad_norm": 0.8984375, "learning_rate": 0.0004997379131616257, "loss": 5.6226, "mean_token_accuracy": 0.1700095072388649, "num_tokens": 5919496.0, "step": 2585 }, { "entropy": 5.690901279449463, "epoch": 0.24879923150816521, "grad_norm": 0.9375, "learning_rate": 0.0004997362562854266, "loss": 5.6843, "mean_token_accuracy": 0.16776154488325118, "num_tokens": 5932593.0, "step": 2590 }, { "entropy": 5.619813919067383, "epoch": 0.24927953890489912, "grad_norm": 1.015625, "learning_rate": 0.0004997345941915187, "loss": 5.6128, "mean_token_accuracy": 0.17226099967956543, "num_tokens": 5944080.0, "step": 2595 }, { "entropy": 5.602241802215576, "epoch": 0.24975984630163303, "grad_norm": 1.0078125, "learning_rate": 0.0004997329268799412, "loss": 5.5752, "mean_token_accuracy": 0.18460023701190947, "num_tokens": 5955703.0, "step": 2600 }, { "entropy": 5.62792739868164, "epoch": 0.25024015369836694, "grad_norm": 0.984375, "learning_rate": 0.0004997312543507322, "loss": 5.6565, "mean_token_accuracy": 0.1714890867471695, "num_tokens": 5966979.0, "step": 2605 }, { "entropy": 5.672908306121826, "epoch": 0.2507204610951009, "grad_norm": 1.03125, "learning_rate": 0.0004997295766039309, "loss": 5.545, "mean_token_accuracy": 0.17637500017881394, "num_tokens": 5978808.0, "step": 2610 }, { "entropy": 5.6401097774505615, "epoch": 0.25120076849183476, "grad_norm": 0.953125, "learning_rate": 0.0004997278936395761, "loss": 5.7288, "mean_token_accuracy": 0.16584430038928985, "num_tokens": 5992145.0, "step": 2615 }, { "entropy": 5.665263652801514, "epoch": 0.2516810758885687, "grad_norm": 0.96875, "learning_rate": 0.0004997262054577071, "loss": 5.5694, "mean_token_accuracy": 0.17564088106155396, "num_tokens": 6003723.0, "step": 2620 }, { "entropy": 5.6567973613739015, "epoch": 0.2521613832853026, "grad_norm": 1.0703125, "learning_rate": 0.0004997245120583627, "loss": 5.6351, "mean_token_accuracy": 0.1769047811627388, "num_tokens": 6014064.0, "step": 2625 }, { "entropy": 5.53907151222229, "epoch": 0.2526416906820365, "grad_norm": 0.93359375, "learning_rate": 0.0004997228134415825, "loss": 5.5168, "mean_token_accuracy": 0.1834915667772293, "num_tokens": 6025455.0, "step": 2630 }, { "entropy": 5.6452476501464846, "epoch": 0.2531219980787704, "grad_norm": 1.0390625, "learning_rate": 0.0004997211096074059, "loss": 5.6231, "mean_token_accuracy": 0.16973316073417663, "num_tokens": 6037347.0, "step": 2635 }, { "entropy": 5.600665187835693, "epoch": 0.25360230547550433, "grad_norm": 0.9921875, "learning_rate": 0.0004997194005558722, "loss": 5.5304, "mean_token_accuracy": 0.18019532412290573, "num_tokens": 6049236.0, "step": 2640 }, { "entropy": 5.534391641616821, "epoch": 0.2540826128722382, "grad_norm": 0.96484375, "learning_rate": 0.0004997176862870216, "loss": 5.5339, "mean_token_accuracy": 0.1798613414168358, "num_tokens": 6060982.0, "step": 2645 }, { "entropy": 5.637931680679321, "epoch": 0.25456292026897215, "grad_norm": 1.0546875, "learning_rate": 0.0004997159668008933, "loss": 5.5514, "mean_token_accuracy": 0.17985030263662338, "num_tokens": 6070925.0, "step": 2650 }, { "entropy": 5.526381587982177, "epoch": 0.25504322766570603, "grad_norm": 1.0078125, "learning_rate": 0.0004997142420975277, "loss": 5.514, "mean_token_accuracy": 0.18175738006830217, "num_tokens": 6081279.0, "step": 2655 }, { "entropy": 5.5633796691894535, "epoch": 0.25552353506243997, "grad_norm": 0.91796875, "learning_rate": 0.0004997125121769647, "loss": 5.6108, "mean_token_accuracy": 0.17793446481227876, "num_tokens": 6091797.0, "step": 2660 }, { "entropy": 5.687921333312988, "epoch": 0.25600384245917385, "grad_norm": 0.9296875, "learning_rate": 0.0004997107770392444, "loss": 5.6134, "mean_token_accuracy": 0.1804993599653244, "num_tokens": 6103435.0, "step": 2665 }, { "entropy": 5.648722791671753, "epoch": 0.2564841498559078, "grad_norm": 0.9375, "learning_rate": 0.000499709036684407, "loss": 5.6751, "mean_token_accuracy": 0.17587384432554246, "num_tokens": 6114531.0, "step": 2670 }, { "entropy": 5.569314622879029, "epoch": 0.25696445725264166, "grad_norm": 1.0078125, "learning_rate": 0.0004997072911124932, "loss": 5.5173, "mean_token_accuracy": 0.17945850938558577, "num_tokens": 6126110.0, "step": 2675 }, { "entropy": 5.670061159133911, "epoch": 0.2574447646493756, "grad_norm": 1.015625, "learning_rate": 0.0004997055403235432, "loss": 5.6187, "mean_token_accuracy": 0.1766670301556587, "num_tokens": 6137114.0, "step": 2680 }, { "entropy": 5.62683253288269, "epoch": 0.2579250720461095, "grad_norm": 0.984375, "learning_rate": 0.0004997037843175978, "loss": 5.5718, "mean_token_accuracy": 0.17658228576183319, "num_tokens": 6148696.0, "step": 2685 }, { "entropy": 5.59165620803833, "epoch": 0.2584053794428434, "grad_norm": 0.9609375, "learning_rate": 0.0004997020230946978, "loss": 5.568, "mean_token_accuracy": 0.1790614068508148, "num_tokens": 6160235.0, "step": 2690 }, { "entropy": 5.629477691650391, "epoch": 0.25888568683957736, "grad_norm": 0.96875, "learning_rate": 0.0004997002566548841, "loss": 5.5586, "mean_token_accuracy": 0.17292713820934297, "num_tokens": 6172031.0, "step": 2695 }, { "entropy": 5.48054838180542, "epoch": 0.25936599423631124, "grad_norm": 0.96875, "learning_rate": 0.0004996984849981976, "loss": 5.4233, "mean_token_accuracy": 0.1893267199397087, "num_tokens": 6183547.0, "step": 2700 }, { "entropy": 5.619540548324585, "epoch": 0.2598463016330452, "grad_norm": 0.97265625, "learning_rate": 0.0004996967081246794, "loss": 5.632, "mean_token_accuracy": 0.1678134724497795, "num_tokens": 6194768.0, "step": 2705 }, { "entropy": 5.6499683380126955, "epoch": 0.26032660902977905, "grad_norm": 0.984375, "learning_rate": 0.0004996949260343711, "loss": 5.6314, "mean_token_accuracy": 0.1706198126077652, "num_tokens": 6206099.0, "step": 2710 }, { "entropy": 5.624089670181275, "epoch": 0.260806916426513, "grad_norm": 1.046875, "learning_rate": 0.0004996931387273137, "loss": 5.6262, "mean_token_accuracy": 0.17660144418478013, "num_tokens": 6217530.0, "step": 2715 }, { "entropy": 5.713815212249756, "epoch": 0.2612872238232469, "grad_norm": 0.94140625, "learning_rate": 0.0004996913462035487, "loss": 5.6448, "mean_token_accuracy": 0.1767139658331871, "num_tokens": 6228564.0, "step": 2720 }, { "entropy": 5.539792156219482, "epoch": 0.2617675312199808, "grad_norm": 0.97265625, "learning_rate": 0.000499689548463118, "loss": 5.5174, "mean_token_accuracy": 0.17854675203561782, "num_tokens": 6239945.0, "step": 2725 }, { "entropy": 5.59919810295105, "epoch": 0.2622478386167147, "grad_norm": 0.9921875, "learning_rate": 0.0004996877455060631, "loss": 5.6312, "mean_token_accuracy": 0.17017472237348558, "num_tokens": 6251829.0, "step": 2730 }, { "entropy": 5.7330786228179935, "epoch": 0.2627281460134486, "grad_norm": 0.9765625, "learning_rate": 0.0004996859373324259, "loss": 5.7264, "mean_token_accuracy": 0.16224824339151384, "num_tokens": 6264823.0, "step": 2735 }, { "entropy": 5.5701476573944095, "epoch": 0.2632084534101825, "grad_norm": 1.015625, "learning_rate": 0.0004996841239422485, "loss": 5.4065, "mean_token_accuracy": 0.18482713848352433, "num_tokens": 6276247.0, "step": 2740 }, { "entropy": 5.470470857620239, "epoch": 0.26368876080691644, "grad_norm": 0.98828125, "learning_rate": 0.0004996823053355729, "loss": 5.5321, "mean_token_accuracy": 0.18076382875442504, "num_tokens": 6287593.0, "step": 2745 }, { "entropy": 5.685536909103393, "epoch": 0.2641690682036503, "grad_norm": 0.9921875, "learning_rate": 0.0004996804815124413, "loss": 5.6897, "mean_token_accuracy": 0.16898608654737474, "num_tokens": 6299918.0, "step": 2750 }, { "entropy": 5.568260049819946, "epoch": 0.26464937560038426, "grad_norm": 1.078125, "learning_rate": 0.0004996786524728962, "loss": 5.5287, "mean_token_accuracy": 0.18196363002061844, "num_tokens": 6311147.0, "step": 2755 }, { "entropy": 5.45229320526123, "epoch": 0.26512968299711814, "grad_norm": 0.96875, "learning_rate": 0.0004996768182169797, "loss": 5.4564, "mean_token_accuracy": 0.18652137070894242, "num_tokens": 6323239.0, "step": 2760 }, { "entropy": 5.692247343063355, "epoch": 0.2656099903938521, "grad_norm": 1.0234375, "learning_rate": 0.0004996749787447349, "loss": 5.5567, "mean_token_accuracy": 0.17187336832284927, "num_tokens": 6334625.0, "step": 2765 }, { "entropy": 5.545494651794433, "epoch": 0.26609029779058596, "grad_norm": 1.046875, "learning_rate": 0.000499673134056204, "loss": 5.5938, "mean_token_accuracy": 0.17517421692609786, "num_tokens": 6346068.0, "step": 2770 }, { "entropy": 5.584152412414551, "epoch": 0.2665706051873199, "grad_norm": 1.125, "learning_rate": 0.0004996712841514303, "loss": 5.5716, "mean_token_accuracy": 0.17334717959165574, "num_tokens": 6357097.0, "step": 2775 }, { "entropy": 5.656313180923462, "epoch": 0.2670509125840538, "grad_norm": 1.0859375, "learning_rate": 0.0004996694290304563, "loss": 5.6313, "mean_token_accuracy": 0.16709280461072923, "num_tokens": 6367481.0, "step": 2780 }, { "entropy": 5.52793607711792, "epoch": 0.2675312199807877, "grad_norm": 1.0, "learning_rate": 0.0004996675686933255, "loss": 5.5381, "mean_token_accuracy": 0.18144787847995758, "num_tokens": 6378873.0, "step": 2785 }, { "entropy": 5.664049291610718, "epoch": 0.2680115273775216, "grad_norm": 0.953125, "learning_rate": 0.0004996657031400807, "loss": 5.5768, "mean_token_accuracy": 0.18006865531206132, "num_tokens": 6390651.0, "step": 2790 }, { "entropy": 5.478256464004517, "epoch": 0.26849183477425553, "grad_norm": 1.078125, "learning_rate": 0.0004996638323707655, "loss": 5.446, "mean_token_accuracy": 0.1820421040058136, "num_tokens": 6401631.0, "step": 2795 }, { "entropy": 5.48651123046875, "epoch": 0.2689721421709894, "grad_norm": 0.97265625, "learning_rate": 0.0004996619563854232, "loss": 5.5308, "mean_token_accuracy": 0.1832943469285965, "num_tokens": 6413875.0, "step": 2800 }, { "entropy": 5.689049482345581, "epoch": 0.26945244956772335, "grad_norm": 1.03125, "learning_rate": 0.0004996600751840974, "loss": 5.5579, "mean_token_accuracy": 0.1733505442738533, "num_tokens": 6425764.0, "step": 2805 }, { "entropy": 5.478516244888306, "epoch": 0.26993275696445723, "grad_norm": 0.984375, "learning_rate": 0.0004996581887668317, "loss": 5.494, "mean_token_accuracy": 0.18221275955438615, "num_tokens": 6437911.0, "step": 2810 }, { "entropy": 5.534301519393921, "epoch": 0.27041306436119117, "grad_norm": 1.0859375, "learning_rate": 0.00049965629713367, "loss": 5.4961, "mean_token_accuracy": 0.18141991049051284, "num_tokens": 6449942.0, "step": 2815 }, { "entropy": 5.604593276977539, "epoch": 0.27089337175792505, "grad_norm": 0.953125, "learning_rate": 0.0004996544002846561, "loss": 5.6208, "mean_token_accuracy": 0.17682201713323592, "num_tokens": 6461729.0, "step": 2820 }, { "entropy": 5.614752101898193, "epoch": 0.271373679154659, "grad_norm": 1.0078125, "learning_rate": 0.0004996524982198343, "loss": 5.5988, "mean_token_accuracy": 0.17795798033475876, "num_tokens": 6472046.0, "step": 2825 }, { "entropy": 5.600375080108643, "epoch": 0.27185398655139287, "grad_norm": 1.0234375, "learning_rate": 0.0004996505909392485, "loss": 5.5667, "mean_token_accuracy": 0.17373612523078918, "num_tokens": 6483308.0, "step": 2830 }, { "entropy": 5.429362010955811, "epoch": 0.2723342939481268, "grad_norm": 0.98828125, "learning_rate": 0.0004996486784429429, "loss": 5.4311, "mean_token_accuracy": 0.18428465574979783, "num_tokens": 6495093.0, "step": 2835 }, { "entropy": 5.5981306552886965, "epoch": 0.2728146013448607, "grad_norm": 1.1015625, "learning_rate": 0.0004996467607309622, "loss": 5.5307, "mean_token_accuracy": 0.17854470163583755, "num_tokens": 6505933.0, "step": 2840 }, { "entropy": 5.626583003997803, "epoch": 0.2732949087415946, "grad_norm": 1.03125, "learning_rate": 0.0004996448378033507, "loss": 5.5893, "mean_token_accuracy": 0.17490534335374833, "num_tokens": 6517280.0, "step": 2845 }, { "entropy": 5.60156021118164, "epoch": 0.2737752161383285, "grad_norm": 1.03125, "learning_rate": 0.0004996429096601532, "loss": 5.6315, "mean_token_accuracy": 0.17191672027111055, "num_tokens": 6528980.0, "step": 2850 }, { "entropy": 5.601687097549439, "epoch": 0.27425552353506244, "grad_norm": 1.046875, "learning_rate": 0.0004996409763014144, "loss": 5.6235, "mean_token_accuracy": 0.17743158787488938, "num_tokens": 6540670.0, "step": 2855 }, { "entropy": 5.593181991577149, "epoch": 0.2747358309317964, "grad_norm": 1.0078125, "learning_rate": 0.0004996390377271791, "loss": 5.5855, "mean_token_accuracy": 0.18115401417016982, "num_tokens": 6551302.0, "step": 2860 }, { "entropy": 5.5507872104644775, "epoch": 0.27521613832853026, "grad_norm": 1.09375, "learning_rate": 0.0004996370939374924, "loss": 5.5433, "mean_token_accuracy": 0.1738438919186592, "num_tokens": 6563177.0, "step": 2865 }, { "entropy": 5.72943229675293, "epoch": 0.2756964457252642, "grad_norm": 1.1953125, "learning_rate": 0.0004996351449323994, "loss": 5.6521, "mean_token_accuracy": 0.17468605786561966, "num_tokens": 6573323.0, "step": 2870 }, { "entropy": 5.5880653858184814, "epoch": 0.2761767531219981, "grad_norm": 1.03125, "learning_rate": 0.0004996331907119455, "loss": 5.591, "mean_token_accuracy": 0.16756793707609177, "num_tokens": 6585382.0, "step": 2875 }, { "entropy": 5.474012231826782, "epoch": 0.276657060518732, "grad_norm": 1.0078125, "learning_rate": 0.0004996312312761758, "loss": 5.467, "mean_token_accuracy": 0.1900227263569832, "num_tokens": 6596629.0, "step": 2880 }, { "entropy": 5.6394744396209715, "epoch": 0.2771373679154659, "grad_norm": 1.0234375, "learning_rate": 0.000499629266625136, "loss": 5.5734, "mean_token_accuracy": 0.17828488498926162, "num_tokens": 6608408.0, "step": 2885 }, { "entropy": 5.638094282150268, "epoch": 0.27761767531219983, "grad_norm": 1.1015625, "learning_rate": 0.0004996272967588715, "loss": 5.5989, "mean_token_accuracy": 0.1704651966691017, "num_tokens": 6619375.0, "step": 2890 }, { "entropy": 5.618940448760986, "epoch": 0.2780979827089337, "grad_norm": 1.09375, "learning_rate": 0.0004996253216774283, "loss": 5.6398, "mean_token_accuracy": 0.17304042726755142, "num_tokens": 6631317.0, "step": 2895 }, { "entropy": 5.576578378677368, "epoch": 0.27857829010566765, "grad_norm": 1.1015625, "learning_rate": 0.0004996233413808521, "loss": 5.4904, "mean_token_accuracy": 0.18116467744112014, "num_tokens": 6642009.0, "step": 2900 }, { "entropy": 5.609902429580688, "epoch": 0.27905859750240153, "grad_norm": 1.09375, "learning_rate": 0.0004996213558691889, "loss": 5.6478, "mean_token_accuracy": 0.1682332620024681, "num_tokens": 6654713.0, "step": 2905 }, { "entropy": 5.651772451400757, "epoch": 0.27953890489913547, "grad_norm": 0.95703125, "learning_rate": 0.0004996193651424848, "loss": 5.6064, "mean_token_accuracy": 0.17700932323932647, "num_tokens": 6667157.0, "step": 2910 }, { "entropy": 5.575735330581665, "epoch": 0.28001921229586935, "grad_norm": 0.94140625, "learning_rate": 0.000499617369200786, "loss": 5.5599, "mean_token_accuracy": 0.18871267586946489, "num_tokens": 6679573.0, "step": 2915 }, { "entropy": 5.593114852905273, "epoch": 0.2804995196926033, "grad_norm": 0.859375, "learning_rate": 0.0004996153680441389, "loss": 5.624, "mean_token_accuracy": 0.17413021624088287, "num_tokens": 6691768.0, "step": 2920 }, { "entropy": 5.653490257263184, "epoch": 0.28097982708933716, "grad_norm": 1.015625, "learning_rate": 0.00049961336167259, "loss": 5.5864, "mean_token_accuracy": 0.17438612282276153, "num_tokens": 6701964.0, "step": 2925 }, { "entropy": 5.618965578079224, "epoch": 0.2814601344860711, "grad_norm": 1.0859375, "learning_rate": 0.0004996113500861857, "loss": 5.5759, "mean_token_accuracy": 0.1726679503917694, "num_tokens": 6713506.0, "step": 2930 }, { "entropy": 5.581022930145264, "epoch": 0.281940441882805, "grad_norm": 1.0859375, "learning_rate": 0.0004996093332849729, "loss": 5.593, "mean_token_accuracy": 0.1725487932562828, "num_tokens": 6724616.0, "step": 2935 }, { "entropy": 5.562248182296753, "epoch": 0.2824207492795389, "grad_norm": 1.0234375, "learning_rate": 0.0004996073112689983, "loss": 5.5803, "mean_token_accuracy": 0.17757243812084197, "num_tokens": 6735054.0, "step": 2940 }, { "entropy": 5.616918420791626, "epoch": 0.2829010566762728, "grad_norm": 0.9609375, "learning_rate": 0.0004996052840383088, "loss": 5.6325, "mean_token_accuracy": 0.17381539791822434, "num_tokens": 6746756.0, "step": 2945 }, { "entropy": 5.603857469558716, "epoch": 0.28338136407300674, "grad_norm": 0.89453125, "learning_rate": 0.0004996032515929516, "loss": 5.4992, "mean_token_accuracy": 0.1776091992855072, "num_tokens": 6759566.0, "step": 2950 }, { "entropy": 5.573670148849487, "epoch": 0.2838616714697406, "grad_norm": 1.0, "learning_rate": 0.0004996012139329738, "loss": 5.5225, "mean_token_accuracy": 0.17899418324232103, "num_tokens": 6771375.0, "step": 2955 }, { "entropy": 5.619125080108643, "epoch": 0.28434197886647455, "grad_norm": 1.1015625, "learning_rate": 0.0004995991710584228, "loss": 5.6311, "mean_token_accuracy": 0.16734524071216583, "num_tokens": 6783252.0, "step": 2960 }, { "entropy": 5.58878116607666, "epoch": 0.28482228626320844, "grad_norm": 0.953125, "learning_rate": 0.0004995971229693459, "loss": 5.5941, "mean_token_accuracy": 0.17340553402900696, "num_tokens": 6795525.0, "step": 2965 }, { "entropy": 5.610876131057739, "epoch": 0.28530259365994237, "grad_norm": 0.9296875, "learning_rate": 0.0004995950696657909, "loss": 5.5353, "mean_token_accuracy": 0.17990380227565766, "num_tokens": 6807212.0, "step": 2970 }, { "entropy": 5.52398419380188, "epoch": 0.28578290105667625, "grad_norm": 1.015625, "learning_rate": 0.0004995930111478051, "loss": 5.4712, "mean_token_accuracy": 0.1771505206823349, "num_tokens": 6819367.0, "step": 2975 }, { "entropy": 5.5713125705719, "epoch": 0.2862632084534102, "grad_norm": 1.046875, "learning_rate": 0.0004995909474154365, "loss": 5.5531, "mean_token_accuracy": 0.17791730761528016, "num_tokens": 6830405.0, "step": 2980 }, { "entropy": 5.524326038360596, "epoch": 0.28674351585014407, "grad_norm": 0.9765625, "learning_rate": 0.0004995888784687331, "loss": 5.5413, "mean_token_accuracy": 0.18089909702539445, "num_tokens": 6841479.0, "step": 2985 }, { "entropy": 5.545838022232056, "epoch": 0.287223823246878, "grad_norm": 1.015625, "learning_rate": 0.0004995868043077428, "loss": 5.5784, "mean_token_accuracy": 0.1739095240831375, "num_tokens": 6851585.0, "step": 2990 }, { "entropy": 5.605233526229858, "epoch": 0.2877041306436119, "grad_norm": 1.0390625, "learning_rate": 0.0004995847249325137, "loss": 5.5488, "mean_token_accuracy": 0.1776391088962555, "num_tokens": 6863176.0, "step": 2995 }, { "entropy": 5.596064901351928, "epoch": 0.2881844380403458, "grad_norm": 1.0703125, "learning_rate": 0.0004995826403430942, "loss": 5.595, "mean_token_accuracy": 0.17474860548973084, "num_tokens": 6874021.0, "step": 3000 }, { "epoch": 0.2881844380403458, "eval_entropy": 5.440896103871502, "eval_loss": 5.576871395111084, "eval_mean_token_accuracy": 0.18414354559419172, "eval_num_tokens": 6874021.0, "eval_runtime": 26.9459, "eval_samples_per_second": 1217.809, "eval_steps_per_second": 152.231, "step": 3000 }, { "entropy": 5.6302040100097654, "epoch": 0.2886647454370797, "grad_norm": 1.03125, "learning_rate": 0.0004995805505395328, "loss": 5.5584, "mean_token_accuracy": 0.17477040886878967, "num_tokens": 6884999.0, "step": 3005 }, { "entropy": 5.559301853179932, "epoch": 0.28914505283381364, "grad_norm": 1.0703125, "learning_rate": 0.0004995784555218778, "loss": 5.548, "mean_token_accuracy": 0.17850742042064666, "num_tokens": 6897021.0, "step": 3010 }, { "entropy": 5.518660974502564, "epoch": 0.2896253602305475, "grad_norm": 1.0703125, "learning_rate": 0.0004995763552901779, "loss": 5.5449, "mean_token_accuracy": 0.17909058481454848, "num_tokens": 6908320.0, "step": 3015 }, { "entropy": 5.68627028465271, "epoch": 0.29010566762728146, "grad_norm": 1.015625, "learning_rate": 0.0004995742498444818, "loss": 5.5342, "mean_token_accuracy": 0.18174685835838317, "num_tokens": 6919957.0, "step": 3020 }, { "entropy": 5.529996299743653, "epoch": 0.2905859750240154, "grad_norm": 0.9609375, "learning_rate": 0.0004995721391848387, "loss": 5.4942, "mean_token_accuracy": 0.17575003057718278, "num_tokens": 6930531.0, "step": 3025 }, { "entropy": 5.623160696029663, "epoch": 0.2910662824207493, "grad_norm": 1.0234375, "learning_rate": 0.0004995700233112972, "loss": 5.6325, "mean_token_accuracy": 0.17704310566186904, "num_tokens": 6942556.0, "step": 3030 }, { "entropy": 5.583187103271484, "epoch": 0.2915465898174832, "grad_norm": 0.9609375, "learning_rate": 0.0004995679022239066, "loss": 5.5762, "mean_token_accuracy": 0.17900587618350983, "num_tokens": 6954410.0, "step": 3035 }, { "entropy": 5.579293632507325, "epoch": 0.2920268972142171, "grad_norm": 1.0859375, "learning_rate": 0.0004995657759227162, "loss": 5.5857, "mean_token_accuracy": 0.17669540643692017, "num_tokens": 6964970.0, "step": 3040 }, { "entropy": 5.554018545150757, "epoch": 0.29250720461095103, "grad_norm": 0.9765625, "learning_rate": 0.0004995636444077751, "loss": 5.4673, "mean_token_accuracy": 0.1851392537355423, "num_tokens": 6976016.0, "step": 3045 }, { "entropy": 5.490430164337158, "epoch": 0.2929875120076849, "grad_norm": 1.046875, "learning_rate": 0.0004995615076791333, "loss": 5.4999, "mean_token_accuracy": 0.1816742718219757, "num_tokens": 6987199.0, "step": 3050 }, { "entropy": 5.5644313335418705, "epoch": 0.29346781940441885, "grad_norm": 1.0078125, "learning_rate": 0.0004995593657368399, "loss": 5.5218, "mean_token_accuracy": 0.18650518208742142, "num_tokens": 6999174.0, "step": 3055 }, { "entropy": 5.557963037490845, "epoch": 0.29394812680115273, "grad_norm": 0.97265625, "learning_rate": 0.000499557218580945, "loss": 5.5884, "mean_token_accuracy": 0.17525261044502258, "num_tokens": 7012148.0, "step": 3060 }, { "entropy": 5.486077213287354, "epoch": 0.29442843419788667, "grad_norm": 1.0234375, "learning_rate": 0.0004995550662114981, "loss": 5.4609, "mean_token_accuracy": 0.18215615749359132, "num_tokens": 7023238.0, "step": 3065 }, { "entropy": 5.561151647567749, "epoch": 0.29490874159462055, "grad_norm": 1.0234375, "learning_rate": 0.0004995529086285495, "loss": 5.5521, "mean_token_accuracy": 0.17758539766073228, "num_tokens": 7034944.0, "step": 3070 }, { "entropy": 5.563313627243042, "epoch": 0.2953890489913545, "grad_norm": 1.015625, "learning_rate": 0.000499550745832149, "loss": 5.4154, "mean_token_accuracy": 0.18512072116136552, "num_tokens": 7046880.0, "step": 3075 }, { "entropy": 5.486554431915283, "epoch": 0.29586935638808837, "grad_norm": 1.0390625, "learning_rate": 0.0004995485778223471, "loss": 5.4866, "mean_token_accuracy": 0.1800946146249771, "num_tokens": 7057678.0, "step": 3080 }, { "entropy": 5.4739940643310545, "epoch": 0.2963496637848223, "grad_norm": 1.0703125, "learning_rate": 0.0004995464045991939, "loss": 5.4688, "mean_token_accuracy": 0.18641662895679473, "num_tokens": 7068336.0, "step": 3085 }, { "entropy": 5.588371753692627, "epoch": 0.2968299711815562, "grad_norm": 0.98828125, "learning_rate": 0.00049954422616274, "loss": 5.5343, "mean_token_accuracy": 0.17594826519489287, "num_tokens": 7080341.0, "step": 3090 }, { "entropy": 5.6965454578399655, "epoch": 0.2973102785782901, "grad_norm": 1.1171875, "learning_rate": 0.0004995420425130359, "loss": 5.6866, "mean_token_accuracy": 0.17018966376781464, "num_tokens": 7090618.0, "step": 3095 }, { "entropy": 5.499913692474365, "epoch": 0.297790585975024, "grad_norm": 1.078125, "learning_rate": 0.0004995398536501324, "loss": 5.4331, "mean_token_accuracy": 0.18785624653100969, "num_tokens": 7101843.0, "step": 3100 }, { "entropy": 5.4791899681091305, "epoch": 0.29827089337175794, "grad_norm": 1.1640625, "learning_rate": 0.0004995376595740801, "loss": 5.5056, "mean_token_accuracy": 0.18063082695007324, "num_tokens": 7112014.0, "step": 3105 }, { "entropy": 5.632973289489746, "epoch": 0.2987512007684918, "grad_norm": 1.0078125, "learning_rate": 0.0004995354602849302, "loss": 5.5822, "mean_token_accuracy": 0.17074308097362517, "num_tokens": 7123860.0, "step": 3110 }, { "entropy": 5.571376514434815, "epoch": 0.29923150816522576, "grad_norm": 1.046875, "learning_rate": 0.0004995332557827337, "loss": 5.5564, "mean_token_accuracy": 0.17600722908973693, "num_tokens": 7135901.0, "step": 3115 }, { "entropy": 5.5778998851776125, "epoch": 0.29971181556195964, "grad_norm": 1.1171875, "learning_rate": 0.0004995310460675416, "loss": 5.5339, "mean_token_accuracy": 0.1845734417438507, "num_tokens": 7148743.0, "step": 3120 }, { "entropy": 5.589261770248413, "epoch": 0.3001921229586936, "grad_norm": 1.0546875, "learning_rate": 0.0004995288311394053, "loss": 5.5804, "mean_token_accuracy": 0.18021756410598755, "num_tokens": 7160731.0, "step": 3125 }, { "entropy": 5.574976587295533, "epoch": 0.30067243035542746, "grad_norm": 0.9765625, "learning_rate": 0.0004995266109983764, "loss": 5.5617, "mean_token_accuracy": 0.17890461087226867, "num_tokens": 7172861.0, "step": 3130 }, { "entropy": 5.5695881843566895, "epoch": 0.3011527377521614, "grad_norm": 1.015625, "learning_rate": 0.0004995243856445062, "loss": 5.5087, "mean_token_accuracy": 0.17425711154937745, "num_tokens": 7183954.0, "step": 3135 }, { "entropy": 5.523225164413452, "epoch": 0.3016330451488953, "grad_norm": 1.03125, "learning_rate": 0.0004995221550778466, "loss": 5.4793, "mean_token_accuracy": 0.1828732267022133, "num_tokens": 7195466.0, "step": 3140 }, { "entropy": 5.535993862152099, "epoch": 0.3021133525456292, "grad_norm": 1.046875, "learning_rate": 0.0004995199192984491, "loss": 5.4733, "mean_token_accuracy": 0.18358256071805953, "num_tokens": 7207173.0, "step": 3145 }, { "entropy": 5.601380920410156, "epoch": 0.3025936599423631, "grad_norm": 1.03125, "learning_rate": 0.0004995176783063657, "loss": 5.6094, "mean_token_accuracy": 0.17880836874246597, "num_tokens": 7220095.0, "step": 3150 }, { "entropy": 5.5713316917419435, "epoch": 0.30307396733909703, "grad_norm": 1.1015625, "learning_rate": 0.0004995154321016487, "loss": 5.5217, "mean_token_accuracy": 0.18463317751884462, "num_tokens": 7230664.0, "step": 3155 }, { "entropy": 5.5087896347045895, "epoch": 0.3035542747358309, "grad_norm": 1.0078125, "learning_rate": 0.0004995131806843499, "loss": 5.4837, "mean_token_accuracy": 0.18419086784124375, "num_tokens": 7241278.0, "step": 3160 }, { "entropy": 5.4533278465271, "epoch": 0.30403458213256485, "grad_norm": 1.0703125, "learning_rate": 0.0004995109240545218, "loss": 5.6281, "mean_token_accuracy": 0.1725993424654007, "num_tokens": 7252999.0, "step": 3165 }, { "entropy": 5.589286613464355, "epoch": 0.3045148895292987, "grad_norm": 1.0703125, "learning_rate": 0.0004995086622122167, "loss": 5.4738, "mean_token_accuracy": 0.17775996774435043, "num_tokens": 7263949.0, "step": 3170 }, { "entropy": 5.558937978744507, "epoch": 0.30499519692603266, "grad_norm": 1.0, "learning_rate": 0.0004995063951574871, "loss": 5.5219, "mean_token_accuracy": 0.18208030313253404, "num_tokens": 7275467.0, "step": 3175 }, { "entropy": 5.563764429092407, "epoch": 0.30547550432276654, "grad_norm": 1.03125, "learning_rate": 0.0004995041228903856, "loss": 5.4858, "mean_token_accuracy": 0.18617523461580276, "num_tokens": 7285534.0, "step": 3180 }, { "entropy": 5.614857864379883, "epoch": 0.3059558117195005, "grad_norm": 1.0234375, "learning_rate": 0.000499501845410965, "loss": 5.5985, "mean_token_accuracy": 0.18059034049510955, "num_tokens": 7297252.0, "step": 3185 }, { "entropy": 5.526304435729981, "epoch": 0.30643611911623436, "grad_norm": 0.93359375, "learning_rate": 0.0004994995627192781, "loss": 5.4686, "mean_token_accuracy": 0.18378556221723558, "num_tokens": 7308492.0, "step": 3190 }, { "entropy": 5.5130932331085205, "epoch": 0.3069164265129683, "grad_norm": 0.98828125, "learning_rate": 0.0004994972748153781, "loss": 5.5122, "mean_token_accuracy": 0.18087892532348632, "num_tokens": 7319703.0, "step": 3195 }, { "entropy": 5.598230838775635, "epoch": 0.30739673390970224, "grad_norm": 1.046875, "learning_rate": 0.000499494981699318, "loss": 5.4766, "mean_token_accuracy": 0.18629593551158904, "num_tokens": 7331022.0, "step": 3200 }, { "entropy": 5.5110736846923825, "epoch": 0.3078770413064361, "grad_norm": 1.0234375, "learning_rate": 0.000499492683371151, "loss": 5.5125, "mean_token_accuracy": 0.18337176293134688, "num_tokens": 7342977.0, "step": 3205 }, { "entropy": 5.602800512313843, "epoch": 0.30835734870317005, "grad_norm": 1.0234375, "learning_rate": 0.0004994903798309306, "loss": 5.5087, "mean_token_accuracy": 0.17746395766735076, "num_tokens": 7353227.0, "step": 3210 }, { "entropy": 5.563166570663452, "epoch": 0.30883765609990393, "grad_norm": 1.0703125, "learning_rate": 0.0004994880710787102, "loss": 5.5743, "mean_token_accuracy": 0.1642255187034607, "num_tokens": 7364165.0, "step": 3215 }, { "entropy": 5.544680643081665, "epoch": 0.30931796349663787, "grad_norm": 0.984375, "learning_rate": 0.0004994857571145432, "loss": 5.5023, "mean_token_accuracy": 0.18458254784345626, "num_tokens": 7374800.0, "step": 3220 }, { "entropy": 5.425434350967407, "epoch": 0.30979827089337175, "grad_norm": 1.0546875, "learning_rate": 0.0004994834379384837, "loss": 5.4565, "mean_token_accuracy": 0.18336665779352188, "num_tokens": 7386360.0, "step": 3225 }, { "entropy": 5.552868223190307, "epoch": 0.3102785782901057, "grad_norm": 1.0, "learning_rate": 0.0004994811135505851, "loss": 5.4698, "mean_token_accuracy": 0.18341365456581116, "num_tokens": 7397066.0, "step": 3230 }, { "entropy": 5.558938503265381, "epoch": 0.31075888568683957, "grad_norm": 1.0703125, "learning_rate": 0.0004994787839509018, "loss": 5.564, "mean_token_accuracy": 0.1713826075196266, "num_tokens": 7408349.0, "step": 3235 }, { "entropy": 5.5813216209411625, "epoch": 0.3112391930835735, "grad_norm": 1.0390625, "learning_rate": 0.0004994764491394876, "loss": 5.5886, "mean_token_accuracy": 0.17263369262218475, "num_tokens": 7420343.0, "step": 3240 }, { "entropy": 5.624362230300903, "epoch": 0.3117195004803074, "grad_norm": 0.921875, "learning_rate": 0.0004994741091163969, "loss": 5.4904, "mean_token_accuracy": 0.18449428975582122, "num_tokens": 7431683.0, "step": 3245 }, { "entropy": 5.41058030128479, "epoch": 0.3121998078770413, "grad_norm": 1.0234375, "learning_rate": 0.000499471763881684, "loss": 5.4083, "mean_token_accuracy": 0.18659997135400772, "num_tokens": 7443327.0, "step": 3250 }, { "entropy": 5.545905923843383, "epoch": 0.3126801152737752, "grad_norm": 1.03125, "learning_rate": 0.0004994694134354031, "loss": 5.517, "mean_token_accuracy": 0.18232496678829194, "num_tokens": 7454002.0, "step": 3255 }, { "entropy": 5.49485216140747, "epoch": 0.31316042267050914, "grad_norm": 1.078125, "learning_rate": 0.000499467057777609, "loss": 5.5092, "mean_token_accuracy": 0.18318750262260436, "num_tokens": 7464074.0, "step": 3260 }, { "entropy": 5.470322179794311, "epoch": 0.313640730067243, "grad_norm": 1.0859375, "learning_rate": 0.0004994646969083565, "loss": 5.434, "mean_token_accuracy": 0.1871152251958847, "num_tokens": 7475543.0, "step": 3265 }, { "entropy": 5.583432674407959, "epoch": 0.31412103746397696, "grad_norm": 1.0859375, "learning_rate": 0.0004994623308277002, "loss": 5.4947, "mean_token_accuracy": 0.18215811550617217, "num_tokens": 7486818.0, "step": 3270 }, { "entropy": 5.5460193157196045, "epoch": 0.31460134486071084, "grad_norm": 1.0078125, "learning_rate": 0.000499459959535695, "loss": 5.5431, "mean_token_accuracy": 0.17775923311710357, "num_tokens": 7499046.0, "step": 3275 }, { "entropy": 5.530418539047242, "epoch": 0.3150816522574448, "grad_norm": 1.109375, "learning_rate": 0.0004994575830323962, "loss": 5.4758, "mean_token_accuracy": 0.1772423878312111, "num_tokens": 7509853.0, "step": 3280 }, { "entropy": 5.422787761688232, "epoch": 0.31556195965417866, "grad_norm": 1.046875, "learning_rate": 0.0004994552013178586, "loss": 5.3345, "mean_token_accuracy": 0.1908559814095497, "num_tokens": 7521091.0, "step": 3285 }, { "entropy": 5.470391035079956, "epoch": 0.3160422670509126, "grad_norm": 1.078125, "learning_rate": 0.000499452814392138, "loss": 5.4638, "mean_token_accuracy": 0.19296756088733674, "num_tokens": 7531317.0, "step": 3290 }, { "entropy": 5.550863265991211, "epoch": 0.3165225744476465, "grad_norm": 1.0234375, "learning_rate": 0.0004994504222552894, "loss": 5.6115, "mean_token_accuracy": 0.17447966411709787, "num_tokens": 7542822.0, "step": 3295 }, { "entropy": 5.679572725296021, "epoch": 0.3170028818443804, "grad_norm": 1.0703125, "learning_rate": 0.0004994480249073684, "loss": 5.5371, "mean_token_accuracy": 0.17899394482374192, "num_tokens": 7552434.0, "step": 3300 }, { "entropy": 5.455837345123291, "epoch": 0.3174831892411143, "grad_norm": 0.98046875, "learning_rate": 0.0004994456223484308, "loss": 5.412, "mean_token_accuracy": 0.1847301483154297, "num_tokens": 7563895.0, "step": 3305 }, { "entropy": 5.356154918670654, "epoch": 0.31796349663784823, "grad_norm": 1.0, "learning_rate": 0.0004994432145785323, "loss": 5.4431, "mean_token_accuracy": 0.1852705791592598, "num_tokens": 7575391.0, "step": 3310 }, { "entropy": 5.603661298751831, "epoch": 0.3184438040345821, "grad_norm": 1.078125, "learning_rate": 0.0004994408015977288, "loss": 5.5895, "mean_token_accuracy": 0.18396379053592682, "num_tokens": 7587119.0, "step": 3315 }, { "entropy": 5.5791820049285885, "epoch": 0.31892411143131605, "grad_norm": 1.109375, "learning_rate": 0.0004994383834060764, "loss": 5.5529, "mean_token_accuracy": 0.17733592242002488, "num_tokens": 7598615.0, "step": 3320 }, { "entropy": 5.522308588027954, "epoch": 0.31940441882804993, "grad_norm": 0.9921875, "learning_rate": 0.0004994359600036311, "loss": 5.5022, "mean_token_accuracy": 0.18452920615673066, "num_tokens": 7610159.0, "step": 3325 }, { "entropy": 5.598204278945923, "epoch": 0.31988472622478387, "grad_norm": 1.171875, "learning_rate": 0.0004994335313904493, "loss": 5.4916, "mean_token_accuracy": 0.18418505936861038, "num_tokens": 7620922.0, "step": 3330 }, { "entropy": 5.45703272819519, "epoch": 0.32036503362151775, "grad_norm": 0.95703125, "learning_rate": 0.0004994310975665873, "loss": 5.4117, "mean_token_accuracy": 0.18754592537879944, "num_tokens": 7632343.0, "step": 3335 }, { "entropy": 5.619206094741822, "epoch": 0.3208453410182517, "grad_norm": 0.96875, "learning_rate": 0.0004994286585321017, "loss": 5.6097, "mean_token_accuracy": 0.1694990485906601, "num_tokens": 7644748.0, "step": 3340 }, { "entropy": 5.595988607406616, "epoch": 0.32132564841498557, "grad_norm": 1.1171875, "learning_rate": 0.000499426214287049, "loss": 5.5649, "mean_token_accuracy": 0.18684215247631072, "num_tokens": 7655449.0, "step": 3345 }, { "entropy": 5.522005844116211, "epoch": 0.3218059558117195, "grad_norm": 1.0546875, "learning_rate": 0.0004994237648314862, "loss": 5.5274, "mean_token_accuracy": 0.18205100297927856, "num_tokens": 7665623.0, "step": 3350 }, { "entropy": 5.492083740234375, "epoch": 0.3222862632084534, "grad_norm": 1.0, "learning_rate": 0.0004994213101654697, "loss": 5.4173, "mean_token_accuracy": 0.18764639347791673, "num_tokens": 7676860.0, "step": 3355 }, { "entropy": 5.5761909008026125, "epoch": 0.3227665706051873, "grad_norm": 1.015625, "learning_rate": 0.000499418850289057, "loss": 5.603, "mean_token_accuracy": 0.1757027193903923, "num_tokens": 7687778.0, "step": 3360 }, { "entropy": 5.565295886993408, "epoch": 0.32324687800192126, "grad_norm": 1.078125, "learning_rate": 0.0004994163852023048, "loss": 5.4981, "mean_token_accuracy": 0.18085954636335372, "num_tokens": 7699154.0, "step": 3365 }, { "entropy": 5.525069093704223, "epoch": 0.32372718539865514, "grad_norm": 1.0703125, "learning_rate": 0.0004994139149052706, "loss": 5.5175, "mean_token_accuracy": 0.18480815589427949, "num_tokens": 7711010.0, "step": 3370 }, { "entropy": 5.576666164398193, "epoch": 0.3242074927953891, "grad_norm": 1.0390625, "learning_rate": 0.0004994114393980117, "loss": 5.538, "mean_token_accuracy": 0.17918068915605545, "num_tokens": 7721969.0, "step": 3375 }, { "entropy": 5.561730909347534, "epoch": 0.32468780019212296, "grad_norm": 1.0078125, "learning_rate": 0.0004994089586805856, "loss": 5.4863, "mean_token_accuracy": 0.1827893927693367, "num_tokens": 7733762.0, "step": 3380 }, { "entropy": 5.549566268920898, "epoch": 0.3251681075888569, "grad_norm": 1.0859375, "learning_rate": 0.0004994064727530496, "loss": 5.4963, "mean_token_accuracy": 0.17758472561836242, "num_tokens": 7744614.0, "step": 3385 }, { "entropy": 5.498316717147827, "epoch": 0.3256484149855908, "grad_norm": 1.0, "learning_rate": 0.0004994039816154618, "loss": 5.4339, "mean_token_accuracy": 0.18473347425460815, "num_tokens": 7755799.0, "step": 3390 }, { "entropy": 5.455300903320312, "epoch": 0.3261287223823247, "grad_norm": 1.0078125, "learning_rate": 0.00049940148526788, "loss": 5.4848, "mean_token_accuracy": 0.18304541558027268, "num_tokens": 7768140.0, "step": 3395 }, { "entropy": 5.568225574493408, "epoch": 0.3266090297790586, "grad_norm": 1.125, "learning_rate": 0.0004993989837103618, "loss": 5.4898, "mean_token_accuracy": 0.1791609227657318, "num_tokens": 7778494.0, "step": 3400 }, { "entropy": 5.607134199142456, "epoch": 0.3270893371757925, "grad_norm": 1.0859375, "learning_rate": 0.0004993964769429657, "loss": 5.5675, "mean_token_accuracy": 0.18318891525268555, "num_tokens": 7789234.0, "step": 3405 }, { "entropy": 5.541140413284301, "epoch": 0.3275696445725264, "grad_norm": 0.9453125, "learning_rate": 0.0004993939649657498, "loss": 5.548, "mean_token_accuracy": 0.18319968730211258, "num_tokens": 7800602.0, "step": 3410 }, { "entropy": 5.469655227661133, "epoch": 0.32804995196926034, "grad_norm": 1.0078125, "learning_rate": 0.0004993914477787721, "loss": 5.3674, "mean_token_accuracy": 0.1912238970398903, "num_tokens": 7812803.0, "step": 3415 }, { "entropy": 5.625386571884155, "epoch": 0.3285302593659942, "grad_norm": 1.0703125, "learning_rate": 0.0004993889253820915, "loss": 5.6669, "mean_token_accuracy": 0.16849727183580399, "num_tokens": 7825432.0, "step": 3420 }, { "entropy": 5.567583656311035, "epoch": 0.32901056676272816, "grad_norm": 1.046875, "learning_rate": 0.0004993863977757663, "loss": 5.4819, "mean_token_accuracy": 0.18198901265859604, "num_tokens": 7837258.0, "step": 3425 }, { "entropy": 5.42762131690979, "epoch": 0.32949087415946204, "grad_norm": 1.046875, "learning_rate": 0.0004993838649598552, "loss": 5.3739, "mean_token_accuracy": 0.1897459015250206, "num_tokens": 7847573.0, "step": 3430 }, { "entropy": 5.551398038864136, "epoch": 0.329971181556196, "grad_norm": 1.0703125, "learning_rate": 0.0004993813269344171, "loss": 5.4969, "mean_token_accuracy": 0.17690201252698898, "num_tokens": 7857957.0, "step": 3435 }, { "entropy": 5.5013957023620605, "epoch": 0.33045148895292986, "grad_norm": 1.046875, "learning_rate": 0.0004993787836995108, "loss": 5.4174, "mean_token_accuracy": 0.1926833838224411, "num_tokens": 7867996.0, "step": 3440 }, { "entropy": 5.446499681472778, "epoch": 0.3309317963496638, "grad_norm": 1.0859375, "learning_rate": 0.0004993762352551954, "loss": 5.4766, "mean_token_accuracy": 0.1805843397974968, "num_tokens": 7879245.0, "step": 3445 }, { "entropy": 5.61943678855896, "epoch": 0.3314121037463977, "grad_norm": 1.09375, "learning_rate": 0.0004993736816015301, "loss": 5.5669, "mean_token_accuracy": 0.17582879960536957, "num_tokens": 7891186.0, "step": 3450 }, { "entropy": 5.609936046600342, "epoch": 0.3318924111431316, "grad_norm": 0.9609375, "learning_rate": 0.0004993711227385742, "loss": 5.5802, "mean_token_accuracy": 0.1823540985584259, "num_tokens": 7902231.0, "step": 3455 }, { "entropy": 5.523345851898194, "epoch": 0.3323727185398655, "grad_norm": 1.125, "learning_rate": 0.0004993685586663871, "loss": 5.5412, "mean_token_accuracy": 0.18139662891626357, "num_tokens": 7913364.0, "step": 3460 }, { "entropy": 5.735165405273437, "epoch": 0.33285302593659943, "grad_norm": 1.046875, "learning_rate": 0.0004993659893850281, "loss": 5.7308, "mean_token_accuracy": 0.16727230101823806, "num_tokens": 7925217.0, "step": 3465 }, { "entropy": 5.506084823608399, "epoch": 0.3333333333333333, "grad_norm": 0.92578125, "learning_rate": 0.0004993634148945573, "loss": 5.4639, "mean_token_accuracy": 0.17894653379917144, "num_tokens": 7937636.0, "step": 3470 }, { "entropy": 5.5272363185882565, "epoch": 0.33381364073006725, "grad_norm": 1.03125, "learning_rate": 0.0004993608351950341, "loss": 5.4896, "mean_token_accuracy": 0.17503666803240775, "num_tokens": 7948958.0, "step": 3475 }, { "entropy": 5.620566320419312, "epoch": 0.33429394812680113, "grad_norm": 1.0625, "learning_rate": 0.0004993582502865185, "loss": 5.5323, "mean_token_accuracy": 0.18402974754571916, "num_tokens": 7960013.0, "step": 3480 }, { "entropy": 5.462809419631958, "epoch": 0.33477425552353507, "grad_norm": 1.046875, "learning_rate": 0.0004993556601690706, "loss": 5.5416, "mean_token_accuracy": 0.17792800366878508, "num_tokens": 7971041.0, "step": 3485 }, { "entropy": 5.618744802474976, "epoch": 0.33525456292026895, "grad_norm": 1.03125, "learning_rate": 0.0004993530648427505, "loss": 5.576, "mean_token_accuracy": 0.1723045140504837, "num_tokens": 7982752.0, "step": 3490 }, { "entropy": 5.599891996383667, "epoch": 0.3357348703170029, "grad_norm": 1.0390625, "learning_rate": 0.0004993504643076184, "loss": 5.4278, "mean_token_accuracy": 0.18250093311071397, "num_tokens": 7993681.0, "step": 3495 }, { "entropy": 5.470984411239624, "epoch": 0.33621517771373677, "grad_norm": 1.0703125, "learning_rate": 0.0004993478585637347, "loss": 5.4781, "mean_token_accuracy": 0.18258391320705414, "num_tokens": 8004727.0, "step": 3500 }, { "entropy": 5.505999660491943, "epoch": 0.3366954851104707, "grad_norm": 1.0234375, "learning_rate": 0.0004993452476111599, "loss": 5.4797, "mean_token_accuracy": 0.18967788219451903, "num_tokens": 8015423.0, "step": 3505 }, { "entropy": 5.512713193893433, "epoch": 0.3371757925072046, "grad_norm": 0.9140625, "learning_rate": 0.0004993426314499546, "loss": 5.4536, "mean_token_accuracy": 0.18748492896556854, "num_tokens": 8027911.0, "step": 3510 }, { "entropy": 5.572777605056762, "epoch": 0.3376560999039385, "grad_norm": 0.96484375, "learning_rate": 0.0004993400100801796, "loss": 5.4747, "mean_token_accuracy": 0.1818804770708084, "num_tokens": 8038831.0, "step": 3515 }, { "entropy": 5.392134952545166, "epoch": 0.3381364073006724, "grad_norm": 1.0703125, "learning_rate": 0.0004993373835018956, "loss": 5.3718, "mean_token_accuracy": 0.18957587629556655, "num_tokens": 8049906.0, "step": 3520 }, { "entropy": 5.393214273452759, "epoch": 0.33861671469740634, "grad_norm": 1.0078125, "learning_rate": 0.0004993347517151638, "loss": 5.469, "mean_token_accuracy": 0.18386447727680205, "num_tokens": 8061158.0, "step": 3525 }, { "entropy": 5.6083544254302975, "epoch": 0.3390970220941403, "grad_norm": 1.0859375, "learning_rate": 0.0004993321147200452, "loss": 5.4326, "mean_token_accuracy": 0.181746444106102, "num_tokens": 8071958.0, "step": 3530 }, { "entropy": 5.465584182739258, "epoch": 0.33957732949087416, "grad_norm": 0.99609375, "learning_rate": 0.000499329472516601, "loss": 5.4294, "mean_token_accuracy": 0.17608542144298553, "num_tokens": 8084068.0, "step": 3535 }, { "entropy": 5.410733461380005, "epoch": 0.3400576368876081, "grad_norm": 1.015625, "learning_rate": 0.0004993268251048925, "loss": 5.3472, "mean_token_accuracy": 0.19578494429588317, "num_tokens": 8096132.0, "step": 3540 }, { "entropy": 5.503920364379883, "epoch": 0.340537944284342, "grad_norm": 1.0, "learning_rate": 0.0004993241724849814, "loss": 5.5102, "mean_token_accuracy": 0.18362511545419694, "num_tokens": 8107327.0, "step": 3545 }, { "entropy": 5.497963953018188, "epoch": 0.3410182516810759, "grad_norm": 1.015625, "learning_rate": 0.000499321514656929, "loss": 5.4779, "mean_token_accuracy": 0.18374822586774825, "num_tokens": 8118584.0, "step": 3550 }, { "entropy": 5.550964641571045, "epoch": 0.3414985590778098, "grad_norm": 1.03125, "learning_rate": 0.0004993188516207972, "loss": 5.5337, "mean_token_accuracy": 0.1793607845902443, "num_tokens": 8130081.0, "step": 3555 }, { "entropy": 5.507245492935181, "epoch": 0.34197886647454373, "grad_norm": 1.0546875, "learning_rate": 0.0004993161833766478, "loss": 5.4932, "mean_token_accuracy": 0.1838148668408394, "num_tokens": 8141463.0, "step": 3560 }, { "entropy": 5.541257572174072, "epoch": 0.3424591738712776, "grad_norm": 1.15625, "learning_rate": 0.0004993135099245426, "loss": 5.5042, "mean_token_accuracy": 0.17985130697488785, "num_tokens": 8153863.0, "step": 3565 }, { "entropy": 5.428792333602905, "epoch": 0.34293948126801155, "grad_norm": 1.140625, "learning_rate": 0.0004993108312645438, "loss": 5.463, "mean_token_accuracy": 0.18102106750011443, "num_tokens": 8165695.0, "step": 3570 }, { "entropy": 5.5374926090240475, "epoch": 0.34341978866474543, "grad_norm": 1.0546875, "learning_rate": 0.0004993081473967135, "loss": 5.5119, "mean_token_accuracy": 0.18098655641078948, "num_tokens": 8176456.0, "step": 3575 }, { "entropy": 5.58543210029602, "epoch": 0.34390009606147937, "grad_norm": 1.0390625, "learning_rate": 0.0004993054583211143, "loss": 5.5092, "mean_token_accuracy": 0.1822955548763275, "num_tokens": 8189050.0, "step": 3580 }, { "entropy": 5.43015308380127, "epoch": 0.34438040345821325, "grad_norm": 0.94921875, "learning_rate": 0.0004993027640378081, "loss": 5.4081, "mean_token_accuracy": 0.185765840113163, "num_tokens": 8200011.0, "step": 3585 }, { "entropy": 5.474026918411255, "epoch": 0.3448607108549472, "grad_norm": 1.0625, "learning_rate": 0.000499300064546858, "loss": 5.4183, "mean_token_accuracy": 0.1868817389011383, "num_tokens": 8211770.0, "step": 3590 }, { "entropy": 5.55191330909729, "epoch": 0.34534101825168106, "grad_norm": 1.140625, "learning_rate": 0.0004992973598483264, "loss": 5.4638, "mean_token_accuracy": 0.18688549250364303, "num_tokens": 8223582.0, "step": 3595 }, { "entropy": 5.575275611877442, "epoch": 0.345821325648415, "grad_norm": 1.109375, "learning_rate": 0.000499294649942276, "loss": 5.5846, "mean_token_accuracy": 0.1825041502714157, "num_tokens": 8234336.0, "step": 3600 }, { "entropy": 5.547464847564697, "epoch": 0.3463016330451489, "grad_norm": 1.0859375, "learning_rate": 0.0004992919348287699, "loss": 5.4941, "mean_token_accuracy": 0.18366153985261918, "num_tokens": 8244605.0, "step": 3605 }, { "entropy": 5.5259942531585695, "epoch": 0.3467819404418828, "grad_norm": 0.98046875, "learning_rate": 0.0004992892145078711, "loss": 5.5254, "mean_token_accuracy": 0.17931086868047713, "num_tokens": 8255876.0, "step": 3610 }, { "entropy": 5.4697678565979, "epoch": 0.3472622478386167, "grad_norm": 1.0703125, "learning_rate": 0.0004992864889796427, "loss": 5.4174, "mean_token_accuracy": 0.18721913993358613, "num_tokens": 8266602.0, "step": 3615 }, { "entropy": 5.546818780899048, "epoch": 0.34774255523535064, "grad_norm": 0.9609375, "learning_rate": 0.0004992837582441481, "loss": 5.4216, "mean_token_accuracy": 0.18347607105970382, "num_tokens": 8279804.0, "step": 3620 }, { "entropy": 5.569514989852905, "epoch": 0.3482228626320845, "grad_norm": 1.03125, "learning_rate": 0.0004992810223014506, "loss": 5.5242, "mean_token_accuracy": 0.1833881989121437, "num_tokens": 8291020.0, "step": 3625 }, { "entropy": 5.5203827857971195, "epoch": 0.34870317002881845, "grad_norm": 1.0390625, "learning_rate": 0.0004992782811516137, "loss": 5.4727, "mean_token_accuracy": 0.18729409873485564, "num_tokens": 8302192.0, "step": 3630 }, { "entropy": 5.496627855300903, "epoch": 0.34918347742555234, "grad_norm": 0.96484375, "learning_rate": 0.0004992755347947011, "loss": 5.4324, "mean_token_accuracy": 0.18265776634216307, "num_tokens": 8313649.0, "step": 3635 }, { "entropy": 5.44870662689209, "epoch": 0.34966378482228627, "grad_norm": 1.0234375, "learning_rate": 0.0004992727832307766, "loss": 5.4304, "mean_token_accuracy": 0.18587879687547684, "num_tokens": 8324694.0, "step": 3640 }, { "entropy": 5.604543972015381, "epoch": 0.35014409221902015, "grad_norm": 1.1015625, "learning_rate": 0.0004992700264599039, "loss": 5.594, "mean_token_accuracy": 0.1727964922785759, "num_tokens": 8336517.0, "step": 3645 }, { "entropy": 5.540855789184571, "epoch": 0.3506243996157541, "grad_norm": 0.98046875, "learning_rate": 0.0004992672644821473, "loss": 5.5425, "mean_token_accuracy": 0.1779757022857666, "num_tokens": 8349001.0, "step": 3650 }, { "entropy": 5.5626523971557615, "epoch": 0.35110470701248797, "grad_norm": 1.046875, "learning_rate": 0.0004992644972975707, "loss": 5.4537, "mean_token_accuracy": 0.1864044651389122, "num_tokens": 8361230.0, "step": 3655 }, { "entropy": 5.394788694381714, "epoch": 0.3515850144092219, "grad_norm": 1.015625, "learning_rate": 0.0004992617249062383, "loss": 5.3924, "mean_token_accuracy": 0.19216873198747636, "num_tokens": 8372159.0, "step": 3660 }, { "entropy": 5.543751049041748, "epoch": 0.3520653218059558, "grad_norm": 1.0546875, "learning_rate": 0.0004992589473082147, "loss": 5.5214, "mean_token_accuracy": 0.18608528524637222, "num_tokens": 8383228.0, "step": 3665 }, { "entropy": 5.509809923171997, "epoch": 0.3525456292026897, "grad_norm": 1.0625, "learning_rate": 0.0004992561645035641, "loss": 5.4561, "mean_token_accuracy": 0.18168068826198577, "num_tokens": 8394582.0, "step": 3670 }, { "entropy": 5.514116191864014, "epoch": 0.3530259365994236, "grad_norm": 0.9921875, "learning_rate": 0.0004992533764923515, "loss": 5.4481, "mean_token_accuracy": 0.18126334249973297, "num_tokens": 8406784.0, "step": 3675 }, { "entropy": 5.483726072311401, "epoch": 0.35350624399615754, "grad_norm": 1.015625, "learning_rate": 0.0004992505832746412, "loss": 5.4286, "mean_token_accuracy": 0.19101243019104003, "num_tokens": 8418405.0, "step": 3680 }, { "entropy": 5.5265562534332275, "epoch": 0.3539865513928914, "grad_norm": 1.0078125, "learning_rate": 0.0004992477848504983, "loss": 5.392, "mean_token_accuracy": 0.18716304898262023, "num_tokens": 8430432.0, "step": 3685 }, { "entropy": 5.479315328598022, "epoch": 0.35446685878962536, "grad_norm": 0.9375, "learning_rate": 0.0004992449812199877, "loss": 5.5635, "mean_token_accuracy": 0.17799893915653228, "num_tokens": 8442423.0, "step": 3690 }, { "entropy": 5.518668079376221, "epoch": 0.3549471661863593, "grad_norm": 0.99609375, "learning_rate": 0.0004992421723831745, "loss": 5.546, "mean_token_accuracy": 0.1842621758580208, "num_tokens": 8454951.0, "step": 3695 }, { "entropy": 5.520323848724365, "epoch": 0.3554274735830932, "grad_norm": 0.97265625, "learning_rate": 0.0004992393583401239, "loss": 5.4033, "mean_token_accuracy": 0.18851898312568666, "num_tokens": 8467758.0, "step": 3700 }, { "entropy": 5.475191354751587, "epoch": 0.3559077809798271, "grad_norm": 1.046875, "learning_rate": 0.0004992365390909014, "loss": 5.4854, "mean_token_accuracy": 0.17992179691791535, "num_tokens": 8479728.0, "step": 3705 }, { "entropy": 5.535838651657104, "epoch": 0.356388088376561, "grad_norm": 1.0078125, "learning_rate": 0.0004992337146355721, "loss": 5.552, "mean_token_accuracy": 0.17727553099393845, "num_tokens": 8492099.0, "step": 3710 }, { "entropy": 5.610863542556762, "epoch": 0.35686839577329493, "grad_norm": 1.015625, "learning_rate": 0.0004992308849742019, "loss": 5.4819, "mean_token_accuracy": 0.17355056405067443, "num_tokens": 8504657.0, "step": 3715 }, { "entropy": 5.48232364654541, "epoch": 0.3573487031700288, "grad_norm": 1.0390625, "learning_rate": 0.0004992280501068563, "loss": 5.4509, "mean_token_accuracy": 0.18914830237627028, "num_tokens": 8514728.0, "step": 3720 }, { "entropy": 5.528886175155639, "epoch": 0.35782901056676275, "grad_norm": 1.09375, "learning_rate": 0.0004992252100336012, "loss": 5.581, "mean_token_accuracy": 0.1833130970597267, "num_tokens": 8525588.0, "step": 3725 }, { "entropy": 5.540911626815796, "epoch": 0.35830931796349663, "grad_norm": 1.125, "learning_rate": 0.0004992223647545027, "loss": 5.527, "mean_token_accuracy": 0.18297800421714783, "num_tokens": 8537468.0, "step": 3730 }, { "entropy": 5.5527503490448, "epoch": 0.35878962536023057, "grad_norm": 0.99609375, "learning_rate": 0.0004992195142696266, "loss": 5.438, "mean_token_accuracy": 0.18914629518985748, "num_tokens": 8548598.0, "step": 3735 }, { "entropy": 5.33068585395813, "epoch": 0.35926993275696445, "grad_norm": 1.0078125, "learning_rate": 0.0004992166585790391, "loss": 5.3396, "mean_token_accuracy": 0.19562919437885284, "num_tokens": 8560301.0, "step": 3740 }, { "entropy": 5.483434391021729, "epoch": 0.3597502401536984, "grad_norm": 1.1171875, "learning_rate": 0.0004992137976828067, "loss": 5.4516, "mean_token_accuracy": 0.18603197634220123, "num_tokens": 8571186.0, "step": 3745 }, { "entropy": 5.484015607833863, "epoch": 0.36023054755043227, "grad_norm": 1.2734375, "learning_rate": 0.0004992109315809955, "loss": 5.4383, "mean_token_accuracy": 0.18905191421508788, "num_tokens": 8580725.0, "step": 3750 }, { "entropy": 5.519361686706543, "epoch": 0.3607108549471662, "grad_norm": 0.93359375, "learning_rate": 0.0004992080602736725, "loss": 5.5532, "mean_token_accuracy": 0.1773756206035614, "num_tokens": 8594598.0, "step": 3755 }, { "entropy": 5.643574905395508, "epoch": 0.3611911623439001, "grad_norm": 0.98828125, "learning_rate": 0.0004992051837609039, "loss": 5.5404, "mean_token_accuracy": 0.17730522602796556, "num_tokens": 8606733.0, "step": 3760 }, { "entropy": 5.508514451980591, "epoch": 0.361671469740634, "grad_norm": 1.015625, "learning_rate": 0.0004992023020427568, "loss": 5.4788, "mean_token_accuracy": 0.18672696501016617, "num_tokens": 8618863.0, "step": 3765 }, { "entropy": 5.3892511367797855, "epoch": 0.3621517771373679, "grad_norm": 1.0390625, "learning_rate": 0.0004991994151192979, "loss": 5.3304, "mean_token_accuracy": 0.18849435597658157, "num_tokens": 8629270.0, "step": 3770 }, { "entropy": 5.4767759323120115, "epoch": 0.36263208453410184, "grad_norm": 1.0546875, "learning_rate": 0.0004991965229905943, "loss": 5.5364, "mean_token_accuracy": 0.18494855612516403, "num_tokens": 8641363.0, "step": 3775 }, { "entropy": 5.6278270244598385, "epoch": 0.3631123919308357, "grad_norm": 1.0625, "learning_rate": 0.0004991936256567133, "loss": 5.4992, "mean_token_accuracy": 0.18451761305332184, "num_tokens": 8653233.0, "step": 3780 }, { "entropy": 5.4851010799407955, "epoch": 0.36359269932756966, "grad_norm": 0.91015625, "learning_rate": 0.000499190723117722, "loss": 5.487, "mean_token_accuracy": 0.17836329340934753, "num_tokens": 8665192.0, "step": 3785 }, { "entropy": 5.579302835464477, "epoch": 0.36407300672430354, "grad_norm": 1.0859375, "learning_rate": 0.0004991878153736877, "loss": 5.5583, "mean_token_accuracy": 0.17446503937244415, "num_tokens": 8677669.0, "step": 3790 }, { "entropy": 5.419927787780762, "epoch": 0.3645533141210375, "grad_norm": 1.0859375, "learning_rate": 0.0004991849024246781, "loss": 5.3676, "mean_token_accuracy": 0.18973670154809952, "num_tokens": 8688002.0, "step": 3795 }, { "entropy": 5.438193988800049, "epoch": 0.36503362151777136, "grad_norm": 1.03125, "learning_rate": 0.0004991819842707608, "loss": 5.4133, "mean_token_accuracy": 0.18962489068508148, "num_tokens": 8698396.0, "step": 3800 }, { "entropy": 5.543167686462402, "epoch": 0.3655139289145053, "grad_norm": 1.0546875, "learning_rate": 0.0004991790609120035, "loss": 5.4297, "mean_token_accuracy": 0.18700562715530394, "num_tokens": 8711135.0, "step": 3805 }, { "entropy": 5.469641494750976, "epoch": 0.3659942363112392, "grad_norm": 1.0390625, "learning_rate": 0.000499176132348474, "loss": 5.4735, "mean_token_accuracy": 0.1897922232747078, "num_tokens": 8723707.0, "step": 3810 }, { "entropy": 5.582857084274292, "epoch": 0.3664745437079731, "grad_norm": 1.1328125, "learning_rate": 0.0004991731985802405, "loss": 5.4338, "mean_token_accuracy": 0.18693850934505463, "num_tokens": 8734193.0, "step": 3815 }, { "entropy": 5.444149160385132, "epoch": 0.366954851104707, "grad_norm": 1.1171875, "learning_rate": 0.0004991702596073708, "loss": 5.4841, "mean_token_accuracy": 0.18134361505508423, "num_tokens": 8745619.0, "step": 3820 }, { "entropy": 5.426347923278809, "epoch": 0.36743515850144093, "grad_norm": 1.015625, "learning_rate": 0.0004991673154299335, "loss": 5.4231, "mean_token_accuracy": 0.18122087568044662, "num_tokens": 8757331.0, "step": 3825 }, { "entropy": 5.515204238891601, "epoch": 0.3679154658981748, "grad_norm": 0.98046875, "learning_rate": 0.0004991643660479967, "loss": 5.428, "mean_token_accuracy": 0.1868494287133217, "num_tokens": 8768840.0, "step": 3830 }, { "entropy": 5.460073804855346, "epoch": 0.36839577329490875, "grad_norm": 1.0, "learning_rate": 0.0004991614114616289, "loss": 5.3818, "mean_token_accuracy": 0.18779707103967666, "num_tokens": 8781214.0, "step": 3835 }, { "entropy": 5.510246324539184, "epoch": 0.3688760806916426, "grad_norm": 0.98046875, "learning_rate": 0.0004991584516708988, "loss": 5.4477, "mean_token_accuracy": 0.18548956960439683, "num_tokens": 8791645.0, "step": 3840 }, { "entropy": 5.5942995071411135, "epoch": 0.36935638808837656, "grad_norm": 1.078125, "learning_rate": 0.0004991554866758751, "loss": 5.6333, "mean_token_accuracy": 0.1739022307097912, "num_tokens": 8803286.0, "step": 3845 }, { "entropy": 5.493673467636109, "epoch": 0.36983669548511044, "grad_norm": 1.1015625, "learning_rate": 0.0004991525164766265, "loss": 5.4163, "mean_token_accuracy": 0.1872221603989601, "num_tokens": 8814207.0, "step": 3850 }, { "entropy": 5.503255462646484, "epoch": 0.3703170028818444, "grad_norm": 0.9609375, "learning_rate": 0.0004991495410732222, "loss": 5.4683, "mean_token_accuracy": 0.17725101560354234, "num_tokens": 8825540.0, "step": 3855 }, { "entropy": 5.5069482803344725, "epoch": 0.37079731027857826, "grad_norm": 1.015625, "learning_rate": 0.0004991465604657311, "loss": 5.5937, "mean_token_accuracy": 0.17322031259536744, "num_tokens": 8838182.0, "step": 3860 }, { "entropy": 5.526088094711303, "epoch": 0.3712776176753122, "grad_norm": 1.015625, "learning_rate": 0.0004991435746542224, "loss": 5.4654, "mean_token_accuracy": 0.18988653868436814, "num_tokens": 8850211.0, "step": 3865 }, { "entropy": 5.439452648162842, "epoch": 0.37175792507204614, "grad_norm": 0.97265625, "learning_rate": 0.0004991405836387655, "loss": 5.5032, "mean_token_accuracy": 0.18108827471733094, "num_tokens": 8862804.0, "step": 3870 }, { "entropy": 5.529762125015258, "epoch": 0.37223823246878, "grad_norm": 1.078125, "learning_rate": 0.0004991375874194298, "loss": 5.4602, "mean_token_accuracy": 0.17960784435272217, "num_tokens": 8874112.0, "step": 3875 }, { "entropy": 5.469674205780029, "epoch": 0.37271853986551395, "grad_norm": 1.03125, "learning_rate": 0.000499134585996285, "loss": 5.477, "mean_token_accuracy": 0.18614101260900498, "num_tokens": 8885114.0, "step": 3880 }, { "entropy": 5.554774141311645, "epoch": 0.37319884726224783, "grad_norm": 1.046875, "learning_rate": 0.0004991315793694004, "loss": 5.3691, "mean_token_accuracy": 0.18807282894849778, "num_tokens": 8895555.0, "step": 3885 }, { "entropy": 5.405085754394531, "epoch": 0.37367915465898177, "grad_norm": 1.0234375, "learning_rate": 0.0004991285675388463, "loss": 5.3765, "mean_token_accuracy": 0.19634046405553818, "num_tokens": 8906073.0, "step": 3890 }, { "entropy": 5.501630163192749, "epoch": 0.37415946205571565, "grad_norm": 1.125, "learning_rate": 0.0004991255505046922, "loss": 5.5188, "mean_token_accuracy": 0.1789945885539055, "num_tokens": 8916587.0, "step": 3895 }, { "entropy": 5.550557231903076, "epoch": 0.3746397694524496, "grad_norm": 1.0546875, "learning_rate": 0.0004991225282670083, "loss": 5.4113, "mean_token_accuracy": 0.1861289381980896, "num_tokens": 8927923.0, "step": 3900 }, { "entropy": 5.382868242263794, "epoch": 0.37512007684918347, "grad_norm": 1.078125, "learning_rate": 0.000499119500825865, "loss": 5.4579, "mean_token_accuracy": 0.18377629071474075, "num_tokens": 8939939.0, "step": 3905 }, { "entropy": 5.397466945648193, "epoch": 0.3756003842459174, "grad_norm": 1.09375, "learning_rate": 0.0004991164681813323, "loss": 5.4378, "mean_token_accuracy": 0.19209783971309663, "num_tokens": 8951748.0, "step": 3910 }, { "entropy": 5.485667037963867, "epoch": 0.3760806916426513, "grad_norm": 1.015625, "learning_rate": 0.0004991134303334807, "loss": 5.3588, "mean_token_accuracy": 0.19007459729909898, "num_tokens": 8962922.0, "step": 3915 }, { "entropy": 5.372178030014038, "epoch": 0.3765609990393852, "grad_norm": 1.03125, "learning_rate": 0.0004991103872823807, "loss": 5.3442, "mean_token_accuracy": 0.19452154785394668, "num_tokens": 8974013.0, "step": 3920 }, { "entropy": 5.436591958999633, "epoch": 0.3770413064361191, "grad_norm": 1.015625, "learning_rate": 0.000499107339028103, "loss": 5.4262, "mean_token_accuracy": 0.18169266134500503, "num_tokens": 8986032.0, "step": 3925 }, { "entropy": 5.542058515548706, "epoch": 0.37752161383285304, "grad_norm": 1.0546875, "learning_rate": 0.0004991042855707184, "loss": 5.4187, "mean_token_accuracy": 0.1796349912881851, "num_tokens": 8996889.0, "step": 3930 }, { "entropy": 5.436617517471314, "epoch": 0.3780019212295869, "grad_norm": 1.0546875, "learning_rate": 0.0004991012269102977, "loss": 5.3992, "mean_token_accuracy": 0.18429471999406816, "num_tokens": 9007594.0, "step": 3935 }, { "entropy": 5.426474618911743, "epoch": 0.37848222862632086, "grad_norm": 1.0859375, "learning_rate": 0.0004990981630469119, "loss": 5.402, "mean_token_accuracy": 0.18193352967500687, "num_tokens": 9018097.0, "step": 3940 }, { "entropy": 5.5093968391418455, "epoch": 0.37896253602305474, "grad_norm": 1.0234375, "learning_rate": 0.0004990950939806323, "loss": 5.5113, "mean_token_accuracy": 0.18117111474275588, "num_tokens": 9029554.0, "step": 3945 }, { "entropy": 5.489337825775147, "epoch": 0.3794428434197887, "grad_norm": 0.99609375, "learning_rate": 0.00049909201971153, "loss": 5.3772, "mean_token_accuracy": 0.1829820305109024, "num_tokens": 9042518.0, "step": 3950 }, { "entropy": 5.421378660202026, "epoch": 0.37992315081652256, "grad_norm": 1.1015625, "learning_rate": 0.0004990889402396763, "loss": 5.4316, "mean_token_accuracy": 0.18639881759881974, "num_tokens": 9054524.0, "step": 3955 }, { "entropy": 5.510490798950196, "epoch": 0.3804034582132565, "grad_norm": 0.9921875, "learning_rate": 0.0004990858555651431, "loss": 5.4016, "mean_token_accuracy": 0.18468015938997268, "num_tokens": 9065375.0, "step": 3960 }, { "entropy": 5.44808177947998, "epoch": 0.3808837656099904, "grad_norm": 1.109375, "learning_rate": 0.0004990827656880015, "loss": 5.3509, "mean_token_accuracy": 0.1859322890639305, "num_tokens": 9076338.0, "step": 3965 }, { "entropy": 5.432799911499023, "epoch": 0.3813640730067243, "grad_norm": 0.96875, "learning_rate": 0.0004990796706083235, "loss": 5.4011, "mean_token_accuracy": 0.18659975230693818, "num_tokens": 9088407.0, "step": 3970 }, { "entropy": 5.426470470428467, "epoch": 0.3818443804034582, "grad_norm": 1.03125, "learning_rate": 0.0004990765703261809, "loss": 5.3649, "mean_token_accuracy": 0.18807975053787232, "num_tokens": 9099833.0, "step": 3975 }, { "entropy": 5.350304222106933, "epoch": 0.38232468780019213, "grad_norm": 0.98046875, "learning_rate": 0.0004990734648416458, "loss": 5.3388, "mean_token_accuracy": 0.189335997402668, "num_tokens": 9111126.0, "step": 3980 }, { "entropy": 5.505539417266846, "epoch": 0.382804995196926, "grad_norm": 1.09375, "learning_rate": 0.0004990703541547901, "loss": 5.4548, "mean_token_accuracy": 0.1886373370885849, "num_tokens": 9121979.0, "step": 3985 }, { "entropy": 5.520917081832886, "epoch": 0.38328530259365995, "grad_norm": 1.109375, "learning_rate": 0.0004990672382656863, "loss": 5.4535, "mean_token_accuracy": 0.18644375950098038, "num_tokens": 9132929.0, "step": 3990 }, { "entropy": 5.485851383209228, "epoch": 0.38376560999039383, "grad_norm": 1.0, "learning_rate": 0.0004990641171744064, "loss": 5.4111, "mean_token_accuracy": 0.1882080391049385, "num_tokens": 9143903.0, "step": 3995 }, { "entropy": 5.495297384262085, "epoch": 0.38424591738712777, "grad_norm": 1.1484375, "learning_rate": 0.0004990609908810231, "loss": 5.5045, "mean_token_accuracy": 0.18192221075296403, "num_tokens": 9154416.0, "step": 4000 }, { "entropy": 5.513756942749024, "epoch": 0.38472622478386165, "grad_norm": 1.078125, "learning_rate": 0.0004990578593856089, "loss": 5.4805, "mean_token_accuracy": 0.18242392241954802, "num_tokens": 9165613.0, "step": 4005 }, { "entropy": 5.4664655208587645, "epoch": 0.3852065321805956, "grad_norm": 0.99609375, "learning_rate": 0.0004990547226882366, "loss": 5.433, "mean_token_accuracy": 0.18787842243909836, "num_tokens": 9177884.0, "step": 4010 }, { "entropy": 5.5449103832244875, "epoch": 0.38568683957732947, "grad_norm": 1.015625, "learning_rate": 0.0004990515807889788, "loss": 5.5669, "mean_token_accuracy": 0.17467134743928908, "num_tokens": 9190041.0, "step": 4015 }, { "entropy": 5.556881046295166, "epoch": 0.3861671469740634, "grad_norm": 1.0546875, "learning_rate": 0.0004990484336879087, "loss": 5.4402, "mean_token_accuracy": 0.18740091025829314, "num_tokens": 9202390.0, "step": 4020 }, { "entropy": 5.409300327301025, "epoch": 0.3866474543707973, "grad_norm": 1.09375, "learning_rate": 0.0004990452813850992, "loss": 5.4373, "mean_token_accuracy": 0.18635576069355012, "num_tokens": 9213437.0, "step": 4025 }, { "entropy": 5.554971408843994, "epoch": 0.3871277617675312, "grad_norm": 0.9765625, "learning_rate": 0.0004990421238806236, "loss": 5.517, "mean_token_accuracy": 0.17564513981342317, "num_tokens": 9226310.0, "step": 4030 }, { "entropy": 5.530429458618164, "epoch": 0.38760806916426516, "grad_norm": 1.0703125, "learning_rate": 0.0004990389611745551, "loss": 5.4495, "mean_token_accuracy": 0.1819504901766777, "num_tokens": 9236271.0, "step": 4035 }, { "entropy": 5.516104078292846, "epoch": 0.38808837656099904, "grad_norm": 1.1171875, "learning_rate": 0.0004990357932669672, "loss": 5.5245, "mean_token_accuracy": 0.18500009030103684, "num_tokens": 9247755.0, "step": 4040 }, { "entropy": 5.464123487472534, "epoch": 0.388568683957733, "grad_norm": 1.0703125, "learning_rate": 0.0004990326201579335, "loss": 5.361, "mean_token_accuracy": 0.19129124879837037, "num_tokens": 9259821.0, "step": 4045 }, { "entropy": 5.4668073654174805, "epoch": 0.38904899135446686, "grad_norm": 1.171875, "learning_rate": 0.0004990294418475274, "loss": 5.4631, "mean_token_accuracy": 0.18641964942216874, "num_tokens": 9270663.0, "step": 4050 }, { "entropy": 5.465627670288086, "epoch": 0.3895292987512008, "grad_norm": 1.234375, "learning_rate": 0.0004990262583358231, "loss": 5.4879, "mean_token_accuracy": 0.17998379915952684, "num_tokens": 9282588.0, "step": 4055 }, { "entropy": 5.510502290725708, "epoch": 0.3900096061479347, "grad_norm": 1.1015625, "learning_rate": 0.0004990230696228943, "loss": 5.4397, "mean_token_accuracy": 0.17829088270664215, "num_tokens": 9293368.0, "step": 4060 }, { "entropy": 5.477728748321534, "epoch": 0.3904899135446686, "grad_norm": 1.0234375, "learning_rate": 0.0004990198757088149, "loss": 5.5128, "mean_token_accuracy": 0.1811017781496048, "num_tokens": 9305962.0, "step": 4065 }, { "entropy": 5.508330774307251, "epoch": 0.3909702209414025, "grad_norm": 1.0625, "learning_rate": 0.0004990166765936593, "loss": 5.393, "mean_token_accuracy": 0.19244694262742995, "num_tokens": 9317955.0, "step": 4070 }, { "entropy": 5.450256824493408, "epoch": 0.3914505283381364, "grad_norm": 1.0078125, "learning_rate": 0.0004990134722775016, "loss": 5.3934, "mean_token_accuracy": 0.19047792106866837, "num_tokens": 9329491.0, "step": 4075 }, { "entropy": 5.451663637161255, "epoch": 0.3919308357348703, "grad_norm": 1.03125, "learning_rate": 0.0004990102627604162, "loss": 5.5273, "mean_token_accuracy": 0.19028781056404115, "num_tokens": 9341612.0, "step": 4080 }, { "entropy": 5.524235773086548, "epoch": 0.39241114313160425, "grad_norm": 1.046875, "learning_rate": 0.0004990070480424778, "loss": 5.458, "mean_token_accuracy": 0.18043633103370665, "num_tokens": 9352302.0, "step": 4085 }, { "entropy": 5.440912199020386, "epoch": 0.3928914505283381, "grad_norm": 0.97265625, "learning_rate": 0.0004990038281237608, "loss": 5.3919, "mean_token_accuracy": 0.1852226436138153, "num_tokens": 9363303.0, "step": 4090 }, { "entropy": 5.433840227127075, "epoch": 0.39337175792507206, "grad_norm": 1.0546875, "learning_rate": 0.0004990006030043401, "loss": 5.3732, "mean_token_accuracy": 0.1849522888660431, "num_tokens": 9375878.0, "step": 4095 }, { "entropy": 5.470492124557495, "epoch": 0.39385206532180594, "grad_norm": 1.0625, "learning_rate": 0.0004989973726842906, "loss": 5.4145, "mean_token_accuracy": 0.18103147149086, "num_tokens": 9388342.0, "step": 4100 }, { "entropy": 5.44459342956543, "epoch": 0.3943323727185399, "grad_norm": 1.1875, "learning_rate": 0.0004989941371636872, "loss": 5.3549, "mean_token_accuracy": 0.1901955187320709, "num_tokens": 9399047.0, "step": 4105 }, { "entropy": 5.449139881134033, "epoch": 0.39481268011527376, "grad_norm": 1.0703125, "learning_rate": 0.0004989908964426051, "loss": 5.4342, "mean_token_accuracy": 0.18933464139699935, "num_tokens": 9410172.0, "step": 4110 }, { "entropy": 5.547493505477905, "epoch": 0.3952929875120077, "grad_norm": 1.0703125, "learning_rate": 0.0004989876505211194, "loss": 5.5794, "mean_token_accuracy": 0.17717085629701615, "num_tokens": 9422287.0, "step": 4115 }, { "entropy": 5.5754584789276125, "epoch": 0.3957732949087416, "grad_norm": 1.046875, "learning_rate": 0.0004989843993993056, "loss": 5.44, "mean_token_accuracy": 0.18759053498506545, "num_tokens": 9433709.0, "step": 4120 }, { "entropy": 5.341240167617798, "epoch": 0.3962536023054755, "grad_norm": 1.125, "learning_rate": 0.0004989811430772392, "loss": 5.3199, "mean_token_accuracy": 0.189169280230999, "num_tokens": 9445138.0, "step": 4125 }, { "entropy": 5.4137170791625975, "epoch": 0.3967339097022094, "grad_norm": 1.1484375, "learning_rate": 0.0004989778815549957, "loss": 5.4579, "mean_token_accuracy": 0.1827932521700859, "num_tokens": 9455263.0, "step": 4130 }, { "entropy": 5.533003664016723, "epoch": 0.39721421709894333, "grad_norm": 1.15625, "learning_rate": 0.0004989746148326508, "loss": 5.4184, "mean_token_accuracy": 0.18644048422574996, "num_tokens": 9465491.0, "step": 4135 }, { "entropy": 5.372505331039429, "epoch": 0.3976945244956772, "grad_norm": 0.984375, "learning_rate": 0.0004989713429102805, "loss": 5.3821, "mean_token_accuracy": 0.1837732046842575, "num_tokens": 9477601.0, "step": 4140 }, { "entropy": 5.426533985137939, "epoch": 0.39817483189241115, "grad_norm": 1.0, "learning_rate": 0.0004989680657879607, "loss": 5.4426, "mean_token_accuracy": 0.18387902528047562, "num_tokens": 9489385.0, "step": 4145 }, { "entropy": 5.473710680007935, "epoch": 0.39865513928914503, "grad_norm": 1.03125, "learning_rate": 0.0004989647834657675, "loss": 5.3249, "mean_token_accuracy": 0.19230013936758042, "num_tokens": 9501131.0, "step": 4150 }, { "entropy": 5.420683908462524, "epoch": 0.39913544668587897, "grad_norm": 0.96875, "learning_rate": 0.000498961495943777, "loss": 5.4614, "mean_token_accuracy": 0.18854968398809432, "num_tokens": 9513094.0, "step": 4155 }, { "entropy": 5.577786207199097, "epoch": 0.39961575408261285, "grad_norm": 1.1015625, "learning_rate": 0.0004989582032220656, "loss": 5.5832, "mean_token_accuracy": 0.17526223361492158, "num_tokens": 9524538.0, "step": 4160 }, { "entropy": 5.522935295104981, "epoch": 0.4000960614793468, "grad_norm": 1.171875, "learning_rate": 0.0004989549053007096, "loss": 5.3961, "mean_token_accuracy": 0.19305580705404282, "num_tokens": 9535284.0, "step": 4165 }, { "entropy": 5.462124681472778, "epoch": 0.40057636887608067, "grad_norm": 1.1328125, "learning_rate": 0.0004989516021797858, "loss": 5.471, "mean_token_accuracy": 0.18390081077814102, "num_tokens": 9546472.0, "step": 4170 }, { "entropy": 5.499347305297851, "epoch": 0.4010566762728146, "grad_norm": 1.015625, "learning_rate": 0.000498948293859371, "loss": 5.4605, "mean_token_accuracy": 0.18212546557188034, "num_tokens": 9558358.0, "step": 4175 }, { "entropy": 5.496229076385498, "epoch": 0.4015369836695485, "grad_norm": 0.9765625, "learning_rate": 0.0004989449803395415, "loss": 5.4959, "mean_token_accuracy": 0.18471186012029647, "num_tokens": 9570653.0, "step": 4180 }, { "entropy": 5.556100845336914, "epoch": 0.4020172910662824, "grad_norm": 1.015625, "learning_rate": 0.0004989416616203747, "loss": 5.4386, "mean_token_accuracy": 0.18714374899864197, "num_tokens": 9582150.0, "step": 4185 }, { "entropy": 5.4823558807373045, "epoch": 0.4024975984630163, "grad_norm": 1.0546875, "learning_rate": 0.0004989383377019476, "loss": 5.38, "mean_token_accuracy": 0.19184014648199083, "num_tokens": 9592462.0, "step": 4190 }, { "entropy": 5.375227689743042, "epoch": 0.40297790585975024, "grad_norm": 1.0390625, "learning_rate": 0.0004989350085843371, "loss": 5.374, "mean_token_accuracy": 0.18951477408409118, "num_tokens": 9604027.0, "step": 4195 }, { "entropy": 5.387249088287353, "epoch": 0.4034582132564842, "grad_norm": 0.9609375, "learning_rate": 0.0004989316742676207, "loss": 5.3733, "mean_token_accuracy": 0.19109322130680084, "num_tokens": 9616325.0, "step": 4200 }, { "entropy": 5.396379852294922, "epoch": 0.40393852065321806, "grad_norm": 1.046875, "learning_rate": 0.0004989283347518757, "loss": 5.3338, "mean_token_accuracy": 0.18609212040901185, "num_tokens": 9628133.0, "step": 4205 }, { "entropy": 5.579652786254883, "epoch": 0.404418828049952, "grad_norm": 1.0625, "learning_rate": 0.0004989249900371797, "loss": 5.5629, "mean_token_accuracy": 0.17861852645874024, "num_tokens": 9639686.0, "step": 4210 }, { "entropy": 5.429533529281616, "epoch": 0.4048991354466859, "grad_norm": 1.09375, "learning_rate": 0.0004989216401236103, "loss": 5.4184, "mean_token_accuracy": 0.18496839255094527, "num_tokens": 9650222.0, "step": 4215 }, { "entropy": 5.367856836318969, "epoch": 0.4053794428434198, "grad_norm": 1.1171875, "learning_rate": 0.0004989182850112455, "loss": 5.3417, "mean_token_accuracy": 0.1997272178530693, "num_tokens": 9661792.0, "step": 4220 }, { "entropy": 5.516646957397461, "epoch": 0.4058597502401537, "grad_norm": 1.0546875, "learning_rate": 0.0004989149247001629, "loss": 5.4497, "mean_token_accuracy": 0.18383817970752717, "num_tokens": 9673000.0, "step": 4225 }, { "entropy": 5.532714462280273, "epoch": 0.40634005763688763, "grad_norm": 1.0234375, "learning_rate": 0.0004989115591904407, "loss": 5.3975, "mean_token_accuracy": 0.1901587262749672, "num_tokens": 9685253.0, "step": 4230 }, { "entropy": 5.391170501708984, "epoch": 0.4068203650336215, "grad_norm": 1.0546875, "learning_rate": 0.0004989081884821569, "loss": 5.4004, "mean_token_accuracy": 0.18320820480585098, "num_tokens": 9697245.0, "step": 4235 }, { "entropy": 5.450364589691162, "epoch": 0.40730067243035545, "grad_norm": 1.0390625, "learning_rate": 0.0004989048125753899, "loss": 5.4156, "mean_token_accuracy": 0.18504445552825927, "num_tokens": 9710095.0, "step": 4240 }, { "entropy": 5.407678937911987, "epoch": 0.40778097982708933, "grad_norm": 1.0625, "learning_rate": 0.000498901431470218, "loss": 5.2919, "mean_token_accuracy": 0.19396644979715347, "num_tokens": 9721488.0, "step": 4245 }, { "entropy": 5.2491998195648195, "epoch": 0.40826128722382327, "grad_norm": 0.98046875, "learning_rate": 0.0004988980451667198, "loss": 5.255, "mean_token_accuracy": 0.19170391261577607, "num_tokens": 9733280.0, "step": 4250 }, { "entropy": 5.455927753448487, "epoch": 0.40874159462055715, "grad_norm": 0.97265625, "learning_rate": 0.0004988946536649737, "loss": 5.3863, "mean_token_accuracy": 0.18661659061908722, "num_tokens": 9744514.0, "step": 4255 }, { "entropy": 5.413423871994018, "epoch": 0.4092219020172911, "grad_norm": 1.1015625, "learning_rate": 0.0004988912569650585, "loss": 5.3752, "mean_token_accuracy": 0.19112140834331512, "num_tokens": 9754931.0, "step": 4260 }, { "entropy": 5.389836359024048, "epoch": 0.40970220941402496, "grad_norm": 1.046875, "learning_rate": 0.0004988878550670533, "loss": 5.3725, "mean_token_accuracy": 0.19297343790531157, "num_tokens": 9765635.0, "step": 4265 }, { "entropy": 5.508016872406006, "epoch": 0.4101825168107589, "grad_norm": 1.0546875, "learning_rate": 0.0004988844479710369, "loss": 5.4792, "mean_token_accuracy": 0.18072771430015563, "num_tokens": 9777512.0, "step": 4270 }, { "entropy": 5.541130542755127, "epoch": 0.4106628242074928, "grad_norm": 1.015625, "learning_rate": 0.0004988810356770884, "loss": 5.4764, "mean_token_accuracy": 0.1744610548019409, "num_tokens": 9790128.0, "step": 4275 }, { "entropy": 5.451146841049194, "epoch": 0.4111431316042267, "grad_norm": 0.98046875, "learning_rate": 0.000498877618185287, "loss": 5.4112, "mean_token_accuracy": 0.19078320413827896, "num_tokens": 9802549.0, "step": 4280 }, { "entropy": 5.365971374511719, "epoch": 0.4116234390009606, "grad_norm": 1.0859375, "learning_rate": 0.0004988741954957121, "loss": 5.3574, "mean_token_accuracy": 0.18884203881025313, "num_tokens": 9813736.0, "step": 4285 }, { "entropy": 5.380771827697754, "epoch": 0.41210374639769454, "grad_norm": 1.1171875, "learning_rate": 0.0004988707676084432, "loss": 5.3584, "mean_token_accuracy": 0.19705824106931685, "num_tokens": 9823785.0, "step": 4290 }, { "entropy": 5.432324981689453, "epoch": 0.4125840537944284, "grad_norm": 1.0625, "learning_rate": 0.0004988673345235597, "loss": 5.3197, "mean_token_accuracy": 0.1934140741825104, "num_tokens": 9834910.0, "step": 4295 }, { "entropy": 5.437625408172607, "epoch": 0.41306436119116235, "grad_norm": 1.0, "learning_rate": 0.0004988638962411416, "loss": 5.363, "mean_token_accuracy": 0.18818716257810592, "num_tokens": 9845593.0, "step": 4300 }, { "entropy": 5.392855072021485, "epoch": 0.41354466858789624, "grad_norm": 0.9765625, "learning_rate": 0.0004988604527612685, "loss": 5.2697, "mean_token_accuracy": 0.2009762555360794, "num_tokens": 9856763.0, "step": 4305 }, { "entropy": 5.503190565109253, "epoch": 0.4140249759846302, "grad_norm": 1.0625, "learning_rate": 0.0004988570040840205, "loss": 5.4945, "mean_token_accuracy": 0.18051616251468658, "num_tokens": 9869528.0, "step": 4310 }, { "entropy": 5.407845735549927, "epoch": 0.41450528338136405, "grad_norm": 1.09375, "learning_rate": 0.0004988535502094774, "loss": 5.3958, "mean_token_accuracy": 0.18804680705070495, "num_tokens": 9881170.0, "step": 4315 }, { "entropy": 5.461514711380005, "epoch": 0.414985590778098, "grad_norm": 1.0859375, "learning_rate": 0.0004988500911377198, "loss": 5.4803, "mean_token_accuracy": 0.18439086973667146, "num_tokens": 9893119.0, "step": 4320 }, { "entropy": 5.368999385833741, "epoch": 0.41546589817483187, "grad_norm": 1.015625, "learning_rate": 0.0004988466268688276, "loss": 5.3154, "mean_token_accuracy": 0.19932861626148224, "num_tokens": 9905339.0, "step": 4325 }, { "entropy": 5.482837677001953, "epoch": 0.4159462055715658, "grad_norm": 0.9765625, "learning_rate": 0.0004988431574028814, "loss": 5.4002, "mean_token_accuracy": 0.19202394932508468, "num_tokens": 9917500.0, "step": 4330 }, { "entropy": 5.466025495529175, "epoch": 0.4164265129682997, "grad_norm": 1.1875, "learning_rate": 0.0004988396827399618, "loss": 5.4808, "mean_token_accuracy": 0.18326758295297624, "num_tokens": 9929667.0, "step": 4335 }, { "entropy": 5.48503007888794, "epoch": 0.4169068203650336, "grad_norm": 1.03125, "learning_rate": 0.0004988362028801495, "loss": 5.4048, "mean_token_accuracy": 0.18796583414077758, "num_tokens": 9941102.0, "step": 4340 }, { "entropy": 5.412125444412231, "epoch": 0.4173871277617675, "grad_norm": 1.078125, "learning_rate": 0.0004988327178235253, "loss": 5.3058, "mean_token_accuracy": 0.1973835989832878, "num_tokens": 9951986.0, "step": 4345 }, { "entropy": 5.383547782897949, "epoch": 0.41786743515850144, "grad_norm": 0.95703125, "learning_rate": 0.0004988292275701699, "loss": 5.3119, "mean_token_accuracy": 0.19086995273828505, "num_tokens": 9964486.0, "step": 4350 }, { "entropy": 5.406881952285767, "epoch": 0.4183477425552353, "grad_norm": 1.046875, "learning_rate": 0.0004988257321201646, "loss": 5.4094, "mean_token_accuracy": 0.1860354095697403, "num_tokens": 9975909.0, "step": 4355 }, { "entropy": 5.473488092422485, "epoch": 0.41882804995196926, "grad_norm": 1.078125, "learning_rate": 0.0004988222314735902, "loss": 5.4171, "mean_token_accuracy": 0.18617332428693772, "num_tokens": 9986951.0, "step": 4360 }, { "entropy": 5.517805814743042, "epoch": 0.41930835734870314, "grad_norm": 1.1484375, "learning_rate": 0.0004988187256305284, "loss": 5.5057, "mean_token_accuracy": 0.1791812226176262, "num_tokens": 9999234.0, "step": 4365 }, { "entropy": 5.405948638916016, "epoch": 0.4197886647454371, "grad_norm": 1.046875, "learning_rate": 0.0004988152145910603, "loss": 5.3792, "mean_token_accuracy": 0.1959477871656418, "num_tokens": 10010178.0, "step": 4370 }, { "entropy": 5.391415548324585, "epoch": 0.420268972142171, "grad_norm": 1.0859375, "learning_rate": 0.0004988116983552675, "loss": 5.3218, "mean_token_accuracy": 0.18838354647159578, "num_tokens": 10021183.0, "step": 4375 }, { "entropy": 5.590651321411133, "epoch": 0.4207492795389049, "grad_norm": 1.0390625, "learning_rate": 0.0004988081769232317, "loss": 5.6204, "mean_token_accuracy": 0.17428677082061766, "num_tokens": 10033686.0, "step": 4380 }, { "entropy": 5.384156322479248, "epoch": 0.42122958693563883, "grad_norm": 1.0859375, "learning_rate": 0.0004988046502950346, "loss": 5.3079, "mean_token_accuracy": 0.187077134847641, "num_tokens": 10045923.0, "step": 4385 }, { "entropy": 5.270208120346069, "epoch": 0.4217098943323727, "grad_norm": 0.99609375, "learning_rate": 0.000498801118470758, "loss": 5.2402, "mean_token_accuracy": 0.19899773895740508, "num_tokens": 10057196.0, "step": 4390 }, { "entropy": 5.409784030914307, "epoch": 0.42219020172910665, "grad_norm": 1.109375, "learning_rate": 0.000498797581450484, "loss": 5.4295, "mean_token_accuracy": 0.18354050666093827, "num_tokens": 10069655.0, "step": 4395 }, { "entropy": 5.448616600036621, "epoch": 0.42267050912584053, "grad_norm": 1.234375, "learning_rate": 0.0004987940392342948, "loss": 5.3095, "mean_token_accuracy": 0.19377071112394334, "num_tokens": 10080876.0, "step": 4400 }, { "entropy": 5.421027898788452, "epoch": 0.42315081652257447, "grad_norm": 0.9921875, "learning_rate": 0.0004987904918222726, "loss": 5.415, "mean_token_accuracy": 0.18513490557670592, "num_tokens": 10091986.0, "step": 4405 }, { "entropy": 5.5097509860992435, "epoch": 0.42363112391930835, "grad_norm": 1.109375, "learning_rate": 0.0004987869392144996, "loss": 5.499, "mean_token_accuracy": 0.18492884635925294, "num_tokens": 10104027.0, "step": 4410 }, { "entropy": 5.425499534606933, "epoch": 0.4241114313160423, "grad_norm": 1.09375, "learning_rate": 0.0004987833814110584, "loss": 5.3567, "mean_token_accuracy": 0.1865203857421875, "num_tokens": 10114665.0, "step": 4415 }, { "entropy": 5.385516119003296, "epoch": 0.42459173871277617, "grad_norm": 1.015625, "learning_rate": 0.0004987798184120316, "loss": 5.3742, "mean_token_accuracy": 0.19014959633350373, "num_tokens": 10126032.0, "step": 4420 }, { "entropy": 5.512171411514283, "epoch": 0.4250720461095101, "grad_norm": 1.125, "learning_rate": 0.0004987762502175018, "loss": 5.4288, "mean_token_accuracy": 0.1829407036304474, "num_tokens": 10137256.0, "step": 4425 }, { "entropy": 5.3579336643219, "epoch": 0.425552353506244, "grad_norm": 1.09375, "learning_rate": 0.000498772676827552, "loss": 5.3117, "mean_token_accuracy": 0.1916539713740349, "num_tokens": 10149445.0, "step": 4430 }, { "entropy": 5.474416351318359, "epoch": 0.4260326609029779, "grad_norm": 1.03125, "learning_rate": 0.0004987690982422652, "loss": 5.4495, "mean_token_accuracy": 0.18037094324827194, "num_tokens": 10161607.0, "step": 4435 }, { "entropy": 5.448618030548095, "epoch": 0.4265129682997118, "grad_norm": 1.0703125, "learning_rate": 0.0004987655144617243, "loss": 5.4681, "mean_token_accuracy": 0.18403236269950868, "num_tokens": 10173184.0, "step": 4440 }, { "entropy": 5.4251587867736815, "epoch": 0.42699327569644574, "grad_norm": 1.0234375, "learning_rate": 0.0004987619254860126, "loss": 5.328, "mean_token_accuracy": 0.19698531180620193, "num_tokens": 10184617.0, "step": 4445 }, { "entropy": 5.4672339916229244, "epoch": 0.4274735830931796, "grad_norm": 1.0703125, "learning_rate": 0.0004987583313152134, "loss": 5.3568, "mean_token_accuracy": 0.18906597346067427, "num_tokens": 10195608.0, "step": 4450 }, { "entropy": 5.386989736557007, "epoch": 0.42795389048991356, "grad_norm": 1.0859375, "learning_rate": 0.0004987547319494104, "loss": 5.4529, "mean_token_accuracy": 0.18423379063606263, "num_tokens": 10206763.0, "step": 4455 }, { "entropy": 5.486404466629028, "epoch": 0.42843419788664744, "grad_norm": 0.99609375, "learning_rate": 0.0004987511273886867, "loss": 5.3933, "mean_token_accuracy": 0.1908423647284508, "num_tokens": 10218714.0, "step": 4460 }, { "entropy": 5.427644729614258, "epoch": 0.4289145052833814, "grad_norm": 1.15625, "learning_rate": 0.0004987475176331263, "loss": 5.415, "mean_token_accuracy": 0.18401106595993041, "num_tokens": 10229902.0, "step": 4465 }, { "entropy": 5.423227453231812, "epoch": 0.42939481268011526, "grad_norm": 1.0390625, "learning_rate": 0.0004987439026828129, "loss": 5.288, "mean_token_accuracy": 0.19139131158590317, "num_tokens": 10241578.0, "step": 4470 }, { "entropy": 5.324700498580933, "epoch": 0.4298751200768492, "grad_norm": 1.1796875, "learning_rate": 0.0004987402825378305, "loss": 5.2595, "mean_token_accuracy": 0.19443607479333877, "num_tokens": 10252109.0, "step": 4475 }, { "entropy": 5.429213285446167, "epoch": 0.4303554274735831, "grad_norm": 0.99609375, "learning_rate": 0.0004987366571982631, "loss": 5.4252, "mean_token_accuracy": 0.18883214443922042, "num_tokens": 10263357.0, "step": 4480 }, { "entropy": 5.487810945510864, "epoch": 0.430835734870317, "grad_norm": 1.078125, "learning_rate": 0.0004987330266641948, "loss": 5.4308, "mean_token_accuracy": 0.18471152931451798, "num_tokens": 10275536.0, "step": 4485 }, { "entropy": 5.453687620162964, "epoch": 0.4313160422670509, "grad_norm": 1.046875, "learning_rate": 0.0004987293909357101, "loss": 5.415, "mean_token_accuracy": 0.19442622363567352, "num_tokens": 10286901.0, "step": 4490 }, { "entropy": 5.365311050415039, "epoch": 0.43179634966378483, "grad_norm": 0.9921875, "learning_rate": 0.0004987257500128933, "loss": 5.3172, "mean_token_accuracy": 0.18610639423131942, "num_tokens": 10298961.0, "step": 4495 }, { "entropy": 5.462113523483277, "epoch": 0.4322766570605187, "grad_norm": 1.046875, "learning_rate": 0.0004987221038958288, "loss": 5.4543, "mean_token_accuracy": 0.18748044222593307, "num_tokens": 10310911.0, "step": 4500 }, { "entropy": 5.510283613204956, "epoch": 0.43275696445725265, "grad_norm": 1.0234375, "learning_rate": 0.0004987184525846015, "loss": 5.4389, "mean_token_accuracy": 0.1841048017144203, "num_tokens": 10322267.0, "step": 4505 }, { "entropy": 5.411655378341675, "epoch": 0.4332372718539865, "grad_norm": 1.125, "learning_rate": 0.0004987147960792958, "loss": 5.459, "mean_token_accuracy": 0.18804670721292496, "num_tokens": 10335111.0, "step": 4510 }, { "entropy": 5.520284938812256, "epoch": 0.43371757925072046, "grad_norm": 0.97265625, "learning_rate": 0.0004987111343799971, "loss": 5.3974, "mean_token_accuracy": 0.1907435804605484, "num_tokens": 10345672.0, "step": 4515 }, { "entropy": 5.501500225067138, "epoch": 0.43419788664745435, "grad_norm": 1.0859375, "learning_rate": 0.00049870746748679, "loss": 5.3725, "mean_token_accuracy": 0.1861974611878395, "num_tokens": 10357369.0, "step": 4520 }, { "entropy": 5.38987283706665, "epoch": 0.4346781940441883, "grad_norm": 1.09375, "learning_rate": 0.0004987037953997598, "loss": 5.3935, "mean_token_accuracy": 0.18683493435382842, "num_tokens": 10368842.0, "step": 4525 }, { "entropy": 5.43892183303833, "epoch": 0.43515850144092216, "grad_norm": 1.015625, "learning_rate": 0.0004987001181189918, "loss": 5.3539, "mean_token_accuracy": 0.18663013726472855, "num_tokens": 10380096.0, "step": 4530 }, { "entropy": 5.306481552124024, "epoch": 0.4356388088376561, "grad_norm": 1.046875, "learning_rate": 0.0004986964356445713, "loss": 5.3772, "mean_token_accuracy": 0.19005681425333024, "num_tokens": 10391996.0, "step": 4535 }, { "entropy": 5.48760027885437, "epoch": 0.43611911623439004, "grad_norm": 1.0546875, "learning_rate": 0.0004986927479765837, "loss": 5.3288, "mean_token_accuracy": 0.18343985229730606, "num_tokens": 10403607.0, "step": 4540 }, { "entropy": 5.396467876434326, "epoch": 0.4365994236311239, "grad_norm": 1.140625, "learning_rate": 0.0004986890551151148, "loss": 5.3604, "mean_token_accuracy": 0.184589384496212, "num_tokens": 10413580.0, "step": 4545 }, { "entropy": 5.349568462371826, "epoch": 0.43707973102785785, "grad_norm": 0.99609375, "learning_rate": 0.0004986853570602503, "loss": 5.3881, "mean_token_accuracy": 0.18719975054264068, "num_tokens": 10426456.0, "step": 4550 }, { "entropy": 5.520879220962525, "epoch": 0.43756003842459174, "grad_norm": 0.98828125, "learning_rate": 0.0004986816538120758, "loss": 5.4101, "mean_token_accuracy": 0.18188669979572297, "num_tokens": 10438869.0, "step": 4555 }, { "entropy": 5.397240781784058, "epoch": 0.43804034582132567, "grad_norm": 1.09375, "learning_rate": 0.0004986779453706778, "loss": 5.4142, "mean_token_accuracy": 0.1816550999879837, "num_tokens": 10450672.0, "step": 4560 }, { "entropy": 5.4152685642242435, "epoch": 0.43852065321805955, "grad_norm": 1.125, "learning_rate": 0.0004986742317361419, "loss": 5.3271, "mean_token_accuracy": 0.19575155526399612, "num_tokens": 10461890.0, "step": 4565 }, { "entropy": 5.498744964599609, "epoch": 0.4390009606147935, "grad_norm": 1.125, "learning_rate": 0.0004986705129085546, "loss": 5.4613, "mean_token_accuracy": 0.17549378722906112, "num_tokens": 10473866.0, "step": 4570 }, { "entropy": 5.460689496994019, "epoch": 0.43948126801152737, "grad_norm": 1.0859375, "learning_rate": 0.0004986667888880021, "loss": 5.381, "mean_token_accuracy": 0.18632390201091767, "num_tokens": 10484889.0, "step": 4575 }, { "entropy": 5.412662744522095, "epoch": 0.4399615754082613, "grad_norm": 1.1015625, "learning_rate": 0.0004986630596745709, "loss": 5.4207, "mean_token_accuracy": 0.1880632683634758, "num_tokens": 10496108.0, "step": 4580 }, { "entropy": 5.389367771148682, "epoch": 0.4404418828049952, "grad_norm": 1.203125, "learning_rate": 0.0004986593252683477, "loss": 5.363, "mean_token_accuracy": 0.18732869774103164, "num_tokens": 10505472.0, "step": 4585 }, { "entropy": 5.307269144058227, "epoch": 0.4409221902017291, "grad_norm": 1.0625, "learning_rate": 0.0004986555856694191, "loss": 5.2773, "mean_token_accuracy": 0.19333918690681456, "num_tokens": 10516954.0, "step": 4590 }, { "entropy": 5.524228239059449, "epoch": 0.441402497598463, "grad_norm": 1.0390625, "learning_rate": 0.0004986518408778718, "loss": 5.3859, "mean_token_accuracy": 0.18945636600255966, "num_tokens": 10528166.0, "step": 4595 }, { "entropy": 5.38381519317627, "epoch": 0.44188280499519694, "grad_norm": 1.140625, "learning_rate": 0.0004986480908937929, "loss": 5.3113, "mean_token_accuracy": 0.18772315680980683, "num_tokens": 10538112.0, "step": 4600 }, { "entropy": 5.444307518005371, "epoch": 0.4423631123919308, "grad_norm": 1.0546875, "learning_rate": 0.0004986443357172695, "loss": 5.4568, "mean_token_accuracy": 0.18497458845376968, "num_tokens": 10549888.0, "step": 4605 }, { "entropy": 5.58274884223938, "epoch": 0.44284341978866476, "grad_norm": 0.984375, "learning_rate": 0.0004986405753483887, "loss": 5.5294, "mean_token_accuracy": 0.17502811402082444, "num_tokens": 10561710.0, "step": 4610 }, { "entropy": 5.410598850250244, "epoch": 0.44332372718539864, "grad_norm": 1.0, "learning_rate": 0.0004986368097872377, "loss": 5.379, "mean_token_accuracy": 0.18401092439889907, "num_tokens": 10574564.0, "step": 4615 }, { "entropy": 5.41968560218811, "epoch": 0.4438040345821326, "grad_norm": 1.0703125, "learning_rate": 0.0004986330390339042, "loss": 5.3586, "mean_token_accuracy": 0.18878330439329147, "num_tokens": 10586639.0, "step": 4620 }, { "entropy": 5.373893547058105, "epoch": 0.44428434197886646, "grad_norm": 1.0390625, "learning_rate": 0.0004986292630884755, "loss": 5.3645, "mean_token_accuracy": 0.18980913162231444, "num_tokens": 10598730.0, "step": 4625 }, { "entropy": 5.395772886276245, "epoch": 0.4447646493756004, "grad_norm": 1.0703125, "learning_rate": 0.0004986254819510393, "loss": 5.2863, "mean_token_accuracy": 0.2030077889561653, "num_tokens": 10610352.0, "step": 4630 }, { "entropy": 5.410120058059692, "epoch": 0.4452449567723343, "grad_norm": 1.0234375, "learning_rate": 0.0004986216956216835, "loss": 5.3544, "mean_token_accuracy": 0.18991922438144684, "num_tokens": 10621951.0, "step": 4635 }, { "entropy": 5.380520057678223, "epoch": 0.4457252641690682, "grad_norm": 1.125, "learning_rate": 0.000498617904100496, "loss": 5.3114, "mean_token_accuracy": 0.1913859009742737, "num_tokens": 10633207.0, "step": 4640 }, { "entropy": 5.473378133773804, "epoch": 0.4462055715658021, "grad_norm": 0.9921875, "learning_rate": 0.0004986141073875646, "loss": 5.4035, "mean_token_accuracy": 0.18385644257068634, "num_tokens": 10645853.0, "step": 4645 }, { "entropy": 5.330105209350586, "epoch": 0.44668587896253603, "grad_norm": 1.0078125, "learning_rate": 0.0004986103054829779, "loss": 5.3305, "mean_token_accuracy": 0.18985379487276077, "num_tokens": 10656892.0, "step": 4650 }, { "entropy": 5.424197340011597, "epoch": 0.4471661863592699, "grad_norm": 1.09375, "learning_rate": 0.0004986064983868237, "loss": 5.3095, "mean_token_accuracy": 0.18436852544546128, "num_tokens": 10670110.0, "step": 4655 }, { "entropy": 5.429648303985596, "epoch": 0.44764649375600385, "grad_norm": 1.09375, "learning_rate": 0.0004986026860991906, "loss": 5.4385, "mean_token_accuracy": 0.185771344602108, "num_tokens": 10681255.0, "step": 4660 }, { "entropy": 5.471052789688111, "epoch": 0.44812680115273773, "grad_norm": 1.1875, "learning_rate": 0.0004985988686201672, "loss": 5.5041, "mean_token_accuracy": 0.1844386264681816, "num_tokens": 10692631.0, "step": 4665 }, { "entropy": 5.442734622955323, "epoch": 0.44860710854947167, "grad_norm": 1.0234375, "learning_rate": 0.0004985950459498419, "loss": 5.3372, "mean_token_accuracy": 0.19462240785360335, "num_tokens": 10704880.0, "step": 4670 }, { "entropy": 5.390188550949096, "epoch": 0.44908741594620555, "grad_norm": 1.0703125, "learning_rate": 0.0004985912180883037, "loss": 5.3095, "mean_token_accuracy": 0.19716786891222, "num_tokens": 10715561.0, "step": 4675 }, { "entropy": 5.376702499389649, "epoch": 0.4495677233429395, "grad_norm": 1.125, "learning_rate": 0.0004985873850356411, "loss": 5.3369, "mean_token_accuracy": 0.19014816135168075, "num_tokens": 10727232.0, "step": 4680 }, { "entropy": 5.387975978851318, "epoch": 0.45004803073967337, "grad_norm": 1.0546875, "learning_rate": 0.0004985835467919436, "loss": 5.3461, "mean_token_accuracy": 0.19422013461589813, "num_tokens": 10739404.0, "step": 4685 }, { "entropy": 5.369897413253784, "epoch": 0.4505283381364073, "grad_norm": 1.0390625, "learning_rate": 0.0004985797033572999, "loss": 5.3767, "mean_token_accuracy": 0.18446222841739654, "num_tokens": 10751948.0, "step": 4690 }, { "entropy": 5.362226104736328, "epoch": 0.4510086455331412, "grad_norm": 1.0, "learning_rate": 0.0004985758547317994, "loss": 5.3363, "mean_token_accuracy": 0.18433189690113067, "num_tokens": 10764611.0, "step": 4695 }, { "entropy": 5.447867727279663, "epoch": 0.4514889529298751, "grad_norm": 1.0859375, "learning_rate": 0.0004985720009155315, "loss": 5.3727, "mean_token_accuracy": 0.1841047078371048, "num_tokens": 10775954.0, "step": 4700 }, { "entropy": 5.409327983856201, "epoch": 0.45196926032660906, "grad_norm": 1.15625, "learning_rate": 0.0004985681419085856, "loss": 5.3909, "mean_token_accuracy": 0.18282371312379836, "num_tokens": 10788723.0, "step": 4705 }, { "entropy": 5.421317195892334, "epoch": 0.45244956772334294, "grad_norm": 1.078125, "learning_rate": 0.0004985642777110513, "loss": 5.3841, "mean_token_accuracy": 0.1885462448000908, "num_tokens": 10799879.0, "step": 4710 }, { "entropy": 5.3301918506622314, "epoch": 0.4529298751200769, "grad_norm": 1.0234375, "learning_rate": 0.0004985604083230183, "loss": 5.3231, "mean_token_accuracy": 0.18998679518699646, "num_tokens": 10811838.0, "step": 4715 }, { "entropy": 5.428510332107544, "epoch": 0.45341018251681076, "grad_norm": 1.1640625, "learning_rate": 0.0004985565337445765, "loss": 5.3434, "mean_token_accuracy": 0.19171882420778275, "num_tokens": 10822910.0, "step": 4720 }, { "entropy": 5.471314573287964, "epoch": 0.4538904899135447, "grad_norm": 1.0234375, "learning_rate": 0.0004985526539758158, "loss": 5.3992, "mean_token_accuracy": 0.18527638167142868, "num_tokens": 10835344.0, "step": 4725 }, { "entropy": 5.375976181030273, "epoch": 0.4543707973102786, "grad_norm": 1.015625, "learning_rate": 0.0004985487690168263, "loss": 5.4034, "mean_token_accuracy": 0.19202104806900025, "num_tokens": 10846043.0, "step": 4730 }, { "entropy": 5.380132484436035, "epoch": 0.4548511047070125, "grad_norm": 1.1015625, "learning_rate": 0.000498544878867698, "loss": 5.298, "mean_token_accuracy": 0.19829845130443574, "num_tokens": 10857783.0, "step": 4735 }, { "entropy": 5.434480476379394, "epoch": 0.4553314121037464, "grad_norm": 0.99609375, "learning_rate": 0.0004985409835285215, "loss": 5.373, "mean_token_accuracy": 0.19089124351739883, "num_tokens": 10870527.0, "step": 4740 }, { "entropy": 5.414768075942993, "epoch": 0.45581171950048033, "grad_norm": 1.046875, "learning_rate": 0.0004985370829993873, "loss": 5.3646, "mean_token_accuracy": 0.19075230062007903, "num_tokens": 10882285.0, "step": 4745 }, { "entropy": 5.423041200637817, "epoch": 0.4562920268972142, "grad_norm": 1.0390625, "learning_rate": 0.0004985331772803857, "loss": 5.3874, "mean_token_accuracy": 0.19265468865633012, "num_tokens": 10895319.0, "step": 4750 }, { "entropy": 5.484057378768921, "epoch": 0.45677233429394815, "grad_norm": 1.0234375, "learning_rate": 0.0004985292663716074, "loss": 5.382, "mean_token_accuracy": 0.19183963984251023, "num_tokens": 10906253.0, "step": 4755 }, { "entropy": 5.229197072982788, "epoch": 0.457252641690682, "grad_norm": 0.96875, "learning_rate": 0.0004985253502731435, "loss": 5.2575, "mean_token_accuracy": 0.19930023998022078, "num_tokens": 10918197.0, "step": 4760 }, { "entropy": 5.455323648452759, "epoch": 0.45773294908741596, "grad_norm": 1.015625, "learning_rate": 0.0004985214289850845, "loss": 5.4579, "mean_token_accuracy": 0.17997599244117737, "num_tokens": 10930771.0, "step": 4765 }, { "entropy": 5.443937206268311, "epoch": 0.45821325648414984, "grad_norm": 0.98828125, "learning_rate": 0.0004985175025075217, "loss": 5.3491, "mean_token_accuracy": 0.18804308474063874, "num_tokens": 10942759.0, "step": 4770 }, { "entropy": 5.591840028762817, "epoch": 0.4586935638808838, "grad_norm": 1.03125, "learning_rate": 0.0004985135708405462, "loss": 5.5609, "mean_token_accuracy": 0.17564835995435715, "num_tokens": 10953557.0, "step": 4775 }, { "entropy": 5.411443281173706, "epoch": 0.45917387127761766, "grad_norm": 1.0546875, "learning_rate": 0.0004985096339842493, "loss": 5.3321, "mean_token_accuracy": 0.19676847159862518, "num_tokens": 10963142.0, "step": 4780 }, { "entropy": 5.309838056564331, "epoch": 0.4596541786743516, "grad_norm": 1.046875, "learning_rate": 0.0004985056919387224, "loss": 5.2856, "mean_token_accuracy": 0.19894758760929107, "num_tokens": 10974321.0, "step": 4785 }, { "entropy": 5.502527189254761, "epoch": 0.4601344860710855, "grad_norm": 1.1328125, "learning_rate": 0.0004985017447040569, "loss": 5.4874, "mean_token_accuracy": 0.18695860356092453, "num_tokens": 10985524.0, "step": 4790 }, { "entropy": 5.457700490951538, "epoch": 0.4606147934678194, "grad_norm": 1.0625, "learning_rate": 0.0004984977922803447, "loss": 5.3727, "mean_token_accuracy": 0.1937094435095787, "num_tokens": 10997606.0, "step": 4795 }, { "entropy": 5.4323536396026615, "epoch": 0.4610951008645533, "grad_norm": 1.140625, "learning_rate": 0.0004984938346676772, "loss": 5.3833, "mean_token_accuracy": 0.18257274031639098, "num_tokens": 11010692.0, "step": 4800 }, { "entropy": 5.40803747177124, "epoch": 0.46157540826128723, "grad_norm": 0.98828125, "learning_rate": 0.0004984898718661468, "loss": 5.3099, "mean_token_accuracy": 0.19199058413505554, "num_tokens": 11022517.0, "step": 4805 }, { "entropy": 5.350576591491699, "epoch": 0.4620557156580211, "grad_norm": 1.1796875, "learning_rate": 0.0004984859038758451, "loss": 5.3253, "mean_token_accuracy": 0.19188573807477952, "num_tokens": 11033141.0, "step": 4810 }, { "entropy": 5.32304048538208, "epoch": 0.46253602305475505, "grad_norm": 1.0625, "learning_rate": 0.0004984819306968642, "loss": 5.3173, "mean_token_accuracy": 0.19185021072626113, "num_tokens": 11044619.0, "step": 4815 }, { "entropy": 5.495067167282104, "epoch": 0.46301633045148893, "grad_norm": 0.98828125, "learning_rate": 0.0004984779523292966, "loss": 5.3646, "mean_token_accuracy": 0.18967657685279846, "num_tokens": 11055934.0, "step": 4820 }, { "entropy": 5.383758926391602, "epoch": 0.46349663784822287, "grad_norm": 1.015625, "learning_rate": 0.0004984739687732345, "loss": 5.2493, "mean_token_accuracy": 0.19513811767101288, "num_tokens": 11066203.0, "step": 4825 }, { "entropy": 5.187354946136475, "epoch": 0.46397694524495675, "grad_norm": 0.9921875, "learning_rate": 0.0004984699800287705, "loss": 5.1973, "mean_token_accuracy": 0.19977913796901703, "num_tokens": 11079664.0, "step": 4830 }, { "entropy": 5.341605234146118, "epoch": 0.4644572526416907, "grad_norm": 1.0, "learning_rate": 0.000498465986095997, "loss": 5.2652, "mean_token_accuracy": 0.19821466654539108, "num_tokens": 11091186.0, "step": 4835 }, { "entropy": 5.42094578742981, "epoch": 0.46493756003842457, "grad_norm": 1.1328125, "learning_rate": 0.0004984619869750069, "loss": 5.383, "mean_token_accuracy": 0.18526540249586104, "num_tokens": 11102710.0, "step": 4840 }, { "entropy": 5.292195415496826, "epoch": 0.4654178674351585, "grad_norm": 1.0625, "learning_rate": 0.000498457982665893, "loss": 5.2795, "mean_token_accuracy": 0.19302588403224946, "num_tokens": 11114746.0, "step": 4845 }, { "entropy": 5.397561931610108, "epoch": 0.4658981748318924, "grad_norm": 1.0546875, "learning_rate": 0.0004984539731687483, "loss": 5.3462, "mean_token_accuracy": 0.18983854949474335, "num_tokens": 11126572.0, "step": 4850 }, { "entropy": 5.380267095565796, "epoch": 0.4663784822286263, "grad_norm": 1.1328125, "learning_rate": 0.0004984499584836659, "loss": 5.2431, "mean_token_accuracy": 0.19321491122245787, "num_tokens": 11137830.0, "step": 4855 }, { "entropy": 5.32379674911499, "epoch": 0.4668587896253602, "grad_norm": 1.09375, "learning_rate": 0.000498445938610739, "loss": 5.281, "mean_token_accuracy": 0.19294328689575196, "num_tokens": 11148860.0, "step": 4860 }, { "entropy": 5.419743824005127, "epoch": 0.46733909702209414, "grad_norm": 1.0859375, "learning_rate": 0.0004984419135500608, "loss": 5.4081, "mean_token_accuracy": 0.17859717160463334, "num_tokens": 11161311.0, "step": 4865 }, { "entropy": 5.430191612243652, "epoch": 0.4678194044188281, "grad_norm": 1.046875, "learning_rate": 0.0004984378833017249, "loss": 5.2942, "mean_token_accuracy": 0.19046030193567276, "num_tokens": 11173124.0, "step": 4870 }, { "entropy": 5.344765472412109, "epoch": 0.46829971181556196, "grad_norm": 1.140625, "learning_rate": 0.0004984338478658248, "loss": 5.3783, "mean_token_accuracy": 0.19164984971284865, "num_tokens": 11184879.0, "step": 4875 }, { "entropy": 5.45609302520752, "epoch": 0.4687800192122959, "grad_norm": 1.1015625, "learning_rate": 0.0004984298072424542, "loss": 5.378, "mean_token_accuracy": 0.1874854624271393, "num_tokens": 11196243.0, "step": 4880 }, { "entropy": 5.339529609680175, "epoch": 0.4692603266090298, "grad_norm": 1.1015625, "learning_rate": 0.000498425761431707, "loss": 5.2513, "mean_token_accuracy": 0.20040780752897264, "num_tokens": 11207485.0, "step": 4885 }, { "entropy": 5.312271356582642, "epoch": 0.4697406340057637, "grad_norm": 1.046875, "learning_rate": 0.000498421710433677, "loss": 5.279, "mean_token_accuracy": 0.19036460667848587, "num_tokens": 11219891.0, "step": 4890 }, { "entropy": 5.4914182186126705, "epoch": 0.4702209414024976, "grad_norm": 1.0234375, "learning_rate": 0.0004984176542484584, "loss": 5.388, "mean_token_accuracy": 0.18597144782543182, "num_tokens": 11231329.0, "step": 4895 }, { "entropy": 5.378525733947754, "epoch": 0.47070124879923153, "grad_norm": 1.0625, "learning_rate": 0.0004984135928761452, "loss": 5.266, "mean_token_accuracy": 0.1995886370539665, "num_tokens": 11241367.0, "step": 4900 }, { "entropy": 5.358568334579468, "epoch": 0.4711815561959654, "grad_norm": 1.0234375, "learning_rate": 0.0004984095263168317, "loss": 5.3589, "mean_token_accuracy": 0.18466073721647264, "num_tokens": 11254532.0, "step": 4905 }, { "entropy": 5.4979103088378904, "epoch": 0.47166186359269935, "grad_norm": 1.0859375, "learning_rate": 0.0004984054545706124, "loss": 5.4398, "mean_token_accuracy": 0.18243181705474854, "num_tokens": 11265223.0, "step": 4910 }, { "entropy": 5.3696846008300785, "epoch": 0.47214217098943323, "grad_norm": 1.0390625, "learning_rate": 0.000498401377637582, "loss": 5.3635, "mean_token_accuracy": 0.18885526210069656, "num_tokens": 11278228.0, "step": 4915 }, { "entropy": 5.484466791152954, "epoch": 0.47262247838616717, "grad_norm": 1.03125, "learning_rate": 0.000498397295517835, "loss": 5.4846, "mean_token_accuracy": 0.1801117405295372, "num_tokens": 11289654.0, "step": 4920 }, { "entropy": 5.394139242172241, "epoch": 0.47310278578290105, "grad_norm": 1.03125, "learning_rate": 0.0004983932082114659, "loss": 5.2357, "mean_token_accuracy": 0.19755308330059052, "num_tokens": 11301911.0, "step": 4925 }, { "entropy": 5.4873377799987795, "epoch": 0.473583093179635, "grad_norm": 1.1328125, "learning_rate": 0.0004983891157185699, "loss": 5.4364, "mean_token_accuracy": 0.18308536261320113, "num_tokens": 11312945.0, "step": 4930 }, { "entropy": 5.549541664123535, "epoch": 0.47406340057636887, "grad_norm": 1.046875, "learning_rate": 0.0004983850180392421, "loss": 5.4774, "mean_token_accuracy": 0.18022425770759581, "num_tokens": 11324126.0, "step": 4935 }, { "entropy": 5.402717351913452, "epoch": 0.4745437079731028, "grad_norm": 1.0546875, "learning_rate": 0.0004983809151735775, "loss": 5.4133, "mean_token_accuracy": 0.18017226606607437, "num_tokens": 11336395.0, "step": 4940 }, { "entropy": 5.403596019744873, "epoch": 0.4750240153698367, "grad_norm": 1.0078125, "learning_rate": 0.0004983768071216713, "loss": 5.3135, "mean_token_accuracy": 0.1902969852089882, "num_tokens": 11347387.0, "step": 4945 }, { "entropy": 5.353836917877198, "epoch": 0.4755043227665706, "grad_norm": 1.25, "learning_rate": 0.0004983726938836189, "loss": 5.308, "mean_token_accuracy": 0.19681546241044998, "num_tokens": 11358467.0, "step": 4950 }, { "entropy": 5.486645841598511, "epoch": 0.4759846301633045, "grad_norm": 1.0703125, "learning_rate": 0.0004983685754595159, "loss": 5.4724, "mean_token_accuracy": 0.18010423183441163, "num_tokens": 11370322.0, "step": 4955 }, { "entropy": 5.333859491348266, "epoch": 0.47646493756003844, "grad_norm": 1.0703125, "learning_rate": 0.0004983644518494578, "loss": 5.2697, "mean_token_accuracy": 0.20096147507429124, "num_tokens": 11381719.0, "step": 4960 }, { "entropy": 5.328320550918579, "epoch": 0.4769452449567723, "grad_norm": 1.0859375, "learning_rate": 0.0004983603230535403, "loss": 5.2895, "mean_token_accuracy": 0.1948627695441246, "num_tokens": 11393561.0, "step": 4965 }, { "entropy": 5.460376167297364, "epoch": 0.47742555235350626, "grad_norm": 0.96875, "learning_rate": 0.0004983561890718594, "loss": 5.3849, "mean_token_accuracy": 0.18933912962675095, "num_tokens": 11405411.0, "step": 4970 }, { "entropy": 5.5110303401947025, "epoch": 0.47790585975024014, "grad_norm": 1.1796875, "learning_rate": 0.000498352049904511, "loss": 5.4771, "mean_token_accuracy": 0.17981591820716858, "num_tokens": 11417419.0, "step": 4975 }, { "entropy": 5.429950714111328, "epoch": 0.4783861671469741, "grad_norm": 1.1328125, "learning_rate": 0.0004983479055515914, "loss": 5.2844, "mean_token_accuracy": 0.18997065275907515, "num_tokens": 11428145.0, "step": 4980 }, { "entropy": 5.290281534194946, "epoch": 0.47886647454370795, "grad_norm": 1.0, "learning_rate": 0.0004983437560131964, "loss": 5.2422, "mean_token_accuracy": 0.1993091583251953, "num_tokens": 11439224.0, "step": 4985 }, { "entropy": 5.409195756912231, "epoch": 0.4793467819404419, "grad_norm": 1.0859375, "learning_rate": 0.0004983396012894228, "loss": 5.3477, "mean_token_accuracy": 0.18979695290327073, "num_tokens": 11451731.0, "step": 4990 }, { "entropy": 5.435146522521973, "epoch": 0.47982708933717577, "grad_norm": 1.1171875, "learning_rate": 0.0004983354413803666, "loss": 5.3375, "mean_token_accuracy": 0.1958609476685524, "num_tokens": 11463058.0, "step": 4995 }, { "entropy": 5.473912382125855, "epoch": 0.4803073967339097, "grad_norm": 1.1640625, "learning_rate": 0.0004983312762861248, "loss": 5.4305, "mean_token_accuracy": 0.18449530750513077, "num_tokens": 11472618.0, "step": 5000 }, { "entropy": 5.364778709411621, "epoch": 0.4807877041306436, "grad_norm": 1.0625, "learning_rate": 0.0004983271060067939, "loss": 5.3246, "mean_token_accuracy": 0.18677808940410615, "num_tokens": 11483114.0, "step": 5005 }, { "entropy": 5.3417730808258055, "epoch": 0.4812680115273775, "grad_norm": 1.15625, "learning_rate": 0.0004983229305424707, "loss": 5.2799, "mean_token_accuracy": 0.19405496269464492, "num_tokens": 11494281.0, "step": 5010 }, { "entropy": 5.351672601699829, "epoch": 0.4817483189241114, "grad_norm": 1.0078125, "learning_rate": 0.0004983187498932522, "loss": 5.3503, "mean_token_accuracy": 0.18800514042377472, "num_tokens": 11505962.0, "step": 5015 }, { "entropy": 5.4874766826629635, "epoch": 0.48222862632084534, "grad_norm": 1.0078125, "learning_rate": 0.0004983145640592354, "loss": 5.4492, "mean_token_accuracy": 0.18352760821580888, "num_tokens": 11517558.0, "step": 5020 }, { "entropy": 5.448751974105835, "epoch": 0.4827089337175792, "grad_norm": 1.1015625, "learning_rate": 0.0004983103730405176, "loss": 5.4179, "mean_token_accuracy": 0.18682138621807098, "num_tokens": 11529184.0, "step": 5025 }, { "entropy": 5.338459253311157, "epoch": 0.48318924111431316, "grad_norm": 1.09375, "learning_rate": 0.000498306176837196, "loss": 5.3335, "mean_token_accuracy": 0.18406548202037812, "num_tokens": 11540727.0, "step": 5030 }, { "entropy": 5.360374689102173, "epoch": 0.48366954851104704, "grad_norm": 1.0390625, "learning_rate": 0.0004983019754493681, "loss": 5.261, "mean_token_accuracy": 0.1907915487885475, "num_tokens": 11551510.0, "step": 5035 }, { "entropy": 5.47594895362854, "epoch": 0.484149855907781, "grad_norm": 1.078125, "learning_rate": 0.0004982977688771314, "loss": 5.4187, "mean_token_accuracy": 0.18854755759239197, "num_tokens": 11563203.0, "step": 5040 }, { "entropy": 5.308377647399903, "epoch": 0.4846301633045149, "grad_norm": 1.0234375, "learning_rate": 0.0004982935571205835, "loss": 5.2718, "mean_token_accuracy": 0.19544857442379, "num_tokens": 11576013.0, "step": 5045 }, { "entropy": 5.291185140609741, "epoch": 0.4851104707012488, "grad_norm": 1.1328125, "learning_rate": 0.0004982893401798223, "loss": 5.2498, "mean_token_accuracy": 0.20830876976251603, "num_tokens": 11587535.0, "step": 5050 }, { "entropy": 5.403550291061402, "epoch": 0.48559077809798273, "grad_norm": 1.0234375, "learning_rate": 0.0004982851180549456, "loss": 5.2771, "mean_token_accuracy": 0.19294197112321854, "num_tokens": 11598487.0, "step": 5055 }, { "entropy": 5.25755033493042, "epoch": 0.4860710854947166, "grad_norm": 1.046875, "learning_rate": 0.0004982808907460515, "loss": 5.1559, "mean_token_accuracy": 0.20932556241750716, "num_tokens": 11609457.0, "step": 5060 }, { "entropy": 5.265308237075805, "epoch": 0.48655139289145055, "grad_norm": 1.015625, "learning_rate": 0.0004982766582532382, "loss": 5.2257, "mean_token_accuracy": 0.19795275181531907, "num_tokens": 11620251.0, "step": 5065 }, { "entropy": 5.307956266403198, "epoch": 0.48703170028818443, "grad_norm": 1.1640625, "learning_rate": 0.0004982724205766038, "loss": 5.2262, "mean_token_accuracy": 0.19880327582359314, "num_tokens": 11630956.0, "step": 5070 }, { "entropy": 5.348564767837525, "epoch": 0.48751200768491837, "grad_norm": 0.9609375, "learning_rate": 0.0004982681777162468, "loss": 5.2773, "mean_token_accuracy": 0.1949208691716194, "num_tokens": 11642560.0, "step": 5075 }, { "entropy": 5.300316572189331, "epoch": 0.48799231508165225, "grad_norm": 1.109375, "learning_rate": 0.0004982639296722657, "loss": 5.2365, "mean_token_accuracy": 0.19546635299921036, "num_tokens": 11654050.0, "step": 5080 }, { "entropy": 5.333183813095093, "epoch": 0.4884726224783862, "grad_norm": 1.109375, "learning_rate": 0.0004982596764447591, "loss": 5.4035, "mean_token_accuracy": 0.19310665130615234, "num_tokens": 11664947.0, "step": 5085 }, { "entropy": 5.469000768661499, "epoch": 0.48895292987512007, "grad_norm": 1.0390625, "learning_rate": 0.0004982554180338258, "loss": 5.3106, "mean_token_accuracy": 0.19500951319932938, "num_tokens": 11676927.0, "step": 5090 }, { "entropy": 5.502379417419434, "epoch": 0.489433237271854, "grad_norm": 1.1484375, "learning_rate": 0.0004982511544395646, "loss": 5.4242, "mean_token_accuracy": 0.18115128874778746, "num_tokens": 11688573.0, "step": 5095 }, { "entropy": 5.288805294036865, "epoch": 0.4899135446685879, "grad_norm": 1.171875, "learning_rate": 0.0004982468856620745, "loss": 5.3128, "mean_token_accuracy": 0.18783441036939622, "num_tokens": 11698704.0, "step": 5100 }, { "entropy": 5.3273578643798825, "epoch": 0.4903938520653218, "grad_norm": 1.0859375, "learning_rate": 0.0004982426117014545, "loss": 5.2533, "mean_token_accuracy": 0.19392533451318741, "num_tokens": 11709466.0, "step": 5105 }, { "entropy": 5.3791663646698, "epoch": 0.4908741594620557, "grad_norm": 1.0703125, "learning_rate": 0.0004982383325578041, "loss": 5.3413, "mean_token_accuracy": 0.1898537114262581, "num_tokens": 11721120.0, "step": 5110 }, { "entropy": 5.4256843566894535, "epoch": 0.49135446685878964, "grad_norm": 1.0546875, "learning_rate": 0.0004982340482312226, "loss": 5.3358, "mean_token_accuracy": 0.18456312417984008, "num_tokens": 11732120.0, "step": 5115 }, { "entropy": 5.288364553451538, "epoch": 0.4918347742555235, "grad_norm": 1.046875, "learning_rate": 0.0004982297587218092, "loss": 5.2294, "mean_token_accuracy": 0.1978309139609337, "num_tokens": 11743501.0, "step": 5120 }, { "entropy": 5.363348197937012, "epoch": 0.49231508165225746, "grad_norm": 1.109375, "learning_rate": 0.0004982254640296637, "loss": 5.3152, "mean_token_accuracy": 0.1956743210554123, "num_tokens": 11755051.0, "step": 5125 }, { "entropy": 5.436681079864502, "epoch": 0.49279538904899134, "grad_norm": 1.0546875, "learning_rate": 0.0004982211641548857, "loss": 5.4609, "mean_token_accuracy": 0.1842927649617195, "num_tokens": 11767663.0, "step": 5130 }, { "entropy": 5.419048309326172, "epoch": 0.4932756964457253, "grad_norm": 1.09375, "learning_rate": 0.0004982168590975752, "loss": 5.3034, "mean_token_accuracy": 0.19774986803531647, "num_tokens": 11778828.0, "step": 5135 }, { "entropy": 5.459513902664185, "epoch": 0.49375600384245916, "grad_norm": 1.046875, "learning_rate": 0.0004982125488578321, "loss": 5.4794, "mean_token_accuracy": 0.18496931344270706, "num_tokens": 11790654.0, "step": 5140 }, { "entropy": 5.433895540237427, "epoch": 0.4942363112391931, "grad_norm": 1.140625, "learning_rate": 0.0004982082334357563, "loss": 5.2837, "mean_token_accuracy": 0.1902835488319397, "num_tokens": 11801489.0, "step": 5145 }, { "entropy": 5.311564207077026, "epoch": 0.494716618635927, "grad_norm": 1.1328125, "learning_rate": 0.0004982039128314481, "loss": 5.2873, "mean_token_accuracy": 0.19224448949098588, "num_tokens": 11813818.0, "step": 5150 }, { "entropy": 5.333755207061768, "epoch": 0.4951969260326609, "grad_norm": 1.0625, "learning_rate": 0.0004981995870450079, "loss": 5.2929, "mean_token_accuracy": 0.191859370470047, "num_tokens": 11824814.0, "step": 5155 }, { "entropy": 5.45896692276001, "epoch": 0.4956772334293948, "grad_norm": 1.140625, "learning_rate": 0.0004981952560765361, "loss": 5.3373, "mean_token_accuracy": 0.18679553270339966, "num_tokens": 11836252.0, "step": 5160 }, { "entropy": 5.314207363128662, "epoch": 0.49615754082612873, "grad_norm": 1.09375, "learning_rate": 0.0004981909199261331, "loss": 5.2629, "mean_token_accuracy": 0.19086166322231293, "num_tokens": 11847715.0, "step": 5165 }, { "entropy": 5.273135042190551, "epoch": 0.4966378482228626, "grad_norm": 1.0625, "learning_rate": 0.0004981865785938998, "loss": 5.2629, "mean_token_accuracy": 0.19300127327442168, "num_tokens": 11860309.0, "step": 5170 }, { "entropy": 5.348716497421265, "epoch": 0.49711815561959655, "grad_norm": 1.015625, "learning_rate": 0.0004981822320799367, "loss": 5.2577, "mean_token_accuracy": 0.1956932559609413, "num_tokens": 11872569.0, "step": 5175 }, { "entropy": 5.3287012577056885, "epoch": 0.49759846301633043, "grad_norm": 1.0546875, "learning_rate": 0.0004981778803843449, "loss": 5.2523, "mean_token_accuracy": 0.19481286704540252, "num_tokens": 11884778.0, "step": 5180 }, { "entropy": 5.390296173095703, "epoch": 0.49807877041306436, "grad_norm": 1.0859375, "learning_rate": 0.0004981735235072256, "loss": 5.3358, "mean_token_accuracy": 0.1911753833293915, "num_tokens": 11897324.0, "step": 5185 }, { "entropy": 5.467144203186035, "epoch": 0.49855907780979825, "grad_norm": 1.03125, "learning_rate": 0.0004981691614486796, "loss": 5.366, "mean_token_accuracy": 0.18982964605093003, "num_tokens": 11909145.0, "step": 5190 }, { "entropy": 5.322554683685302, "epoch": 0.4990393852065322, "grad_norm": 1.0625, "learning_rate": 0.0004981647942088084, "loss": 5.2697, "mean_token_accuracy": 0.20009808093309403, "num_tokens": 11921021.0, "step": 5195 }, { "entropy": 5.487699699401856, "epoch": 0.49951969260326606, "grad_norm": 1.046875, "learning_rate": 0.0004981604217877135, "loss": 5.4279, "mean_token_accuracy": 0.1888749822974205, "num_tokens": 11932565.0, "step": 5200 }, { "entropy": 5.318529844284058, "epoch": 0.5, "grad_norm": 1.046875, "learning_rate": 0.000498156044185496, "loss": 5.3392, "mean_token_accuracy": 0.19370948225259782, "num_tokens": 11943225.0, "step": 5205 }, { "entropy": 5.364103078842163, "epoch": 0.5004803073967339, "grad_norm": 1.140625, "learning_rate": 0.0004981516614022579, "loss": 5.3219, "mean_token_accuracy": 0.1932568922638893, "num_tokens": 11954821.0, "step": 5210 }, { "entropy": 5.446450281143188, "epoch": 0.5009606147934679, "grad_norm": 1.1328125, "learning_rate": 0.0004981472734381008, "loss": 5.2738, "mean_token_accuracy": 0.1951069414615631, "num_tokens": 11966090.0, "step": 5215 }, { "entropy": 5.353061962127685, "epoch": 0.5014409221902018, "grad_norm": 1.1015625, "learning_rate": 0.0004981428802931267, "loss": 5.3074, "mean_token_accuracy": 0.1921882688999176, "num_tokens": 11977410.0, "step": 5220 }, { "entropy": 5.339950656890869, "epoch": 0.5019212295869356, "grad_norm": 1.15625, "learning_rate": 0.0004981384819674375, "loss": 5.2841, "mean_token_accuracy": 0.19126271605491638, "num_tokens": 11989119.0, "step": 5225 }, { "entropy": 5.432912015914917, "epoch": 0.5024015369836695, "grad_norm": 1.0390625, "learning_rate": 0.0004981340784611354, "loss": 5.3942, "mean_token_accuracy": 0.19018032401800156, "num_tokens": 12000165.0, "step": 5230 }, { "entropy": 5.395741987228393, "epoch": 0.5028818443804035, "grad_norm": 1.0546875, "learning_rate": 0.0004981296697743224, "loss": 5.3475, "mean_token_accuracy": 0.18768104463815688, "num_tokens": 12012118.0, "step": 5235 }, { "entropy": 5.430673694610595, "epoch": 0.5033621517771374, "grad_norm": 1.1015625, "learning_rate": 0.0004981252559071012, "loss": 5.4181, "mean_token_accuracy": 0.1866712138056755, "num_tokens": 12023432.0, "step": 5240 }, { "entropy": 5.427559089660645, "epoch": 0.5038424591738713, "grad_norm": 1.1953125, "learning_rate": 0.0004981208368595739, "loss": 5.2939, "mean_token_accuracy": 0.1980261042714119, "num_tokens": 12034323.0, "step": 5245 }, { "entropy": 5.264776802062988, "epoch": 0.5043227665706052, "grad_norm": 1.09375, "learning_rate": 0.0004981164126318435, "loss": 5.3022, "mean_token_accuracy": 0.19116167575120926, "num_tokens": 12045532.0, "step": 5250 }, { "entropy": 5.449652862548828, "epoch": 0.5048030739673391, "grad_norm": 1.015625, "learning_rate": 0.0004981119832240124, "loss": 5.3111, "mean_token_accuracy": 0.19520313441753387, "num_tokens": 12057346.0, "step": 5255 }, { "entropy": 5.301677227020264, "epoch": 0.505283381364073, "grad_norm": 1.046875, "learning_rate": 0.0004981075486361837, "loss": 5.2825, "mean_token_accuracy": 0.19872631430625914, "num_tokens": 12068670.0, "step": 5260 }, { "entropy": 5.390146923065186, "epoch": 0.5057636887608069, "grad_norm": 1.0703125, "learning_rate": 0.0004981031088684601, "loss": 5.4028, "mean_token_accuracy": 0.18470921665430068, "num_tokens": 12079664.0, "step": 5265 }, { "entropy": 5.474726438522339, "epoch": 0.5062439961575408, "grad_norm": 1.0859375, "learning_rate": 0.0004980986639209448, "loss": 5.3285, "mean_token_accuracy": 0.1994831383228302, "num_tokens": 12089984.0, "step": 5270 }, { "entropy": 5.29730339050293, "epoch": 0.5067243035542748, "grad_norm": 1.1484375, "learning_rate": 0.000498094213793741, "loss": 5.2835, "mean_token_accuracy": 0.1948940023779869, "num_tokens": 12101182.0, "step": 5275 }, { "entropy": 5.408280658721924, "epoch": 0.5072046109510087, "grad_norm": 1.125, "learning_rate": 0.000498089758486952, "loss": 5.353, "mean_token_accuracy": 0.18289182782173158, "num_tokens": 12112002.0, "step": 5280 }, { "entropy": 5.495666790008545, "epoch": 0.5076849183477425, "grad_norm": 1.0, "learning_rate": 0.0004980852980006812, "loss": 5.4392, "mean_token_accuracy": 0.1805154114961624, "num_tokens": 12124194.0, "step": 5285 }, { "entropy": 5.392632579803466, "epoch": 0.5081652257444764, "grad_norm": 1.1640625, "learning_rate": 0.0004980808323350323, "loss": 5.359, "mean_token_accuracy": 0.1960368499159813, "num_tokens": 12133966.0, "step": 5290 }, { "entropy": 5.391989612579346, "epoch": 0.5086455331412104, "grad_norm": 1.1484375, "learning_rate": 0.0004980763614901089, "loss": 5.2967, "mean_token_accuracy": 0.19686038345098494, "num_tokens": 12145643.0, "step": 5295 }, { "entropy": 5.379247760772705, "epoch": 0.5091258405379443, "grad_norm": 1.1015625, "learning_rate": 0.0004980718854660146, "loss": 5.3464, "mean_token_accuracy": 0.18789971768856048, "num_tokens": 12156804.0, "step": 5300 }, { "entropy": 5.400803756713867, "epoch": 0.5096061479346782, "grad_norm": 1.0234375, "learning_rate": 0.0004980674042628537, "loss": 5.2967, "mean_token_accuracy": 0.19052283465862274, "num_tokens": 12168700.0, "step": 5305 }, { "entropy": 5.401619243621826, "epoch": 0.5100864553314121, "grad_norm": 1.0078125, "learning_rate": 0.00049806291788073, "loss": 5.3123, "mean_token_accuracy": 0.18629832863807677, "num_tokens": 12181050.0, "step": 5310 }, { "entropy": 5.469602966308594, "epoch": 0.510566762728146, "grad_norm": 1.0859375, "learning_rate": 0.0004980584263197477, "loss": 5.3949, "mean_token_accuracy": 0.1858072027564049, "num_tokens": 12192001.0, "step": 5315 }, { "entropy": 5.508568143844604, "epoch": 0.5110470701248799, "grad_norm": 1.1875, "learning_rate": 0.0004980539295800111, "loss": 5.509, "mean_token_accuracy": 0.18043418526649474, "num_tokens": 12202436.0, "step": 5320 }, { "entropy": 5.362590551376343, "epoch": 0.5115273775216138, "grad_norm": 1.0546875, "learning_rate": 0.0004980494276616246, "loss": 5.3016, "mean_token_accuracy": 0.18966611623764038, "num_tokens": 12214454.0, "step": 5325 }, { "entropy": 5.349428033828735, "epoch": 0.5120076849183477, "grad_norm": 1.0390625, "learning_rate": 0.0004980449205646926, "loss": 5.3122, "mean_token_accuracy": 0.19553214311599731, "num_tokens": 12225924.0, "step": 5330 }, { "entropy": 5.415020084381103, "epoch": 0.5124879923150817, "grad_norm": 1.1328125, "learning_rate": 0.00049804040828932, "loss": 5.3326, "mean_token_accuracy": 0.19512139409780502, "num_tokens": 12236456.0, "step": 5335 }, { "entropy": 5.421989011764526, "epoch": 0.5129682997118156, "grad_norm": 1.03125, "learning_rate": 0.0004980358908356113, "loss": 5.3535, "mean_token_accuracy": 0.18762658089399337, "num_tokens": 12247719.0, "step": 5340 }, { "entropy": 5.350346803665161, "epoch": 0.5134486071085494, "grad_norm": 1.0703125, "learning_rate": 0.0004980313682036717, "loss": 5.381, "mean_token_accuracy": 0.1927213490009308, "num_tokens": 12259141.0, "step": 5345 }, { "entropy": 5.49134635925293, "epoch": 0.5139289145052833, "grad_norm": 1.2109375, "learning_rate": 0.0004980268403936058, "loss": 5.4456, "mean_token_accuracy": 0.18453603684902192, "num_tokens": 12269748.0, "step": 5350 }, { "entropy": 5.434391784667969, "epoch": 0.5144092219020173, "grad_norm": 1.0546875, "learning_rate": 0.0004980223074055189, "loss": 5.379, "mean_token_accuracy": 0.1960138276219368, "num_tokens": 12281456.0, "step": 5355 }, { "entropy": 5.409012746810913, "epoch": 0.5148895292987512, "grad_norm": 1.125, "learning_rate": 0.0004980177692395164, "loss": 5.3518, "mean_token_accuracy": 0.18338604271411896, "num_tokens": 12293763.0, "step": 5360 }, { "entropy": 5.351993417739868, "epoch": 0.5153698366954851, "grad_norm": 0.98046875, "learning_rate": 0.0004980132258957035, "loss": 5.2808, "mean_token_accuracy": 0.1969463735818863, "num_tokens": 12305398.0, "step": 5365 }, { "entropy": 5.274507617950439, "epoch": 0.515850144092219, "grad_norm": 1.1796875, "learning_rate": 0.0004980086773741856, "loss": 5.2796, "mean_token_accuracy": 0.19121709913015367, "num_tokens": 12316582.0, "step": 5370 }, { "entropy": 5.483122396469116, "epoch": 0.516330451488953, "grad_norm": 1.1875, "learning_rate": 0.0004980041236750685, "loss": 5.3846, "mean_token_accuracy": 0.18809578120708464, "num_tokens": 12328463.0, "step": 5375 }, { "entropy": 5.445298194885254, "epoch": 0.5168107588856868, "grad_norm": 1.0390625, "learning_rate": 0.0004979995647984577, "loss": 5.3698, "mean_token_accuracy": 0.19524169117212295, "num_tokens": 12341040.0, "step": 5380 }, { "entropy": 5.2983297348022464, "epoch": 0.5172910662824207, "grad_norm": 0.9765625, "learning_rate": 0.0004979950007444593, "loss": 5.261, "mean_token_accuracy": 0.1934810236096382, "num_tokens": 12353024.0, "step": 5385 }, { "entropy": 5.358570623397827, "epoch": 0.5177713736791547, "grad_norm": 1.0625, "learning_rate": 0.0004979904315131792, "loss": 5.2844, "mean_token_accuracy": 0.19403222799301148, "num_tokens": 12366100.0, "step": 5390 }, { "entropy": 5.293501186370849, "epoch": 0.5182516810758886, "grad_norm": 1.0546875, "learning_rate": 0.0004979858571047233, "loss": 5.2707, "mean_token_accuracy": 0.19768950045108796, "num_tokens": 12377829.0, "step": 5395 }, { "entropy": 5.466844320297241, "epoch": 0.5187319884726225, "grad_norm": 1.15625, "learning_rate": 0.0004979812775191979, "loss": 5.4031, "mean_token_accuracy": 0.18979473859071733, "num_tokens": 12390830.0, "step": 5400 }, { "entropy": 5.328051805496216, "epoch": 0.5192122958693564, "grad_norm": 1.0546875, "learning_rate": 0.0004979766927567094, "loss": 5.2545, "mean_token_accuracy": 0.19470396041870117, "num_tokens": 12401642.0, "step": 5405 }, { "entropy": 5.3456236839294435, "epoch": 0.5196926032660903, "grad_norm": 1.109375, "learning_rate": 0.0004979721028173643, "loss": 5.3476, "mean_token_accuracy": 0.1877232700586319, "num_tokens": 12411653.0, "step": 5410 }, { "entropy": 5.386164760589599, "epoch": 0.5201729106628242, "grad_norm": 1.0859375, "learning_rate": 0.000497967507701269, "loss": 5.2486, "mean_token_accuracy": 0.20038487911224365, "num_tokens": 12422891.0, "step": 5415 }, { "entropy": 5.397801113128662, "epoch": 0.5206532180595581, "grad_norm": 1.0625, "learning_rate": 0.0004979629074085303, "loss": 5.3408, "mean_token_accuracy": 0.19329493790864943, "num_tokens": 12434190.0, "step": 5420 }, { "entropy": 5.424389457702636, "epoch": 0.521133525456292, "grad_norm": 1.0234375, "learning_rate": 0.0004979583019392548, "loss": 5.3974, "mean_token_accuracy": 0.18989453911781312, "num_tokens": 12445796.0, "step": 5425 }, { "entropy": 5.483598613739014, "epoch": 0.521613832853026, "grad_norm": 1.140625, "learning_rate": 0.0004979536912935497, "loss": 5.4639, "mean_token_accuracy": 0.18501935750246049, "num_tokens": 12456212.0, "step": 5430 }, { "entropy": 5.330318355560303, "epoch": 0.5220941402497599, "grad_norm": 1.1171875, "learning_rate": 0.000497949075471522, "loss": 5.1899, "mean_token_accuracy": 0.19820088148117065, "num_tokens": 12467871.0, "step": 5435 }, { "entropy": 5.372925519943237, "epoch": 0.5225744476464937, "grad_norm": 1.0625, "learning_rate": 0.0004979444544732786, "loss": 5.2819, "mean_token_accuracy": 0.1852207139134407, "num_tokens": 12478626.0, "step": 5440 }, { "entropy": 5.313206958770752, "epoch": 0.5230547550432276, "grad_norm": 1.015625, "learning_rate": 0.000497939828298927, "loss": 5.3741, "mean_token_accuracy": 0.19033849388360977, "num_tokens": 12491487.0, "step": 5445 }, { "entropy": 5.462804317474365, "epoch": 0.5235350624399616, "grad_norm": 1.1484375, "learning_rate": 0.0004979351969485747, "loss": 5.3383, "mean_token_accuracy": 0.18805173933506011, "num_tokens": 12503240.0, "step": 5450 }, { "entropy": 5.4243183612823485, "epoch": 0.5240153698366955, "grad_norm": 1.0859375, "learning_rate": 0.0004979305604223291, "loss": 5.2774, "mean_token_accuracy": 0.1903422147035599, "num_tokens": 12513860.0, "step": 5455 }, { "entropy": 5.313809871673584, "epoch": 0.5244956772334294, "grad_norm": 1.1171875, "learning_rate": 0.0004979259187202978, "loss": 5.352, "mean_token_accuracy": 0.1945337176322937, "num_tokens": 12525884.0, "step": 5460 }, { "entropy": 5.442373895645142, "epoch": 0.5249759846301633, "grad_norm": 1.1171875, "learning_rate": 0.0004979212718425887, "loss": 5.2672, "mean_token_accuracy": 0.1932208612561226, "num_tokens": 12536709.0, "step": 5465 }, { "entropy": 5.334468412399292, "epoch": 0.5254562920268973, "grad_norm": 1.15625, "learning_rate": 0.0004979166197893096, "loss": 5.2663, "mean_token_accuracy": 0.19677013605833055, "num_tokens": 12549727.0, "step": 5470 }, { "entropy": 5.339883422851562, "epoch": 0.5259365994236311, "grad_norm": 0.98828125, "learning_rate": 0.0004979119625605683, "loss": 5.3345, "mean_token_accuracy": 0.18942939788103103, "num_tokens": 12562053.0, "step": 5475 }, { "entropy": 5.287409067153931, "epoch": 0.526416906820365, "grad_norm": 1.1171875, "learning_rate": 0.0004979073001564734, "loss": 5.2257, "mean_token_accuracy": 0.20170782059431075, "num_tokens": 12574096.0, "step": 5480 }, { "entropy": 5.40628571510315, "epoch": 0.5268972142170989, "grad_norm": 1.046875, "learning_rate": 0.0004979026325771328, "loss": 5.4013, "mean_token_accuracy": 0.18865474164485932, "num_tokens": 12585416.0, "step": 5485 }, { "entropy": 5.369120025634766, "epoch": 0.5273775216138329, "grad_norm": 1.0078125, "learning_rate": 0.0004978979598226549, "loss": 5.2525, "mean_token_accuracy": 0.1964880034327507, "num_tokens": 12596861.0, "step": 5490 }, { "entropy": 5.307511520385742, "epoch": 0.5278578290105668, "grad_norm": 1.109375, "learning_rate": 0.0004978932818931483, "loss": 5.2672, "mean_token_accuracy": 0.19722044318914414, "num_tokens": 12607761.0, "step": 5495 }, { "entropy": 5.4275431632995605, "epoch": 0.5283381364073007, "grad_norm": 1.1171875, "learning_rate": 0.0004978885987887216, "loss": 5.3898, "mean_token_accuracy": 0.19588741660118103, "num_tokens": 12619889.0, "step": 5500 }, { "entropy": 5.4371997833251955, "epoch": 0.5288184438040345, "grad_norm": 1.1171875, "learning_rate": 0.0004978839105094833, "loss": 5.3606, "mean_token_accuracy": 0.19224700778722764, "num_tokens": 12630604.0, "step": 5505 }, { "entropy": 5.222589921951294, "epoch": 0.5292987512007685, "grad_norm": 1.078125, "learning_rate": 0.0004978792170555426, "loss": 5.2618, "mean_token_accuracy": 0.19633477181196213, "num_tokens": 12641172.0, "step": 5510 }, { "entropy": 5.292724561691284, "epoch": 0.5297790585975024, "grad_norm": 1.046875, "learning_rate": 0.0004978745184270083, "loss": 5.1601, "mean_token_accuracy": 0.20660953521728515, "num_tokens": 12651731.0, "step": 5515 }, { "entropy": 5.392834901809692, "epoch": 0.5302593659942363, "grad_norm": 1.0859375, "learning_rate": 0.0004978698146239893, "loss": 5.2978, "mean_token_accuracy": 0.1936490774154663, "num_tokens": 12663050.0, "step": 5520 }, { "entropy": 5.409347009658814, "epoch": 0.5307396733909702, "grad_norm": 1.0703125, "learning_rate": 0.0004978651056465952, "loss": 5.3862, "mean_token_accuracy": 0.18999682515859603, "num_tokens": 12674732.0, "step": 5525 }, { "entropy": 5.332290983200073, "epoch": 0.5312199807877042, "grad_norm": 1.140625, "learning_rate": 0.000497860391494935, "loss": 5.2171, "mean_token_accuracy": 0.19382983297109604, "num_tokens": 12685981.0, "step": 5530 }, { "entropy": 5.412051010131836, "epoch": 0.531700288184438, "grad_norm": 1.1328125, "learning_rate": 0.0004978556721691183, "loss": 5.3525, "mean_token_accuracy": 0.19065555483102797, "num_tokens": 12697139.0, "step": 5535 }, { "entropy": 5.317591810226441, "epoch": 0.5321805955811719, "grad_norm": 1.1015625, "learning_rate": 0.0004978509476692547, "loss": 5.2966, "mean_token_accuracy": 0.18611351698637008, "num_tokens": 12708268.0, "step": 5540 }, { "entropy": 5.375318956375122, "epoch": 0.5326609029779059, "grad_norm": 1.1015625, "learning_rate": 0.0004978462179954538, "loss": 5.2958, "mean_token_accuracy": 0.18993753045797349, "num_tokens": 12720715.0, "step": 5545 }, { "entropy": 5.3367125511169435, "epoch": 0.5331412103746398, "grad_norm": 1.1171875, "learning_rate": 0.0004978414831478253, "loss": 5.269, "mean_token_accuracy": 0.19713337272405623, "num_tokens": 12732409.0, "step": 5550 }, { "entropy": 5.323969554901123, "epoch": 0.5336215177713737, "grad_norm": 1.1171875, "learning_rate": 0.0004978367431264794, "loss": 5.397, "mean_token_accuracy": 0.18209069669246675, "num_tokens": 12745174.0, "step": 5555 }, { "entropy": 5.410878992080688, "epoch": 0.5341018251681076, "grad_norm": 1.078125, "learning_rate": 0.0004978319979315261, "loss": 5.3328, "mean_token_accuracy": 0.19573558866977692, "num_tokens": 12756116.0, "step": 5560 }, { "entropy": 5.376229763031006, "epoch": 0.5345821325648416, "grad_norm": 1.21875, "learning_rate": 0.0004978272475630752, "loss": 5.2851, "mean_token_accuracy": 0.1916971653699875, "num_tokens": 12768183.0, "step": 5565 }, { "entropy": 5.264455699920655, "epoch": 0.5350624399615754, "grad_norm": 1.203125, "learning_rate": 0.0004978224920212374, "loss": 5.2931, "mean_token_accuracy": 0.1934914067387581, "num_tokens": 12778537.0, "step": 5570 }, { "entropy": 5.313297891616822, "epoch": 0.5355427473583093, "grad_norm": 1.109375, "learning_rate": 0.0004978177313061232, "loss": 5.3228, "mean_token_accuracy": 0.19088124930858613, "num_tokens": 12789691.0, "step": 5575 }, { "entropy": 5.473337554931641, "epoch": 0.5360230547550432, "grad_norm": 1.078125, "learning_rate": 0.0004978129654178426, "loss": 5.3433, "mean_token_accuracy": 0.18791570216417314, "num_tokens": 12801438.0, "step": 5580 }, { "entropy": 5.4069455623626705, "epoch": 0.5365033621517772, "grad_norm": 1.0546875, "learning_rate": 0.0004978081943565067, "loss": 5.3061, "mean_token_accuracy": 0.18656288981437683, "num_tokens": 12812425.0, "step": 5585 }, { "entropy": 5.307536172866821, "epoch": 0.5369836695485111, "grad_norm": 1.0390625, "learning_rate": 0.0004978034181222261, "loss": 5.2769, "mean_token_accuracy": 0.18625542372465134, "num_tokens": 12824735.0, "step": 5590 }, { "entropy": 5.430880117416382, "epoch": 0.537463976945245, "grad_norm": 1.1796875, "learning_rate": 0.0004977986367151119, "loss": 5.3688, "mean_token_accuracy": 0.1952778786420822, "num_tokens": 12835454.0, "step": 5595 }, { "entropy": 5.434065580368042, "epoch": 0.5379442843419788, "grad_norm": 1.046875, "learning_rate": 0.0004977938501352747, "loss": 5.4122, "mean_token_accuracy": 0.18514797538518907, "num_tokens": 12847086.0, "step": 5600 }, { "entropy": 5.385431623458862, "epoch": 0.5384245917387128, "grad_norm": 1.078125, "learning_rate": 0.0004977890583828259, "loss": 5.3549, "mean_token_accuracy": 0.1888865575194359, "num_tokens": 12857713.0, "step": 5605 }, { "entropy": 5.36136646270752, "epoch": 0.5389048991354467, "grad_norm": 1.125, "learning_rate": 0.0004977842614578768, "loss": 5.3356, "mean_token_accuracy": 0.18914903849363326, "num_tokens": 12869967.0, "step": 5610 }, { "entropy": 5.433460998535156, "epoch": 0.5393852065321806, "grad_norm": 1.0859375, "learning_rate": 0.0004977794593605386, "loss": 5.3684, "mean_token_accuracy": 0.18960850983858107, "num_tokens": 12881230.0, "step": 5615 }, { "entropy": 5.352547121047974, "epoch": 0.5398655139289145, "grad_norm": 1.109375, "learning_rate": 0.000497774652090923, "loss": 5.3222, "mean_token_accuracy": 0.18944347649812698, "num_tokens": 12892376.0, "step": 5620 }, { "entropy": 5.436691570281982, "epoch": 0.5403458213256485, "grad_norm": 1.09375, "learning_rate": 0.0004977698396491414, "loss": 5.3307, "mean_token_accuracy": 0.19240753799676896, "num_tokens": 12903709.0, "step": 5625 }, { "entropy": 5.2928542137146, "epoch": 0.5408261287223823, "grad_norm": 1.03125, "learning_rate": 0.0004977650220353055, "loss": 5.1629, "mean_token_accuracy": 0.19530351608991622, "num_tokens": 12914958.0, "step": 5630 }, { "entropy": 5.280749416351318, "epoch": 0.5413064361191162, "grad_norm": 1.1171875, "learning_rate": 0.0004977601992495274, "loss": 5.2875, "mean_token_accuracy": 0.1923414632678032, "num_tokens": 12927418.0, "step": 5635 }, { "entropy": 5.413435602188111, "epoch": 0.5417867435158501, "grad_norm": 0.98046875, "learning_rate": 0.0004977553712919189, "loss": 5.3325, "mean_token_accuracy": 0.1892315372824669, "num_tokens": 12939874.0, "step": 5640 }, { "entropy": 5.463119792938232, "epoch": 0.5422670509125841, "grad_norm": 1.1484375, "learning_rate": 0.0004977505381625921, "loss": 5.3542, "mean_token_accuracy": 0.18793897628784179, "num_tokens": 12951113.0, "step": 5645 }, { "entropy": 5.333239316940308, "epoch": 0.542747358309318, "grad_norm": 1.0, "learning_rate": 0.0004977456998616593, "loss": 5.247, "mean_token_accuracy": 0.19487171471118928, "num_tokens": 12961940.0, "step": 5650 }, { "entropy": 5.247047281265258, "epoch": 0.5432276657060519, "grad_norm": 1.1015625, "learning_rate": 0.0004977408563892327, "loss": 5.2389, "mean_token_accuracy": 0.19528348445892335, "num_tokens": 12973938.0, "step": 5655 }, { "entropy": 5.355054330825806, "epoch": 0.5437079731027857, "grad_norm": 1.1015625, "learning_rate": 0.0004977360077454249, "loss": 5.2669, "mean_token_accuracy": 0.19261687248945236, "num_tokens": 12985400.0, "step": 5660 }, { "entropy": 5.381504774093628, "epoch": 0.5441882804995197, "grad_norm": 1.0390625, "learning_rate": 0.0004977311539303483, "loss": 5.2984, "mean_token_accuracy": 0.202898870408535, "num_tokens": 12996402.0, "step": 5665 }, { "entropy": 5.339759063720703, "epoch": 0.5446685878962536, "grad_norm": 1.2421875, "learning_rate": 0.0004977262949441158, "loss": 5.1882, "mean_token_accuracy": 0.20247950553894042, "num_tokens": 13006991.0, "step": 5670 }, { "entropy": 5.329454803466797, "epoch": 0.5451488952929875, "grad_norm": 1.171875, "learning_rate": 0.0004977214307868399, "loss": 5.2909, "mean_token_accuracy": 0.19646303355693817, "num_tokens": 13016969.0, "step": 5675 }, { "entropy": 5.333616399765015, "epoch": 0.5456292026897214, "grad_norm": 1.265625, "learning_rate": 0.000497716561458634, "loss": 5.2395, "mean_token_accuracy": 0.1989587128162384, "num_tokens": 13027759.0, "step": 5680 }, { "entropy": 5.4932708740234375, "epoch": 0.5461095100864554, "grad_norm": 1.0625, "learning_rate": 0.0004977116869596107, "loss": 5.4415, "mean_token_accuracy": 0.1860479310154915, "num_tokens": 13039881.0, "step": 5685 }, { "entropy": 5.399776601791382, "epoch": 0.5465898174831892, "grad_norm": 1.0625, "learning_rate": 0.0004977068072898834, "loss": 5.3041, "mean_token_accuracy": 0.18947898745536804, "num_tokens": 13051443.0, "step": 5690 }, { "entropy": 5.3822290897369385, "epoch": 0.5470701248799231, "grad_norm": 1.0859375, "learning_rate": 0.0004977019224495652, "loss": 5.3697, "mean_token_accuracy": 0.18962922990322112, "num_tokens": 13063474.0, "step": 5695 }, { "entropy": 5.307476902008057, "epoch": 0.547550432276657, "grad_norm": 1.1796875, "learning_rate": 0.0004976970324387698, "loss": 5.234, "mean_token_accuracy": 0.20077043473720552, "num_tokens": 13074365.0, "step": 5700 }, { "entropy": 5.339881372451782, "epoch": 0.548030739673391, "grad_norm": 1.0625, "learning_rate": 0.0004976921372576104, "loss": 5.3033, "mean_token_accuracy": 0.19367703795433044, "num_tokens": 13087354.0, "step": 5705 }, { "entropy": 5.32935528755188, "epoch": 0.5485110470701249, "grad_norm": 1.03125, "learning_rate": 0.0004976872369062011, "loss": 5.2787, "mean_token_accuracy": 0.19071510583162307, "num_tokens": 13099306.0, "step": 5710 }, { "entropy": 5.4302033424377445, "epoch": 0.5489913544668588, "grad_norm": 1.109375, "learning_rate": 0.0004976823313846552, "loss": 5.4164, "mean_token_accuracy": 0.19036435931921006, "num_tokens": 13111259.0, "step": 5715 }, { "entropy": 5.4693896770477295, "epoch": 0.5494716618635928, "grad_norm": 1.0625, "learning_rate": 0.0004976774206930869, "loss": 5.3256, "mean_token_accuracy": 0.18587163984775543, "num_tokens": 13123589.0, "step": 5720 }, { "entropy": 5.253912925720215, "epoch": 0.5499519692603266, "grad_norm": 1.109375, "learning_rate": 0.0004976725048316101, "loss": 5.322, "mean_token_accuracy": 0.19089159667491912, "num_tokens": 13136485.0, "step": 5725 }, { "entropy": 5.40102801322937, "epoch": 0.5504322766570605, "grad_norm": 0.98828125, "learning_rate": 0.0004976675838003388, "loss": 5.2997, "mean_token_accuracy": 0.19145811647176741, "num_tokens": 13148067.0, "step": 5730 }, { "entropy": 5.367999935150147, "epoch": 0.5509125840537944, "grad_norm": 1.171875, "learning_rate": 0.0004976626575993877, "loss": 5.2818, "mean_token_accuracy": 0.18961854726076127, "num_tokens": 13159813.0, "step": 5735 }, { "entropy": 5.410087442398071, "epoch": 0.5513928914505284, "grad_norm": 1.234375, "learning_rate": 0.0004976577262288705, "loss": 5.356, "mean_token_accuracy": 0.18928916603326798, "num_tokens": 13170828.0, "step": 5740 }, { "entropy": 5.265670728683472, "epoch": 0.5518731988472623, "grad_norm": 1.125, "learning_rate": 0.0004976527896889023, "loss": 5.181, "mean_token_accuracy": 0.20403801798820495, "num_tokens": 13181883.0, "step": 5745 }, { "entropy": 5.295314884185791, "epoch": 0.5523535062439962, "grad_norm": 1.21875, "learning_rate": 0.0004976478479795974, "loss": 5.2557, "mean_token_accuracy": 0.1949864685535431, "num_tokens": 13193530.0, "step": 5750 }, { "entropy": 5.484155082702637, "epoch": 0.55283381364073, "grad_norm": 1.1015625, "learning_rate": 0.0004976429011010706, "loss": 5.4823, "mean_token_accuracy": 0.17912757843732835, "num_tokens": 13205822.0, "step": 5755 }, { "entropy": 5.3539347648620605, "epoch": 0.553314121037464, "grad_norm": 1.0546875, "learning_rate": 0.0004976379490534366, "loss": 5.2081, "mean_token_accuracy": 0.19992550164461137, "num_tokens": 13216698.0, "step": 5760 }, { "entropy": 5.291062736511231, "epoch": 0.5537944284341979, "grad_norm": 1.0625, "learning_rate": 0.0004976329918368107, "loss": 5.2968, "mean_token_accuracy": 0.19075367897748946, "num_tokens": 13228389.0, "step": 5765 }, { "entropy": 5.433424997329712, "epoch": 0.5542747358309318, "grad_norm": 1.109375, "learning_rate": 0.0004976280294513079, "loss": 5.3505, "mean_token_accuracy": 0.18287664502859116, "num_tokens": 13239628.0, "step": 5770 }, { "entropy": 5.404953861236573, "epoch": 0.5547550432276657, "grad_norm": 1.078125, "learning_rate": 0.0004976230618970431, "loss": 5.352, "mean_token_accuracy": 0.19548004865646362, "num_tokens": 13251149.0, "step": 5775 }, { "entropy": 5.455016326904297, "epoch": 0.5552353506243997, "grad_norm": 1.0625, "learning_rate": 0.000497618089174132, "loss": 5.413, "mean_token_accuracy": 0.18660195618867875, "num_tokens": 13264846.0, "step": 5780 }, { "entropy": 5.248121690750122, "epoch": 0.5557156580211335, "grad_norm": 1.03125, "learning_rate": 0.0004976131112826898, "loss": 5.1913, "mean_token_accuracy": 0.2054605171084404, "num_tokens": 13275409.0, "step": 5785 }, { "entropy": 5.259016036987305, "epoch": 0.5561959654178674, "grad_norm": 1.03125, "learning_rate": 0.0004976081282228323, "loss": 5.1657, "mean_token_accuracy": 0.20358884781599046, "num_tokens": 13287173.0, "step": 5790 }, { "entropy": 5.411679124832153, "epoch": 0.5566762728146013, "grad_norm": 1.03125, "learning_rate": 0.000497603139994675, "loss": 5.2377, "mean_token_accuracy": 0.19680293649435043, "num_tokens": 13298225.0, "step": 5795 }, { "entropy": 5.2930761814117435, "epoch": 0.5571565802113353, "grad_norm": 1.0703125, "learning_rate": 0.0004975981465983338, "loss": 5.2468, "mean_token_accuracy": 0.19053254425525665, "num_tokens": 13309685.0, "step": 5800 }, { "entropy": 5.304633331298828, "epoch": 0.5576368876080692, "grad_norm": 1.1171875, "learning_rate": 0.0004975931480339246, "loss": 5.2554, "mean_token_accuracy": 0.19651708900928497, "num_tokens": 13320837.0, "step": 5805 }, { "entropy": 5.383905267715454, "epoch": 0.5581171950048031, "grad_norm": 1.0390625, "learning_rate": 0.0004975881443015635, "loss": 5.3718, "mean_token_accuracy": 0.19027461260557174, "num_tokens": 13333512.0, "step": 5810 }, { "entropy": 5.465289068222046, "epoch": 0.5585975024015369, "grad_norm": 1.0390625, "learning_rate": 0.0004975831354013667, "loss": 5.3829, "mean_token_accuracy": 0.19368760734796525, "num_tokens": 13345189.0, "step": 5815 }, { "entropy": 5.329316329956055, "epoch": 0.5590778097982709, "grad_norm": 1.078125, "learning_rate": 0.0004975781213334503, "loss": 5.2472, "mean_token_accuracy": 0.20152513086795806, "num_tokens": 13356123.0, "step": 5820 }, { "entropy": 5.329442405700684, "epoch": 0.5595581171950048, "grad_norm": 1.1875, "learning_rate": 0.0004975731020979309, "loss": 5.2949, "mean_token_accuracy": 0.19351785629987717, "num_tokens": 13366902.0, "step": 5825 }, { "entropy": 5.4559613227844235, "epoch": 0.5600384245917387, "grad_norm": 1.15625, "learning_rate": 0.0004975680776949249, "loss": 5.3542, "mean_token_accuracy": 0.18989898711442948, "num_tokens": 13377567.0, "step": 5830 }, { "entropy": 5.390386629104614, "epoch": 0.5605187319884726, "grad_norm": 1.0625, "learning_rate": 0.0004975630481245492, "loss": 5.2869, "mean_token_accuracy": 0.2009364992380142, "num_tokens": 13387297.0, "step": 5835 }, { "entropy": 5.348505544662475, "epoch": 0.5609990393852066, "grad_norm": 1.1796875, "learning_rate": 0.0004975580133869202, "loss": 5.3381, "mean_token_accuracy": 0.1932346299290657, "num_tokens": 13397723.0, "step": 5840 }, { "entropy": 5.408625984191895, "epoch": 0.5614793467819404, "grad_norm": 1.0625, "learning_rate": 0.0004975529734821552, "loss": 5.3863, "mean_token_accuracy": 0.18635910749435425, "num_tokens": 13409875.0, "step": 5845 }, { "entropy": 5.352054500579834, "epoch": 0.5619596541786743, "grad_norm": 1.1015625, "learning_rate": 0.0004975479284103708, "loss": 5.2921, "mean_token_accuracy": 0.1954024314880371, "num_tokens": 13421338.0, "step": 5850 }, { "entropy": 5.418287992477417, "epoch": 0.5624399615754082, "grad_norm": 1.171875, "learning_rate": 0.0004975428781716845, "loss": 5.3258, "mean_token_accuracy": 0.19152757823467254, "num_tokens": 13431373.0, "step": 5855 }, { "entropy": 5.360725784301758, "epoch": 0.5629202689721422, "grad_norm": 0.99609375, "learning_rate": 0.0004975378227662134, "loss": 5.3208, "mean_token_accuracy": 0.19721843004226686, "num_tokens": 13443158.0, "step": 5860 }, { "entropy": 5.44525113105774, "epoch": 0.5634005763688761, "grad_norm": 1.0859375, "learning_rate": 0.0004975327621940746, "loss": 5.3795, "mean_token_accuracy": 0.18757863938808442, "num_tokens": 13454559.0, "step": 5865 }, { "entropy": 5.453475904464722, "epoch": 0.56388088376561, "grad_norm": 1.0703125, "learning_rate": 0.0004975276964553861, "loss": 5.4604, "mean_token_accuracy": 0.1895272508263588, "num_tokens": 13466934.0, "step": 5870 }, { "entropy": 5.349884796142578, "epoch": 0.5643611911623438, "grad_norm": 1.3125, "learning_rate": 0.0004975226255502651, "loss": 5.2124, "mean_token_accuracy": 0.20376883447170258, "num_tokens": 13477770.0, "step": 5875 }, { "entropy": 5.428862237930298, "epoch": 0.5648414985590778, "grad_norm": 1.125, "learning_rate": 0.0004975175494788297, "loss": 5.4214, "mean_token_accuracy": 0.1833633303642273, "num_tokens": 13490093.0, "step": 5880 }, { "entropy": 5.4273130893707275, "epoch": 0.5653218059558117, "grad_norm": 1.390625, "learning_rate": 0.0004975124682411974, "loss": 5.2743, "mean_token_accuracy": 0.19006698280572892, "num_tokens": 13500663.0, "step": 5885 }, { "entropy": 5.404650068283081, "epoch": 0.5658021133525456, "grad_norm": 1.0546875, "learning_rate": 0.0004975073818374863, "loss": 5.3747, "mean_token_accuracy": 0.19194794446229935, "num_tokens": 13512369.0, "step": 5890 }, { "entropy": 5.352162408828735, "epoch": 0.5662824207492796, "grad_norm": 1.1953125, "learning_rate": 0.0004975022902678145, "loss": 5.2518, "mean_token_accuracy": 0.18981288820505143, "num_tokens": 13523181.0, "step": 5895 }, { "entropy": 5.307896852493286, "epoch": 0.5667627281460135, "grad_norm": 1.09375, "learning_rate": 0.0004974971935323003, "loss": 5.2062, "mean_token_accuracy": 0.19488532990217208, "num_tokens": 13534113.0, "step": 5900 }, { "entropy": 5.3025891304016115, "epoch": 0.5672430355427474, "grad_norm": 1.078125, "learning_rate": 0.0004974920916310619, "loss": 5.2425, "mean_token_accuracy": 0.19460777193307877, "num_tokens": 13545037.0, "step": 5905 }, { "entropy": 5.368872261047363, "epoch": 0.5677233429394812, "grad_norm": 1.109375, "learning_rate": 0.0004974869845642178, "loss": 5.2926, "mean_token_accuracy": 0.19421349167823793, "num_tokens": 13555541.0, "step": 5910 }, { "entropy": 5.389457654953003, "epoch": 0.5682036503362152, "grad_norm": 1.1875, "learning_rate": 0.0004974818723318866, "loss": 5.2973, "mean_token_accuracy": 0.19764145314693451, "num_tokens": 13566951.0, "step": 5915 }, { "entropy": 5.347638368606567, "epoch": 0.5686839577329491, "grad_norm": 1.078125, "learning_rate": 0.0004974767549341868, "loss": 5.3505, "mean_token_accuracy": 0.18888978958129882, "num_tokens": 13578492.0, "step": 5920 }, { "entropy": 5.425949621200561, "epoch": 0.569164265129683, "grad_norm": 1.1640625, "learning_rate": 0.0004974716323712376, "loss": 5.2433, "mean_token_accuracy": 0.20290264040231704, "num_tokens": 13589183.0, "step": 5925 }, { "entropy": 5.37887659072876, "epoch": 0.5696445725264169, "grad_norm": 1.140625, "learning_rate": 0.0004974665046431576, "loss": 5.3868, "mean_token_accuracy": 0.19258931577205657, "num_tokens": 13600588.0, "step": 5930 }, { "entropy": 5.309185123443603, "epoch": 0.5701248799231509, "grad_norm": 1.1015625, "learning_rate": 0.0004974613717500659, "loss": 5.2605, "mean_token_accuracy": 0.20295644104480742, "num_tokens": 13612107.0, "step": 5935 }, { "entropy": 5.485657453536987, "epoch": 0.5706051873198847, "grad_norm": 1.1640625, "learning_rate": 0.0004974562336920818, "loss": 5.4246, "mean_token_accuracy": 0.18908909112215042, "num_tokens": 13623973.0, "step": 5940 }, { "entropy": 5.3633698463439945, "epoch": 0.5710854947166186, "grad_norm": 1.03125, "learning_rate": 0.0004974510904693245, "loss": 5.2372, "mean_token_accuracy": 0.19648284167051316, "num_tokens": 13634994.0, "step": 5945 }, { "entropy": 5.412157249450684, "epoch": 0.5715658021133525, "grad_norm": 1.1171875, "learning_rate": 0.0004974459420819134, "loss": 5.3895, "mean_token_accuracy": 0.19440043568611146, "num_tokens": 13646361.0, "step": 5950 }, { "entropy": 5.36341814994812, "epoch": 0.5720461095100865, "grad_norm": 1.125, "learning_rate": 0.000497440788529968, "loss": 5.2834, "mean_token_accuracy": 0.19329349249601363, "num_tokens": 13656975.0, "step": 5955 }, { "entropy": 5.428890562057495, "epoch": 0.5725264169068204, "grad_norm": 1.0859375, "learning_rate": 0.0004974356298136081, "loss": 5.3207, "mean_token_accuracy": 0.18961571753025055, "num_tokens": 13668434.0, "step": 5960 }, { "entropy": 5.403112125396729, "epoch": 0.5730067243035543, "grad_norm": 1.1328125, "learning_rate": 0.0004974304659329533, "loss": 5.301, "mean_token_accuracy": 0.1921529397368431, "num_tokens": 13679266.0, "step": 5965 }, { "entropy": 5.291449975967407, "epoch": 0.5734870317002881, "grad_norm": 1.140625, "learning_rate": 0.0004974252968881236, "loss": 5.3247, "mean_token_accuracy": 0.18704658299684523, "num_tokens": 13690921.0, "step": 5970 }, { "entropy": 5.385117483139038, "epoch": 0.5739673390970221, "grad_norm": 1.078125, "learning_rate": 0.000497420122679239, "loss": 5.2579, "mean_token_accuracy": 0.19390686601400375, "num_tokens": 13702329.0, "step": 5975 }, { "entropy": 5.317170143127441, "epoch": 0.574447646493756, "grad_norm": 1.109375, "learning_rate": 0.0004974149433064196, "loss": 5.2295, "mean_token_accuracy": 0.20150385797023773, "num_tokens": 13713356.0, "step": 5980 }, { "entropy": 5.237676763534546, "epoch": 0.5749279538904899, "grad_norm": 1.0625, "learning_rate": 0.0004974097587697856, "loss": 5.2294, "mean_token_accuracy": 0.19473931789398194, "num_tokens": 13724718.0, "step": 5985 }, { "entropy": 5.28824028968811, "epoch": 0.5754082612872238, "grad_norm": 1.046875, "learning_rate": 0.0004974045690694575, "loss": 5.2596, "mean_token_accuracy": 0.196784345805645, "num_tokens": 13736113.0, "step": 5990 }, { "entropy": 5.417406034469605, "epoch": 0.5758885686839578, "grad_norm": 0.99609375, "learning_rate": 0.0004973993742055557, "loss": 5.272, "mean_token_accuracy": 0.19672393202781677, "num_tokens": 13748322.0, "step": 5995 }, { "entropy": 5.3009929180145265, "epoch": 0.5763688760806917, "grad_norm": 1.1328125, "learning_rate": 0.0004973941741782007, "loss": 5.2743, "mean_token_accuracy": 0.18973211497068404, "num_tokens": 13759433.0, "step": 6000 }, { "epoch": 0.5763688760806917, "eval_entropy": 5.216975544093005, "eval_loss": 5.320178508758545, "eval_mean_token_accuracy": 0.1993778554485636, "eval_num_tokens": 13759433.0, "eval_runtime": 27.3927, "eval_samples_per_second": 1197.949, "eval_steps_per_second": 149.748, "step": 6000 }, { "entropy": 5.300173044204712, "epoch": 0.5768491834774255, "grad_norm": 1.125, "learning_rate": 0.0004973889689875135, "loss": 5.195, "mean_token_accuracy": 0.1984873592853546, "num_tokens": 13770181.0, "step": 6005 }, { "entropy": 5.272101497650146, "epoch": 0.5773294908741594, "grad_norm": 1.1015625, "learning_rate": 0.0004973837586336147, "loss": 5.2443, "mean_token_accuracy": 0.19233150780200958, "num_tokens": 13781792.0, "step": 6010 }, { "entropy": 5.360331630706787, "epoch": 0.5778097982708934, "grad_norm": 1.0859375, "learning_rate": 0.0004973785431166254, "loss": 5.3034, "mean_token_accuracy": 0.1883278176188469, "num_tokens": 13792101.0, "step": 6015 }, { "entropy": 5.372050094604492, "epoch": 0.5782901056676273, "grad_norm": 1.0546875, "learning_rate": 0.0004973733224366666, "loss": 5.2927, "mean_token_accuracy": 0.19648923128843307, "num_tokens": 13803640.0, "step": 6020 }, { "entropy": 5.3110956192016605, "epoch": 0.5787704130643612, "grad_norm": 1.1484375, "learning_rate": 0.0004973680965938597, "loss": 5.2097, "mean_token_accuracy": 0.1993360698223114, "num_tokens": 13815017.0, "step": 6025 }, { "entropy": 5.383702278137207, "epoch": 0.579250720461095, "grad_norm": 1.015625, "learning_rate": 0.0004973628655883258, "loss": 5.4354, "mean_token_accuracy": 0.18790345638990402, "num_tokens": 13826119.0, "step": 6030 }, { "entropy": 5.362691211700439, "epoch": 0.579731027857829, "grad_norm": 1.1953125, "learning_rate": 0.0004973576294201865, "loss": 5.2425, "mean_token_accuracy": 0.193815678358078, "num_tokens": 13837869.0, "step": 6035 }, { "entropy": 5.532571697235108, "epoch": 0.5802113352545629, "grad_norm": 1.1015625, "learning_rate": 0.0004973523880895633, "loss": 5.4173, "mean_token_accuracy": 0.1810302734375, "num_tokens": 13849333.0, "step": 6040 }, { "entropy": 5.346681356430054, "epoch": 0.5806916426512968, "grad_norm": 1.1171875, "learning_rate": 0.0004973471415965779, "loss": 5.2732, "mean_token_accuracy": 0.19453433007001877, "num_tokens": 13860648.0, "step": 6045 }, { "entropy": 5.426644325256348, "epoch": 0.5811719500480308, "grad_norm": 1.1328125, "learning_rate": 0.000497341889941352, "loss": 5.4282, "mean_token_accuracy": 0.18453214317560196, "num_tokens": 13872979.0, "step": 6050 }, { "entropy": 5.398784351348877, "epoch": 0.5816522574447647, "grad_norm": 1.0859375, "learning_rate": 0.0004973366331240078, "loss": 5.2973, "mean_token_accuracy": 0.19479347318410872, "num_tokens": 13884363.0, "step": 6055 }, { "entropy": 5.369167709350586, "epoch": 0.5821325648414986, "grad_norm": 1.0390625, "learning_rate": 0.000497331371144667, "loss": 5.2897, "mean_token_accuracy": 0.1914879024028778, "num_tokens": 13895424.0, "step": 6060 }, { "entropy": 5.348582983016968, "epoch": 0.5826128722382324, "grad_norm": 0.98828125, "learning_rate": 0.0004973261040034521, "loss": 5.4136, "mean_token_accuracy": 0.1861998423933983, "num_tokens": 13907319.0, "step": 6065 }, { "entropy": 5.298081350326538, "epoch": 0.5830931796349664, "grad_norm": 1.046875, "learning_rate": 0.0004973208317004852, "loss": 5.2514, "mean_token_accuracy": 0.19497257471084595, "num_tokens": 13920013.0, "step": 6070 }, { "entropy": 5.347387409210205, "epoch": 0.5835734870317003, "grad_norm": 1.140625, "learning_rate": 0.0004973155542358889, "loss": 5.2194, "mean_token_accuracy": 0.19683697521686555, "num_tokens": 13932033.0, "step": 6075 }, { "entropy": 5.275155830383301, "epoch": 0.5840537944284342, "grad_norm": 1.140625, "learning_rate": 0.0004973102716097853, "loss": 5.2393, "mean_token_accuracy": 0.19724634289741516, "num_tokens": 13943324.0, "step": 6080 }, { "entropy": 5.339147853851318, "epoch": 0.5845341018251681, "grad_norm": 1.0703125, "learning_rate": 0.0004973049838222973, "loss": 5.3067, "mean_token_accuracy": 0.19921963214874266, "num_tokens": 13954291.0, "step": 6085 }, { "entropy": 5.465323781967163, "epoch": 0.5850144092219021, "grad_norm": 1.1875, "learning_rate": 0.0004972996908735479, "loss": 5.414, "mean_token_accuracy": 0.1826832190155983, "num_tokens": 13966264.0, "step": 6090 }, { "entropy": 5.453038024902344, "epoch": 0.585494716618636, "grad_norm": 1.09375, "learning_rate": 0.0004972943927636597, "loss": 5.2937, "mean_token_accuracy": 0.18843238651752472, "num_tokens": 13977785.0, "step": 6095 }, { "entropy": 5.289157247543335, "epoch": 0.5859750240153698, "grad_norm": 1.1640625, "learning_rate": 0.0004972890894927558, "loss": 5.2774, "mean_token_accuracy": 0.1957810938358307, "num_tokens": 13989704.0, "step": 6100 }, { "entropy": 5.3781982421875, "epoch": 0.5864553314121037, "grad_norm": 1.0625, "learning_rate": 0.0004972837810609592, "loss": 5.2735, "mean_token_accuracy": 0.1950765624642372, "num_tokens": 14000565.0, "step": 6105 }, { "entropy": 5.25339150428772, "epoch": 0.5869356388088377, "grad_norm": 1.1171875, "learning_rate": 0.0004972784674683933, "loss": 5.1817, "mean_token_accuracy": 0.20146536976099014, "num_tokens": 14011521.0, "step": 6110 }, { "entropy": 5.334528207778931, "epoch": 0.5874159462055716, "grad_norm": 1.078125, "learning_rate": 0.0004972731487151815, "loss": 5.2246, "mean_token_accuracy": 0.20138936042785643, "num_tokens": 14022966.0, "step": 6115 }, { "entropy": 5.453242635726928, "epoch": 0.5878962536023055, "grad_norm": 1.0859375, "learning_rate": 0.0004972678248014471, "loss": 5.3627, "mean_token_accuracy": 0.1907268077135086, "num_tokens": 14034905.0, "step": 6120 }, { "entropy": 5.245919466018677, "epoch": 0.5883765609990393, "grad_norm": 1.0390625, "learning_rate": 0.0004972624957273139, "loss": 5.2067, "mean_token_accuracy": 0.1968609645962715, "num_tokens": 14045816.0, "step": 6125 }, { "entropy": 5.300683164596558, "epoch": 0.5888568683957733, "grad_norm": 1.1328125, "learning_rate": 0.0004972571614929055, "loss": 5.1877, "mean_token_accuracy": 0.1941026657819748, "num_tokens": 14057316.0, "step": 6130 }, { "entropy": 5.375409936904907, "epoch": 0.5893371757925072, "grad_norm": 1.2265625, "learning_rate": 0.0004972518220983457, "loss": 5.308, "mean_token_accuracy": 0.18491660058498383, "num_tokens": 14067542.0, "step": 6135 }, { "entropy": 5.255684518814087, "epoch": 0.5898174831892411, "grad_norm": 1.078125, "learning_rate": 0.0004972464775437586, "loss": 5.1798, "mean_token_accuracy": 0.19628757536411284, "num_tokens": 14079467.0, "step": 6140 }, { "entropy": 5.297601890563965, "epoch": 0.590297790585975, "grad_norm": 1.1875, "learning_rate": 0.0004972411278292683, "loss": 5.1377, "mean_token_accuracy": 0.2022266536951065, "num_tokens": 14090695.0, "step": 6145 }, { "entropy": 5.310152339935303, "epoch": 0.590778097982709, "grad_norm": 1.296875, "learning_rate": 0.0004972357729549988, "loss": 5.1883, "mean_token_accuracy": 0.20686898082494737, "num_tokens": 14101380.0, "step": 6150 }, { "entropy": 5.255633974075318, "epoch": 0.5912584053794429, "grad_norm": 1.03125, "learning_rate": 0.0004972304129210746, "loss": 5.2317, "mean_token_accuracy": 0.19752228856086732, "num_tokens": 14112265.0, "step": 6155 }, { "entropy": 5.3430475234985355, "epoch": 0.5917387127761767, "grad_norm": 1.0078125, "learning_rate": 0.0004972250477276202, "loss": 5.2263, "mean_token_accuracy": 0.19471232295036317, "num_tokens": 14124294.0, "step": 6160 }, { "entropy": 5.320528650283814, "epoch": 0.5922190201729106, "grad_norm": 1.0390625, "learning_rate": 0.0004972196773747599, "loss": 5.2509, "mean_token_accuracy": 0.1957827016711235, "num_tokens": 14135230.0, "step": 6165 }, { "entropy": 5.363646554946899, "epoch": 0.5926993275696446, "grad_norm": 1.1484375, "learning_rate": 0.0004972143018626186, "loss": 5.3444, "mean_token_accuracy": 0.19247063994407654, "num_tokens": 14147576.0, "step": 6170 }, { "entropy": 5.2979090213775635, "epoch": 0.5931796349663785, "grad_norm": 1.2734375, "learning_rate": 0.0004972089211913211, "loss": 5.2239, "mean_token_accuracy": 0.20307936817407607, "num_tokens": 14158834.0, "step": 6175 }, { "entropy": 5.386188983917236, "epoch": 0.5936599423631124, "grad_norm": 1.078125, "learning_rate": 0.0004972035353609923, "loss": 5.3616, "mean_token_accuracy": 0.1897743433713913, "num_tokens": 14170694.0, "step": 6180 }, { "entropy": 5.350439167022705, "epoch": 0.5941402497598463, "grad_norm": 1.0625, "learning_rate": 0.0004971981443717572, "loss": 5.275, "mean_token_accuracy": 0.19451749473810195, "num_tokens": 14183184.0, "step": 6185 }, { "entropy": 5.413315868377685, "epoch": 0.5946205571565802, "grad_norm": 1.2890625, "learning_rate": 0.0004971927482237409, "loss": 5.3614, "mean_token_accuracy": 0.18740272670984268, "num_tokens": 14194761.0, "step": 6190 }, { "entropy": 5.309095239639282, "epoch": 0.5951008645533141, "grad_norm": 1.375, "learning_rate": 0.0004971873469170689, "loss": 5.1715, "mean_token_accuracy": 0.19937018156051636, "num_tokens": 14205820.0, "step": 6195 }, { "entropy": 5.198407316207886, "epoch": 0.595581171950048, "grad_norm": 1.2265625, "learning_rate": 0.0004971819404518664, "loss": 5.1875, "mean_token_accuracy": 0.20422977358102798, "num_tokens": 14217826.0, "step": 6200 }, { "entropy": 5.258822679519653, "epoch": 0.5960614793467819, "grad_norm": 1.0234375, "learning_rate": 0.000497176528828259, "loss": 5.1531, "mean_token_accuracy": 0.20904283672571183, "num_tokens": 14228632.0, "step": 6205 }, { "entropy": 5.395007658004761, "epoch": 0.5965417867435159, "grad_norm": 1.0078125, "learning_rate": 0.0004971711120463722, "loss": 5.3763, "mean_token_accuracy": 0.1843624085187912, "num_tokens": 14240231.0, "step": 6210 }, { "entropy": 5.421968078613281, "epoch": 0.5970220941402498, "grad_norm": 1.140625, "learning_rate": 0.000497165690106332, "loss": 5.2243, "mean_token_accuracy": 0.19937607198953627, "num_tokens": 14251718.0, "step": 6215 }, { "entropy": 5.301095485687256, "epoch": 0.5975024015369836, "grad_norm": 1.1953125, "learning_rate": 0.0004971602630082642, "loss": 5.2678, "mean_token_accuracy": 0.20054133832454682, "num_tokens": 14263129.0, "step": 6220 }, { "entropy": 5.272244215011597, "epoch": 0.5979827089337176, "grad_norm": 1.3984375, "learning_rate": 0.0004971548307522947, "loss": 5.1874, "mean_token_accuracy": 0.195827853679657, "num_tokens": 14274465.0, "step": 6225 }, { "entropy": 5.465359544754028, "epoch": 0.5984630163304515, "grad_norm": 1.0859375, "learning_rate": 0.0004971493933385498, "loss": 5.3672, "mean_token_accuracy": 0.19250321239233018, "num_tokens": 14286190.0, "step": 6230 }, { "entropy": 5.372733306884766, "epoch": 0.5989433237271854, "grad_norm": 1.0703125, "learning_rate": 0.0004971439507671556, "loss": 5.3155, "mean_token_accuracy": 0.1895249903202057, "num_tokens": 14297638.0, "step": 6235 }, { "entropy": 5.272776174545288, "epoch": 0.5994236311239193, "grad_norm": 1.0859375, "learning_rate": 0.0004971385030382384, "loss": 5.261, "mean_token_accuracy": 0.19516938775777817, "num_tokens": 14309773.0, "step": 6240 }, { "entropy": 5.37848744392395, "epoch": 0.5999039385206533, "grad_norm": 1.2265625, "learning_rate": 0.0004971330501519248, "loss": 5.2141, "mean_token_accuracy": 0.1981222003698349, "num_tokens": 14320543.0, "step": 6245 }, { "entropy": 5.3494123935699465, "epoch": 0.6003842459173871, "grad_norm": 1.1328125, "learning_rate": 0.0004971275921083414, "loss": 5.2969, "mean_token_accuracy": 0.1915585696697235, "num_tokens": 14332076.0, "step": 6250 }, { "entropy": 5.2921350479125975, "epoch": 0.600864553314121, "grad_norm": 1.15625, "learning_rate": 0.000497122128907615, "loss": 5.2163, "mean_token_accuracy": 0.19611912965774536, "num_tokens": 14343768.0, "step": 6255 }, { "entropy": 5.304634475708008, "epoch": 0.6013448607108549, "grad_norm": 1.03125, "learning_rate": 0.0004971166605498722, "loss": 5.259, "mean_token_accuracy": 0.1918262854218483, "num_tokens": 14357051.0, "step": 6260 }, { "entropy": 5.361700868606567, "epoch": 0.6018251681075889, "grad_norm": 1.140625, "learning_rate": 0.0004971111870352401, "loss": 5.2776, "mean_token_accuracy": 0.19047526866197587, "num_tokens": 14368800.0, "step": 6265 }, { "entropy": 5.3359825134277346, "epoch": 0.6023054755043228, "grad_norm": 1.1875, "learning_rate": 0.0004971057083638458, "loss": 5.2601, "mean_token_accuracy": 0.19838642477989196, "num_tokens": 14379617.0, "step": 6270 }, { "entropy": 5.304495000839234, "epoch": 0.6027857829010567, "grad_norm": 1.0078125, "learning_rate": 0.0004971002245358166, "loss": 5.2402, "mean_token_accuracy": 0.19964935183525084, "num_tokens": 14391454.0, "step": 6275 }, { "entropy": 5.369532489776612, "epoch": 0.6032660902977905, "grad_norm": 1.09375, "learning_rate": 0.0004970947355512795, "loss": 5.2397, "mean_token_accuracy": 0.20377653539180757, "num_tokens": 14402379.0, "step": 6280 }, { "entropy": 5.291622447967529, "epoch": 0.6037463976945245, "grad_norm": 1.1328125, "learning_rate": 0.0004970892414103622, "loss": 5.2178, "mean_token_accuracy": 0.19688135832548143, "num_tokens": 14415040.0, "step": 6285 }, { "entropy": 5.331115865707398, "epoch": 0.6042267050912584, "grad_norm": 1.125, "learning_rate": 0.0004970837421131921, "loss": 5.3402, "mean_token_accuracy": 0.18655368387699128, "num_tokens": 14426677.0, "step": 6290 }, { "entropy": 5.365583896636963, "epoch": 0.6047070124879923, "grad_norm": 1.125, "learning_rate": 0.0004970782376598972, "loss": 5.2407, "mean_token_accuracy": 0.19816339612007142, "num_tokens": 14436676.0, "step": 6295 }, { "entropy": 5.344175672531128, "epoch": 0.6051873198847262, "grad_norm": 1.15625, "learning_rate": 0.0004970727280506048, "loss": 5.2782, "mean_token_accuracy": 0.19335077702999115, "num_tokens": 14448294.0, "step": 6300 }, { "entropy": 5.324075174331665, "epoch": 0.6056676272814602, "grad_norm": 1.1796875, "learning_rate": 0.0004970672132854431, "loss": 5.2542, "mean_token_accuracy": 0.19286826699972154, "num_tokens": 14460642.0, "step": 6305 }, { "entropy": 5.305904293060303, "epoch": 0.6061479346781941, "grad_norm": 1.0, "learning_rate": 0.0004970616933645403, "loss": 5.1445, "mean_token_accuracy": 0.20802572518587112, "num_tokens": 14471370.0, "step": 6310 }, { "entropy": 5.404938650131226, "epoch": 0.6066282420749279, "grad_norm": 1.171875, "learning_rate": 0.0004970561682880242, "loss": 5.3134, "mean_token_accuracy": 0.19435038715600966, "num_tokens": 14482358.0, "step": 6315 }, { "entropy": 5.315574312210083, "epoch": 0.6071085494716618, "grad_norm": 1.1015625, "learning_rate": 0.0004970506380560233, "loss": 5.3291, "mean_token_accuracy": 0.1953802764415741, "num_tokens": 14494413.0, "step": 6320 }, { "entropy": 5.382768726348877, "epoch": 0.6075888568683958, "grad_norm": 1.15625, "learning_rate": 0.0004970451026686659, "loss": 5.2398, "mean_token_accuracy": 0.203892120718956, "num_tokens": 14506370.0, "step": 6325 }, { "entropy": 5.3352419376373295, "epoch": 0.6080691642651297, "grad_norm": 1.1328125, "learning_rate": 0.0004970395621260806, "loss": 5.2322, "mean_token_accuracy": 0.19640335738658904, "num_tokens": 14517198.0, "step": 6330 }, { "entropy": 5.415618896484375, "epoch": 0.6085494716618636, "grad_norm": 1.1328125, "learning_rate": 0.000497034016428396, "loss": 5.3509, "mean_token_accuracy": 0.18669991344213485, "num_tokens": 14529434.0, "step": 6335 }, { "entropy": 5.267712688446045, "epoch": 0.6090297790585975, "grad_norm": 1.09375, "learning_rate": 0.0004970284655757409, "loss": 5.234, "mean_token_accuracy": 0.1992996484041214, "num_tokens": 14540857.0, "step": 6340 }, { "entropy": 5.300823831558228, "epoch": 0.6095100864553314, "grad_norm": 1.09375, "learning_rate": 0.0004970229095682439, "loss": 5.2642, "mean_token_accuracy": 0.1999596104025841, "num_tokens": 14552594.0, "step": 6345 }, { "entropy": 5.428168153762817, "epoch": 0.6099903938520653, "grad_norm": 1.046875, "learning_rate": 0.0004970173484060344, "loss": 5.3089, "mean_token_accuracy": 0.18913527578115463, "num_tokens": 14563599.0, "step": 6350 }, { "entropy": 5.352987909317017, "epoch": 0.6104707012487992, "grad_norm": 1.078125, "learning_rate": 0.0004970117820892414, "loss": 5.2652, "mean_token_accuracy": 0.19545669108629227, "num_tokens": 14575905.0, "step": 6355 }, { "entropy": 5.381795263290405, "epoch": 0.6109510086455331, "grad_norm": 1.0625, "learning_rate": 0.0004970062106179939, "loss": 5.281, "mean_token_accuracy": 0.1997828796505928, "num_tokens": 14587800.0, "step": 6360 }, { "entropy": 5.3206565380096436, "epoch": 0.6114313160422671, "grad_norm": 1.0546875, "learning_rate": 0.0004970006339924214, "loss": 5.2193, "mean_token_accuracy": 0.20122889876365663, "num_tokens": 14600654.0, "step": 6365 }, { "entropy": 5.323697805404663, "epoch": 0.611911623439001, "grad_norm": 1.2578125, "learning_rate": 0.0004969950522126534, "loss": 5.2165, "mean_token_accuracy": 0.1956930086016655, "num_tokens": 14611985.0, "step": 6370 }, { "entropy": 5.253861331939698, "epoch": 0.6123919308357348, "grad_norm": 1.1171875, "learning_rate": 0.0004969894652788196, "loss": 5.2112, "mean_token_accuracy": 0.19875800609588623, "num_tokens": 14625004.0, "step": 6375 }, { "entropy": 5.403214597702027, "epoch": 0.6128722382324687, "grad_norm": 1.1796875, "learning_rate": 0.0004969838731910494, "loss": 5.2834, "mean_token_accuracy": 0.19371581822633743, "num_tokens": 14635381.0, "step": 6380 }, { "entropy": 5.254595232009888, "epoch": 0.6133525456292027, "grad_norm": 1.109375, "learning_rate": 0.0004969782759494729, "loss": 5.2267, "mean_token_accuracy": 0.2021428868174553, "num_tokens": 14646582.0, "step": 6385 }, { "entropy": 5.305480861663819, "epoch": 0.6138328530259366, "grad_norm": 1.0390625, "learning_rate": 0.00049697267355422, "loss": 5.2809, "mean_token_accuracy": 0.19987702816724778, "num_tokens": 14658024.0, "step": 6390 }, { "entropy": 5.461784887313843, "epoch": 0.6143131604226705, "grad_norm": 1.0859375, "learning_rate": 0.0004969670660054208, "loss": 5.3349, "mean_token_accuracy": 0.18528691679239273, "num_tokens": 14669933.0, "step": 6395 }, { "entropy": 5.338407850265503, "epoch": 0.6147934678194045, "grad_norm": 1.109375, "learning_rate": 0.0004969614533032054, "loss": 5.2732, "mean_token_accuracy": 0.19340415596961974, "num_tokens": 14681331.0, "step": 6400 }, { "entropy": 5.253151512145996, "epoch": 0.6152737752161384, "grad_norm": 1.0546875, "learning_rate": 0.0004969558354477041, "loss": 5.1417, "mean_token_accuracy": 0.20471659004688264, "num_tokens": 14691361.0, "step": 6405 }, { "entropy": 5.294320201873779, "epoch": 0.6157540826128722, "grad_norm": 1.171875, "learning_rate": 0.0004969502124390474, "loss": 5.2437, "mean_token_accuracy": 0.19678280949592591, "num_tokens": 14701791.0, "step": 6410 }, { "entropy": 5.343457746505737, "epoch": 0.6162343900096061, "grad_norm": 1.1328125, "learning_rate": 0.0004969445842773658, "loss": 5.2944, "mean_token_accuracy": 0.1905987396836281, "num_tokens": 14713672.0, "step": 6415 }, { "entropy": 5.280415821075439, "epoch": 0.6167146974063401, "grad_norm": 1.1328125, "learning_rate": 0.00049693895096279, "loss": 5.134, "mean_token_accuracy": 0.20532522946596146, "num_tokens": 14724789.0, "step": 6420 }, { "entropy": 5.300199031829834, "epoch": 0.617195004803074, "grad_norm": 1.0859375, "learning_rate": 0.0004969333124954508, "loss": 5.2255, "mean_token_accuracy": 0.19219568222761155, "num_tokens": 14737212.0, "step": 6425 }, { "entropy": 5.260709667205811, "epoch": 0.6176753121998079, "grad_norm": 1.015625, "learning_rate": 0.0004969276688754791, "loss": 5.2243, "mean_token_accuracy": 0.19760479480028154, "num_tokens": 14748387.0, "step": 6430 }, { "entropy": 5.331292247772216, "epoch": 0.6181556195965417, "grad_norm": 1.109375, "learning_rate": 0.0004969220201030059, "loss": 5.2515, "mean_token_accuracy": 0.19721336513757706, "num_tokens": 14758477.0, "step": 6435 }, { "entropy": 5.393822145462036, "epoch": 0.6186359269932757, "grad_norm": 1.3046875, "learning_rate": 0.0004969163661781624, "loss": 5.3479, "mean_token_accuracy": 0.19438280314207076, "num_tokens": 14769650.0, "step": 6440 }, { "entropy": 5.244111680984497, "epoch": 0.6191162343900096, "grad_norm": 1.21875, "learning_rate": 0.0004969107071010798, "loss": 5.1141, "mean_token_accuracy": 0.2060042515397072, "num_tokens": 14780988.0, "step": 6445 }, { "entropy": 5.328417253494263, "epoch": 0.6195965417867435, "grad_norm": 1.03125, "learning_rate": 0.0004969050428718895, "loss": 5.2141, "mean_token_accuracy": 0.20170180201530458, "num_tokens": 14792458.0, "step": 6450 }, { "entropy": 5.370841217041016, "epoch": 0.6200768491834774, "grad_norm": 1.2890625, "learning_rate": 0.000496899373490723, "loss": 5.302, "mean_token_accuracy": 0.19348903000354767, "num_tokens": 14804772.0, "step": 6455 }, { "entropy": 5.38532247543335, "epoch": 0.6205571565802114, "grad_norm": 1.1796875, "learning_rate": 0.000496893698957712, "loss": 5.2386, "mean_token_accuracy": 0.19131766855716706, "num_tokens": 14816126.0, "step": 6460 }, { "entropy": 5.34750804901123, "epoch": 0.6210374639769453, "grad_norm": 1.15625, "learning_rate": 0.0004968880192729882, "loss": 5.3551, "mean_token_accuracy": 0.19073225259780885, "num_tokens": 14829376.0, "step": 6465 }, { "entropy": 5.3107569217681885, "epoch": 0.6215177713736791, "grad_norm": 1.109375, "learning_rate": 0.0004968823344366835, "loss": 5.1864, "mean_token_accuracy": 0.20422402322292327, "num_tokens": 14841098.0, "step": 6470 }, { "entropy": 5.321711921691895, "epoch": 0.621998078770413, "grad_norm": 1.171875, "learning_rate": 0.0004968766444489298, "loss": 5.1995, "mean_token_accuracy": 0.19894031435251236, "num_tokens": 14852690.0, "step": 6475 }, { "entropy": 5.283687448501587, "epoch": 0.622478386167147, "grad_norm": 1.171875, "learning_rate": 0.0004968709493098593, "loss": 5.1839, "mean_token_accuracy": 0.199807707965374, "num_tokens": 14863327.0, "step": 6480 }, { "entropy": 5.36080002784729, "epoch": 0.6229586935638809, "grad_norm": 1.0390625, "learning_rate": 0.0004968652490196041, "loss": 5.3413, "mean_token_accuracy": 0.19556114822626114, "num_tokens": 14875213.0, "step": 6485 }, { "entropy": 5.360726022720337, "epoch": 0.6234390009606148, "grad_norm": 1.0859375, "learning_rate": 0.0004968595435782967, "loss": 5.26, "mean_token_accuracy": 0.20531406849622727, "num_tokens": 14886129.0, "step": 6490 }, { "entropy": 5.433628988265991, "epoch": 0.6239193083573487, "grad_norm": 1.1953125, "learning_rate": 0.0004968538329860695, "loss": 5.3958, "mean_token_accuracy": 0.18427062630653382, "num_tokens": 14897217.0, "step": 6495 }, { "entropy": 5.34465217590332, "epoch": 0.6243996157540826, "grad_norm": 1.234375, "learning_rate": 0.0004968481172430549, "loss": 5.304, "mean_token_accuracy": 0.19546790570020675, "num_tokens": 14908438.0, "step": 6500 }, { "entropy": 5.292671966552734, "epoch": 0.6248799231508165, "grad_norm": 1.09375, "learning_rate": 0.000496842396349386, "loss": 5.1776, "mean_token_accuracy": 0.1998446449637413, "num_tokens": 14920472.0, "step": 6505 }, { "entropy": 5.301685905456543, "epoch": 0.6253602305475504, "grad_norm": 1.234375, "learning_rate": 0.0004968366703051952, "loss": 5.2294, "mean_token_accuracy": 0.1975826621055603, "num_tokens": 14932075.0, "step": 6510 }, { "entropy": 5.327823829650879, "epoch": 0.6258405379442843, "grad_norm": 1.0078125, "learning_rate": 0.0004968309391106157, "loss": 5.2856, "mean_token_accuracy": 0.1950155645608902, "num_tokens": 14942971.0, "step": 6515 }, { "entropy": 5.307694292068481, "epoch": 0.6263208453410183, "grad_norm": 1.015625, "learning_rate": 0.0004968252027657806, "loss": 5.1169, "mean_token_accuracy": 0.20416004061698914, "num_tokens": 14954288.0, "step": 6520 }, { "entropy": 5.208348560333252, "epoch": 0.6268011527377522, "grad_norm": 1.046875, "learning_rate": 0.0004968194612708229, "loss": 5.1838, "mean_token_accuracy": 0.19983574450016023, "num_tokens": 14966017.0, "step": 6525 }, { "entropy": 5.391433906555176, "epoch": 0.627281460134486, "grad_norm": 1.0234375, "learning_rate": 0.0004968137146258759, "loss": 5.268, "mean_token_accuracy": 0.19897303581237794, "num_tokens": 14978022.0, "step": 6530 }, { "entropy": 5.215319442749023, "epoch": 0.6277617675312199, "grad_norm": 1.0859375, "learning_rate": 0.0004968079628310732, "loss": 5.178, "mean_token_accuracy": 0.20022727847099303, "num_tokens": 14990370.0, "step": 6535 }, { "entropy": 5.197961759567261, "epoch": 0.6282420749279539, "grad_norm": 1.1796875, "learning_rate": 0.0004968022058865482, "loss": 5.2013, "mean_token_accuracy": 0.20392390042543412, "num_tokens": 15001535.0, "step": 6540 }, { "entropy": 5.426657629013062, "epoch": 0.6287223823246878, "grad_norm": 1.1171875, "learning_rate": 0.0004967964437924346, "loss": 5.2742, "mean_token_accuracy": 0.19859713315963745, "num_tokens": 15012560.0, "step": 6545 }, { "entropy": 5.225249814987182, "epoch": 0.6292026897214217, "grad_norm": 1.1796875, "learning_rate": 0.0004967906765488662, "loss": 5.1805, "mean_token_accuracy": 0.20398483723402022, "num_tokens": 15024594.0, "step": 6550 }, { "entropy": 5.287734794616699, "epoch": 0.6296829971181557, "grad_norm": 1.1484375, "learning_rate": 0.0004967849041559769, "loss": 5.209, "mean_token_accuracy": 0.19682117998600007, "num_tokens": 15034735.0, "step": 6555 }, { "entropy": 5.371293401718139, "epoch": 0.6301633045148896, "grad_norm": 1.1484375, "learning_rate": 0.0004967791266139006, "loss": 5.3218, "mean_token_accuracy": 0.192607519030571, "num_tokens": 15046955.0, "step": 6560 }, { "entropy": 5.280347061157227, "epoch": 0.6306436119116234, "grad_norm": 1.125, "learning_rate": 0.0004967733439227716, "loss": 5.181, "mean_token_accuracy": 0.20368633568286895, "num_tokens": 15058250.0, "step": 6565 }, { "entropy": 5.304067659378052, "epoch": 0.6311239193083573, "grad_norm": 1.0703125, "learning_rate": 0.000496767556082724, "loss": 5.3101, "mean_token_accuracy": 0.19713823050260543, "num_tokens": 15069451.0, "step": 6570 }, { "entropy": 5.3541919708251955, "epoch": 0.6316042267050913, "grad_norm": 1.0859375, "learning_rate": 0.0004967617630938924, "loss": 5.2624, "mean_token_accuracy": 0.20069709718227385, "num_tokens": 15080920.0, "step": 6575 }, { "entropy": 5.265263462066651, "epoch": 0.6320845341018252, "grad_norm": 1.09375, "learning_rate": 0.000496755964956411, "loss": 5.1853, "mean_token_accuracy": 0.195594023168087, "num_tokens": 15091579.0, "step": 6580 }, { "entropy": 5.33905520439148, "epoch": 0.6325648414985591, "grad_norm": 1.171875, "learning_rate": 0.0004967501616704147, "loss": 5.3058, "mean_token_accuracy": 0.19578456729650498, "num_tokens": 15103097.0, "step": 6585 }, { "entropy": 5.338488435745239, "epoch": 0.633045148895293, "grad_norm": 1.0859375, "learning_rate": 0.000496744353236038, "loss": 5.2011, "mean_token_accuracy": 0.1994476333260536, "num_tokens": 15114646.0, "step": 6590 }, { "entropy": 5.396579837799072, "epoch": 0.633525456292027, "grad_norm": 1.1015625, "learning_rate": 0.000496738539653416, "loss": 5.2462, "mean_token_accuracy": 0.19710550010204314, "num_tokens": 15126019.0, "step": 6595 }, { "entropy": 5.320884847640992, "epoch": 0.6340057636887608, "grad_norm": 1.171875, "learning_rate": 0.0004967327209226835, "loss": 5.2417, "mean_token_accuracy": 0.20047877579927445, "num_tokens": 15136132.0, "step": 6600 }, { "entropy": 5.328518581390381, "epoch": 0.6344860710854947, "grad_norm": 1.0390625, "learning_rate": 0.0004967268970439755, "loss": 5.3295, "mean_token_accuracy": 0.19120048433542253, "num_tokens": 15149016.0, "step": 6605 }, { "entropy": 5.390049648284912, "epoch": 0.6349663784822286, "grad_norm": 1.1328125, "learning_rate": 0.0004967210680174274, "loss": 5.2831, "mean_token_accuracy": 0.1991571456193924, "num_tokens": 15160302.0, "step": 6610 }, { "entropy": 5.326699829101562, "epoch": 0.6354466858789626, "grad_norm": 1.1640625, "learning_rate": 0.0004967152338431745, "loss": 5.266, "mean_token_accuracy": 0.19298024773597716, "num_tokens": 15171770.0, "step": 6615 }, { "entropy": 5.307769346237182, "epoch": 0.6359269932756965, "grad_norm": 1.34375, "learning_rate": 0.0004967093945213523, "loss": 5.1512, "mean_token_accuracy": 0.20107036381959914, "num_tokens": 15182283.0, "step": 6620 }, { "entropy": 5.251153898239136, "epoch": 0.6364073006724303, "grad_norm": 1.078125, "learning_rate": 0.0004967035500520962, "loss": 5.2544, "mean_token_accuracy": 0.19852742105722426, "num_tokens": 15193917.0, "step": 6625 }, { "entropy": 5.439336967468262, "epoch": 0.6368876080691642, "grad_norm": 1.2734375, "learning_rate": 0.0004966977004355421, "loss": 5.3349, "mean_token_accuracy": 0.18941647708415985, "num_tokens": 15205854.0, "step": 6630 }, { "entropy": 5.270390892028809, "epoch": 0.6373679154658982, "grad_norm": 1.0390625, "learning_rate": 0.0004966918456718256, "loss": 5.1945, "mean_token_accuracy": 0.20114167034626007, "num_tokens": 15217884.0, "step": 6635 }, { "entropy": 5.367384433746338, "epoch": 0.6378482228626321, "grad_norm": 1.1875, "learning_rate": 0.0004966859857610828, "loss": 5.3235, "mean_token_accuracy": 0.19650316685438157, "num_tokens": 15229493.0, "step": 6640 }, { "entropy": 5.309024906158447, "epoch": 0.638328530259366, "grad_norm": 1.1015625, "learning_rate": 0.0004966801207034497, "loss": 5.2748, "mean_token_accuracy": 0.1954337552189827, "num_tokens": 15241402.0, "step": 6645 }, { "entropy": 5.396312856674195, "epoch": 0.6388088376560999, "grad_norm": 1.1171875, "learning_rate": 0.0004966742504990624, "loss": 5.2793, "mean_token_accuracy": 0.19446442872285843, "num_tokens": 15252981.0, "step": 6650 }, { "entropy": 5.396939516067505, "epoch": 0.6392891450528339, "grad_norm": 1.125, "learning_rate": 0.0004966683751480572, "loss": 5.2068, "mean_token_accuracy": 0.20260323137044906, "num_tokens": 15264171.0, "step": 6655 }, { "entropy": 5.215060138702393, "epoch": 0.6397694524495677, "grad_norm": 1.2890625, "learning_rate": 0.0004966624946505706, "loss": 5.2122, "mean_token_accuracy": 0.2042062520980835, "num_tokens": 15275595.0, "step": 6660 }, { "entropy": 5.32531771659851, "epoch": 0.6402497598463016, "grad_norm": 1.2578125, "learning_rate": 0.0004966566090067391, "loss": 5.284, "mean_token_accuracy": 0.19337738156318665, "num_tokens": 15286074.0, "step": 6665 }, { "entropy": 5.371769189834595, "epoch": 0.6407300672430355, "grad_norm": 1.1484375, "learning_rate": 0.0004966507182166993, "loss": 5.2436, "mean_token_accuracy": 0.19362216591835021, "num_tokens": 15296230.0, "step": 6670 }, { "entropy": 5.315968751907349, "epoch": 0.6412103746397695, "grad_norm": 1.1328125, "learning_rate": 0.000496644822280588, "loss": 5.2326, "mean_token_accuracy": 0.19759678691625596, "num_tokens": 15307428.0, "step": 6675 }, { "entropy": 5.394734859466553, "epoch": 0.6416906820365034, "grad_norm": 1.2265625, "learning_rate": 0.000496638921198542, "loss": 5.3513, "mean_token_accuracy": 0.18394145518541336, "num_tokens": 15318796.0, "step": 6680 }, { "entropy": 5.314431810379029, "epoch": 0.6421709894332372, "grad_norm": 1.109375, "learning_rate": 0.0004966330149706984, "loss": 5.2571, "mean_token_accuracy": 0.19566282480955124, "num_tokens": 15330914.0, "step": 6685 }, { "entropy": 5.29867787361145, "epoch": 0.6426512968299711, "grad_norm": 1.3984375, "learning_rate": 0.0004966271035971944, "loss": 5.2047, "mean_token_accuracy": 0.19110651910305024, "num_tokens": 15341756.0, "step": 6690 }, { "entropy": 5.369540071487426, "epoch": 0.6431316042267051, "grad_norm": 1.1640625, "learning_rate": 0.0004966211870781671, "loss": 5.2993, "mean_token_accuracy": 0.19377565532922744, "num_tokens": 15353217.0, "step": 6695 }, { "entropy": 5.348928213119507, "epoch": 0.643611911623439, "grad_norm": 1.1171875, "learning_rate": 0.000496615265413754, "loss": 5.2096, "mean_token_accuracy": 0.1918511837720871, "num_tokens": 15363813.0, "step": 6700 }, { "entropy": 5.329305982589721, "epoch": 0.6440922190201729, "grad_norm": 1.1640625, "learning_rate": 0.0004966093386040923, "loss": 5.2085, "mean_token_accuracy": 0.19547198563814164, "num_tokens": 15375223.0, "step": 6705 }, { "entropy": 5.3240186214447025, "epoch": 0.6445725264169068, "grad_norm": 1.1796875, "learning_rate": 0.00049660340664932, "loss": 5.2076, "mean_token_accuracy": 0.19727633148431778, "num_tokens": 15385523.0, "step": 6710 }, { "entropy": 5.337905216217041, "epoch": 0.6450528338136408, "grad_norm": 1.15625, "learning_rate": 0.0004965974695495746, "loss": 5.2756, "mean_token_accuracy": 0.1926635518670082, "num_tokens": 15397262.0, "step": 6715 }, { "entropy": 5.240037250518799, "epoch": 0.6455331412103746, "grad_norm": 1.0546875, "learning_rate": 0.0004965915273049938, "loss": 5.2106, "mean_token_accuracy": 0.19746471792459488, "num_tokens": 15409043.0, "step": 6720 }, { "entropy": 5.357408618927002, "epoch": 0.6460134486071085, "grad_norm": 1.171875, "learning_rate": 0.0004965855799157158, "loss": 5.2057, "mean_token_accuracy": 0.19802628457546234, "num_tokens": 15420859.0, "step": 6725 }, { "entropy": 5.3712615966796875, "epoch": 0.6464937560038425, "grad_norm": 1.1484375, "learning_rate": 0.0004965796273818787, "loss": 5.2864, "mean_token_accuracy": 0.189888796210289, "num_tokens": 15433650.0, "step": 6730 }, { "entropy": 5.233816766738892, "epoch": 0.6469740634005764, "grad_norm": 1.140625, "learning_rate": 0.0004965736697036206, "loss": 5.1634, "mean_token_accuracy": 0.2005162462592125, "num_tokens": 15445158.0, "step": 6735 }, { "entropy": 5.397826766967773, "epoch": 0.6474543707973103, "grad_norm": 0.99609375, "learning_rate": 0.0004965677068810798, "loss": 5.4198, "mean_token_accuracy": 0.18722799867391587, "num_tokens": 15456974.0, "step": 6740 }, { "entropy": 5.41676893234253, "epoch": 0.6479346781940442, "grad_norm": 1.1484375, "learning_rate": 0.0004965617389143947, "loss": 5.2898, "mean_token_accuracy": 0.19216358810663223, "num_tokens": 15467395.0, "step": 6745 }, { "entropy": 5.299721145629883, "epoch": 0.6484149855907781, "grad_norm": 1.40625, "learning_rate": 0.000496555765803704, "loss": 5.2777, "mean_token_accuracy": 0.19596335887908936, "num_tokens": 15479396.0, "step": 6750 }, { "entropy": 5.318911552429199, "epoch": 0.648895292987512, "grad_norm": 1.0859375, "learning_rate": 0.0004965497875491462, "loss": 5.2232, "mean_token_accuracy": 0.20304810851812363, "num_tokens": 15490992.0, "step": 6755 }, { "entropy": 5.438581037521362, "epoch": 0.6493756003842459, "grad_norm": 1.0859375, "learning_rate": 0.0004965438041508604, "loss": 5.3009, "mean_token_accuracy": 0.18606224209070205, "num_tokens": 15502413.0, "step": 6760 }, { "entropy": 5.337257432937622, "epoch": 0.6498559077809798, "grad_norm": 1.1953125, "learning_rate": 0.0004965378156089851, "loss": 5.2402, "mean_token_accuracy": 0.19228145480155945, "num_tokens": 15512588.0, "step": 6765 }, { "entropy": 5.320986652374268, "epoch": 0.6503362151777138, "grad_norm": 1.1328125, "learning_rate": 0.0004965318219236597, "loss": 5.2151, "mean_token_accuracy": 0.19258550703525543, "num_tokens": 15523252.0, "step": 6770 }, { "entropy": 5.275221490859986, "epoch": 0.6508165225744477, "grad_norm": 1.125, "learning_rate": 0.0004965258230950232, "loss": 5.3192, "mean_token_accuracy": 0.19502570629119872, "num_tokens": 15534975.0, "step": 6775 }, { "entropy": 5.3065108299255375, "epoch": 0.6512968299711815, "grad_norm": 1.078125, "learning_rate": 0.0004965198191232148, "loss": 5.1863, "mean_token_accuracy": 0.2026347428560257, "num_tokens": 15545709.0, "step": 6780 }, { "entropy": 5.357606697082519, "epoch": 0.6517771373679154, "grad_norm": 1.109375, "learning_rate": 0.000496513810008374, "loss": 5.2888, "mean_token_accuracy": 0.2010987401008606, "num_tokens": 15557270.0, "step": 6785 }, { "entropy": 5.383016109466553, "epoch": 0.6522574447646494, "grad_norm": 1.0078125, "learning_rate": 0.0004965077957506403, "loss": 5.2384, "mean_token_accuracy": 0.1945773482322693, "num_tokens": 15569156.0, "step": 6790 }, { "entropy": 5.228867387771606, "epoch": 0.6527377521613833, "grad_norm": 1.28125, "learning_rate": 0.0004965017763501533, "loss": 5.1821, "mean_token_accuracy": 0.20229731500148773, "num_tokens": 15579270.0, "step": 6795 }, { "entropy": 5.204952526092529, "epoch": 0.6532180595581172, "grad_norm": 1.1328125, "learning_rate": 0.0004964957518070529, "loss": 5.1436, "mean_token_accuracy": 0.2022804006934166, "num_tokens": 15589912.0, "step": 6800 }, { "entropy": 5.351508712768554, "epoch": 0.6536983669548511, "grad_norm": 1.109375, "learning_rate": 0.0004964897221214788, "loss": 5.2636, "mean_token_accuracy": 0.1962052032351494, "num_tokens": 15601088.0, "step": 6805 }, { "entropy": 5.400100564956665, "epoch": 0.654178674351585, "grad_norm": 1.15625, "learning_rate": 0.000496483687293571, "loss": 5.2467, "mean_token_accuracy": 0.19797869324684142, "num_tokens": 15612520.0, "step": 6810 }, { "entropy": 5.225302648544312, "epoch": 0.6546589817483189, "grad_norm": 1.21875, "learning_rate": 0.0004964776473234696, "loss": 5.1854, "mean_token_accuracy": 0.20556757897138594, "num_tokens": 15623991.0, "step": 6815 }, { "entropy": 5.196185064315796, "epoch": 0.6551392891450528, "grad_norm": 1.140625, "learning_rate": 0.000496471602211315, "loss": 5.11, "mean_token_accuracy": 0.2007066160440445, "num_tokens": 15635389.0, "step": 6820 }, { "entropy": 5.309195470809937, "epoch": 0.6556195965417867, "grad_norm": 1.25, "learning_rate": 0.0004964655519572475, "loss": 5.2207, "mean_token_accuracy": 0.20262846797704698, "num_tokens": 15646427.0, "step": 6825 }, { "entropy": 5.216340732574463, "epoch": 0.6560999039385207, "grad_norm": 1.2578125, "learning_rate": 0.0004964594965614072, "loss": 5.2179, "mean_token_accuracy": 0.192514306306839, "num_tokens": 15657518.0, "step": 6830 }, { "entropy": 5.357060527801513, "epoch": 0.6565802113352546, "grad_norm": 1.1015625, "learning_rate": 0.0004964534360239353, "loss": 5.2477, "mean_token_accuracy": 0.19148818552494049, "num_tokens": 15669775.0, "step": 6835 }, { "entropy": 5.3501969337463375, "epoch": 0.6570605187319885, "grad_norm": 1.0859375, "learning_rate": 0.000496447370344972, "loss": 5.236, "mean_token_accuracy": 0.18657899051904678, "num_tokens": 15682566.0, "step": 6840 }, { "entropy": 5.258563375473022, "epoch": 0.6575408261287223, "grad_norm": 1.0703125, "learning_rate": 0.0004964412995246584, "loss": 5.2279, "mean_token_accuracy": 0.19911282062530516, "num_tokens": 15693168.0, "step": 6845 }, { "entropy": 5.332230424880981, "epoch": 0.6580211335254563, "grad_norm": 1.140625, "learning_rate": 0.0004964352235631354, "loss": 5.2429, "mean_token_accuracy": 0.19526638984680175, "num_tokens": 15703879.0, "step": 6850 }, { "entropy": 5.370284652709961, "epoch": 0.6585014409221902, "grad_norm": 1.265625, "learning_rate": 0.000496429142460544, "loss": 5.2742, "mean_token_accuracy": 0.19143834859132766, "num_tokens": 15716015.0, "step": 6855 }, { "entropy": 5.361678266525269, "epoch": 0.6589817483189241, "grad_norm": 1.0703125, "learning_rate": 0.0004964230562170254, "loss": 5.2845, "mean_token_accuracy": 0.19869090169668197, "num_tokens": 15728254.0, "step": 6860 }, { "entropy": 5.305975437164307, "epoch": 0.659462055715658, "grad_norm": 1.1953125, "learning_rate": 0.0004964169648327209, "loss": 5.2485, "mean_token_accuracy": 0.1971095770597458, "num_tokens": 15738778.0, "step": 6865 }, { "entropy": 5.41188497543335, "epoch": 0.659942363112392, "grad_norm": 1.171875, "learning_rate": 0.000496410868307772, "loss": 5.2555, "mean_token_accuracy": 0.194293774664402, "num_tokens": 15750305.0, "step": 6870 }, { "entropy": 5.304081630706787, "epoch": 0.6604226705091258, "grad_norm": 1.1953125, "learning_rate": 0.0004964047666423203, "loss": 5.2242, "mean_token_accuracy": 0.19772389829158782, "num_tokens": 15761303.0, "step": 6875 }, { "entropy": 5.241645288467407, "epoch": 0.6609029779058597, "grad_norm": 1.1875, "learning_rate": 0.0004963986598365072, "loss": 5.1887, "mean_token_accuracy": 0.20886375904083251, "num_tokens": 15773095.0, "step": 6880 }, { "entropy": 5.377842140197754, "epoch": 0.6613832853025937, "grad_norm": 1.1484375, "learning_rate": 0.0004963925478904746, "loss": 5.2658, "mean_token_accuracy": 0.19911141097545623, "num_tokens": 15784405.0, "step": 6885 }, { "entropy": 5.203935384750366, "epoch": 0.6618635926993276, "grad_norm": 1.125, "learning_rate": 0.0004963864308043645, "loss": 5.198, "mean_token_accuracy": 0.19471461623907088, "num_tokens": 15795178.0, "step": 6890 }, { "entropy": 5.240695381164551, "epoch": 0.6623439000960615, "grad_norm": 1.1953125, "learning_rate": 0.0004963803085783189, "loss": 5.1989, "mean_token_accuracy": 0.20192865282297134, "num_tokens": 15806205.0, "step": 6895 }, { "entropy": 5.311961603164673, "epoch": 0.6628242074927954, "grad_norm": 1.1875, "learning_rate": 0.0004963741812124799, "loss": 5.1274, "mean_token_accuracy": 0.20400931388139726, "num_tokens": 15817474.0, "step": 6900 }, { "entropy": 5.277816581726074, "epoch": 0.6633045148895294, "grad_norm": 1.1484375, "learning_rate": 0.0004963680487069898, "loss": 5.2417, "mean_token_accuracy": 0.19115398675203324, "num_tokens": 15829728.0, "step": 6905 }, { "entropy": 5.356722640991211, "epoch": 0.6637848222862632, "grad_norm": 1.15625, "learning_rate": 0.0004963619110619908, "loss": 5.3267, "mean_token_accuracy": 0.19444778561592102, "num_tokens": 15840082.0, "step": 6910 }, { "entropy": 5.332076549530029, "epoch": 0.6642651296829971, "grad_norm": 1.234375, "learning_rate": 0.0004963557682776256, "loss": 5.2147, "mean_token_accuracy": 0.19637496322393416, "num_tokens": 15851450.0, "step": 6915 }, { "entropy": 5.394643926620484, "epoch": 0.664745437079731, "grad_norm": 1.046875, "learning_rate": 0.0004963496203540368, "loss": 5.3604, "mean_token_accuracy": 0.18928576707839967, "num_tokens": 15864168.0, "step": 6920 }, { "entropy": 5.382654428482056, "epoch": 0.665225744476465, "grad_norm": 1.0390625, "learning_rate": 0.0004963434672913671, "loss": 5.3005, "mean_token_accuracy": 0.1974198803305626, "num_tokens": 15875634.0, "step": 6925 }, { "entropy": 5.406427907943725, "epoch": 0.6657060518731989, "grad_norm": 1.1015625, "learning_rate": 0.0004963373090897592, "loss": 5.304, "mean_token_accuracy": 0.19297229051589965, "num_tokens": 15888411.0, "step": 6930 }, { "entropy": 5.341877174377442, "epoch": 0.6661863592699327, "grad_norm": 1.1328125, "learning_rate": 0.0004963311457493563, "loss": 5.2933, "mean_token_accuracy": 0.19015721529722213, "num_tokens": 15901084.0, "step": 6935 }, { "entropy": 5.429636812210083, "epoch": 0.6666666666666666, "grad_norm": 1.1328125, "learning_rate": 0.0004963249772703015, "loss": 5.3105, "mean_token_accuracy": 0.1917794793844223, "num_tokens": 15912061.0, "step": 6940 }, { "entropy": 5.352426385879516, "epoch": 0.6671469740634006, "grad_norm": 1.140625, "learning_rate": 0.0004963188036527378, "loss": 5.3612, "mean_token_accuracy": 0.1935911074280739, "num_tokens": 15925439.0, "step": 6945 }, { "entropy": 5.256782197952271, "epoch": 0.6676272814601345, "grad_norm": 1.1640625, "learning_rate": 0.0004963126248968087, "loss": 5.0929, "mean_token_accuracy": 0.2068867191672325, "num_tokens": 15937762.0, "step": 6950 }, { "entropy": 5.249649286270142, "epoch": 0.6681075888568684, "grad_norm": 1.2421875, "learning_rate": 0.0004963064410026577, "loss": 5.1521, "mean_token_accuracy": 0.20011164397001266, "num_tokens": 15948656.0, "step": 6955 }, { "entropy": 5.343927001953125, "epoch": 0.6685878962536023, "grad_norm": 1.2265625, "learning_rate": 0.0004963002519704281, "loss": 5.2221, "mean_token_accuracy": 0.19271822571754454, "num_tokens": 15960376.0, "step": 6960 }, { "entropy": 5.325286912918091, "epoch": 0.6690682036503363, "grad_norm": 1.3125, "learning_rate": 0.000496294057800264, "loss": 5.2315, "mean_token_accuracy": 0.19581420868635177, "num_tokens": 15971913.0, "step": 6965 }, { "entropy": 5.3356259822845455, "epoch": 0.6695485110470701, "grad_norm": 1.1953125, "learning_rate": 0.0004962878584923089, "loss": 5.2895, "mean_token_accuracy": 0.18817632645368576, "num_tokens": 15984775.0, "step": 6970 }, { "entropy": 5.344109296798706, "epoch": 0.670028818443804, "grad_norm": 1.140625, "learning_rate": 0.0004962816540467068, "loss": 5.2166, "mean_token_accuracy": 0.19363451302051543, "num_tokens": 15996717.0, "step": 6975 }, { "entropy": 5.3354270458221436, "epoch": 0.6705091258405379, "grad_norm": 1.3359375, "learning_rate": 0.0004962754444636017, "loss": 5.2025, "mean_token_accuracy": 0.20301510095596315, "num_tokens": 16007964.0, "step": 6980 }, { "entropy": 5.232488775253296, "epoch": 0.6709894332372719, "grad_norm": 1.1953125, "learning_rate": 0.000496269229743138, "loss": 5.2268, "mean_token_accuracy": 0.19161542654037475, "num_tokens": 16019185.0, "step": 6985 }, { "entropy": 5.335939073562622, "epoch": 0.6714697406340058, "grad_norm": 1.7265625, "learning_rate": 0.0004962630098854597, "loss": 5.2341, "mean_token_accuracy": 0.2045590490102768, "num_tokens": 16029983.0, "step": 6990 }, { "entropy": 5.36190619468689, "epoch": 0.6719500480307397, "grad_norm": 1.1015625, "learning_rate": 0.0004962567848907113, "loss": 5.1731, "mean_token_accuracy": 0.1986493170261383, "num_tokens": 16040574.0, "step": 6995 }, { "entropy": 5.364934206008911, "epoch": 0.6724303554274735, "grad_norm": 1.2265625, "learning_rate": 0.0004962505547590374, "loss": 5.3243, "mean_token_accuracy": 0.19473587423563005, "num_tokens": 16052037.0, "step": 7000 }, { "entropy": 5.329007911682129, "epoch": 0.6729106628242075, "grad_norm": 1.171875, "learning_rate": 0.0004962443194905826, "loss": 5.2961, "mean_token_accuracy": 0.19470866173505783, "num_tokens": 16063413.0, "step": 7005 }, { "entropy": 5.34464545249939, "epoch": 0.6733909702209414, "grad_norm": 1.2421875, "learning_rate": 0.0004962380790854916, "loss": 5.186, "mean_token_accuracy": 0.1919792741537094, "num_tokens": 16074373.0, "step": 7010 }, { "entropy": 5.395242977142334, "epoch": 0.6738712776176753, "grad_norm": 1.2265625, "learning_rate": 0.0004962318335439094, "loss": 5.3215, "mean_token_accuracy": 0.1902454525232315, "num_tokens": 16086575.0, "step": 7015 }, { "entropy": 5.374180316925049, "epoch": 0.6743515850144092, "grad_norm": 1.171875, "learning_rate": 0.0004962255828659809, "loss": 5.1814, "mean_token_accuracy": 0.19680528789758683, "num_tokens": 16098529.0, "step": 7020 }, { "entropy": 5.237930870056152, "epoch": 0.6748318924111432, "grad_norm": 1.203125, "learning_rate": 0.0004962193270518513, "loss": 5.2299, "mean_token_accuracy": 0.20323789566755296, "num_tokens": 16110085.0, "step": 7025 }, { "entropy": 5.248058891296386, "epoch": 0.675312199807877, "grad_norm": 1.2421875, "learning_rate": 0.0004962130661016659, "loss": 5.1142, "mean_token_accuracy": 0.20192392021417618, "num_tokens": 16120249.0, "step": 7030 }, { "entropy": 5.408255577087402, "epoch": 0.6757925072046109, "grad_norm": 1.1640625, "learning_rate": 0.0004962068000155699, "loss": 5.4028, "mean_token_accuracy": 0.18510753959417342, "num_tokens": 16132645.0, "step": 7035 }, { "entropy": 5.360714340209961, "epoch": 0.6762728146013448, "grad_norm": 1.0625, "learning_rate": 0.0004962005287937088, "loss": 5.2683, "mean_token_accuracy": 0.19801645576953888, "num_tokens": 16143808.0, "step": 7040 }, { "entropy": 5.2652663230896, "epoch": 0.6767531219980788, "grad_norm": 1.1953125, "learning_rate": 0.0004961942524362283, "loss": 5.2683, "mean_token_accuracy": 0.199874410033226, "num_tokens": 16154309.0, "step": 7045 }, { "entropy": 5.331776762008667, "epoch": 0.6772334293948127, "grad_norm": 1.359375, "learning_rate": 0.0004961879709432741, "loss": 5.2157, "mean_token_accuracy": 0.19288192838430404, "num_tokens": 16164654.0, "step": 7050 }, { "entropy": 5.368539190292358, "epoch": 0.6777137367915466, "grad_norm": 1.2890625, "learning_rate": 0.000496181684314992, "loss": 5.2722, "mean_token_accuracy": 0.1937314122915268, "num_tokens": 16177837.0, "step": 7055 }, { "entropy": 5.267057132720947, "epoch": 0.6781940441882806, "grad_norm": 1.140625, "learning_rate": 0.0004961753925515279, "loss": 5.1551, "mean_token_accuracy": 0.20363912582397461, "num_tokens": 16189073.0, "step": 7060 }, { "entropy": 5.379831266403198, "epoch": 0.6786743515850144, "grad_norm": 1.34375, "learning_rate": 0.000496169095653028, "loss": 5.3474, "mean_token_accuracy": 0.19800200462341308, "num_tokens": 16200371.0, "step": 7065 }, { "entropy": 5.303866577148438, "epoch": 0.6791546589817483, "grad_norm": 1.34375, "learning_rate": 0.0004961627936196384, "loss": 5.1378, "mean_token_accuracy": 0.19792077392339708, "num_tokens": 16210526.0, "step": 7070 }, { "entropy": 5.324541854858398, "epoch": 0.6796349663784822, "grad_norm": 1.25, "learning_rate": 0.0004961564864515055, "loss": 5.2485, "mean_token_accuracy": 0.19714123010635376, "num_tokens": 16221687.0, "step": 7075 }, { "entropy": 5.198392057418824, "epoch": 0.6801152737752162, "grad_norm": 1.1875, "learning_rate": 0.0004961501741487757, "loss": 5.1228, "mean_token_accuracy": 0.207232241332531, "num_tokens": 16233828.0, "step": 7080 }, { "entropy": 5.265508413314819, "epoch": 0.6805955811719501, "grad_norm": 1.25, "learning_rate": 0.0004961438567115955, "loss": 5.1098, "mean_token_accuracy": 0.20777646452188492, "num_tokens": 16243900.0, "step": 7085 }, { "entropy": 5.29700231552124, "epoch": 0.681075888568684, "grad_norm": 1.4921875, "learning_rate": 0.0004961375341401116, "loss": 5.2056, "mean_token_accuracy": 0.19741868525743483, "num_tokens": 16256347.0, "step": 7090 }, { "entropy": 5.292010688781739, "epoch": 0.6815561959654178, "grad_norm": 1.2734375, "learning_rate": 0.0004961312064344708, "loss": 5.1188, "mean_token_accuracy": 0.20195768475532533, "num_tokens": 16267743.0, "step": 7095 }, { "entropy": 5.1783538341522215, "epoch": 0.6820365033621518, "grad_norm": 1.3125, "learning_rate": 0.00049612487359482, "loss": 5.1529, "mean_token_accuracy": 0.20347070544958115, "num_tokens": 16278280.0, "step": 7100 }, { "entropy": 5.204392337799073, "epoch": 0.6825168107588857, "grad_norm": 1.21875, "learning_rate": 0.0004961185356213062, "loss": 5.1439, "mean_token_accuracy": 0.20720461010932922, "num_tokens": 16288568.0, "step": 7105 }, { "entropy": 5.255360317230225, "epoch": 0.6829971181556196, "grad_norm": 1.125, "learning_rate": 0.0004961121925140767, "loss": 5.2192, "mean_token_accuracy": 0.20132138431072236, "num_tokens": 16300730.0, "step": 7110 }, { "entropy": 5.277251672744751, "epoch": 0.6834774255523535, "grad_norm": 1.1328125, "learning_rate": 0.0004961058442732786, "loss": 5.2216, "mean_token_accuracy": 0.19964685887098313, "num_tokens": 16311789.0, "step": 7115 }, { "entropy": 5.32997989654541, "epoch": 0.6839577329490875, "grad_norm": 1.109375, "learning_rate": 0.0004960994908990594, "loss": 5.1993, "mean_token_accuracy": 0.1934235706925392, "num_tokens": 16324439.0, "step": 7120 }, { "entropy": 5.3754744052886965, "epoch": 0.6844380403458213, "grad_norm": 1.21875, "learning_rate": 0.0004960931323915665, "loss": 5.3411, "mean_token_accuracy": 0.20248763710260392, "num_tokens": 16335344.0, "step": 7125 }, { "entropy": 5.296399450302124, "epoch": 0.6849183477425552, "grad_norm": 1.0234375, "learning_rate": 0.0004960867687509475, "loss": 5.1806, "mean_token_accuracy": 0.2043926537036896, "num_tokens": 16349018.0, "step": 7130 }, { "entropy": 5.313053369522095, "epoch": 0.6853986551392891, "grad_norm": 1.265625, "learning_rate": 0.0004960803999773504, "loss": 5.2619, "mean_token_accuracy": 0.1975033849477768, "num_tokens": 16360137.0, "step": 7135 }, { "entropy": 5.3848051071167, "epoch": 0.6858789625360231, "grad_norm": 1.25, "learning_rate": 0.0004960740260709228, "loss": 5.2692, "mean_token_accuracy": 0.19346715658903121, "num_tokens": 16372277.0, "step": 7140 }, { "entropy": 5.36100664138794, "epoch": 0.686359269932757, "grad_norm": 1.2265625, "learning_rate": 0.0004960676470318128, "loss": 5.2727, "mean_token_accuracy": 0.19226655662059783, "num_tokens": 16383440.0, "step": 7145 }, { "entropy": 5.314020681381225, "epoch": 0.6868395773294909, "grad_norm": 1.3828125, "learning_rate": 0.0004960612628601683, "loss": 5.228, "mean_token_accuracy": 0.19899825602769852, "num_tokens": 16394330.0, "step": 7150 }, { "entropy": 5.215233564376831, "epoch": 0.6873198847262247, "grad_norm": 1.1875, "learning_rate": 0.0004960548735561379, "loss": 5.15, "mean_token_accuracy": 0.20503795742988587, "num_tokens": 16405734.0, "step": 7155 }, { "entropy": 5.318601846694946, "epoch": 0.6878001921229587, "grad_norm": 1.234375, "learning_rate": 0.0004960484791198697, "loss": 5.1957, "mean_token_accuracy": 0.1932004436850548, "num_tokens": 16416025.0, "step": 7160 }, { "entropy": 5.244638299942016, "epoch": 0.6882804995196926, "grad_norm": 1.2109375, "learning_rate": 0.0004960420795515121, "loss": 5.1858, "mean_token_accuracy": 0.19369462579488755, "num_tokens": 16427416.0, "step": 7165 }, { "entropy": 5.237499618530274, "epoch": 0.6887608069164265, "grad_norm": 1.34375, "learning_rate": 0.0004960356748512138, "loss": 5.185, "mean_token_accuracy": 0.20407173335552214, "num_tokens": 16438073.0, "step": 7170 }, { "entropy": 5.274411201477051, "epoch": 0.6892411143131604, "grad_norm": 1.3046875, "learning_rate": 0.0004960292650191236, "loss": 5.1486, "mean_token_accuracy": 0.1983790621161461, "num_tokens": 16449994.0, "step": 7175 }, { "entropy": 5.2747314929962155, "epoch": 0.6897214217098944, "grad_norm": 1.2421875, "learning_rate": 0.0004960228500553899, "loss": 5.1435, "mean_token_accuracy": 0.21412984877824784, "num_tokens": 16460355.0, "step": 7180 }, { "entropy": 5.401017570495606, "epoch": 0.6902017291066282, "grad_norm": 1.265625, "learning_rate": 0.0004960164299601623, "loss": 5.2192, "mean_token_accuracy": 0.19781634211540222, "num_tokens": 16472274.0, "step": 7185 }, { "entropy": 5.366089820861816, "epoch": 0.6906820365033621, "grad_norm": 1.140625, "learning_rate": 0.0004960100047335892, "loss": 5.2644, "mean_token_accuracy": 0.19702471643686295, "num_tokens": 16482959.0, "step": 7190 }, { "entropy": 5.314202547073364, "epoch": 0.691162343900096, "grad_norm": 1.3125, "learning_rate": 0.0004960035743758202, "loss": 5.2786, "mean_token_accuracy": 0.1941748395562172, "num_tokens": 16494533.0, "step": 7195 }, { "entropy": 5.2420876026153564, "epoch": 0.69164265129683, "grad_norm": 1.2421875, "learning_rate": 0.0004959971388870044, "loss": 5.1539, "mean_token_accuracy": 0.20189982801675796, "num_tokens": 16506124.0, "step": 7200 }, { "entropy": 5.253983736038208, "epoch": 0.6921229586935639, "grad_norm": 1.140625, "learning_rate": 0.0004959906982672912, "loss": 5.1791, "mean_token_accuracy": 0.19119898676872255, "num_tokens": 16517867.0, "step": 7205 }, { "entropy": 5.217931318283081, "epoch": 0.6926032660902978, "grad_norm": 1.203125, "learning_rate": 0.0004959842525168302, "loss": 5.11, "mean_token_accuracy": 0.20074271708726882, "num_tokens": 16529075.0, "step": 7210 }, { "entropy": 5.294466924667359, "epoch": 0.6930835734870316, "grad_norm": 1.2578125, "learning_rate": 0.0004959778016357712, "loss": 5.1452, "mean_token_accuracy": 0.20483478754758835, "num_tokens": 16540326.0, "step": 7215 }, { "entropy": 5.280926847457886, "epoch": 0.6935638808837656, "grad_norm": 1.1484375, "learning_rate": 0.0004959713456242637, "loss": 5.2002, "mean_token_accuracy": 0.1991882160305977, "num_tokens": 16551570.0, "step": 7220 }, { "entropy": 5.373426103591919, "epoch": 0.6940441882804995, "grad_norm": 1.40625, "learning_rate": 0.0004959648844824576, "loss": 5.2523, "mean_token_accuracy": 0.19577774852514268, "num_tokens": 16562636.0, "step": 7225 }, { "entropy": 5.285389518737793, "epoch": 0.6945244956772334, "grad_norm": 1.1484375, "learning_rate": 0.0004959584182105032, "loss": 5.1307, "mean_token_accuracy": 0.2037241354584694, "num_tokens": 16573867.0, "step": 7230 }, { "entropy": 5.235301113128662, "epoch": 0.6950048030739674, "grad_norm": 1.2265625, "learning_rate": 0.0004959519468085504, "loss": 5.1263, "mean_token_accuracy": 0.20582810789346695, "num_tokens": 16584533.0, "step": 7235 }, { "entropy": 5.211501741409302, "epoch": 0.6954851104707013, "grad_norm": 1.1640625, "learning_rate": 0.0004959454702767494, "loss": 5.3437, "mean_token_accuracy": 0.20351918488740922, "num_tokens": 16596562.0, "step": 7240 }, { "entropy": 5.31222095489502, "epoch": 0.6959654178674352, "grad_norm": 1.140625, "learning_rate": 0.0004959389886152507, "loss": 5.1793, "mean_token_accuracy": 0.21050842702388764, "num_tokens": 16607508.0, "step": 7245 }, { "entropy": 5.342617702484131, "epoch": 0.696445725264169, "grad_norm": 1.09375, "learning_rate": 0.0004959325018242048, "loss": 5.2606, "mean_token_accuracy": 0.19728082418441772, "num_tokens": 16617737.0, "step": 7250 }, { "entropy": 5.327224397659302, "epoch": 0.696926032660903, "grad_norm": 1.21875, "learning_rate": 0.0004959260099037622, "loss": 5.2593, "mean_token_accuracy": 0.19428612291812897, "num_tokens": 16628518.0, "step": 7255 }, { "entropy": 5.318410634994507, "epoch": 0.6974063400576369, "grad_norm": 1.15625, "learning_rate": 0.0004959195128540737, "loss": 5.2864, "mean_token_accuracy": 0.18943870663642884, "num_tokens": 16639644.0, "step": 7260 }, { "entropy": 5.433784484863281, "epoch": 0.6978866474543708, "grad_norm": 1.2265625, "learning_rate": 0.00049591301067529, "loss": 5.3405, "mean_token_accuracy": 0.18801091760396957, "num_tokens": 16650882.0, "step": 7265 }, { "entropy": 5.336638355255127, "epoch": 0.6983669548511047, "grad_norm": 1.4140625, "learning_rate": 0.0004959065033675623, "loss": 5.2265, "mean_token_accuracy": 0.19460077136754989, "num_tokens": 16662464.0, "step": 7270 }, { "entropy": 5.355379629135132, "epoch": 0.6988472622478387, "grad_norm": 1.2265625, "learning_rate": 0.0004958999909310414, "loss": 5.2623, "mean_token_accuracy": 0.19179683029651642, "num_tokens": 16673718.0, "step": 7275 }, { "entropy": 5.290545892715454, "epoch": 0.6993275696445725, "grad_norm": 1.1171875, "learning_rate": 0.0004958934733658788, "loss": 5.1873, "mean_token_accuracy": 0.19871881008148193, "num_tokens": 16684957.0, "step": 7280 }, { "entropy": 5.306415462493897, "epoch": 0.6998078770413064, "grad_norm": 1.3046875, "learning_rate": 0.0004958869506722256, "loss": 5.2294, "mean_token_accuracy": 0.19656572341918946, "num_tokens": 16695782.0, "step": 7285 }, { "entropy": 5.280803632736206, "epoch": 0.7002881844380403, "grad_norm": 1.15625, "learning_rate": 0.0004958804228502332, "loss": 5.1363, "mean_token_accuracy": 0.20514173954725265, "num_tokens": 16707448.0, "step": 7290 }, { "entropy": 5.23841404914856, "epoch": 0.7007684918347743, "grad_norm": 1.203125, "learning_rate": 0.0004958738899000534, "loss": 5.1653, "mean_token_accuracy": 0.19584250301122666, "num_tokens": 16718074.0, "step": 7295 }, { "entropy": 5.29450945854187, "epoch": 0.7012487992315082, "grad_norm": 1.4140625, "learning_rate": 0.0004958673518218377, "loss": 5.2046, "mean_token_accuracy": 0.20462729632854462, "num_tokens": 16728656.0, "step": 7300 }, { "entropy": 5.307183456420899, "epoch": 0.7017291066282421, "grad_norm": 1.328125, "learning_rate": 0.0004958608086157379, "loss": 5.2517, "mean_token_accuracy": 0.19895842373371125, "num_tokens": 16740703.0, "step": 7305 }, { "entropy": 5.321242046356201, "epoch": 0.7022094140249759, "grad_norm": 1.125, "learning_rate": 0.000495854260281906, "loss": 5.1298, "mean_token_accuracy": 0.2080679327249527, "num_tokens": 16753320.0, "step": 7310 }, { "entropy": 5.1749231815338135, "epoch": 0.7026897214217099, "grad_norm": 1.2265625, "learning_rate": 0.0004958477068204941, "loss": 5.2007, "mean_token_accuracy": 0.20202370434999467, "num_tokens": 16764889.0, "step": 7315 }, { "entropy": 5.385407257080078, "epoch": 0.7031700288184438, "grad_norm": 1.1484375, "learning_rate": 0.000495841148231654, "loss": 5.2381, "mean_token_accuracy": 0.2000655323266983, "num_tokens": 16776301.0, "step": 7320 }, { "entropy": 5.351953983306885, "epoch": 0.7036503362151777, "grad_norm": 1.15625, "learning_rate": 0.0004958345845155383, "loss": 5.2239, "mean_token_accuracy": 0.20341352075338365, "num_tokens": 16786935.0, "step": 7325 }, { "entropy": 5.241073179244995, "epoch": 0.7041306436119116, "grad_norm": 1.2109375, "learning_rate": 0.0004958280156722992, "loss": 5.2238, "mean_token_accuracy": 0.1962451696395874, "num_tokens": 16799335.0, "step": 7330 }, { "entropy": 5.23271107673645, "epoch": 0.7046109510086456, "grad_norm": 1.125, "learning_rate": 0.0004958214417020894, "loss": 5.1471, "mean_token_accuracy": 0.20265070348978043, "num_tokens": 16811376.0, "step": 7335 }, { "entropy": 5.326413011550903, "epoch": 0.7050912584053795, "grad_norm": 1.2578125, "learning_rate": 0.0004958148626050614, "loss": 5.2133, "mean_token_accuracy": 0.19387653321027756, "num_tokens": 16822859.0, "step": 7340 }, { "entropy": 5.298766660690307, "epoch": 0.7055715658021133, "grad_norm": 1.3984375, "learning_rate": 0.000495808278381368, "loss": 5.2019, "mean_token_accuracy": 0.1966390699148178, "num_tokens": 16833831.0, "step": 7345 }, { "entropy": 5.3197509288787845, "epoch": 0.7060518731988472, "grad_norm": 1.125, "learning_rate": 0.000495801689031162, "loss": 5.234, "mean_token_accuracy": 0.2002715587615967, "num_tokens": 16846827.0, "step": 7350 }, { "entropy": 5.338577556610107, "epoch": 0.7065321805955812, "grad_norm": 1.25, "learning_rate": 0.0004957950945545965, "loss": 5.2022, "mean_token_accuracy": 0.20344888269901276, "num_tokens": 16858166.0, "step": 7355 }, { "entropy": 5.377301120758057, "epoch": 0.7070124879923151, "grad_norm": 1.1953125, "learning_rate": 0.0004957884949518246, "loss": 5.3351, "mean_token_accuracy": 0.1826968491077423, "num_tokens": 16870201.0, "step": 7360 }, { "entropy": 5.329647397994995, "epoch": 0.707492795389049, "grad_norm": 1.21875, "learning_rate": 0.0004957818902229992, "loss": 5.2073, "mean_token_accuracy": 0.19947872757911683, "num_tokens": 16880891.0, "step": 7365 }, { "entropy": 5.316723299026489, "epoch": 0.7079731027857828, "grad_norm": 1.109375, "learning_rate": 0.0004957752803682741, "loss": 5.2801, "mean_token_accuracy": 0.19825806319713593, "num_tokens": 16893498.0, "step": 7370 }, { "entropy": 5.346575832366943, "epoch": 0.7084534101825168, "grad_norm": 1.375, "learning_rate": 0.0004957686653878024, "loss": 5.2538, "mean_token_accuracy": 0.1996032789349556, "num_tokens": 16904959.0, "step": 7375 }, { "entropy": 5.391040182113647, "epoch": 0.7089337175792507, "grad_norm": 1.34375, "learning_rate": 0.000495762045281738, "loss": 5.2735, "mean_token_accuracy": 0.19459239691495894, "num_tokens": 16916547.0, "step": 7380 }, { "entropy": 5.243378686904907, "epoch": 0.7094140249759846, "grad_norm": 1.1640625, "learning_rate": 0.0004957554200502344, "loss": 5.2145, "mean_token_accuracy": 0.1971280872821808, "num_tokens": 16928580.0, "step": 7385 }, { "entropy": 5.325099229812622, "epoch": 0.7098943323727186, "grad_norm": 1.109375, "learning_rate": 0.0004957487896934454, "loss": 5.232, "mean_token_accuracy": 0.19476460963487624, "num_tokens": 16941247.0, "step": 7390 }, { "entropy": 5.422113227844238, "epoch": 0.7103746397694525, "grad_norm": 1.15625, "learning_rate": 0.000495742154211525, "loss": 5.2508, "mean_token_accuracy": 0.19776693880558013, "num_tokens": 16951119.0, "step": 7395 }, { "entropy": 5.253897190093994, "epoch": 0.7108549471661864, "grad_norm": 1.2421875, "learning_rate": 0.0004957355136046272, "loss": 5.1788, "mean_token_accuracy": 0.20065926164388656, "num_tokens": 16962608.0, "step": 7400 }, { "entropy": 5.298086404800415, "epoch": 0.7113352545629202, "grad_norm": 1.3046875, "learning_rate": 0.0004957288678729064, "loss": 5.2149, "mean_token_accuracy": 0.19710940271615982, "num_tokens": 16973291.0, "step": 7405 }, { "entropy": 5.391532135009766, "epoch": 0.7118155619596542, "grad_norm": 1.28125, "learning_rate": 0.0004957222170165166, "loss": 5.2282, "mean_token_accuracy": 0.1964917078614235, "num_tokens": 16984895.0, "step": 7410 }, { "entropy": 5.302619600296021, "epoch": 0.7122958693563881, "grad_norm": 1.15625, "learning_rate": 0.0004957155610356124, "loss": 5.2556, "mean_token_accuracy": 0.19016777127981185, "num_tokens": 16997109.0, "step": 7415 }, { "entropy": 5.284333562850952, "epoch": 0.712776176753122, "grad_norm": 1.09375, "learning_rate": 0.0004957088999303481, "loss": 5.1909, "mean_token_accuracy": 0.19305464029312133, "num_tokens": 17008640.0, "step": 7420 }, { "entropy": 5.3054125785827635, "epoch": 0.7132564841498559, "grad_norm": 1.203125, "learning_rate": 0.0004957022337008787, "loss": 5.1643, "mean_token_accuracy": 0.2041031762957573, "num_tokens": 17018924.0, "step": 7425 }, { "entropy": 5.2749724864959715, "epoch": 0.7137367915465899, "grad_norm": 1.328125, "learning_rate": 0.0004956955623473587, "loss": 5.2059, "mean_token_accuracy": 0.19430594891309738, "num_tokens": 17029932.0, "step": 7430 }, { "entropy": 5.317395496368408, "epoch": 0.7142170989433237, "grad_norm": 1.296875, "learning_rate": 0.000495688885869943, "loss": 5.2942, "mean_token_accuracy": 0.19469617754220964, "num_tokens": 17042164.0, "step": 7435 }, { "entropy": 5.349671697616577, "epoch": 0.7146974063400576, "grad_norm": 1.2265625, "learning_rate": 0.0004956822042687868, "loss": 5.2677, "mean_token_accuracy": 0.1921522706747055, "num_tokens": 17054729.0, "step": 7440 }, { "entropy": 5.250608634948731, "epoch": 0.7151777137367915, "grad_norm": 1.1640625, "learning_rate": 0.0004956755175440451, "loss": 5.1269, "mean_token_accuracy": 0.21212284564971923, "num_tokens": 17066537.0, "step": 7445 }, { "entropy": 5.282503080368042, "epoch": 0.7156580211335255, "grad_norm": 1.1875, "learning_rate": 0.0004956688256958731, "loss": 5.1974, "mean_token_accuracy": 0.200366573035717, "num_tokens": 17077787.0, "step": 7450 }, { "entropy": 5.301390218734741, "epoch": 0.7161383285302594, "grad_norm": 1.171875, "learning_rate": 0.0004956621287244262, "loss": 5.2218, "mean_token_accuracy": 0.1973409503698349, "num_tokens": 17089555.0, "step": 7455 }, { "entropy": 5.344415140151978, "epoch": 0.7166186359269933, "grad_norm": 1.2109375, "learning_rate": 0.0004956554266298599, "loss": 5.1796, "mean_token_accuracy": 0.2040402039885521, "num_tokens": 17099911.0, "step": 7460 }, { "entropy": 5.202849531173706, "epoch": 0.7170989433237271, "grad_norm": 1.2265625, "learning_rate": 0.0004956487194123298, "loss": 5.1746, "mean_token_accuracy": 0.2012821167707443, "num_tokens": 17110880.0, "step": 7465 }, { "entropy": 5.285581064224243, "epoch": 0.7175792507204611, "grad_norm": 1.2109375, "learning_rate": 0.0004956420070719918, "loss": 5.1395, "mean_token_accuracy": 0.20026769638061523, "num_tokens": 17122272.0, "step": 7470 }, { "entropy": 5.362657308578491, "epoch": 0.718059558117195, "grad_norm": 1.1484375, "learning_rate": 0.0004956352896090014, "loss": 5.274, "mean_token_accuracy": 0.19526351988315582, "num_tokens": 17133995.0, "step": 7475 }, { "entropy": 5.318215322494507, "epoch": 0.7185398655139289, "grad_norm": 1.21875, "learning_rate": 0.0004956285670235147, "loss": 5.1494, "mean_token_accuracy": 0.20162675380706788, "num_tokens": 17145970.0, "step": 7480 }, { "entropy": 5.310476064682007, "epoch": 0.7190201729106628, "grad_norm": 1.328125, "learning_rate": 0.0004956218393156879, "loss": 5.3381, "mean_token_accuracy": 0.1912968397140503, "num_tokens": 17157747.0, "step": 7485 }, { "entropy": 5.311709976196289, "epoch": 0.7195004803073968, "grad_norm": 1.1875, "learning_rate": 0.0004956151064856772, "loss": 5.2701, "mean_token_accuracy": 0.20224885493516923, "num_tokens": 17168357.0, "step": 7490 }, { "entropy": 5.349070930480957, "epoch": 0.7199807877041307, "grad_norm": 1.1796875, "learning_rate": 0.0004956083685336386, "loss": 5.2336, "mean_token_accuracy": 0.20705265551805496, "num_tokens": 17179871.0, "step": 7495 }, { "entropy": 5.332374906539917, "epoch": 0.7204610951008645, "grad_norm": 1.4140625, "learning_rate": 0.0004956016254597289, "loss": 5.1764, "mean_token_accuracy": 0.20444130003452302, "num_tokens": 17190456.0, "step": 7500 }, { "entropy": 5.290010213851929, "epoch": 0.7209414024975984, "grad_norm": 1.390625, "learning_rate": 0.0004955948772641044, "loss": 5.2627, "mean_token_accuracy": 0.19260406792163848, "num_tokens": 17201623.0, "step": 7505 }, { "entropy": 5.2717503070831295, "epoch": 0.7214217098943324, "grad_norm": 1.1953125, "learning_rate": 0.000495588123946922, "loss": 5.1584, "mean_token_accuracy": 0.2030091643333435, "num_tokens": 17212891.0, "step": 7510 }, { "entropy": 5.332910203933716, "epoch": 0.7219020172910663, "grad_norm": 1.3046875, "learning_rate": 0.0004955813655083384, "loss": 5.2124, "mean_token_accuracy": 0.19683704823255538, "num_tokens": 17223983.0, "step": 7515 }, { "entropy": 5.344393396377564, "epoch": 0.7223823246878002, "grad_norm": 1.25, "learning_rate": 0.0004955746019485104, "loss": 5.2591, "mean_token_accuracy": 0.20013708472251893, "num_tokens": 17233701.0, "step": 7520 }, { "entropy": 5.320908880233764, "epoch": 0.722862632084534, "grad_norm": 1.09375, "learning_rate": 0.000495567833267595, "loss": 5.2505, "mean_token_accuracy": 0.19587977081537247, "num_tokens": 17245958.0, "step": 7525 }, { "entropy": 5.409331274032593, "epoch": 0.723342939481268, "grad_norm": 1.109375, "learning_rate": 0.0004955610594657496, "loss": 5.2579, "mean_token_accuracy": 0.18650022745132447, "num_tokens": 17257687.0, "step": 7530 }, { "entropy": 5.327173328399658, "epoch": 0.7238232468780019, "grad_norm": 1.0625, "learning_rate": 0.0004955542805431313, "loss": 5.287, "mean_token_accuracy": 0.19946179389953614, "num_tokens": 17270247.0, "step": 7535 }, { "entropy": 5.272875452041626, "epoch": 0.7243035542747358, "grad_norm": 1.109375, "learning_rate": 0.0004955474964998976, "loss": 5.2117, "mean_token_accuracy": 0.20549824982881545, "num_tokens": 17281920.0, "step": 7540 }, { "entropy": 5.4193642139434814, "epoch": 0.7247838616714697, "grad_norm": 1.203125, "learning_rate": 0.0004955407073362058, "loss": 5.3923, "mean_token_accuracy": 0.1859695628285408, "num_tokens": 17293602.0, "step": 7545 }, { "entropy": 5.421948957443237, "epoch": 0.7252641690682037, "grad_norm": 1.15625, "learning_rate": 0.0004955339130522136, "loss": 5.1722, "mean_token_accuracy": 0.19999373257160186, "num_tokens": 17304484.0, "step": 7550 }, { "entropy": 5.196074771881103, "epoch": 0.7257444764649376, "grad_norm": 1.1484375, "learning_rate": 0.000495527113648079, "loss": 5.1102, "mean_token_accuracy": 0.2052151992917061, "num_tokens": 17314953.0, "step": 7555 }, { "entropy": 5.263898038864136, "epoch": 0.7262247838616714, "grad_norm": 1.2890625, "learning_rate": 0.0004955203091239596, "loss": 5.2313, "mean_token_accuracy": 0.19495706707239152, "num_tokens": 17326813.0, "step": 7560 }, { "entropy": 5.3781215190887455, "epoch": 0.7267050912584054, "grad_norm": 1.3125, "learning_rate": 0.0004955134994800134, "loss": 5.3161, "mean_token_accuracy": 0.1874473437666893, "num_tokens": 17337968.0, "step": 7565 }, { "entropy": 5.309977293014526, "epoch": 0.7271853986551393, "grad_norm": 1.171875, "learning_rate": 0.0004955066847163984, "loss": 5.2156, "mean_token_accuracy": 0.1947050377726555, "num_tokens": 17350406.0, "step": 7570 }, { "entropy": 5.217679643630982, "epoch": 0.7276657060518732, "grad_norm": 1.453125, "learning_rate": 0.0004954998648332731, "loss": 5.1128, "mean_token_accuracy": 0.20684792548418046, "num_tokens": 17361888.0, "step": 7575 }, { "entropy": 5.307045364379883, "epoch": 0.7281460134486071, "grad_norm": 1.3828125, "learning_rate": 0.0004954930398307956, "loss": 5.2392, "mean_token_accuracy": 0.196373288333416, "num_tokens": 17374047.0, "step": 7580 }, { "entropy": 5.35437798500061, "epoch": 0.7286263208453411, "grad_norm": 1.2109375, "learning_rate": 0.0004954862097091245, "loss": 5.2254, "mean_token_accuracy": 0.18948904722929, "num_tokens": 17386175.0, "step": 7585 }, { "entropy": 5.318008661270142, "epoch": 0.729106628242075, "grad_norm": 1.2578125, "learning_rate": 0.0004954793744684184, "loss": 5.2641, "mean_token_accuracy": 0.20655235201120375, "num_tokens": 17398168.0, "step": 7590 }, { "entropy": 5.230118227005005, "epoch": 0.7295869356388088, "grad_norm": 1.1484375, "learning_rate": 0.0004954725341088358, "loss": 5.1178, "mean_token_accuracy": 0.2040240153670311, "num_tokens": 17408825.0, "step": 7595 }, { "entropy": 5.2700879096984865, "epoch": 0.7300672430355427, "grad_norm": 1.28125, "learning_rate": 0.0004954656886305356, "loss": 5.2158, "mean_token_accuracy": 0.1991657391190529, "num_tokens": 17419813.0, "step": 7600 }, { "entropy": 5.354073095321655, "epoch": 0.7305475504322767, "grad_norm": 1.28125, "learning_rate": 0.0004954588380336768, "loss": 5.2669, "mean_token_accuracy": 0.1913167342543602, "num_tokens": 17431134.0, "step": 7605 }, { "entropy": 5.251888942718506, "epoch": 0.7310278578290106, "grad_norm": 1.3515625, "learning_rate": 0.0004954519823184184, "loss": 5.1767, "mean_token_accuracy": 0.200624917447567, "num_tokens": 17442614.0, "step": 7610 }, { "entropy": 5.315543699264526, "epoch": 0.7315081652257445, "grad_norm": 1.1875, "learning_rate": 0.0004954451214849196, "loss": 5.1714, "mean_token_accuracy": 0.19947559833526612, "num_tokens": 17454615.0, "step": 7615 }, { "entropy": 5.376033115386963, "epoch": 0.7319884726224783, "grad_norm": 1.1171875, "learning_rate": 0.0004954382555333397, "loss": 5.2185, "mean_token_accuracy": 0.1976114273071289, "num_tokens": 17467025.0, "step": 7620 }, { "entropy": 5.226950979232788, "epoch": 0.7324687800192123, "grad_norm": 1.1796875, "learning_rate": 0.000495431384463838, "loss": 5.1785, "mean_token_accuracy": 0.20502081364393235, "num_tokens": 17477681.0, "step": 7625 }, { "entropy": 5.305552339553833, "epoch": 0.7329490874159462, "grad_norm": 1.0546875, "learning_rate": 0.0004954245082765741, "loss": 5.2359, "mean_token_accuracy": 0.1983788013458252, "num_tokens": 17489814.0, "step": 7630 }, { "entropy": 5.342312955856324, "epoch": 0.7334293948126801, "grad_norm": 0.99609375, "learning_rate": 0.0004954176269717077, "loss": 5.231, "mean_token_accuracy": 0.19795534610748292, "num_tokens": 17501701.0, "step": 7635 }, { "entropy": 5.281050443649292, "epoch": 0.733909702209414, "grad_norm": 1.1328125, "learning_rate": 0.0004954107405493984, "loss": 5.1416, "mean_token_accuracy": 0.203464774787426, "num_tokens": 17513585.0, "step": 7640 }, { "entropy": 5.367125082015991, "epoch": 0.734390009606148, "grad_norm": 1.25, "learning_rate": 0.0004954038490098064, "loss": 5.2764, "mean_token_accuracy": 0.20417422205209732, "num_tokens": 17525557.0, "step": 7645 }, { "entropy": 5.269024848937988, "epoch": 0.7348703170028819, "grad_norm": 1.0703125, "learning_rate": 0.0004953969523530914, "loss": 5.1457, "mean_token_accuracy": 0.195808245241642, "num_tokens": 17538312.0, "step": 7650 }, { "entropy": 5.354407835006714, "epoch": 0.7353506243996157, "grad_norm": 1.0625, "learning_rate": 0.0004953900505794136, "loss": 5.2732, "mean_token_accuracy": 0.1993303641676903, "num_tokens": 17550248.0, "step": 7655 }, { "entropy": 5.282435178756714, "epoch": 0.7358309317963496, "grad_norm": 1.1875, "learning_rate": 0.0004953831436889332, "loss": 5.1346, "mean_token_accuracy": 0.19791701585054397, "num_tokens": 17560624.0, "step": 7660 }, { "entropy": 5.303052854537964, "epoch": 0.7363112391930836, "grad_norm": 1.1875, "learning_rate": 0.0004953762316818106, "loss": 5.1963, "mean_token_accuracy": 0.19871004968881606, "num_tokens": 17572439.0, "step": 7665 }, { "entropy": 5.351328325271607, "epoch": 0.7367915465898175, "grad_norm": 1.125, "learning_rate": 0.0004953693145582064, "loss": 5.2461, "mean_token_accuracy": 0.18978616893291472, "num_tokens": 17583120.0, "step": 7670 }, { "entropy": 5.191801738739014, "epoch": 0.7372718539865514, "grad_norm": 1.1328125, "learning_rate": 0.000495362392318281, "loss": 5.0753, "mean_token_accuracy": 0.2021123856306076, "num_tokens": 17593677.0, "step": 7675 }, { "entropy": 5.268652153015137, "epoch": 0.7377521613832853, "grad_norm": 1.015625, "learning_rate": 0.0004953554649621951, "loss": 5.2193, "mean_token_accuracy": 0.19628288149833678, "num_tokens": 17605180.0, "step": 7680 }, { "entropy": 5.3253484725952145, "epoch": 0.7382324687800192, "grad_norm": 1.0546875, "learning_rate": 0.0004953485324901098, "loss": 5.1844, "mean_token_accuracy": 0.20035452097654344, "num_tokens": 17617459.0, "step": 7685 }, { "entropy": 5.265099334716797, "epoch": 0.7387127761767531, "grad_norm": 1.1953125, "learning_rate": 0.0004953415949021857, "loss": 5.185, "mean_token_accuracy": 0.19952280819416046, "num_tokens": 17628024.0, "step": 7690 }, { "entropy": 5.240673971176148, "epoch": 0.739193083573487, "grad_norm": 1.109375, "learning_rate": 0.0004953346521985843, "loss": 5.2044, "mean_token_accuracy": 0.19829180389642714, "num_tokens": 17639833.0, "step": 7695 }, { "entropy": 5.402639627456665, "epoch": 0.7396733909702209, "grad_norm": 1.140625, "learning_rate": 0.0004953277043794663, "loss": 5.2946, "mean_token_accuracy": 0.18773587048053741, "num_tokens": 17651057.0, "step": 7700 }, { "entropy": 5.30786247253418, "epoch": 0.7401536983669549, "grad_norm": 1.234375, "learning_rate": 0.0004953207514449933, "loss": 5.1709, "mean_token_accuracy": 0.20614788979291915, "num_tokens": 17662288.0, "step": 7705 }, { "entropy": 5.260968112945557, "epoch": 0.7406340057636888, "grad_norm": 1.078125, "learning_rate": 0.0004953137933953267, "loss": 5.1842, "mean_token_accuracy": 0.1984902873635292, "num_tokens": 17673885.0, "step": 7710 }, { "entropy": 5.276099634170532, "epoch": 0.7411143131604226, "grad_norm": 1.1015625, "learning_rate": 0.000495306830230628, "loss": 5.2038, "mean_token_accuracy": 0.20187791883945466, "num_tokens": 17685030.0, "step": 7715 }, { "entropy": 5.298851442337036, "epoch": 0.7415946205571565, "grad_norm": 1.1484375, "learning_rate": 0.0004952998619510589, "loss": 5.1554, "mean_token_accuracy": 0.20665809214115144, "num_tokens": 17696624.0, "step": 7720 }, { "entropy": 5.264416313171386, "epoch": 0.7420749279538905, "grad_norm": 1.203125, "learning_rate": 0.0004952928885567811, "loss": 5.1403, "mean_token_accuracy": 0.20183341205120087, "num_tokens": 17707386.0, "step": 7725 }, { "entropy": 5.397011804580688, "epoch": 0.7425552353506244, "grad_norm": 1.2890625, "learning_rate": 0.0004952859100479566, "loss": 5.3804, "mean_token_accuracy": 0.18951933234930038, "num_tokens": 17718605.0, "step": 7730 }, { "entropy": 5.272250413894653, "epoch": 0.7430355427473583, "grad_norm": 1.109375, "learning_rate": 0.0004952789264247474, "loss": 5.1757, "mean_token_accuracy": 0.1989718645811081, "num_tokens": 17730275.0, "step": 7735 }, { "entropy": 5.323815870285034, "epoch": 0.7435158501440923, "grad_norm": 1.2421875, "learning_rate": 0.0004952719376873156, "loss": 5.2921, "mean_token_accuracy": 0.19190367460250854, "num_tokens": 17741390.0, "step": 7740 }, { "entropy": 5.308934831619263, "epoch": 0.7439961575408262, "grad_norm": 1.2109375, "learning_rate": 0.0004952649438358234, "loss": 5.1731, "mean_token_accuracy": 0.20842925161123277, "num_tokens": 17752354.0, "step": 7745 }, { "entropy": 5.308169984817505, "epoch": 0.74447646493756, "grad_norm": 1.1875, "learning_rate": 0.0004952579448704334, "loss": 5.1631, "mean_token_accuracy": 0.20139929056167602, "num_tokens": 17762839.0, "step": 7750 }, { "entropy": 5.350342273712158, "epoch": 0.7449567723342939, "grad_norm": 1.1171875, "learning_rate": 0.000495250940791308, "loss": 5.2605, "mean_token_accuracy": 0.19137903451919555, "num_tokens": 17775800.0, "step": 7755 }, { "entropy": 5.306150579452515, "epoch": 0.7454370797310279, "grad_norm": 1.1875, "learning_rate": 0.0004952439315986096, "loss": 5.1811, "mean_token_accuracy": 0.19805798083543777, "num_tokens": 17787804.0, "step": 7760 }, { "entropy": 5.341886854171753, "epoch": 0.7459173871277618, "grad_norm": 1.234375, "learning_rate": 0.0004952369172925012, "loss": 5.2853, "mean_token_accuracy": 0.1993953987956047, "num_tokens": 17800291.0, "step": 7765 }, { "entropy": 5.292854881286621, "epoch": 0.7463976945244957, "grad_norm": 1.1796875, "learning_rate": 0.0004952298978731454, "loss": 5.147, "mean_token_accuracy": 0.20547049790620803, "num_tokens": 17811548.0, "step": 7770 }, { "entropy": 5.28916335105896, "epoch": 0.7468780019212296, "grad_norm": 1.140625, "learning_rate": 0.0004952228733407055, "loss": 5.1011, "mean_token_accuracy": 0.20431289821863174, "num_tokens": 17822589.0, "step": 7775 }, { "entropy": 5.175790548324585, "epoch": 0.7473583093179635, "grad_norm": 1.0859375, "learning_rate": 0.0004952158436953444, "loss": 5.1236, "mean_token_accuracy": 0.20223413705825805, "num_tokens": 17834203.0, "step": 7780 }, { "entropy": 5.22423152923584, "epoch": 0.7478386167146974, "grad_norm": 1.2578125, "learning_rate": 0.0004952088089372252, "loss": 5.2105, "mean_token_accuracy": 0.19397516399621964, "num_tokens": 17846238.0, "step": 7785 }, { "entropy": 5.331250286102295, "epoch": 0.7483189241114313, "grad_norm": 1.203125, "learning_rate": 0.0004952017690665114, "loss": 5.1324, "mean_token_accuracy": 0.2026346132159233, "num_tokens": 17857640.0, "step": 7790 }, { "entropy": 5.280352592468262, "epoch": 0.7487992315081652, "grad_norm": 1.1484375, "learning_rate": 0.0004951947240833664, "loss": 5.1374, "mean_token_accuracy": 0.20421989113092423, "num_tokens": 17868755.0, "step": 7795 }, { "entropy": 5.23347110748291, "epoch": 0.7492795389048992, "grad_norm": 1.171875, "learning_rate": 0.0004951876739879537, "loss": 5.2158, "mean_token_accuracy": 0.1939207211136818, "num_tokens": 17881078.0, "step": 7800 }, { "entropy": 5.29048261642456, "epoch": 0.7497598463016331, "grad_norm": 1.1328125, "learning_rate": 0.0004951806187804371, "loss": 5.1629, "mean_token_accuracy": 0.19929923862218857, "num_tokens": 17893888.0, "step": 7805 }, { "entropy": 5.296859693527222, "epoch": 0.7502401536983669, "grad_norm": 1.1015625, "learning_rate": 0.0004951735584609804, "loss": 5.2196, "mean_token_accuracy": 0.19920673221349716, "num_tokens": 17904443.0, "step": 7810 }, { "entropy": 5.286273050308227, "epoch": 0.7507204610951008, "grad_norm": 1.21875, "learning_rate": 0.0004951664930297474, "loss": 5.217, "mean_token_accuracy": 0.20082310885190963, "num_tokens": 17918090.0, "step": 7815 }, { "entropy": 5.227938318252564, "epoch": 0.7512007684918348, "grad_norm": 1.1875, "learning_rate": 0.000495159422486902, "loss": 5.161, "mean_token_accuracy": 0.20609356909990312, "num_tokens": 17929233.0, "step": 7820 }, { "entropy": 5.242263078689575, "epoch": 0.7516810758885687, "grad_norm": 1.34375, "learning_rate": 0.0004951523468326088, "loss": 5.1965, "mean_token_accuracy": 0.19512112885713578, "num_tokens": 17940580.0, "step": 7825 }, { "entropy": 5.342494058609009, "epoch": 0.7521613832853026, "grad_norm": 1.28125, "learning_rate": 0.0004951452660670317, "loss": 5.278, "mean_token_accuracy": 0.18720510900020598, "num_tokens": 17953993.0, "step": 7830 }, { "entropy": 5.285890769958496, "epoch": 0.7526416906820365, "grad_norm": 1.125, "learning_rate": 0.0004951381801903352, "loss": 5.11, "mean_token_accuracy": 0.20024892687797546, "num_tokens": 17966033.0, "step": 7835 }, { "entropy": 5.114803695678711, "epoch": 0.7531219980787704, "grad_norm": 1.21875, "learning_rate": 0.0004951310892026839, "loss": 5.0968, "mean_token_accuracy": 0.2095889687538147, "num_tokens": 17977943.0, "step": 7840 }, { "entropy": 5.293044853210449, "epoch": 0.7536023054755043, "grad_norm": 1.1484375, "learning_rate": 0.0004951239931042424, "loss": 5.1698, "mean_token_accuracy": 0.20413365513086318, "num_tokens": 17990135.0, "step": 7845 }, { "entropy": 5.279680919647217, "epoch": 0.7540826128722382, "grad_norm": 1.171875, "learning_rate": 0.0004951168918951753, "loss": 5.2056, "mean_token_accuracy": 0.20387261509895324, "num_tokens": 18002126.0, "step": 7850 }, { "entropy": 5.190751075744629, "epoch": 0.7545629202689721, "grad_norm": 1.1640625, "learning_rate": 0.0004951097855756476, "loss": 5.0763, "mean_token_accuracy": 0.20258077830076218, "num_tokens": 18013147.0, "step": 7855 }, { "entropy": 5.328785943984985, "epoch": 0.7550432276657061, "grad_norm": 1.2109375, "learning_rate": 0.0004951026741458243, "loss": 5.1906, "mean_token_accuracy": 0.19995464086532594, "num_tokens": 18025146.0, "step": 7860 }, { "entropy": 5.261568832397461, "epoch": 0.75552353506244, "grad_norm": 1.171875, "learning_rate": 0.0004950955576058705, "loss": 5.1556, "mean_token_accuracy": 0.19386046081781388, "num_tokens": 18036412.0, "step": 7865 }, { "entropy": 5.259433937072754, "epoch": 0.7560038424591738, "grad_norm": 1.3515625, "learning_rate": 0.0004950884359559513, "loss": 5.2224, "mean_token_accuracy": 0.20132519155740738, "num_tokens": 18048041.0, "step": 7870 }, { "entropy": 5.339427757263183, "epoch": 0.7564841498559077, "grad_norm": 1.2265625, "learning_rate": 0.0004950813091962324, "loss": 5.277, "mean_token_accuracy": 0.19186609387397766, "num_tokens": 18060163.0, "step": 7875 }, { "entropy": 5.334090280532837, "epoch": 0.7569644572526417, "grad_norm": 1.1171875, "learning_rate": 0.0004950741773268788, "loss": 5.1936, "mean_token_accuracy": 0.1961333230137825, "num_tokens": 18071628.0, "step": 7880 }, { "entropy": 5.1901530742645265, "epoch": 0.7574447646493756, "grad_norm": 1.15625, "learning_rate": 0.0004950670403480562, "loss": 5.0997, "mean_token_accuracy": 0.20009191036224366, "num_tokens": 18082979.0, "step": 7885 }, { "entropy": 5.252532863616944, "epoch": 0.7579250720461095, "grad_norm": 1.296875, "learning_rate": 0.0004950598982599306, "loss": 5.1792, "mean_token_accuracy": 0.20101021528244017, "num_tokens": 18093889.0, "step": 7890 }, { "entropy": 5.262256002426147, "epoch": 0.7584053794428435, "grad_norm": 1.0703125, "learning_rate": 0.0004950527510626675, "loss": 5.2165, "mean_token_accuracy": 0.19798852652311325, "num_tokens": 18105559.0, "step": 7895 }, { "entropy": 5.3081278800964355, "epoch": 0.7588856868395774, "grad_norm": 1.1875, "learning_rate": 0.0004950455987564329, "loss": 5.2965, "mean_token_accuracy": 0.19645372927188873, "num_tokens": 18116316.0, "step": 7900 }, { "entropy": 5.334239339828491, "epoch": 0.7593659942363112, "grad_norm": 1.1796875, "learning_rate": 0.0004950384413413931, "loss": 5.1851, "mean_token_accuracy": 0.19774624109268188, "num_tokens": 18126851.0, "step": 7905 }, { "entropy": 5.322545146942138, "epoch": 0.7598463016330451, "grad_norm": 1.2578125, "learning_rate": 0.0004950312788177139, "loss": 5.277, "mean_token_accuracy": 0.19229816943407058, "num_tokens": 18139571.0, "step": 7910 }, { "entropy": 5.347561597824097, "epoch": 0.7603266090297791, "grad_norm": 1.125, "learning_rate": 0.0004950241111855618, "loss": 5.2449, "mean_token_accuracy": 0.19438967555761338, "num_tokens": 18150680.0, "step": 7915 }, { "entropy": 5.285850143432617, "epoch": 0.760806916426513, "grad_norm": 1.140625, "learning_rate": 0.0004950169384451031, "loss": 5.1977, "mean_token_accuracy": 0.20325633138418198, "num_tokens": 18161911.0, "step": 7920 }, { "entropy": 5.3055215835571286, "epoch": 0.7612872238232469, "grad_norm": 1.3515625, "learning_rate": 0.0004950097605965045, "loss": 5.1865, "mean_token_accuracy": 0.20584756135940552, "num_tokens": 18172714.0, "step": 7925 }, { "entropy": 5.3049579620361325, "epoch": 0.7617675312199808, "grad_norm": 1.2890625, "learning_rate": 0.0004950025776399326, "loss": 5.1777, "mean_token_accuracy": 0.20212606489658355, "num_tokens": 18184250.0, "step": 7930 }, { "entropy": 5.343746089935303, "epoch": 0.7622478386167147, "grad_norm": 1.1484375, "learning_rate": 0.000494995389575554, "loss": 5.298, "mean_token_accuracy": 0.19154924601316453, "num_tokens": 18195299.0, "step": 7935 }, { "entropy": 5.377411794662476, "epoch": 0.7627281460134486, "grad_norm": 1.1875, "learning_rate": 0.0004949881964035357, "loss": 5.3633, "mean_token_accuracy": 0.1878654807806015, "num_tokens": 18206863.0, "step": 7940 }, { "entropy": 5.3593430519104, "epoch": 0.7632084534101825, "grad_norm": 1.3828125, "learning_rate": 0.0004949809981240448, "loss": 5.2789, "mean_token_accuracy": 0.19559144079685212, "num_tokens": 18219643.0, "step": 7945 }, { "entropy": 5.277910900115967, "epoch": 0.7636887608069164, "grad_norm": 1.0703125, "learning_rate": 0.0004949737947372483, "loss": 5.1325, "mean_token_accuracy": 0.20357694774866103, "num_tokens": 18230461.0, "step": 7950 }, { "entropy": 5.320190668106079, "epoch": 0.7641690682036504, "grad_norm": 2.125, "learning_rate": 0.0004949665862433134, "loss": 5.2868, "mean_token_accuracy": 0.19289156794548035, "num_tokens": 18243768.0, "step": 7955 }, { "entropy": 5.296690511703491, "epoch": 0.7646493756003843, "grad_norm": 1.3984375, "learning_rate": 0.0004949593726424077, "loss": 5.1664, "mean_token_accuracy": 0.2003849595785141, "num_tokens": 18255322.0, "step": 7960 }, { "entropy": 5.291482019424438, "epoch": 0.7651296829971181, "grad_norm": 1.28125, "learning_rate": 0.0004949521539346985, "loss": 5.2128, "mean_token_accuracy": 0.20742505341768264, "num_tokens": 18267262.0, "step": 7965 }, { "entropy": 5.294230127334595, "epoch": 0.765609990393852, "grad_norm": 1.2109375, "learning_rate": 0.0004949449301203533, "loss": 5.1096, "mean_token_accuracy": 0.20100534409284593, "num_tokens": 18277332.0, "step": 7970 }, { "entropy": 5.269995260238647, "epoch": 0.766090297790586, "grad_norm": 1.0625, "learning_rate": 0.0004949377011995399, "loss": 5.2462, "mean_token_accuracy": 0.20773502439260483, "num_tokens": 18289547.0, "step": 7975 }, { "entropy": 5.273986148834228, "epoch": 0.7665706051873199, "grad_norm": 1.203125, "learning_rate": 0.0004949304671724263, "loss": 5.1432, "mean_token_accuracy": 0.20398979485034943, "num_tokens": 18302097.0, "step": 7980 }, { "entropy": 5.271750497817993, "epoch": 0.7670509125840538, "grad_norm": 1.171875, "learning_rate": 0.0004949232280391802, "loss": 5.2367, "mean_token_accuracy": 0.19723534286022187, "num_tokens": 18314151.0, "step": 7985 }, { "entropy": 5.288878488540649, "epoch": 0.7675312199807877, "grad_norm": 1.25, "learning_rate": 0.0004949159837999698, "loss": 5.17, "mean_token_accuracy": 0.19713514000177385, "num_tokens": 18326085.0, "step": 7990 }, { "entropy": 5.297585439682007, "epoch": 0.7680115273775217, "grad_norm": 1.15625, "learning_rate": 0.0004949087344549633, "loss": 5.1623, "mean_token_accuracy": 0.19341499507427215, "num_tokens": 18338158.0, "step": 7995 }, { "entropy": 5.30235710144043, "epoch": 0.7684918347742555, "grad_norm": 1.1484375, "learning_rate": 0.000494901480004329, "loss": 5.1809, "mean_token_accuracy": 0.20551337599754332, "num_tokens": 18349418.0, "step": 8000 }, { "entropy": 5.301124715805054, "epoch": 0.7689721421709894, "grad_norm": 1.25, "learning_rate": 0.0004948942204482351, "loss": 5.1761, "mean_token_accuracy": 0.20214684456586837, "num_tokens": 18361964.0, "step": 8005 }, { "entropy": 5.219300603866577, "epoch": 0.7694524495677233, "grad_norm": 1.203125, "learning_rate": 0.0004948869557868506, "loss": 5.0979, "mean_token_accuracy": 0.21326844096183778, "num_tokens": 18373108.0, "step": 8010 }, { "entropy": 5.1787127494812015, "epoch": 0.7699327569644573, "grad_norm": 1.1953125, "learning_rate": 0.0004948796860203439, "loss": 5.123, "mean_token_accuracy": 0.20177519619464873, "num_tokens": 18385310.0, "step": 8015 }, { "entropy": 5.337287092208863, "epoch": 0.7704130643611912, "grad_norm": 1.1171875, "learning_rate": 0.0004948724111488838, "loss": 5.2967, "mean_token_accuracy": 0.19337289929389953, "num_tokens": 18396132.0, "step": 8020 }, { "entropy": 5.4783307075500485, "epoch": 0.770893371757925, "grad_norm": 1.09375, "learning_rate": 0.0004948651311726391, "loss": 5.4042, "mean_token_accuracy": 0.18754971623420716, "num_tokens": 18409930.0, "step": 8025 }, { "entropy": 5.384222173690796, "epoch": 0.7713736791546589, "grad_norm": 1.21875, "learning_rate": 0.0004948578460917789, "loss": 5.2773, "mean_token_accuracy": 0.19422808140516282, "num_tokens": 18421204.0, "step": 8030 }, { "entropy": 5.340588712692261, "epoch": 0.7718539865513929, "grad_norm": 1.109375, "learning_rate": 0.0004948505559064725, "loss": 5.1639, "mean_token_accuracy": 0.20545457750558854, "num_tokens": 18433194.0, "step": 8035 }, { "entropy": 5.259980058670044, "epoch": 0.7723342939481268, "grad_norm": 1.296875, "learning_rate": 0.0004948432606168889, "loss": 5.1246, "mean_token_accuracy": 0.20445887744426727, "num_tokens": 18445282.0, "step": 8040 }, { "entropy": 5.232318782806397, "epoch": 0.7728146013448607, "grad_norm": 1.25, "learning_rate": 0.0004948359602231976, "loss": 5.1841, "mean_token_accuracy": 0.20268695801496506, "num_tokens": 18456264.0, "step": 8045 }, { "entropy": 5.2557531833648685, "epoch": 0.7732949087415946, "grad_norm": 1.15625, "learning_rate": 0.0004948286547255681, "loss": 5.12, "mean_token_accuracy": 0.20428049117326735, "num_tokens": 18467573.0, "step": 8050 }, { "entropy": 5.278136920928955, "epoch": 0.7737752161383286, "grad_norm": 1.2109375, "learning_rate": 0.00049482134412417, "loss": 5.0755, "mean_token_accuracy": 0.2096497043967247, "num_tokens": 18478333.0, "step": 8055 }, { "entropy": 5.19740858078003, "epoch": 0.7742555235350624, "grad_norm": 1.3828125, "learning_rate": 0.000494814028419173, "loss": 5.1055, "mean_token_accuracy": 0.20423219799995423, "num_tokens": 18489106.0, "step": 8060 }, { "entropy": 5.313218450546264, "epoch": 0.7747358309317963, "grad_norm": 1.1875, "learning_rate": 0.000494806707610747, "loss": 5.2388, "mean_token_accuracy": 0.1978022873401642, "num_tokens": 18500090.0, "step": 8065 }, { "entropy": 5.315935182571411, "epoch": 0.7752161383285303, "grad_norm": 1.2578125, "learning_rate": 0.000494799381699062, "loss": 5.1933, "mean_token_accuracy": 0.20687556862831116, "num_tokens": 18510305.0, "step": 8070 }, { "entropy": 5.306492900848388, "epoch": 0.7756964457252642, "grad_norm": 1.2734375, "learning_rate": 0.0004947920506842879, "loss": 5.1982, "mean_token_accuracy": 0.1990632399916649, "num_tokens": 18523048.0, "step": 8075 }, { "entropy": 5.279481077194214, "epoch": 0.7761767531219981, "grad_norm": 1.171875, "learning_rate": 0.0004947847145665951, "loss": 5.1368, "mean_token_accuracy": 0.2043842852115631, "num_tokens": 18534145.0, "step": 8080 }, { "entropy": 5.309163236618042, "epoch": 0.776657060518732, "grad_norm": 1.171875, "learning_rate": 0.0004947773733461539, "loss": 5.18, "mean_token_accuracy": 0.20123105943202974, "num_tokens": 18545045.0, "step": 8085 }, { "entropy": 5.278306007385254, "epoch": 0.777137367915466, "grad_norm": 1.1484375, "learning_rate": 0.0004947700270231347, "loss": 5.1526, "mean_token_accuracy": 0.2032118022441864, "num_tokens": 18557531.0, "step": 8090 }, { "entropy": 5.23764853477478, "epoch": 0.7776176753121998, "grad_norm": 1.1328125, "learning_rate": 0.0004947626755977079, "loss": 5.1887, "mean_token_accuracy": 0.19730121344327928, "num_tokens": 18569127.0, "step": 8095 }, { "entropy": 5.3856611251831055, "epoch": 0.7780979827089337, "grad_norm": 1.34375, "learning_rate": 0.0004947553190700444, "loss": 5.2255, "mean_token_accuracy": 0.20432638376951218, "num_tokens": 18580606.0, "step": 8100 }, { "entropy": 5.353847932815552, "epoch": 0.7785782901056676, "grad_norm": 1.2109375, "learning_rate": 0.000494747957440315, "loss": 5.3448, "mean_token_accuracy": 0.1883766993880272, "num_tokens": 18592330.0, "step": 8105 }, { "entropy": 5.406799602508545, "epoch": 0.7790585975024016, "grad_norm": 1.2734375, "learning_rate": 0.0004947405907086905, "loss": 5.3101, "mean_token_accuracy": 0.1930047556757927, "num_tokens": 18604721.0, "step": 8110 }, { "entropy": 5.2951795101165775, "epoch": 0.7795389048991355, "grad_norm": 1.1953125, "learning_rate": 0.0004947332188753419, "loss": 5.1811, "mean_token_accuracy": 0.2039830431342125, "num_tokens": 18616814.0, "step": 8115 }, { "entropy": 5.2599467754364015, "epoch": 0.7800192122958693, "grad_norm": 1.2578125, "learning_rate": 0.0004947258419404405, "loss": 5.1832, "mean_token_accuracy": 0.19927904903888702, "num_tokens": 18628224.0, "step": 8120 }, { "entropy": 5.2949143409729, "epoch": 0.7804995196926032, "grad_norm": 1.171875, "learning_rate": 0.0004947184599041576, "loss": 5.2286, "mean_token_accuracy": 0.19865068048238754, "num_tokens": 18639777.0, "step": 8125 }, { "entropy": 5.241250896453858, "epoch": 0.7809798270893372, "grad_norm": 1.1953125, "learning_rate": 0.0004947110727666644, "loss": 5.1412, "mean_token_accuracy": 0.2019078940153122, "num_tokens": 18651044.0, "step": 8130 }, { "entropy": 5.28899393081665, "epoch": 0.7814601344860711, "grad_norm": 1.140625, "learning_rate": 0.0004947036805281325, "loss": 5.2099, "mean_token_accuracy": 0.2031030997633934, "num_tokens": 18663142.0, "step": 8135 }, { "entropy": 5.4257384777069095, "epoch": 0.781940441882805, "grad_norm": 1.3671875, "learning_rate": 0.0004946962831887336, "loss": 5.2878, "mean_token_accuracy": 0.1936602771282196, "num_tokens": 18674079.0, "step": 8140 }, { "entropy": 5.335578870773316, "epoch": 0.7824207492795389, "grad_norm": 1.3125, "learning_rate": 0.0004946888807486393, "loss": 5.272, "mean_token_accuracy": 0.1942149966955185, "num_tokens": 18685744.0, "step": 8145 }, { "entropy": 5.229654836654663, "epoch": 0.7829010566762729, "grad_norm": 1.234375, "learning_rate": 0.0004946814732080214, "loss": 5.1577, "mean_token_accuracy": 0.19906039535999298, "num_tokens": 18697049.0, "step": 8150 }, { "entropy": 5.2480401515960695, "epoch": 0.7833813640730067, "grad_norm": 1.1484375, "learning_rate": 0.0004946740605670523, "loss": 5.171, "mean_token_accuracy": 0.20052818953990936, "num_tokens": 18708765.0, "step": 8155 }, { "entropy": 5.28274884223938, "epoch": 0.7838616714697406, "grad_norm": 1.15625, "learning_rate": 0.0004946666428259037, "loss": 5.1497, "mean_token_accuracy": 0.20465652495622635, "num_tokens": 18719819.0, "step": 8160 }, { "entropy": 5.361910057067871, "epoch": 0.7843419788664745, "grad_norm": 1.2421875, "learning_rate": 0.0004946592199847478, "loss": 5.2101, "mean_token_accuracy": 0.19702319502830506, "num_tokens": 18730668.0, "step": 8165 }, { "entropy": 5.166529130935669, "epoch": 0.7848222862632085, "grad_norm": 1.46875, "learning_rate": 0.0004946517920437571, "loss": 5.0232, "mean_token_accuracy": 0.205000402033329, "num_tokens": 18741991.0, "step": 8170 }, { "entropy": 5.288968944549561, "epoch": 0.7853025936599424, "grad_norm": 1.1640625, "learning_rate": 0.0004946443590031041, "loss": 5.2402, "mean_token_accuracy": 0.1971651256084442, "num_tokens": 18753919.0, "step": 8175 }, { "entropy": 5.322829055786133, "epoch": 0.7857829010566763, "grad_norm": 1.203125, "learning_rate": 0.0004946369208629613, "loss": 5.136, "mean_token_accuracy": 0.20793365240097045, "num_tokens": 18764342.0, "step": 8180 }, { "entropy": 5.140931224822998, "epoch": 0.7862632084534101, "grad_norm": 1.2265625, "learning_rate": 0.0004946294776235013, "loss": 5.1578, "mean_token_accuracy": 0.2009105786681175, "num_tokens": 18776044.0, "step": 8185 }, { "entropy": 5.405412912368774, "epoch": 0.7867435158501441, "grad_norm": 1.1640625, "learning_rate": 0.0004946220292848971, "loss": 5.2153, "mean_token_accuracy": 0.20181388556957244, "num_tokens": 18787354.0, "step": 8190 }, { "entropy": 5.33922643661499, "epoch": 0.787223823246878, "grad_norm": 1.2265625, "learning_rate": 0.0004946145758473214, "loss": 5.1366, "mean_token_accuracy": 0.20076511055231094, "num_tokens": 18797845.0, "step": 8195 }, { "entropy": 5.270247936248779, "epoch": 0.7877041306436119, "grad_norm": 1.25, "learning_rate": 0.0004946071173109475, "loss": 5.2099, "mean_token_accuracy": 0.19680924713611603, "num_tokens": 18809253.0, "step": 8200 }, { "entropy": 5.210458469390869, "epoch": 0.7881844380403458, "grad_norm": 1.1875, "learning_rate": 0.0004945996536759484, "loss": 5.0893, "mean_token_accuracy": 0.20848129391670228, "num_tokens": 18819768.0, "step": 8205 }, { "entropy": 5.4275593757629395, "epoch": 0.7886647454370798, "grad_norm": 1.234375, "learning_rate": 0.0004945921849424974, "loss": 5.3408, "mean_token_accuracy": 0.19263991117477416, "num_tokens": 18831151.0, "step": 8210 }, { "entropy": 5.344443464279175, "epoch": 0.7891450528338136, "grad_norm": 1.3203125, "learning_rate": 0.0004945847111107679, "loss": 5.113, "mean_token_accuracy": 0.20607621520757674, "num_tokens": 18842133.0, "step": 8215 }, { "entropy": 5.285537433624268, "epoch": 0.7896253602305475, "grad_norm": 1.265625, "learning_rate": 0.0004945772321809334, "loss": 5.2747, "mean_token_accuracy": 0.19406631737947463, "num_tokens": 18853295.0, "step": 8220 }, { "entropy": 5.242657232284546, "epoch": 0.7901056676272814, "grad_norm": 1.1484375, "learning_rate": 0.0004945697481531677, "loss": 5.2147, "mean_token_accuracy": 0.20358818471431733, "num_tokens": 18865802.0, "step": 8225 }, { "entropy": 5.291993951797485, "epoch": 0.7905859750240154, "grad_norm": 1.2578125, "learning_rate": 0.0004945622590276443, "loss": 5.1269, "mean_token_accuracy": 0.21141389459371568, "num_tokens": 18877693.0, "step": 8230 }, { "entropy": 5.316649341583252, "epoch": 0.7910662824207493, "grad_norm": 1.25, "learning_rate": 0.0004945547648045373, "loss": 5.1811, "mean_token_accuracy": 0.20542819797992706, "num_tokens": 18888549.0, "step": 8235 }, { "entropy": 5.251237916946411, "epoch": 0.7915465898174832, "grad_norm": 1.5, "learning_rate": 0.0004945472654840206, "loss": 5.1278, "mean_token_accuracy": 0.20496677309274675, "num_tokens": 18899132.0, "step": 8240 }, { "entropy": 5.192249727249146, "epoch": 0.7920268972142172, "grad_norm": 1.2421875, "learning_rate": 0.0004945397610662683, "loss": 5.1362, "mean_token_accuracy": 0.1992405891418457, "num_tokens": 18911774.0, "step": 8245 }, { "entropy": 5.370453500747681, "epoch": 0.792507204610951, "grad_norm": 1.640625, "learning_rate": 0.0004945322515514547, "loss": 5.2096, "mean_token_accuracy": 0.20203327834606172, "num_tokens": 18922806.0, "step": 8250 }, { "entropy": 5.284592056274414, "epoch": 0.7929875120076849, "grad_norm": 1.203125, "learning_rate": 0.000494524736939754, "loss": 5.1768, "mean_token_accuracy": 0.20510386675596237, "num_tokens": 18934861.0, "step": 8255 }, { "entropy": 5.287734031677246, "epoch": 0.7934678194044188, "grad_norm": 1.1953125, "learning_rate": 0.0004945172172313408, "loss": 5.1468, "mean_token_accuracy": 0.20960791260004044, "num_tokens": 18944269.0, "step": 8260 }, { "entropy": 5.277343654632569, "epoch": 0.7939481268011528, "grad_norm": 1.625, "learning_rate": 0.0004945096924263896, "loss": 5.2071, "mean_token_accuracy": 0.19710262566804887, "num_tokens": 18957197.0, "step": 8265 }, { "entropy": 5.340837478637695, "epoch": 0.7944284341978867, "grad_norm": 1.1875, "learning_rate": 0.0004945021625250753, "loss": 5.1948, "mean_token_accuracy": 0.19287520945072173, "num_tokens": 18968254.0, "step": 8270 }, { "entropy": 5.223880767822266, "epoch": 0.7949087415946205, "grad_norm": 1.21875, "learning_rate": 0.0004944946275275724, "loss": 5.115, "mean_token_accuracy": 0.2050468847155571, "num_tokens": 18979372.0, "step": 8275 }, { "entropy": 5.223621273040772, "epoch": 0.7953890489913544, "grad_norm": 1.28125, "learning_rate": 0.0004944870874340561, "loss": 5.1239, "mean_token_accuracy": 0.20524471253156662, "num_tokens": 18991075.0, "step": 8280 }, { "entropy": 5.2544965744018555, "epoch": 0.7958693563880884, "grad_norm": 1.171875, "learning_rate": 0.0004944795422447013, "loss": 5.0548, "mean_token_accuracy": 0.20748359262943267, "num_tokens": 19002324.0, "step": 8285 }, { "entropy": 5.34930009841919, "epoch": 0.7963496637848223, "grad_norm": 1.2265625, "learning_rate": 0.0004944719919596835, "loss": 5.2493, "mean_token_accuracy": 0.1979260191321373, "num_tokens": 19014406.0, "step": 8290 }, { "entropy": 5.200772380828857, "epoch": 0.7968299711815562, "grad_norm": 1.1328125, "learning_rate": 0.0004944644365791776, "loss": 5.1136, "mean_token_accuracy": 0.20155889242887498, "num_tokens": 19025984.0, "step": 8295 }, { "entropy": 5.297162580490112, "epoch": 0.7973102785782901, "grad_norm": 1.296875, "learning_rate": 0.000494456876103359, "loss": 5.2354, "mean_token_accuracy": 0.20328541100025177, "num_tokens": 19036189.0, "step": 8300 }, { "entropy": 5.296557950973511, "epoch": 0.7977905859750241, "grad_norm": 1.171875, "learning_rate": 0.0004944493105324035, "loss": 5.207, "mean_token_accuracy": 0.19707799553871155, "num_tokens": 19047587.0, "step": 8305 }, { "entropy": 5.326691627502441, "epoch": 0.7982708933717579, "grad_norm": 1.2265625, "learning_rate": 0.0004944417398664866, "loss": 5.2014, "mean_token_accuracy": 0.1997044637799263, "num_tokens": 19058467.0, "step": 8310 }, { "entropy": 5.3209089756011965, "epoch": 0.7987512007684918, "grad_norm": 1.2421875, "learning_rate": 0.0004944341641057843, "loss": 5.2646, "mean_token_accuracy": 0.19579226821660994, "num_tokens": 19070235.0, "step": 8315 }, { "entropy": 5.347403049468994, "epoch": 0.7992315081652257, "grad_norm": 1.2734375, "learning_rate": 0.0004944265832504721, "loss": 5.2227, "mean_token_accuracy": 0.19858405888080596, "num_tokens": 19082005.0, "step": 8320 }, { "entropy": 5.348876476287842, "epoch": 0.7997118155619597, "grad_norm": 1.3046875, "learning_rate": 0.0004944189973007262, "loss": 5.2395, "mean_token_accuracy": 0.1973268657922745, "num_tokens": 19092922.0, "step": 8325 }, { "entropy": 5.313145542144776, "epoch": 0.8001921229586936, "grad_norm": 1.1953125, "learning_rate": 0.0004944114062567229, "loss": 5.2259, "mean_token_accuracy": 0.19848893135786055, "num_tokens": 19104832.0, "step": 8330 }, { "entropy": 5.300992155075074, "epoch": 0.8006724303554275, "grad_norm": 1.34375, "learning_rate": 0.0004944038101186381, "loss": 5.2693, "mean_token_accuracy": 0.20075047612190247, "num_tokens": 19116261.0, "step": 8335 }, { "entropy": 5.354841804504394, "epoch": 0.8011527377521613, "grad_norm": 1.234375, "learning_rate": 0.0004943962088866483, "loss": 5.2437, "mean_token_accuracy": 0.19408100843429565, "num_tokens": 19127195.0, "step": 8340 }, { "entropy": 5.398009014129639, "epoch": 0.8016330451488953, "grad_norm": 1.25, "learning_rate": 0.0004943886025609301, "loss": 5.2731, "mean_token_accuracy": 0.19554793536663057, "num_tokens": 19138164.0, "step": 8345 }, { "entropy": 5.338754367828369, "epoch": 0.8021133525456292, "grad_norm": 1.2734375, "learning_rate": 0.00049438099114166, "loss": 5.2362, "mean_token_accuracy": 0.19320564866065978, "num_tokens": 19149153.0, "step": 8350 }, { "entropy": 5.311693477630615, "epoch": 0.8025936599423631, "grad_norm": 1.2734375, "learning_rate": 0.0004943733746290147, "loss": 5.2499, "mean_token_accuracy": 0.19263479709625245, "num_tokens": 19161023.0, "step": 8355 }, { "entropy": 5.233406496047974, "epoch": 0.803073967339097, "grad_norm": 1.28125, "learning_rate": 0.000494365753023171, "loss": 5.1045, "mean_token_accuracy": 0.21303804814815522, "num_tokens": 19172466.0, "step": 8360 }, { "entropy": 5.315703201293945, "epoch": 0.803554274735831, "grad_norm": 1.203125, "learning_rate": 0.0004943581263243059, "loss": 5.1847, "mean_token_accuracy": 0.20322347730398177, "num_tokens": 19183684.0, "step": 8365 }, { "entropy": 5.203450679779053, "epoch": 0.8040345821325648, "grad_norm": 1.234375, "learning_rate": 0.0004943504945325965, "loss": 5.0808, "mean_token_accuracy": 0.20951220840215684, "num_tokens": 19194479.0, "step": 8370 }, { "entropy": 5.310950231552124, "epoch": 0.8045148895292987, "grad_norm": 1.0546875, "learning_rate": 0.0004943428576482198, "loss": 5.1797, "mean_token_accuracy": 0.19598036706447602, "num_tokens": 19206323.0, "step": 8375 }, { "entropy": 5.409931755065918, "epoch": 0.8049951969260326, "grad_norm": 1.28125, "learning_rate": 0.0004943352156713535, "loss": 5.2646, "mean_token_accuracy": 0.19424921572208403, "num_tokens": 19218849.0, "step": 8380 }, { "entropy": 5.2228189468383786, "epoch": 0.8054755043227666, "grad_norm": 1.2109375, "learning_rate": 0.0004943275686021747, "loss": 5.0933, "mean_token_accuracy": 0.2045307993888855, "num_tokens": 19229603.0, "step": 8385 }, { "entropy": 5.316561031341553, "epoch": 0.8059558117195005, "grad_norm": 1.21875, "learning_rate": 0.000494319916440861, "loss": 5.2698, "mean_token_accuracy": 0.19353571087121962, "num_tokens": 19241318.0, "step": 8390 }, { "entropy": 5.331172943115234, "epoch": 0.8064361191162344, "grad_norm": 1.2109375, "learning_rate": 0.0004943122591875901, "loss": 5.1608, "mean_token_accuracy": 0.201753132045269, "num_tokens": 19252640.0, "step": 8395 }, { "entropy": 5.2201464653015135, "epoch": 0.8069164265129684, "grad_norm": 1.2421875, "learning_rate": 0.0004943045968425398, "loss": 5.1455, "mean_token_accuracy": 0.20201311111450196, "num_tokens": 19262971.0, "step": 8400 }, { "entropy": 5.3145428657531735, "epoch": 0.8073967339097022, "grad_norm": 1.1328125, "learning_rate": 0.0004942969294058878, "loss": 5.2328, "mean_token_accuracy": 0.1995360553264618, "num_tokens": 19274426.0, "step": 8405 }, { "entropy": 5.307784461975098, "epoch": 0.8078770413064361, "grad_norm": 1.1640625, "learning_rate": 0.0004942892568778125, "loss": 5.1895, "mean_token_accuracy": 0.20282406657934188, "num_tokens": 19286806.0, "step": 8410 }, { "entropy": 5.257102823257446, "epoch": 0.80835734870317, "grad_norm": 1.1953125, "learning_rate": 0.0004942815792584917, "loss": 5.164, "mean_token_accuracy": 0.20003714710474013, "num_tokens": 19297997.0, "step": 8415 }, { "entropy": 5.2559874057769775, "epoch": 0.808837656099904, "grad_norm": 1.1796875, "learning_rate": 0.0004942738965481038, "loss": 5.1548, "mean_token_accuracy": 0.2016161561012268, "num_tokens": 19309789.0, "step": 8420 }, { "entropy": 5.270598459243774, "epoch": 0.8093179634966379, "grad_norm": 1.140625, "learning_rate": 0.0004942662087468272, "loss": 5.1497, "mean_token_accuracy": 0.2061583325266838, "num_tokens": 19320688.0, "step": 8425 }, { "entropy": 5.307732200622558, "epoch": 0.8097982708933718, "grad_norm": 1.2265625, "learning_rate": 0.0004942585158548402, "loss": 5.1946, "mean_token_accuracy": 0.2020473822951317, "num_tokens": 19331670.0, "step": 8430 }, { "entropy": 5.252246427536011, "epoch": 0.8102785782901056, "grad_norm": 1.3359375, "learning_rate": 0.0004942508178723214, "loss": 5.1434, "mean_token_accuracy": 0.19983597844839096, "num_tokens": 19343578.0, "step": 8435 }, { "entropy": 5.280768489837646, "epoch": 0.8107588856868396, "grad_norm": 1.234375, "learning_rate": 0.0004942431147994499, "loss": 5.1919, "mean_token_accuracy": 0.20007235407829285, "num_tokens": 19354875.0, "step": 8440 }, { "entropy": 5.264632368087769, "epoch": 0.8112391930835735, "grad_norm": 1.09375, "learning_rate": 0.0004942354066364042, "loss": 5.1298, "mean_token_accuracy": 0.20225782990455626, "num_tokens": 19366353.0, "step": 8445 }, { "entropy": 5.154706716537476, "epoch": 0.8117195004803074, "grad_norm": 1.1953125, "learning_rate": 0.0004942276933833634, "loss": 5.0671, "mean_token_accuracy": 0.2101285368204117, "num_tokens": 19377534.0, "step": 8450 }, { "entropy": 5.28000054359436, "epoch": 0.8121998078770413, "grad_norm": 1.234375, "learning_rate": 0.0004942199750405064, "loss": 5.2018, "mean_token_accuracy": 0.19314154237508774, "num_tokens": 19388188.0, "step": 8455 }, { "entropy": 5.197183513641358, "epoch": 0.8126801152737753, "grad_norm": 1.1875, "learning_rate": 0.0004942122516080127, "loss": 5.0585, "mean_token_accuracy": 0.21394696682691575, "num_tokens": 19399910.0, "step": 8460 }, { "entropy": 5.321963691711426, "epoch": 0.8131604226705091, "grad_norm": 1.203125, "learning_rate": 0.0004942045230860614, "loss": 5.2039, "mean_token_accuracy": 0.20521147847175597, "num_tokens": 19411715.0, "step": 8465 }, { "entropy": 5.375604724884033, "epoch": 0.813640730067243, "grad_norm": 1.1875, "learning_rate": 0.0004941967894748319, "loss": 5.2942, "mean_token_accuracy": 0.19328842014074327, "num_tokens": 19423275.0, "step": 8470 }, { "entropy": 5.287278127670288, "epoch": 0.8141210374639769, "grad_norm": 1.15625, "learning_rate": 0.0004941890507745039, "loss": 5.221, "mean_token_accuracy": 0.1964712470769882, "num_tokens": 19436035.0, "step": 8475 }, { "entropy": 5.246680879592896, "epoch": 0.8146013448607109, "grad_norm": 1.2421875, "learning_rate": 0.0004941813069852569, "loss": 5.1855, "mean_token_accuracy": 0.19806223958730698, "num_tokens": 19447755.0, "step": 8480 }, { "entropy": 5.337065172195435, "epoch": 0.8150816522574448, "grad_norm": 1.1796875, "learning_rate": 0.0004941735581072708, "loss": 5.1272, "mean_token_accuracy": 0.20841425210237502, "num_tokens": 19459044.0, "step": 8485 }, { "entropy": 5.253565120697021, "epoch": 0.8155619596541787, "grad_norm": 1.4453125, "learning_rate": 0.0004941658041407255, "loss": 5.1429, "mean_token_accuracy": 0.20152915716171266, "num_tokens": 19471486.0, "step": 8490 }, { "entropy": 5.191393518447876, "epoch": 0.8160422670509125, "grad_norm": 1.296875, "learning_rate": 0.000494158045085801, "loss": 5.0798, "mean_token_accuracy": 0.2094142973423004, "num_tokens": 19482428.0, "step": 8495 }, { "entropy": 5.303775215148926, "epoch": 0.8165225744476465, "grad_norm": 1.3125, "learning_rate": 0.0004941502809426776, "loss": 5.2344, "mean_token_accuracy": 0.20312505811452866, "num_tokens": 19494616.0, "step": 8500 }, { "entropy": 5.311273241043091, "epoch": 0.8170028818443804, "grad_norm": 1.359375, "learning_rate": 0.0004941425117115354, "loss": 5.1312, "mean_token_accuracy": 0.19991155862808227, "num_tokens": 19504953.0, "step": 8505 }, { "entropy": 5.307030820846558, "epoch": 0.8174831892411143, "grad_norm": 1.1171875, "learning_rate": 0.0004941347373925547, "loss": 5.2125, "mean_token_accuracy": 0.20029536336660386, "num_tokens": 19517408.0, "step": 8510 }, { "entropy": 5.188129425048828, "epoch": 0.8179634966378482, "grad_norm": 1.2734375, "learning_rate": 0.0004941269579859161, "loss": 5.1463, "mean_token_accuracy": 0.20623117536306382, "num_tokens": 19529190.0, "step": 8515 }, { "entropy": 5.283451843261719, "epoch": 0.8184438040345822, "grad_norm": 1.21875, "learning_rate": 0.0004941191734918002, "loss": 5.1822, "mean_token_accuracy": 0.19969442188739778, "num_tokens": 19540278.0, "step": 8520 }, { "entropy": 5.270493650436402, "epoch": 0.818924111431316, "grad_norm": 1.15625, "learning_rate": 0.000494111383910388, "loss": 5.1628, "mean_token_accuracy": 0.2021285906434059, "num_tokens": 19551224.0, "step": 8525 }, { "entropy": 5.344288444519043, "epoch": 0.8194044188280499, "grad_norm": 1.1640625, "learning_rate": 0.0004941035892418597, "loss": 5.2, "mean_token_accuracy": 0.19705056995153428, "num_tokens": 19564386.0, "step": 8530 }, { "entropy": 5.330119323730469, "epoch": 0.8198847262247838, "grad_norm": 1.125, "learning_rate": 0.0004940957894863968, "loss": 5.1846, "mean_token_accuracy": 0.20080768764019014, "num_tokens": 19576170.0, "step": 8535 }, { "entropy": 5.294156408309936, "epoch": 0.8203650336215178, "grad_norm": 1.1953125, "learning_rate": 0.0004940879846441804, "loss": 5.1453, "mean_token_accuracy": 0.20027249306440353, "num_tokens": 19587220.0, "step": 8540 }, { "entropy": 5.254690933227539, "epoch": 0.8208453410182517, "grad_norm": 1.140625, "learning_rate": 0.0004940801747153914, "loss": 5.152, "mean_token_accuracy": 0.20080652981996536, "num_tokens": 19598649.0, "step": 8545 }, { "entropy": 5.252722549438476, "epoch": 0.8213256484149856, "grad_norm": 1.234375, "learning_rate": 0.0004940723597002113, "loss": 5.2088, "mean_token_accuracy": 0.1964610293507576, "num_tokens": 19610243.0, "step": 8550 }, { "entropy": 5.271253156661987, "epoch": 0.8218059558117194, "grad_norm": 1.296875, "learning_rate": 0.0004940645395988216, "loss": 5.2269, "mean_token_accuracy": 0.1978047624230385, "num_tokens": 19621467.0, "step": 8555 }, { "entropy": 5.397564172744751, "epoch": 0.8222862632084534, "grad_norm": 1.203125, "learning_rate": 0.0004940567144114036, "loss": 5.3089, "mean_token_accuracy": 0.18792566508054734, "num_tokens": 19633367.0, "step": 8560 }, { "entropy": 5.240535068511963, "epoch": 0.8227665706051873, "grad_norm": 1.21875, "learning_rate": 0.0004940488841381393, "loss": 5.1488, "mean_token_accuracy": 0.20554967522621154, "num_tokens": 19643144.0, "step": 8565 }, { "entropy": 5.280447959899902, "epoch": 0.8232468780019212, "grad_norm": 1.171875, "learning_rate": 0.0004940410487792103, "loss": 5.238, "mean_token_accuracy": 0.19904158860445023, "num_tokens": 19654501.0, "step": 8570 }, { "entropy": 5.359654140472412, "epoch": 0.8237271853986552, "grad_norm": 1.4296875, "learning_rate": 0.0004940332083347986, "loss": 5.1943, "mean_token_accuracy": 0.195090389251709, "num_tokens": 19665382.0, "step": 8575 }, { "entropy": 5.263187026977539, "epoch": 0.8242074927953891, "grad_norm": 1.0859375, "learning_rate": 0.0004940253628050861, "loss": 5.0497, "mean_token_accuracy": 0.2040240526199341, "num_tokens": 19677222.0, "step": 8580 }, { "entropy": 5.184061717987061, "epoch": 0.824687800192123, "grad_norm": 1.234375, "learning_rate": 0.0004940175121902552, "loss": 5.1351, "mean_token_accuracy": 0.2033315122127533, "num_tokens": 19688550.0, "step": 8585 }, { "entropy": 5.364740371704102, "epoch": 0.8251681075888568, "grad_norm": 1.3203125, "learning_rate": 0.000494009656490488, "loss": 5.2543, "mean_token_accuracy": 0.19725525230169297, "num_tokens": 19700125.0, "step": 8590 }, { "entropy": 5.361681652069092, "epoch": 0.8256484149855908, "grad_norm": 1.125, "learning_rate": 0.0004940017957059668, "loss": 5.2424, "mean_token_accuracy": 0.19780150651931763, "num_tokens": 19711969.0, "step": 8595 }, { "entropy": 5.24332218170166, "epoch": 0.8261287223823247, "grad_norm": 1.2109375, "learning_rate": 0.0004939939298368742, "loss": 5.1943, "mean_token_accuracy": 0.20086236745119096, "num_tokens": 19723813.0, "step": 8600 }, { "entropy": 5.300223016738892, "epoch": 0.8266090297790586, "grad_norm": 1.2578125, "learning_rate": 0.0004939860588833929, "loss": 5.2175, "mean_token_accuracy": 0.1928972378373146, "num_tokens": 19735539.0, "step": 8605 }, { "entropy": 5.409295749664307, "epoch": 0.8270893371757925, "grad_norm": 1.1875, "learning_rate": 0.0004939781828457055, "loss": 5.3039, "mean_token_accuracy": 0.19560184627771376, "num_tokens": 19747106.0, "step": 8610 }, { "entropy": 5.279783630371094, "epoch": 0.8275696445725265, "grad_norm": 1.1015625, "learning_rate": 0.000493970301723995, "loss": 5.2012, "mean_token_accuracy": 0.21216825842857362, "num_tokens": 19760156.0, "step": 8615 }, { "entropy": 5.249637460708618, "epoch": 0.8280499519692603, "grad_norm": 1.03125, "learning_rate": 0.0004939624155184443, "loss": 5.0916, "mean_token_accuracy": 0.20376598685979844, "num_tokens": 19771256.0, "step": 8620 }, { "entropy": 5.247362232208252, "epoch": 0.8285302593659942, "grad_norm": 1.53125, "learning_rate": 0.0004939545242292365, "loss": 5.0985, "mean_token_accuracy": 0.20896128118038176, "num_tokens": 19781332.0, "step": 8625 }, { "entropy": 5.298260116577149, "epoch": 0.8290105667627281, "grad_norm": 1.03125, "learning_rate": 0.0004939466278565547, "loss": 5.2686, "mean_token_accuracy": 0.19464389234781265, "num_tokens": 19793573.0, "step": 8630 }, { "entropy": 5.274344348907471, "epoch": 0.8294908741594621, "grad_norm": 1.1875, "learning_rate": 0.0004939387264005825, "loss": 5.1697, "mean_token_accuracy": 0.20536390393972398, "num_tokens": 19803421.0, "step": 8635 }, { "entropy": 5.219406032562256, "epoch": 0.829971181556196, "grad_norm": 1.1953125, "learning_rate": 0.0004939308198615031, "loss": 5.0333, "mean_token_accuracy": 0.21232510209083558, "num_tokens": 19814440.0, "step": 8640 }, { "entropy": 5.218847370147705, "epoch": 0.8304514889529299, "grad_norm": 1.1484375, "learning_rate": 0.0004939229082395001, "loss": 5.1656, "mean_token_accuracy": 0.203065949678421, "num_tokens": 19825721.0, "step": 8645 }, { "entropy": 5.23597731590271, "epoch": 0.8309317963496637, "grad_norm": 1.1953125, "learning_rate": 0.0004939149915347573, "loss": 5.118, "mean_token_accuracy": 0.20436252951622008, "num_tokens": 19837273.0, "step": 8650 }, { "entropy": 5.303889560699463, "epoch": 0.8314121037463977, "grad_norm": 1.2109375, "learning_rate": 0.0004939070697474585, "loss": 5.1542, "mean_token_accuracy": 0.20653810799121858, "num_tokens": 19849436.0, "step": 8655 }, { "entropy": 5.191540098190307, "epoch": 0.8318924111431316, "grad_norm": 1.2109375, "learning_rate": 0.0004938991428777875, "loss": 5.024, "mean_token_accuracy": 0.21526659429073333, "num_tokens": 19860222.0, "step": 8660 }, { "entropy": 5.258981561660766, "epoch": 0.8323727185398655, "grad_norm": 1.203125, "learning_rate": 0.0004938912109259284, "loss": 5.1197, "mean_token_accuracy": 0.20716548562049866, "num_tokens": 19870934.0, "step": 8665 }, { "entropy": 5.302361869812012, "epoch": 0.8328530259365994, "grad_norm": 1.328125, "learning_rate": 0.0004938832738920654, "loss": 5.1999, "mean_token_accuracy": 0.19550460278987886, "num_tokens": 19882149.0, "step": 8670 }, { "entropy": 5.235181427001953, "epoch": 0.8333333333333334, "grad_norm": 1.1640625, "learning_rate": 0.0004938753317763826, "loss": 5.1105, "mean_token_accuracy": 0.20328403115272523, "num_tokens": 19893276.0, "step": 8675 }, { "entropy": 5.312360382080078, "epoch": 0.8338136407300673, "grad_norm": 1.1796875, "learning_rate": 0.0004938673845790646, "loss": 5.285, "mean_token_accuracy": 0.193190498650074, "num_tokens": 19904723.0, "step": 8680 }, { "entropy": 5.3193567276000975, "epoch": 0.8342939481268011, "grad_norm": 1.171875, "learning_rate": 0.0004938594323002957, "loss": 5.1907, "mean_token_accuracy": 0.1986311361193657, "num_tokens": 19915226.0, "step": 8685 }, { "entropy": 5.285708665847778, "epoch": 0.834774255523535, "grad_norm": 1.15625, "learning_rate": 0.0004938514749402609, "loss": 5.2077, "mean_token_accuracy": 0.19915120750665666, "num_tokens": 19927911.0, "step": 8690 }, { "entropy": 5.2792503356933596, "epoch": 0.835254562920269, "grad_norm": 1.2109375, "learning_rate": 0.0004938435124991447, "loss": 5.2415, "mean_token_accuracy": 0.2002886116504669, "num_tokens": 19940356.0, "step": 8695 }, { "entropy": 5.316353034973145, "epoch": 0.8357348703170029, "grad_norm": 1.265625, "learning_rate": 0.0004938355449771318, "loss": 5.2102, "mean_token_accuracy": 0.1947380557656288, "num_tokens": 19951108.0, "step": 8700 }, { "entropy": 5.217819595336914, "epoch": 0.8362151777137368, "grad_norm": 1.2421875, "learning_rate": 0.0004938275723744075, "loss": 5.1384, "mean_token_accuracy": 0.20654254257678986, "num_tokens": 19962427.0, "step": 8705 }, { "entropy": 5.329318904876709, "epoch": 0.8366954851104706, "grad_norm": 1.2265625, "learning_rate": 0.0004938195946911567, "loss": 5.2467, "mean_token_accuracy": 0.1941828101873398, "num_tokens": 19973476.0, "step": 8710 }, { "entropy": 5.303026580810547, "epoch": 0.8371757925072046, "grad_norm": 1.2578125, "learning_rate": 0.0004938116119275645, "loss": 5.1976, "mean_token_accuracy": 0.20169365853071214, "num_tokens": 19984034.0, "step": 8715 }, { "entropy": 5.358089828491211, "epoch": 0.8376560999039385, "grad_norm": 1.1171875, "learning_rate": 0.0004938036240838166, "loss": 5.2273, "mean_token_accuracy": 0.19537217020988465, "num_tokens": 19996035.0, "step": 8720 }, { "entropy": 5.276670169830322, "epoch": 0.8381364073006724, "grad_norm": 1.203125, "learning_rate": 0.0004937956311600983, "loss": 5.1475, "mean_token_accuracy": 0.20285791158676147, "num_tokens": 20007259.0, "step": 8725 }, { "entropy": 5.261865663528442, "epoch": 0.8386167146974063, "grad_norm": 1.2109375, "learning_rate": 0.0004937876331565951, "loss": 5.1463, "mean_token_accuracy": 0.19915680289268495, "num_tokens": 20018673.0, "step": 8730 }, { "entropy": 5.294298076629639, "epoch": 0.8390970220941403, "grad_norm": 1.203125, "learning_rate": 0.0004937796300734926, "loss": 5.1975, "mean_token_accuracy": 0.20387612730264665, "num_tokens": 20030767.0, "step": 8735 }, { "entropy": 5.3252601146698, "epoch": 0.8395773294908742, "grad_norm": 1.203125, "learning_rate": 0.0004937716219109769, "loss": 5.2188, "mean_token_accuracy": 0.19471233934164048, "num_tokens": 20041268.0, "step": 8740 }, { "entropy": 5.241558361053467, "epoch": 0.840057636887608, "grad_norm": 1.2265625, "learning_rate": 0.0004937636086692336, "loss": 5.0847, "mean_token_accuracy": 0.21300528049468995, "num_tokens": 20052285.0, "step": 8745 }, { "entropy": 5.35512056350708, "epoch": 0.840537944284342, "grad_norm": 1.3671875, "learning_rate": 0.000493755590348449, "loss": 5.2941, "mean_token_accuracy": 0.19627934098243713, "num_tokens": 20065113.0, "step": 8750 }, { "entropy": 5.437167024612426, "epoch": 0.8410182516810759, "grad_norm": 1.40625, "learning_rate": 0.0004937475669488091, "loss": 5.2709, "mean_token_accuracy": 0.1977367326617241, "num_tokens": 20076151.0, "step": 8755 }, { "entropy": 5.263547420501709, "epoch": 0.8414985590778098, "grad_norm": 1.1640625, "learning_rate": 0.0004937395384705004, "loss": 5.2141, "mean_token_accuracy": 0.19887446761131286, "num_tokens": 20088195.0, "step": 8760 }, { "entropy": 5.211323404312134, "epoch": 0.8419788664745437, "grad_norm": 1.21875, "learning_rate": 0.0004937315049137089, "loss": 5.0603, "mean_token_accuracy": 0.21020377576351165, "num_tokens": 20098576.0, "step": 8765 }, { "entropy": 5.343924474716187, "epoch": 0.8424591738712777, "grad_norm": 1.21875, "learning_rate": 0.0004937234662786216, "loss": 5.2761, "mean_token_accuracy": 0.19456166923046112, "num_tokens": 20110176.0, "step": 8770 }, { "entropy": 5.3267858028411865, "epoch": 0.8429394812680115, "grad_norm": 1.2890625, "learning_rate": 0.0004937154225654246, "loss": 5.18, "mean_token_accuracy": 0.20727563351392747, "num_tokens": 20121713.0, "step": 8775 }, { "entropy": 5.364311075210571, "epoch": 0.8434197886647454, "grad_norm": 1.2421875, "learning_rate": 0.0004937073737743051, "loss": 5.2787, "mean_token_accuracy": 0.19634425789117813, "num_tokens": 20134126.0, "step": 8780 }, { "entropy": 5.2509232521057125, "epoch": 0.8439000960614793, "grad_norm": 1.1171875, "learning_rate": 0.0004936993199054499, "loss": 5.1719, "mean_token_accuracy": 0.20614974945783615, "num_tokens": 20144791.0, "step": 8785 }, { "entropy": 5.353460693359375, "epoch": 0.8443804034582133, "grad_norm": 1.171875, "learning_rate": 0.0004936912609590458, "loss": 5.3211, "mean_token_accuracy": 0.19727177768945695, "num_tokens": 20157214.0, "step": 8790 }, { "entropy": 5.415266323089599, "epoch": 0.8448607108549472, "grad_norm": 1.203125, "learning_rate": 0.00049368319693528, "loss": 5.3047, "mean_token_accuracy": 0.18851037174463273, "num_tokens": 20168952.0, "step": 8795 }, { "entropy": 5.313450288772583, "epoch": 0.8453410182516811, "grad_norm": 1.3125, "learning_rate": 0.0004936751278343397, "loss": 5.1884, "mean_token_accuracy": 0.1982526332139969, "num_tokens": 20181829.0, "step": 8800 }, { "entropy": 5.392148017883301, "epoch": 0.845821325648415, "grad_norm": 1.28125, "learning_rate": 0.0004936670536564123, "loss": 5.3346, "mean_token_accuracy": 0.18645845502614974, "num_tokens": 20193362.0, "step": 8805 }, { "entropy": 5.310501289367676, "epoch": 0.8463016330451489, "grad_norm": 1.203125, "learning_rate": 0.0004936589744016853, "loss": 5.2495, "mean_token_accuracy": 0.1979646310210228, "num_tokens": 20205159.0, "step": 8810 }, { "entropy": 5.309202194213867, "epoch": 0.8467819404418828, "grad_norm": 1.265625, "learning_rate": 0.000493650890070346, "loss": 5.134, "mean_token_accuracy": 0.21007836610078812, "num_tokens": 20217075.0, "step": 8815 }, { "entropy": 5.284643697738647, "epoch": 0.8472622478386167, "grad_norm": 1.109375, "learning_rate": 0.0004936428006625824, "loss": 5.145, "mean_token_accuracy": 0.20609464943408967, "num_tokens": 20227901.0, "step": 8820 }, { "entropy": 5.279466247558593, "epoch": 0.8477425552353506, "grad_norm": 1.1796875, "learning_rate": 0.0004936347061785823, "loss": 5.1655, "mean_token_accuracy": 0.20799711346626282, "num_tokens": 20239945.0, "step": 8825 }, { "entropy": 5.381019020080567, "epoch": 0.8482228626320846, "grad_norm": 1.1953125, "learning_rate": 0.0004936266066185334, "loss": 5.2448, "mean_token_accuracy": 0.19857099950313567, "num_tokens": 20251503.0, "step": 8830 }, { "entropy": 5.273257160186768, "epoch": 0.8487031700288185, "grad_norm": 1.2890625, "learning_rate": 0.0004936185019826239, "loss": 5.1679, "mean_token_accuracy": 0.20055765956640242, "num_tokens": 20263044.0, "step": 8835 }, { "entropy": 5.303754806518555, "epoch": 0.8491834774255523, "grad_norm": 1.171875, "learning_rate": 0.0004936103922710419, "loss": 5.1429, "mean_token_accuracy": 0.20382609218358994, "num_tokens": 20273563.0, "step": 8840 }, { "entropy": 5.319860410690308, "epoch": 0.8496637848222862, "grad_norm": 1.171875, "learning_rate": 0.0004936022774839759, "loss": 5.1983, "mean_token_accuracy": 0.20010559260845184, "num_tokens": 20285482.0, "step": 8845 }, { "entropy": 5.3607221126556395, "epoch": 0.8501440922190202, "grad_norm": 1.1796875, "learning_rate": 0.0004935941576216141, "loss": 5.4022, "mean_token_accuracy": 0.19553606808185578, "num_tokens": 20296666.0, "step": 8850 }, { "entropy": 5.242348289489746, "epoch": 0.8506243996157541, "grad_norm": 1.15625, "learning_rate": 0.000493586032684145, "loss": 5.0657, "mean_token_accuracy": 0.20746321827173234, "num_tokens": 20308978.0, "step": 8855 }, { "entropy": 5.249849462509156, "epoch": 0.851104707012488, "grad_norm": 1.1171875, "learning_rate": 0.0004935779026717573, "loss": 5.1432, "mean_token_accuracy": 0.20203766077756882, "num_tokens": 20321488.0, "step": 8860 }, { "entropy": 5.2123401165008545, "epoch": 0.8515850144092219, "grad_norm": 1.2578125, "learning_rate": 0.0004935697675846396, "loss": 4.9968, "mean_token_accuracy": 0.21457493007183076, "num_tokens": 20332938.0, "step": 8865 }, { "entropy": 5.208367538452149, "epoch": 0.8520653218059558, "grad_norm": 1.2734375, "learning_rate": 0.0004935616274229811, "loss": 5.0981, "mean_token_accuracy": 0.21241182535886766, "num_tokens": 20342986.0, "step": 8870 }, { "entropy": 5.260292434692383, "epoch": 0.8525456292026897, "grad_norm": 1.09375, "learning_rate": 0.0004935534821869705, "loss": 5.1548, "mean_token_accuracy": 0.19940277189016342, "num_tokens": 20355791.0, "step": 8875 }, { "entropy": 5.18192982673645, "epoch": 0.8530259365994236, "grad_norm": 1.125, "learning_rate": 0.0004935453318767971, "loss": 5.0285, "mean_token_accuracy": 0.21193305552005767, "num_tokens": 20367080.0, "step": 8880 }, { "entropy": 5.33548674583435, "epoch": 0.8535062439961575, "grad_norm": 1.2109375, "learning_rate": 0.00049353717649265, "loss": 5.1438, "mean_token_accuracy": 0.2096237510442734, "num_tokens": 20376697.0, "step": 8885 }, { "entropy": 5.283786773681641, "epoch": 0.8539865513928915, "grad_norm": 1.4453125, "learning_rate": 0.0004935290160347185, "loss": 5.1626, "mean_token_accuracy": 0.20003535747528076, "num_tokens": 20387430.0, "step": 8890 }, { "entropy": 5.23840708732605, "epoch": 0.8544668587896254, "grad_norm": 1.3828125, "learning_rate": 0.0004935208505031922, "loss": 5.1485, "mean_token_accuracy": 0.20197722762823106, "num_tokens": 20398488.0, "step": 8895 }, { "entropy": 5.174388408660889, "epoch": 0.8549471661863592, "grad_norm": 2.046875, "learning_rate": 0.0004935126798982606, "loss": 5.1712, "mean_token_accuracy": 0.19997829645872117, "num_tokens": 20410316.0, "step": 8900 }, { "entropy": 5.2989085674285885, "epoch": 0.8554274735830932, "grad_norm": 1.3046875, "learning_rate": 0.0004935045042201135, "loss": 5.0965, "mean_token_accuracy": 0.2099878177046776, "num_tokens": 20421645.0, "step": 8905 }, { "entropy": 5.297290563583374, "epoch": 0.8559077809798271, "grad_norm": 1.3125, "learning_rate": 0.0004934963234689407, "loss": 5.2136, "mean_token_accuracy": 0.19259357154369355, "num_tokens": 20433397.0, "step": 8910 }, { "entropy": 5.210119295120239, "epoch": 0.856388088376561, "grad_norm": 1.1796875, "learning_rate": 0.000493488137644932, "loss": 5.1128, "mean_token_accuracy": 0.2082274630665779, "num_tokens": 20446218.0, "step": 8915 }, { "entropy": 5.327347612380981, "epoch": 0.8568683957732949, "grad_norm": 1.3203125, "learning_rate": 0.0004934799467482774, "loss": 5.2096, "mean_token_accuracy": 0.2002415493130684, "num_tokens": 20457265.0, "step": 8920 }, { "entropy": 5.317370796203614, "epoch": 0.8573487031700289, "grad_norm": 1.265625, "learning_rate": 0.0004934717507791673, "loss": 5.2003, "mean_token_accuracy": 0.19859042763710022, "num_tokens": 20468748.0, "step": 8925 }, { "entropy": 5.320759439468384, "epoch": 0.8578290105667628, "grad_norm": 1.1484375, "learning_rate": 0.0004934635497377919, "loss": 5.2082, "mean_token_accuracy": 0.19929444640874863, "num_tokens": 20481416.0, "step": 8930 }, { "entropy": 5.236276292800904, "epoch": 0.8583093179634966, "grad_norm": 1.375, "learning_rate": 0.0004934553436243415, "loss": 5.1091, "mean_token_accuracy": 0.20469743758440018, "num_tokens": 20493063.0, "step": 8935 }, { "entropy": 5.221277904510498, "epoch": 0.8587896253602305, "grad_norm": 1.3828125, "learning_rate": 0.0004934471324390067, "loss": 5.1152, "mean_token_accuracy": 0.2038355737924576, "num_tokens": 20504000.0, "step": 8940 }, { "entropy": 5.267146587371826, "epoch": 0.8592699327569645, "grad_norm": 1.1171875, "learning_rate": 0.0004934389161819783, "loss": 5.2179, "mean_token_accuracy": 0.20483950674533843, "num_tokens": 20516419.0, "step": 8945 }, { "entropy": 5.273205709457398, "epoch": 0.8597502401536984, "grad_norm": 1.1875, "learning_rate": 0.0004934306948534467, "loss": 5.1552, "mean_token_accuracy": 0.19798202067613602, "num_tokens": 20527385.0, "step": 8950 }, { "entropy": 5.291792201995849, "epoch": 0.8602305475504323, "grad_norm": 1.203125, "learning_rate": 0.0004934224684536031, "loss": 5.1449, "mean_token_accuracy": 0.2052535355091095, "num_tokens": 20538051.0, "step": 8955 }, { "entropy": 5.321442127227783, "epoch": 0.8607108549471661, "grad_norm": 1.1953125, "learning_rate": 0.0004934142369826382, "loss": 5.1746, "mean_token_accuracy": 0.19756327718496322, "num_tokens": 20550321.0, "step": 8960 }, { "entropy": 5.309886932373047, "epoch": 0.8611911623439001, "grad_norm": 1.3125, "learning_rate": 0.0004934060004407434, "loss": 5.1616, "mean_token_accuracy": 0.2022399291396141, "num_tokens": 20561229.0, "step": 8965 }, { "entropy": 5.2354882717132565, "epoch": 0.861671469740634, "grad_norm": 1.21875, "learning_rate": 0.0004933977588281099, "loss": 5.1065, "mean_token_accuracy": 0.20714430809020995, "num_tokens": 20572040.0, "step": 8970 }, { "entropy": 5.225004816055298, "epoch": 0.8621517771373679, "grad_norm": 1.1640625, "learning_rate": 0.0004933895121449288, "loss": 5.1445, "mean_token_accuracy": 0.20627815425395965, "num_tokens": 20583110.0, "step": 8975 }, { "entropy": 5.25708327293396, "epoch": 0.8626320845341018, "grad_norm": 1.125, "learning_rate": 0.0004933812603913917, "loss": 5.1451, "mean_token_accuracy": 0.20151159167289734, "num_tokens": 20593647.0, "step": 8980 }, { "entropy": 5.185816431045533, "epoch": 0.8631123919308358, "grad_norm": 1.203125, "learning_rate": 0.0004933730035676903, "loss": 5.0209, "mean_token_accuracy": 0.21777433753013611, "num_tokens": 20604428.0, "step": 8985 }, { "entropy": 5.165512609481811, "epoch": 0.8635926993275697, "grad_norm": 1.234375, "learning_rate": 0.0004933647416740161, "loss": 5.0746, "mean_token_accuracy": 0.21601256728172302, "num_tokens": 20615811.0, "step": 8990 }, { "entropy": 5.30065655708313, "epoch": 0.8640730067243035, "grad_norm": 1.2265625, "learning_rate": 0.000493356474710561, "loss": 5.1769, "mean_token_accuracy": 0.20163519084453582, "num_tokens": 20627679.0, "step": 8995 }, { "entropy": 5.325274658203125, "epoch": 0.8645533141210374, "grad_norm": 1.2265625, "learning_rate": 0.000493348202677517, "loss": 5.2067, "mean_token_accuracy": 0.20573359727859497, "num_tokens": 20638879.0, "step": 9000 }, { "epoch": 0.8645533141210374, "eval_entropy": 5.134741041033748, "eval_loss": 5.199076175689697, "eval_mean_token_accuracy": 0.20850473279537574, "eval_num_tokens": 20638879.0, "eval_runtime": 26.7295, "eval_samples_per_second": 1227.669, "eval_steps_per_second": 153.463, "step": 9000 }, { "entropy": 5.26629228591919, "epoch": 0.8650336215177714, "grad_norm": 1.3671875, "learning_rate": 0.0004933399255750761, "loss": 5.1956, "mean_token_accuracy": 0.20548682659864426, "num_tokens": 20649729.0, "step": 9005 }, { "entropy": 5.366102600097657, "epoch": 0.8655139289145053, "grad_norm": 1.21875, "learning_rate": 0.0004933316434034304, "loss": 5.2311, "mean_token_accuracy": 0.20523984879255294, "num_tokens": 20660473.0, "step": 9010 }, { "entropy": 5.326435089111328, "epoch": 0.8659942363112392, "grad_norm": 1.265625, "learning_rate": 0.0004933233561627723, "loss": 5.1972, "mean_token_accuracy": 0.2020814150571823, "num_tokens": 20671776.0, "step": 9015 }, { "entropy": 5.251844644546509, "epoch": 0.866474543707973, "grad_norm": 1.2421875, "learning_rate": 0.0004933150638532942, "loss": 5.1605, "mean_token_accuracy": 0.2062242418527603, "num_tokens": 20684147.0, "step": 9020 }, { "entropy": 5.3002519607543945, "epoch": 0.866954851104707, "grad_norm": 1.2109375, "learning_rate": 0.0004933067664751885, "loss": 5.1469, "mean_token_accuracy": 0.20623468309640886, "num_tokens": 20695248.0, "step": 9025 }, { "entropy": 5.244437265396118, "epoch": 0.8674351585014409, "grad_norm": 1.1484375, "learning_rate": 0.000493298464028648, "loss": 5.168, "mean_token_accuracy": 0.2045750394463539, "num_tokens": 20707772.0, "step": 9030 }, { "entropy": 5.263054895401001, "epoch": 0.8679154658981748, "grad_norm": 1.265625, "learning_rate": 0.0004932901565138653, "loss": 5.1264, "mean_token_accuracy": 0.1987837016582489, "num_tokens": 20718813.0, "step": 9035 }, { "entropy": 5.2314427375793455, "epoch": 0.8683957732949087, "grad_norm": 1.171875, "learning_rate": 0.0004932818439310334, "loss": 5.1175, "mean_token_accuracy": 0.2132244125008583, "num_tokens": 20730939.0, "step": 9040 }, { "entropy": 5.376590538024902, "epoch": 0.8688760806916427, "grad_norm": 1.15625, "learning_rate": 0.0004932735262803452, "loss": 5.2384, "mean_token_accuracy": 0.1961486503481865, "num_tokens": 20742990.0, "step": 9045 }, { "entropy": 5.2218879699707035, "epoch": 0.8693563880883766, "grad_norm": 1.265625, "learning_rate": 0.0004932652035619939, "loss": 5.0787, "mean_token_accuracy": 0.2043047398328781, "num_tokens": 20754076.0, "step": 9050 }, { "entropy": 5.269600582122803, "epoch": 0.8698366954851104, "grad_norm": 1.15625, "learning_rate": 0.0004932568757761727, "loss": 5.1352, "mean_token_accuracy": 0.20455852448940276, "num_tokens": 20765538.0, "step": 9055 }, { "entropy": 5.2922038555145265, "epoch": 0.8703170028818443, "grad_norm": 1.25, "learning_rate": 0.0004932485429230748, "loss": 5.1899, "mean_token_accuracy": 0.19730205535888673, "num_tokens": 20776359.0, "step": 9060 }, { "entropy": 5.234628915786743, "epoch": 0.8707973102785783, "grad_norm": 1.078125, "learning_rate": 0.000493240205002894, "loss": 5.1682, "mean_token_accuracy": 0.2084574043750763, "num_tokens": 20787581.0, "step": 9065 }, { "entropy": 5.34184308052063, "epoch": 0.8712776176753122, "grad_norm": 1.203125, "learning_rate": 0.0004932318620158235, "loss": 5.2041, "mean_token_accuracy": 0.19864192605018616, "num_tokens": 20799904.0, "step": 9070 }, { "entropy": 5.319941759109497, "epoch": 0.8717579250720461, "grad_norm": 1.1328125, "learning_rate": 0.0004932235139620574, "loss": 5.1384, "mean_token_accuracy": 0.20746065229177474, "num_tokens": 20810238.0, "step": 9075 }, { "entropy": 5.2344482898712155, "epoch": 0.8722382324687801, "grad_norm": 1.28125, "learning_rate": 0.0004932151608417892, "loss": 5.0957, "mean_token_accuracy": 0.20455455929040908, "num_tokens": 20821349.0, "step": 9080 }, { "entropy": 5.202734899520874, "epoch": 0.872718539865514, "grad_norm": 1.21875, "learning_rate": 0.0004932068026552127, "loss": 5.1513, "mean_token_accuracy": 0.20545032173395156, "num_tokens": 20834788.0, "step": 9085 }, { "entropy": 5.319971227645874, "epoch": 0.8731988472622478, "grad_norm": 1.1953125, "learning_rate": 0.0004931984394025224, "loss": 5.2178, "mean_token_accuracy": 0.20217667371034623, "num_tokens": 20845571.0, "step": 9090 }, { "entropy": 5.311048793792724, "epoch": 0.8736791546589817, "grad_norm": 1.3046875, "learning_rate": 0.0004931900710839123, "loss": 5.1952, "mean_token_accuracy": 0.19956784099340438, "num_tokens": 20857209.0, "step": 9095 }, { "entropy": 5.329868745803833, "epoch": 0.8741594620557157, "grad_norm": 1.1171875, "learning_rate": 0.0004931816976995766, "loss": 5.2614, "mean_token_accuracy": 0.19563933461904526, "num_tokens": 20870624.0, "step": 9100 }, { "entropy": 5.286147880554199, "epoch": 0.8746397694524496, "grad_norm": 1.28125, "learning_rate": 0.0004931733192497097, "loss": 5.1638, "mean_token_accuracy": 0.20552606284618377, "num_tokens": 20881769.0, "step": 9105 }, { "entropy": 5.256227636337281, "epoch": 0.8751200768491835, "grad_norm": 1.2265625, "learning_rate": 0.0004931649357345062, "loss": 5.1336, "mean_token_accuracy": 0.19931492060422898, "num_tokens": 20892817.0, "step": 9110 }, { "entropy": 5.266511297225952, "epoch": 0.8756003842459174, "grad_norm": 1.4453125, "learning_rate": 0.0004931565471541606, "loss": 5.0994, "mean_token_accuracy": 0.20882656574249267, "num_tokens": 20903042.0, "step": 9115 }, { "entropy": 5.197468280792236, "epoch": 0.8760806916426513, "grad_norm": 1.34375, "learning_rate": 0.0004931481535088679, "loss": 5.0548, "mean_token_accuracy": 0.2176084190607071, "num_tokens": 20914684.0, "step": 9120 }, { "entropy": 5.21255669593811, "epoch": 0.8765609990393852, "grad_norm": 1.1875, "learning_rate": 0.0004931397547988229, "loss": 5.1169, "mean_token_accuracy": 0.21571636497974395, "num_tokens": 20926585.0, "step": 9125 }, { "entropy": 5.315608882904053, "epoch": 0.8770413064361191, "grad_norm": 1.0859375, "learning_rate": 0.0004931313510242204, "loss": 5.1677, "mean_token_accuracy": 0.2050992101430893, "num_tokens": 20939729.0, "step": 9130 }, { "entropy": 5.231499481201172, "epoch": 0.877521613832853, "grad_norm": 1.15625, "learning_rate": 0.0004931229421852557, "loss": 5.103, "mean_token_accuracy": 0.2057361498475075, "num_tokens": 20951697.0, "step": 9135 }, { "entropy": 5.321991300582885, "epoch": 0.878001921229587, "grad_norm": 1.2265625, "learning_rate": 0.000493114528282124, "loss": 5.2363, "mean_token_accuracy": 0.20253994166851044, "num_tokens": 20962729.0, "step": 9140 }, { "entropy": 5.2189311504364015, "epoch": 0.8784822286263209, "grad_norm": 1.2109375, "learning_rate": 0.0004931061093150206, "loss": 5.0919, "mean_token_accuracy": 0.20677362531423568, "num_tokens": 20973331.0, "step": 9145 }, { "entropy": 5.188636112213135, "epoch": 0.8789625360230547, "grad_norm": 1.125, "learning_rate": 0.0004930976852841409, "loss": 5.0942, "mean_token_accuracy": 0.20331761091947556, "num_tokens": 20985609.0, "step": 9150 }, { "entropy": 5.212237691879272, "epoch": 0.8794428434197886, "grad_norm": 1.1640625, "learning_rate": 0.0004930892561896806, "loss": 5.1191, "mean_token_accuracy": 0.19904601573944092, "num_tokens": 20997231.0, "step": 9155 }, { "entropy": 5.302338361740112, "epoch": 0.8799231508165226, "grad_norm": 1.234375, "learning_rate": 0.0004930808220318354, "loss": 5.1304, "mean_token_accuracy": 0.20466675609350204, "num_tokens": 21008511.0, "step": 9160 }, { "entropy": 5.251391744613647, "epoch": 0.8804034582132565, "grad_norm": 1.09375, "learning_rate": 0.0004930723828108012, "loss": 5.07, "mean_token_accuracy": 0.20593566447496414, "num_tokens": 21019108.0, "step": 9165 }, { "entropy": 5.1970141410827635, "epoch": 0.8808837656099904, "grad_norm": 1.765625, "learning_rate": 0.0004930639385267736, "loss": 5.1312, "mean_token_accuracy": 0.20703590363264085, "num_tokens": 21030621.0, "step": 9170 }, { "entropy": 5.254196977615356, "epoch": 0.8813640730067243, "grad_norm": 1.25, "learning_rate": 0.000493055489179949, "loss": 5.106, "mean_token_accuracy": 0.20593850463628768, "num_tokens": 21041778.0, "step": 9175 }, { "entropy": 5.293916034698486, "epoch": 0.8818443804034583, "grad_norm": 1.2421875, "learning_rate": 0.0004930470347705234, "loss": 5.1213, "mean_token_accuracy": 0.20545565485954284, "num_tokens": 21054257.0, "step": 9180 }, { "entropy": 5.25820026397705, "epoch": 0.8823246878001921, "grad_norm": 1.265625, "learning_rate": 0.000493038575298693, "loss": 5.1774, "mean_token_accuracy": 0.1998462751507759, "num_tokens": 21066378.0, "step": 9185 }, { "entropy": 5.217289543151855, "epoch": 0.882804995196926, "grad_norm": 1.25, "learning_rate": 0.0004930301107646545, "loss": 5.1022, "mean_token_accuracy": 0.20249929428100585, "num_tokens": 21078913.0, "step": 9190 }, { "entropy": 5.315297651290893, "epoch": 0.8832853025936599, "grad_norm": 1.1640625, "learning_rate": 0.0004930216411686042, "loss": 5.1549, "mean_token_accuracy": 0.1985946238040924, "num_tokens": 21090500.0, "step": 9195 }, { "entropy": 5.2111443996429445, "epoch": 0.8837656099903939, "grad_norm": 1.109375, "learning_rate": 0.0004930131665107387, "loss": 5.1334, "mean_token_accuracy": 0.2010358154773712, "num_tokens": 21102793.0, "step": 9200 }, { "entropy": 5.382960557937622, "epoch": 0.8842459173871278, "grad_norm": 1.1875, "learning_rate": 0.000493004686791255, "loss": 5.2304, "mean_token_accuracy": 0.19272204041481017, "num_tokens": 21114504.0, "step": 9205 }, { "entropy": 5.27924222946167, "epoch": 0.8847262247838616, "grad_norm": 1.296875, "learning_rate": 0.0004929962020103496, "loss": 5.1007, "mean_token_accuracy": 0.20397736132144928, "num_tokens": 21126733.0, "step": 9210 }, { "entropy": 5.19653902053833, "epoch": 0.8852065321805955, "grad_norm": 1.1953125, "learning_rate": 0.0004929877121682198, "loss": 5.0931, "mean_token_accuracy": 0.20474224388599396, "num_tokens": 21138045.0, "step": 9215 }, { "entropy": 5.260480785369873, "epoch": 0.8856868395773295, "grad_norm": 1.125, "learning_rate": 0.0004929792172650627, "loss": 5.1796, "mean_token_accuracy": 0.19880712181329727, "num_tokens": 21151562.0, "step": 9220 }, { "entropy": 5.315613460540772, "epoch": 0.8861671469740634, "grad_norm": 1.1953125, "learning_rate": 0.0004929707173010753, "loss": 5.1299, "mean_token_accuracy": 0.2056412249803543, "num_tokens": 21162943.0, "step": 9225 }, { "entropy": 5.241054391860962, "epoch": 0.8866474543707973, "grad_norm": 1.3046875, "learning_rate": 0.0004929622122764552, "loss": 5.1699, "mean_token_accuracy": 0.2012902170419693, "num_tokens": 21174392.0, "step": 9230 }, { "entropy": 5.1802393913269045, "epoch": 0.8871277617675313, "grad_norm": 1.203125, "learning_rate": 0.0004929537021913997, "loss": 5.008, "mean_token_accuracy": 0.21468252092599868, "num_tokens": 21185372.0, "step": 9235 }, { "entropy": 5.236097574234009, "epoch": 0.8876080691642652, "grad_norm": 1.21875, "learning_rate": 0.0004929451870461064, "loss": 5.1562, "mean_token_accuracy": 0.20373494178056717, "num_tokens": 21197044.0, "step": 9240 }, { "entropy": 5.191161966323852, "epoch": 0.888088376560999, "grad_norm": 1.125, "learning_rate": 0.0004929366668407731, "loss": 5.1047, "mean_token_accuracy": 0.20978552401065825, "num_tokens": 21207729.0, "step": 9245 }, { "entropy": 5.312554979324341, "epoch": 0.8885686839577329, "grad_norm": 1.2734375, "learning_rate": 0.0004929281415755974, "loss": 5.1436, "mean_token_accuracy": 0.20457518696784974, "num_tokens": 21218909.0, "step": 9250 }, { "entropy": 5.344946384429932, "epoch": 0.8890489913544669, "grad_norm": 1.1484375, "learning_rate": 0.0004929196112507775, "loss": 5.2498, "mean_token_accuracy": 0.19993363320827484, "num_tokens": 21230543.0, "step": 9255 }, { "entropy": 5.270783472061157, "epoch": 0.8895292987512008, "grad_norm": 1.2578125, "learning_rate": 0.0004929110758665112, "loss": 5.1876, "mean_token_accuracy": 0.1981159120798111, "num_tokens": 21242064.0, "step": 9260 }, { "entropy": 5.326478481292725, "epoch": 0.8900096061479347, "grad_norm": 1.296875, "learning_rate": 0.0004929025354229969, "loss": 5.2097, "mean_token_accuracy": 0.2005533829331398, "num_tokens": 21254321.0, "step": 9265 }, { "entropy": 5.180127668380737, "epoch": 0.8904899135446686, "grad_norm": 1.1796875, "learning_rate": 0.0004928939899204326, "loss": 5.0312, "mean_token_accuracy": 0.20711840987205504, "num_tokens": 21264741.0, "step": 9270 }, { "entropy": 5.250730323791504, "epoch": 0.8909702209414025, "grad_norm": 1.1484375, "learning_rate": 0.000492885439359017, "loss": 5.1329, "mean_token_accuracy": 0.20080725252628326, "num_tokens": 21276834.0, "step": 9275 }, { "entropy": 5.236090469360351, "epoch": 0.8914505283381364, "grad_norm": 1.1875, "learning_rate": 0.0004928768837389485, "loss": 5.0918, "mean_token_accuracy": 0.20890207290649415, "num_tokens": 21287108.0, "step": 9280 }, { "entropy": 5.248825597763061, "epoch": 0.8919308357348703, "grad_norm": 1.140625, "learning_rate": 0.0004928683230604257, "loss": 5.1298, "mean_token_accuracy": 0.20136982649564744, "num_tokens": 21299942.0, "step": 9285 }, { "entropy": 5.365978527069092, "epoch": 0.8924111431316042, "grad_norm": 1.1328125, "learning_rate": 0.0004928597573236474, "loss": 5.2691, "mean_token_accuracy": 0.2037052556872368, "num_tokens": 21311243.0, "step": 9290 }, { "entropy": 5.274964046478272, "epoch": 0.8928914505283382, "grad_norm": 1.1328125, "learning_rate": 0.0004928511865288123, "loss": 5.1035, "mean_token_accuracy": 0.2068115308880806, "num_tokens": 21322291.0, "step": 9295 }, { "entropy": 5.258368492126465, "epoch": 0.8933717579250721, "grad_norm": 1.0859375, "learning_rate": 0.0004928426106761197, "loss": 5.1982, "mean_token_accuracy": 0.20522145330905914, "num_tokens": 21333257.0, "step": 9300 }, { "entropy": 5.194037771224975, "epoch": 0.8938520653218059, "grad_norm": 1.09375, "learning_rate": 0.0004928340297657685, "loss": 5.1119, "mean_token_accuracy": 0.20364685207605362, "num_tokens": 21345848.0, "step": 9305 }, { "entropy": 5.306222867965698, "epoch": 0.8943323727185398, "grad_norm": 1.2890625, "learning_rate": 0.0004928254437979578, "loss": 5.1371, "mean_token_accuracy": 0.2047370731830597, "num_tokens": 21357693.0, "step": 9310 }, { "entropy": 5.277711868286133, "epoch": 0.8948126801152738, "grad_norm": 1.25, "learning_rate": 0.0004928168527728873, "loss": 5.229, "mean_token_accuracy": 0.20137819200754165, "num_tokens": 21369653.0, "step": 9315 }, { "entropy": 5.314446830749512, "epoch": 0.8952929875120077, "grad_norm": 1.15625, "learning_rate": 0.0004928082566907562, "loss": 5.1813, "mean_token_accuracy": 0.20231199115514756, "num_tokens": 21383924.0, "step": 9320 }, { "entropy": 5.318646097183228, "epoch": 0.8957732949087416, "grad_norm": 1.1328125, "learning_rate": 0.0004927996555517642, "loss": 5.156, "mean_token_accuracy": 0.1997460052371025, "num_tokens": 21395963.0, "step": 9325 }, { "entropy": 5.2909129619598385, "epoch": 0.8962536023054755, "grad_norm": 1.109375, "learning_rate": 0.0004927910493561109, "loss": 5.1562, "mean_token_accuracy": 0.20208995938301086, "num_tokens": 21408200.0, "step": 9330 }, { "entropy": 5.299256086349487, "epoch": 0.8967339097022095, "grad_norm": 1.1953125, "learning_rate": 0.000492782438103996, "loss": 5.26, "mean_token_accuracy": 0.1976392984390259, "num_tokens": 21419963.0, "step": 9335 }, { "entropy": 5.280540561676025, "epoch": 0.8972142170989433, "grad_norm": 1.3203125, "learning_rate": 0.0004927738217956197, "loss": 5.2154, "mean_token_accuracy": 0.20124684274196625, "num_tokens": 21431824.0, "step": 9340 }, { "entropy": 5.220011901855469, "epoch": 0.8976945244956772, "grad_norm": 1.34375, "learning_rate": 0.0004927652004311819, "loss": 4.9671, "mean_token_accuracy": 0.21261375546455383, "num_tokens": 21442354.0, "step": 9345 }, { "entropy": 5.272494840621948, "epoch": 0.8981748318924111, "grad_norm": 1.234375, "learning_rate": 0.0004927565740108828, "loss": 5.1539, "mean_token_accuracy": 0.19939538985490798, "num_tokens": 21453734.0, "step": 9350 }, { "entropy": 5.264281797409057, "epoch": 0.8986551392891451, "grad_norm": 1.171875, "learning_rate": 0.0004927479425349226, "loss": 5.1664, "mean_token_accuracy": 0.20830067843198777, "num_tokens": 21465471.0, "step": 9355 }, { "entropy": 5.347072267532349, "epoch": 0.899135446685879, "grad_norm": 1.125, "learning_rate": 0.0004927393060035018, "loss": 5.3012, "mean_token_accuracy": 0.19275195002555848, "num_tokens": 21477775.0, "step": 9360 }, { "entropy": 5.256478118896484, "epoch": 0.8996157540826129, "grad_norm": 1.2109375, "learning_rate": 0.0004927306644168207, "loss": 5.0715, "mean_token_accuracy": 0.2134536847472191, "num_tokens": 21489319.0, "step": 9365 }, { "entropy": 5.3430397510528564, "epoch": 0.9000960614793467, "grad_norm": 1.15625, "learning_rate": 0.0004927220177750803, "loss": 5.2993, "mean_token_accuracy": 0.20141739547252654, "num_tokens": 21499742.0, "step": 9370 }, { "entropy": 5.2615800380706785, "epoch": 0.9005763688760807, "grad_norm": 1.21875, "learning_rate": 0.0004927133660784811, "loss": 5.0778, "mean_token_accuracy": 0.20828621387481688, "num_tokens": 21511063.0, "step": 9375 }, { "entropy": 5.2883483409881595, "epoch": 0.9010566762728146, "grad_norm": 1.15625, "learning_rate": 0.0004927047093272241, "loss": 5.0993, "mean_token_accuracy": 0.2080937907099724, "num_tokens": 21522500.0, "step": 9380 }, { "entropy": 5.292206716537476, "epoch": 0.9015369836695485, "grad_norm": 1.109375, "learning_rate": 0.00049269604752151, "loss": 5.183, "mean_token_accuracy": 0.19954841285943986, "num_tokens": 21533578.0, "step": 9385 }, { "entropy": 5.2589469909667965, "epoch": 0.9020172910662824, "grad_norm": 1.671875, "learning_rate": 0.0004926873806615403, "loss": 5.1761, "mean_token_accuracy": 0.2022814229130745, "num_tokens": 21544296.0, "step": 9390 }, { "entropy": 5.285726165771484, "epoch": 0.9024975984630164, "grad_norm": 1.21875, "learning_rate": 0.0004926787087475158, "loss": 5.2485, "mean_token_accuracy": 0.19234858453273773, "num_tokens": 21555386.0, "step": 9395 }, { "entropy": 5.2691357135772705, "epoch": 0.9029779058597502, "grad_norm": 1.125, "learning_rate": 0.0004926700317796382, "loss": 5.0119, "mean_token_accuracy": 0.2185451105237007, "num_tokens": 21566527.0, "step": 9400 }, { "entropy": 5.316603708267212, "epoch": 0.9034582132564841, "grad_norm": 1.1484375, "learning_rate": 0.0004926613497581088, "loss": 5.1657, "mean_token_accuracy": 0.19770514070987702, "num_tokens": 21576870.0, "step": 9405 }, { "entropy": 5.246594953536987, "epoch": 0.9039385206532181, "grad_norm": 1.375, "learning_rate": 0.0004926526626831292, "loss": 5.1326, "mean_token_accuracy": 0.20468196123838425, "num_tokens": 21588113.0, "step": 9410 }, { "entropy": 5.279461526870728, "epoch": 0.904418828049952, "grad_norm": 1.3671875, "learning_rate": 0.0004926439705549011, "loss": 5.1535, "mean_token_accuracy": 0.2016696736216545, "num_tokens": 21599307.0, "step": 9415 }, { "entropy": 5.25780029296875, "epoch": 0.9048991354466859, "grad_norm": 1.34375, "learning_rate": 0.0004926352733736262, "loss": 5.1166, "mean_token_accuracy": 0.2065201461315155, "num_tokens": 21609961.0, "step": 9420 }, { "entropy": 5.2240455627441404, "epoch": 0.9053794428434198, "grad_norm": 1.1953125, "learning_rate": 0.0004926265711395065, "loss": 5.1391, "mean_token_accuracy": 0.2021078497171402, "num_tokens": 21622222.0, "step": 9425 }, { "entropy": 5.2489923477172855, "epoch": 0.9058597502401537, "grad_norm": 1.265625, "learning_rate": 0.000492617863852744, "loss": 5.1228, "mean_token_accuracy": 0.21097581535577775, "num_tokens": 21632843.0, "step": 9430 }, { "entropy": 5.273687887191772, "epoch": 0.9063400576368876, "grad_norm": 1.1484375, "learning_rate": 0.0004926091515135409, "loss": 5.1694, "mean_token_accuracy": 0.20245194882154466, "num_tokens": 21645387.0, "step": 9435 }, { "entropy": 5.272740983963013, "epoch": 0.9068203650336215, "grad_norm": 1.171875, "learning_rate": 0.0004926004341220995, "loss": 5.1151, "mean_token_accuracy": 0.20472093671560287, "num_tokens": 21656787.0, "step": 9440 }, { "entropy": 5.286762046813965, "epoch": 0.9073006724303554, "grad_norm": 1.2890625, "learning_rate": 0.0004925917116786222, "loss": 5.1467, "mean_token_accuracy": 0.202509106695652, "num_tokens": 21667800.0, "step": 9445 }, { "entropy": 5.323235177993775, "epoch": 0.9077809798270894, "grad_norm": 1.2265625, "learning_rate": 0.0004925829841833114, "loss": 5.2022, "mean_token_accuracy": 0.1957914039492607, "num_tokens": 21679297.0, "step": 9450 }, { "entropy": 5.2604146003723145, "epoch": 0.9082612872238233, "grad_norm": 1.1484375, "learning_rate": 0.0004925742516363699, "loss": 5.2104, "mean_token_accuracy": 0.19677306711673737, "num_tokens": 21692956.0, "step": 9455 }, { "entropy": 5.239795923233032, "epoch": 0.9087415946205571, "grad_norm": 1.265625, "learning_rate": 0.0004925655140380002, "loss": 4.9955, "mean_token_accuracy": 0.20952331125736237, "num_tokens": 21704852.0, "step": 9460 }, { "entropy": 5.240779304504395, "epoch": 0.909221902017291, "grad_norm": 1.171875, "learning_rate": 0.0004925567713884054, "loss": 5.1345, "mean_token_accuracy": 0.20403669029474258, "num_tokens": 21715886.0, "step": 9465 }, { "entropy": 5.256079244613647, "epoch": 0.909702209414025, "grad_norm": 1.2578125, "learning_rate": 0.0004925480236877884, "loss": 5.126, "mean_token_accuracy": 0.20910231918096542, "num_tokens": 21727157.0, "step": 9470 }, { "entropy": 5.261584663391114, "epoch": 0.9101825168107589, "grad_norm": 1.2890625, "learning_rate": 0.0004925392709363522, "loss": 5.1387, "mean_token_accuracy": 0.20324090272188186, "num_tokens": 21738232.0, "step": 9475 }, { "entropy": 5.268222141265869, "epoch": 0.9106628242074928, "grad_norm": 1.1484375, "learning_rate": 0.0004925305131343001, "loss": 5.1774, "mean_token_accuracy": 0.20176736861467362, "num_tokens": 21749029.0, "step": 9480 }, { "entropy": 5.357953786849976, "epoch": 0.9111431316042267, "grad_norm": 1.234375, "learning_rate": 0.0004925217502818355, "loss": 5.1727, "mean_token_accuracy": 0.1988372653722763, "num_tokens": 21761243.0, "step": 9485 }, { "entropy": 5.250569820404053, "epoch": 0.9116234390009607, "grad_norm": 1.140625, "learning_rate": 0.0004925129823791616, "loss": 5.1391, "mean_token_accuracy": 0.20034718960523606, "num_tokens": 21772623.0, "step": 9490 }, { "entropy": 5.282710075378418, "epoch": 0.9121037463976945, "grad_norm": 1.125, "learning_rate": 0.0004925042094264822, "loss": 5.1644, "mean_token_accuracy": 0.20180542021989822, "num_tokens": 21782611.0, "step": 9495 }, { "entropy": 5.176083374023437, "epoch": 0.9125840537944284, "grad_norm": 1.2578125, "learning_rate": 0.000492495431424001, "loss": 5.0361, "mean_token_accuracy": 0.210744047164917, "num_tokens": 21793946.0, "step": 9500 }, { "entropy": 5.317784595489502, "epoch": 0.9130643611911623, "grad_norm": 1.3359375, "learning_rate": 0.0004924866483719216, "loss": 5.2217, "mean_token_accuracy": 0.18893510699272156, "num_tokens": 21803878.0, "step": 9505 }, { "entropy": 5.291093206405639, "epoch": 0.9135446685878963, "grad_norm": 1.453125, "learning_rate": 0.0004924778602704481, "loss": 5.1693, "mean_token_accuracy": 0.20558474063873292, "num_tokens": 21815187.0, "step": 9510 }, { "entropy": 5.253582382202149, "epoch": 0.9140249759846302, "grad_norm": 1.28125, "learning_rate": 0.0004924690671197845, "loss": 5.1219, "mean_token_accuracy": 0.21276892423629762, "num_tokens": 21825597.0, "step": 9515 }, { "entropy": 5.222238779067993, "epoch": 0.914505283381364, "grad_norm": 1.2265625, "learning_rate": 0.0004924602689201348, "loss": 5.1079, "mean_token_accuracy": 0.2087915927171707, "num_tokens": 21837110.0, "step": 9520 }, { "entropy": 5.478323316574096, "epoch": 0.9149855907780979, "grad_norm": 1.2265625, "learning_rate": 0.0004924514656717034, "loss": 5.3741, "mean_token_accuracy": 0.19211723804473876, "num_tokens": 21847754.0, "step": 9525 }, { "entropy": 5.294663047790527, "epoch": 0.9154658981748319, "grad_norm": 1.1640625, "learning_rate": 0.0004924426573746948, "loss": 5.1594, "mean_token_accuracy": 0.20195448100566865, "num_tokens": 21859162.0, "step": 9530 }, { "entropy": 5.239957857131958, "epoch": 0.9159462055715658, "grad_norm": 1.125, "learning_rate": 0.0004924338440293131, "loss": 5.1104, "mean_token_accuracy": 0.20837367475032806, "num_tokens": 21870826.0, "step": 9535 }, { "entropy": 5.256366109848022, "epoch": 0.9164265129682997, "grad_norm": 1.265625, "learning_rate": 0.0004924250256357635, "loss": 5.1534, "mean_token_accuracy": 0.19792882353067398, "num_tokens": 21882003.0, "step": 9540 }, { "entropy": 5.215576648712158, "epoch": 0.9169068203650336, "grad_norm": 1.09375, "learning_rate": 0.0004924162021942502, "loss": 5.097, "mean_token_accuracy": 0.2071886330842972, "num_tokens": 21894132.0, "step": 9545 }, { "entropy": 5.181234216690063, "epoch": 0.9173871277617676, "grad_norm": 1.1015625, "learning_rate": 0.0004924073737049784, "loss": 5.1089, "mean_token_accuracy": 0.21016984134912492, "num_tokens": 21904951.0, "step": 9550 }, { "entropy": 5.26510066986084, "epoch": 0.9178674351585014, "grad_norm": 1.2109375, "learning_rate": 0.0004923985401681528, "loss": 5.1376, "mean_token_accuracy": 0.20959776937961577, "num_tokens": 21917100.0, "step": 9555 }, { "entropy": 5.28123664855957, "epoch": 0.9183477425552353, "grad_norm": 1.1953125, "learning_rate": 0.0004923897015839788, "loss": 5.1579, "mean_token_accuracy": 0.1991439864039421, "num_tokens": 21927541.0, "step": 9560 }, { "entropy": 5.2830277442932125, "epoch": 0.9188280499519692, "grad_norm": 1.21875, "learning_rate": 0.0004923808579526613, "loss": 5.0914, "mean_token_accuracy": 0.20503710806369782, "num_tokens": 21938723.0, "step": 9565 }, { "entropy": 5.24866738319397, "epoch": 0.9193083573487032, "grad_norm": 1.2109375, "learning_rate": 0.0004923720092744059, "loss": 5.0354, "mean_token_accuracy": 0.21492197811603547, "num_tokens": 21950424.0, "step": 9570 }, { "entropy": 5.25103907585144, "epoch": 0.9197886647454371, "grad_norm": 1.40625, "learning_rate": 0.0004923631555494179, "loss": 5.1937, "mean_token_accuracy": 0.2016189157962799, "num_tokens": 21961030.0, "step": 9575 }, { "entropy": 5.231373453140259, "epoch": 0.920268972142171, "grad_norm": 1.3359375, "learning_rate": 0.0004923542967779028, "loss": 5.0957, "mean_token_accuracy": 0.20799438655376434, "num_tokens": 21971625.0, "step": 9580 }, { "entropy": 5.300740003585815, "epoch": 0.920749279538905, "grad_norm": 1.21875, "learning_rate": 0.0004923454329600664, "loss": 5.1185, "mean_token_accuracy": 0.20712572187185288, "num_tokens": 21983733.0, "step": 9585 }, { "entropy": 5.19854097366333, "epoch": 0.9212295869356388, "grad_norm": 1.5703125, "learning_rate": 0.0004923365640961143, "loss": 5.0651, "mean_token_accuracy": 0.21446898579597473, "num_tokens": 21995621.0, "step": 9590 }, { "entropy": 5.249282026290894, "epoch": 0.9217098943323727, "grad_norm": 1.3203125, "learning_rate": 0.0004923276901862526, "loss": 5.1486, "mean_token_accuracy": 0.20122848600149154, "num_tokens": 22007325.0, "step": 9595 }, { "entropy": 5.247177934646606, "epoch": 0.9221902017291066, "grad_norm": 1.515625, "learning_rate": 0.0004923188112306874, "loss": 5.1148, "mean_token_accuracy": 0.2028706982731819, "num_tokens": 22017733.0, "step": 9600 }, { "entropy": 5.297493267059326, "epoch": 0.9226705091258406, "grad_norm": 1.0625, "learning_rate": 0.0004923099272296246, "loss": 5.227, "mean_token_accuracy": 0.1984498158097267, "num_tokens": 22030451.0, "step": 9605 }, { "entropy": 5.293121433258056, "epoch": 0.9231508165225745, "grad_norm": 1.2265625, "learning_rate": 0.0004923010381832706, "loss": 5.1655, "mean_token_accuracy": 0.1920482635498047, "num_tokens": 22042626.0, "step": 9610 }, { "entropy": 5.263902759552002, "epoch": 0.9236311239193083, "grad_norm": 1.234375, "learning_rate": 0.0004922921440918318, "loss": 5.1479, "mean_token_accuracy": 0.20066307634115219, "num_tokens": 22053314.0, "step": 9615 }, { "entropy": 5.3540332317352295, "epoch": 0.9241114313160422, "grad_norm": 1.2265625, "learning_rate": 0.0004922832449555144, "loss": 5.2321, "mean_token_accuracy": 0.19173655807971954, "num_tokens": 22064395.0, "step": 9620 }, { "entropy": 5.229344749450684, "epoch": 0.9245917387127762, "grad_norm": 1.4453125, "learning_rate": 0.0004922743407745255, "loss": 5.123, "mean_token_accuracy": 0.20057824850082398, "num_tokens": 22075960.0, "step": 9625 }, { "entropy": 5.272555780410767, "epoch": 0.9250720461095101, "grad_norm": 1.1796875, "learning_rate": 0.0004922654315490714, "loss": 5.1871, "mean_token_accuracy": 0.20159071534872056, "num_tokens": 22086034.0, "step": 9630 }, { "entropy": 5.249064683914185, "epoch": 0.925552353506244, "grad_norm": 1.5546875, "learning_rate": 0.0004922565172793593, "loss": 5.2023, "mean_token_accuracy": 0.20035000890493393, "num_tokens": 22096184.0, "step": 9635 }, { "entropy": 5.262630033493042, "epoch": 0.9260326609029779, "grad_norm": 1.078125, "learning_rate": 0.0004922475979655958, "loss": 5.1593, "mean_token_accuracy": 0.2061972975730896, "num_tokens": 22108795.0, "step": 9640 }, { "entropy": 5.240458583831787, "epoch": 0.9265129682997119, "grad_norm": 1.296875, "learning_rate": 0.0004922386736079883, "loss": 5.1362, "mean_token_accuracy": 0.20278566032648088, "num_tokens": 22119608.0, "step": 9645 }, { "entropy": 5.239983415603637, "epoch": 0.9269932756964457, "grad_norm": 1.1484375, "learning_rate": 0.0004922297442067438, "loss": 5.1009, "mean_token_accuracy": 0.21008216142654418, "num_tokens": 22131621.0, "step": 9650 }, { "entropy": 5.303403711318969, "epoch": 0.9274735830931796, "grad_norm": 1.109375, "learning_rate": 0.0004922208097620697, "loss": 5.0679, "mean_token_accuracy": 0.20454230904579163, "num_tokens": 22142745.0, "step": 9655 }, { "entropy": 5.2560042381286625, "epoch": 0.9279538904899135, "grad_norm": 1.3984375, "learning_rate": 0.0004922118702741735, "loss": 5.2697, "mean_token_accuracy": 0.19514112025499344, "num_tokens": 22155457.0, "step": 9660 }, { "entropy": 5.320225811004638, "epoch": 0.9284341978866475, "grad_norm": 1.328125, "learning_rate": 0.0004922029257432625, "loss": 5.1395, "mean_token_accuracy": 0.2117284744977951, "num_tokens": 22165955.0, "step": 9665 }, { "entropy": 5.256221914291382, "epoch": 0.9289145052833814, "grad_norm": 1.3671875, "learning_rate": 0.0004921939761695446, "loss": 5.0865, "mean_token_accuracy": 0.2021948665380478, "num_tokens": 22178142.0, "step": 9670 }, { "entropy": 5.171184015274048, "epoch": 0.9293948126801153, "grad_norm": 1.296875, "learning_rate": 0.0004921850215532275, "loss": 5.0653, "mean_token_accuracy": 0.21012310534715653, "num_tokens": 22190315.0, "step": 9675 }, { "entropy": 5.2543559074401855, "epoch": 0.9298751200768491, "grad_norm": 1.5625, "learning_rate": 0.0004921760618945192, "loss": 5.1284, "mean_token_accuracy": 0.20433304756879805, "num_tokens": 22201785.0, "step": 9680 }, { "entropy": 5.295661354064942, "epoch": 0.9303554274735831, "grad_norm": 1.3515625, "learning_rate": 0.0004921670971936276, "loss": 5.0781, "mean_token_accuracy": 0.20774878412485123, "num_tokens": 22212471.0, "step": 9685 }, { "entropy": 5.277561855316162, "epoch": 0.930835734870317, "grad_norm": 1.3359375, "learning_rate": 0.0004921581274507607, "loss": 5.1692, "mean_token_accuracy": 0.20450907647609712, "num_tokens": 22223188.0, "step": 9690 }, { "entropy": 5.20819878578186, "epoch": 0.9313160422670509, "grad_norm": 1.328125, "learning_rate": 0.000492149152666127, "loss": 5.1741, "mean_token_accuracy": 0.2045721873641014, "num_tokens": 22233978.0, "step": 9695 }, { "entropy": 5.227841567993164, "epoch": 0.9317963496637848, "grad_norm": 1.15625, "learning_rate": 0.0004921401728399348, "loss": 5.1147, "mean_token_accuracy": 0.21509994715452194, "num_tokens": 22244713.0, "step": 9700 }, { "entropy": 5.259960889816284, "epoch": 0.9322766570605188, "grad_norm": 1.2109375, "learning_rate": 0.0004921311879723926, "loss": 5.1705, "mean_token_accuracy": 0.20220176130533218, "num_tokens": 22256192.0, "step": 9705 }, { "entropy": 5.332875108718872, "epoch": 0.9327569644572526, "grad_norm": 1.328125, "learning_rate": 0.0004921221980637088, "loss": 5.1401, "mean_token_accuracy": 0.20141558051109315, "num_tokens": 22268294.0, "step": 9710 }, { "entropy": 5.3014007091522215, "epoch": 0.9332372718539865, "grad_norm": 1.40625, "learning_rate": 0.0004921132031140925, "loss": 5.1616, "mean_token_accuracy": 0.20787308365106583, "num_tokens": 22278952.0, "step": 9715 }, { "entropy": 5.249713897705078, "epoch": 0.9337175792507204, "grad_norm": 1.1953125, "learning_rate": 0.0004921042031237521, "loss": 5.1181, "mean_token_accuracy": 0.1999865725636482, "num_tokens": 22291057.0, "step": 9720 }, { "entropy": 5.333038187026977, "epoch": 0.9341978866474544, "grad_norm": 1.390625, "learning_rate": 0.0004920951980928969, "loss": 5.2022, "mean_token_accuracy": 0.20720864981412887, "num_tokens": 22302479.0, "step": 9725 }, { "entropy": 5.381272459030152, "epoch": 0.9346781940441883, "grad_norm": 1.2734375, "learning_rate": 0.0004920861880217359, "loss": 5.27, "mean_token_accuracy": 0.19498737156391144, "num_tokens": 22315116.0, "step": 9730 }, { "entropy": 5.309507656097412, "epoch": 0.9351585014409222, "grad_norm": 1.15625, "learning_rate": 0.0004920771729104781, "loss": 5.1831, "mean_token_accuracy": 0.20069352984428407, "num_tokens": 22327548.0, "step": 9735 }, { "entropy": 5.204008626937866, "epoch": 0.9356388088376562, "grad_norm": 1.1953125, "learning_rate": 0.0004920681527593329, "loss": 5.0612, "mean_token_accuracy": 0.20920901447534562, "num_tokens": 22339154.0, "step": 9740 }, { "entropy": 5.256301832199097, "epoch": 0.93611911623439, "grad_norm": 1.140625, "learning_rate": 0.0004920591275685098, "loss": 5.1518, "mean_token_accuracy": 0.20383056104183198, "num_tokens": 22350781.0, "step": 9745 }, { "entropy": 5.336814022064209, "epoch": 0.9365994236311239, "grad_norm": 1.265625, "learning_rate": 0.0004920500973382184, "loss": 5.1758, "mean_token_accuracy": 0.20595642030239106, "num_tokens": 22361990.0, "step": 9750 }, { "entropy": 5.223576879501342, "epoch": 0.9370797310278578, "grad_norm": 1.25, "learning_rate": 0.0004920410620686682, "loss": 5.0488, "mean_token_accuracy": 0.21444960832595825, "num_tokens": 22372973.0, "step": 9755 }, { "entropy": 5.18360276222229, "epoch": 0.9375600384245918, "grad_norm": 1.140625, "learning_rate": 0.0004920320217600689, "loss": 5.0665, "mean_token_accuracy": 0.21210620701313018, "num_tokens": 22384369.0, "step": 9760 }, { "entropy": 5.320396280288696, "epoch": 0.9380403458213257, "grad_norm": 1.28125, "learning_rate": 0.0004920229764126306, "loss": 5.1679, "mean_token_accuracy": 0.2056802451610565, "num_tokens": 22395792.0, "step": 9765 }, { "entropy": 5.2880340099334715, "epoch": 0.9385206532180596, "grad_norm": 1.3203125, "learning_rate": 0.0004920139260265632, "loss": 5.1575, "mean_token_accuracy": 0.19827589392662048, "num_tokens": 22408182.0, "step": 9770 }, { "entropy": 5.341842079162598, "epoch": 0.9390009606147934, "grad_norm": 1.578125, "learning_rate": 0.0004920048706020769, "loss": 5.2885, "mean_token_accuracy": 0.19330597370862962, "num_tokens": 22419774.0, "step": 9775 }, { "entropy": 5.187641191482544, "epoch": 0.9394812680115274, "grad_norm": 1.3671875, "learning_rate": 0.0004919958101393817, "loss": 4.9989, "mean_token_accuracy": 0.21211641579866408, "num_tokens": 22430210.0, "step": 9780 }, { "entropy": 5.290931463241577, "epoch": 0.9399615754082613, "grad_norm": 1.28125, "learning_rate": 0.0004919867446386883, "loss": 5.153, "mean_token_accuracy": 0.20970916748046875, "num_tokens": 22442444.0, "step": 9785 }, { "entropy": 5.184951877593994, "epoch": 0.9404418828049952, "grad_norm": 1.1328125, "learning_rate": 0.000491977674100207, "loss": 5.1207, "mean_token_accuracy": 0.2109922468662262, "num_tokens": 22455521.0, "step": 9790 }, { "entropy": 5.312680387496949, "epoch": 0.9409221902017291, "grad_norm": 1.359375, "learning_rate": 0.0004919685985241483, "loss": 5.1845, "mean_token_accuracy": 0.20736344754695893, "num_tokens": 22466997.0, "step": 9795 }, { "entropy": 5.291236543655396, "epoch": 0.9414024975984631, "grad_norm": 1.1953125, "learning_rate": 0.000491959517910723, "loss": 5.0996, "mean_token_accuracy": 0.21323858797550202, "num_tokens": 22477851.0, "step": 9800 }, { "entropy": 5.2509393215179445, "epoch": 0.9418828049951969, "grad_norm": 1.3046875, "learning_rate": 0.0004919504322601421, "loss": 5.193, "mean_token_accuracy": 0.205467090010643, "num_tokens": 22489319.0, "step": 9805 }, { "entropy": 5.174720096588135, "epoch": 0.9423631123919308, "grad_norm": 1.2265625, "learning_rate": 0.0004919413415726162, "loss": 5.0491, "mean_token_accuracy": 0.21085420697927476, "num_tokens": 22500847.0, "step": 9810 }, { "entropy": 5.309349250793457, "epoch": 0.9428434197886647, "grad_norm": 1.2578125, "learning_rate": 0.0004919322458483566, "loss": 5.1415, "mean_token_accuracy": 0.20241572856903076, "num_tokens": 22512719.0, "step": 9815 }, { "entropy": 5.222389364242554, "epoch": 0.9433237271853987, "grad_norm": 1.3125, "learning_rate": 0.0004919231450875745, "loss": 5.0661, "mean_token_accuracy": 0.21022214293479918, "num_tokens": 22522984.0, "step": 9820 }, { "entropy": 5.2375284194946286, "epoch": 0.9438040345821326, "grad_norm": 1.3359375, "learning_rate": 0.0004919140392904809, "loss": 5.1092, "mean_token_accuracy": 0.21000211089849471, "num_tokens": 22534816.0, "step": 9825 }, { "entropy": 5.230174970626831, "epoch": 0.9442843419788665, "grad_norm": 1.1953125, "learning_rate": 0.0004919049284572875, "loss": 5.0975, "mean_token_accuracy": 0.20355214923620224, "num_tokens": 22545753.0, "step": 9830 }, { "entropy": 5.301757907867431, "epoch": 0.9447646493756003, "grad_norm": 1.3125, "learning_rate": 0.0004918958125882058, "loss": 5.1963, "mean_token_accuracy": 0.1956578239798546, "num_tokens": 22557237.0, "step": 9835 }, { "entropy": 5.310576248168945, "epoch": 0.9452449567723343, "grad_norm": 1.265625, "learning_rate": 0.0004918866916834474, "loss": 5.1236, "mean_token_accuracy": 0.20908855646848679, "num_tokens": 22568909.0, "step": 9840 }, { "entropy": 5.315052127838134, "epoch": 0.9457252641690682, "grad_norm": 1.15625, "learning_rate": 0.0004918775657432239, "loss": 5.2595, "mean_token_accuracy": 0.19276428669691087, "num_tokens": 22582162.0, "step": 9845 }, { "entropy": 5.258047676086425, "epoch": 0.9462055715658021, "grad_norm": 1.2734375, "learning_rate": 0.0004918684347677474, "loss": 5.0962, "mean_token_accuracy": 0.20234745740890503, "num_tokens": 22592405.0, "step": 9850 }, { "entropy": 5.273072290420532, "epoch": 0.946685878962536, "grad_norm": 1.25, "learning_rate": 0.0004918592987572298, "loss": 5.1377, "mean_token_accuracy": 0.20278570502996446, "num_tokens": 22603588.0, "step": 9855 }, { "entropy": 5.321579885482788, "epoch": 0.94716618635927, "grad_norm": 1.1875, "learning_rate": 0.0004918501577118832, "loss": 5.2008, "mean_token_accuracy": 0.2043844997882843, "num_tokens": 22614995.0, "step": 9860 }, { "entropy": 5.288969469070435, "epoch": 0.9476464937560038, "grad_norm": 1.171875, "learning_rate": 0.00049184101163192, "loss": 5.1545, "mean_token_accuracy": 0.20031799376010895, "num_tokens": 22627556.0, "step": 9865 }, { "entropy": 5.311606693267822, "epoch": 0.9481268011527377, "grad_norm": 1.3359375, "learning_rate": 0.0004918318605175522, "loss": 5.1448, "mean_token_accuracy": 0.20381494760513305, "num_tokens": 22638339.0, "step": 9870 }, { "entropy": 5.309900140762329, "epoch": 0.9486071085494716, "grad_norm": 1.3125, "learning_rate": 0.0004918227043689924, "loss": 5.1063, "mean_token_accuracy": 0.2075771450996399, "num_tokens": 22648922.0, "step": 9875 }, { "entropy": 5.18968620300293, "epoch": 0.9490874159462056, "grad_norm": 1.3046875, "learning_rate": 0.0004918135431864534, "loss": 5.1583, "mean_token_accuracy": 0.20583543330430984, "num_tokens": 22662006.0, "step": 9880 }, { "entropy": 5.232013797760009, "epoch": 0.9495677233429395, "grad_norm": 1.2265625, "learning_rate": 0.0004918043769701478, "loss": 5.0866, "mean_token_accuracy": 0.2079631954431534, "num_tokens": 22674649.0, "step": 9885 }, { "entropy": 5.360668706893921, "epoch": 0.9500480307396734, "grad_norm": 1.1953125, "learning_rate": 0.0004917952057202882, "loss": 5.2037, "mean_token_accuracy": 0.1974567338824272, "num_tokens": 22685971.0, "step": 9890 }, { "entropy": 5.22627215385437, "epoch": 0.9505283381364072, "grad_norm": 1.203125, "learning_rate": 0.0004917860294370877, "loss": 5.0656, "mean_token_accuracy": 0.2093571364879608, "num_tokens": 22696174.0, "step": 9895 }, { "entropy": 5.31975827217102, "epoch": 0.9510086455331412, "grad_norm": 1.21875, "learning_rate": 0.0004917768481207593, "loss": 5.1813, "mean_token_accuracy": 0.20513910204172134, "num_tokens": 22706983.0, "step": 9900 }, { "entropy": 5.248136568069458, "epoch": 0.9514889529298751, "grad_norm": 1.1953125, "learning_rate": 0.0004917676617715162, "loss": 5.1088, "mean_token_accuracy": 0.206376151740551, "num_tokens": 22718251.0, "step": 9905 }, { "entropy": 5.161273384094239, "epoch": 0.951969260326609, "grad_norm": 1.2734375, "learning_rate": 0.0004917584703895717, "loss": 5.0842, "mean_token_accuracy": 0.2124750316143036, "num_tokens": 22730071.0, "step": 9910 }, { "entropy": 5.238349151611328, "epoch": 0.952449567723343, "grad_norm": 1.28125, "learning_rate": 0.0004917492739751391, "loss": 5.1473, "mean_token_accuracy": 0.20351121425628663, "num_tokens": 22742027.0, "step": 9915 }, { "entropy": 5.210487508773804, "epoch": 0.9529298751200769, "grad_norm": 1.1015625, "learning_rate": 0.000491740072528432, "loss": 5.0373, "mean_token_accuracy": 0.21686818301677704, "num_tokens": 22752946.0, "step": 9920 }, { "entropy": 5.287168884277344, "epoch": 0.9534101825168108, "grad_norm": 1.15625, "learning_rate": 0.000491730866049664, "loss": 5.108, "mean_token_accuracy": 0.20497333854436875, "num_tokens": 22763944.0, "step": 9925 }, { "entropy": 5.225655937194825, "epoch": 0.9538904899135446, "grad_norm": 1.1875, "learning_rate": 0.0004917216545390489, "loss": 5.0843, "mean_token_accuracy": 0.20906523764133453, "num_tokens": 22774414.0, "step": 9930 }, { "entropy": 5.186794948577881, "epoch": 0.9543707973102786, "grad_norm": 1.234375, "learning_rate": 0.0004917124379968004, "loss": 5.0443, "mean_token_accuracy": 0.21451948434114457, "num_tokens": 22785533.0, "step": 9935 }, { "entropy": 5.153272867202759, "epoch": 0.9548511047070125, "grad_norm": 1.328125, "learning_rate": 0.0004917032164231327, "loss": 4.9939, "mean_token_accuracy": 0.21023591607809067, "num_tokens": 22795809.0, "step": 9940 }, { "entropy": 5.244364500045776, "epoch": 0.9553314121037464, "grad_norm": 1.3828125, "learning_rate": 0.0004916939898182598, "loss": 5.2216, "mean_token_accuracy": 0.20205324590206147, "num_tokens": 22807705.0, "step": 9945 }, { "entropy": 5.34041018486023, "epoch": 0.9558117195004803, "grad_norm": 1.40625, "learning_rate": 0.0004916847581823958, "loss": 5.1064, "mean_token_accuracy": 0.20732269585132598, "num_tokens": 22818852.0, "step": 9950 }, { "entropy": 5.187279415130615, "epoch": 0.9562920268972143, "grad_norm": 1.2265625, "learning_rate": 0.0004916755215157552, "loss": 5.0225, "mean_token_accuracy": 0.21118980795145034, "num_tokens": 22829146.0, "step": 9955 }, { "entropy": 5.152674341201783, "epoch": 0.9567723342939481, "grad_norm": 1.171875, "learning_rate": 0.0004916662798185524, "loss": 5.107, "mean_token_accuracy": 0.21148771941661834, "num_tokens": 22840088.0, "step": 9960 }, { "entropy": 5.259473514556885, "epoch": 0.957252641690682, "grad_norm": 1.2109375, "learning_rate": 0.0004916570330910019, "loss": 5.1244, "mean_token_accuracy": 0.20842421650886536, "num_tokens": 22852470.0, "step": 9965 }, { "entropy": 5.29966549873352, "epoch": 0.9577329490874159, "grad_norm": 1.140625, "learning_rate": 0.0004916477813333185, "loss": 5.1655, "mean_token_accuracy": 0.19774912297725677, "num_tokens": 22863673.0, "step": 9970 }, { "entropy": 5.227234315872193, "epoch": 0.9582132564841499, "grad_norm": 1.5078125, "learning_rate": 0.0004916385245457168, "loss": 5.1421, "mean_token_accuracy": 0.2026590123772621, "num_tokens": 22874888.0, "step": 9975 }, { "entropy": 5.2691041946411135, "epoch": 0.9586935638808838, "grad_norm": 1.4765625, "learning_rate": 0.000491629262728412, "loss": 5.1591, "mean_token_accuracy": 0.19835399985313415, "num_tokens": 22886811.0, "step": 9980 }, { "entropy": 5.282389736175537, "epoch": 0.9591738712776177, "grad_norm": 1.2421875, "learning_rate": 0.0004916199958816188, "loss": 5.1101, "mean_token_accuracy": 0.20272685140371322, "num_tokens": 22898777.0, "step": 9985 }, { "entropy": 5.259513235092163, "epoch": 0.9596541786743515, "grad_norm": 1.390625, "learning_rate": 0.0004916107240055527, "loss": 5.0984, "mean_token_accuracy": 0.20606767982244492, "num_tokens": 22910804.0, "step": 9990 }, { "entropy": 5.3379199504852295, "epoch": 0.9601344860710855, "grad_norm": 1.1796875, "learning_rate": 0.0004916014471004287, "loss": 5.2127, "mean_token_accuracy": 0.20945288687944413, "num_tokens": 22922002.0, "step": 9995 }, { "entropy": 5.268113040924073, "epoch": 0.9606147934678194, "grad_norm": 1.2109375, "learning_rate": 0.0004915921651664622, "loss": 5.1176, "mean_token_accuracy": 0.20583815425634383, "num_tokens": 22933471.0, "step": 10000 }, { "entropy": 5.167844009399414, "epoch": 0.9610951008645533, "grad_norm": 1.2421875, "learning_rate": 0.000491582878203869, "loss": 5.0343, "mean_token_accuracy": 0.21021876633167266, "num_tokens": 22945303.0, "step": 10005 }, { "entropy": 5.2859704971313475, "epoch": 0.9615754082612872, "grad_norm": 1.1796875, "learning_rate": 0.0004915735862128643, "loss": 5.1734, "mean_token_accuracy": 0.1960235893726349, "num_tokens": 22956620.0, "step": 10010 }, { "entropy": 5.301449775695801, "epoch": 0.9620557156580212, "grad_norm": 1.3671875, "learning_rate": 0.0004915642891936641, "loss": 5.1695, "mean_token_accuracy": 0.20270660370588303, "num_tokens": 22968941.0, "step": 10015 }, { "entropy": 5.263174438476563, "epoch": 0.962536023054755, "grad_norm": 1.4140625, "learning_rate": 0.0004915549871464841, "loss": 5.1471, "mean_token_accuracy": 0.2005195811390877, "num_tokens": 22980222.0, "step": 10020 }, { "entropy": 5.377077054977417, "epoch": 0.9630163304514889, "grad_norm": 1.328125, "learning_rate": 0.0004915456800715403, "loss": 5.1674, "mean_token_accuracy": 0.19867794066667557, "num_tokens": 22991156.0, "step": 10025 }, { "entropy": 5.341533660888672, "epoch": 0.9634966378482228, "grad_norm": 1.3671875, "learning_rate": 0.000491536367969049, "loss": 5.2939, "mean_token_accuracy": 0.19976369738578797, "num_tokens": 23002939.0, "step": 10030 }, { "entropy": 5.351443099975586, "epoch": 0.9639769452449568, "grad_norm": 1.265625, "learning_rate": 0.0004915270508392261, "loss": 5.1535, "mean_token_accuracy": 0.20271336436271667, "num_tokens": 23015590.0, "step": 10035 }, { "entropy": 5.186623668670654, "epoch": 0.9644572526416907, "grad_norm": 1.2578125, "learning_rate": 0.000491517728682288, "loss": 5.0173, "mean_token_accuracy": 0.21069204956293106, "num_tokens": 23026387.0, "step": 10040 }, { "entropy": 5.1746241569519045, "epoch": 0.9649375600384246, "grad_norm": 1.21875, "learning_rate": 0.0004915084014984512, "loss": 5.0882, "mean_token_accuracy": 0.21177269369363785, "num_tokens": 23037475.0, "step": 10045 }, { "entropy": 5.21121768951416, "epoch": 0.9654178674351584, "grad_norm": 1.3046875, "learning_rate": 0.0004914990692879322, "loss": 5.0636, "mean_token_accuracy": 0.21139880418777465, "num_tokens": 23049305.0, "step": 10050 }, { "entropy": 5.226834392547607, "epoch": 0.9658981748318924, "grad_norm": 1.2421875, "learning_rate": 0.0004914897320509478, "loss": 5.0927, "mean_token_accuracy": 0.208532877266407, "num_tokens": 23061765.0, "step": 10055 }, { "entropy": 5.257470321655274, "epoch": 0.9663784822286263, "grad_norm": 1.1796875, "learning_rate": 0.0004914803897877146, "loss": 5.0923, "mean_token_accuracy": 0.20083025693893433, "num_tokens": 23072355.0, "step": 10060 }, { "entropy": 5.321026134490967, "epoch": 0.9668587896253602, "grad_norm": 1.1640625, "learning_rate": 0.0004914710424984495, "loss": 5.1071, "mean_token_accuracy": 0.19924261420965195, "num_tokens": 23085583.0, "step": 10065 }, { "entropy": 5.320055437088013, "epoch": 0.9673390970220941, "grad_norm": 1.3671875, "learning_rate": 0.0004914616901833696, "loss": 5.1466, "mean_token_accuracy": 0.2025774121284485, "num_tokens": 23095942.0, "step": 10070 }, { "entropy": 5.203504228591919, "epoch": 0.9678194044188281, "grad_norm": 1.1796875, "learning_rate": 0.000491452332842692, "loss": 5.0852, "mean_token_accuracy": 0.21284036785364152, "num_tokens": 23106540.0, "step": 10075 }, { "entropy": 5.239231157302856, "epoch": 0.968299711815562, "grad_norm": 1.390625, "learning_rate": 0.000491442970476634, "loss": 5.1603, "mean_token_accuracy": 0.20696305185556413, "num_tokens": 23118006.0, "step": 10080 }, { "entropy": 5.230704307556152, "epoch": 0.9687800192122958, "grad_norm": 1.3671875, "learning_rate": 0.0004914336030854129, "loss": 5.0625, "mean_token_accuracy": 0.2126757651567459, "num_tokens": 23129103.0, "step": 10085 }, { "entropy": 5.221098470687866, "epoch": 0.9692603266090298, "grad_norm": 1.4296875, "learning_rate": 0.0004914242306692461, "loss": 5.0595, "mean_token_accuracy": 0.21233255714178084, "num_tokens": 23140009.0, "step": 10090 }, { "entropy": 5.2262026309967045, "epoch": 0.9697406340057637, "grad_norm": 1.234375, "learning_rate": 0.0004914148532283516, "loss": 5.123, "mean_token_accuracy": 0.2098432034254074, "num_tokens": 23150982.0, "step": 10095 }, { "entropy": 5.3084290504455565, "epoch": 0.9702209414024976, "grad_norm": 1.25, "learning_rate": 0.0004914054707629466, "loss": 5.1217, "mean_token_accuracy": 0.203516785800457, "num_tokens": 23161834.0, "step": 10100 }, { "entropy": 5.321819496154785, "epoch": 0.9707012487992315, "grad_norm": 1.234375, "learning_rate": 0.0004913960832732493, "loss": 5.2516, "mean_token_accuracy": 0.1973107188940048, "num_tokens": 23173355.0, "step": 10105 }, { "entropy": 5.291294431686401, "epoch": 0.9711815561959655, "grad_norm": 1.84375, "learning_rate": 0.0004913866907594774, "loss": 5.183, "mean_token_accuracy": 0.20283153355121614, "num_tokens": 23185075.0, "step": 10110 }, { "entropy": 5.235888957977295, "epoch": 0.9716618635926993, "grad_norm": 1.5234375, "learning_rate": 0.0004913772932218491, "loss": 5.1614, "mean_token_accuracy": 0.21172062009572984, "num_tokens": 23195590.0, "step": 10115 }, { "entropy": 5.287680578231812, "epoch": 0.9721421709894332, "grad_norm": 1.2578125, "learning_rate": 0.0004913678906605825, "loss": 5.1168, "mean_token_accuracy": 0.20626269578933715, "num_tokens": 23207668.0, "step": 10120 }, { "entropy": 5.210545921325684, "epoch": 0.9726224783861671, "grad_norm": 1.703125, "learning_rate": 0.0004913584830758961, "loss": 5.1037, "mean_token_accuracy": 0.2144807457923889, "num_tokens": 23218497.0, "step": 10125 }, { "entropy": 5.295179796218872, "epoch": 0.9731027857829011, "grad_norm": 1.234375, "learning_rate": 0.0004913490704680081, "loss": 5.1883, "mean_token_accuracy": 0.19650790989398956, "num_tokens": 23230575.0, "step": 10130 }, { "entropy": 5.3129924774169925, "epoch": 0.973583093179635, "grad_norm": 1.28125, "learning_rate": 0.0004913396528371371, "loss": 5.1775, "mean_token_accuracy": 0.2083025798201561, "num_tokens": 23242348.0, "step": 10135 }, { "entropy": 5.318413543701172, "epoch": 0.9740634005763689, "grad_norm": 1.25, "learning_rate": 0.0004913302301835018, "loss": 5.1449, "mean_token_accuracy": 0.20831867009401323, "num_tokens": 23253297.0, "step": 10140 }, { "entropy": 5.209083795547485, "epoch": 0.9745437079731027, "grad_norm": 1.1171875, "learning_rate": 0.000491320802507321, "loss": 5.0829, "mean_token_accuracy": 0.2160535603761673, "num_tokens": 23265830.0, "step": 10145 }, { "entropy": 5.241401433944702, "epoch": 0.9750240153698367, "grad_norm": 1.3125, "learning_rate": 0.0004913113698088133, "loss": 5.1587, "mean_token_accuracy": 0.2019126072525978, "num_tokens": 23275591.0, "step": 10150 }, { "entropy": 5.27093539237976, "epoch": 0.9755043227665706, "grad_norm": 1.3203125, "learning_rate": 0.000491301932088198, "loss": 5.0606, "mean_token_accuracy": 0.20884221643209458, "num_tokens": 23286685.0, "step": 10155 }, { "entropy": 5.236410522460938, "epoch": 0.9759846301633045, "grad_norm": 1.4765625, "learning_rate": 0.0004912924893456942, "loss": 5.0771, "mean_token_accuracy": 0.21038100719451905, "num_tokens": 23298776.0, "step": 10160 }, { "entropy": 5.189069700241089, "epoch": 0.9764649375600384, "grad_norm": 1.3515625, "learning_rate": 0.000491283041581521, "loss": 5.0817, "mean_token_accuracy": 0.2063506156206131, "num_tokens": 23310498.0, "step": 10165 }, { "entropy": 5.217845678329468, "epoch": 0.9769452449567724, "grad_norm": 1.5703125, "learning_rate": 0.0004912735887958978, "loss": 5.1382, "mean_token_accuracy": 0.20284378677606582, "num_tokens": 23321089.0, "step": 10170 }, { "entropy": 5.288270330429077, "epoch": 0.9774255523535063, "grad_norm": 1.34375, "learning_rate": 0.0004912641309890441, "loss": 5.1083, "mean_token_accuracy": 0.20696865767240524, "num_tokens": 23332142.0, "step": 10175 }, { "entropy": 5.252698373794556, "epoch": 0.9779058597502401, "grad_norm": 1.390625, "learning_rate": 0.0004912546681611794, "loss": 5.0731, "mean_token_accuracy": 0.21283762007951737, "num_tokens": 23343014.0, "step": 10180 }, { "entropy": 5.207805871963501, "epoch": 0.978386167146974, "grad_norm": 1.2109375, "learning_rate": 0.0004912452003125234, "loss": 5.0497, "mean_token_accuracy": 0.2128495082259178, "num_tokens": 23354611.0, "step": 10185 }, { "entropy": 5.191194486618042, "epoch": 0.978866474543708, "grad_norm": 1.40625, "learning_rate": 0.000491235727443296, "loss": 5.0971, "mean_token_accuracy": 0.200925113260746, "num_tokens": 23365608.0, "step": 10190 }, { "entropy": 5.283109283447265, "epoch": 0.9793467819404419, "grad_norm": 1.171875, "learning_rate": 0.0004912262495537171, "loss": 5.1403, "mean_token_accuracy": 0.20711569488048553, "num_tokens": 23377884.0, "step": 10195 }, { "entropy": 5.176312112808228, "epoch": 0.9798270893371758, "grad_norm": 2.828125, "learning_rate": 0.0004912167666440068, "loss": 5.0456, "mean_token_accuracy": 0.21011523604393006, "num_tokens": 23389553.0, "step": 10200 }, { "entropy": 5.18413896560669, "epoch": 0.9803073967339097, "grad_norm": 1.3671875, "learning_rate": 0.0004912072787143852, "loss": 5.0395, "mean_token_accuracy": 0.20854321867227554, "num_tokens": 23401079.0, "step": 10205 }, { "entropy": 5.191148519515991, "epoch": 0.9807877041306436, "grad_norm": 1.21875, "learning_rate": 0.0004911977857650725, "loss": 5.0952, "mean_token_accuracy": 0.20658079236745835, "num_tokens": 23412886.0, "step": 10210 }, { "entropy": 5.287184333801269, "epoch": 0.9812680115273775, "grad_norm": 1.234375, "learning_rate": 0.0004911882877962893, "loss": 5.1568, "mean_token_accuracy": 0.2016318693757057, "num_tokens": 23424758.0, "step": 10215 }, { "entropy": 5.213660001754761, "epoch": 0.9817483189241114, "grad_norm": 1.484375, "learning_rate": 0.0004911787848082559, "loss": 5.0263, "mean_token_accuracy": 0.2168577641248703, "num_tokens": 23435552.0, "step": 10220 }, { "entropy": 5.178222560882569, "epoch": 0.9822286263208453, "grad_norm": 1.3515625, "learning_rate": 0.0004911692768011931, "loss": 5.0387, "mean_token_accuracy": 0.21100341975688935, "num_tokens": 23446584.0, "step": 10225 }, { "entropy": 5.319640445709228, "epoch": 0.9827089337175793, "grad_norm": 1.1328125, "learning_rate": 0.0004911597637753217, "loss": 5.2566, "mean_token_accuracy": 0.19432248920202255, "num_tokens": 23458452.0, "step": 10230 }, { "entropy": 5.280249691009521, "epoch": 0.9831892411143132, "grad_norm": 1.453125, "learning_rate": 0.0004911502457308623, "loss": 5.1235, "mean_token_accuracy": 0.1981524184346199, "num_tokens": 23470310.0, "step": 10235 }, { "entropy": 5.322625064849854, "epoch": 0.983669548511047, "grad_norm": 1.2421875, "learning_rate": 0.000491140722668036, "loss": 5.1723, "mean_token_accuracy": 0.20412614494562148, "num_tokens": 23481166.0, "step": 10240 }, { "entropy": 5.271013641357422, "epoch": 0.984149855907781, "grad_norm": 1.203125, "learning_rate": 0.000491131194587064, "loss": 5.1469, "mean_token_accuracy": 0.207887963950634, "num_tokens": 23493134.0, "step": 10245 }, { "entropy": 5.145558023452759, "epoch": 0.9846301633045149, "grad_norm": 1.3828125, "learning_rate": 0.0004911216614881675, "loss": 5.0461, "mean_token_accuracy": 0.21059294939041137, "num_tokens": 23504983.0, "step": 10250 }, { "entropy": 5.310237264633178, "epoch": 0.9851104707012488, "grad_norm": 1.25, "learning_rate": 0.0004911121233715677, "loss": 5.1215, "mean_token_accuracy": 0.2087342619895935, "num_tokens": 23516119.0, "step": 10255 }, { "entropy": 5.301252555847168, "epoch": 0.9855907780979827, "grad_norm": 1.296875, "learning_rate": 0.0004911025802374861, "loss": 5.1551, "mean_token_accuracy": 0.2059706538915634, "num_tokens": 23528242.0, "step": 10260 }, { "entropy": 5.182842016220093, "epoch": 0.9860710854947167, "grad_norm": 1.296875, "learning_rate": 0.0004910930320861442, "loss": 5.0482, "mean_token_accuracy": 0.21699930280447005, "num_tokens": 23539738.0, "step": 10265 }, { "entropy": 5.0920305252075195, "epoch": 0.9865513928914506, "grad_norm": 2.046875, "learning_rate": 0.0004910834789177639, "loss": 5.0687, "mean_token_accuracy": 0.2103741407394409, "num_tokens": 23551228.0, "step": 10270 }, { "entropy": 5.33592963218689, "epoch": 0.9870317002881844, "grad_norm": 1.4296875, "learning_rate": 0.0004910739207325668, "loss": 5.1207, "mean_token_accuracy": 0.2097514569759369, "num_tokens": 23563084.0, "step": 10275 }, { "entropy": 5.250389766693115, "epoch": 0.9875120076849183, "grad_norm": 1.3984375, "learning_rate": 0.0004910643575307749, "loss": 5.0891, "mean_token_accuracy": 0.2097397819161415, "num_tokens": 23574328.0, "step": 10280 }, { "entropy": 5.176170492172242, "epoch": 0.9879923150816523, "grad_norm": 1.265625, "learning_rate": 0.0004910547893126102, "loss": 5.0627, "mean_token_accuracy": 0.21138110905885696, "num_tokens": 23585230.0, "step": 10285 }, { "entropy": 5.22738127708435, "epoch": 0.9884726224783862, "grad_norm": 1.2890625, "learning_rate": 0.0004910452160782948, "loss": 5.1049, "mean_token_accuracy": 0.20212821811437606, "num_tokens": 23596951.0, "step": 10290 }, { "entropy": 5.28731255531311, "epoch": 0.9889529298751201, "grad_norm": 1.1953125, "learning_rate": 0.000491035637828051, "loss": 5.1244, "mean_token_accuracy": 0.21019872575998305, "num_tokens": 23607759.0, "step": 10295 }, { "entropy": 5.2878436088562015, "epoch": 0.989433237271854, "grad_norm": 1.3046875, "learning_rate": 0.0004910260545621012, "loss": 5.1489, "mean_token_accuracy": 0.20213536471128463, "num_tokens": 23619631.0, "step": 10300 }, { "entropy": 5.243245649337768, "epoch": 0.9899135446685879, "grad_norm": 1.296875, "learning_rate": 0.0004910164662806679, "loss": 5.1312, "mean_token_accuracy": 0.20988930463790895, "num_tokens": 23630601.0, "step": 10305 }, { "entropy": 5.271698808670044, "epoch": 0.9903938520653218, "grad_norm": 1.3359375, "learning_rate": 0.0004910068729839736, "loss": 5.0656, "mean_token_accuracy": 0.21258559077978134, "num_tokens": 23641330.0, "step": 10310 }, { "entropy": 5.2771772861480715, "epoch": 0.9908741594620557, "grad_norm": 1.234375, "learning_rate": 0.0004909972746722413, "loss": 5.1537, "mean_token_accuracy": 0.2006964460015297, "num_tokens": 23651492.0, "step": 10315 }, { "entropy": 5.199603843688965, "epoch": 0.9913544668587896, "grad_norm": 1.1796875, "learning_rate": 0.0004909876713456935, "loss": 5.0443, "mean_token_accuracy": 0.2088362917304039, "num_tokens": 23661773.0, "step": 10320 }, { "entropy": 5.239661455154419, "epoch": 0.9918347742555236, "grad_norm": 1.171875, "learning_rate": 0.0004909780630045534, "loss": 5.0905, "mean_token_accuracy": 0.20916487127542496, "num_tokens": 23673534.0, "step": 10325 }, { "entropy": 5.266284799575805, "epoch": 0.9923150816522575, "grad_norm": 1.2109375, "learning_rate": 0.000490968449649044, "loss": 5.1886, "mean_token_accuracy": 0.20307926088571548, "num_tokens": 23684892.0, "step": 10330 }, { "entropy": 5.284223937988282, "epoch": 0.9927953890489913, "grad_norm": 1.1640625, "learning_rate": 0.0004909588312793884, "loss": 5.1518, "mean_token_accuracy": 0.2053588092327118, "num_tokens": 23696076.0, "step": 10335 }, { "entropy": 5.283962202072144, "epoch": 0.9932756964457252, "grad_norm": 1.2890625, "learning_rate": 0.0004909492078958101, "loss": 5.1537, "mean_token_accuracy": 0.20028176456689833, "num_tokens": 23707795.0, "step": 10340 }, { "entropy": 5.249281978607177, "epoch": 0.9937560038424592, "grad_norm": 1.453125, "learning_rate": 0.0004909395794985324, "loss": 5.057, "mean_token_accuracy": 0.2048047587275505, "num_tokens": 23720802.0, "step": 10345 }, { "entropy": 5.25334734916687, "epoch": 0.9942363112391931, "grad_norm": 1.6171875, "learning_rate": 0.0004909299460877788, "loss": 5.0896, "mean_token_accuracy": 0.20352237075567245, "num_tokens": 23732854.0, "step": 10350 }, { "entropy": 5.2786060810089115, "epoch": 0.994716618635927, "grad_norm": 1.328125, "learning_rate": 0.0004909203076637732, "loss": 5.1659, "mean_token_accuracy": 0.2006428435444832, "num_tokens": 23743593.0, "step": 10355 }, { "entropy": 5.3259721279144285, "epoch": 0.9951969260326609, "grad_norm": 1.2734375, "learning_rate": 0.0004909106642267392, "loss": 5.1651, "mean_token_accuracy": 0.20107742697000502, "num_tokens": 23755447.0, "step": 10360 }, { "entropy": 5.273270559310913, "epoch": 0.9956772334293948, "grad_norm": 1.546875, "learning_rate": 0.0004909010157769006, "loss": 5.1412, "mean_token_accuracy": 0.20289405286312104, "num_tokens": 23767181.0, "step": 10365 }, { "entropy": 5.343906021118164, "epoch": 0.9961575408261287, "grad_norm": 1.421875, "learning_rate": 0.0004908913623144814, "loss": 5.2162, "mean_token_accuracy": 0.1965470626950264, "num_tokens": 23776356.0, "step": 10370 }, { "entropy": 5.270178937911988, "epoch": 0.9966378482228626, "grad_norm": 1.4609375, "learning_rate": 0.000490881703839706, "loss": 5.1747, "mean_token_accuracy": 0.20546858310699462, "num_tokens": 23787965.0, "step": 10375 }, { "entropy": 5.23285551071167, "epoch": 0.9971181556195965, "grad_norm": 1.265625, "learning_rate": 0.0004908720403527984, "loss": 5.0634, "mean_token_accuracy": 0.21320411562919617, "num_tokens": 23800327.0, "step": 10380 }, { "entropy": 5.2099464416503904, "epoch": 0.9975984630163305, "grad_norm": 1.1796875, "learning_rate": 0.000490862371853983, "loss": 5.0535, "mean_token_accuracy": 0.20708101391792297, "num_tokens": 23812845.0, "step": 10385 }, { "entropy": 5.323228597640991, "epoch": 0.9980787704130644, "grad_norm": 1.34375, "learning_rate": 0.0004908526983434844, "loss": 5.2069, "mean_token_accuracy": 0.19844041019678116, "num_tokens": 23824831.0, "step": 10390 }, { "entropy": 5.255801010131836, "epoch": 0.9985590778097982, "grad_norm": 1.1953125, "learning_rate": 0.000490843019821527, "loss": 5.0732, "mean_token_accuracy": 0.20827420055866241, "num_tokens": 23836697.0, "step": 10395 }, { "entropy": 5.204647493362427, "epoch": 0.9990393852065321, "grad_norm": 1.25, "learning_rate": 0.0004908333362883358, "loss": 5.0994, "mean_token_accuracy": 0.20774794071912767, "num_tokens": 23847112.0, "step": 10400 }, { "entropy": 5.334963607788086, "epoch": 0.9995196926032661, "grad_norm": 1.3515625, "learning_rate": 0.0004908236477441353, "loss": 5.193, "mean_token_accuracy": 0.2045993834733963, "num_tokens": 23858185.0, "step": 10405 }, { "entropy": 5.197092485427857, "epoch": 1.0, "grad_norm": 1.75, "learning_rate": 0.0004908139541891505, "loss": 4.9697, "mean_token_accuracy": 0.21775645166635513, "num_tokens": 23868536.0, "step": 10410 }, { "entropy": 5.299159669876099, "epoch": 1.0004803073967339, "grad_norm": 1.15625, "learning_rate": 0.0004908042556236066, "loss": 5.0114, "mean_token_accuracy": 0.21747902780771255, "num_tokens": 23880283.0, "step": 10415 }, { "entropy": 5.265295839309692, "epoch": 1.0009606147934678, "grad_norm": 1.1875, "learning_rate": 0.0004907945520477286, "loss": 5.0792, "mean_token_accuracy": 0.20754191726446153, "num_tokens": 23892413.0, "step": 10420 }, { "entropy": 5.34681248664856, "epoch": 1.0014409221902016, "grad_norm": 1.3046875, "learning_rate": 0.0004907848434617419, "loss": 5.1832, "mean_token_accuracy": 0.19456289261579512, "num_tokens": 23903977.0, "step": 10425 }, { "entropy": 5.302938079833984, "epoch": 1.0019212295869357, "grad_norm": 1.3828125, "learning_rate": 0.000490775129865872, "loss": 5.1463, "mean_token_accuracy": 0.2010764569044113, "num_tokens": 23915153.0, "step": 10430 }, { "entropy": 5.225161218643189, "epoch": 1.0024015369836696, "grad_norm": 1.3671875, "learning_rate": 0.0004907654112603442, "loss": 5.0186, "mean_token_accuracy": 0.2120182618498802, "num_tokens": 23926043.0, "step": 10435 }, { "entropy": 5.143458271026612, "epoch": 1.0028818443804035, "grad_norm": 1.3125, "learning_rate": 0.0004907556876453843, "loss": 4.9208, "mean_token_accuracy": 0.2206213116645813, "num_tokens": 23936658.0, "step": 10440 }, { "entropy": 5.225710868835449, "epoch": 1.0033621517771374, "grad_norm": 1.3515625, "learning_rate": 0.000490745959021218, "loss": 5.0434, "mean_token_accuracy": 0.20257661491632462, "num_tokens": 23947676.0, "step": 10445 }, { "entropy": 5.27921199798584, "epoch": 1.0038424591738713, "grad_norm": 1.4296875, "learning_rate": 0.0004907362253880711, "loss": 5.0296, "mean_token_accuracy": 0.2058090642094612, "num_tokens": 23959130.0, "step": 10450 }, { "entropy": 5.305146789550781, "epoch": 1.0043227665706052, "grad_norm": 1.265625, "learning_rate": 0.0004907264867461697, "loss": 5.1168, "mean_token_accuracy": 0.20227408558130264, "num_tokens": 23969905.0, "step": 10455 }, { "entropy": 5.156929969787598, "epoch": 1.004803073967339, "grad_norm": 1.21875, "learning_rate": 0.0004907167430957399, "loss": 5.005, "mean_token_accuracy": 0.2087326243519783, "num_tokens": 23982016.0, "step": 10460 }, { "entropy": 5.258314514160157, "epoch": 1.005283381364073, "grad_norm": 1.34375, "learning_rate": 0.0004907069944370077, "loss": 5.1583, "mean_token_accuracy": 0.20535677224397658, "num_tokens": 23994200.0, "step": 10465 }, { "entropy": 5.350189876556397, "epoch": 1.005763688760807, "grad_norm": 1.375, "learning_rate": 0.0004906972407701998, "loss": 5.0445, "mean_token_accuracy": 0.21789115369319917, "num_tokens": 24004695.0, "step": 10470 }, { "entropy": 5.15443787574768, "epoch": 1.006243996157541, "grad_norm": 1.390625, "learning_rate": 0.0004906874820955423, "loss": 4.9954, "mean_token_accuracy": 0.21623784005641938, "num_tokens": 24015922.0, "step": 10475 }, { "entropy": 5.146531486511231, "epoch": 1.0067243035542748, "grad_norm": 1.2734375, "learning_rate": 0.0004906777184132621, "loss": 4.9992, "mean_token_accuracy": 0.21183741688728333, "num_tokens": 24026759.0, "step": 10480 }, { "entropy": 5.300349760055542, "epoch": 1.0072046109510087, "grad_norm": 1.359375, "learning_rate": 0.0004906679497235856, "loss": 5.0743, "mean_token_accuracy": 0.19960159063339233, "num_tokens": 24037988.0, "step": 10485 }, { "entropy": 5.296932697296143, "epoch": 1.0076849183477425, "grad_norm": 1.2265625, "learning_rate": 0.0004906581760267397, "loss": 5.1147, "mean_token_accuracy": 0.2084190621972084, "num_tokens": 24050837.0, "step": 10490 }, { "entropy": 5.258708572387695, "epoch": 1.0081652257444764, "grad_norm": 1.375, "learning_rate": 0.0004906483973229513, "loss": 4.9916, "mean_token_accuracy": 0.2128538578748703, "num_tokens": 24063085.0, "step": 10495 }, { "entropy": 5.162822246551514, "epoch": 1.0086455331412103, "grad_norm": 1.3359375, "learning_rate": 0.0004906386136124476, "loss": 5.0433, "mean_token_accuracy": 0.21004260182380677, "num_tokens": 24074082.0, "step": 10500 }, { "entropy": 5.3224996566772464, "epoch": 1.0091258405379442, "grad_norm": 1.4765625, "learning_rate": 0.0004906288248954554, "loss": 5.0928, "mean_token_accuracy": 0.20969793498516082, "num_tokens": 24085050.0, "step": 10505 }, { "entropy": 5.235025644302368, "epoch": 1.0096061479346783, "grad_norm": 1.3046875, "learning_rate": 0.0004906190311722023, "loss": 5.0543, "mean_token_accuracy": 0.21070992648601533, "num_tokens": 24095523.0, "step": 10510 }, { "entropy": 5.172921419143677, "epoch": 1.0100864553314122, "grad_norm": 1.234375, "learning_rate": 0.0004906092324429155, "loss": 5.0162, "mean_token_accuracy": 0.20338939726352692, "num_tokens": 24107157.0, "step": 10515 }, { "entropy": 5.295492935180664, "epoch": 1.010566762728146, "grad_norm": 1.2734375, "learning_rate": 0.0004905994287078227, "loss": 5.0377, "mean_token_accuracy": 0.21216456443071366, "num_tokens": 24118668.0, "step": 10520 }, { "entropy": 5.215558815002441, "epoch": 1.01104707012488, "grad_norm": 1.1015625, "learning_rate": 0.0004905896199671512, "loss": 5.087, "mean_token_accuracy": 0.20571289211511612, "num_tokens": 24129563.0, "step": 10525 }, { "entropy": 5.31974778175354, "epoch": 1.0115273775216138, "grad_norm": 1.2109375, "learning_rate": 0.000490579806221129, "loss": 5.0963, "mean_token_accuracy": 0.20234022289514542, "num_tokens": 24139965.0, "step": 10530 }, { "entropy": 5.339018249511719, "epoch": 1.0120076849183477, "grad_norm": 1.2109375, "learning_rate": 0.0004905699874699838, "loss": 5.0927, "mean_token_accuracy": 0.2037150517106056, "num_tokens": 24152109.0, "step": 10535 }, { "entropy": 5.20176329612732, "epoch": 1.0124879923150816, "grad_norm": 1.1953125, "learning_rate": 0.0004905601637139436, "loss": 5.0382, "mean_token_accuracy": 0.2062271788716316, "num_tokens": 24162558.0, "step": 10540 }, { "entropy": 5.153134155273437, "epoch": 1.0129682997118155, "grad_norm": 1.1875, "learning_rate": 0.0004905503349532365, "loss": 4.951, "mean_token_accuracy": 0.2199328899383545, "num_tokens": 24173963.0, "step": 10545 }, { "entropy": 5.1954326152801515, "epoch": 1.0134486071085496, "grad_norm": 1.15625, "learning_rate": 0.0004905405011880906, "loss": 4.9813, "mean_token_accuracy": 0.2153230667114258, "num_tokens": 24184369.0, "step": 10550 }, { "entropy": 5.316426420211792, "epoch": 1.0139289145052834, "grad_norm": 1.1875, "learning_rate": 0.0004905306624187343, "loss": 5.1457, "mean_token_accuracy": 0.20428456813097, "num_tokens": 24194692.0, "step": 10555 }, { "entropy": 5.306823635101319, "epoch": 1.0144092219020173, "grad_norm": 1.2421875, "learning_rate": 0.0004905208186453961, "loss": 5.0775, "mean_token_accuracy": 0.20443981587886811, "num_tokens": 24205363.0, "step": 10560 }, { "entropy": 5.286128664016724, "epoch": 1.0148895292987512, "grad_norm": 1.1796875, "learning_rate": 0.0004905109698683044, "loss": 5.0921, "mean_token_accuracy": 0.20271058976650239, "num_tokens": 24216678.0, "step": 10565 }, { "entropy": 5.180634689331055, "epoch": 1.015369836695485, "grad_norm": 1.2421875, "learning_rate": 0.0004905011160876878, "loss": 4.9789, "mean_token_accuracy": 0.20550620704889297, "num_tokens": 24227541.0, "step": 10570 }, { "entropy": 5.23009238243103, "epoch": 1.015850144092219, "grad_norm": 1.59375, "learning_rate": 0.0004904912573037753, "loss": 4.9955, "mean_token_accuracy": 0.2097481057047844, "num_tokens": 24238118.0, "step": 10575 }, { "entropy": 5.300039005279541, "epoch": 1.0163304514889528, "grad_norm": 1.3046875, "learning_rate": 0.0004904813935167957, "loss": 5.124, "mean_token_accuracy": 0.20036156624555587, "num_tokens": 24250044.0, "step": 10580 }, { "entropy": 5.260974168777466, "epoch": 1.0168107588856867, "grad_norm": 1.3203125, "learning_rate": 0.0004904715247269779, "loss": 5.1359, "mean_token_accuracy": 0.19710972905158997, "num_tokens": 24262805.0, "step": 10585 }, { "entropy": 5.25907940864563, "epoch": 1.0172910662824208, "grad_norm": 1.203125, "learning_rate": 0.0004904616509345514, "loss": 5.0512, "mean_token_accuracy": 0.19826420694589614, "num_tokens": 24274287.0, "step": 10590 }, { "entropy": 5.131495952606201, "epoch": 1.0177713736791547, "grad_norm": 1.5, "learning_rate": 0.0004904517721397449, "loss": 4.9545, "mean_token_accuracy": 0.2095574140548706, "num_tokens": 24284839.0, "step": 10595 }, { "entropy": 5.185259437561035, "epoch": 1.0182516810758886, "grad_norm": 1.1953125, "learning_rate": 0.0004904418883427881, "loss": 5.0363, "mean_token_accuracy": 0.20771684646606445, "num_tokens": 24295812.0, "step": 10600 }, { "entropy": 5.253675317764282, "epoch": 1.0187319884726225, "grad_norm": 1.234375, "learning_rate": 0.0004904319995439104, "loss": 5.0144, "mean_token_accuracy": 0.21546317189931868, "num_tokens": 24306365.0, "step": 10605 }, { "entropy": 5.241600894927979, "epoch": 1.0192122958693564, "grad_norm": 1.328125, "learning_rate": 0.0004904221057433412, "loss": 5.0161, "mean_token_accuracy": 0.20605212748050689, "num_tokens": 24317557.0, "step": 10610 }, { "entropy": 5.208569526672363, "epoch": 1.0196926032660902, "grad_norm": 1.34375, "learning_rate": 0.0004904122069413105, "loss": 4.9752, "mean_token_accuracy": 0.21478671878576278, "num_tokens": 24328874.0, "step": 10615 }, { "entropy": 5.325365495681763, "epoch": 1.0201729106628241, "grad_norm": 1.2734375, "learning_rate": 0.000490402303138048, "loss": 5.1671, "mean_token_accuracy": 0.2036365568637848, "num_tokens": 24340918.0, "step": 10620 }, { "entropy": 5.296480655670166, "epoch": 1.0206532180595582, "grad_norm": 1.21875, "learning_rate": 0.0004903923943337836, "loss": 5.09, "mean_token_accuracy": 0.20379555076360703, "num_tokens": 24352642.0, "step": 10625 }, { "entropy": 5.210208606719971, "epoch": 1.021133525456292, "grad_norm": 1.640625, "learning_rate": 0.0004903824805287475, "loss": 4.9669, "mean_token_accuracy": 0.21470995843410492, "num_tokens": 24364874.0, "step": 10630 }, { "entropy": 5.171673250198364, "epoch": 1.021613832853026, "grad_norm": 1.2890625, "learning_rate": 0.0004903725617231696, "loss": 5.0951, "mean_token_accuracy": 0.21187301725149155, "num_tokens": 24376566.0, "step": 10635 }, { "entropy": 5.203986120223999, "epoch": 1.0220941402497599, "grad_norm": 1.3203125, "learning_rate": 0.0004903626379172805, "loss": 4.9178, "mean_token_accuracy": 0.21737915873527527, "num_tokens": 24387283.0, "step": 10640 }, { "entropy": 5.284794282913208, "epoch": 1.0225744476464937, "grad_norm": 1.265625, "learning_rate": 0.0004903527091113102, "loss": 5.123, "mean_token_accuracy": 0.2050114780664444, "num_tokens": 24397970.0, "step": 10645 }, { "entropy": 5.227866220474243, "epoch": 1.0230547550432276, "grad_norm": 1.1796875, "learning_rate": 0.0004903427753054897, "loss": 5.0674, "mean_token_accuracy": 0.2044738933444023, "num_tokens": 24411223.0, "step": 10650 }, { "entropy": 5.2655031204223635, "epoch": 1.0235350624399615, "grad_norm": 1.390625, "learning_rate": 0.0004903328365000492, "loss": 5.0926, "mean_token_accuracy": 0.20762381106615066, "num_tokens": 24422117.0, "step": 10655 }, { "entropy": 5.236895370483398, "epoch": 1.0240153698366954, "grad_norm": 1.1796875, "learning_rate": 0.0004903228926952199, "loss": 5.0426, "mean_token_accuracy": 0.20284196585416794, "num_tokens": 24434142.0, "step": 10660 }, { "entropy": 5.223333692550659, "epoch": 1.0244956772334295, "grad_norm": 1.2578125, "learning_rate": 0.0004903129438912322, "loss": 4.9533, "mean_token_accuracy": 0.2077935144305229, "num_tokens": 24445295.0, "step": 10665 }, { "entropy": 5.216018962860107, "epoch": 1.0249759846301634, "grad_norm": 1.1484375, "learning_rate": 0.0004903029900883174, "loss": 5.0294, "mean_token_accuracy": 0.21322072446346282, "num_tokens": 24456092.0, "step": 10670 }, { "entropy": 5.246077156066894, "epoch": 1.0254562920268973, "grad_norm": 1.1953125, "learning_rate": 0.0004902930312867063, "loss": 5.1249, "mean_token_accuracy": 0.20178017616271973, "num_tokens": 24467653.0, "step": 10675 }, { "entropy": 5.290040493011475, "epoch": 1.0259365994236311, "grad_norm": 1.1015625, "learning_rate": 0.0004902830674866306, "loss": 5.0763, "mean_token_accuracy": 0.20854619145393372, "num_tokens": 24479164.0, "step": 10680 }, { "entropy": 5.184268283843994, "epoch": 1.026416906820365, "grad_norm": 1.15625, "learning_rate": 0.0004902730986883211, "loss": 4.9426, "mean_token_accuracy": 0.21813494712114334, "num_tokens": 24489785.0, "step": 10685 }, { "entropy": 5.223792457580567, "epoch": 1.026897214217099, "grad_norm": 1.3515625, "learning_rate": 0.0004902631248920096, "loss": 5.0399, "mean_token_accuracy": 0.20972464382648467, "num_tokens": 24500158.0, "step": 10690 }, { "entropy": 5.308423471450806, "epoch": 1.0273775216138328, "grad_norm": 1.140625, "learning_rate": 0.0004902531460979274, "loss": 5.1518, "mean_token_accuracy": 0.20060888230800628, "num_tokens": 24512851.0, "step": 10695 }, { "entropy": 5.2649911403656, "epoch": 1.0278578290105667, "grad_norm": 1.3125, "learning_rate": 0.0004902431623063065, "loss": 5.0938, "mean_token_accuracy": 0.20821331739425658, "num_tokens": 24524016.0, "step": 10700 }, { "entropy": 5.1589634895324705, "epoch": 1.0283381364073008, "grad_norm": 1.2578125, "learning_rate": 0.0004902331735173785, "loss": 4.979, "mean_token_accuracy": 0.2058223605155945, "num_tokens": 24536348.0, "step": 10705 }, { "entropy": 5.149962663650513, "epoch": 1.0288184438040346, "grad_norm": 1.265625, "learning_rate": 0.0004902231797313752, "loss": 5.0329, "mean_token_accuracy": 0.2095619484782219, "num_tokens": 24548718.0, "step": 10710 }, { "entropy": 5.180563497543335, "epoch": 1.0292987512007685, "grad_norm": 1.3046875, "learning_rate": 0.0004902131809485288, "loss": 4.9461, "mean_token_accuracy": 0.21883852481842042, "num_tokens": 24560567.0, "step": 10715 }, { "entropy": 5.216690635681152, "epoch": 1.0297790585975024, "grad_norm": 1.265625, "learning_rate": 0.0004902031771690713, "loss": 4.973, "mean_token_accuracy": 0.21082175374031067, "num_tokens": 24572610.0, "step": 10720 }, { "entropy": 5.22666974067688, "epoch": 1.0302593659942363, "grad_norm": 1.25, "learning_rate": 0.0004901931683932352, "loss": 5.0303, "mean_token_accuracy": 0.21025995314121246, "num_tokens": 24584738.0, "step": 10725 }, { "entropy": 5.256303358078003, "epoch": 1.0307396733909702, "grad_norm": 1.1875, "learning_rate": 0.0004901831546212526, "loss": 5.081, "mean_token_accuracy": 0.20822075754404068, "num_tokens": 24596603.0, "step": 10730 }, { "entropy": 5.166937351226807, "epoch": 1.031219980787704, "grad_norm": 1.28125, "learning_rate": 0.0004901731358533562, "loss": 4.9585, "mean_token_accuracy": 0.21120154708623887, "num_tokens": 24607061.0, "step": 10735 }, { "entropy": 5.163893556594848, "epoch": 1.031700288184438, "grad_norm": 1.296875, "learning_rate": 0.0004901631120897785, "loss": 5.0366, "mean_token_accuracy": 0.21032600998878478, "num_tokens": 24619177.0, "step": 10740 }, { "entropy": 5.19158706665039, "epoch": 1.032180595581172, "grad_norm": 1.1953125, "learning_rate": 0.0004901530833307522, "loss": 4.9629, "mean_token_accuracy": 0.2108414351940155, "num_tokens": 24631336.0, "step": 10745 }, { "entropy": 5.147015523910523, "epoch": 1.032660902977906, "grad_norm": 1.4296875, "learning_rate": 0.0004901430495765103, "loss": 4.9281, "mean_token_accuracy": 0.22001660764217376, "num_tokens": 24641743.0, "step": 10750 }, { "entropy": 5.218332290649414, "epoch": 1.0331412103746398, "grad_norm": 1.34375, "learning_rate": 0.0004901330108272855, "loss": 4.9683, "mean_token_accuracy": 0.21641426235437394, "num_tokens": 24652318.0, "step": 10755 }, { "entropy": 5.181002473831176, "epoch": 1.0336215177713737, "grad_norm": 1.3359375, "learning_rate": 0.0004901229670833111, "loss": 4.9866, "mean_token_accuracy": 0.21016779094934462, "num_tokens": 24664129.0, "step": 10760 }, { "entropy": 5.25865159034729, "epoch": 1.0341018251681076, "grad_norm": 1.15625, "learning_rate": 0.0004901129183448201, "loss": 5.0585, "mean_token_accuracy": 0.20536702275276184, "num_tokens": 24674921.0, "step": 10765 }, { "entropy": 5.233693790435791, "epoch": 1.0345821325648414, "grad_norm": 1.2734375, "learning_rate": 0.0004901028646120459, "loss": 5.0129, "mean_token_accuracy": 0.20729674845933915, "num_tokens": 24686052.0, "step": 10770 }, { "entropy": 5.181743049621582, "epoch": 1.0350624399615753, "grad_norm": 1.265625, "learning_rate": 0.000490092805885222, "loss": 5.0101, "mean_token_accuracy": 0.20977197587490082, "num_tokens": 24698625.0, "step": 10775 }, { "entropy": 5.177609062194824, "epoch": 1.0355427473583094, "grad_norm": 1.203125, "learning_rate": 0.0004900827421645816, "loss": 4.9688, "mean_token_accuracy": 0.2193769931793213, "num_tokens": 24709322.0, "step": 10780 }, { "entropy": 5.15111927986145, "epoch": 1.0360230547550433, "grad_norm": 1.703125, "learning_rate": 0.0004900726734503589, "loss": 4.9438, "mean_token_accuracy": 0.21662437915802002, "num_tokens": 24719512.0, "step": 10785 }, { "entropy": 5.16123480796814, "epoch": 1.0365033621517772, "grad_norm": 1.2421875, "learning_rate": 0.0004900625997427872, "loss": 4.9806, "mean_token_accuracy": 0.21338418126106262, "num_tokens": 24729947.0, "step": 10790 }, { "entropy": 5.135127162933349, "epoch": 1.036983669548511, "grad_norm": 1.34375, "learning_rate": 0.0004900525210421006, "loss": 4.9767, "mean_token_accuracy": 0.21927962452173233, "num_tokens": 24741423.0, "step": 10795 }, { "entropy": 5.223546314239502, "epoch": 1.037463976945245, "grad_norm": 1.34375, "learning_rate": 0.0004900424373485329, "loss": 5.014, "mean_token_accuracy": 0.20575396567583085, "num_tokens": 24753403.0, "step": 10800 }, { "entropy": 5.266994619369507, "epoch": 1.0379442843419788, "grad_norm": 1.2578125, "learning_rate": 0.0004900323486623185, "loss": 5.1261, "mean_token_accuracy": 0.20268698483705522, "num_tokens": 24763660.0, "step": 10805 }, { "entropy": 5.23413200378418, "epoch": 1.0384245917387127, "grad_norm": 1.3125, "learning_rate": 0.0004900222549836914, "loss": 5.0232, "mean_token_accuracy": 0.21380564272403718, "num_tokens": 24775061.0, "step": 10810 }, { "entropy": 5.259318685531616, "epoch": 1.0389048991354466, "grad_norm": 1.15625, "learning_rate": 0.000490012156312886, "loss": 5.0158, "mean_token_accuracy": 0.20518611520528793, "num_tokens": 24785248.0, "step": 10815 }, { "entropy": 5.128819990158081, "epoch": 1.0393852065321807, "grad_norm": 1.2265625, "learning_rate": 0.0004900020526501369, "loss": 4.9191, "mean_token_accuracy": 0.21387154012918472, "num_tokens": 24797024.0, "step": 10820 }, { "entropy": 5.312929439544678, "epoch": 1.0398655139289146, "grad_norm": 1.234375, "learning_rate": 0.0004899919439956785, "loss": 5.1953, "mean_token_accuracy": 0.20268491804599761, "num_tokens": 24808500.0, "step": 10825 }, { "entropy": 5.34240870475769, "epoch": 1.0403458213256485, "grad_norm": 1.21875, "learning_rate": 0.0004899818303497455, "loss": 5.1314, "mean_token_accuracy": 0.2014186292886734, "num_tokens": 24818805.0, "step": 10830 }, { "entropy": 5.268064880371094, "epoch": 1.0408261287223823, "grad_norm": 1.8203125, "learning_rate": 0.0004899717117125728, "loss": 5.0649, "mean_token_accuracy": 0.20589411109685898, "num_tokens": 24829247.0, "step": 10835 }, { "entropy": 5.1017598628997805, "epoch": 1.0413064361191162, "grad_norm": 1.1328125, "learning_rate": 0.0004899615880843953, "loss": 5.0078, "mean_token_accuracy": 0.21258261501789094, "num_tokens": 24840139.0, "step": 10840 }, { "entropy": 5.246176147460938, "epoch": 1.04178674351585, "grad_norm": 1.296875, "learning_rate": 0.0004899514594654481, "loss": 5.1039, "mean_token_accuracy": 0.20273203402757645, "num_tokens": 24851734.0, "step": 10845 }, { "entropy": 5.264043140411377, "epoch": 1.042267050912584, "grad_norm": 1.21875, "learning_rate": 0.0004899413258559662, "loss": 5.0466, "mean_token_accuracy": 0.21014518439769744, "num_tokens": 24863424.0, "step": 10850 }, { "entropy": 5.277558660507202, "epoch": 1.0427473583093179, "grad_norm": 1.2734375, "learning_rate": 0.0004899311872561849, "loss": 5.0636, "mean_token_accuracy": 0.20547475218772887, "num_tokens": 24875086.0, "step": 10855 }, { "entropy": 5.196067905426025, "epoch": 1.043227665706052, "grad_norm": 1.1171875, "learning_rate": 0.0004899210436663398, "loss": 4.9935, "mean_token_accuracy": 0.21280764788389206, "num_tokens": 24888408.0, "step": 10860 }, { "entropy": 5.165633726119995, "epoch": 1.0437079731027858, "grad_norm": 1.265625, "learning_rate": 0.0004899108950866661, "loss": 4.9365, "mean_token_accuracy": 0.21639619767665863, "num_tokens": 24900357.0, "step": 10865 }, { "entropy": 5.252479410171508, "epoch": 1.0441882804995197, "grad_norm": 1.5078125, "learning_rate": 0.0004899007415173997, "loss": 5.0378, "mean_token_accuracy": 0.2107843890786171, "num_tokens": 24910790.0, "step": 10870 }, { "entropy": 5.1988269805908205, "epoch": 1.0446685878962536, "grad_norm": 1.15625, "learning_rate": 0.0004898905829587762, "loss": 5.013, "mean_token_accuracy": 0.20995523184537887, "num_tokens": 24922124.0, "step": 10875 }, { "entropy": 5.104007339477539, "epoch": 1.0451488952929875, "grad_norm": 1.5234375, "learning_rate": 0.0004898804194110313, "loss": 4.9304, "mean_token_accuracy": 0.21980289071798326, "num_tokens": 24933591.0, "step": 10880 }, { "entropy": 5.195203590393066, "epoch": 1.0456292026897214, "grad_norm": 1.3203125, "learning_rate": 0.0004898702508744012, "loss": 4.9496, "mean_token_accuracy": 0.21639021039009093, "num_tokens": 24944708.0, "step": 10885 }, { "entropy": 5.157971286773682, "epoch": 1.0461095100864553, "grad_norm": 1.2265625, "learning_rate": 0.0004898600773491221, "loss": 4.9176, "mean_token_accuracy": 0.21461566239595414, "num_tokens": 24955966.0, "step": 10890 }, { "entropy": 5.254655361175537, "epoch": 1.0465898174831891, "grad_norm": 1.3515625, "learning_rate": 0.0004898498988354297, "loss": 5.0325, "mean_token_accuracy": 0.21173021644353868, "num_tokens": 24967292.0, "step": 10895 }, { "entropy": 5.230365228652954, "epoch": 1.0470701248799232, "grad_norm": 1.296875, "learning_rate": 0.0004898397153335608, "loss": 5.0959, "mean_token_accuracy": 0.20530790984630584, "num_tokens": 24977407.0, "step": 10900 }, { "entropy": 5.294993305206299, "epoch": 1.0475504322766571, "grad_norm": 1.2109375, "learning_rate": 0.0004898295268437517, "loss": 5.1541, "mean_token_accuracy": 0.20490354150533677, "num_tokens": 24988804.0, "step": 10905 }, { "entropy": 5.229137849807739, "epoch": 1.048030739673391, "grad_norm": 1.2109375, "learning_rate": 0.0004898193333662388, "loss": 5.0796, "mean_token_accuracy": 0.20612839758396148, "num_tokens": 25000297.0, "step": 10910 }, { "entropy": 5.225936555862427, "epoch": 1.0485110470701249, "grad_norm": 1.203125, "learning_rate": 0.0004898091349012588, "loss": 5.0167, "mean_token_accuracy": 0.20787729918956757, "num_tokens": 25012135.0, "step": 10915 }, { "entropy": 5.138573503494262, "epoch": 1.0489913544668588, "grad_norm": 1.3828125, "learning_rate": 0.0004897989314490486, "loss": 4.9946, "mean_token_accuracy": 0.21607837826013565, "num_tokens": 25023572.0, "step": 10920 }, { "entropy": 5.2005609512329105, "epoch": 1.0494716618635926, "grad_norm": 1.234375, "learning_rate": 0.0004897887230098451, "loss": 5.0626, "mean_token_accuracy": 0.20565639436244965, "num_tokens": 25035015.0, "step": 10925 }, { "entropy": 5.209970331192016, "epoch": 1.0499519692603265, "grad_norm": 1.1953125, "learning_rate": 0.0004897785095838852, "loss": 4.9928, "mean_token_accuracy": 0.2115662842988968, "num_tokens": 25045931.0, "step": 10930 }, { "entropy": 5.248309993743897, "epoch": 1.0504322766570606, "grad_norm": 1.34375, "learning_rate": 0.0004897682911714061, "loss": 5.0403, "mean_token_accuracy": 0.2143391728401184, "num_tokens": 25056767.0, "step": 10935 }, { "entropy": 5.159217071533203, "epoch": 1.0509125840537945, "grad_norm": 1.1015625, "learning_rate": 0.000489758067772645, "loss": 4.9852, "mean_token_accuracy": 0.21812189370393753, "num_tokens": 25068731.0, "step": 10940 }, { "entropy": 5.239347696304321, "epoch": 1.0513928914505284, "grad_norm": 1.140625, "learning_rate": 0.0004897478393878392, "loss": 5.0752, "mean_token_accuracy": 0.20207206010818482, "num_tokens": 25081268.0, "step": 10945 }, { "entropy": 5.1585955142974855, "epoch": 1.0518731988472623, "grad_norm": 1.2578125, "learning_rate": 0.0004897376060172264, "loss": 4.9696, "mean_token_accuracy": 0.2180320918560028, "num_tokens": 25093105.0, "step": 10950 }, { "entropy": 5.193137502670288, "epoch": 1.0523535062439962, "grad_norm": 1.2578125, "learning_rate": 0.0004897273676610438, "loss": 4.9759, "mean_token_accuracy": 0.21481747329235076, "num_tokens": 25103766.0, "step": 10955 }, { "entropy": 5.226558351516724, "epoch": 1.05283381364073, "grad_norm": 1.28125, "learning_rate": 0.0004897171243195295, "loss": 5.1226, "mean_token_accuracy": 0.21184030324220657, "num_tokens": 25115675.0, "step": 10960 }, { "entropy": 5.209109592437744, "epoch": 1.053314121037464, "grad_norm": 1.109375, "learning_rate": 0.000489706875992921, "loss": 4.991, "mean_token_accuracy": 0.21007043421268462, "num_tokens": 25127907.0, "step": 10965 }, { "entropy": 5.1817710399627686, "epoch": 1.0537944284341978, "grad_norm": 1.265625, "learning_rate": 0.0004896966226814565, "loss": 5.0141, "mean_token_accuracy": 0.2095083549618721, "num_tokens": 25139675.0, "step": 10970 }, { "entropy": 5.254951429367066, "epoch": 1.054274735830932, "grad_norm": 1.3125, "learning_rate": 0.0004896863643853739, "loss": 5.1364, "mean_token_accuracy": 0.20798720717430114, "num_tokens": 25150960.0, "step": 10975 }, { "entropy": 5.297211503982544, "epoch": 1.0547550432276658, "grad_norm": 1.1640625, "learning_rate": 0.0004896761011049114, "loss": 5.1038, "mean_token_accuracy": 0.20611060559749603, "num_tokens": 25163676.0, "step": 10980 }, { "entropy": 5.181599044799805, "epoch": 1.0552353506243997, "grad_norm": 1.2734375, "learning_rate": 0.0004896658328403074, "loss": 5.0374, "mean_token_accuracy": 0.20499148815870286, "num_tokens": 25174317.0, "step": 10985 }, { "entropy": 5.152895545959472, "epoch": 1.0557156580211335, "grad_norm": 1.140625, "learning_rate": 0.0004896555595918001, "loss": 5.0424, "mean_token_accuracy": 0.20741064995527267, "num_tokens": 25186585.0, "step": 10990 }, { "entropy": 5.254479122161865, "epoch": 1.0561959654178674, "grad_norm": 1.4609375, "learning_rate": 0.0004896452813596281, "loss": 5.064, "mean_token_accuracy": 0.20580837428569793, "num_tokens": 25199014.0, "step": 10995 }, { "entropy": 5.286199140548706, "epoch": 1.0566762728146013, "grad_norm": 1.2578125, "learning_rate": 0.0004896349981440301, "loss": 5.1538, "mean_token_accuracy": 0.20260929614305495, "num_tokens": 25210544.0, "step": 11000 }, { "entropy": 5.2207067012786865, "epoch": 1.0571565802113352, "grad_norm": 1.25, "learning_rate": 0.0004896247099452447, "loss": 5.025, "mean_token_accuracy": 0.21664920300245286, "num_tokens": 25221583.0, "step": 11005 }, { "entropy": 5.203857946395874, "epoch": 1.057636887608069, "grad_norm": 1.2109375, "learning_rate": 0.0004896144167635108, "loss": 5.0237, "mean_token_accuracy": 0.21649594753980636, "num_tokens": 25231724.0, "step": 11010 }, { "entropy": 5.265408086776733, "epoch": 1.0581171950048032, "grad_norm": 1.140625, "learning_rate": 0.0004896041185990675, "loss": 5.1366, "mean_token_accuracy": 0.2034787967801094, "num_tokens": 25243021.0, "step": 11015 }, { "entropy": 5.302087926864624, "epoch": 1.058597502401537, "grad_norm": 1.25, "learning_rate": 0.0004895938154521538, "loss": 5.0813, "mean_token_accuracy": 0.20792468786239623, "num_tokens": 25254189.0, "step": 11020 }, { "entropy": 5.238738918304444, "epoch": 1.059077809798271, "grad_norm": 1.4140625, "learning_rate": 0.0004895835073230089, "loss": 5.1264, "mean_token_accuracy": 0.20500208884477616, "num_tokens": 25265556.0, "step": 11025 }, { "entropy": 5.12807183265686, "epoch": 1.0595581171950048, "grad_norm": 1.1640625, "learning_rate": 0.0004895731942118722, "loss": 4.9421, "mean_token_accuracy": 0.2060550183057785, "num_tokens": 25276789.0, "step": 11030 }, { "entropy": 5.241643381118775, "epoch": 1.0600384245917387, "grad_norm": 1.2734375, "learning_rate": 0.0004895628761189829, "loss": 5.1203, "mean_token_accuracy": 0.2057103246450424, "num_tokens": 25288505.0, "step": 11035 }, { "entropy": 5.179819774627686, "epoch": 1.0605187319884726, "grad_norm": 1.25, "learning_rate": 0.0004895525530445809, "loss": 4.9773, "mean_token_accuracy": 0.22087481170892714, "num_tokens": 25301490.0, "step": 11040 }, { "entropy": 5.209347820281982, "epoch": 1.0609990393852065, "grad_norm": 1.1953125, "learning_rate": 0.0004895422249889057, "loss": 5.0721, "mean_token_accuracy": 0.20252202302217484, "num_tokens": 25313303.0, "step": 11045 }, { "entropy": 5.27926664352417, "epoch": 1.0614793467819403, "grad_norm": 1.390625, "learning_rate": 0.0004895318919521971, "loss": 5.1309, "mean_token_accuracy": 0.19722591042518617, "num_tokens": 25324379.0, "step": 11050 }, { "entropy": 5.209228229522705, "epoch": 1.0619596541786744, "grad_norm": 1.28125, "learning_rate": 0.0004895215539346949, "loss": 4.9702, "mean_token_accuracy": 0.21044884771108627, "num_tokens": 25335834.0, "step": 11055 }, { "entropy": 5.195343494415283, "epoch": 1.0624399615754083, "grad_norm": 1.1875, "learning_rate": 0.0004895112109366393, "loss": 5.0312, "mean_token_accuracy": 0.2079668939113617, "num_tokens": 25347591.0, "step": 11060 }, { "entropy": 5.194220972061157, "epoch": 1.0629202689721422, "grad_norm": 1.203125, "learning_rate": 0.0004895008629582703, "loss": 5.0066, "mean_token_accuracy": 0.21342374235391617, "num_tokens": 25358483.0, "step": 11065 }, { "entropy": 5.19577956199646, "epoch": 1.063400576368876, "grad_norm": 1.1953125, "learning_rate": 0.0004894905099998283, "loss": 5.0158, "mean_token_accuracy": 0.20696393847465516, "num_tokens": 25369434.0, "step": 11070 }, { "entropy": 5.223496198654175, "epoch": 1.06388088376561, "grad_norm": 1.125, "learning_rate": 0.0004894801520615535, "loss": 5.0318, "mean_token_accuracy": 0.21212296783924103, "num_tokens": 25381007.0, "step": 11075 }, { "entropy": 5.193694734573365, "epoch": 1.0643611911623438, "grad_norm": 1.1796875, "learning_rate": 0.0004894697891436863, "loss": 5.0148, "mean_token_accuracy": 0.20833683609962464, "num_tokens": 25393809.0, "step": 11080 }, { "entropy": 5.198791122436523, "epoch": 1.0648414985590777, "grad_norm": 1.1953125, "learning_rate": 0.0004894594212464676, "loss": 5.0451, "mean_token_accuracy": 0.21562531143426894, "num_tokens": 25404967.0, "step": 11085 }, { "entropy": 5.2563148021698, "epoch": 1.0653218059558118, "grad_norm": 1.265625, "learning_rate": 0.0004894490483701381, "loss": 5.0122, "mean_token_accuracy": 0.21502433270215987, "num_tokens": 25417092.0, "step": 11090 }, { "entropy": 5.2914710521698, "epoch": 1.0658021133525457, "grad_norm": 1.203125, "learning_rate": 0.0004894386705149382, "loss": 5.1036, "mean_token_accuracy": 0.2005739152431488, "num_tokens": 25428425.0, "step": 11095 }, { "entropy": 5.151738977432251, "epoch": 1.0662824207492796, "grad_norm": 1.203125, "learning_rate": 0.0004894282876811093, "loss": 4.9347, "mean_token_accuracy": 0.21947899460792542, "num_tokens": 25440134.0, "step": 11100 }, { "entropy": 5.143119049072266, "epoch": 1.0667627281460135, "grad_norm": 1.21875, "learning_rate": 0.0004894178998688921, "loss": 5.0003, "mean_token_accuracy": 0.21364154070615768, "num_tokens": 25452222.0, "step": 11105 }, { "entropy": 5.19956374168396, "epoch": 1.0672430355427474, "grad_norm": 1.40625, "learning_rate": 0.0004894075070785281, "loss": 5.0462, "mean_token_accuracy": 0.2120614990592003, "num_tokens": 25464541.0, "step": 11110 }, { "entropy": 5.293523740768433, "epoch": 1.0677233429394812, "grad_norm": 1.453125, "learning_rate": 0.0004893971093102585, "loss": 5.0531, "mean_token_accuracy": 0.19972920715808867, "num_tokens": 25476537.0, "step": 11115 }, { "entropy": 5.360321044921875, "epoch": 1.0682036503362151, "grad_norm": 1.2734375, "learning_rate": 0.0004893867065643245, "loss": 5.1091, "mean_token_accuracy": 0.20334839224815368, "num_tokens": 25486737.0, "step": 11120 }, { "entropy": 5.192300510406494, "epoch": 1.068683957732949, "grad_norm": 1.3046875, "learning_rate": 0.0004893762988409678, "loss": 5.0534, "mean_token_accuracy": 0.20364596098661422, "num_tokens": 25497278.0, "step": 11125 }, { "entropy": 5.208005428314209, "epoch": 1.069164265129683, "grad_norm": 1.4765625, "learning_rate": 0.0004893658861404301, "loss": 5.0304, "mean_token_accuracy": 0.2047014966607094, "num_tokens": 25508716.0, "step": 11130 }, { "entropy": 5.282382202148438, "epoch": 1.069644572526417, "grad_norm": 1.2421875, "learning_rate": 0.0004893554684629529, "loss": 5.1053, "mean_token_accuracy": 0.20216587483882903, "num_tokens": 25519439.0, "step": 11135 }, { "entropy": 5.184592771530151, "epoch": 1.0701248799231509, "grad_norm": 1.921875, "learning_rate": 0.0004893450458087784, "loss": 4.9136, "mean_token_accuracy": 0.2212449848651886, "num_tokens": 25530911.0, "step": 11140 }, { "entropy": 5.250302124023437, "epoch": 1.0706051873198847, "grad_norm": 1.265625, "learning_rate": 0.0004893346181781483, "loss": 5.149, "mean_token_accuracy": 0.1957184687256813, "num_tokens": 25542452.0, "step": 11145 }, { "entropy": 5.269041585922241, "epoch": 1.0710854947166186, "grad_norm": 1.25, "learning_rate": 0.0004893241855713048, "loss": 5.0786, "mean_token_accuracy": 0.19805409461259843, "num_tokens": 25554105.0, "step": 11150 }, { "entropy": 5.312788391113282, "epoch": 1.0715658021133525, "grad_norm": 1.125, "learning_rate": 0.0004893137479884903, "loss": 5.1134, "mean_token_accuracy": 0.20513837188482284, "num_tokens": 25564806.0, "step": 11155 }, { "entropy": 5.186615085601806, "epoch": 1.0720461095100864, "grad_norm": 1.109375, "learning_rate": 0.0004893033054299468, "loss": 5.0038, "mean_token_accuracy": 0.21586932092905045, "num_tokens": 25575664.0, "step": 11160 }, { "entropy": 5.158471345901489, "epoch": 1.0725264169068203, "grad_norm": 1.2890625, "learning_rate": 0.000489292857895917, "loss": 4.9254, "mean_token_accuracy": 0.21706438809633255, "num_tokens": 25586227.0, "step": 11165 }, { "entropy": 5.24803352355957, "epoch": 1.0730067243035544, "grad_norm": 1.28125, "learning_rate": 0.0004892824053866432, "loss": 5.1114, "mean_token_accuracy": 0.20555976331233977, "num_tokens": 25597475.0, "step": 11170 }, { "entropy": 5.169841670989991, "epoch": 1.0734870317002883, "grad_norm": 1.2421875, "learning_rate": 0.0004892719479023683, "loss": 4.9757, "mean_token_accuracy": 0.220069320499897, "num_tokens": 25608098.0, "step": 11175 }, { "entropy": 5.268892574310303, "epoch": 1.0739673390970221, "grad_norm": 1.2421875, "learning_rate": 0.000489261485443335, "loss": 5.101, "mean_token_accuracy": 0.20311392694711686, "num_tokens": 25620053.0, "step": 11180 }, { "entropy": 5.293476009368897, "epoch": 1.074447646493756, "grad_norm": 1.4140625, "learning_rate": 0.0004892510180097863, "loss": 5.0365, "mean_token_accuracy": 0.2041410133242607, "num_tokens": 25630534.0, "step": 11185 }, { "entropy": 5.236781454086303, "epoch": 1.07492795389049, "grad_norm": 1.203125, "learning_rate": 0.0004892405456019651, "loss": 5.0553, "mean_token_accuracy": 0.20958582758903505, "num_tokens": 25641413.0, "step": 11190 }, { "entropy": 5.12468638420105, "epoch": 1.0754082612872238, "grad_norm": 1.3125, "learning_rate": 0.0004892300682201147, "loss": 4.9719, "mean_token_accuracy": 0.21782579123973847, "num_tokens": 25652081.0, "step": 11195 }, { "entropy": 5.232947635650635, "epoch": 1.0758885686839577, "grad_norm": 1.1015625, "learning_rate": 0.0004892195858644782, "loss": 5.0749, "mean_token_accuracy": 0.21364531815052032, "num_tokens": 25664282.0, "step": 11200 }, { "entropy": 5.29130368232727, "epoch": 1.0763688760806915, "grad_norm": 1.1796875, "learning_rate": 0.000489209098535299, "loss": 5.1488, "mean_token_accuracy": 0.2104579210281372, "num_tokens": 25675310.0, "step": 11205 }, { "entropy": 5.364002227783203, "epoch": 1.0768491834774256, "grad_norm": 1.2265625, "learning_rate": 0.0004891986062328205, "loss": 5.125, "mean_token_accuracy": 0.21008958518505097, "num_tokens": 25686895.0, "step": 11210 }, { "entropy": 5.1909034729003904, "epoch": 1.0773294908741595, "grad_norm": 1.25, "learning_rate": 0.0004891881089572865, "loss": 4.9291, "mean_token_accuracy": 0.21216631978750228, "num_tokens": 25697778.0, "step": 11215 }, { "entropy": 5.194598817825318, "epoch": 1.0778097982708934, "grad_norm": 1.265625, "learning_rate": 0.0004891776067089406, "loss": 5.0898, "mean_token_accuracy": 0.20165782868862153, "num_tokens": 25708602.0, "step": 11220 }, { "entropy": 5.248305320739746, "epoch": 1.0782901056676273, "grad_norm": 1.2109375, "learning_rate": 0.0004891670994880266, "loss": 4.9873, "mean_token_accuracy": 0.21111140102148057, "num_tokens": 25719671.0, "step": 11225 }, { "entropy": 5.291704702377319, "epoch": 1.0787704130643612, "grad_norm": 1.2578125, "learning_rate": 0.0004891565872947888, "loss": 5.1287, "mean_token_accuracy": 0.2033605992794037, "num_tokens": 25731797.0, "step": 11230 }, { "entropy": 5.131606006622315, "epoch": 1.079250720461095, "grad_norm": 1.234375, "learning_rate": 0.0004891460701294706, "loss": 4.9989, "mean_token_accuracy": 0.2117511048913002, "num_tokens": 25743984.0, "step": 11235 }, { "entropy": 5.14411768913269, "epoch": 1.079731027857829, "grad_norm": 1.234375, "learning_rate": 0.0004891355479923167, "loss": 5.0089, "mean_token_accuracy": 0.21176680326461791, "num_tokens": 25755252.0, "step": 11240 }, { "entropy": 5.19481086730957, "epoch": 1.080211335254563, "grad_norm": 1.359375, "learning_rate": 0.0004891250208835712, "loss": 4.9358, "mean_token_accuracy": 0.21151957362890245, "num_tokens": 25765715.0, "step": 11245 }, { "entropy": 5.282035970687867, "epoch": 1.080691642651297, "grad_norm": 1.6484375, "learning_rate": 0.0004891144888034784, "loss": 5.062, "mean_token_accuracy": 0.2050844192504883, "num_tokens": 25777866.0, "step": 11250 }, { "entropy": 5.208021640777588, "epoch": 1.0811719500480308, "grad_norm": 1.28125, "learning_rate": 0.0004891039517522832, "loss": 4.9757, "mean_token_accuracy": 0.21298900246620178, "num_tokens": 25791199.0, "step": 11255 }, { "entropy": 5.092755365371704, "epoch": 1.0816522574447647, "grad_norm": 1.1953125, "learning_rate": 0.0004890934097302299, "loss": 4.9385, "mean_token_accuracy": 0.2256488636136055, "num_tokens": 25802979.0, "step": 11260 }, { "entropy": 5.082042789459228, "epoch": 1.0821325648414986, "grad_norm": 1.140625, "learning_rate": 0.0004890828627375632, "loss": 4.9459, "mean_token_accuracy": 0.2113230675458908, "num_tokens": 25814696.0, "step": 11265 }, { "entropy": 5.222199535369873, "epoch": 1.0826128722382324, "grad_norm": 1.1875, "learning_rate": 0.0004890723107745283, "loss": 5.0634, "mean_token_accuracy": 0.20086456686258317, "num_tokens": 25825376.0, "step": 11270 }, { "entropy": 5.239181756973267, "epoch": 1.0830931796349663, "grad_norm": 1.2734375, "learning_rate": 0.0004890617538413699, "loss": 5.0172, "mean_token_accuracy": 0.21430771350860595, "num_tokens": 25835491.0, "step": 11275 }, { "entropy": 5.273404932022094, "epoch": 1.0835734870317002, "grad_norm": 1.53125, "learning_rate": 0.0004890511919383333, "loss": 5.0797, "mean_token_accuracy": 0.20752860009670257, "num_tokens": 25848154.0, "step": 11280 }, { "entropy": 5.189536762237549, "epoch": 1.084053794428434, "grad_norm": 1.3359375, "learning_rate": 0.0004890406250656636, "loss": 5.0563, "mean_token_accuracy": 0.20726050287485123, "num_tokens": 25859471.0, "step": 11285 }, { "entropy": 5.1854105472564695, "epoch": 1.0845341018251682, "grad_norm": 1.296875, "learning_rate": 0.0004890300532236062, "loss": 4.9667, "mean_token_accuracy": 0.21461206972599028, "num_tokens": 25869460.0, "step": 11290 }, { "entropy": 5.1921216487884525, "epoch": 1.085014409221902, "grad_norm": 1.15625, "learning_rate": 0.0004890194764124064, "loss": 4.9847, "mean_token_accuracy": 0.21348736435174942, "num_tokens": 25881892.0, "step": 11295 }, { "entropy": 5.274989652633667, "epoch": 1.085494716618636, "grad_norm": 1.21875, "learning_rate": 0.0004890088946323099, "loss": 5.1136, "mean_token_accuracy": 0.20064806640148164, "num_tokens": 25893774.0, "step": 11300 }, { "entropy": 5.141970014572143, "epoch": 1.0859750240153698, "grad_norm": 1.375, "learning_rate": 0.0004889983078835623, "loss": 4.924, "mean_token_accuracy": 0.21487925201654434, "num_tokens": 25904758.0, "step": 11305 }, { "entropy": 5.145872449874878, "epoch": 1.0864553314121037, "grad_norm": 1.2578125, "learning_rate": 0.0004889877161664096, "loss": 5.0486, "mean_token_accuracy": 0.20480419993400573, "num_tokens": 25917013.0, "step": 11310 }, { "entropy": 5.349794626235962, "epoch": 1.0869356388088376, "grad_norm": 1.2578125, "learning_rate": 0.0004889771194810974, "loss": 5.1048, "mean_token_accuracy": 0.2101388841867447, "num_tokens": 25927780.0, "step": 11315 }, { "entropy": 5.162079620361328, "epoch": 1.0874159462055715, "grad_norm": 1.5546875, "learning_rate": 0.0004889665178278719, "loss": 4.99, "mean_token_accuracy": 0.21398296654224397, "num_tokens": 25939339.0, "step": 11320 }, { "entropy": 5.184692430496216, "epoch": 1.0878962536023056, "grad_norm": 1.3515625, "learning_rate": 0.0004889559112069792, "loss": 4.9803, "mean_token_accuracy": 0.21223179250955582, "num_tokens": 25950440.0, "step": 11325 }, { "entropy": 5.23531174659729, "epoch": 1.0883765609990395, "grad_norm": 1.1953125, "learning_rate": 0.0004889452996186657, "loss": 5.0247, "mean_token_accuracy": 0.20834243446588516, "num_tokens": 25962849.0, "step": 11330 }, { "entropy": 5.221828603744507, "epoch": 1.0888568683957733, "grad_norm": 1.296875, "learning_rate": 0.0004889346830631774, "loss": 5.0695, "mean_token_accuracy": 0.20499206930398942, "num_tokens": 25973616.0, "step": 11335 }, { "entropy": 5.175124979019165, "epoch": 1.0893371757925072, "grad_norm": 1.2421875, "learning_rate": 0.000488924061540761, "loss": 4.9543, "mean_token_accuracy": 0.21570177525281906, "num_tokens": 25984727.0, "step": 11340 }, { "entropy": 5.218035411834717, "epoch": 1.089817483189241, "grad_norm": 1.3203125, "learning_rate": 0.0004889134350516633, "loss": 5.069, "mean_token_accuracy": 0.20992496013641357, "num_tokens": 25996431.0, "step": 11345 }, { "entropy": 5.199566984176636, "epoch": 1.090297790585975, "grad_norm": 1.140625, "learning_rate": 0.0004889028035961308, "loss": 5.0212, "mean_token_accuracy": 0.20875319093465805, "num_tokens": 26008936.0, "step": 11350 }, { "entropy": 5.28378643989563, "epoch": 1.0907780979827089, "grad_norm": 1.25, "learning_rate": 0.0004888921671744103, "loss": 5.0843, "mean_token_accuracy": 0.20148587226867676, "num_tokens": 26019308.0, "step": 11355 }, { "entropy": 5.234175491333008, "epoch": 1.0912584053794427, "grad_norm": 1.328125, "learning_rate": 0.0004888815257867488, "loss": 4.9794, "mean_token_accuracy": 0.21648937463760376, "num_tokens": 26030705.0, "step": 11360 }, { "entropy": 5.227200984954834, "epoch": 1.0917387127761768, "grad_norm": 1.1484375, "learning_rate": 0.0004888708794333934, "loss": 5.0079, "mean_token_accuracy": 0.21071529090404512, "num_tokens": 26042759.0, "step": 11365 }, { "entropy": 5.233187103271485, "epoch": 1.0922190201729107, "grad_norm": 1.2421875, "learning_rate": 0.0004888602281145913, "loss": 5.0673, "mean_token_accuracy": 0.20930221676826477, "num_tokens": 26054719.0, "step": 11370 }, { "entropy": 5.210502481460571, "epoch": 1.0926993275696446, "grad_norm": 1.2890625, "learning_rate": 0.0004888495718305897, "loss": 5.0531, "mean_token_accuracy": 0.20732715278863906, "num_tokens": 26065765.0, "step": 11375 }, { "entropy": 5.092276668548584, "epoch": 1.0931796349663785, "grad_norm": 1.1875, "learning_rate": 0.000488838910581636, "loss": 4.9681, "mean_token_accuracy": 0.21723177582025527, "num_tokens": 26077719.0, "step": 11380 }, { "entropy": 5.271825551986694, "epoch": 1.0936599423631124, "grad_norm": 1.3984375, "learning_rate": 0.0004888282443679777, "loss": 5.1021, "mean_token_accuracy": 0.1978613868355751, "num_tokens": 26089924.0, "step": 11385 }, { "entropy": 5.389468097686768, "epoch": 1.0941402497598463, "grad_norm": 1.3671875, "learning_rate": 0.0004888175731898627, "loss": 5.1585, "mean_token_accuracy": 0.20117444396018982, "num_tokens": 26100312.0, "step": 11390 }, { "entropy": 5.196537494659424, "epoch": 1.0946205571565801, "grad_norm": 1.2578125, "learning_rate": 0.0004888068970475384, "loss": 5.0671, "mean_token_accuracy": 0.21175539195537568, "num_tokens": 26111932.0, "step": 11395 }, { "entropy": 5.233985948562622, "epoch": 1.0951008645533142, "grad_norm": 1.1484375, "learning_rate": 0.0004887962159412529, "loss": 4.9669, "mean_token_accuracy": 0.2145277202129364, "num_tokens": 26123989.0, "step": 11400 }, { "entropy": 5.24011116027832, "epoch": 1.0955811719500481, "grad_norm": 1.3359375, "learning_rate": 0.0004887855298712541, "loss": 5.0822, "mean_token_accuracy": 0.2078133523464203, "num_tokens": 26135589.0, "step": 11405 }, { "entropy": 5.1291491985321045, "epoch": 1.096061479346782, "grad_norm": 1.2578125, "learning_rate": 0.00048877483883779, "loss": 4.9785, "mean_token_accuracy": 0.21442267745733262, "num_tokens": 26147069.0, "step": 11410 }, { "entropy": 5.19802622795105, "epoch": 1.0965417867435159, "grad_norm": 1.1875, "learning_rate": 0.0004887641428411091, "loss": 5.0536, "mean_token_accuracy": 0.2031223937869072, "num_tokens": 26159331.0, "step": 11415 }, { "entropy": 5.239735078811646, "epoch": 1.0970220941402498, "grad_norm": 1.9140625, "learning_rate": 0.0004887534418814595, "loss": 5.0489, "mean_token_accuracy": 0.21215286552906037, "num_tokens": 26169863.0, "step": 11420 }, { "entropy": 5.201609802246094, "epoch": 1.0975024015369836, "grad_norm": 1.3515625, "learning_rate": 0.0004887427359590897, "loss": 5.018, "mean_token_accuracy": 0.20545354038476943, "num_tokens": 26182888.0, "step": 11425 }, { "entropy": 5.196053218841553, "epoch": 1.0979827089337175, "grad_norm": 1.3046875, "learning_rate": 0.0004887320250742482, "loss": 5.0074, "mean_token_accuracy": 0.21169717162847518, "num_tokens": 26194979.0, "step": 11430 }, { "entropy": 5.163520240783692, "epoch": 1.0984630163304514, "grad_norm": 1.3984375, "learning_rate": 0.0004887213092271838, "loss": 5.0171, "mean_token_accuracy": 0.218392214179039, "num_tokens": 26207309.0, "step": 11435 }, { "entropy": 5.180823183059692, "epoch": 1.0989433237271853, "grad_norm": 1.265625, "learning_rate": 0.0004887105884181451, "loss": 5.0562, "mean_token_accuracy": 0.20584176182746888, "num_tokens": 26219231.0, "step": 11440 }, { "entropy": 5.28138575553894, "epoch": 1.0994236311239194, "grad_norm": 1.9765625, "learning_rate": 0.0004886998626473813, "loss": 5.0914, "mean_token_accuracy": 0.2082364484667778, "num_tokens": 26229355.0, "step": 11445 }, { "entropy": 5.259874248504639, "epoch": 1.0999039385206533, "grad_norm": 1.25, "learning_rate": 0.0004886891319151411, "loss": 4.9917, "mean_token_accuracy": 0.21067868769168854, "num_tokens": 26239069.0, "step": 11450 }, { "entropy": 5.232890796661377, "epoch": 1.1003842459173871, "grad_norm": 1.21875, "learning_rate": 0.0004886783962216738, "loss": 5.1051, "mean_token_accuracy": 0.20159524232149123, "num_tokens": 26250403.0, "step": 11455 }, { "entropy": 5.2542445182800295, "epoch": 1.100864553314121, "grad_norm": 1.21875, "learning_rate": 0.0004886676555672287, "loss": 5.0155, "mean_token_accuracy": 0.20787968933582307, "num_tokens": 26262926.0, "step": 11460 }, { "entropy": 5.3055487155914305, "epoch": 1.101344860710855, "grad_norm": 1.3515625, "learning_rate": 0.0004886569099520551, "loss": 5.1355, "mean_token_accuracy": 0.2017137423157692, "num_tokens": 26274030.0, "step": 11465 }, { "entropy": 5.146465301513672, "epoch": 1.1018251681075888, "grad_norm": 1.3828125, "learning_rate": 0.0004886461593764024, "loss": 5.0465, "mean_token_accuracy": 0.21580926030874253, "num_tokens": 26284799.0, "step": 11470 }, { "entropy": 5.112147951126099, "epoch": 1.1023054755043227, "grad_norm": 1.4140625, "learning_rate": 0.0004886354038405204, "loss": 4.9238, "mean_token_accuracy": 0.22144615203142165, "num_tokens": 26295154.0, "step": 11475 }, { "entropy": 5.2879190921783445, "epoch": 1.1027857829010568, "grad_norm": 1.296875, "learning_rate": 0.0004886246433446586, "loss": 5.0586, "mean_token_accuracy": 0.20436291843652726, "num_tokens": 26306181.0, "step": 11480 }, { "entropy": 5.296609544754029, "epoch": 1.1032660902977907, "grad_norm": 1.390625, "learning_rate": 0.0004886138778890669, "loss": 5.1495, "mean_token_accuracy": 0.19853242188692094, "num_tokens": 26318674.0, "step": 11485 }, { "entropy": 5.262785196304321, "epoch": 1.1037463976945245, "grad_norm": 1.25, "learning_rate": 0.0004886031074739953, "loss": 5.0698, "mean_token_accuracy": 0.2011367380619049, "num_tokens": 26330257.0, "step": 11490 }, { "entropy": 5.268162727355957, "epoch": 1.1042267050912584, "grad_norm": 1.1953125, "learning_rate": 0.0004885923320996938, "loss": 5.0199, "mean_token_accuracy": 0.20887107402086258, "num_tokens": 26342035.0, "step": 11495 }, { "entropy": 5.2100663661956785, "epoch": 1.1047070124879923, "grad_norm": 1.28125, "learning_rate": 0.0004885815517664127, "loss": 5.0453, "mean_token_accuracy": 0.20797477215528487, "num_tokens": 26352703.0, "step": 11500 }, { "entropy": 5.280159616470337, "epoch": 1.1051873198847262, "grad_norm": 1.3046875, "learning_rate": 0.000488570766474402, "loss": 5.0525, "mean_token_accuracy": 0.2123723268508911, "num_tokens": 26362324.0, "step": 11505 }, { "entropy": 5.240962123870849, "epoch": 1.10566762728146, "grad_norm": 1.3671875, "learning_rate": 0.0004885599762239124, "loss": 5.0814, "mean_token_accuracy": 0.20114895701408386, "num_tokens": 26373540.0, "step": 11510 }, { "entropy": 5.198315954208374, "epoch": 1.106147934678194, "grad_norm": 1.3515625, "learning_rate": 0.0004885491810151943, "loss": 5.0311, "mean_token_accuracy": 0.21315819025039673, "num_tokens": 26384259.0, "step": 11515 }, { "entropy": 5.164407539367676, "epoch": 1.106628242074928, "grad_norm": 1.453125, "learning_rate": 0.0004885383808484982, "loss": 5.0093, "mean_token_accuracy": 0.2147809937596321, "num_tokens": 26395193.0, "step": 11520 }, { "entropy": 5.166898584365844, "epoch": 1.107108549471662, "grad_norm": 1.2734375, "learning_rate": 0.0004885275757240751, "loss": 4.9888, "mean_token_accuracy": 0.21306061148643493, "num_tokens": 26408556.0, "step": 11525 }, { "entropy": 5.181211996078491, "epoch": 1.1075888568683958, "grad_norm": 1.25, "learning_rate": 0.0004885167656421757, "loss": 5.0399, "mean_token_accuracy": 0.21158163398504257, "num_tokens": 26420066.0, "step": 11530 }, { "entropy": 5.251511573791504, "epoch": 1.1080691642651297, "grad_norm": 1.34375, "learning_rate": 0.000488505950603051, "loss": 5.0044, "mean_token_accuracy": 0.21201496720314025, "num_tokens": 26432533.0, "step": 11535 }, { "entropy": 5.220172452926636, "epoch": 1.1085494716618636, "grad_norm": 1.4296875, "learning_rate": 0.000488495130606952, "loss": 5.0101, "mean_token_accuracy": 0.20595130324363708, "num_tokens": 26443878.0, "step": 11540 }, { "entropy": 5.173053646087647, "epoch": 1.1090297790585975, "grad_norm": 1.3203125, "learning_rate": 0.0004884843056541302, "loss": 5.0053, "mean_token_accuracy": 0.20983056724071503, "num_tokens": 26455111.0, "step": 11545 }, { "entropy": 5.2253295421600345, "epoch": 1.1095100864553313, "grad_norm": 1.28125, "learning_rate": 0.0004884734757448367, "loss": 5.0401, "mean_token_accuracy": 0.2117287129163742, "num_tokens": 26466577.0, "step": 11550 }, { "entropy": 5.212938976287842, "epoch": 1.1099903938520654, "grad_norm": 1.2578125, "learning_rate": 0.000488462640879323, "loss": 4.9652, "mean_token_accuracy": 0.21090029329061508, "num_tokens": 26479637.0, "step": 11555 }, { "entropy": 5.146299743652344, "epoch": 1.1104707012487993, "grad_norm": 1.2421875, "learning_rate": 0.0004884518010578405, "loss": 5.0053, "mean_token_accuracy": 0.2093895897269249, "num_tokens": 26489923.0, "step": 11560 }, { "entropy": 5.262247848510742, "epoch": 1.1109510086455332, "grad_norm": 1.390625, "learning_rate": 0.0004884409562806411, "loss": 5.107, "mean_token_accuracy": 0.2048266798257828, "num_tokens": 26501236.0, "step": 11565 }, { "entropy": 5.210489082336426, "epoch": 1.111431316042267, "grad_norm": 1.1953125, "learning_rate": 0.0004884301065479765, "loss": 4.9631, "mean_token_accuracy": 0.2114759638905525, "num_tokens": 26511509.0, "step": 11570 }, { "entropy": 5.148941612243652, "epoch": 1.111911623439001, "grad_norm": 1.203125, "learning_rate": 0.0004884192518600986, "loss": 4.9407, "mean_token_accuracy": 0.2189345121383667, "num_tokens": 26520931.0, "step": 11575 }, { "entropy": 5.10898380279541, "epoch": 1.1123919308357348, "grad_norm": 1.3046875, "learning_rate": 0.0004884083922172593, "loss": 4.9487, "mean_token_accuracy": 0.2212027356028557, "num_tokens": 26531333.0, "step": 11580 }, { "entropy": 5.150489377975464, "epoch": 1.1128722382324687, "grad_norm": 1.2890625, "learning_rate": 0.0004883975276197108, "loss": 5.0245, "mean_token_accuracy": 0.2141410857439041, "num_tokens": 26543696.0, "step": 11585 }, { "entropy": 5.21362886428833, "epoch": 1.1133525456292026, "grad_norm": 1.328125, "learning_rate": 0.0004883866580677055, "loss": 4.9893, "mean_token_accuracy": 0.21139197200536727, "num_tokens": 26556292.0, "step": 11590 }, { "entropy": 5.244434595108032, "epoch": 1.1138328530259365, "grad_norm": 1.3984375, "learning_rate": 0.0004883757835614956, "loss": 5.0254, "mean_token_accuracy": 0.2002588540315628, "num_tokens": 26568023.0, "step": 11595 }, { "entropy": 5.1676887512207035, "epoch": 1.1143131604226706, "grad_norm": 1.40625, "learning_rate": 0.0004883649041013335, "loss": 5.0, "mean_token_accuracy": 0.20850686728954315, "num_tokens": 26579469.0, "step": 11600 }, { "entropy": 5.249563598632813, "epoch": 1.1147934678194045, "grad_norm": 1.453125, "learning_rate": 0.000488354019687472, "loss": 5.0175, "mean_token_accuracy": 0.20751053243875503, "num_tokens": 26591592.0, "step": 11605 }, { "entropy": 5.24787974357605, "epoch": 1.1152737752161384, "grad_norm": 1.1328125, "learning_rate": 0.0004883431303201636, "loss": 5.0213, "mean_token_accuracy": 0.2119702085852623, "num_tokens": 26604075.0, "step": 11610 }, { "entropy": 5.092301654815674, "epoch": 1.1157540826128722, "grad_norm": 1.296875, "learning_rate": 0.0004883322359996613, "loss": 4.9402, "mean_token_accuracy": 0.21203264445066453, "num_tokens": 26616448.0, "step": 11615 }, { "entropy": 5.232432794570923, "epoch": 1.1162343900096061, "grad_norm": 1.234375, "learning_rate": 0.0004883213367262179, "loss": 5.014, "mean_token_accuracy": 0.21606809943914412, "num_tokens": 26628749.0, "step": 11620 }, { "entropy": 5.1754385471344, "epoch": 1.11671469740634, "grad_norm": 1.4296875, "learning_rate": 0.0004883104325000866, "loss": 4.9661, "mean_token_accuracy": 0.21770241409540175, "num_tokens": 26640163.0, "step": 11625 }, { "entropy": 5.214389657974243, "epoch": 1.1171950048030739, "grad_norm": 1.265625, "learning_rate": 0.0004882995233215203, "loss": 5.0279, "mean_token_accuracy": 0.21030279397964477, "num_tokens": 26650729.0, "step": 11630 }, { "entropy": 5.1765196323394775, "epoch": 1.117675312199808, "grad_norm": 1.15625, "learning_rate": 0.0004882886091907726, "loss": 5.0681, "mean_token_accuracy": 0.20886261761188507, "num_tokens": 26661995.0, "step": 11635 }, { "entropy": 5.166051578521729, "epoch": 1.1181556195965419, "grad_norm": 1.1875, "learning_rate": 0.00048827769010809666, "loss": 4.9625, "mean_token_accuracy": 0.21373932361602782, "num_tokens": 26673224.0, "step": 11640 }, { "entropy": 5.2252014636993405, "epoch": 1.1186359269932757, "grad_norm": 1.2421875, "learning_rate": 0.00048826676607374606, "loss": 4.9973, "mean_token_accuracy": 0.20560778081417083, "num_tokens": 26686331.0, "step": 11645 }, { "entropy": 5.253619718551636, "epoch": 1.1191162343900096, "grad_norm": 1.296875, "learning_rate": 0.00048825583708797434, "loss": 5.0623, "mean_token_accuracy": 0.2126183569431305, "num_tokens": 26696816.0, "step": 11650 }, { "entropy": 5.149569368362426, "epoch": 1.1195965417867435, "grad_norm": 1.359375, "learning_rate": 0.0004882449031510354, "loss": 5.0084, "mean_token_accuracy": 0.21252903193235398, "num_tokens": 26708126.0, "step": 11655 }, { "entropy": 5.2515003204345705, "epoch": 1.1200768491834774, "grad_norm": 1.15625, "learning_rate": 0.000488233964263183, "loss": 5.1267, "mean_token_accuracy": 0.20479959100484849, "num_tokens": 26718951.0, "step": 11660 }, { "entropy": 5.24708366394043, "epoch": 1.1205571565802113, "grad_norm": 1.5234375, "learning_rate": 0.00048822302042467115, "loss": 5.0769, "mean_token_accuracy": 0.20536175221204758, "num_tokens": 26730550.0, "step": 11665 }, { "entropy": 5.11295714378357, "epoch": 1.1210374639769451, "grad_norm": 1.28125, "learning_rate": 0.0004882120716357539, "loss": 4.9113, "mean_token_accuracy": 0.21777141392230986, "num_tokens": 26741485.0, "step": 11670 }, { "entropy": 5.075635814666748, "epoch": 1.1215177713736793, "grad_norm": 1.2265625, "learning_rate": 0.0004882011178966854, "loss": 4.9079, "mean_token_accuracy": 0.2191823497414589, "num_tokens": 26753947.0, "step": 11675 }, { "entropy": 5.206958436965943, "epoch": 1.1219980787704131, "grad_norm": 1.2265625, "learning_rate": 0.0004881901592077201, "loss": 4.9939, "mean_token_accuracy": 0.2158915787935257, "num_tokens": 26764921.0, "step": 11680 }, { "entropy": 5.221724176406861, "epoch": 1.122478386167147, "grad_norm": 1.1875, "learning_rate": 0.0004881791955691123, "loss": 4.9439, "mean_token_accuracy": 0.21361148059368135, "num_tokens": 26777384.0, "step": 11685 }, { "entropy": 5.2075098037719725, "epoch": 1.122958693563881, "grad_norm": 1.296875, "learning_rate": 0.00048816822698111655, "loss": 5.07, "mean_token_accuracy": 0.20504006147384643, "num_tokens": 26789916.0, "step": 11690 }, { "entropy": 5.194676733016967, "epoch": 1.1234390009606148, "grad_norm": 1.5546875, "learning_rate": 0.00048815725344398766, "loss": 4.9768, "mean_token_accuracy": 0.2126043662428856, "num_tokens": 26801167.0, "step": 11695 }, { "entropy": 5.290380001068115, "epoch": 1.1239193083573487, "grad_norm": 1.2578125, "learning_rate": 0.00048814627495798017, "loss": 5.0949, "mean_token_accuracy": 0.20276835262775422, "num_tokens": 26813235.0, "step": 11700 }, { "entropy": 5.283368635177612, "epoch": 1.1243996157540825, "grad_norm": 1.2109375, "learning_rate": 0.0004881352915233492, "loss": 5.0815, "mean_token_accuracy": 0.2050356462597847, "num_tokens": 26824758.0, "step": 11705 }, { "entropy": 5.222380495071411, "epoch": 1.1248799231508164, "grad_norm": 1.171875, "learning_rate": 0.00048812430314034956, "loss": 5.0459, "mean_token_accuracy": 0.20643949508666992, "num_tokens": 26836877.0, "step": 11710 }, { "entropy": 5.146471929550171, "epoch": 1.1253602305475505, "grad_norm": 1.3828125, "learning_rate": 0.0004881133098092365, "loss": 4.8847, "mean_token_accuracy": 0.22436288893222808, "num_tokens": 26848394.0, "step": 11715 }, { "entropy": 5.169895315170288, "epoch": 1.1258405379442844, "grad_norm": 1.390625, "learning_rate": 0.0004881023115302652, "loss": 5.0248, "mean_token_accuracy": 0.20842950493097306, "num_tokens": 26859064.0, "step": 11720 }, { "entropy": 5.259196662902832, "epoch": 1.1263208453410183, "grad_norm": 1.296875, "learning_rate": 0.000488091308303691, "loss": 5.1436, "mean_token_accuracy": 0.19910948574543, "num_tokens": 26870401.0, "step": 11725 }, { "entropy": 5.256836318969727, "epoch": 1.1268011527377522, "grad_norm": 1.21875, "learning_rate": 0.0004880803001297694, "loss": 5.0458, "mean_token_accuracy": 0.21228888928890227, "num_tokens": 26881767.0, "step": 11730 }, { "entropy": 5.177135944366455, "epoch": 1.127281460134486, "grad_norm": 1.265625, "learning_rate": 0.0004880692870087559, "loss": 4.9447, "mean_token_accuracy": 0.21096309274435043, "num_tokens": 26892854.0, "step": 11735 }, { "entropy": 5.290387105941773, "epoch": 1.12776176753122, "grad_norm": 1.328125, "learning_rate": 0.00048805826894090626, "loss": 5.1022, "mean_token_accuracy": 0.2025263249874115, "num_tokens": 26905079.0, "step": 11740 }, { "entropy": 5.267297887802124, "epoch": 1.1282420749279538, "grad_norm": 1.25, "learning_rate": 0.00048804724592647626, "loss": 5.1096, "mean_token_accuracy": 0.205536325275898, "num_tokens": 26915640.0, "step": 11745 }, { "entropy": 5.2140075206756595, "epoch": 1.1287223823246877, "grad_norm": 1.4375, "learning_rate": 0.0004880362179657218, "loss": 5.0008, "mean_token_accuracy": 0.2080523982644081, "num_tokens": 26927039.0, "step": 11750 }, { "entropy": 5.185867214202881, "epoch": 1.1292026897214218, "grad_norm": 1.296875, "learning_rate": 0.00048802518505889904, "loss": 4.9986, "mean_token_accuracy": 0.21176180839538575, "num_tokens": 26939368.0, "step": 11755 }, { "entropy": 5.182089567184448, "epoch": 1.1296829971181557, "grad_norm": 1.3203125, "learning_rate": 0.00048801414720626404, "loss": 5.1614, "mean_token_accuracy": 0.19189264625310898, "num_tokens": 26952051.0, "step": 11760 }, { "entropy": 5.198432683944702, "epoch": 1.1301633045148896, "grad_norm": 1.4609375, "learning_rate": 0.00048800310440807294, "loss": 4.9895, "mean_token_accuracy": 0.20608253926038742, "num_tokens": 26963680.0, "step": 11765 }, { "entropy": 5.098900985717774, "epoch": 1.1306436119116234, "grad_norm": 1.421875, "learning_rate": 0.0004879920566645823, "loss": 4.8958, "mean_token_accuracy": 0.22000947147607802, "num_tokens": 26973899.0, "step": 11770 }, { "entropy": 5.164202928543091, "epoch": 1.1311239193083573, "grad_norm": 1.1640625, "learning_rate": 0.0004879810039760486, "loss": 5.0344, "mean_token_accuracy": 0.20242914706468582, "num_tokens": 26985692.0, "step": 11775 }, { "entropy": 5.224712562561035, "epoch": 1.1316042267050912, "grad_norm": 1.265625, "learning_rate": 0.0004879699463427284, "loss": 4.9784, "mean_token_accuracy": 0.21051635444164277, "num_tokens": 26996354.0, "step": 11780 }, { "entropy": 5.236588096618652, "epoch": 1.132084534101825, "grad_norm": 1.1484375, "learning_rate": 0.0004879588837648785, "loss": 5.0068, "mean_token_accuracy": 0.20998671054840087, "num_tokens": 27007925.0, "step": 11785 }, { "entropy": 5.186428022384644, "epoch": 1.1325648414985592, "grad_norm": 1.2890625, "learning_rate": 0.00048794781624275554, "loss": 4.975, "mean_token_accuracy": 0.2176157593727112, "num_tokens": 27019222.0, "step": 11790 }, { "entropy": 5.125696468353271, "epoch": 1.133045148895293, "grad_norm": 1.203125, "learning_rate": 0.00048793674377661664, "loss": 4.9674, "mean_token_accuracy": 0.21915102750062943, "num_tokens": 27029973.0, "step": 11795 }, { "entropy": 5.233280372619629, "epoch": 1.133525456292027, "grad_norm": 1.328125, "learning_rate": 0.00048792566636671886, "loss": 5.1306, "mean_token_accuracy": 0.2096991240978241, "num_tokens": 27042002.0, "step": 11800 }, { "entropy": 5.178752517700195, "epoch": 1.1340057636887608, "grad_norm": 1.28125, "learning_rate": 0.0004879145840133194, "loss": 5.0308, "mean_token_accuracy": 0.21618867963552474, "num_tokens": 27052770.0, "step": 11805 }, { "entropy": 5.270467710494995, "epoch": 1.1344860710854947, "grad_norm": 1.75, "learning_rate": 0.0004879034967166755, "loss": 5.0632, "mean_token_accuracy": 0.20635210424661637, "num_tokens": 27063375.0, "step": 11810 }, { "entropy": 5.201555490493774, "epoch": 1.1349663784822286, "grad_norm": 2.015625, "learning_rate": 0.0004878924044770446, "loss": 5.0013, "mean_token_accuracy": 0.20745259374380112, "num_tokens": 27075908.0, "step": 11815 }, { "entropy": 5.1736366748809814, "epoch": 1.1354466858789625, "grad_norm": 1.203125, "learning_rate": 0.0004878813072946843, "loss": 5.0447, "mean_token_accuracy": 0.21043994426727294, "num_tokens": 27087590.0, "step": 11820 }, { "entropy": 5.225687408447266, "epoch": 1.1359269932756964, "grad_norm": 1.2109375, "learning_rate": 0.00048787020516985203, "loss": 4.9593, "mean_token_accuracy": 0.218149633705616, "num_tokens": 27098076.0, "step": 11825 }, { "entropy": 5.2235795021057125, "epoch": 1.1364073006724302, "grad_norm": 1.3359375, "learning_rate": 0.00048785909810280576, "loss": 4.9734, "mean_token_accuracy": 0.2160875007510185, "num_tokens": 27109851.0, "step": 11830 }, { "entropy": 5.222425508499145, "epoch": 1.1368876080691643, "grad_norm": 1.3984375, "learning_rate": 0.0004878479860938033, "loss": 4.9228, "mean_token_accuracy": 0.21766222417354583, "num_tokens": 27121288.0, "step": 11835 }, { "entropy": 5.195209598541259, "epoch": 1.1373679154658982, "grad_norm": 1.265625, "learning_rate": 0.00048783686914310266, "loss": 5.0172, "mean_token_accuracy": 0.21215075105428696, "num_tokens": 27133877.0, "step": 11840 }, { "entropy": 5.063023233413697, "epoch": 1.137848222862632, "grad_norm": 1.5, "learning_rate": 0.0004878257472509619, "loss": 4.8351, "mean_token_accuracy": 0.22125699520111083, "num_tokens": 27145616.0, "step": 11845 }, { "entropy": 5.107744407653809, "epoch": 1.138328530259366, "grad_norm": 1.2109375, "learning_rate": 0.0004878146204176392, "loss": 4.9853, "mean_token_accuracy": 0.20994766801595688, "num_tokens": 27157182.0, "step": 11850 }, { "entropy": 5.2036010265350345, "epoch": 1.1388088376560999, "grad_norm": 1.3125, "learning_rate": 0.000487803488643393, "loss": 4.9386, "mean_token_accuracy": 0.2160127192735672, "num_tokens": 27167609.0, "step": 11855 }, { "entropy": 5.24319167137146, "epoch": 1.1392891450528337, "grad_norm": 1.2734375, "learning_rate": 0.00048779235192848166, "loss": 5.1012, "mean_token_accuracy": 0.2044668361544609, "num_tokens": 27180043.0, "step": 11860 }, { "entropy": 5.169292831420899, "epoch": 1.1397694524495678, "grad_norm": 1.2421875, "learning_rate": 0.0004877812102731638, "loss": 4.8945, "mean_token_accuracy": 0.22032397091388703, "num_tokens": 27191870.0, "step": 11865 }, { "entropy": 5.157991981506347, "epoch": 1.1402497598463017, "grad_norm": 1.265625, "learning_rate": 0.00048777006367769804, "loss": 5.0222, "mean_token_accuracy": 0.213824962079525, "num_tokens": 27203291.0, "step": 11870 }, { "entropy": 5.2055253982543945, "epoch": 1.1407300672430356, "grad_norm": 1.2890625, "learning_rate": 0.0004877589121423432, "loss": 5.041, "mean_token_accuracy": 0.21266603320837021, "num_tokens": 27214607.0, "step": 11875 }, { "entropy": 5.185617160797119, "epoch": 1.1412103746397695, "grad_norm": 1.21875, "learning_rate": 0.0004877477556673582, "loss": 4.9066, "mean_token_accuracy": 0.21550966054201126, "num_tokens": 27224658.0, "step": 11880 }, { "entropy": 5.1661797046661375, "epoch": 1.1416906820365034, "grad_norm": 1.203125, "learning_rate": 0.000487736594253002, "loss": 5.0204, "mean_token_accuracy": 0.20654748678207396, "num_tokens": 27235306.0, "step": 11885 }, { "entropy": 5.276842164993286, "epoch": 1.1421709894332372, "grad_norm": 1.46875, "learning_rate": 0.00048772542789953384, "loss": 5.0314, "mean_token_accuracy": 0.20629957020282746, "num_tokens": 27246679.0, "step": 11890 }, { "entropy": 5.28350601196289, "epoch": 1.1426512968299711, "grad_norm": 1.1875, "learning_rate": 0.00048771425660721284, "loss": 5.065, "mean_token_accuracy": 0.20193494856357574, "num_tokens": 27257799.0, "step": 11895 }, { "entropy": 5.124622106552124, "epoch": 1.143131604226705, "grad_norm": 1.265625, "learning_rate": 0.00048770308037629853, "loss": 5.0324, "mean_token_accuracy": 0.2094832718372345, "num_tokens": 27268651.0, "step": 11900 }, { "entropy": 5.176864957809448, "epoch": 1.143611911623439, "grad_norm": 1.1875, "learning_rate": 0.0004876918992070502, "loss": 5.0004, "mean_token_accuracy": 0.2177245572209358, "num_tokens": 27280655.0, "step": 11905 }, { "entropy": 5.243468952178955, "epoch": 1.144092219020173, "grad_norm": 1.1875, "learning_rate": 0.0004876807130997276, "loss": 5.0788, "mean_token_accuracy": 0.20621824115514756, "num_tokens": 27292664.0, "step": 11910 }, { "entropy": 5.224751567840576, "epoch": 1.1445725264169069, "grad_norm": 1.1875, "learning_rate": 0.0004876695220545903, "loss": 5.0655, "mean_token_accuracy": 0.20803812742233277, "num_tokens": 27306143.0, "step": 11915 }, { "entropy": 5.201717710494995, "epoch": 1.1450528338136408, "grad_norm": 1.265625, "learning_rate": 0.00048765832607189824, "loss": 4.9712, "mean_token_accuracy": 0.2136443629860878, "num_tokens": 27316926.0, "step": 11920 }, { "entropy": 5.28920431137085, "epoch": 1.1455331412103746, "grad_norm": 1.640625, "learning_rate": 0.00048764712515191136, "loss": 5.0737, "mean_token_accuracy": 0.2188516676425934, "num_tokens": 27327472.0, "step": 11925 }, { "entropy": 5.269708442687988, "epoch": 1.1460134486071085, "grad_norm": 1.25, "learning_rate": 0.00048763591929488966, "loss": 5.045, "mean_token_accuracy": 0.21446569710969926, "num_tokens": 27338388.0, "step": 11930 }, { "entropy": 5.159726858139038, "epoch": 1.1464937560038424, "grad_norm": 1.234375, "learning_rate": 0.0004876247085010933, "loss": 4.9518, "mean_token_accuracy": 0.2131276786327362, "num_tokens": 27349879.0, "step": 11935 }, { "entropy": 5.11401858329773, "epoch": 1.1469740634005763, "grad_norm": 1.375, "learning_rate": 0.00048761349277078253, "loss": 4.9533, "mean_token_accuracy": 0.2188461974263191, "num_tokens": 27361226.0, "step": 11940 }, { "entropy": 5.213291311264038, "epoch": 1.1474543707973104, "grad_norm": 1.359375, "learning_rate": 0.00048760227210421775, "loss": 4.9345, "mean_token_accuracy": 0.2213941693305969, "num_tokens": 27373512.0, "step": 11945 }, { "entropy": 5.145865774154663, "epoch": 1.1479346781940443, "grad_norm": 1.1875, "learning_rate": 0.0004875910465016596, "loss": 5.0049, "mean_token_accuracy": 0.21635116934776305, "num_tokens": 27384950.0, "step": 11950 }, { "entropy": 5.295030307769776, "epoch": 1.1484149855907781, "grad_norm": 1.3359375, "learning_rate": 0.0004875798159633686, "loss": 5.0487, "mean_token_accuracy": 0.20899975001811982, "num_tokens": 27395877.0, "step": 11955 }, { "entropy": 5.202318477630615, "epoch": 1.148895292987512, "grad_norm": 1.328125, "learning_rate": 0.0004875685804896055, "loss": 5.0986, "mean_token_accuracy": 0.21304885745048524, "num_tokens": 27407783.0, "step": 11960 }, { "entropy": 5.181471586227417, "epoch": 1.149375600384246, "grad_norm": 1.3046875, "learning_rate": 0.0004875573400806312, "loss": 4.9887, "mean_token_accuracy": 0.21124700605869293, "num_tokens": 27420598.0, "step": 11965 }, { "entropy": 5.237709093093872, "epoch": 1.1498559077809798, "grad_norm": 1.3359375, "learning_rate": 0.00048754609473670654, "loss": 5.0651, "mean_token_accuracy": 0.21090197712182998, "num_tokens": 27431961.0, "step": 11970 }, { "entropy": 5.21086540222168, "epoch": 1.1503362151777137, "grad_norm": 1.140625, "learning_rate": 0.0004875348444580927, "loss": 4.9156, "mean_token_accuracy": 0.22040790617465972, "num_tokens": 27444010.0, "step": 11975 }, { "entropy": 5.2328328609466555, "epoch": 1.1508165225744476, "grad_norm": 1.171875, "learning_rate": 0.000487523589245051, "loss": 4.9771, "mean_token_accuracy": 0.2139630988240242, "num_tokens": 27455576.0, "step": 11980 }, { "entropy": 5.184505844116211, "epoch": 1.1512968299711814, "grad_norm": 1.203125, "learning_rate": 0.0004875123290978425, "loss": 5.0403, "mean_token_accuracy": 0.20502331107854843, "num_tokens": 27468957.0, "step": 11985 }, { "entropy": 5.2058931350708, "epoch": 1.1517771373679155, "grad_norm": 1.171875, "learning_rate": 0.00048750106401672876, "loss": 4.9986, "mean_token_accuracy": 0.21766173243522643, "num_tokens": 27479992.0, "step": 11990 }, { "entropy": 5.238912153244018, "epoch": 1.1522574447646494, "grad_norm": 1.3671875, "learning_rate": 0.00048748979400197134, "loss": 5.0967, "mean_token_accuracy": 0.20631994754076005, "num_tokens": 27490878.0, "step": 11995 }, { "entropy": 5.145661306381226, "epoch": 1.1527377521613833, "grad_norm": 1.125, "learning_rate": 0.00048747851905383183, "loss": 4.9302, "mean_token_accuracy": 0.21805770546197892, "num_tokens": 27502009.0, "step": 12000 }, { "epoch": 1.1527377521613833, "eval_entropy": 5.072871884969547, "eval_loss": 5.132204532623291, "eval_mean_token_accuracy": 0.21277229704311537, "eval_num_tokens": 27502009.0, "eval_runtime": 26.6332, "eval_samples_per_second": 1232.108, "eval_steps_per_second": 154.018, "step": 12000 }, { "entropy": 5.252287817001343, "epoch": 1.1532180595581172, "grad_norm": 1.2109375, "learning_rate": 0.0004874672391725721, "loss": 5.1089, "mean_token_accuracy": 0.20380218029022218, "num_tokens": 27513376.0, "step": 12005 }, { "entropy": 5.117784070968628, "epoch": 1.153698366954851, "grad_norm": 1.25, "learning_rate": 0.0004874559543584539, "loss": 4.9715, "mean_token_accuracy": 0.2118404433131218, "num_tokens": 27525166.0, "step": 12010 }, { "entropy": 5.092162704467773, "epoch": 1.154178674351585, "grad_norm": 1.171875, "learning_rate": 0.0004874446646117394, "loss": 4.8865, "mean_token_accuracy": 0.226571424305439, "num_tokens": 27535994.0, "step": 12015 }, { "entropy": 5.154507493972778, "epoch": 1.154658981748319, "grad_norm": 1.234375, "learning_rate": 0.0004874333699326906, "loss": 4.9842, "mean_token_accuracy": 0.213858063519001, "num_tokens": 27546883.0, "step": 12020 }, { "entropy": 5.312671184539795, "epoch": 1.155139289145053, "grad_norm": 1.2734375, "learning_rate": 0.0004874220703215697, "loss": 5.0903, "mean_token_accuracy": 0.20375553965568544, "num_tokens": 27558840.0, "step": 12025 }, { "entropy": 5.188431692123413, "epoch": 1.1556195965417868, "grad_norm": 1.2734375, "learning_rate": 0.0004874107657786391, "loss": 5.0142, "mean_token_accuracy": 0.2109085887670517, "num_tokens": 27569776.0, "step": 12030 }, { "entropy": 5.199657583236695, "epoch": 1.1560999039385207, "grad_norm": 2.171875, "learning_rate": 0.00048739945630416124, "loss": 4.9845, "mean_token_accuracy": 0.20919086486101152, "num_tokens": 27580097.0, "step": 12035 }, { "entropy": 5.2678807258605955, "epoch": 1.1565802113352546, "grad_norm": 1.1953125, "learning_rate": 0.0004873881418983987, "loss": 5.1066, "mean_token_accuracy": 0.21011823117733003, "num_tokens": 27592035.0, "step": 12040 }, { "entropy": 5.2372087955474855, "epoch": 1.1570605187319885, "grad_norm": 1.15625, "learning_rate": 0.0004873768225616141, "loss": 5.0383, "mean_token_accuracy": 0.21346299797296525, "num_tokens": 27604409.0, "step": 12045 }, { "entropy": 5.251517963409424, "epoch": 1.1575408261287223, "grad_norm": 1.21875, "learning_rate": 0.00048736549829407047, "loss": 4.9845, "mean_token_accuracy": 0.21415583789348602, "num_tokens": 27615019.0, "step": 12050 }, { "entropy": 5.244622087478637, "epoch": 1.1580211335254562, "grad_norm": 1.34375, "learning_rate": 0.0004873541690960305, "loss": 5.1, "mean_token_accuracy": 0.2044678211212158, "num_tokens": 27627924.0, "step": 12055 }, { "entropy": 5.12835431098938, "epoch": 1.15850144092219, "grad_norm": 1.2734375, "learning_rate": 0.0004873428349677573, "loss": 4.925, "mean_token_accuracy": 0.21388751715421678, "num_tokens": 27639188.0, "step": 12060 }, { "entropy": 5.273781251907349, "epoch": 1.1589817483189242, "grad_norm": 1.4375, "learning_rate": 0.000487331495909514, "loss": 5.1448, "mean_token_accuracy": 0.2023579403758049, "num_tokens": 27652621.0, "step": 12065 }, { "entropy": 5.239981460571289, "epoch": 1.159462055715658, "grad_norm": 1.3671875, "learning_rate": 0.00048732015192156383, "loss": 5.0773, "mean_token_accuracy": 0.20497591197490692, "num_tokens": 27665726.0, "step": 12070 }, { "entropy": 5.234362506866455, "epoch": 1.159942363112392, "grad_norm": 1.3515625, "learning_rate": 0.00048730880300417015, "loss": 5.0152, "mean_token_accuracy": 0.20888158231973647, "num_tokens": 27676984.0, "step": 12075 }, { "entropy": 5.258095026016235, "epoch": 1.1604226705091258, "grad_norm": 1.15625, "learning_rate": 0.00048729744915759657, "loss": 5.0926, "mean_token_accuracy": 0.20860619992017745, "num_tokens": 27688812.0, "step": 12080 }, { "entropy": 5.133250331878662, "epoch": 1.1609029779058597, "grad_norm": 1.25, "learning_rate": 0.00048728609038210655, "loss": 4.8731, "mean_token_accuracy": 0.21741154789924622, "num_tokens": 27699152.0, "step": 12085 }, { "entropy": 5.181648588180542, "epoch": 1.1613832853025936, "grad_norm": 1.203125, "learning_rate": 0.00048727472667796395, "loss": 5.063, "mean_token_accuracy": 0.20912941545248032, "num_tokens": 27710347.0, "step": 12090 }, { "entropy": 5.160825157165528, "epoch": 1.1618635926993275, "grad_norm": 1.171875, "learning_rate": 0.0004872633580454325, "loss": 4.9654, "mean_token_accuracy": 0.21775319874286653, "num_tokens": 27721112.0, "step": 12095 }, { "entropy": 5.271908760070801, "epoch": 1.1623439000960616, "grad_norm": 1.3046875, "learning_rate": 0.00048725198448477616, "loss": 5.0553, "mean_token_accuracy": 0.20950192213058472, "num_tokens": 27731766.0, "step": 12100 }, { "entropy": 5.173818635940552, "epoch": 1.1628242074927955, "grad_norm": 1.25, "learning_rate": 0.00048724060599625893, "loss": 4.9923, "mean_token_accuracy": 0.2151247590780258, "num_tokens": 27743718.0, "step": 12105 }, { "entropy": 5.192879867553711, "epoch": 1.1633045148895294, "grad_norm": 1.2421875, "learning_rate": 0.00048722922258014506, "loss": 5.0514, "mean_token_accuracy": 0.20927377343177794, "num_tokens": 27754999.0, "step": 12110 }, { "entropy": 5.269402647018433, "epoch": 1.1637848222862632, "grad_norm": 1.1484375, "learning_rate": 0.0004872178342366989, "loss": 5.1583, "mean_token_accuracy": 0.20009388625621796, "num_tokens": 27767684.0, "step": 12115 }, { "entropy": 5.148327445983886, "epoch": 1.1642651296829971, "grad_norm": 1.2109375, "learning_rate": 0.00048720644096618475, "loss": 5.0272, "mean_token_accuracy": 0.21265908777713777, "num_tokens": 27779103.0, "step": 12120 }, { "entropy": 5.183534049987793, "epoch": 1.164745437079731, "grad_norm": 1.2421875, "learning_rate": 0.0004871950427688672, "loss": 5.0281, "mean_token_accuracy": 0.21695935279130935, "num_tokens": 27791223.0, "step": 12125 }, { "entropy": 5.227220249176026, "epoch": 1.1652257444764649, "grad_norm": 1.1796875, "learning_rate": 0.00048718363964501087, "loss": 4.964, "mean_token_accuracy": 0.2172775998711586, "num_tokens": 27802660.0, "step": 12130 }, { "entropy": 5.253418016433716, "epoch": 1.1657060518731988, "grad_norm": 1.4140625, "learning_rate": 0.0004871722315948805, "loss": 5.0586, "mean_token_accuracy": 0.20542750507593155, "num_tokens": 27814370.0, "step": 12135 }, { "entropy": 5.152675437927246, "epoch": 1.1661863592699326, "grad_norm": 1.265625, "learning_rate": 0.0004871608186187408, "loss": 4.9636, "mean_token_accuracy": 0.2136980563402176, "num_tokens": 27825035.0, "step": 12140 }, { "entropy": 5.11957426071167, "epoch": 1.1666666666666667, "grad_norm": 1.265625, "learning_rate": 0.00048714940071685703, "loss": 4.8978, "mean_token_accuracy": 0.21549834907054902, "num_tokens": 27837276.0, "step": 12145 }, { "entropy": 5.30233063697815, "epoch": 1.1671469740634006, "grad_norm": 1.28125, "learning_rate": 0.00048713797788949405, "loss": 5.0647, "mean_token_accuracy": 0.2075889676809311, "num_tokens": 27848618.0, "step": 12150 }, { "entropy": 5.188276481628418, "epoch": 1.1676272814601345, "grad_norm": 1.3203125, "learning_rate": 0.00048712655013691714, "loss": 5.0438, "mean_token_accuracy": 0.20078416913747787, "num_tokens": 27861556.0, "step": 12155 }, { "entropy": 5.225617361068726, "epoch": 1.1681075888568684, "grad_norm": 1.28125, "learning_rate": 0.00048711511745939165, "loss": 5.0115, "mean_token_accuracy": 0.20684807151556014, "num_tokens": 27873211.0, "step": 12160 }, { "entropy": 5.213997268676758, "epoch": 1.1685878962536023, "grad_norm": 1.3984375, "learning_rate": 0.000487103679857183, "loss": 5.0288, "mean_token_accuracy": 0.20902891159057618, "num_tokens": 27883365.0, "step": 12165 }, { "entropy": 5.114966011047363, "epoch": 1.1690682036503361, "grad_norm": 1.203125, "learning_rate": 0.0004870922373305567, "loss": 4.9032, "mean_token_accuracy": 0.21823573112487793, "num_tokens": 27894669.0, "step": 12170 }, { "entropy": 5.189022970199585, "epoch": 1.1695485110470702, "grad_norm": 1.3125, "learning_rate": 0.00048708078987977837, "loss": 5.0406, "mean_token_accuracy": 0.21348860412836074, "num_tokens": 27906480.0, "step": 12175 }, { "entropy": 5.1677796840667725, "epoch": 1.1700288184438041, "grad_norm": 1.296875, "learning_rate": 0.00048706933750511394, "loss": 4.9761, "mean_token_accuracy": 0.21937694698572158, "num_tokens": 27918383.0, "step": 12180 }, { "entropy": 5.246155738830566, "epoch": 1.170509125840538, "grad_norm": 1.2109375, "learning_rate": 0.0004870578802068292, "loss": 5.0332, "mean_token_accuracy": 0.2065381273627281, "num_tokens": 27928944.0, "step": 12185 }, { "entropy": 5.170810222625732, "epoch": 1.170989433237272, "grad_norm": 1.171875, "learning_rate": 0.00048704641798519006, "loss": 4.9503, "mean_token_accuracy": 0.21335744559764863, "num_tokens": 27941105.0, "step": 12190 }, { "entropy": 5.2019225597381595, "epoch": 1.1714697406340058, "grad_norm": 1.1875, "learning_rate": 0.00048703495084046286, "loss": 4.9969, "mean_token_accuracy": 0.2134275645017624, "num_tokens": 27952925.0, "step": 12195 }, { "entropy": 5.109914255142212, "epoch": 1.1719500480307397, "grad_norm": 1.171875, "learning_rate": 0.0004870234787729137, "loss": 4.9838, "mean_token_accuracy": 0.21968378871679306, "num_tokens": 27965504.0, "step": 12200 }, { "entropy": 5.158345079421997, "epoch": 1.1724303554274735, "grad_norm": 1.203125, "learning_rate": 0.0004870120017828089, "loss": 4.9624, "mean_token_accuracy": 0.21934993118047713, "num_tokens": 27975985.0, "step": 12205 }, { "entropy": 5.340363693237305, "epoch": 1.1729106628242074, "grad_norm": 1.3828125, "learning_rate": 0.000487000519870415, "loss": 5.0612, "mean_token_accuracy": 0.20156388878822326, "num_tokens": 27987784.0, "step": 12210 }, { "entropy": 5.214973592758179, "epoch": 1.1733909702209413, "grad_norm": 1.2890625, "learning_rate": 0.0004869890330359986, "loss": 4.9831, "mean_token_accuracy": 0.207270847260952, "num_tokens": 27999623.0, "step": 12215 }, { "entropy": 5.146721315383911, "epoch": 1.1738712776176754, "grad_norm": 1.328125, "learning_rate": 0.0004869775412798262, "loss": 4.9998, "mean_token_accuracy": 0.20671399533748627, "num_tokens": 28011561.0, "step": 12220 }, { "entropy": 5.233490705490112, "epoch": 1.1743515850144093, "grad_norm": 1.28125, "learning_rate": 0.00048696604460216476, "loss": 5.0422, "mean_token_accuracy": 0.21980682760477066, "num_tokens": 28022108.0, "step": 12225 }, { "entropy": 5.276953649520874, "epoch": 1.1748318924111432, "grad_norm": 1.3203125, "learning_rate": 0.00048695454300328123, "loss": 5.0175, "mean_token_accuracy": 0.20736639499664306, "num_tokens": 28033497.0, "step": 12230 }, { "entropy": 5.237717533111573, "epoch": 1.175312199807877, "grad_norm": 1.15625, "learning_rate": 0.00048694303648344256, "loss": 5.0456, "mean_token_accuracy": 0.20967191308736802, "num_tokens": 28044790.0, "step": 12235 }, { "entropy": 5.111946868896484, "epoch": 1.175792507204611, "grad_norm": 1.1484375, "learning_rate": 0.00048693152504291595, "loss": 4.9879, "mean_token_accuracy": 0.21664219200611115, "num_tokens": 28056000.0, "step": 12240 }, { "entropy": 5.18005404472351, "epoch": 1.1762728146013448, "grad_norm": 1.15625, "learning_rate": 0.0004869200086819686, "loss": 5.0213, "mean_token_accuracy": 0.21006689369678497, "num_tokens": 28068264.0, "step": 12245 }, { "entropy": 5.234067392349243, "epoch": 1.1767531219980787, "grad_norm": 1.28125, "learning_rate": 0.00048690848740086796, "loss": 4.9192, "mean_token_accuracy": 0.21703227013349533, "num_tokens": 28080220.0, "step": 12250 }, { "entropy": 5.251984262466431, "epoch": 1.1772334293948128, "grad_norm": 1.2265625, "learning_rate": 0.0004868969611998814, "loss": 4.9671, "mean_token_accuracy": 0.21270408034324645, "num_tokens": 28091531.0, "step": 12255 }, { "entropy": 5.154574251174926, "epoch": 1.1777137367915467, "grad_norm": 1.2734375, "learning_rate": 0.0004868854300792767, "loss": 4.9726, "mean_token_accuracy": 0.20649342983961105, "num_tokens": 28102982.0, "step": 12260 }, { "entropy": 5.172384786605835, "epoch": 1.1781940441882806, "grad_norm": 1.3046875, "learning_rate": 0.00048687389403932144, "loss": 5.0406, "mean_token_accuracy": 0.21188410818576814, "num_tokens": 28114131.0, "step": 12265 }, { "entropy": 5.2116370677948, "epoch": 1.1786743515850144, "grad_norm": 1.2421875, "learning_rate": 0.0004868623530802835, "loss": 4.9634, "mean_token_accuracy": 0.21050270646810532, "num_tokens": 28125637.0, "step": 12270 }, { "entropy": 5.274893808364868, "epoch": 1.1791546589817483, "grad_norm": 1.2734375, "learning_rate": 0.00048685080720243086, "loss": 5.0384, "mean_token_accuracy": 0.21438082605600356, "num_tokens": 28137284.0, "step": 12275 }, { "entropy": 5.185189199447632, "epoch": 1.1796349663784822, "grad_norm": 1.2265625, "learning_rate": 0.0004868392564060315, "loss": 5.0397, "mean_token_accuracy": 0.21561664193868638, "num_tokens": 28148909.0, "step": 12280 }, { "entropy": 5.227141571044922, "epoch": 1.180115273775216, "grad_norm": 1.53125, "learning_rate": 0.0004868277006913537, "loss": 5.0201, "mean_token_accuracy": 0.21342774629592895, "num_tokens": 28160392.0, "step": 12285 }, { "entropy": 5.2492955207824705, "epoch": 1.18059558117195, "grad_norm": 1.6171875, "learning_rate": 0.0004868161400586656, "loss": 5.0965, "mean_token_accuracy": 0.20555120557546616, "num_tokens": 28172957.0, "step": 12290 }, { "entropy": 5.334953641891479, "epoch": 1.1810758885686838, "grad_norm": 1.265625, "learning_rate": 0.0004868045745082357, "loss": 5.0748, "mean_token_accuracy": 0.20318131595849992, "num_tokens": 28183239.0, "step": 12295 }, { "entropy": 5.188865756988525, "epoch": 1.181556195965418, "grad_norm": 1.2890625, "learning_rate": 0.0004867930040403326, "loss": 4.9936, "mean_token_accuracy": 0.21321745961904526, "num_tokens": 28195382.0, "step": 12300 }, { "entropy": 5.176113891601562, "epoch": 1.1820365033621518, "grad_norm": 1.3984375, "learning_rate": 0.00048678142865522475, "loss": 5.0722, "mean_token_accuracy": 0.2028682142496109, "num_tokens": 28206645.0, "step": 12305 }, { "entropy": 5.261957263946533, "epoch": 1.1825168107588857, "grad_norm": 1.171875, "learning_rate": 0.000486769848353181, "loss": 5.0623, "mean_token_accuracy": 0.21030078679323197, "num_tokens": 28218420.0, "step": 12310 }, { "entropy": 5.234085512161255, "epoch": 1.1829971181556196, "grad_norm": 1.2109375, "learning_rate": 0.00048675826313447027, "loss": 5.0647, "mean_token_accuracy": 0.20770383477211, "num_tokens": 28229458.0, "step": 12315 }, { "entropy": 5.270634365081787, "epoch": 1.1834774255523535, "grad_norm": 1.4921875, "learning_rate": 0.00048674667299936135, "loss": 5.0538, "mean_token_accuracy": 0.2074426531791687, "num_tokens": 28242240.0, "step": 12320 }, { "entropy": 5.380625152587891, "epoch": 1.1839577329490873, "grad_norm": 1.4375, "learning_rate": 0.00048673507794812356, "loss": 5.139, "mean_token_accuracy": 0.20491064041852952, "num_tokens": 28254597.0, "step": 12325 }, { "entropy": 5.217261886596679, "epoch": 1.1844380403458212, "grad_norm": 1.1796875, "learning_rate": 0.0004867234779810259, "loss": 5.0605, "mean_token_accuracy": 0.20772763192653657, "num_tokens": 28266674.0, "step": 12330 }, { "entropy": 5.174903488159179, "epoch": 1.1849183477425553, "grad_norm": 1.1484375, "learning_rate": 0.0004867118730983378, "loss": 5.0235, "mean_token_accuracy": 0.213454669713974, "num_tokens": 28278671.0, "step": 12335 }, { "entropy": 5.22681565284729, "epoch": 1.1853986551392892, "grad_norm": 1.375, "learning_rate": 0.0004867002633003286, "loss": 4.9512, "mean_token_accuracy": 0.21354973167181016, "num_tokens": 28291085.0, "step": 12340 }, { "entropy": 5.146066761016845, "epoch": 1.185878962536023, "grad_norm": 1.6328125, "learning_rate": 0.000486688648587268, "loss": 5.0309, "mean_token_accuracy": 0.215641950070858, "num_tokens": 28303457.0, "step": 12345 }, { "entropy": 5.223130226135254, "epoch": 1.186359269932757, "grad_norm": 1.1640625, "learning_rate": 0.0004866770289594256, "loss": 5.0391, "mean_token_accuracy": 0.20597289353609086, "num_tokens": 28314599.0, "step": 12350 }, { "entropy": 5.3448234558105465, "epoch": 1.1868395773294909, "grad_norm": 1.25, "learning_rate": 0.00048666540441707107, "loss": 5.1315, "mean_token_accuracy": 0.2068374440073967, "num_tokens": 28326266.0, "step": 12355 }, { "entropy": 5.251500129699707, "epoch": 1.1873198847262247, "grad_norm": 1.3046875, "learning_rate": 0.0004866537749604744, "loss": 4.9853, "mean_token_accuracy": 0.207984322309494, "num_tokens": 28337238.0, "step": 12360 }, { "entropy": 5.157985210418701, "epoch": 1.1878001921229586, "grad_norm": 1.2890625, "learning_rate": 0.00048664214058990546, "loss": 4.9818, "mean_token_accuracy": 0.21365345120429993, "num_tokens": 28348060.0, "step": 12365 }, { "entropy": 5.145942068099975, "epoch": 1.1882804995196925, "grad_norm": 1.7109375, "learning_rate": 0.0004866305013056346, "loss": 5.0371, "mean_token_accuracy": 0.2051353007555008, "num_tokens": 28359442.0, "step": 12370 }, { "entropy": 5.244502449035645, "epoch": 1.1887608069164266, "grad_norm": 1.203125, "learning_rate": 0.0004866188571079318, "loss": 5.0663, "mean_token_accuracy": 0.2088254600763321, "num_tokens": 28370827.0, "step": 12375 }, { "entropy": 5.345438051223755, "epoch": 1.1892411143131605, "grad_norm": 1.5703125, "learning_rate": 0.0004866072079970676, "loss": 5.0994, "mean_token_accuracy": 0.19838736355304717, "num_tokens": 28383115.0, "step": 12380 }, { "entropy": 5.179604244232178, "epoch": 1.1897214217098944, "grad_norm": 1.203125, "learning_rate": 0.00048659555397331236, "loss": 4.9712, "mean_token_accuracy": 0.21606729328632354, "num_tokens": 28394904.0, "step": 12385 }, { "entropy": 5.1870029926300045, "epoch": 1.1902017291066282, "grad_norm": 1.328125, "learning_rate": 0.0004865838950369366, "loss": 5.0248, "mean_token_accuracy": 0.2077416345477104, "num_tokens": 28407357.0, "step": 12390 }, { "entropy": 5.206893348693848, "epoch": 1.1906820365033621, "grad_norm": 1.4765625, "learning_rate": 0.00048657223118821116, "loss": 4.9821, "mean_token_accuracy": 0.21372554153203965, "num_tokens": 28418088.0, "step": 12395 }, { "entropy": 5.062207841873169, "epoch": 1.191162343900096, "grad_norm": 1.1875, "learning_rate": 0.00048656056242740665, "loss": 4.889, "mean_token_accuracy": 0.21973242610692978, "num_tokens": 28430022.0, "step": 12400 }, { "entropy": 5.1975541591644285, "epoch": 1.19164265129683, "grad_norm": 1.2578125, "learning_rate": 0.0004865488887547942, "loss": 4.9833, "mean_token_accuracy": 0.214840891957283, "num_tokens": 28440530.0, "step": 12405 }, { "entropy": 5.258637619018555, "epoch": 1.192122958693564, "grad_norm": 1.171875, "learning_rate": 0.0004865372101706446, "loss": 5.0893, "mean_token_accuracy": 0.20531991273164749, "num_tokens": 28452707.0, "step": 12410 }, { "entropy": 5.119096803665161, "epoch": 1.1926032660902979, "grad_norm": 1.171875, "learning_rate": 0.0004865255266752292, "loss": 4.8979, "mean_token_accuracy": 0.22423699051141738, "num_tokens": 28465131.0, "step": 12415 }, { "entropy": 5.142260789871216, "epoch": 1.1930835734870318, "grad_norm": 1.421875, "learning_rate": 0.0004865138382688191, "loss": 5.0173, "mean_token_accuracy": 0.2174185335636139, "num_tokens": 28476136.0, "step": 12420 }, { "entropy": 5.216041707992554, "epoch": 1.1935638808837656, "grad_norm": 1.2109375, "learning_rate": 0.0004865021449516859, "loss": 4.9275, "mean_token_accuracy": 0.2106182113289833, "num_tokens": 28488374.0, "step": 12425 }, { "entropy": 5.248443746566773, "epoch": 1.1940441882804995, "grad_norm": 1.375, "learning_rate": 0.0004864904467241008, "loss": 4.9863, "mean_token_accuracy": 0.20798565447330475, "num_tokens": 28499585.0, "step": 12430 }, { "entropy": 5.132749748229981, "epoch": 1.1945244956772334, "grad_norm": 1.2109375, "learning_rate": 0.00048647874358633556, "loss": 4.9399, "mean_token_accuracy": 0.2145738869905472, "num_tokens": 28510707.0, "step": 12435 }, { "entropy": 5.247034311294556, "epoch": 1.1950048030739673, "grad_norm": 1.2421875, "learning_rate": 0.00048646703553866183, "loss": 5.0617, "mean_token_accuracy": 0.20649414509534836, "num_tokens": 28522398.0, "step": 12440 }, { "entropy": 5.194453859329224, "epoch": 1.1954851104707012, "grad_norm": 1.5859375, "learning_rate": 0.0004864553225813515, "loss": 4.9577, "mean_token_accuracy": 0.21434492319822313, "num_tokens": 28532949.0, "step": 12445 }, { "entropy": 5.127518177032471, "epoch": 1.195965417867435, "grad_norm": 1.25, "learning_rate": 0.0004864436047146765, "loss": 4.9267, "mean_token_accuracy": 0.22026402056217192, "num_tokens": 28544292.0, "step": 12450 }, { "entropy": 5.219339227676391, "epoch": 1.1964457252641691, "grad_norm": 1.4375, "learning_rate": 0.00048643188193890874, "loss": 5.0951, "mean_token_accuracy": 0.20473963618278504, "num_tokens": 28556430.0, "step": 12455 }, { "entropy": 5.302267837524414, "epoch": 1.196926032660903, "grad_norm": 1.296875, "learning_rate": 0.0004864201542543206, "loss": 5.0565, "mean_token_accuracy": 0.20778754204511643, "num_tokens": 28568475.0, "step": 12460 }, { "entropy": 5.265069389343262, "epoch": 1.197406340057637, "grad_norm": 1.421875, "learning_rate": 0.0004864084216611843, "loss": 5.0026, "mean_token_accuracy": 0.20922221690416337, "num_tokens": 28579653.0, "step": 12465 }, { "entropy": 5.153708600997925, "epoch": 1.1978866474543708, "grad_norm": 1.28125, "learning_rate": 0.00048639668415977207, "loss": 4.9371, "mean_token_accuracy": 0.21175346672534942, "num_tokens": 28590108.0, "step": 12470 }, { "entropy": 5.165795612335205, "epoch": 1.1983669548511047, "grad_norm": 1.203125, "learning_rate": 0.00048638494175035665, "loss": 4.9589, "mean_token_accuracy": 0.2166977271437645, "num_tokens": 28602008.0, "step": 12475 }, { "entropy": 5.141294145584107, "epoch": 1.1988472622478386, "grad_norm": 1.3203125, "learning_rate": 0.0004863731944332105, "loss": 4.8937, "mean_token_accuracy": 0.22100506275892257, "num_tokens": 28613286.0, "step": 12480 }, { "entropy": 5.158958911895752, "epoch": 1.1993275696445724, "grad_norm": 1.2890625, "learning_rate": 0.0004863614422086065, "loss": 4.905, "mean_token_accuracy": 0.21994735598564147, "num_tokens": 28625497.0, "step": 12485 }, { "entropy": 5.1904459476470945, "epoch": 1.1998078770413065, "grad_norm": 1.390625, "learning_rate": 0.0004863496850768174, "loss": 5.0029, "mean_token_accuracy": 0.21328900158405303, "num_tokens": 28638046.0, "step": 12490 }, { "entropy": 5.097305774688721, "epoch": 1.2002881844380404, "grad_norm": 1.3203125, "learning_rate": 0.0004863379230381162, "loss": 4.9046, "mean_token_accuracy": 0.2162349119782448, "num_tokens": 28647923.0, "step": 12495 }, { "entropy": 5.14150071144104, "epoch": 1.2007684918347743, "grad_norm": 1.2421875, "learning_rate": 0.000486326156092776, "loss": 4.9583, "mean_token_accuracy": 0.21833768635988235, "num_tokens": 28660023.0, "step": 12500 }, { "entropy": 5.187930011749268, "epoch": 1.2012487992315082, "grad_norm": 1.3203125, "learning_rate": 0.00048631438424106985, "loss": 4.9908, "mean_token_accuracy": 0.21353389471769332, "num_tokens": 28671568.0, "step": 12505 }, { "entropy": 5.122547054290772, "epoch": 1.201729106628242, "grad_norm": 1.3046875, "learning_rate": 0.00048630260748327124, "loss": 4.9636, "mean_token_accuracy": 0.21925023049116135, "num_tokens": 28682897.0, "step": 12510 }, { "entropy": 5.162188148498535, "epoch": 1.202209414024976, "grad_norm": 1.390625, "learning_rate": 0.00048629082581965355, "loss": 5.0342, "mean_token_accuracy": 0.21101551204919816, "num_tokens": 28694067.0, "step": 12515 }, { "entropy": 5.242506408691407, "epoch": 1.2026897214217098, "grad_norm": 1.46875, "learning_rate": 0.00048627903925049033, "loss": 4.9589, "mean_token_accuracy": 0.21290026605129242, "num_tokens": 28705738.0, "step": 12520 }, { "entropy": 5.195635080337524, "epoch": 1.2031700288184437, "grad_norm": 1.25, "learning_rate": 0.00048626724777605507, "loss": 4.9092, "mean_token_accuracy": 0.21891177147626878, "num_tokens": 28717419.0, "step": 12525 }, { "entropy": 5.187272739410401, "epoch": 1.2036503362151778, "grad_norm": 1.265625, "learning_rate": 0.0004862554513966217, "loss": 5.1105, "mean_token_accuracy": 0.20062761902809143, "num_tokens": 28728587.0, "step": 12530 }, { "entropy": 5.23038272857666, "epoch": 1.2041306436119117, "grad_norm": 1.21875, "learning_rate": 0.00048624365011246405, "loss": 5.0802, "mean_token_accuracy": 0.20460240244865419, "num_tokens": 28740818.0, "step": 12535 }, { "entropy": 5.1875214099884035, "epoch": 1.2046109510086456, "grad_norm": 1.28125, "learning_rate": 0.0004862318439238561, "loss": 4.927, "mean_token_accuracy": 0.224493670463562, "num_tokens": 28751936.0, "step": 12540 }, { "entropy": 5.185813951492309, "epoch": 1.2050912584053795, "grad_norm": 1.34375, "learning_rate": 0.000486220032831072, "loss": 4.9948, "mean_token_accuracy": 0.2092664435505867, "num_tokens": 28763271.0, "step": 12545 }, { "entropy": 5.225413799285889, "epoch": 1.2055715658021133, "grad_norm": 1.2109375, "learning_rate": 0.0004862082168343859, "loss": 5.0384, "mean_token_accuracy": 0.21466702818870545, "num_tokens": 28774282.0, "step": 12550 }, { "entropy": 5.275173616409302, "epoch": 1.2060518731988472, "grad_norm": 1.1875, "learning_rate": 0.0004861963959340722, "loss": 5.0968, "mean_token_accuracy": 0.20915820598602294, "num_tokens": 28785826.0, "step": 12555 }, { "entropy": 5.120945119857788, "epoch": 1.206532180595581, "grad_norm": 1.15625, "learning_rate": 0.0004861845701304053, "loss": 4.9057, "mean_token_accuracy": 0.21730198264122008, "num_tokens": 28797669.0, "step": 12560 }, { "entropy": 5.19795560836792, "epoch": 1.2070124879923152, "grad_norm": 1.1796875, "learning_rate": 0.00048617273942365977, "loss": 5.0742, "mean_token_accuracy": 0.20622512996196746, "num_tokens": 28808438.0, "step": 12565 }, { "entropy": 5.136143445968628, "epoch": 1.207492795389049, "grad_norm": 1.46875, "learning_rate": 0.0004861609038141103, "loss": 4.9156, "mean_token_accuracy": 0.2179456263780594, "num_tokens": 28819707.0, "step": 12570 }, { "entropy": 5.243184280395508, "epoch": 1.207973102785783, "grad_norm": 1.2421875, "learning_rate": 0.00048614906330203165, "loss": 5.02, "mean_token_accuracy": 0.2118644818663597, "num_tokens": 28831829.0, "step": 12575 }, { "entropy": 5.178156328201294, "epoch": 1.2084534101825168, "grad_norm": 1.28125, "learning_rate": 0.0004861372178876987, "loss": 4.9879, "mean_token_accuracy": 0.2112196832895279, "num_tokens": 28843429.0, "step": 12580 }, { "entropy": 5.162006902694702, "epoch": 1.2089337175792507, "grad_norm": 1.4609375, "learning_rate": 0.00048612536757138653, "loss": 4.9146, "mean_token_accuracy": 0.21943530589342117, "num_tokens": 28856239.0, "step": 12585 }, { "entropy": 5.224708223342896, "epoch": 1.2094140249759846, "grad_norm": 1.2734375, "learning_rate": 0.0004861135123533702, "loss": 5.0127, "mean_token_accuracy": 0.21318120807409285, "num_tokens": 28868599.0, "step": 12590 }, { "entropy": 5.2225141525268555, "epoch": 1.2098943323727185, "grad_norm": 1.234375, "learning_rate": 0.00048610165223392503, "loss": 5.035, "mean_token_accuracy": 0.20697897523641587, "num_tokens": 28880234.0, "step": 12595 }, { "entropy": 5.2457115173339846, "epoch": 1.2103746397694524, "grad_norm": 1.1796875, "learning_rate": 0.0004860897872133263, "loss": 5.0615, "mean_token_accuracy": 0.2093821495771408, "num_tokens": 28892198.0, "step": 12600 }, { "entropy": 5.230489587783813, "epoch": 1.2108549471661862, "grad_norm": 1.1796875, "learning_rate": 0.0004860779172918496, "loss": 5.0893, "mean_token_accuracy": 0.20883096009492874, "num_tokens": 28904153.0, "step": 12605 }, { "entropy": 5.278970098495483, "epoch": 1.2113352545629203, "grad_norm": 1.3515625, "learning_rate": 0.0004860660424697704, "loss": 5.0579, "mean_token_accuracy": 0.21126580536365508, "num_tokens": 28915284.0, "step": 12610 }, { "entropy": 5.21758451461792, "epoch": 1.2118155619596542, "grad_norm": 1.2109375, "learning_rate": 0.00048605416274736434, "loss": 5.0239, "mean_token_accuracy": 0.20837975144386292, "num_tokens": 28928168.0, "step": 12615 }, { "entropy": 5.152167892456054, "epoch": 1.2122958693563881, "grad_norm": 1.1953125, "learning_rate": 0.00048604227812490744, "loss": 4.9032, "mean_token_accuracy": 0.21995031386613845, "num_tokens": 28938548.0, "step": 12620 }, { "entropy": 5.1527406692504885, "epoch": 1.212776176753122, "grad_norm": 1.234375, "learning_rate": 0.00048603038860267546, "loss": 4.9921, "mean_token_accuracy": 0.21356878131628038, "num_tokens": 28949252.0, "step": 12625 }, { "entropy": 5.230664396286011, "epoch": 1.2132564841498559, "grad_norm": 1.2421875, "learning_rate": 0.0004860184941809445, "loss": 5.0107, "mean_token_accuracy": 0.21580713540315627, "num_tokens": 28960758.0, "step": 12630 }, { "entropy": 5.214424085617066, "epoch": 1.2137367915465898, "grad_norm": 1.25, "learning_rate": 0.00048600659485999073, "loss": 4.9823, "mean_token_accuracy": 0.21401938945055007, "num_tokens": 28972604.0, "step": 12635 }, { "entropy": 5.171445035934449, "epoch": 1.2142170989433236, "grad_norm": 1.2109375, "learning_rate": 0.00048599469064009027, "loss": 4.9781, "mean_token_accuracy": 0.2194841518998146, "num_tokens": 28983617.0, "step": 12640 }, { "entropy": 5.171384048461914, "epoch": 1.2146974063400577, "grad_norm": 1.3046875, "learning_rate": 0.00048598278152151974, "loss": 5.0017, "mean_token_accuracy": 0.21235128194093705, "num_tokens": 28994277.0, "step": 12645 }, { "entropy": 5.2698729038238525, "epoch": 1.2151777137367916, "grad_norm": 1.2734375, "learning_rate": 0.0004859708675045555, "loss": 5.0415, "mean_token_accuracy": 0.21211624890565872, "num_tokens": 29004844.0, "step": 12650 }, { "entropy": 5.067803430557251, "epoch": 1.2156580211335255, "grad_norm": 1.3125, "learning_rate": 0.0004859589485894741, "loss": 4.9214, "mean_token_accuracy": 0.21981519609689712, "num_tokens": 29015978.0, "step": 12655 }, { "entropy": 5.245090198516846, "epoch": 1.2161383285302594, "grad_norm": 1.2109375, "learning_rate": 0.0004859470247765524, "loss": 5.0228, "mean_token_accuracy": 0.21242079883813858, "num_tokens": 29026966.0, "step": 12660 }, { "entropy": 5.231347465515137, "epoch": 1.2166186359269933, "grad_norm": 1.2265625, "learning_rate": 0.0004859350960660671, "loss": 4.9512, "mean_token_accuracy": 0.2151286020874977, "num_tokens": 29037943.0, "step": 12665 }, { "entropy": 5.210123300552368, "epoch": 1.2170989433237271, "grad_norm": 1.515625, "learning_rate": 0.0004859231624582953, "loss": 5.0191, "mean_token_accuracy": 0.2085331290960312, "num_tokens": 29049939.0, "step": 12670 }, { "entropy": 5.1238236904144285, "epoch": 1.217579250720461, "grad_norm": 1.2890625, "learning_rate": 0.00048591122395351394, "loss": 5.0294, "mean_token_accuracy": 0.21389884501695633, "num_tokens": 29062442.0, "step": 12675 }, { "entropy": 5.19063811302185, "epoch": 1.218059558117195, "grad_norm": 1.2109375, "learning_rate": 0.0004858992805520003, "loss": 5.0303, "mean_token_accuracy": 0.21078938841819764, "num_tokens": 29074240.0, "step": 12680 }, { "entropy": 5.28305025100708, "epoch": 1.218539865513929, "grad_norm": 1.1796875, "learning_rate": 0.00048588733225403153, "loss": 5.0248, "mean_token_accuracy": 0.20675573348999024, "num_tokens": 29085550.0, "step": 12685 }, { "entropy": 5.132738399505615, "epoch": 1.219020172910663, "grad_norm": 1.375, "learning_rate": 0.0004858753790598851, "loss": 4.9732, "mean_token_accuracy": 0.21328083127737046, "num_tokens": 29097092.0, "step": 12690 }, { "entropy": 5.131064367294312, "epoch": 1.2195004803073968, "grad_norm": 1.2265625, "learning_rate": 0.0004858634209698386, "loss": 5.0119, "mean_token_accuracy": 0.21203078627586364, "num_tokens": 29108119.0, "step": 12695 }, { "entropy": 5.2007276058197025, "epoch": 1.2199807877041307, "grad_norm": 1.234375, "learning_rate": 0.00048585145798416956, "loss": 4.9885, "mean_token_accuracy": 0.20587167888879776, "num_tokens": 29120605.0, "step": 12700 }, { "entropy": 5.203617095947266, "epoch": 1.2204610951008645, "grad_norm": 1.25, "learning_rate": 0.0004858394901031558, "loss": 4.9715, "mean_token_accuracy": 0.21140657663345336, "num_tokens": 29131582.0, "step": 12705 }, { "entropy": 5.226724433898926, "epoch": 1.2209414024975984, "grad_norm": 1.3515625, "learning_rate": 0.0004858275173270751, "loss": 4.9982, "mean_token_accuracy": 0.2063015416264534, "num_tokens": 29143436.0, "step": 12710 }, { "entropy": 5.195918226242066, "epoch": 1.2214217098943323, "grad_norm": 1.328125, "learning_rate": 0.00048581553965620553, "loss": 4.9219, "mean_token_accuracy": 0.2154676854610443, "num_tokens": 29154445.0, "step": 12715 }, { "entropy": 5.198876190185547, "epoch": 1.2219020172910664, "grad_norm": 1.296875, "learning_rate": 0.00048580355709082506, "loss": 5.0403, "mean_token_accuracy": 0.21526143848896026, "num_tokens": 29164599.0, "step": 12720 }, { "entropy": 5.141847133636475, "epoch": 1.2223823246878003, "grad_norm": 1.1953125, "learning_rate": 0.000485791569631212, "loss": 4.9984, "mean_token_accuracy": 0.20752517729997635, "num_tokens": 29176275.0, "step": 12725 }, { "entropy": 5.235421657562256, "epoch": 1.2228626320845342, "grad_norm": 1.1875, "learning_rate": 0.0004857795772776446, "loss": 5.0038, "mean_token_accuracy": 0.20879749357700347, "num_tokens": 29189102.0, "step": 12730 }, { "entropy": 5.225815057754517, "epoch": 1.223342939481268, "grad_norm": 1.2890625, "learning_rate": 0.00048576758003040127, "loss": 5.0339, "mean_token_accuracy": 0.2110910639166832, "num_tokens": 29200953.0, "step": 12735 }, { "entropy": 5.213280534744262, "epoch": 1.223823246878002, "grad_norm": 1.3671875, "learning_rate": 0.00048575557788976066, "loss": 5.0438, "mean_token_accuracy": 0.20327647179365158, "num_tokens": 29212942.0, "step": 12740 }, { "entropy": 5.164249658584595, "epoch": 1.2243035542747358, "grad_norm": 1.21875, "learning_rate": 0.0004857435708560013, "loss": 4.9348, "mean_token_accuracy": 0.21420682966709137, "num_tokens": 29224949.0, "step": 12745 }, { "entropy": 5.280761194229126, "epoch": 1.2247838616714697, "grad_norm": 1.3125, "learning_rate": 0.00048573155892940204, "loss": 5.0932, "mean_token_accuracy": 0.20139861702919007, "num_tokens": 29236044.0, "step": 12750 }, { "entropy": 5.2009320735931395, "epoch": 1.2252641690682036, "grad_norm": 1.0390625, "learning_rate": 0.00048571954211024164, "loss": 4.9868, "mean_token_accuracy": 0.21266197860240937, "num_tokens": 29248084.0, "step": 12755 }, { "entropy": 5.135626983642578, "epoch": 1.2257444764649374, "grad_norm": 1.28125, "learning_rate": 0.00048570752039879924, "loss": 4.873, "mean_token_accuracy": 0.22127241939306258, "num_tokens": 29258710.0, "step": 12760 }, { "entropy": 5.214362525939942, "epoch": 1.2262247838616716, "grad_norm": 1.234375, "learning_rate": 0.0004856954937953539, "loss": 5.0884, "mean_token_accuracy": 0.20115942060947417, "num_tokens": 29270173.0, "step": 12765 }, { "entropy": 5.278593635559082, "epoch": 1.2267050912584054, "grad_norm": 1.1953125, "learning_rate": 0.0004856834623001848, "loss": 5.0685, "mean_token_accuracy": 0.20889558047056198, "num_tokens": 29280407.0, "step": 12770 }, { "entropy": 5.135749340057373, "epoch": 1.2271853986551393, "grad_norm": 1.265625, "learning_rate": 0.0004856714259135713, "loss": 4.9413, "mean_token_accuracy": 0.2196897506713867, "num_tokens": 29292287.0, "step": 12775 }, { "entropy": 5.2047443866729735, "epoch": 1.2276657060518732, "grad_norm": 1.3671875, "learning_rate": 0.0004856593846357929, "loss": 5.0592, "mean_token_accuracy": 0.20803760290145873, "num_tokens": 29303099.0, "step": 12780 }, { "entropy": 5.2750386714935305, "epoch": 1.228146013448607, "grad_norm": 1.2890625, "learning_rate": 0.0004856473384671291, "loss": 5.1128, "mean_token_accuracy": 0.2065555065870285, "num_tokens": 29314445.0, "step": 12785 }, { "entropy": 5.146969270706177, "epoch": 1.228626320845341, "grad_norm": 2.125, "learning_rate": 0.00048563528740785955, "loss": 4.9752, "mean_token_accuracy": 0.20982863157987594, "num_tokens": 29325309.0, "step": 12790 }, { "entropy": 5.152886533737183, "epoch": 1.2291066282420748, "grad_norm": 1.109375, "learning_rate": 0.00048562323145826414, "loss": 4.9259, "mean_token_accuracy": 0.22105071544647217, "num_tokens": 29338582.0, "step": 12795 }, { "entropy": 5.207387828826905, "epoch": 1.229586935638809, "grad_norm": 1.2421875, "learning_rate": 0.0004856111706186227, "loss": 4.9922, "mean_token_accuracy": 0.21350787281990052, "num_tokens": 29349875.0, "step": 12800 }, { "entropy": 5.283040285110474, "epoch": 1.2300672430355428, "grad_norm": 1.1640625, "learning_rate": 0.00048559910488921534, "loss": 5.1049, "mean_token_accuracy": 0.2001900017261505, "num_tokens": 29361800.0, "step": 12805 }, { "entropy": 5.165265846252441, "epoch": 1.2305475504322767, "grad_norm": 1.2421875, "learning_rate": 0.000485587034270322, "loss": 4.9814, "mean_token_accuracy": 0.21499158591032028, "num_tokens": 29372795.0, "step": 12810 }, { "entropy": 5.20758466720581, "epoch": 1.2310278578290106, "grad_norm": 1.1328125, "learning_rate": 0.000485574958762223, "loss": 5.0659, "mean_token_accuracy": 0.21106487214565278, "num_tokens": 29385391.0, "step": 12815 }, { "entropy": 5.130280923843384, "epoch": 1.2315081652257445, "grad_norm": 1.265625, "learning_rate": 0.00048556287836519886, "loss": 4.9144, "mean_token_accuracy": 0.22030035853385926, "num_tokens": 29397113.0, "step": 12820 }, { "entropy": 5.277672386169433, "epoch": 1.2319884726224783, "grad_norm": 1.3125, "learning_rate": 0.0004855507930795299, "loss": 5.097, "mean_token_accuracy": 0.21232483088970183, "num_tokens": 29407552.0, "step": 12825 }, { "entropy": 5.186304426193237, "epoch": 1.2324687800192122, "grad_norm": 1.25, "learning_rate": 0.00048553870290549665, "loss": 5.0012, "mean_token_accuracy": 0.21472340673208237, "num_tokens": 29418500.0, "step": 12830 }, { "entropy": 5.229617691040039, "epoch": 1.232949087415946, "grad_norm": 1.171875, "learning_rate": 0.00048552660784338, "loss": 5.0069, "mean_token_accuracy": 0.21694095134735109, "num_tokens": 29430335.0, "step": 12835 }, { "entropy": 5.128983354568481, "epoch": 1.23342939481268, "grad_norm": 1.1640625, "learning_rate": 0.0004855145078934606, "loss": 5.0013, "mean_token_accuracy": 0.20726215988397598, "num_tokens": 29441435.0, "step": 12840 }, { "entropy": 5.22078046798706, "epoch": 1.233909702209414, "grad_norm": 1.2578125, "learning_rate": 0.0004855024030560195, "loss": 4.9875, "mean_token_accuracy": 0.21403325647115706, "num_tokens": 29453140.0, "step": 12845 }, { "entropy": 5.107886886596679, "epoch": 1.234390009606148, "grad_norm": 1.2890625, "learning_rate": 0.0004854902933313376, "loss": 4.9168, "mean_token_accuracy": 0.2206453412771225, "num_tokens": 29464572.0, "step": 12850 }, { "entropy": 5.273893547058106, "epoch": 1.2348703170028819, "grad_norm": 1.21875, "learning_rate": 0.00048547817871969607, "loss": 5.1412, "mean_token_accuracy": 0.19843536466360093, "num_tokens": 29477069.0, "step": 12855 }, { "entropy": 5.258860635757446, "epoch": 1.2353506243996157, "grad_norm": 1.1875, "learning_rate": 0.00048546605922137633, "loss": 5.0259, "mean_token_accuracy": 0.21055852621793747, "num_tokens": 29486860.0, "step": 12860 }, { "entropy": 5.241803359985352, "epoch": 1.2358309317963496, "grad_norm": 1.1953125, "learning_rate": 0.0004854539348366596, "loss": 5.0934, "mean_token_accuracy": 0.20382969826459885, "num_tokens": 29499129.0, "step": 12865 }, { "entropy": 5.1568663120269775, "epoch": 1.2363112391930835, "grad_norm": 1.25, "learning_rate": 0.0004854418055658274, "loss": 4.9683, "mean_token_accuracy": 0.21473043411970139, "num_tokens": 29510764.0, "step": 12870 }, { "entropy": 5.179628610610962, "epoch": 1.2367915465898176, "grad_norm": 1.21875, "learning_rate": 0.00048542967140916134, "loss": 5.0443, "mean_token_accuracy": 0.2080310419201851, "num_tokens": 29522882.0, "step": 12875 }, { "entropy": 5.231398963928223, "epoch": 1.2372718539865515, "grad_norm": 1.234375, "learning_rate": 0.0004854175323669432, "loss": 4.9823, "mean_token_accuracy": 0.20900345593690872, "num_tokens": 29533348.0, "step": 12880 }, { "entropy": 5.1414636135101315, "epoch": 1.2377521613832854, "grad_norm": 1.0859375, "learning_rate": 0.0004854053884394547, "loss": 4.9611, "mean_token_accuracy": 0.2162790149450302, "num_tokens": 29545649.0, "step": 12885 }, { "entropy": 5.209917974472046, "epoch": 1.2382324687800192, "grad_norm": 1.1484375, "learning_rate": 0.00048539323962697796, "loss": 5.0101, "mean_token_accuracy": 0.20617685168981553, "num_tokens": 29558252.0, "step": 12890 }, { "entropy": 5.315207386016846, "epoch": 1.2387127761767531, "grad_norm": 1.2109375, "learning_rate": 0.0004853810859297949, "loss": 5.1166, "mean_token_accuracy": 0.21191854476928712, "num_tokens": 29569495.0, "step": 12895 }, { "entropy": 5.209201908111572, "epoch": 1.239193083573487, "grad_norm": 1.2109375, "learning_rate": 0.00048536892734818773, "loss": 4.9999, "mean_token_accuracy": 0.211149762570858, "num_tokens": 29582167.0, "step": 12900 }, { "entropy": 5.23478512763977, "epoch": 1.239673390970221, "grad_norm": 1.1875, "learning_rate": 0.0004853567638824387, "loss": 5.0181, "mean_token_accuracy": 0.2111159771680832, "num_tokens": 29593964.0, "step": 12905 }, { "entropy": 5.227269411087036, "epoch": 1.2401536983669548, "grad_norm": 1.3046875, "learning_rate": 0.00048534459553283026, "loss": 4.9755, "mean_token_accuracy": 0.21934866458177565, "num_tokens": 29604844.0, "step": 12910 }, { "entropy": 5.219102811813355, "epoch": 1.2406340057636887, "grad_norm": 1.203125, "learning_rate": 0.0004853324222996449, "loss": 5.0165, "mean_token_accuracy": 0.21728123128414153, "num_tokens": 29615219.0, "step": 12915 }, { "entropy": 5.303751039505005, "epoch": 1.2411143131604228, "grad_norm": 1.171875, "learning_rate": 0.00048532024418316525, "loss": 5.1028, "mean_token_accuracy": 0.2071371465921402, "num_tokens": 29626472.0, "step": 12920 }, { "entropy": 5.16833004951477, "epoch": 1.2415946205571566, "grad_norm": 1.2890625, "learning_rate": 0.0004853080611836741, "loss": 4.9767, "mean_token_accuracy": 0.2196236953139305, "num_tokens": 29637966.0, "step": 12925 }, { "entropy": 5.132480478286743, "epoch": 1.2420749279538905, "grad_norm": 1.1875, "learning_rate": 0.00048529587330145427, "loss": 4.906, "mean_token_accuracy": 0.2214494377374649, "num_tokens": 29648730.0, "step": 12930 }, { "entropy": 5.132258462905884, "epoch": 1.2425552353506244, "grad_norm": 1.3046875, "learning_rate": 0.00048528368053678863, "loss": 4.929, "mean_token_accuracy": 0.2147599697113037, "num_tokens": 29660576.0, "step": 12935 }, { "entropy": 5.175818014144897, "epoch": 1.2430355427473583, "grad_norm": 1.15625, "learning_rate": 0.0004852714828899604, "loss": 4.9604, "mean_token_accuracy": 0.2147279053926468, "num_tokens": 29672906.0, "step": 12940 }, { "entropy": 5.156858634948731, "epoch": 1.2435158501440922, "grad_norm": 1.140625, "learning_rate": 0.00048525928036125264, "loss": 4.9559, "mean_token_accuracy": 0.21674090325832368, "num_tokens": 29685360.0, "step": 12945 }, { "entropy": 5.184188318252564, "epoch": 1.243996157540826, "grad_norm": 1.3203125, "learning_rate": 0.00048524707295094884, "loss": 4.9588, "mean_token_accuracy": 0.2069990187883377, "num_tokens": 29697257.0, "step": 12950 }, { "entropy": 5.153713750839233, "epoch": 1.2444764649375601, "grad_norm": 1.3515625, "learning_rate": 0.0004852348606593322, "loss": 4.9132, "mean_token_accuracy": 0.220682792365551, "num_tokens": 29707877.0, "step": 12955 }, { "entropy": 5.208719635009766, "epoch": 1.244956772334294, "grad_norm": 1.1796875, "learning_rate": 0.00048522264348668646, "loss": 4.9975, "mean_token_accuracy": 0.21275103688240052, "num_tokens": 29719358.0, "step": 12960 }, { "entropy": 5.10908875465393, "epoch": 1.245437079731028, "grad_norm": 1.2421875, "learning_rate": 0.0004852104214332951, "loss": 4.8733, "mean_token_accuracy": 0.22901579290628432, "num_tokens": 29730383.0, "step": 12965 }, { "entropy": 5.184739780426026, "epoch": 1.2459173871277618, "grad_norm": 1.296875, "learning_rate": 0.00048519819449944205, "loss": 4.9995, "mean_token_accuracy": 0.21587478816509248, "num_tokens": 29741142.0, "step": 12970 }, { "entropy": 5.156636571884155, "epoch": 1.2463976945244957, "grad_norm": 1.203125, "learning_rate": 0.000485185962685411, "loss": 4.9697, "mean_token_accuracy": 0.21590882092714309, "num_tokens": 29754618.0, "step": 12975 }, { "entropy": 5.214703130722046, "epoch": 1.2468780019212296, "grad_norm": 1.4453125, "learning_rate": 0.000485173725991486, "loss": 4.9845, "mean_token_accuracy": 0.21426209211349487, "num_tokens": 29767115.0, "step": 12980 }, { "entropy": 5.226226806640625, "epoch": 1.2473583093179634, "grad_norm": 1.1875, "learning_rate": 0.00048516148441795124, "loss": 5.055, "mean_token_accuracy": 0.21063894778490067, "num_tokens": 29778165.0, "step": 12985 }, { "entropy": 5.291137981414795, "epoch": 1.2478386167146973, "grad_norm": 1.125, "learning_rate": 0.0004851492379650908, "loss": 5.1231, "mean_token_accuracy": 0.2022738501429558, "num_tokens": 29790528.0, "step": 12990 }, { "entropy": 5.202538394927979, "epoch": 1.2483189241114312, "grad_norm": 1.1171875, "learning_rate": 0.0004851369866331891, "loss": 4.9323, "mean_token_accuracy": 0.20517953634262084, "num_tokens": 29801709.0, "step": 12995 }, { "entropy": 5.175222253799438, "epoch": 1.2487992315081653, "grad_norm": 1.203125, "learning_rate": 0.0004851247304225306, "loss": 5.0036, "mean_token_accuracy": 0.2128250151872635, "num_tokens": 29812963.0, "step": 13000 }, { "entropy": 5.2060657978057865, "epoch": 1.2492795389048992, "grad_norm": 1.2265625, "learning_rate": 0.0004851124693333997, "loss": 4.9772, "mean_token_accuracy": 0.2129211023449898, "num_tokens": 29823711.0, "step": 13005 }, { "entropy": 5.165619707107544, "epoch": 1.249759846301633, "grad_norm": 1.296875, "learning_rate": 0.0004851002033660812, "loss": 4.9446, "mean_token_accuracy": 0.21848293840885163, "num_tokens": 29834038.0, "step": 13010 }, { "entropy": 5.213901424407959, "epoch": 1.250240153698367, "grad_norm": 1.2734375, "learning_rate": 0.00048508793252085994, "loss": 4.9833, "mean_token_accuracy": 0.21572160869836807, "num_tokens": 29844759.0, "step": 13015 }, { "entropy": 5.092281866073608, "epoch": 1.2507204610951008, "grad_norm": 1.296875, "learning_rate": 0.0004850756567980206, "loss": 4.8518, "mean_token_accuracy": 0.21635698527097702, "num_tokens": 29855643.0, "step": 13020 }, { "entropy": 5.237672472000122, "epoch": 1.2512007684918347, "grad_norm": 1.1328125, "learning_rate": 0.00048506337619784836, "loss": 5.0672, "mean_token_accuracy": 0.20816876441240312, "num_tokens": 29866917.0, "step": 13025 }, { "entropy": 5.285785484313965, "epoch": 1.2516810758885688, "grad_norm": 1.1484375, "learning_rate": 0.0004850510907206283, "loss": 5.1273, "mean_token_accuracy": 0.20473649799823762, "num_tokens": 29878937.0, "step": 13030 }, { "entropy": 5.139752054214478, "epoch": 1.2521613832853027, "grad_norm": 1.390625, "learning_rate": 0.00048503880036664555, "loss": 4.9387, "mean_token_accuracy": 0.2179243117570877, "num_tokens": 29889544.0, "step": 13035 }, { "entropy": 5.133181190490722, "epoch": 1.2526416906820366, "grad_norm": 1.2578125, "learning_rate": 0.0004850265051361857, "loss": 4.9495, "mean_token_accuracy": 0.21097120344638826, "num_tokens": 29901919.0, "step": 13040 }, { "entropy": 5.163506126403808, "epoch": 1.2531219980787704, "grad_norm": 1.1953125, "learning_rate": 0.0004850142050295339, "loss": 4.9949, "mean_token_accuracy": 0.21175018399953843, "num_tokens": 29913870.0, "step": 13045 }, { "entropy": 5.183034372329712, "epoch": 1.2536023054755043, "grad_norm": 1.3828125, "learning_rate": 0.00048500190004697595, "loss": 4.9554, "mean_token_accuracy": 0.21792073249816896, "num_tokens": 29925180.0, "step": 13050 }, { "entropy": 5.097564458847046, "epoch": 1.2540826128722382, "grad_norm": 1.265625, "learning_rate": 0.0004849895901887974, "loss": 4.8784, "mean_token_accuracy": 0.22044360488653184, "num_tokens": 29936433.0, "step": 13055 }, { "entropy": 5.19610743522644, "epoch": 1.254562920268972, "grad_norm": 1.296875, "learning_rate": 0.0004849772754552842, "loss": 5.074, "mean_token_accuracy": 0.20816617459058762, "num_tokens": 29948891.0, "step": 13060 }, { "entropy": 5.186704921722412, "epoch": 1.255043227665706, "grad_norm": 1.2109375, "learning_rate": 0.00048496495584672214, "loss": 4.884, "mean_token_accuracy": 0.21899646669626235, "num_tokens": 29960113.0, "step": 13065 }, { "entropy": 5.193692255020141, "epoch": 1.2555235350624399, "grad_norm": 1.1640625, "learning_rate": 0.00048495263136339725, "loss": 5.0114, "mean_token_accuracy": 0.20587489008903503, "num_tokens": 29972168.0, "step": 13070 }, { "entropy": 5.169920969009399, "epoch": 1.2560038424591737, "grad_norm": 1.2578125, "learning_rate": 0.0004849403020055956, "loss": 5.021, "mean_token_accuracy": 0.21360062509775163, "num_tokens": 29982996.0, "step": 13075 }, { "entropy": 5.186982870101929, "epoch": 1.2564841498559078, "grad_norm": 1.375, "learning_rate": 0.00048492796777360373, "loss": 5.0222, "mean_token_accuracy": 0.20997272729873656, "num_tokens": 29994088.0, "step": 13080 }, { "entropy": 5.285563182830811, "epoch": 1.2569644572526417, "grad_norm": 1.28125, "learning_rate": 0.00048491562866770767, "loss": 5.0864, "mean_token_accuracy": 0.2084256410598755, "num_tokens": 30005403.0, "step": 13085 }, { "entropy": 5.225534963607788, "epoch": 1.2574447646493756, "grad_norm": 1.2421875, "learning_rate": 0.00048490328468819404, "loss": 4.9807, "mean_token_accuracy": 0.22016366571187973, "num_tokens": 30015961.0, "step": 13090 }, { "entropy": 5.1508348941802975, "epoch": 1.2579250720461095, "grad_norm": 1.2421875, "learning_rate": 0.00048489093583534945, "loss": 4.9337, "mean_token_accuracy": 0.21542756259441376, "num_tokens": 30026670.0, "step": 13095 }, { "entropy": 5.179332733154297, "epoch": 1.2584053794428434, "grad_norm": 1.2578125, "learning_rate": 0.0004848785821094606, "loss": 4.9697, "mean_token_accuracy": 0.21637785881757737, "num_tokens": 30036711.0, "step": 13100 }, { "entropy": 5.204781723022461, "epoch": 1.2588856868395775, "grad_norm": 1.234375, "learning_rate": 0.0004848662235108142, "loss": 5.0481, "mean_token_accuracy": 0.20675273686647416, "num_tokens": 30047587.0, "step": 13105 }, { "entropy": 5.196116733551025, "epoch": 1.2593659942363113, "grad_norm": 1.4765625, "learning_rate": 0.0004848538600396973, "loss": 4.982, "mean_token_accuracy": 0.21352463364601135, "num_tokens": 30059348.0, "step": 13110 }, { "entropy": 5.233518457412719, "epoch": 1.2598463016330452, "grad_norm": 1.3125, "learning_rate": 0.00048484149169639694, "loss": 4.9836, "mean_token_accuracy": 0.21362563073635102, "num_tokens": 30070485.0, "step": 13115 }, { "entropy": 5.15550799369812, "epoch": 1.260326609029779, "grad_norm": 1.265625, "learning_rate": 0.0004848291184812003, "loss": 4.9135, "mean_token_accuracy": 0.22237775921821595, "num_tokens": 30081114.0, "step": 13120 }, { "entropy": 5.186419725418091, "epoch": 1.260806916426513, "grad_norm": 1.2421875, "learning_rate": 0.0004848167403943945, "loss": 5.0575, "mean_token_accuracy": 0.2090092420578003, "num_tokens": 30092634.0, "step": 13125 }, { "entropy": 5.190171480178833, "epoch": 1.2612872238232469, "grad_norm": 1.1171875, "learning_rate": 0.00048480435743626703, "loss": 4.9924, "mean_token_accuracy": 0.21862466484308243, "num_tokens": 30104205.0, "step": 13130 }, { "entropy": 5.219733333587646, "epoch": 1.2617675312199808, "grad_norm": 1.40625, "learning_rate": 0.0004847919696071054, "loss": 5.014, "mean_token_accuracy": 0.21289038211107253, "num_tokens": 30116978.0, "step": 13135 }, { "entropy": 5.1809934139251705, "epoch": 1.2622478386167146, "grad_norm": 1.1640625, "learning_rate": 0.00048477957690719716, "loss": 4.9081, "mean_token_accuracy": 0.21542966216802598, "num_tokens": 30128549.0, "step": 13140 }, { "entropy": 5.242063808441162, "epoch": 1.2627281460134485, "grad_norm": 1.3359375, "learning_rate": 0.0004847671793368301, "loss": 5.0544, "mean_token_accuracy": 0.2094632938504219, "num_tokens": 30139492.0, "step": 13145 }, { "entropy": 5.182856559753418, "epoch": 1.2632084534101824, "grad_norm": 1.2734375, "learning_rate": 0.000484754776896292, "loss": 4.969, "mean_token_accuracy": 0.21324526518583298, "num_tokens": 30150450.0, "step": 13150 }, { "entropy": 5.2239217281341555, "epoch": 1.2636887608069165, "grad_norm": 1.2734375, "learning_rate": 0.0004847423695858708, "loss": 5.0259, "mean_token_accuracy": 0.21593111753463745, "num_tokens": 30162204.0, "step": 13155 }, { "entropy": 5.2082499980926515, "epoch": 1.2641690682036504, "grad_norm": 1.234375, "learning_rate": 0.00048472995740585456, "loss": 4.971, "mean_token_accuracy": 0.210064397752285, "num_tokens": 30172574.0, "step": 13160 }, { "entropy": 5.113088941574096, "epoch": 1.2646493756003843, "grad_norm": 1.3125, "learning_rate": 0.0004847175403565316, "loss": 4.9037, "mean_token_accuracy": 0.21865027099847795, "num_tokens": 30183957.0, "step": 13165 }, { "entropy": 5.228566980361938, "epoch": 1.2651296829971181, "grad_norm": 1.2421875, "learning_rate": 0.00048470511843818996, "loss": 4.9679, "mean_token_accuracy": 0.2197330266237259, "num_tokens": 30194207.0, "step": 13170 }, { "entropy": 5.143984985351563, "epoch": 1.265609990393852, "grad_norm": 1.2734375, "learning_rate": 0.0004846926916511182, "loss": 4.9185, "mean_token_accuracy": 0.22421342581510545, "num_tokens": 30205180.0, "step": 13175 }, { "entropy": 5.170929908752441, "epoch": 1.266090297790586, "grad_norm": 1.3203125, "learning_rate": 0.0004846802599956048, "loss": 4.9828, "mean_token_accuracy": 0.20965515226125717, "num_tokens": 30217734.0, "step": 13180 }, { "entropy": 5.29590859413147, "epoch": 1.26657060518732, "grad_norm": 1.1796875, "learning_rate": 0.00048466782347193847, "loss": 5.2001, "mean_token_accuracy": 0.1999218687415123, "num_tokens": 30229835.0, "step": 13185 }, { "entropy": 5.277102136611939, "epoch": 1.267050912584054, "grad_norm": 1.2734375, "learning_rate": 0.00048465538208040775, "loss": 5.0373, "mean_token_accuracy": 0.20946380198001863, "num_tokens": 30241932.0, "step": 13190 }, { "entropy": 5.211878299713135, "epoch": 1.2675312199807878, "grad_norm": 1.2734375, "learning_rate": 0.00048464293582130166, "loss": 5.0248, "mean_token_accuracy": 0.21039628088474274, "num_tokens": 30253149.0, "step": 13195 }, { "entropy": 5.256510972976685, "epoch": 1.2680115273775217, "grad_norm": 1.2890625, "learning_rate": 0.0004846304846949091, "loss": 4.9547, "mean_token_accuracy": 0.2133228898048401, "num_tokens": 30264083.0, "step": 13200 }, { "entropy": 5.156170415878296, "epoch": 1.2684918347742555, "grad_norm": 1.4453125, "learning_rate": 0.00048461802870151916, "loss": 4.9245, "mean_token_accuracy": 0.22115042805671692, "num_tokens": 30274832.0, "step": 13205 }, { "entropy": 5.104134511947632, "epoch": 1.2689721421709894, "grad_norm": 1.40625, "learning_rate": 0.00048460556784142106, "loss": 4.9446, "mean_token_accuracy": 0.21649524569511414, "num_tokens": 30284945.0, "step": 13210 }, { "entropy": 5.27008090019226, "epoch": 1.2694524495677233, "grad_norm": 1.25, "learning_rate": 0.00048459310211490406, "loss": 4.9969, "mean_token_accuracy": 0.2195618912577629, "num_tokens": 30295133.0, "step": 13215 }, { "entropy": 5.191392421722412, "epoch": 1.2699327569644572, "grad_norm": 1.6171875, "learning_rate": 0.0004845806315222576, "loss": 5.0609, "mean_token_accuracy": 0.20277179926633834, "num_tokens": 30305268.0, "step": 13220 }, { "entropy": 5.123500633239746, "epoch": 1.270413064361191, "grad_norm": 1.2265625, "learning_rate": 0.0004845681560637711, "loss": 4.9226, "mean_token_accuracy": 0.21860510110855103, "num_tokens": 30317118.0, "step": 13225 }, { "entropy": 5.243611288070679, "epoch": 1.270893371757925, "grad_norm": 1.390625, "learning_rate": 0.0004845556757397344, "loss": 5.0812, "mean_token_accuracy": 0.20786524415016175, "num_tokens": 30328684.0, "step": 13230 }, { "entropy": 5.27822527885437, "epoch": 1.271373679154659, "grad_norm": 1.2265625, "learning_rate": 0.0004845431905504372, "loss": 5.0788, "mean_token_accuracy": 0.2057919830083847, "num_tokens": 30341433.0, "step": 13235 }, { "entropy": 5.1837303161621096, "epoch": 1.271853986551393, "grad_norm": 1.15625, "learning_rate": 0.00048453070049616926, "loss": 4.958, "mean_token_accuracy": 0.2240109384059906, "num_tokens": 30353159.0, "step": 13240 }, { "entropy": 5.156756496429443, "epoch": 1.2723342939481268, "grad_norm": 1.4921875, "learning_rate": 0.00048451820557722064, "loss": 5.0083, "mean_token_accuracy": 0.21551052629947662, "num_tokens": 30363251.0, "step": 13245 }, { "entropy": 5.246157121658325, "epoch": 1.2728146013448607, "grad_norm": 1.1328125, "learning_rate": 0.0004845057057938815, "loss": 5.0621, "mean_token_accuracy": 0.21401735842227937, "num_tokens": 30375850.0, "step": 13250 }, { "entropy": 5.240186405181885, "epoch": 1.2732949087415946, "grad_norm": 1.2109375, "learning_rate": 0.00048449320114644185, "loss": 5.0836, "mean_token_accuracy": 0.20593365728855134, "num_tokens": 30386839.0, "step": 13255 }, { "entropy": 5.22416672706604, "epoch": 1.2737752161383284, "grad_norm": 1.25, "learning_rate": 0.0004844806916351922, "loss": 5.052, "mean_token_accuracy": 0.2087215930223465, "num_tokens": 30398872.0, "step": 13260 }, { "entropy": 5.206205415725708, "epoch": 1.2742555235350626, "grad_norm": 1.421875, "learning_rate": 0.0004844681772604229, "loss": 4.962, "mean_token_accuracy": 0.22111569941043854, "num_tokens": 30409581.0, "step": 13265 }, { "entropy": 5.173876953125, "epoch": 1.2747358309317964, "grad_norm": 1.3046875, "learning_rate": 0.00048445565802242454, "loss": 4.9982, "mean_token_accuracy": 0.214154152572155, "num_tokens": 30420209.0, "step": 13270 }, { "entropy": 5.225718021392822, "epoch": 1.2752161383285303, "grad_norm": 1.3984375, "learning_rate": 0.0004844431339214878, "loss": 5.0296, "mean_token_accuracy": 0.21498659551143645, "num_tokens": 30432093.0, "step": 13275 }, { "entropy": 5.192876482009888, "epoch": 1.2756964457252642, "grad_norm": 1.21875, "learning_rate": 0.0004844306049579034, "loss": 4.9477, "mean_token_accuracy": 0.21263082027435304, "num_tokens": 30442796.0, "step": 13280 }, { "entropy": 5.20331597328186, "epoch": 1.276176753121998, "grad_norm": 1.234375, "learning_rate": 0.00048441807113196216, "loss": 4.9849, "mean_token_accuracy": 0.20858001410961152, "num_tokens": 30455226.0, "step": 13285 }, { "entropy": 5.262969589233398, "epoch": 1.276657060518732, "grad_norm": 1.3515625, "learning_rate": 0.00048440553244395517, "loss": 5.0852, "mean_token_accuracy": 0.20551335960626602, "num_tokens": 30467082.0, "step": 13290 }, { "entropy": 5.207232666015625, "epoch": 1.2771373679154658, "grad_norm": 1.234375, "learning_rate": 0.00048439298889417357, "loss": 4.9857, "mean_token_accuracy": 0.20911924540996552, "num_tokens": 30479051.0, "step": 13295 }, { "entropy": 5.192299127578735, "epoch": 1.2776176753121997, "grad_norm": 1.3203125, "learning_rate": 0.00048438044048290847, "loss": 5.0429, "mean_token_accuracy": 0.20957115888595582, "num_tokens": 30489989.0, "step": 13300 }, { "entropy": 5.226817989349366, "epoch": 1.2780979827089336, "grad_norm": 1.5, "learning_rate": 0.00048436788721045135, "loss": 4.9441, "mean_token_accuracy": 0.21679565608501433, "num_tokens": 30501533.0, "step": 13305 }, { "entropy": 5.203074645996094, "epoch": 1.2785782901056677, "grad_norm": 1.171875, "learning_rate": 0.0004843553290770935, "loss": 5.029, "mean_token_accuracy": 0.2081605538725853, "num_tokens": 30512231.0, "step": 13310 }, { "entropy": 5.154972076416016, "epoch": 1.2790585975024016, "grad_norm": 1.171875, "learning_rate": 0.0004843427660831266, "loss": 5.0001, "mean_token_accuracy": 0.21241324096918107, "num_tokens": 30523204.0, "step": 13315 }, { "entropy": 5.282387590408325, "epoch": 1.2795389048991355, "grad_norm": 1.1171875, "learning_rate": 0.00048433019822884235, "loss": 5.1216, "mean_token_accuracy": 0.20325924307107926, "num_tokens": 30534956.0, "step": 13320 }, { "entropy": 5.182562732696534, "epoch": 1.2800192122958693, "grad_norm": 1.1953125, "learning_rate": 0.0004843176255145325, "loss": 4.9731, "mean_token_accuracy": 0.21960055232048034, "num_tokens": 30545938.0, "step": 13325 }, { "entropy": 5.180881690979004, "epoch": 1.2804995196926032, "grad_norm": 1.3203125, "learning_rate": 0.0004843050479404888, "loss": 4.915, "mean_token_accuracy": 0.21800871789455414, "num_tokens": 30557323.0, "step": 13330 }, { "entropy": 5.227671194076538, "epoch": 1.280979827089337, "grad_norm": 1.2109375, "learning_rate": 0.00048429246550700343, "loss": 4.9882, "mean_token_accuracy": 0.21517169177532197, "num_tokens": 30569960.0, "step": 13335 }, { "entropy": 5.110123968124389, "epoch": 1.2814601344860712, "grad_norm": 1.328125, "learning_rate": 0.0004842798782143686, "loss": 5.0236, "mean_token_accuracy": 0.20901857316493988, "num_tokens": 30581904.0, "step": 13340 }, { "entropy": 5.196750164031982, "epoch": 1.281940441882805, "grad_norm": 1.25, "learning_rate": 0.00048426728606287627, "loss": 4.9905, "mean_token_accuracy": 0.21553199142217636, "num_tokens": 30592955.0, "step": 13345 }, { "entropy": 5.22657151222229, "epoch": 1.282420749279539, "grad_norm": 1.2578125, "learning_rate": 0.0004842546890528191, "loss": 5.0424, "mean_token_accuracy": 0.21174602657556535, "num_tokens": 30604020.0, "step": 13350 }, { "entropy": 5.152847194671631, "epoch": 1.2829010566762729, "grad_norm": 1.2109375, "learning_rate": 0.0004842420871844893, "loss": 4.9739, "mean_token_accuracy": 0.2086465060710907, "num_tokens": 30615623.0, "step": 13355 }, { "entropy": 5.217724800109863, "epoch": 1.2833813640730067, "grad_norm": 1.2265625, "learning_rate": 0.0004842294804581796, "loss": 5.0257, "mean_token_accuracy": 0.2142942488193512, "num_tokens": 30626258.0, "step": 13360 }, { "entropy": 5.231122970581055, "epoch": 1.2838616714697406, "grad_norm": 1.078125, "learning_rate": 0.00048421686887418266, "loss": 5.008, "mean_token_accuracy": 0.20600861310958862, "num_tokens": 30637861.0, "step": 13365 }, { "entropy": 5.25348687171936, "epoch": 1.2843419788664745, "grad_norm": 1.1640625, "learning_rate": 0.0004842042524327912, "loss": 5.0313, "mean_token_accuracy": 0.2152662232518196, "num_tokens": 30648835.0, "step": 13370 }, { "entropy": 5.268816089630127, "epoch": 1.2848222862632084, "grad_norm": 1.2578125, "learning_rate": 0.0004841916311342983, "loss": 5.071, "mean_token_accuracy": 0.20168877840042115, "num_tokens": 30659117.0, "step": 13375 }, { "entropy": 5.167082214355469, "epoch": 1.2853025936599423, "grad_norm": 1.28125, "learning_rate": 0.0004841790049789969, "loss": 4.99, "mean_token_accuracy": 0.21684323698282243, "num_tokens": 30670282.0, "step": 13380 }, { "entropy": 5.192442560195923, "epoch": 1.2857829010566761, "grad_norm": 1.328125, "learning_rate": 0.00048416637396718004, "loss": 5.049, "mean_token_accuracy": 0.20397165417671204, "num_tokens": 30681967.0, "step": 13385 }, { "entropy": 5.288777303695679, "epoch": 1.2862632084534102, "grad_norm": 1.15625, "learning_rate": 0.0004841537380991411, "loss": 4.9953, "mean_token_accuracy": 0.2100960224866867, "num_tokens": 30692803.0, "step": 13390 }, { "entropy": 5.132681179046631, "epoch": 1.2867435158501441, "grad_norm": 1.5078125, "learning_rate": 0.00048414109737517346, "loss": 4.8827, "mean_token_accuracy": 0.22248595058918, "num_tokens": 30704097.0, "step": 13395 }, { "entropy": 5.161887550354004, "epoch": 1.287223823246878, "grad_norm": 1.1640625, "learning_rate": 0.0004841284517955706, "loss": 5.0591, "mean_token_accuracy": 0.20569444447755814, "num_tokens": 30717983.0, "step": 13400 }, { "entropy": 5.150148868560791, "epoch": 1.2877041306436119, "grad_norm": 1.2421875, "learning_rate": 0.000484115801360626, "loss": 4.8516, "mean_token_accuracy": 0.22625237703323364, "num_tokens": 30728446.0, "step": 13405 }, { "entropy": 5.112018346786499, "epoch": 1.2881844380403458, "grad_norm": 1.3046875, "learning_rate": 0.0004841031460706335, "loss": 4.8186, "mean_token_accuracy": 0.22128551304340363, "num_tokens": 30739587.0, "step": 13410 }, { "entropy": 5.120370292663575, "epoch": 1.2886647454370797, "grad_norm": 1.34375, "learning_rate": 0.00048409048592588683, "loss": 4.9393, "mean_token_accuracy": 0.21629711836576462, "num_tokens": 30750093.0, "step": 13415 }, { "entropy": 5.221544075012207, "epoch": 1.2891450528338138, "grad_norm": 1.1953125, "learning_rate": 0.0004840778209266799, "loss": 5.0089, "mean_token_accuracy": 0.21404524147510529, "num_tokens": 30761692.0, "step": 13420 }, { "entropy": 5.132301568984985, "epoch": 1.2896253602305476, "grad_norm": 1.296875, "learning_rate": 0.00048406515107330685, "loss": 4.9333, "mean_token_accuracy": 0.21460689157247542, "num_tokens": 30773474.0, "step": 13425 }, { "entropy": 5.293475818634033, "epoch": 1.2901056676272815, "grad_norm": 1.1875, "learning_rate": 0.00048405247636606173, "loss": 5.1002, "mean_token_accuracy": 0.20041738152503968, "num_tokens": 30785464.0, "step": 13430 }, { "entropy": 5.225921392440796, "epoch": 1.2905859750240154, "grad_norm": 1.375, "learning_rate": 0.00048403979680523894, "loss": 4.9796, "mean_token_accuracy": 0.21184006035327912, "num_tokens": 30796343.0, "step": 13435 }, { "entropy": 5.233290290832519, "epoch": 1.2910662824207493, "grad_norm": 1.375, "learning_rate": 0.0004840271123911328, "loss": 4.9907, "mean_token_accuracy": 0.21301163733005524, "num_tokens": 30807795.0, "step": 13440 }, { "entropy": 5.200629138946534, "epoch": 1.2915465898174832, "grad_norm": 1.5859375, "learning_rate": 0.0004840144231240377, "loss": 4.9783, "mean_token_accuracy": 0.20906727910041809, "num_tokens": 30819629.0, "step": 13445 }, { "entropy": 5.146392774581909, "epoch": 1.292026897214217, "grad_norm": 1.3515625, "learning_rate": 0.0004840017290042484, "loss": 4.9387, "mean_token_accuracy": 0.21682157814502717, "num_tokens": 30831545.0, "step": 13450 }, { "entropy": 5.180679416656494, "epoch": 1.292507204610951, "grad_norm": 1.34375, "learning_rate": 0.00048398903003205957, "loss": 4.9697, "mean_token_accuracy": 0.21640813797712327, "num_tokens": 30843614.0, "step": 13455 }, { "entropy": 5.144548463821411, "epoch": 1.2929875120076848, "grad_norm": 1.2890625, "learning_rate": 0.00048397632620776604, "loss": 4.9008, "mean_token_accuracy": 0.21930991858243942, "num_tokens": 30853749.0, "step": 13460 }, { "entropy": 5.183133602142334, "epoch": 1.293467819404419, "grad_norm": 1.4453125, "learning_rate": 0.00048396361753166276, "loss": 4.9319, "mean_token_accuracy": 0.21686296314001083, "num_tokens": 30865065.0, "step": 13465 }, { "entropy": 5.188254976272583, "epoch": 1.2939481268011528, "grad_norm": 1.3125, "learning_rate": 0.00048395090400404466, "loss": 5.0198, "mean_token_accuracy": 0.209990793466568, "num_tokens": 30876746.0, "step": 13470 }, { "entropy": 5.24221830368042, "epoch": 1.2944284341978867, "grad_norm": 1.265625, "learning_rate": 0.00048393818562520715, "loss": 5.0519, "mean_token_accuracy": 0.21296084821224212, "num_tokens": 30889543.0, "step": 13475 }, { "entropy": 5.221997165679932, "epoch": 1.2949087415946205, "grad_norm": 1.3515625, "learning_rate": 0.00048392546239544535, "loss": 5.0196, "mean_token_accuracy": 0.21187058687210084, "num_tokens": 30901949.0, "step": 13480 }, { "entropy": 5.177766799926758, "epoch": 1.2953890489913544, "grad_norm": 1.1640625, "learning_rate": 0.0004839127343150547, "loss": 5.0104, "mean_token_accuracy": 0.2122276872396469, "num_tokens": 30912660.0, "step": 13485 }, { "entropy": 5.1722331047058105, "epoch": 1.2958693563880883, "grad_norm": 1.3125, "learning_rate": 0.0004839000013843307, "loss": 4.9911, "mean_token_accuracy": 0.2134689912199974, "num_tokens": 30924276.0, "step": 13490 }, { "entropy": 5.172789239883423, "epoch": 1.2963496637848224, "grad_norm": 1.5234375, "learning_rate": 0.00048388726360356894, "loss": 4.9235, "mean_token_accuracy": 0.21602853089571, "num_tokens": 30935169.0, "step": 13495 }, { "entropy": 5.292079591751099, "epoch": 1.2968299711815563, "grad_norm": 1.296875, "learning_rate": 0.0004838745209730653, "loss": 5.1031, "mean_token_accuracy": 0.20405390560626985, "num_tokens": 30946116.0, "step": 13500 }, { "entropy": 5.241908931732178, "epoch": 1.2973102785782902, "grad_norm": 1.3359375, "learning_rate": 0.00048386177349311535, "loss": 5.0269, "mean_token_accuracy": 0.20935841649770737, "num_tokens": 30958509.0, "step": 13505 }, { "entropy": 5.223677349090576, "epoch": 1.297790585975024, "grad_norm": 1.3203125, "learning_rate": 0.0004838490211640153, "loss": 5.0269, "mean_token_accuracy": 0.2150777280330658, "num_tokens": 30969501.0, "step": 13510 }, { "entropy": 5.135782957077026, "epoch": 1.298270893371758, "grad_norm": 1.1875, "learning_rate": 0.0004838362639860611, "loss": 4.9413, "mean_token_accuracy": 0.21526733487844468, "num_tokens": 30979768.0, "step": 13515 }, { "entropy": 5.1186598777771, "epoch": 1.2987512007684918, "grad_norm": 1.265625, "learning_rate": 0.000483823501959549, "loss": 4.9318, "mean_token_accuracy": 0.22160837799310684, "num_tokens": 30992020.0, "step": 13520 }, { "entropy": 5.242673254013061, "epoch": 1.2992315081652257, "grad_norm": 1.1484375, "learning_rate": 0.00048381073508477527, "loss": 5.0647, "mean_token_accuracy": 0.21089437007904052, "num_tokens": 31004081.0, "step": 13525 }, { "entropy": 5.293477010726929, "epoch": 1.2997118155619596, "grad_norm": 1.2265625, "learning_rate": 0.00048379796336203625, "loss": 5.0783, "mean_token_accuracy": 0.2032276600599289, "num_tokens": 31015776.0, "step": 13530 }, { "entropy": 5.198190069198608, "epoch": 1.3001921229586935, "grad_norm": 1.1953125, "learning_rate": 0.0004837851867916286, "loss": 4.9279, "mean_token_accuracy": 0.21109964847564697, "num_tokens": 31028066.0, "step": 13535 }, { "entropy": 5.151645803451538, "epoch": 1.3006724303554273, "grad_norm": 1.4140625, "learning_rate": 0.0004837724053738488, "loss": 4.9701, "mean_token_accuracy": 0.21504315882921218, "num_tokens": 31040234.0, "step": 13540 }, { "entropy": 5.235762643814087, "epoch": 1.3011527377521614, "grad_norm": 1.453125, "learning_rate": 0.00048375961910899373, "loss": 5.02, "mean_token_accuracy": 0.21187748908996581, "num_tokens": 31051158.0, "step": 13545 }, { "entropy": 5.290946435928345, "epoch": 1.3016330451488953, "grad_norm": 1.375, "learning_rate": 0.0004837468279973602, "loss": 5.0187, "mean_token_accuracy": 0.21141756772994996, "num_tokens": 31063404.0, "step": 13550 }, { "entropy": 5.1908276081085205, "epoch": 1.3021133525456292, "grad_norm": 1.265625, "learning_rate": 0.0004837340320392451, "loss": 4.9814, "mean_token_accuracy": 0.2137753427028656, "num_tokens": 31073845.0, "step": 13555 }, { "entropy": 5.128818511962891, "epoch": 1.302593659942363, "grad_norm": 1.3515625, "learning_rate": 0.00048372123123494563, "loss": 5.0075, "mean_token_accuracy": 0.20479750484228135, "num_tokens": 31086914.0, "step": 13560 }, { "entropy": 5.209336137771606, "epoch": 1.303073967339097, "grad_norm": 1.125, "learning_rate": 0.000483708425584759, "loss": 4.9457, "mean_token_accuracy": 0.21852964758872986, "num_tokens": 31098627.0, "step": 13565 }, { "entropy": 5.310657691955567, "epoch": 1.3035542747358309, "grad_norm": 1.421875, "learning_rate": 0.0004836956150889825, "loss": 5.2078, "mean_token_accuracy": 0.20212220698595046, "num_tokens": 31110064.0, "step": 13570 }, { "entropy": 5.232502269744873, "epoch": 1.304034582132565, "grad_norm": 1.4921875, "learning_rate": 0.0004836827997479134, "loss": 4.94, "mean_token_accuracy": 0.22403647303581237, "num_tokens": 31121572.0, "step": 13575 }, { "entropy": 5.2666408061981205, "epoch": 1.3045148895292988, "grad_norm": 1.1875, "learning_rate": 0.0004836699795618494, "loss": 5.0574, "mean_token_accuracy": 0.20667948424816132, "num_tokens": 31133276.0, "step": 13580 }, { "entropy": 5.261113119125366, "epoch": 1.3049951969260327, "grad_norm": 1.1953125, "learning_rate": 0.0004836571545310881, "loss": 5.0826, "mean_token_accuracy": 0.20838973075151443, "num_tokens": 31144524.0, "step": 13585 }, { "entropy": 5.195058012008667, "epoch": 1.3054755043227666, "grad_norm": 1.3671875, "learning_rate": 0.00048364432465592723, "loss": 4.9389, "mean_token_accuracy": 0.2192491337656975, "num_tokens": 31155775.0, "step": 13590 }, { "entropy": 5.19091010093689, "epoch": 1.3059558117195005, "grad_norm": 1.21875, "learning_rate": 0.0004836314899366647, "loss": 4.9018, "mean_token_accuracy": 0.21838261038064957, "num_tokens": 31166176.0, "step": 13595 }, { "entropy": 5.180618000030518, "epoch": 1.3064361191162344, "grad_norm": 1.3046875, "learning_rate": 0.00048361865037359846, "loss": 4.9768, "mean_token_accuracy": 0.21404249221086502, "num_tokens": 31178568.0, "step": 13600 }, { "entropy": 5.244294548034668, "epoch": 1.3069164265129682, "grad_norm": 1.453125, "learning_rate": 0.00048360580596702664, "loss": 5.0507, "mean_token_accuracy": 0.21091840416193008, "num_tokens": 31188897.0, "step": 13605 }, { "entropy": 5.194856262207031, "epoch": 1.3073967339097021, "grad_norm": 1.2734375, "learning_rate": 0.00048359295671724744, "loss": 5.0011, "mean_token_accuracy": 0.21148939728736876, "num_tokens": 31201370.0, "step": 13610 }, { "entropy": 5.184963703155518, "epoch": 1.307877041306436, "grad_norm": 1.296875, "learning_rate": 0.0004835801026245592, "loss": 5.0653, "mean_token_accuracy": 0.20697593539953232, "num_tokens": 31212704.0, "step": 13615 }, { "entropy": 5.2461165428161625, "epoch": 1.30835734870317, "grad_norm": 1.15625, "learning_rate": 0.00048356724368926035, "loss": 5.0037, "mean_token_accuracy": 0.21529320627450943, "num_tokens": 31224447.0, "step": 13620 }, { "entropy": 5.257749462127686, "epoch": 1.308837656099904, "grad_norm": 1.40625, "learning_rate": 0.00048355437991164937, "loss": 5.0519, "mean_token_accuracy": 0.21352533251047134, "num_tokens": 31235648.0, "step": 13625 }, { "entropy": 5.160556888580322, "epoch": 1.3093179634966379, "grad_norm": 1.4453125, "learning_rate": 0.000483541511292025, "loss": 5.0468, "mean_token_accuracy": 0.21006453037261963, "num_tokens": 31248330.0, "step": 13630 }, { "entropy": 5.208528804779053, "epoch": 1.3097982708933718, "grad_norm": 1.65625, "learning_rate": 0.00048352863783068594, "loss": 5.0721, "mean_token_accuracy": 0.21153536587953567, "num_tokens": 31260036.0, "step": 13635 }, { "entropy": 5.1190698623657225, "epoch": 1.3102785782901056, "grad_norm": 1.2578125, "learning_rate": 0.00048351575952793117, "loss": 4.8524, "mean_token_accuracy": 0.21958549171686173, "num_tokens": 31271473.0, "step": 13640 }, { "entropy": 5.102574396133423, "epoch": 1.3107588856868395, "grad_norm": 1.375, "learning_rate": 0.0004835028763840595, "loss": 4.8655, "mean_token_accuracy": 0.21855390667915345, "num_tokens": 31282045.0, "step": 13645 }, { "entropy": 5.208409547805786, "epoch": 1.3112391930835736, "grad_norm": 1.2890625, "learning_rate": 0.0004834899883993703, "loss": 5.0836, "mean_token_accuracy": 0.2093469277024269, "num_tokens": 31292644.0, "step": 13650 }, { "entropy": 5.228438043594361, "epoch": 1.3117195004803075, "grad_norm": 1.2578125, "learning_rate": 0.00048347709557416263, "loss": 5.0165, "mean_token_accuracy": 0.22113081067800522, "num_tokens": 31304422.0, "step": 13655 }, { "entropy": 5.12980465888977, "epoch": 1.3121998078770414, "grad_norm": 1.3671875, "learning_rate": 0.0004834641979087359, "loss": 4.9284, "mean_token_accuracy": 0.21452227383852004, "num_tokens": 31314845.0, "step": 13660 }, { "entropy": 5.248886060714722, "epoch": 1.3126801152737753, "grad_norm": 1.265625, "learning_rate": 0.0004834512954033894, "loss": 5.0768, "mean_token_accuracy": 0.20920535922050476, "num_tokens": 31325669.0, "step": 13665 }, { "entropy": 5.186451864242554, "epoch": 1.3131604226705091, "grad_norm": 1.234375, "learning_rate": 0.00048343838805842284, "loss": 4.9737, "mean_token_accuracy": 0.22008894085884095, "num_tokens": 31336023.0, "step": 13670 }, { "entropy": 5.238216543197632, "epoch": 1.313640730067243, "grad_norm": 1.3046875, "learning_rate": 0.00048342547587413583, "loss": 5.0729, "mean_token_accuracy": 0.20934326946735382, "num_tokens": 31347146.0, "step": 13675 }, { "entropy": 5.273143291473389, "epoch": 1.314121037463977, "grad_norm": 1.1328125, "learning_rate": 0.0004834125588508282, "loss": 5.0544, "mean_token_accuracy": 0.2122495487332344, "num_tokens": 31357616.0, "step": 13680 }, { "entropy": 5.262011289596558, "epoch": 1.3146013448607108, "grad_norm": 1.15625, "learning_rate": 0.0004833996369887998, "loss": 5.0584, "mean_token_accuracy": 0.2167961359024048, "num_tokens": 31369189.0, "step": 13685 }, { "entropy": 5.159175825119019, "epoch": 1.3150816522574447, "grad_norm": 1.21875, "learning_rate": 0.00048338671028835063, "loss": 4.9154, "mean_token_accuracy": 0.22063409090042113, "num_tokens": 31380841.0, "step": 13690 }, { "entropy": 5.126403951644898, "epoch": 1.3155619596541785, "grad_norm": 1.25, "learning_rate": 0.00048337377874978086, "loss": 4.9398, "mean_token_accuracy": 0.21937094777822494, "num_tokens": 31393092.0, "step": 13695 }, { "entropy": 5.165436792373657, "epoch": 1.3160422670509127, "grad_norm": 1.1953125, "learning_rate": 0.00048336084237339067, "loss": 4.9908, "mean_token_accuracy": 0.21392715871334075, "num_tokens": 31404228.0, "step": 13700 }, { "entropy": 5.2636415481567385, "epoch": 1.3165225744476465, "grad_norm": 1.4375, "learning_rate": 0.0004833479011594804, "loss": 5.0796, "mean_token_accuracy": 0.20588037818670274, "num_tokens": 31416233.0, "step": 13705 }, { "entropy": 5.302021265029907, "epoch": 1.3170028818443804, "grad_norm": 1.3828125, "learning_rate": 0.00048333495510835057, "loss": 5.0554, "mean_token_accuracy": 0.2089495837688446, "num_tokens": 31427787.0, "step": 13710 }, { "entropy": 5.034874057769775, "epoch": 1.3174831892411143, "grad_norm": 1.2734375, "learning_rate": 0.00048332200422030163, "loss": 4.8463, "mean_token_accuracy": 0.21873539835214614, "num_tokens": 31438736.0, "step": 13715 }, { "entropy": 5.0820282936096195, "epoch": 1.3179634966378482, "grad_norm": 1.25, "learning_rate": 0.0004833090484956345, "loss": 4.8609, "mean_token_accuracy": 0.2156251400709152, "num_tokens": 31449225.0, "step": 13720 }, { "entropy": 5.3021155834198, "epoch": 1.318443804034582, "grad_norm": 1.3359375, "learning_rate": 0.00048329608793464966, "loss": 5.0795, "mean_token_accuracy": 0.21024447679519653, "num_tokens": 31461223.0, "step": 13725 }, { "entropy": 5.292631769180298, "epoch": 1.3189241114313162, "grad_norm": 1.2265625, "learning_rate": 0.0004832831225376482, "loss": 5.0905, "mean_token_accuracy": 0.2085978850722313, "num_tokens": 31472397.0, "step": 13730 }, { "entropy": 5.110633659362793, "epoch": 1.31940441882805, "grad_norm": 1.28125, "learning_rate": 0.0004832701523049312, "loss": 4.9209, "mean_token_accuracy": 0.22347624897956847, "num_tokens": 31484079.0, "step": 13735 }, { "entropy": 5.168765449523926, "epoch": 1.319884726224784, "grad_norm": 1.25, "learning_rate": 0.0004832571772367997, "loss": 4.9554, "mean_token_accuracy": 0.20989650189876558, "num_tokens": 31496732.0, "step": 13740 }, { "entropy": 5.222825241088867, "epoch": 1.3203650336215178, "grad_norm": 1.109375, "learning_rate": 0.00048324419733355485, "loss": 4.8996, "mean_token_accuracy": 0.22089578211307526, "num_tokens": 31507601.0, "step": 13745 }, { "entropy": 5.175820016860962, "epoch": 1.3208453410182517, "grad_norm": 1.1640625, "learning_rate": 0.00048323121259549805, "loss": 4.9605, "mean_token_accuracy": 0.21191650182008742, "num_tokens": 31519551.0, "step": 13750 }, { "entropy": 5.148926830291748, "epoch": 1.3213256484149856, "grad_norm": 1.265625, "learning_rate": 0.00048321822302293095, "loss": 5.0144, "mean_token_accuracy": 0.21526147425174713, "num_tokens": 31531531.0, "step": 13755 }, { "entropy": 5.234309291839599, "epoch": 1.3218059558117194, "grad_norm": 1.3359375, "learning_rate": 0.0004832052286161549, "loss": 4.9451, "mean_token_accuracy": 0.20788251608610153, "num_tokens": 31542386.0, "step": 13760 }, { "entropy": 5.1972698211669925, "epoch": 1.3222862632084533, "grad_norm": 1.2109375, "learning_rate": 0.00048319222937547176, "loss": 4.9567, "mean_token_accuracy": 0.21743978857994078, "num_tokens": 31554142.0, "step": 13765 }, { "entropy": 5.197652673721313, "epoch": 1.3227665706051872, "grad_norm": 1.203125, "learning_rate": 0.00048317922530118323, "loss": 4.9909, "mean_token_accuracy": 0.210670205950737, "num_tokens": 31566687.0, "step": 13770 }, { "entropy": 5.255802297592163, "epoch": 1.3232468780019213, "grad_norm": 1.3203125, "learning_rate": 0.0004831662163935912, "loss": 5.0836, "mean_token_accuracy": 0.20739447325468063, "num_tokens": 31577893.0, "step": 13775 }, { "entropy": 5.338723754882812, "epoch": 1.3237271853986552, "grad_norm": 1.2421875, "learning_rate": 0.00048315320265299784, "loss": 5.141, "mean_token_accuracy": 0.20476285815238954, "num_tokens": 31590483.0, "step": 13780 }, { "entropy": 5.232583475112915, "epoch": 1.324207492795389, "grad_norm": 1.265625, "learning_rate": 0.00048314018407970516, "loss": 5.0018, "mean_token_accuracy": 0.21295965909957887, "num_tokens": 31601373.0, "step": 13785 }, { "entropy": 5.163480615615844, "epoch": 1.324687800192123, "grad_norm": 1.3125, "learning_rate": 0.00048312716067401535, "loss": 5.037, "mean_token_accuracy": 0.20841159224510192, "num_tokens": 31613981.0, "step": 13790 }, { "entropy": 5.269176197052002, "epoch": 1.3251681075888568, "grad_norm": 1.2578125, "learning_rate": 0.000483114132436231, "loss": 5.0406, "mean_token_accuracy": 0.20180656611919404, "num_tokens": 31626727.0, "step": 13795 }, { "entropy": 5.169664478302002, "epoch": 1.3256484149855907, "grad_norm": 1.3125, "learning_rate": 0.0004831010993666545, "loss": 4.9317, "mean_token_accuracy": 0.21578803807497024, "num_tokens": 31638519.0, "step": 13800 }, { "entropy": 5.129696464538574, "epoch": 1.3261287223823248, "grad_norm": 1.3125, "learning_rate": 0.0004830880614655884, "loss": 4.9147, "mean_token_accuracy": 0.21223049610853195, "num_tokens": 31649184.0, "step": 13805 }, { "entropy": 5.234156227111816, "epoch": 1.3266090297790587, "grad_norm": 1.40625, "learning_rate": 0.00048307501873333527, "loss": 5.0198, "mean_token_accuracy": 0.21354001611471177, "num_tokens": 31660266.0, "step": 13810 }, { "entropy": 5.198557376861572, "epoch": 1.3270893371757926, "grad_norm": 1.265625, "learning_rate": 0.0004830619711701982, "loss": 4.978, "mean_token_accuracy": 0.2158899873495102, "num_tokens": 31670841.0, "step": 13815 }, { "entropy": 5.151759386062622, "epoch": 1.3275696445725265, "grad_norm": 1.3125, "learning_rate": 0.0004830489187764799, "loss": 4.9399, "mean_token_accuracy": 0.21373986601829528, "num_tokens": 31680786.0, "step": 13820 }, { "entropy": 5.0768732070922855, "epoch": 1.3280499519692603, "grad_norm": 1.0703125, "learning_rate": 0.0004830358615524835, "loss": 4.9565, "mean_token_accuracy": 0.21601667404174804, "num_tokens": 31692639.0, "step": 13825 }, { "entropy": 5.29799165725708, "epoch": 1.3285302593659942, "grad_norm": 1.3203125, "learning_rate": 0.00048302279949851215, "loss": 5.0705, "mean_token_accuracy": 0.21043513119220733, "num_tokens": 31703520.0, "step": 13830 }, { "entropy": 5.207717561721802, "epoch": 1.329010566762728, "grad_norm": 1.2421875, "learning_rate": 0.00048300973261486906, "loss": 4.9361, "mean_token_accuracy": 0.22781557142734526, "num_tokens": 31715659.0, "step": 13835 }, { "entropy": 5.160664987564087, "epoch": 1.329490874159462, "grad_norm": 1.1953125, "learning_rate": 0.0004829966609018577, "loss": 5.009, "mean_token_accuracy": 0.20956473052501678, "num_tokens": 31727896.0, "step": 13840 }, { "entropy": 5.167005348205566, "epoch": 1.3299711815561959, "grad_norm": 1.1796875, "learning_rate": 0.00048298358435978146, "loss": 4.9763, "mean_token_accuracy": 0.21739626675844193, "num_tokens": 31738618.0, "step": 13845 }, { "entropy": 5.205134677886963, "epoch": 1.3304514889529298, "grad_norm": 1.234375, "learning_rate": 0.00048297050298894394, "loss": 5.0286, "mean_token_accuracy": 0.20701279789209365, "num_tokens": 31750306.0, "step": 13850 }, { "entropy": 5.272959852218628, "epoch": 1.3309317963496639, "grad_norm": 1.2421875, "learning_rate": 0.0004829574167896489, "loss": 5.0978, "mean_token_accuracy": 0.20369507372379303, "num_tokens": 31762015.0, "step": 13855 }, { "entropy": 5.2029674530029295, "epoch": 1.3314121037463977, "grad_norm": 1.3828125, "learning_rate": 0.00048294432576220027, "loss": 4.9805, "mean_token_accuracy": 0.2133902981877327, "num_tokens": 31774120.0, "step": 13860 }, { "entropy": 5.171335029602051, "epoch": 1.3318924111431316, "grad_norm": 1.2578125, "learning_rate": 0.0004829312299069017, "loss": 4.9216, "mean_token_accuracy": 0.22213226556777954, "num_tokens": 31785485.0, "step": 13865 }, { "entropy": 5.200899696350097, "epoch": 1.3323727185398655, "grad_norm": 1.40625, "learning_rate": 0.00048291812922405755, "loss": 5.0193, "mean_token_accuracy": 0.21298006922006607, "num_tokens": 31795621.0, "step": 13870 }, { "entropy": 5.163142156600952, "epoch": 1.3328530259365994, "grad_norm": 1.53125, "learning_rate": 0.0004829050237139717, "loss": 4.908, "mean_token_accuracy": 0.21594492197036744, "num_tokens": 31806761.0, "step": 13875 }, { "entropy": 5.23978180885315, "epoch": 1.3333333333333333, "grad_norm": 1.4453125, "learning_rate": 0.0004828919133769486, "loss": 4.9982, "mean_token_accuracy": 0.2074069321155548, "num_tokens": 31817996.0, "step": 13880 }, { "entropy": 5.220539999008179, "epoch": 1.3338136407300674, "grad_norm": 1.28125, "learning_rate": 0.0004828787982132926, "loss": 5.0618, "mean_token_accuracy": 0.21009029895067216, "num_tokens": 31829888.0, "step": 13885 }, { "entropy": 5.245324230194091, "epoch": 1.3342939481268012, "grad_norm": 1.1484375, "learning_rate": 0.00048286567822330815, "loss": 4.9951, "mean_token_accuracy": 0.21646622121334075, "num_tokens": 31842582.0, "step": 13890 }, { "entropy": 5.1263108253479, "epoch": 1.3347742555235351, "grad_norm": 1.2109375, "learning_rate": 0.0004828525534072999, "loss": 4.9066, "mean_token_accuracy": 0.2145892322063446, "num_tokens": 31854765.0, "step": 13895 }, { "entropy": 5.185867691040039, "epoch": 1.335254562920269, "grad_norm": 1.1484375, "learning_rate": 0.00048283942376557254, "loss": 4.9543, "mean_token_accuracy": 0.20899006426334382, "num_tokens": 31865505.0, "step": 13900 }, { "entropy": 5.1626379013061525, "epoch": 1.3357348703170029, "grad_norm": 1.234375, "learning_rate": 0.00048282628929843097, "loss": 4.9562, "mean_token_accuracy": 0.2139149159193039, "num_tokens": 31876599.0, "step": 13905 }, { "entropy": 5.196528911590576, "epoch": 1.3362151777137368, "grad_norm": 1.2265625, "learning_rate": 0.00048281315000617996, "loss": 4.9635, "mean_token_accuracy": 0.21398892104625702, "num_tokens": 31887292.0, "step": 13910 }, { "entropy": 5.179883575439453, "epoch": 1.3366954851104706, "grad_norm": 1.3125, "learning_rate": 0.0004828000058891248, "loss": 5.0204, "mean_token_accuracy": 0.20837054550647735, "num_tokens": 31899018.0, "step": 13915 }, { "entropy": 5.125789833068848, "epoch": 1.3371757925072045, "grad_norm": 1.2265625, "learning_rate": 0.0004827868569475706, "loss": 4.9423, "mean_token_accuracy": 0.2189765304327011, "num_tokens": 31910402.0, "step": 13920 }, { "entropy": 5.2392956733703615, "epoch": 1.3376560999039384, "grad_norm": 1.234375, "learning_rate": 0.00048277370318182243, "loss": 5.0742, "mean_token_accuracy": 0.2051733672618866, "num_tokens": 31921792.0, "step": 13925 }, { "entropy": 5.264586544036865, "epoch": 1.3381364073006723, "grad_norm": 1.4296875, "learning_rate": 0.00048276054459218596, "loss": 5.0715, "mean_token_accuracy": 0.206744547188282, "num_tokens": 31932786.0, "step": 13930 }, { "entropy": 5.2243023872375485, "epoch": 1.3386167146974064, "grad_norm": 1.1796875, "learning_rate": 0.00048274738117896643, "loss": 5.0083, "mean_token_accuracy": 0.21426367163658142, "num_tokens": 31944286.0, "step": 13935 }, { "entropy": 5.166335105895996, "epoch": 1.3390970220941403, "grad_norm": 1.2734375, "learning_rate": 0.00048273421294246966, "loss": 5.069, "mean_token_accuracy": 0.21005858927965165, "num_tokens": 31956304.0, "step": 13940 }, { "entropy": 5.209878873825073, "epoch": 1.3395773294908742, "grad_norm": 1.25, "learning_rate": 0.00048272103988300134, "loss": 5.0088, "mean_token_accuracy": 0.21268565505743026, "num_tokens": 31968978.0, "step": 13945 }, { "entropy": 5.243113946914673, "epoch": 1.340057636887608, "grad_norm": 1.3046875, "learning_rate": 0.0004827078620008672, "loss": 4.987, "mean_token_accuracy": 0.20750254094600679, "num_tokens": 31980133.0, "step": 13950 }, { "entropy": 5.202300357818603, "epoch": 1.340537944284342, "grad_norm": 1.4296875, "learning_rate": 0.00048269467929637337, "loss": 4.9367, "mean_token_accuracy": 0.22350060045719147, "num_tokens": 31990331.0, "step": 13955 }, { "entropy": 5.211510848999024, "epoch": 1.341018251681076, "grad_norm": 1.21875, "learning_rate": 0.00048268149176982576, "loss": 5.0098, "mean_token_accuracy": 0.2139397069811821, "num_tokens": 32001956.0, "step": 13960 }, { "entropy": 5.2395045280456545, "epoch": 1.34149855907781, "grad_norm": 1.1796875, "learning_rate": 0.00048266829942153055, "loss": 5.054, "mean_token_accuracy": 0.21206386983394623, "num_tokens": 32013577.0, "step": 13965 }, { "entropy": 5.341452741622925, "epoch": 1.3419788664745438, "grad_norm": 1.28125, "learning_rate": 0.00048265510225179413, "loss": 5.1105, "mean_token_accuracy": 0.20956795960664748, "num_tokens": 32025751.0, "step": 13970 }, { "entropy": 5.188568878173828, "epoch": 1.3424591738712777, "grad_norm": 1.359375, "learning_rate": 0.0004826419002609229, "loss": 5.0519, "mean_token_accuracy": 0.20738618373870848, "num_tokens": 32036592.0, "step": 13975 }, { "entropy": 5.274795866012573, "epoch": 1.3429394812680115, "grad_norm": 1.421875, "learning_rate": 0.00048262869344922326, "loss": 5.0194, "mean_token_accuracy": 0.2111186280846596, "num_tokens": 32048122.0, "step": 13980 }, { "entropy": 5.239737606048584, "epoch": 1.3434197886647454, "grad_norm": 1.203125, "learning_rate": 0.00048261548181700186, "loss": 4.9765, "mean_token_accuracy": 0.2105955883860588, "num_tokens": 32058673.0, "step": 13985 }, { "entropy": 5.085059738159179, "epoch": 1.3439000960614793, "grad_norm": 1.40625, "learning_rate": 0.0004826022653645655, "loss": 4.924, "mean_token_accuracy": 0.2151069536805153, "num_tokens": 32070219.0, "step": 13990 }, { "entropy": 5.24159984588623, "epoch": 1.3443804034582132, "grad_norm": 1.34375, "learning_rate": 0.000482589044092221, "loss": 5.0455, "mean_token_accuracy": 0.20947272181510926, "num_tokens": 32081883.0, "step": 13995 }, { "entropy": 5.30658369064331, "epoch": 1.344860710854947, "grad_norm": 1.3125, "learning_rate": 0.00048257581800027527, "loss": 5.0672, "mean_token_accuracy": 0.20932937860488893, "num_tokens": 32094107.0, "step": 14000 }, { "entropy": 5.196062517166138, "epoch": 1.345341018251681, "grad_norm": 1.2109375, "learning_rate": 0.0004825625870890354, "loss": 4.936, "mean_token_accuracy": 0.21412646919488906, "num_tokens": 32105242.0, "step": 14005 }, { "entropy": 5.214570760726929, "epoch": 1.345821325648415, "grad_norm": 1.390625, "learning_rate": 0.0004825493513588086, "loss": 4.9387, "mean_token_accuracy": 0.2163090154528618, "num_tokens": 32117326.0, "step": 14010 }, { "entropy": 5.2475536346435545, "epoch": 1.346301633045149, "grad_norm": 1.4296875, "learning_rate": 0.00048253611080990226, "loss": 5.0293, "mean_token_accuracy": 0.21036939769983293, "num_tokens": 32129002.0, "step": 14015 }, { "entropy": 5.212005138397217, "epoch": 1.3467819404418828, "grad_norm": 1.21875, "learning_rate": 0.0004825228654426236, "loss": 5.0548, "mean_token_accuracy": 0.2069345846772194, "num_tokens": 32140380.0, "step": 14020 }, { "entropy": 5.134690666198731, "epoch": 1.3472622478386167, "grad_norm": 1.328125, "learning_rate": 0.0004825096152572803, "loss": 4.9054, "mean_token_accuracy": 0.21957446187734603, "num_tokens": 32151806.0, "step": 14025 }, { "entropy": 5.20372257232666, "epoch": 1.3477425552353506, "grad_norm": 1.28125, "learning_rate": 0.00048249636025417974, "loss": 5.1018, "mean_token_accuracy": 0.2116215631365776, "num_tokens": 32163426.0, "step": 14030 }, { "entropy": 5.213733768463134, "epoch": 1.3482228626320845, "grad_norm": 1.3046875, "learning_rate": 0.00048248310043362997, "loss": 5.0054, "mean_token_accuracy": 0.21968272477388381, "num_tokens": 32174349.0, "step": 14035 }, { "entropy": 5.264178276062012, "epoch": 1.3487031700288186, "grad_norm": 1.2421875, "learning_rate": 0.0004824698357959386, "loss": 5.0717, "mean_token_accuracy": 0.2072421357035637, "num_tokens": 32185382.0, "step": 14040 }, { "entropy": 5.181209135055542, "epoch": 1.3491834774255524, "grad_norm": 1.3203125, "learning_rate": 0.00048245656634141385, "loss": 4.9607, "mean_token_accuracy": 0.21428043842315675, "num_tokens": 32195687.0, "step": 14045 }, { "entropy": 5.154476022720337, "epoch": 1.3496637848222863, "grad_norm": 1.4609375, "learning_rate": 0.00048244329207036354, "loss": 4.9185, "mean_token_accuracy": 0.22052669078111647, "num_tokens": 32205973.0, "step": 14050 }, { "entropy": 5.170621156692505, "epoch": 1.3501440922190202, "grad_norm": 1.421875, "learning_rate": 0.00048243001298309604, "loss": 4.9966, "mean_token_accuracy": 0.21300121247768403, "num_tokens": 32217469.0, "step": 14055 }, { "entropy": 5.1875158786773685, "epoch": 1.350624399615754, "grad_norm": 1.3125, "learning_rate": 0.00048241672907991954, "loss": 5.0227, "mean_token_accuracy": 0.21119635105133056, "num_tokens": 32228257.0, "step": 14060 }, { "entropy": 5.206764030456543, "epoch": 1.351104707012488, "grad_norm": 1.1640625, "learning_rate": 0.0004824034403611424, "loss": 4.9875, "mean_token_accuracy": 0.21840890049934386, "num_tokens": 32239420.0, "step": 14065 }, { "entropy": 5.238004541397094, "epoch": 1.3515850144092219, "grad_norm": 1.203125, "learning_rate": 0.0004823901468270733, "loss": 5.0209, "mean_token_accuracy": 0.21211865544319153, "num_tokens": 32250962.0, "step": 14070 }, { "entropy": 5.234237623214722, "epoch": 1.3520653218059557, "grad_norm": 1.21875, "learning_rate": 0.0004823768484780209, "loss": 5.0047, "mean_token_accuracy": 0.21667125970125198, "num_tokens": 32262310.0, "step": 14075 }, { "entropy": 5.1772243022918705, "epoch": 1.3525456292026896, "grad_norm": 1.2578125, "learning_rate": 0.00048236354531429375, "loss": 4.903, "mean_token_accuracy": 0.21903317421674728, "num_tokens": 32273373.0, "step": 14080 }, { "entropy": 5.260414171218872, "epoch": 1.3530259365994235, "grad_norm": 1.34375, "learning_rate": 0.0004823502373362009, "loss": 5.0224, "mean_token_accuracy": 0.21051734387874604, "num_tokens": 32285020.0, "step": 14085 }, { "entropy": 5.182881259918213, "epoch": 1.3535062439961576, "grad_norm": 1.5, "learning_rate": 0.0004823369245440512, "loss": 5.0027, "mean_token_accuracy": 0.21150606274604797, "num_tokens": 32296224.0, "step": 14090 }, { "entropy": 5.177131128311157, "epoch": 1.3539865513928915, "grad_norm": 1.5390625, "learning_rate": 0.00048232360693815387, "loss": 5.0028, "mean_token_accuracy": 0.21548304408788682, "num_tokens": 32306913.0, "step": 14095 }, { "entropy": 5.239502000808716, "epoch": 1.3544668587896254, "grad_norm": 1.296875, "learning_rate": 0.00048231028451881786, "loss": 4.9757, "mean_token_accuracy": 0.21516055166721343, "num_tokens": 32317981.0, "step": 14100 }, { "entropy": 5.112478399276734, "epoch": 1.3549471661863592, "grad_norm": 3.703125, "learning_rate": 0.0004822969572863527, "loss": 4.8805, "mean_token_accuracy": 0.22126417160034179, "num_tokens": 32329656.0, "step": 14105 }, { "entropy": 5.247384786605835, "epoch": 1.3554274735830931, "grad_norm": 1.5, "learning_rate": 0.00048228362524106776, "loss": 5.0463, "mean_token_accuracy": 0.2171102821826935, "num_tokens": 32339460.0, "step": 14110 }, { "entropy": 5.15761137008667, "epoch": 1.3559077809798272, "grad_norm": 1.2421875, "learning_rate": 0.00048227028838327253, "loss": 4.9853, "mean_token_accuracy": 0.2139111652970314, "num_tokens": 32351237.0, "step": 14115 }, { "entropy": 5.221787309646606, "epoch": 1.356388088376561, "grad_norm": 1.1796875, "learning_rate": 0.00048225694671327665, "loss": 4.9212, "mean_token_accuracy": 0.21492843478918075, "num_tokens": 32362368.0, "step": 14120 }, { "entropy": 5.100544738769531, "epoch": 1.356868395773295, "grad_norm": 1.265625, "learning_rate": 0.0004822436002313899, "loss": 4.9558, "mean_token_accuracy": 0.22090719044208526, "num_tokens": 32373738.0, "step": 14125 }, { "entropy": 5.202834510803223, "epoch": 1.3573487031700289, "grad_norm": 1.2421875, "learning_rate": 0.0004822302489379222, "loss": 5.0004, "mean_token_accuracy": 0.21080951392650604, "num_tokens": 32384274.0, "step": 14130 }, { "entropy": 5.1527222156524655, "epoch": 1.3578290105667628, "grad_norm": 1.40625, "learning_rate": 0.00048221689283318335, "loss": 4.9162, "mean_token_accuracy": 0.22189487069845198, "num_tokens": 32395692.0, "step": 14135 }, { "entropy": 5.170470714569092, "epoch": 1.3583093179634966, "grad_norm": 1.21875, "learning_rate": 0.0004822035319174837, "loss": 4.9993, "mean_token_accuracy": 0.215805584192276, "num_tokens": 32407367.0, "step": 14140 }, { "entropy": 5.2223457336425785, "epoch": 1.3587896253602305, "grad_norm": 1.265625, "learning_rate": 0.0004821901661911332, "loss": 4.9985, "mean_token_accuracy": 0.21592912524938584, "num_tokens": 32418753.0, "step": 14145 }, { "entropy": 5.224755716323853, "epoch": 1.3592699327569644, "grad_norm": 1.140625, "learning_rate": 0.0004821767956544423, "loss": 4.9901, "mean_token_accuracy": 0.2103568896651268, "num_tokens": 32429499.0, "step": 14150 }, { "entropy": 5.1020674228668215, "epoch": 1.3597502401536983, "grad_norm": 1.203125, "learning_rate": 0.0004821634203077214, "loss": 4.9625, "mean_token_accuracy": 0.2103301167488098, "num_tokens": 32441246.0, "step": 14155 }, { "entropy": 5.2334638118743895, "epoch": 1.3602305475504322, "grad_norm": 1.25, "learning_rate": 0.000482150040151281, "loss": 5.0333, "mean_token_accuracy": 0.2047850400209427, "num_tokens": 32452953.0, "step": 14160 }, { "entropy": 5.247525882720947, "epoch": 1.3607108549471663, "grad_norm": 1.1796875, "learning_rate": 0.0004821366551854318, "loss": 4.9967, "mean_token_accuracy": 0.2128307819366455, "num_tokens": 32464030.0, "step": 14165 }, { "entropy": 5.110248804092407, "epoch": 1.3611911623439001, "grad_norm": 1.140625, "learning_rate": 0.0004821232654104845, "loss": 4.811, "mean_token_accuracy": 0.21935641169548034, "num_tokens": 32475091.0, "step": 14170 }, { "entropy": 5.127313280105591, "epoch": 1.361671469740634, "grad_norm": 1.59375, "learning_rate": 0.00048210987082675005, "loss": 5.0527, "mean_token_accuracy": 0.21497475653886794, "num_tokens": 32486047.0, "step": 14175 }, { "entropy": 5.163631916046143, "epoch": 1.362151777137368, "grad_norm": 1.109375, "learning_rate": 0.00048209647143453946, "loss": 4.9198, "mean_token_accuracy": 0.22030255049467087, "num_tokens": 32497141.0, "step": 14180 }, { "entropy": 5.1883574485778805, "epoch": 1.3626320845341018, "grad_norm": 1.3984375, "learning_rate": 0.00048208306723416356, "loss": 4.9806, "mean_token_accuracy": 0.2113000214099884, "num_tokens": 32509282.0, "step": 14185 }, { "entropy": 5.247734832763672, "epoch": 1.3631123919308357, "grad_norm": 1.265625, "learning_rate": 0.0004820696582259339, "loss": 5.0618, "mean_token_accuracy": 0.20664857178926468, "num_tokens": 32521383.0, "step": 14190 }, { "entropy": 5.196439790725708, "epoch": 1.3635926993275698, "grad_norm": 1.2734375, "learning_rate": 0.0004820562444101616, "loss": 4.9404, "mean_token_accuracy": 0.2142431080341339, "num_tokens": 32533168.0, "step": 14195 }, { "entropy": 5.119819307327271, "epoch": 1.3640730067243036, "grad_norm": 1.28125, "learning_rate": 0.0004820428257871581, "loss": 4.9734, "mean_token_accuracy": 0.21469512581825256, "num_tokens": 32545333.0, "step": 14200 }, { "entropy": 5.115553903579712, "epoch": 1.3645533141210375, "grad_norm": 1.28125, "learning_rate": 0.0004820294023572351, "loss": 4.9275, "mean_token_accuracy": 0.2175430715084076, "num_tokens": 32556665.0, "step": 14205 }, { "entropy": 5.278141355514526, "epoch": 1.3650336215177714, "grad_norm": 1.234375, "learning_rate": 0.000482015974120704, "loss": 5.1187, "mean_token_accuracy": 0.2067723110318184, "num_tokens": 32568318.0, "step": 14210 }, { "entropy": 5.243190097808838, "epoch": 1.3655139289145053, "grad_norm": 1.4296875, "learning_rate": 0.00048200254107787677, "loss": 4.924, "mean_token_accuracy": 0.21714796870946884, "num_tokens": 32580010.0, "step": 14215 }, { "entropy": 5.200288200378418, "epoch": 1.3659942363112392, "grad_norm": 1.1484375, "learning_rate": 0.00048198910322906516, "loss": 5.094, "mean_token_accuracy": 0.201282075047493, "num_tokens": 32592075.0, "step": 14220 }, { "entropy": 5.279963874816895, "epoch": 1.366474543707973, "grad_norm": 1.1953125, "learning_rate": 0.00048197566057458125, "loss": 5.1004, "mean_token_accuracy": 0.21089850068092347, "num_tokens": 32604548.0, "step": 14225 }, { "entropy": 5.305767488479614, "epoch": 1.366954851104707, "grad_norm": 1.2890625, "learning_rate": 0.000481962213114737, "loss": 5.0238, "mean_token_accuracy": 0.2131134197115898, "num_tokens": 32616036.0, "step": 14230 }, { "entropy": 5.124698162078857, "epoch": 1.3674351585014408, "grad_norm": 1.1953125, "learning_rate": 0.0004819487608498448, "loss": 4.9755, "mean_token_accuracy": 0.21251588463783264, "num_tokens": 32628157.0, "step": 14235 }, { "entropy": 5.2571838855743405, "epoch": 1.3679154658981747, "grad_norm": 1.359375, "learning_rate": 0.00048193530378021687, "loss": 5.0518, "mean_token_accuracy": 0.20990225523710251, "num_tokens": 32640571.0, "step": 14240 }, { "entropy": 5.211359739303589, "epoch": 1.3683957732949088, "grad_norm": 1.2578125, "learning_rate": 0.00048192184190616567, "loss": 5.0147, "mean_token_accuracy": 0.20850124061107636, "num_tokens": 32652005.0, "step": 14245 }, { "entropy": 5.182319211959839, "epoch": 1.3688760806916427, "grad_norm": 1.21875, "learning_rate": 0.0004819083752280037, "loss": 4.953, "mean_token_accuracy": 0.2171345219016075, "num_tokens": 32663655.0, "step": 14250 }, { "entropy": 5.153241872787476, "epoch": 1.3693563880883766, "grad_norm": 1.171875, "learning_rate": 0.00048189490374604373, "loss": 5.0271, "mean_token_accuracy": 0.20629312843084335, "num_tokens": 32675419.0, "step": 14255 }, { "entropy": 5.223625183105469, "epoch": 1.3698366954851104, "grad_norm": 1.3984375, "learning_rate": 0.0004818814274605983, "loss": 4.9521, "mean_token_accuracy": 0.21175645738840104, "num_tokens": 32686680.0, "step": 14260 }, { "entropy": 5.112087917327881, "epoch": 1.3703170028818443, "grad_norm": 1.1484375, "learning_rate": 0.0004818679463719805, "loss": 4.8616, "mean_token_accuracy": 0.2263885572552681, "num_tokens": 32697321.0, "step": 14265 }, { "entropy": 5.0614667415618895, "epoch": 1.3707973102785782, "grad_norm": 1.1484375, "learning_rate": 0.0004818544604805033, "loss": 4.9871, "mean_token_accuracy": 0.2116144999861717, "num_tokens": 32708885.0, "step": 14270 }, { "entropy": 5.212231779098511, "epoch": 1.3712776176753123, "grad_norm": 1.5390625, "learning_rate": 0.0004818409697864798, "loss": 4.9688, "mean_token_accuracy": 0.221114681661129, "num_tokens": 32720517.0, "step": 14275 }, { "entropy": 5.181997632980346, "epoch": 1.3717579250720462, "grad_norm": 1.2421875, "learning_rate": 0.00048182747429022303, "loss": 4.9662, "mean_token_accuracy": 0.2124532178044319, "num_tokens": 32731072.0, "step": 14280 }, { "entropy": 5.273564004898072, "epoch": 1.37223823246878, "grad_norm": 1.328125, "learning_rate": 0.0004818139739920465, "loss": 5.0366, "mean_token_accuracy": 0.21512430757284165, "num_tokens": 32742117.0, "step": 14285 }, { "entropy": 5.170454597473144, "epoch": 1.372718539865514, "grad_norm": 1.3515625, "learning_rate": 0.0004818004688922637, "loss": 5.0208, "mean_token_accuracy": 0.2080516129732132, "num_tokens": 32753754.0, "step": 14290 }, { "entropy": 5.15729718208313, "epoch": 1.3731988472622478, "grad_norm": 1.171875, "learning_rate": 0.000481786958991188, "loss": 4.9537, "mean_token_accuracy": 0.21438979208469391, "num_tokens": 32766511.0, "step": 14295 }, { "entropy": 5.300592947006225, "epoch": 1.3736791546589817, "grad_norm": 1.4609375, "learning_rate": 0.00048177344428913316, "loss": 5.0434, "mean_token_accuracy": 0.20874705910682678, "num_tokens": 32777715.0, "step": 14300 }, { "entropy": 5.202049112319946, "epoch": 1.3741594620557156, "grad_norm": 1.1640625, "learning_rate": 0.00048175992478641293, "loss": 4.9841, "mean_token_accuracy": 0.21299902647733687, "num_tokens": 32789132.0, "step": 14305 }, { "entropy": 5.293251276016235, "epoch": 1.3746397694524495, "grad_norm": 1.3125, "learning_rate": 0.0004817464004833412, "loss": 5.0638, "mean_token_accuracy": 0.21248952597379683, "num_tokens": 32800439.0, "step": 14310 }, { "entropy": 5.183023118972779, "epoch": 1.3751200768491834, "grad_norm": 1.2734375, "learning_rate": 0.00048173287138023204, "loss": 4.9448, "mean_token_accuracy": 0.2125942125916481, "num_tokens": 32813605.0, "step": 14315 }, { "entropy": 5.2913895606994625, "epoch": 1.3756003842459175, "grad_norm": 1.359375, "learning_rate": 0.0004817193374773993, "loss": 5.0649, "mean_token_accuracy": 0.21377015858888626, "num_tokens": 32824225.0, "step": 14320 }, { "entropy": 5.224006319046021, "epoch": 1.3760806916426513, "grad_norm": 1.2734375, "learning_rate": 0.00048170579877515753, "loss": 5.0986, "mean_token_accuracy": 0.1994484543800354, "num_tokens": 32836917.0, "step": 14325 }, { "entropy": 5.171962022781372, "epoch": 1.3765609990393852, "grad_norm": 1.2421875, "learning_rate": 0.0004816922552738209, "loss": 4.9375, "mean_token_accuracy": 0.2171504095196724, "num_tokens": 32847972.0, "step": 14330 }, { "entropy": 5.122869682312012, "epoch": 1.377041306436119, "grad_norm": 1.171875, "learning_rate": 0.00048167870697370373, "loss": 4.9153, "mean_token_accuracy": 0.2238215833902359, "num_tokens": 32858922.0, "step": 14335 }, { "entropy": 5.170677995681762, "epoch": 1.377521613832853, "grad_norm": 1.140625, "learning_rate": 0.0004816651538751207, "loss": 4.8897, "mean_token_accuracy": 0.21658048182725906, "num_tokens": 32869788.0, "step": 14340 }, { "entropy": 5.328837919235229, "epoch": 1.3780019212295869, "grad_norm": 1.28125, "learning_rate": 0.00048165159597838664, "loss": 5.1289, "mean_token_accuracy": 0.1964000031352043, "num_tokens": 32881678.0, "step": 14345 }, { "entropy": 5.239847898483276, "epoch": 1.378482228626321, "grad_norm": 1.296875, "learning_rate": 0.000481638033283816, "loss": 5.0056, "mean_token_accuracy": 0.20893828570842743, "num_tokens": 32894183.0, "step": 14350 }, { "entropy": 5.181384801864624, "epoch": 1.3789625360230549, "grad_norm": 1.3125, "learning_rate": 0.00048162446579172387, "loss": 4.9588, "mean_token_accuracy": 0.21461172699928283, "num_tokens": 32906001.0, "step": 14355 }, { "entropy": 5.219556427001953, "epoch": 1.3794428434197887, "grad_norm": 1.2265625, "learning_rate": 0.0004816108935024252, "loss": 4.9869, "mean_token_accuracy": 0.21462354362010955, "num_tokens": 32917236.0, "step": 14360 }, { "entropy": 5.205921077728272, "epoch": 1.3799231508165226, "grad_norm": 1.4296875, "learning_rate": 0.00048159731641623507, "loss": 5.0295, "mean_token_accuracy": 0.20555862188339233, "num_tokens": 32929710.0, "step": 14365 }, { "entropy": 5.1836954116821286, "epoch": 1.3804034582132565, "grad_norm": 1.359375, "learning_rate": 0.0004815837345334687, "loss": 5.0501, "mean_token_accuracy": 0.21190683096647261, "num_tokens": 32941565.0, "step": 14370 }, { "entropy": 5.2973743915557865, "epoch": 1.3808837656099904, "grad_norm": 1.4140625, "learning_rate": 0.0004815701478544415, "loss": 5.1076, "mean_token_accuracy": 0.20745208263397216, "num_tokens": 32952730.0, "step": 14375 }, { "entropy": 5.153661918640137, "epoch": 1.3813640730067243, "grad_norm": 1.3828125, "learning_rate": 0.00048155655637946876, "loss": 4.9323, "mean_token_accuracy": 0.21619703769683837, "num_tokens": 32963589.0, "step": 14380 }, { "entropy": 5.156609296798706, "epoch": 1.3818443804034581, "grad_norm": 1.3359375, "learning_rate": 0.0004815429601088662, "loss": 4.8997, "mean_token_accuracy": 0.21508285403251648, "num_tokens": 32975008.0, "step": 14385 }, { "entropy": 5.220380783081055, "epoch": 1.382324687800192, "grad_norm": 1.2421875, "learning_rate": 0.0004815293590429494, "loss": 5.0953, "mean_token_accuracy": 0.20399677157402038, "num_tokens": 32986497.0, "step": 14390 }, { "entropy": 5.126517963409424, "epoch": 1.382804995196926, "grad_norm": 1.4296875, "learning_rate": 0.00048151575318203417, "loss": 4.8398, "mean_token_accuracy": 0.2227206841111183, "num_tokens": 32998298.0, "step": 14395 }, { "entropy": 5.315741157531738, "epoch": 1.38328530259366, "grad_norm": 1.109375, "learning_rate": 0.00048150214252643637, "loss": 5.0991, "mean_token_accuracy": 0.20775451213121415, "num_tokens": 33010943.0, "step": 14400 }, { "entropy": 5.2394379615783695, "epoch": 1.3837656099903939, "grad_norm": 1.265625, "learning_rate": 0.000481488527076472, "loss": 5.0057, "mean_token_accuracy": 0.2137501820921898, "num_tokens": 33022139.0, "step": 14405 }, { "entropy": 5.229398345947265, "epoch": 1.3842459173871278, "grad_norm": 1.5, "learning_rate": 0.0004814749068324571, "loss": 5.0743, "mean_token_accuracy": 0.208684541285038, "num_tokens": 33034653.0, "step": 14410 }, { "entropy": 5.133923292160034, "epoch": 1.3847262247838616, "grad_norm": 1.2578125, "learning_rate": 0.00048146128179470804, "loss": 4.7775, "mean_token_accuracy": 0.2346142292022705, "num_tokens": 33044339.0, "step": 14415 }, { "entropy": 5.1511882781982425, "epoch": 1.3852065321805955, "grad_norm": 1.234375, "learning_rate": 0.0004814476519635411, "loss": 4.9889, "mean_token_accuracy": 0.21854183077812195, "num_tokens": 33055418.0, "step": 14420 }, { "entropy": 5.2140251159667965, "epoch": 1.3856868395773294, "grad_norm": 1.3828125, "learning_rate": 0.00048143401733927274, "loss": 5.0771, "mean_token_accuracy": 0.21263548582792283, "num_tokens": 33067239.0, "step": 14425 }, { "entropy": 5.169557380676269, "epoch": 1.3861671469740635, "grad_norm": 1.1640625, "learning_rate": 0.00048142037792221943, "loss": 4.8765, "mean_token_accuracy": 0.22270715832710267, "num_tokens": 33079101.0, "step": 14430 }, { "entropy": 5.172262811660767, "epoch": 1.3866474543707974, "grad_norm": 1.1484375, "learning_rate": 0.0004814067337126978, "loss": 4.9711, "mean_token_accuracy": 0.21912187784910203, "num_tokens": 33090265.0, "step": 14435 }, { "entropy": 5.1767114162445065, "epoch": 1.3871277617675313, "grad_norm": 1.1484375, "learning_rate": 0.0004813930847110248, "loss": 4.9278, "mean_token_accuracy": 0.22050705552101135, "num_tokens": 33101724.0, "step": 14440 }, { "entropy": 5.140494298934937, "epoch": 1.3876080691642652, "grad_norm": 1.25, "learning_rate": 0.0004813794309175173, "loss": 4.9003, "mean_token_accuracy": 0.2202922970056534, "num_tokens": 33113111.0, "step": 14445 }, { "entropy": 5.224148082733154, "epoch": 1.388088376560999, "grad_norm": 1.15625, "learning_rate": 0.00048136577233249205, "loss": 5.0668, "mean_token_accuracy": 0.20422582030296327, "num_tokens": 33123925.0, "step": 14450 }, { "entropy": 5.21802864074707, "epoch": 1.388568683957733, "grad_norm": 1.2734375, "learning_rate": 0.0004813521089562666, "loss": 5.0279, "mean_token_accuracy": 0.2057347998023033, "num_tokens": 33137400.0, "step": 14455 }, { "entropy": 5.1206972122192385, "epoch": 1.3890489913544668, "grad_norm": 1.0625, "learning_rate": 0.0004813384407891577, "loss": 4.8598, "mean_token_accuracy": 0.225153611600399, "num_tokens": 33149326.0, "step": 14460 }, { "entropy": 5.25689435005188, "epoch": 1.3895292987512007, "grad_norm": 1.2578125, "learning_rate": 0.000481324767831483, "loss": 5.0631, "mean_token_accuracy": 0.20121145844459534, "num_tokens": 33159904.0, "step": 14465 }, { "entropy": 5.1242194175720215, "epoch": 1.3900096061479346, "grad_norm": 1.125, "learning_rate": 0.0004813110900835598, "loss": 4.9674, "mean_token_accuracy": 0.21700112670660018, "num_tokens": 33171701.0, "step": 14470 }, { "entropy": 5.188337993621826, "epoch": 1.3904899135446687, "grad_norm": 1.2421875, "learning_rate": 0.0004812974075457058, "loss": 4.8593, "mean_token_accuracy": 0.21914471834897994, "num_tokens": 33181598.0, "step": 14475 }, { "entropy": 5.158151054382325, "epoch": 1.3909702209414025, "grad_norm": 1.2265625, "learning_rate": 0.00048128372021823845, "loss": 4.9018, "mean_token_accuracy": 0.2141671285033226, "num_tokens": 33192674.0, "step": 14480 }, { "entropy": 5.147328567504883, "epoch": 1.3914505283381364, "grad_norm": 1.234375, "learning_rate": 0.00048127002810147574, "loss": 4.9428, "mean_token_accuracy": 0.21623384952545166, "num_tokens": 33203356.0, "step": 14485 }, { "entropy": 5.203651762008667, "epoch": 1.3919308357348703, "grad_norm": 1.4140625, "learning_rate": 0.0004812563311957355, "loss": 5.0018, "mean_token_accuracy": 0.20919703990221022, "num_tokens": 33215471.0, "step": 14490 }, { "entropy": 5.1700574398040775, "epoch": 1.3924111431316042, "grad_norm": 1.2109375, "learning_rate": 0.0004812426295013356, "loss": 4.9644, "mean_token_accuracy": 0.2118792712688446, "num_tokens": 33227656.0, "step": 14495 }, { "entropy": 5.211377429962158, "epoch": 1.392891450528338, "grad_norm": 1.265625, "learning_rate": 0.00048122892301859433, "loss": 5.0935, "mean_token_accuracy": 0.2065966710448265, "num_tokens": 33239752.0, "step": 14500 }, { "entropy": 5.261070919036865, "epoch": 1.3933717579250722, "grad_norm": 1.3125, "learning_rate": 0.00048121521174782983, "loss": 5.024, "mean_token_accuracy": 0.20616735219955445, "num_tokens": 33251352.0, "step": 14505 }, { "entropy": 5.210687255859375, "epoch": 1.393852065321806, "grad_norm": 1.3046875, "learning_rate": 0.00048120149568936044, "loss": 5.0163, "mean_token_accuracy": 0.210965596139431, "num_tokens": 33262276.0, "step": 14510 }, { "entropy": 5.234826755523682, "epoch": 1.39433237271854, "grad_norm": 1.3515625, "learning_rate": 0.0004811877748435046, "loss": 5.0574, "mean_token_accuracy": 0.20583246052265167, "num_tokens": 33273615.0, "step": 14515 }, { "entropy": 5.220763158798218, "epoch": 1.3948126801152738, "grad_norm": 1.4140625, "learning_rate": 0.0004811740492105809, "loss": 4.9741, "mean_token_accuracy": 0.2154085621237755, "num_tokens": 33283990.0, "step": 14520 }, { "entropy": 5.14139952659607, "epoch": 1.3952929875120077, "grad_norm": 1.2890625, "learning_rate": 0.000481160318790908, "loss": 4.9668, "mean_token_accuracy": 0.21832637190818788, "num_tokens": 33295472.0, "step": 14525 }, { "entropy": 5.236377191543579, "epoch": 1.3957732949087416, "grad_norm": 1.3125, "learning_rate": 0.00048114658358480467, "loss": 5.028, "mean_token_accuracy": 0.21142471432685853, "num_tokens": 33306742.0, "step": 14530 }, { "entropy": 5.2647254943847654, "epoch": 1.3962536023054755, "grad_norm": 1.28125, "learning_rate": 0.00048113284359258977, "loss": 5.0231, "mean_token_accuracy": 0.20946406126022338, "num_tokens": 33317737.0, "step": 14535 }, { "entropy": 5.16503872871399, "epoch": 1.3967339097022093, "grad_norm": 1.1953125, "learning_rate": 0.00048111909881458234, "loss": 5.0284, "mean_token_accuracy": 0.20923743396997452, "num_tokens": 33329673.0, "step": 14540 }, { "entropy": 5.086363649368286, "epoch": 1.3972142170989432, "grad_norm": 1.2265625, "learning_rate": 0.00048110534925110146, "loss": 4.8421, "mean_token_accuracy": 0.2240893319249153, "num_tokens": 33342047.0, "step": 14545 }, { "entropy": 5.161404705047607, "epoch": 1.397694524495677, "grad_norm": 1.2734375, "learning_rate": 0.0004810915949024664, "loss": 4.9326, "mean_token_accuracy": 0.21960555166006088, "num_tokens": 33353287.0, "step": 14550 }, { "entropy": 5.251311635971069, "epoch": 1.3981748318924112, "grad_norm": 1.046875, "learning_rate": 0.0004810778357689965, "loss": 5.0174, "mean_token_accuracy": 0.21918236762285231, "num_tokens": 33365465.0, "step": 14555 }, { "entropy": 5.1449960231781, "epoch": 1.398655139289145, "grad_norm": 1.21875, "learning_rate": 0.00048106407185101116, "loss": 4.9636, "mean_token_accuracy": 0.21153138428926468, "num_tokens": 33376680.0, "step": 14560 }, { "entropy": 5.135947895050049, "epoch": 1.399135446685879, "grad_norm": 1.171875, "learning_rate": 0.00048105030314883, "loss": 4.9717, "mean_token_accuracy": 0.21088991016149522, "num_tokens": 33387995.0, "step": 14565 }, { "entropy": 5.18218960762024, "epoch": 1.3996157540826129, "grad_norm": 1.1171875, "learning_rate": 0.0004810365296627725, "loss": 4.9477, "mean_token_accuracy": 0.21563183516263962, "num_tokens": 33400455.0, "step": 14570 }, { "entropy": 5.21840271949768, "epoch": 1.4000960614793467, "grad_norm": 1.265625, "learning_rate": 0.0004810227513931587, "loss": 4.9926, "mean_token_accuracy": 0.20900965332984925, "num_tokens": 33413264.0, "step": 14575 }, { "entropy": 5.157942056655884, "epoch": 1.4005763688760806, "grad_norm": 1.25, "learning_rate": 0.0004810089683403084, "loss": 4.8773, "mean_token_accuracy": 0.224508535861969, "num_tokens": 33423516.0, "step": 14580 }, { "entropy": 5.199560356140137, "epoch": 1.4010566762728147, "grad_norm": 1.1953125, "learning_rate": 0.0004809951805045415, "loss": 5.0182, "mean_token_accuracy": 0.21045213490724562, "num_tokens": 33434952.0, "step": 14585 }, { "entropy": 5.167169284820557, "epoch": 1.4015369836695486, "grad_norm": 1.234375, "learning_rate": 0.00048098138788617815, "loss": 4.9808, "mean_token_accuracy": 0.20959090143442155, "num_tokens": 33447025.0, "step": 14590 }, { "entropy": 5.233838939666748, "epoch": 1.4020172910662825, "grad_norm": 1.125, "learning_rate": 0.0004809675904855387, "loss": 4.9454, "mean_token_accuracy": 0.21511317044496536, "num_tokens": 33459990.0, "step": 14595 }, { "entropy": 5.194369840621948, "epoch": 1.4024975984630164, "grad_norm": 1.1875, "learning_rate": 0.00048095378830294343, "loss": 4.92, "mean_token_accuracy": 0.21844571679830552, "num_tokens": 33471334.0, "step": 14600 }, { "entropy": 5.197934675216675, "epoch": 1.4029779058597502, "grad_norm": 1.1796875, "learning_rate": 0.00048093998133871276, "loss": 5.0333, "mean_token_accuracy": 0.21131069511175155, "num_tokens": 33483744.0, "step": 14605 }, { "entropy": 5.198799562454224, "epoch": 1.4034582132564841, "grad_norm": 1.3515625, "learning_rate": 0.0004809261695931671, "loss": 5.0426, "mean_token_accuracy": 0.21084193140268326, "num_tokens": 33496532.0, "step": 14610 }, { "entropy": 5.159095096588135, "epoch": 1.403938520653218, "grad_norm": 1.1484375, "learning_rate": 0.0004809123530666273, "loss": 4.9746, "mean_token_accuracy": 0.21460918039083482, "num_tokens": 33507545.0, "step": 14615 }, { "entropy": 5.24304347038269, "epoch": 1.4044188280499519, "grad_norm": 1.203125, "learning_rate": 0.0004808985317594142, "loss": 4.9747, "mean_token_accuracy": 0.21796323955059052, "num_tokens": 33519232.0, "step": 14620 }, { "entropy": 5.157637119293213, "epoch": 1.4048991354466858, "grad_norm": 1.390625, "learning_rate": 0.00048088470567184854, "loss": 4.9454, "mean_token_accuracy": 0.2096275046467781, "num_tokens": 33531088.0, "step": 14625 }, { "entropy": 5.225188112258911, "epoch": 1.4053794428434199, "grad_norm": 1.296875, "learning_rate": 0.00048087087480425133, "loss": 5.0125, "mean_token_accuracy": 0.2120126485824585, "num_tokens": 33543991.0, "step": 14630 }, { "entropy": 5.237149286270141, "epoch": 1.4058597502401537, "grad_norm": 1.1953125, "learning_rate": 0.0004808570391569437, "loss": 5.0633, "mean_token_accuracy": 0.20491664558649064, "num_tokens": 33555303.0, "step": 14635 }, { "entropy": 5.264308309555053, "epoch": 1.4063400576368876, "grad_norm": 1.2734375, "learning_rate": 0.00048084319873024694, "loss": 5.0398, "mean_token_accuracy": 0.2091526836156845, "num_tokens": 33565587.0, "step": 14640 }, { "entropy": 5.181278657913208, "epoch": 1.4068203650336215, "grad_norm": 1.296875, "learning_rate": 0.0004808293535244823, "loss": 4.9055, "mean_token_accuracy": 0.21529979556798934, "num_tokens": 33576874.0, "step": 14645 }, { "entropy": 5.14690670967102, "epoch": 1.4073006724303554, "grad_norm": 1.53125, "learning_rate": 0.0004808155035399712, "loss": 4.9121, "mean_token_accuracy": 0.22220734357833863, "num_tokens": 33587703.0, "step": 14650 }, { "entropy": 5.13805742263794, "epoch": 1.4077809798270893, "grad_norm": 1.296875, "learning_rate": 0.0004808016487770354, "loss": 4.8853, "mean_token_accuracy": 0.22091327458620072, "num_tokens": 33598487.0, "step": 14655 }, { "entropy": 5.156050348281861, "epoch": 1.4082612872238234, "grad_norm": 1.265625, "learning_rate": 0.00048078778923599637, "loss": 5.0117, "mean_token_accuracy": 0.20967613756656647, "num_tokens": 33610838.0, "step": 14660 }, { "entropy": 5.176731777191162, "epoch": 1.4087415946205573, "grad_norm": 1.2265625, "learning_rate": 0.00048077392491717593, "loss": 4.958, "mean_token_accuracy": 0.21370896100997924, "num_tokens": 33622726.0, "step": 14665 }, { "entropy": 5.12830867767334, "epoch": 1.4092219020172911, "grad_norm": 1.2890625, "learning_rate": 0.00048076005582089597, "loss": 4.935, "mean_token_accuracy": 0.21416952610015869, "num_tokens": 33635922.0, "step": 14670 }, { "entropy": 5.317528486251831, "epoch": 1.409702209414025, "grad_norm": 1.3359375, "learning_rate": 0.00048074618194747845, "loss": 5.1027, "mean_token_accuracy": 0.20418261289596557, "num_tokens": 33648486.0, "step": 14675 }, { "entropy": 5.2534605979919435, "epoch": 1.410182516810759, "grad_norm": 1.2578125, "learning_rate": 0.0004807323032972456, "loss": 4.9975, "mean_token_accuracy": 0.21309973299503326, "num_tokens": 33659812.0, "step": 14680 }, { "entropy": 5.222612524032593, "epoch": 1.4106628242074928, "grad_norm": 1.21875, "learning_rate": 0.0004807184198705196, "loss": 4.9752, "mean_token_accuracy": 0.21646286994218827, "num_tokens": 33671878.0, "step": 14685 }, { "entropy": 5.133413934707642, "epoch": 1.4111431316042267, "grad_norm": 1.1171875, "learning_rate": 0.0004807045316676226, "loss": 4.8809, "mean_token_accuracy": 0.21957986503839494, "num_tokens": 33683759.0, "step": 14690 }, { "entropy": 5.253868293762207, "epoch": 1.4116234390009605, "grad_norm": 1.2109375, "learning_rate": 0.0004806906386888773, "loss": 5.0728, "mean_token_accuracy": 0.2058554098010063, "num_tokens": 33694085.0, "step": 14695 }, { "entropy": 5.213767671585083, "epoch": 1.4121037463976944, "grad_norm": 1.3203125, "learning_rate": 0.00048067674093460607, "loss": 4.9295, "mean_token_accuracy": 0.21423121094703673, "num_tokens": 33705810.0, "step": 14700 }, { "entropy": 5.210308361053467, "epoch": 1.4125840537944283, "grad_norm": 1.3203125, "learning_rate": 0.00048066283840513175, "loss": 5.0187, "mean_token_accuracy": 0.21089961528778076, "num_tokens": 33716798.0, "step": 14705 }, { "entropy": 5.149886178970337, "epoch": 1.4130643611911624, "grad_norm": 1.453125, "learning_rate": 0.0004806489311007769, "loss": 4.9737, "mean_token_accuracy": 0.22218613475561141, "num_tokens": 33728515.0, "step": 14710 }, { "entropy": 5.183704948425293, "epoch": 1.4135446685878963, "grad_norm": 1.078125, "learning_rate": 0.00048063501902186463, "loss": 4.9941, "mean_token_accuracy": 0.21435530483722687, "num_tokens": 33740684.0, "step": 14715 }, { "entropy": 5.210676050186157, "epoch": 1.4140249759846302, "grad_norm": 1.5390625, "learning_rate": 0.00048062110216871775, "loss": 4.9772, "mean_token_accuracy": 0.2168477714061737, "num_tokens": 33753017.0, "step": 14720 }, { "entropy": 5.175452709197998, "epoch": 1.414505283381364, "grad_norm": 1.3203125, "learning_rate": 0.00048060718054165945, "loss": 4.9873, "mean_token_accuracy": 0.21028392165899276, "num_tokens": 33764393.0, "step": 14725 }, { "entropy": 5.235194349288941, "epoch": 1.414985590778098, "grad_norm": 1.3046875, "learning_rate": 0.000480593254141013, "loss": 5.0995, "mean_token_accuracy": 0.2018577605485916, "num_tokens": 33774941.0, "step": 14730 }, { "entropy": 5.101527976989746, "epoch": 1.4154658981748318, "grad_norm": 1.2890625, "learning_rate": 0.00048057932296710165, "loss": 4.8514, "mean_token_accuracy": 0.22604466378688812, "num_tokens": 33786534.0, "step": 14735 }, { "entropy": 5.284107494354248, "epoch": 1.415946205571566, "grad_norm": 1.3671875, "learning_rate": 0.0004805653870202489, "loss": 5.1043, "mean_token_accuracy": 0.2047765925526619, "num_tokens": 33798339.0, "step": 14740 }, { "entropy": 5.283356618881226, "epoch": 1.4164265129682998, "grad_norm": 1.34375, "learning_rate": 0.00048055144630077825, "loss": 5.1154, "mean_token_accuracy": 0.2043526902794838, "num_tokens": 33810368.0, "step": 14745 }, { "entropy": 5.194024896621704, "epoch": 1.4169068203650337, "grad_norm": 1.34375, "learning_rate": 0.00048053750080901336, "loss": 4.9659, "mean_token_accuracy": 0.2081344470381737, "num_tokens": 33821111.0, "step": 14750 }, { "entropy": 5.111878871917725, "epoch": 1.4173871277617676, "grad_norm": 1.15625, "learning_rate": 0.00048052355054527794, "loss": 4.9638, "mean_token_accuracy": 0.21788180470466614, "num_tokens": 33833629.0, "step": 14755 }, { "entropy": 5.268445110321045, "epoch": 1.4178674351585014, "grad_norm": 1.2890625, "learning_rate": 0.00048050959550989606, "loss": 5.063, "mean_token_accuracy": 0.20761503428220748, "num_tokens": 33846531.0, "step": 14760 }, { "entropy": 5.145606899261475, "epoch": 1.4183477425552353, "grad_norm": 1.6640625, "learning_rate": 0.0004804956357031916, "loss": 4.9377, "mean_token_accuracy": 0.21531563848257065, "num_tokens": 33857251.0, "step": 14765 }, { "entropy": 5.137730884552002, "epoch": 1.4188280499519692, "grad_norm": 1.3203125, "learning_rate": 0.00048048167112548873, "loss": 4.9639, "mean_token_accuracy": 0.21248998492956161, "num_tokens": 33869314.0, "step": 14770 }, { "entropy": 5.285694265365601, "epoch": 1.419308357348703, "grad_norm": 1.25, "learning_rate": 0.00048046770177711157, "loss": 5.0916, "mean_token_accuracy": 0.2074048936367035, "num_tokens": 33880203.0, "step": 14775 }, { "entropy": 5.2045482158660885, "epoch": 1.419788664745437, "grad_norm": 1.2109375, "learning_rate": 0.0004804537276583844, "loss": 4.9006, "mean_token_accuracy": 0.2209893763065338, "num_tokens": 33891379.0, "step": 14780 }, { "entropy": 5.149256420135498, "epoch": 1.420268972142171, "grad_norm": 1.4765625, "learning_rate": 0.0004804397487696319, "loss": 4.9435, "mean_token_accuracy": 0.21625811159610747, "num_tokens": 33902788.0, "step": 14785 }, { "entropy": 5.160754537582397, "epoch": 1.420749279538905, "grad_norm": 1.234375, "learning_rate": 0.0004804257651111783, "loss": 4.9719, "mean_token_accuracy": 0.21887465864419936, "num_tokens": 33913609.0, "step": 14790 }, { "entropy": 5.174741888046265, "epoch": 1.4212295869356388, "grad_norm": 1.3515625, "learning_rate": 0.00048041177668334853, "loss": 4.8739, "mean_token_accuracy": 0.21784851402044297, "num_tokens": 33924379.0, "step": 14795 }, { "entropy": 5.137226343154907, "epoch": 1.4217098943323727, "grad_norm": 1.328125, "learning_rate": 0.0004803977834864672, "loss": 5.009, "mean_token_accuracy": 0.20945742577314377, "num_tokens": 33936209.0, "step": 14800 }, { "entropy": 5.228136348724365, "epoch": 1.4221902017291066, "grad_norm": 1.28125, "learning_rate": 0.00048038378552085927, "loss": 4.9569, "mean_token_accuracy": 0.2131284847855568, "num_tokens": 33947679.0, "step": 14805 }, { "entropy": 5.164991664886474, "epoch": 1.4226705091258405, "grad_norm": 1.6953125, "learning_rate": 0.00048036978278684974, "loss": 4.9628, "mean_token_accuracy": 0.2154536247253418, "num_tokens": 33959474.0, "step": 14810 }, { "entropy": 5.293703842163086, "epoch": 1.4231508165225746, "grad_norm": 1.2421875, "learning_rate": 0.0004803557752847636, "loss": 5.1033, "mean_token_accuracy": 0.20608988404273987, "num_tokens": 33970831.0, "step": 14815 }, { "entropy": 5.348191404342652, "epoch": 1.4236311239193085, "grad_norm": 1.25, "learning_rate": 0.00048034176301492616, "loss": 5.0618, "mean_token_accuracy": 0.2066340461373329, "num_tokens": 33981765.0, "step": 14820 }, { "entropy": 5.168016386032105, "epoch": 1.4241114313160423, "grad_norm": 1.3828125, "learning_rate": 0.0004803277459776628, "loss": 4.9541, "mean_token_accuracy": 0.21502473205327988, "num_tokens": 33992435.0, "step": 14825 }, { "entropy": 5.09135160446167, "epoch": 1.4245917387127762, "grad_norm": 1.3671875, "learning_rate": 0.00048031372417329875, "loss": 4.9171, "mean_token_accuracy": 0.22110578566789627, "num_tokens": 34004570.0, "step": 14830 }, { "entropy": 5.189966630935669, "epoch": 1.42507204610951, "grad_norm": 1.2890625, "learning_rate": 0.0004802996976021598, "loss": 4.9145, "mean_token_accuracy": 0.22021586894989015, "num_tokens": 34015494.0, "step": 14835 }, { "entropy": 5.226748323440551, "epoch": 1.425552353506244, "grad_norm": 1.1796875, "learning_rate": 0.00048028566626457145, "loss": 4.9932, "mean_token_accuracy": 0.20883565545082092, "num_tokens": 34026684.0, "step": 14840 }, { "entropy": 5.173442220687866, "epoch": 1.4260326609029779, "grad_norm": 1.171875, "learning_rate": 0.00048027163016085947, "loss": 4.9726, "mean_token_accuracy": 0.21021606177091598, "num_tokens": 34038948.0, "step": 14845 }, { "entropy": 5.259505367279052, "epoch": 1.4265129682997117, "grad_norm": 1.328125, "learning_rate": 0.00048025758929134976, "loss": 5.1272, "mean_token_accuracy": 0.20375512093305587, "num_tokens": 34052216.0, "step": 14850 }, { "entropy": 5.240663814544678, "epoch": 1.4269932756964456, "grad_norm": 1.234375, "learning_rate": 0.0004802435436563684, "loss": 4.9785, "mean_token_accuracy": 0.22330356240272523, "num_tokens": 34062602.0, "step": 14855 }, { "entropy": 5.1824178218841555, "epoch": 1.4274735830931795, "grad_norm": 1.2265625, "learning_rate": 0.00048022949325624134, "loss": 4.9436, "mean_token_accuracy": 0.21629261821508408, "num_tokens": 34075049.0, "step": 14860 }, { "entropy": 5.177359342575073, "epoch": 1.4279538904899136, "grad_norm": 1.40625, "learning_rate": 0.00048021543809129483, "loss": 5.0492, "mean_token_accuracy": 0.20940061509609223, "num_tokens": 34086719.0, "step": 14865 }, { "entropy": 5.237700700759888, "epoch": 1.4284341978866475, "grad_norm": 1.28125, "learning_rate": 0.0004802013781618552, "loss": 4.9707, "mean_token_accuracy": 0.21425776779651642, "num_tokens": 34098439.0, "step": 14870 }, { "entropy": 5.309500598907471, "epoch": 1.4289145052833814, "grad_norm": 1.328125, "learning_rate": 0.00048018731346824895, "loss": 4.9895, "mean_token_accuracy": 0.21168100982904434, "num_tokens": 34110711.0, "step": 14875 }, { "entropy": 5.141423988342285, "epoch": 1.4293948126801153, "grad_norm": 1.71875, "learning_rate": 0.0004801732440108026, "loss": 4.9326, "mean_token_accuracy": 0.21338745206594467, "num_tokens": 34122191.0, "step": 14880 }, { "entropy": 5.094103765487671, "epoch": 1.4298751200768491, "grad_norm": 2.46875, "learning_rate": 0.0004801591697898427, "loss": 4.8899, "mean_token_accuracy": 0.21937906593084336, "num_tokens": 34132838.0, "step": 14885 }, { "entropy": 5.210858488082886, "epoch": 1.430355427473583, "grad_norm": 1.6640625, "learning_rate": 0.0004801450908056961, "loss": 5.0114, "mean_token_accuracy": 0.20959677100181578, "num_tokens": 34143394.0, "step": 14890 }, { "entropy": 5.3004334449768065, "epoch": 1.4308357348703171, "grad_norm": 1.5, "learning_rate": 0.0004801310070586896, "loss": 5.0731, "mean_token_accuracy": 0.20850346684455873, "num_tokens": 34155934.0, "step": 14895 }, { "entropy": 5.077618026733399, "epoch": 1.431316042267051, "grad_norm": 1.265625, "learning_rate": 0.0004801169185491503, "loss": 4.8665, "mean_token_accuracy": 0.22559798061847686, "num_tokens": 34167949.0, "step": 14900 }, { "entropy": 5.2307600498199465, "epoch": 1.4317963496637849, "grad_norm": 1.5703125, "learning_rate": 0.00048010282527740516, "loss": 5.1348, "mean_token_accuracy": 0.20259464681148528, "num_tokens": 34179733.0, "step": 14905 }, { "entropy": 5.207586050033569, "epoch": 1.4322766570605188, "grad_norm": 1.4296875, "learning_rate": 0.00048008872724378146, "loss": 4.9037, "mean_token_accuracy": 0.2150167018175125, "num_tokens": 34190513.0, "step": 14910 }, { "entropy": 5.152509164810181, "epoch": 1.4327569644572526, "grad_norm": 1.328125, "learning_rate": 0.0004800746244486065, "loss": 4.9531, "mean_token_accuracy": 0.21709322184324265, "num_tokens": 34201562.0, "step": 14915 }, { "entropy": 5.062794637680054, "epoch": 1.4332372718539865, "grad_norm": 1.453125, "learning_rate": 0.0004800605168922077, "loss": 4.9106, "mean_token_accuracy": 0.21840206682682037, "num_tokens": 34212637.0, "step": 14920 }, { "entropy": 5.150622749328614, "epoch": 1.4337175792507204, "grad_norm": 1.21875, "learning_rate": 0.00048004640457491267, "loss": 4.9488, "mean_token_accuracy": 0.21455983370542525, "num_tokens": 34225394.0, "step": 14925 }, { "entropy": 5.278602600097656, "epoch": 1.4341978866474543, "grad_norm": 1.3046875, "learning_rate": 0.000480032287497049, "loss": 4.9471, "mean_token_accuracy": 0.2104356735944748, "num_tokens": 34236977.0, "step": 14930 }, { "entropy": 5.212551403045654, "epoch": 1.4346781940441882, "grad_norm": 1.21875, "learning_rate": 0.00048001816565894427, "loss": 5.048, "mean_token_accuracy": 0.21722146570682527, "num_tokens": 34247486.0, "step": 14935 }, { "entropy": 5.173838663101196, "epoch": 1.435158501440922, "grad_norm": 1.234375, "learning_rate": 0.0004800040390609267, "loss": 4.9366, "mean_token_accuracy": 0.22200540751218795, "num_tokens": 34259404.0, "step": 14940 }, { "entropy": 5.181051015853882, "epoch": 1.4356388088376562, "grad_norm": 1.2578125, "learning_rate": 0.00047998990770332396, "loss": 4.933, "mean_token_accuracy": 0.22339427024126052, "num_tokens": 34270388.0, "step": 14945 }, { "entropy": 5.264690160751343, "epoch": 1.43611911623439, "grad_norm": 1.3046875, "learning_rate": 0.0004799757715864643, "loss": 4.9981, "mean_token_accuracy": 0.21528103947639465, "num_tokens": 34281321.0, "step": 14950 }, { "entropy": 5.287259483337403, "epoch": 1.436599423631124, "grad_norm": 1.3671875, "learning_rate": 0.0004799616307106759, "loss": 5.0543, "mean_token_accuracy": 0.20392760783433914, "num_tokens": 34293177.0, "step": 14955 }, { "entropy": 5.23415994644165, "epoch": 1.4370797310278578, "grad_norm": 1.3125, "learning_rate": 0.000479947485076287, "loss": 5.0558, "mean_token_accuracy": 0.21059397161006926, "num_tokens": 34305175.0, "step": 14960 }, { "entropy": 5.194935846328735, "epoch": 1.4375600384245917, "grad_norm": 1.3203125, "learning_rate": 0.00047993333468362607, "loss": 5.0247, "mean_token_accuracy": 0.20465970337390899, "num_tokens": 34317969.0, "step": 14965 }, { "entropy": 5.307715892791748, "epoch": 1.4380403458213258, "grad_norm": 1.15625, "learning_rate": 0.00047991917953302173, "loss": 5.0479, "mean_token_accuracy": 0.20630020052194595, "num_tokens": 34329913.0, "step": 14970 }, { "entropy": 5.25423846244812, "epoch": 1.4385206532180597, "grad_norm": 1.28125, "learning_rate": 0.00047990501962480236, "loss": 4.9951, "mean_token_accuracy": 0.2149421378970146, "num_tokens": 34341656.0, "step": 14975 }, { "entropy": 5.171209383010864, "epoch": 1.4390009606147935, "grad_norm": 1.3046875, "learning_rate": 0.000479890854959297, "loss": 4.9685, "mean_token_accuracy": 0.21098122894763946, "num_tokens": 34351767.0, "step": 14980 }, { "entropy": 5.176991987228393, "epoch": 1.4394812680115274, "grad_norm": 1.2578125, "learning_rate": 0.0004798766855368344, "loss": 4.9592, "mean_token_accuracy": 0.2132784456014633, "num_tokens": 34363437.0, "step": 14985 }, { "entropy": 5.265459060668945, "epoch": 1.4399615754082613, "grad_norm": 1.4921875, "learning_rate": 0.00047986251135774343, "loss": 5.0465, "mean_token_accuracy": 0.2133356049656868, "num_tokens": 34374991.0, "step": 14990 }, { "entropy": 5.162412214279175, "epoch": 1.4404418828049952, "grad_norm": 1.453125, "learning_rate": 0.0004798483324223533, "loss": 4.9692, "mean_token_accuracy": 0.21427072286605836, "num_tokens": 34386218.0, "step": 14995 }, { "entropy": 5.246811056137085, "epoch": 1.440922190201729, "grad_norm": 1.2734375, "learning_rate": 0.0004798341487309932, "loss": 5.0105, "mean_token_accuracy": 0.2116893395781517, "num_tokens": 34396287.0, "step": 15000 }, { "epoch": 1.440922190201729, "eval_entropy": 5.037319316682671, "eval_loss": 5.06929874420166, "eval_mean_token_accuracy": 0.21831489476792584, "eval_num_tokens": 34396287.0, "eval_runtime": 26.5223, "eval_samples_per_second": 1237.261, "eval_steps_per_second": 154.662, "step": 15000 }, { "entropy": 5.250750732421875, "epoch": 1.441402497598463, "grad_norm": 1.1640625, "learning_rate": 0.00047981996028399233, "loss": 5.0093, "mean_token_accuracy": 0.20998309999704362, "num_tokens": 34407251.0, "step": 15005 }, { "entropy": 5.326452255249023, "epoch": 1.4418828049951968, "grad_norm": 1.375, "learning_rate": 0.0004798057670816802, "loss": 5.1435, "mean_token_accuracy": 0.20517653226852417, "num_tokens": 34419185.0, "step": 15010 }, { "entropy": 5.208475351333618, "epoch": 1.4423631123919307, "grad_norm": 1.1953125, "learning_rate": 0.0004797915691243863, "loss": 4.9709, "mean_token_accuracy": 0.21588644683361052, "num_tokens": 34431159.0, "step": 15015 }, { "entropy": 5.154812479019165, "epoch": 1.4428434197886648, "grad_norm": 1.25, "learning_rate": 0.0004797773664124403, "loss": 4.9572, "mean_token_accuracy": 0.21588555574417115, "num_tokens": 34442614.0, "step": 15020 }, { "entropy": 5.13772292137146, "epoch": 1.4433237271853987, "grad_norm": 1.1796875, "learning_rate": 0.00047976315894617195, "loss": 4.9335, "mean_token_accuracy": 0.21881027668714523, "num_tokens": 34453984.0, "step": 15025 }, { "entropy": 5.1489208221435545, "epoch": 1.4438040345821326, "grad_norm": 1.2265625, "learning_rate": 0.000479748946725911, "loss": 4.9391, "mean_token_accuracy": 0.21688321828842164, "num_tokens": 34466296.0, "step": 15030 }, { "entropy": 5.300703620910644, "epoch": 1.4442843419788665, "grad_norm": 1.2421875, "learning_rate": 0.0004797347297519875, "loss": 5.1447, "mean_token_accuracy": 0.20751263648271562, "num_tokens": 34478088.0, "step": 15035 }, { "entropy": 5.195109748840332, "epoch": 1.4447646493756003, "grad_norm": 1.1796875, "learning_rate": 0.00047972050802473154, "loss": 5.047, "mean_token_accuracy": 0.20686309933662414, "num_tokens": 34491664.0, "step": 15040 }, { "entropy": 5.13297290802002, "epoch": 1.4452449567723342, "grad_norm": 1.15625, "learning_rate": 0.0004797062815444733, "loss": 4.9291, "mean_token_accuracy": 0.21764905750751495, "num_tokens": 34502977.0, "step": 15045 }, { "entropy": 5.207884359359741, "epoch": 1.4457252641690683, "grad_norm": 1.359375, "learning_rate": 0.000479692050311543, "loss": 4.8953, "mean_token_accuracy": 0.2181214064359665, "num_tokens": 34515067.0, "step": 15050 }, { "entropy": 5.223143815994263, "epoch": 1.4462055715658022, "grad_norm": 1.21875, "learning_rate": 0.0004796778143262711, "loss": 5.0498, "mean_token_accuracy": 0.2105468362569809, "num_tokens": 34525012.0, "step": 15055 }, { "entropy": 5.092001056671142, "epoch": 1.446685878962536, "grad_norm": 1.359375, "learning_rate": 0.0004796635735889882, "loss": 4.8538, "mean_token_accuracy": 0.22900600135326385, "num_tokens": 34535789.0, "step": 15060 }, { "entropy": 5.198391342163086, "epoch": 1.44716618635927, "grad_norm": 1.2734375, "learning_rate": 0.00047964932810002476, "loss": 4.9676, "mean_token_accuracy": 0.21989088952541352, "num_tokens": 34546276.0, "step": 15065 }, { "entropy": 5.198813486099243, "epoch": 1.4476464937560038, "grad_norm": 1.3359375, "learning_rate": 0.0004796350778597117, "loss": 4.9705, "mean_token_accuracy": 0.21042503118515016, "num_tokens": 34558361.0, "step": 15070 }, { "entropy": 5.154997491836548, "epoch": 1.4481268011527377, "grad_norm": 1.2578125, "learning_rate": 0.0004796208228683796, "loss": 4.9392, "mean_token_accuracy": 0.21781230419874192, "num_tokens": 34569482.0, "step": 15075 }, { "entropy": 5.262822818756104, "epoch": 1.4486071085494716, "grad_norm": 1.234375, "learning_rate": 0.00047960656312635977, "loss": 5.0336, "mean_token_accuracy": 0.2166367918252945, "num_tokens": 34580128.0, "step": 15080 }, { "entropy": 5.16861662864685, "epoch": 1.4490874159462055, "grad_norm": 1.1171875, "learning_rate": 0.0004795922986339831, "loss": 4.9457, "mean_token_accuracy": 0.21179744154214858, "num_tokens": 34591105.0, "step": 15085 }, { "entropy": 5.203324699401856, "epoch": 1.4495677233429394, "grad_norm": 1.140625, "learning_rate": 0.00047957802939158057, "loss": 4.9878, "mean_token_accuracy": 0.21329084187746047, "num_tokens": 34602618.0, "step": 15090 }, { "entropy": 5.227561187744141, "epoch": 1.4500480307396733, "grad_norm": 1.2109375, "learning_rate": 0.0004795637553994838, "loss": 5.0729, "mean_token_accuracy": 0.2020048052072525, "num_tokens": 34614179.0, "step": 15095 }, { "entropy": 5.321483945846557, "epoch": 1.4505283381364074, "grad_norm": 1.125, "learning_rate": 0.00047954947665802404, "loss": 5.0928, "mean_token_accuracy": 0.2034539520740509, "num_tokens": 34625456.0, "step": 15100 }, { "entropy": 5.186492490768432, "epoch": 1.4510086455331412, "grad_norm": 1.1328125, "learning_rate": 0.0004795351931675329, "loss": 4.9536, "mean_token_accuracy": 0.22268653959035872, "num_tokens": 34636268.0, "step": 15105 }, { "entropy": 5.138030385971069, "epoch": 1.4514889529298751, "grad_norm": 1.1640625, "learning_rate": 0.0004795209049283419, "loss": 4.9032, "mean_token_accuracy": 0.22105590552091597, "num_tokens": 34647665.0, "step": 15110 }, { "entropy": 5.2529072761535645, "epoch": 1.451969260326609, "grad_norm": 1.359375, "learning_rate": 0.0004795066119407827, "loss": 5.0009, "mean_token_accuracy": 0.21218062788248063, "num_tokens": 34659965.0, "step": 15115 }, { "entropy": 5.214857578277588, "epoch": 1.4524495677233429, "grad_norm": 1.1796875, "learning_rate": 0.0004794923142051873, "loss": 5.052, "mean_token_accuracy": 0.21157704889774323, "num_tokens": 34671724.0, "step": 15120 }, { "entropy": 5.126708841323852, "epoch": 1.452929875120077, "grad_norm": 1.1796875, "learning_rate": 0.00047947801172188755, "loss": 4.9103, "mean_token_accuracy": 0.22448884695768356, "num_tokens": 34682445.0, "step": 15125 }, { "entropy": 5.107527399063111, "epoch": 1.4534101825168109, "grad_norm": 1.2734375, "learning_rate": 0.0004794637044912155, "loss": 4.8546, "mean_token_accuracy": 0.22458722293376923, "num_tokens": 34693621.0, "step": 15130 }, { "entropy": 5.1660699367523195, "epoch": 1.4538904899135447, "grad_norm": 1.40625, "learning_rate": 0.0004794493925135034, "loss": 4.8351, "mean_token_accuracy": 0.227722430229187, "num_tokens": 34703978.0, "step": 15135 }, { "entropy": 5.199901390075683, "epoch": 1.4543707973102786, "grad_norm": 1.5390625, "learning_rate": 0.00047943507578908357, "loss": 5.0363, "mean_token_accuracy": 0.21987725645303727, "num_tokens": 34715468.0, "step": 15140 }, { "entropy": 5.237486171722412, "epoch": 1.4548511047070125, "grad_norm": 1.3828125, "learning_rate": 0.0004794207543182883, "loss": 4.9965, "mean_token_accuracy": 0.21251980364322662, "num_tokens": 34726383.0, "step": 15145 }, { "entropy": 5.179723453521729, "epoch": 1.4553314121037464, "grad_norm": 1.34375, "learning_rate": 0.00047940642810145005, "loss": 5.0156, "mean_token_accuracy": 0.21433500498533248, "num_tokens": 34737123.0, "step": 15150 }, { "entropy": 5.0879114151000975, "epoch": 1.4558117195004803, "grad_norm": 1.2421875, "learning_rate": 0.00047939209713890156, "loss": 4.9252, "mean_token_accuracy": 0.22000515311956406, "num_tokens": 34749197.0, "step": 15155 }, { "entropy": 5.225097751617431, "epoch": 1.4562920268972142, "grad_norm": 1.515625, "learning_rate": 0.00047937776143097547, "loss": 4.9765, "mean_token_accuracy": 0.2143160358071327, "num_tokens": 34759785.0, "step": 15160 }, { "entropy": 5.112883234024048, "epoch": 1.456772334293948, "grad_norm": 1.2734375, "learning_rate": 0.0004793634209780047, "loss": 4.8375, "mean_token_accuracy": 0.22548486590385436, "num_tokens": 34770938.0, "step": 15165 }, { "entropy": 5.128720092773437, "epoch": 1.457252641690682, "grad_norm": 1.2734375, "learning_rate": 0.0004793490757803221, "loss": 4.8902, "mean_token_accuracy": 0.22066261917352675, "num_tokens": 34782126.0, "step": 15170 }, { "entropy": 5.119160270690918, "epoch": 1.457732949087416, "grad_norm": 1.453125, "learning_rate": 0.00047933472583826063, "loss": 4.9154, "mean_token_accuracy": 0.21358481496572496, "num_tokens": 34793802.0, "step": 15175 }, { "entropy": 5.1480120658874515, "epoch": 1.45821325648415, "grad_norm": 1.140625, "learning_rate": 0.0004793203711521537, "loss": 4.9813, "mean_token_accuracy": 0.22073666751384735, "num_tokens": 34805696.0, "step": 15180 }, { "entropy": 5.296039390563965, "epoch": 1.4586935638808838, "grad_norm": 1.3984375, "learning_rate": 0.00047930601172233446, "loss": 5.1314, "mean_token_accuracy": 0.2063765347003937, "num_tokens": 34818679.0, "step": 15185 }, { "entropy": 5.297084808349609, "epoch": 1.4591738712776177, "grad_norm": 1.1484375, "learning_rate": 0.00047929164754913624, "loss": 4.9855, "mean_token_accuracy": 0.21749197095632553, "num_tokens": 34830528.0, "step": 15190 }, { "entropy": 5.171178531646729, "epoch": 1.4596541786743515, "grad_norm": 1.2890625, "learning_rate": 0.0004792772786328926, "loss": 4.8748, "mean_token_accuracy": 0.22240075021982192, "num_tokens": 34841621.0, "step": 15195 }, { "entropy": 5.087448406219482, "epoch": 1.4601344860710854, "grad_norm": 1.21875, "learning_rate": 0.00047926290497393714, "loss": 4.8775, "mean_token_accuracy": 0.2203219324350357, "num_tokens": 34854448.0, "step": 15200 }, { "entropy": 5.195474147796631, "epoch": 1.4606147934678195, "grad_norm": 1.3046875, "learning_rate": 0.0004792485265726036, "loss": 4.996, "mean_token_accuracy": 0.2099252760410309, "num_tokens": 34866492.0, "step": 15205 }, { "entropy": 5.18867597579956, "epoch": 1.4610951008645534, "grad_norm": 1.2109375, "learning_rate": 0.0004792341434292257, "loss": 4.958, "mean_token_accuracy": 0.21405645608901977, "num_tokens": 34876869.0, "step": 15210 }, { "entropy": 5.122355937957764, "epoch": 1.4615754082612873, "grad_norm": 1.2109375, "learning_rate": 0.0004792197555441374, "loss": 4.8834, "mean_token_accuracy": 0.22930939495563507, "num_tokens": 34888449.0, "step": 15215 }, { "entropy": 5.125897169113159, "epoch": 1.4620557156580212, "grad_norm": 1.1796875, "learning_rate": 0.0004792053629176729, "loss": 4.9055, "mean_token_accuracy": 0.2263544738292694, "num_tokens": 34898124.0, "step": 15220 }, { "entropy": 5.202107191085815, "epoch": 1.462536023054755, "grad_norm": 1.3828125, "learning_rate": 0.0004791909655501662, "loss": 4.8921, "mean_token_accuracy": 0.22310193479061127, "num_tokens": 34909128.0, "step": 15225 }, { "entropy": 5.092991304397583, "epoch": 1.463016330451489, "grad_norm": 1.4375, "learning_rate": 0.0004791765634419516, "loss": 4.9389, "mean_token_accuracy": 0.2171325519680977, "num_tokens": 34920541.0, "step": 15230 }, { "entropy": 5.047167062759399, "epoch": 1.4634966378482228, "grad_norm": 1.7734375, "learning_rate": 0.00047916215659336343, "loss": 4.8782, "mean_token_accuracy": 0.22050851583480835, "num_tokens": 34931605.0, "step": 15235 }, { "entropy": 5.168670988082885, "epoch": 1.4639769452449567, "grad_norm": 1.5078125, "learning_rate": 0.0004791477450047363, "loss": 4.9172, "mean_token_accuracy": 0.22224834561347961, "num_tokens": 34943057.0, "step": 15240 }, { "entropy": 5.2066905975341795, "epoch": 1.4644572526416906, "grad_norm": 1.4609375, "learning_rate": 0.00047913332867640464, "loss": 5.085, "mean_token_accuracy": 0.20473618805408478, "num_tokens": 34954386.0, "step": 15245 }, { "entropy": 5.141163301467896, "epoch": 1.4649375600384245, "grad_norm": 1.2890625, "learning_rate": 0.0004791189076087033, "loss": 4.9266, "mean_token_accuracy": 0.21532471030950545, "num_tokens": 34965874.0, "step": 15250 }, { "entropy": 5.232634353637695, "epoch": 1.4654178674351586, "grad_norm": 1.2734375, "learning_rate": 0.00047910448180196703, "loss": 5.0222, "mean_token_accuracy": 0.2117237016558647, "num_tokens": 34977408.0, "step": 15255 }, { "entropy": 5.1937174797058105, "epoch": 1.4658981748318924, "grad_norm": 1.4765625, "learning_rate": 0.0004790900512565307, "loss": 4.8537, "mean_token_accuracy": 0.22348989248275758, "num_tokens": 34987788.0, "step": 15260 }, { "entropy": 5.230491399765015, "epoch": 1.4663784822286263, "grad_norm": 1.171875, "learning_rate": 0.0004790756159727294, "loss": 5.0276, "mean_token_accuracy": 0.2132936492562294, "num_tokens": 35001051.0, "step": 15265 }, { "entropy": 5.185359954833984, "epoch": 1.4668587896253602, "grad_norm": 1.1640625, "learning_rate": 0.00047906117595089835, "loss": 4.969, "mean_token_accuracy": 0.21702387034893036, "num_tokens": 35012621.0, "step": 15270 }, { "entropy": 5.178160524368286, "epoch": 1.467339097022094, "grad_norm": 1.4140625, "learning_rate": 0.0004790467311913727, "loss": 4.9673, "mean_token_accuracy": 0.21435904800891875, "num_tokens": 35023789.0, "step": 15275 }, { "entropy": 5.145422744750976, "epoch": 1.4678194044188282, "grad_norm": 1.2578125, "learning_rate": 0.0004790322816944879, "loss": 4.947, "mean_token_accuracy": 0.22396451681852342, "num_tokens": 35035839.0, "step": 15280 }, { "entropy": 5.2104826927185055, "epoch": 1.468299711815562, "grad_norm": 1.234375, "learning_rate": 0.0004790178274605793, "loss": 5.0371, "mean_token_accuracy": 0.20950869023799895, "num_tokens": 35047823.0, "step": 15285 }, { "entropy": 5.244489479064941, "epoch": 1.468780019212296, "grad_norm": 1.2421875, "learning_rate": 0.00047900336848998254, "loss": 5.0496, "mean_token_accuracy": 0.2074924662709236, "num_tokens": 35058597.0, "step": 15290 }, { "entropy": 5.127735662460327, "epoch": 1.4692603266090298, "grad_norm": 1.2578125, "learning_rate": 0.0004789889047830334, "loss": 4.9085, "mean_token_accuracy": 0.22231007516384124, "num_tokens": 35069822.0, "step": 15295 }, { "entropy": 5.178108882904053, "epoch": 1.4697406340057637, "grad_norm": 1.234375, "learning_rate": 0.00047897443634006766, "loss": 4.9835, "mean_token_accuracy": 0.21558043211698533, "num_tokens": 35081423.0, "step": 15300 }, { "entropy": 5.246568965911865, "epoch": 1.4702209414024976, "grad_norm": 1.15625, "learning_rate": 0.0004789599631614211, "loss": 5.0002, "mean_token_accuracy": 0.21287845075130463, "num_tokens": 35092565.0, "step": 15305 }, { "entropy": 5.18705940246582, "epoch": 1.4707012487992315, "grad_norm": 1.1484375, "learning_rate": 0.0004789454852474298, "loss": 4.941, "mean_token_accuracy": 0.21825831830501558, "num_tokens": 35103811.0, "step": 15310 }, { "entropy": 5.2444439888000485, "epoch": 1.4711815561959654, "grad_norm": 1.34375, "learning_rate": 0.0004789310025984299, "loss": 5.07, "mean_token_accuracy": 0.20772210359573365, "num_tokens": 35115516.0, "step": 15315 }, { "entropy": 5.217182779312134, "epoch": 1.4716618635926992, "grad_norm": 1.3125, "learning_rate": 0.00047891651521475776, "loss": 5.0205, "mean_token_accuracy": 0.2142233058810234, "num_tokens": 35127285.0, "step": 15320 }, { "entropy": 5.153268194198608, "epoch": 1.4721421709894331, "grad_norm": 1.1015625, "learning_rate": 0.00047890202309674963, "loss": 4.9433, "mean_token_accuracy": 0.21643016785383223, "num_tokens": 35137884.0, "step": 15325 }, { "entropy": 5.276388359069824, "epoch": 1.4726224783861672, "grad_norm": 1.421875, "learning_rate": 0.00047888752624474195, "loss": 5.1031, "mean_token_accuracy": 0.20545923113822936, "num_tokens": 35149935.0, "step": 15330 }, { "entropy": 5.2142415046691895, "epoch": 1.473102785782901, "grad_norm": 1.46875, "learning_rate": 0.0004788730246590714, "loss": 5.0424, "mean_token_accuracy": 0.21042255759239198, "num_tokens": 35162610.0, "step": 15335 }, { "entropy": 5.180163049697876, "epoch": 1.473583093179635, "grad_norm": 1.34375, "learning_rate": 0.00047885851834007456, "loss": 4.9073, "mean_token_accuracy": 0.2148707166314125, "num_tokens": 35174799.0, "step": 15340 }, { "entropy": 5.1315391063690186, "epoch": 1.4740634005763689, "grad_norm": 1.203125, "learning_rate": 0.00047884400728808824, "loss": 4.9346, "mean_token_accuracy": 0.2183023527264595, "num_tokens": 35186004.0, "step": 15345 }, { "entropy": 5.175625896453857, "epoch": 1.4745437079731027, "grad_norm": 1.1640625, "learning_rate": 0.0004788294915034494, "loss": 4.9593, "mean_token_accuracy": 0.2172788307070732, "num_tokens": 35197310.0, "step": 15350 }, { "entropy": 5.120343971252441, "epoch": 1.4750240153698366, "grad_norm": 1.15625, "learning_rate": 0.000478814970986495, "loss": 4.909, "mean_token_accuracy": 0.22186490893363953, "num_tokens": 35208703.0, "step": 15355 }, { "entropy": 5.156807708740234, "epoch": 1.4755043227665707, "grad_norm": 1.21875, "learning_rate": 0.00047880044573756213, "loss": 4.9205, "mean_token_accuracy": 0.22117386311292647, "num_tokens": 35219927.0, "step": 15360 }, { "entropy": 5.186833000183105, "epoch": 1.4759846301633046, "grad_norm": 1.265625, "learning_rate": 0.00047878591575698816, "loss": 4.9142, "mean_token_accuracy": 0.21543453335762025, "num_tokens": 35231077.0, "step": 15365 }, { "entropy": 5.196823215484619, "epoch": 1.4764649375600385, "grad_norm": 1.2421875, "learning_rate": 0.0004787713810451103, "loss": 4.9946, "mean_token_accuracy": 0.2103252202272415, "num_tokens": 35241984.0, "step": 15370 }, { "entropy": 5.159264945983887, "epoch": 1.4769452449567724, "grad_norm": 1.390625, "learning_rate": 0.00047875684160226606, "loss": 4.9422, "mean_token_accuracy": 0.2182306170463562, "num_tokens": 35252717.0, "step": 15375 }, { "entropy": 5.179389905929566, "epoch": 1.4774255523535063, "grad_norm": 1.21875, "learning_rate": 0.000478742297428793, "loss": 5.0073, "mean_token_accuracy": 0.21191850453615188, "num_tokens": 35263916.0, "step": 15380 }, { "entropy": 5.164166069030761, "epoch": 1.4779058597502401, "grad_norm": 1.2109375, "learning_rate": 0.00047872774852502877, "loss": 4.9267, "mean_token_accuracy": 0.21910004168748856, "num_tokens": 35274772.0, "step": 15385 }, { "entropy": 5.11943564414978, "epoch": 1.478386167146974, "grad_norm": 1.1640625, "learning_rate": 0.0004787131948913112, "loss": 4.8781, "mean_token_accuracy": 0.22653010189533235, "num_tokens": 35287150.0, "step": 15390 }, { "entropy": 5.131641054153443, "epoch": 1.478866474543708, "grad_norm": 1.3046875, "learning_rate": 0.00047869863652797806, "loss": 4.8877, "mean_token_accuracy": 0.2227863147854805, "num_tokens": 35298538.0, "step": 15395 }, { "entropy": 5.107625436782837, "epoch": 1.4793467819404418, "grad_norm": 1.2421875, "learning_rate": 0.0004786840734353675, "loss": 4.886, "mean_token_accuracy": 0.22072995603084564, "num_tokens": 35309395.0, "step": 15400 }, { "entropy": 5.136013507843018, "epoch": 1.4798270893371757, "grad_norm": 1.2890625, "learning_rate": 0.00047866950561381756, "loss": 4.9366, "mean_token_accuracy": 0.21733225584030152, "num_tokens": 35320741.0, "step": 15405 }, { "entropy": 5.298266792297364, "epoch": 1.4803073967339098, "grad_norm": 1.203125, "learning_rate": 0.0004786549330636665, "loss": 5.057, "mean_token_accuracy": 0.20781148821115494, "num_tokens": 35331895.0, "step": 15410 }, { "entropy": 5.117559576034546, "epoch": 1.4807877041306436, "grad_norm": 1.1640625, "learning_rate": 0.00047864035578525256, "loss": 4.8407, "mean_token_accuracy": 0.23251519501209258, "num_tokens": 35343775.0, "step": 15415 }, { "entropy": 5.1231804370880125, "epoch": 1.4812680115273775, "grad_norm": 1.34375, "learning_rate": 0.0004786257737789143, "loss": 4.9988, "mean_token_accuracy": 0.21539798378944397, "num_tokens": 35355043.0, "step": 15420 }, { "entropy": 5.244394207000733, "epoch": 1.4817483189241114, "grad_norm": 1.2109375, "learning_rate": 0.0004786111870449902, "loss": 5.0196, "mean_token_accuracy": 0.2094297468662262, "num_tokens": 35365387.0, "step": 15425 }, { "entropy": 5.215351247787476, "epoch": 1.4822286263208453, "grad_norm": 1.3984375, "learning_rate": 0.00047859659558381894, "loss": 4.9363, "mean_token_accuracy": 0.22851166874170303, "num_tokens": 35376400.0, "step": 15430 }, { "entropy": 5.197208881378174, "epoch": 1.4827089337175792, "grad_norm": 1.21875, "learning_rate": 0.00047858199939573935, "loss": 4.987, "mean_token_accuracy": 0.21214037835597993, "num_tokens": 35387315.0, "step": 15435 }, { "entropy": 5.180497694015503, "epoch": 1.4831892411143133, "grad_norm": 1.203125, "learning_rate": 0.00047856739848109014, "loss": 4.981, "mean_token_accuracy": 0.21736457496881484, "num_tokens": 35398666.0, "step": 15440 }, { "entropy": 5.155378150939941, "epoch": 1.4836695485110472, "grad_norm": 1.15625, "learning_rate": 0.00047855279284021046, "loss": 4.96, "mean_token_accuracy": 0.22037553489208223, "num_tokens": 35409192.0, "step": 15445 }, { "entropy": 5.189713001251221, "epoch": 1.484149855907781, "grad_norm": 1.2265625, "learning_rate": 0.00047853818247343933, "loss": 5.0013, "mean_token_accuracy": 0.206741601228714, "num_tokens": 35419812.0, "step": 15450 }, { "entropy": 5.255188465118408, "epoch": 1.484630163304515, "grad_norm": 1.265625, "learning_rate": 0.00047852356738111606, "loss": 4.9344, "mean_token_accuracy": 0.22674974501132966, "num_tokens": 35430875.0, "step": 15455 }, { "entropy": 5.152353525161743, "epoch": 1.4851104707012488, "grad_norm": 1.2734375, "learning_rate": 0.0004785089475635799, "loss": 4.9248, "mean_token_accuracy": 0.21964309960603715, "num_tokens": 35441065.0, "step": 15460 }, { "entropy": 5.103597593307495, "epoch": 1.4855907780979827, "grad_norm": 1.15625, "learning_rate": 0.00047849432302117024, "loss": 4.9745, "mean_token_accuracy": 0.2140120819211006, "num_tokens": 35452164.0, "step": 15465 }, { "entropy": 5.235323476791382, "epoch": 1.4860710854947166, "grad_norm": 1.296875, "learning_rate": 0.00047847969375422656, "loss": 5.0663, "mean_token_accuracy": 0.20626734495162963, "num_tokens": 35463158.0, "step": 15470 }, { "entropy": 5.090017223358155, "epoch": 1.4865513928914504, "grad_norm": 1.2890625, "learning_rate": 0.0004784650597630887, "loss": 4.8789, "mean_token_accuracy": 0.22733232527971267, "num_tokens": 35474153.0, "step": 15475 }, { "entropy": 5.211921691894531, "epoch": 1.4870317002881843, "grad_norm": 1.296875, "learning_rate": 0.00047845042104809635, "loss": 4.9649, "mean_token_accuracy": 0.21242944300174713, "num_tokens": 35485680.0, "step": 15480 }, { "entropy": 5.209507083892822, "epoch": 1.4875120076849184, "grad_norm": 1.171875, "learning_rate": 0.0004784357776095892, "loss": 5.0195, "mean_token_accuracy": 0.215239979326725, "num_tokens": 35497271.0, "step": 15485 }, { "entropy": 5.190091228485107, "epoch": 1.4879923150816523, "grad_norm": 1.2421875, "learning_rate": 0.0004784211294479075, "loss": 4.9131, "mean_token_accuracy": 0.22402856945991517, "num_tokens": 35509166.0, "step": 15490 }, { "entropy": 5.165255784988403, "epoch": 1.4884726224783862, "grad_norm": 1.3125, "learning_rate": 0.0004784064765633912, "loss": 4.9992, "mean_token_accuracy": 0.2183140769600868, "num_tokens": 35521289.0, "step": 15495 }, { "entropy": 5.160954904556275, "epoch": 1.48895292987512, "grad_norm": 1.203125, "learning_rate": 0.00047839181895638057, "loss": 4.9491, "mean_token_accuracy": 0.22083631306886672, "num_tokens": 35532179.0, "step": 15500 }, { "entropy": 5.308222866058349, "epoch": 1.489433237271854, "grad_norm": 1.4140625, "learning_rate": 0.00047837715662721575, "loss": 5.1324, "mean_token_accuracy": 0.2071303442120552, "num_tokens": 35544703.0, "step": 15505 }, { "entropy": 5.273142337799072, "epoch": 1.4899135446685878, "grad_norm": 1.296875, "learning_rate": 0.0004783624895762372, "loss": 5.1179, "mean_token_accuracy": 0.2036224529147148, "num_tokens": 35557853.0, "step": 15510 }, { "entropy": 5.17064061164856, "epoch": 1.490393852065322, "grad_norm": 1.2421875, "learning_rate": 0.00047834781780378563, "loss": 4.8318, "mean_token_accuracy": 0.22622861266136168, "num_tokens": 35570340.0, "step": 15515 }, { "entropy": 5.192807006835937, "epoch": 1.4908741594620558, "grad_norm": 1.2890625, "learning_rate": 0.0004783331413102015, "loss": 5.0187, "mean_token_accuracy": 0.21527829617261887, "num_tokens": 35582387.0, "step": 15520 }, { "entropy": 5.2773651599884035, "epoch": 1.4913544668587897, "grad_norm": 1.2109375, "learning_rate": 0.00047831846009582557, "loss": 5.07, "mean_token_accuracy": 0.206882107257843, "num_tokens": 35595105.0, "step": 15525 }, { "entropy": 5.235234880447388, "epoch": 1.4918347742555236, "grad_norm": 1.3671875, "learning_rate": 0.0004783037741609988, "loss": 5.0424, "mean_token_accuracy": 0.2106972947716713, "num_tokens": 35607160.0, "step": 15530 }, { "entropy": 5.196556234359742, "epoch": 1.4923150816522575, "grad_norm": 1.328125, "learning_rate": 0.0004782890835060621, "loss": 4.9382, "mean_token_accuracy": 0.21910466104745865, "num_tokens": 35619097.0, "step": 15535 }, { "entropy": 5.073312139511108, "epoch": 1.4927953890489913, "grad_norm": 1.2265625, "learning_rate": 0.0004782743881313564, "loss": 4.8311, "mean_token_accuracy": 0.22151407450437546, "num_tokens": 35629868.0, "step": 15540 }, { "entropy": 5.171602392196656, "epoch": 1.4932756964457252, "grad_norm": 1.5078125, "learning_rate": 0.00047825968803722315, "loss": 4.9882, "mean_token_accuracy": 0.21382750123739241, "num_tokens": 35640622.0, "step": 15545 }, { "entropy": 5.203648948669434, "epoch": 1.493756003842459, "grad_norm": 1.328125, "learning_rate": 0.0004782449832240035, "loss": 4.9731, "mean_token_accuracy": 0.21205914914608, "num_tokens": 35652383.0, "step": 15550 }, { "entropy": 5.1007692337036135, "epoch": 1.494236311239193, "grad_norm": 1.2265625, "learning_rate": 0.0004782302736920387, "loss": 4.9049, "mean_token_accuracy": 0.2167341247200966, "num_tokens": 35663838.0, "step": 15555 }, { "entropy": 5.2010805130004885, "epoch": 1.4947166186359269, "grad_norm": 1.2890625, "learning_rate": 0.0004782155594416705, "loss": 4.9483, "mean_token_accuracy": 0.2152695655822754, "num_tokens": 35674564.0, "step": 15560 }, { "entropy": 5.242137813568116, "epoch": 1.495196926032661, "grad_norm": 1.1953125, "learning_rate": 0.00047820084047324045, "loss": 4.968, "mean_token_accuracy": 0.21514491289854049, "num_tokens": 35685518.0, "step": 15565 }, { "entropy": 5.261635780334473, "epoch": 1.4956772334293948, "grad_norm": 1.25, "learning_rate": 0.00047818611678709027, "loss": 4.9776, "mean_token_accuracy": 0.215865059196949, "num_tokens": 35696597.0, "step": 15570 }, { "entropy": 5.122075605392456, "epoch": 1.4961575408261287, "grad_norm": 1.1484375, "learning_rate": 0.0004781713883835618, "loss": 4.9016, "mean_token_accuracy": 0.22335670590400697, "num_tokens": 35707229.0, "step": 15575 }, { "entropy": 5.1522300243377686, "epoch": 1.4966378482228626, "grad_norm": 1.1875, "learning_rate": 0.00047815665526299695, "loss": 4.9901, "mean_token_accuracy": 0.21233994662761688, "num_tokens": 35719440.0, "step": 15580 }, { "entropy": 5.230376529693603, "epoch": 1.4971181556195965, "grad_norm": 1.3828125, "learning_rate": 0.0004781419174257378, "loss": 4.9846, "mean_token_accuracy": 0.20775482654571534, "num_tokens": 35731611.0, "step": 15585 }, { "entropy": 5.1648476123809814, "epoch": 1.4975984630163304, "grad_norm": 1.1953125, "learning_rate": 0.0004781271748721266, "loss": 4.9783, "mean_token_accuracy": 0.21286329627037048, "num_tokens": 35743047.0, "step": 15590 }, { "entropy": 5.227255868911743, "epoch": 1.4980787704130645, "grad_norm": 1.3125, "learning_rate": 0.0004781124276025055, "loss": 4.9184, "mean_token_accuracy": 0.23288827687501906, "num_tokens": 35753499.0, "step": 15595 }, { "entropy": 5.215896415710449, "epoch": 1.4985590778097984, "grad_norm": 1.3046875, "learning_rate": 0.000478097675617217, "loss": 5.1346, "mean_token_accuracy": 0.21066973358392715, "num_tokens": 35764682.0, "step": 15600 }, { "entropy": 5.239116668701172, "epoch": 1.4990393852065322, "grad_norm": 1.234375, "learning_rate": 0.00047808291891660357, "loss": 4.9312, "mean_token_accuracy": 0.22250870913267135, "num_tokens": 35775160.0, "step": 15605 }, { "entropy": 5.216899585723877, "epoch": 1.4995196926032661, "grad_norm": 1.5, "learning_rate": 0.00047806815750100774, "loss": 4.9735, "mean_token_accuracy": 0.21689383089542388, "num_tokens": 35786089.0, "step": 15610 }, { "entropy": 5.17706995010376, "epoch": 1.5, "grad_norm": 1.3515625, "learning_rate": 0.0004780533913707723, "loss": 4.9976, "mean_token_accuracy": 0.21321234852075577, "num_tokens": 35796851.0, "step": 15615 }, { "entropy": 5.229093360900879, "epoch": 1.5004803073967339, "grad_norm": 1.2890625, "learning_rate": 0.00047803862052624006, "loss": 5.0117, "mean_token_accuracy": 0.20809693783521652, "num_tokens": 35808553.0, "step": 15620 }, { "entropy": 5.275240278244018, "epoch": 1.5009606147934678, "grad_norm": 1.25, "learning_rate": 0.00047802384496775397, "loss": 5.1488, "mean_token_accuracy": 0.21014924496412277, "num_tokens": 35820108.0, "step": 15625 }, { "entropy": 5.187834882736206, "epoch": 1.5014409221902016, "grad_norm": 1.1953125, "learning_rate": 0.0004780090646956571, "loss": 4.8862, "mean_token_accuracy": 0.22773226201534272, "num_tokens": 35831672.0, "step": 15630 }, { "entropy": 5.280662202835083, "epoch": 1.5019212295869355, "grad_norm": 1.2890625, "learning_rate": 0.00047799427971029245, "loss": 5.0788, "mean_token_accuracy": 0.21054953187704087, "num_tokens": 35843164.0, "step": 15635 }, { "entropy": 5.225699520111084, "epoch": 1.5024015369836694, "grad_norm": 1.5, "learning_rate": 0.0004779794900120034, "loss": 5.0723, "mean_token_accuracy": 0.21249438375234603, "num_tokens": 35854677.0, "step": 15640 }, { "entropy": 5.19042010307312, "epoch": 1.5028818443804035, "grad_norm": 1.234375, "learning_rate": 0.0004779646956011334, "loss": 4.9617, "mean_token_accuracy": 0.22047783583402633, "num_tokens": 35865956.0, "step": 15645 }, { "entropy": 5.176256704330444, "epoch": 1.5033621517771374, "grad_norm": 1.34375, "learning_rate": 0.00047794989647802574, "loss": 4.9709, "mean_token_accuracy": 0.2125203862786293, "num_tokens": 35877451.0, "step": 15650 }, { "entropy": 5.194400215148926, "epoch": 1.5038424591738713, "grad_norm": 1.2890625, "learning_rate": 0.00047793509264302424, "loss": 4.9537, "mean_token_accuracy": 0.21531011760234833, "num_tokens": 35888436.0, "step": 15655 }, { "entropy": 5.187810611724854, "epoch": 1.5043227665706052, "grad_norm": 1.2265625, "learning_rate": 0.00047792028409647237, "loss": 4.9621, "mean_token_accuracy": 0.2149658814072609, "num_tokens": 35901010.0, "step": 15660 }, { "entropy": 5.213496112823487, "epoch": 1.5048030739673393, "grad_norm": 1.2734375, "learning_rate": 0.00047790547083871414, "loss": 4.9768, "mean_token_accuracy": 0.21322050243616103, "num_tokens": 35912697.0, "step": 15665 }, { "entropy": 5.153905248641967, "epoch": 1.5052833813640731, "grad_norm": 1.4453125, "learning_rate": 0.00047789065287009335, "loss": 4.8969, "mean_token_accuracy": 0.22069223672151567, "num_tokens": 35924614.0, "step": 15670 }, { "entropy": 5.203504943847657, "epoch": 1.505763688760807, "grad_norm": 1.4375, "learning_rate": 0.0004778758301909542, "loss": 4.9809, "mean_token_accuracy": 0.2169592186808586, "num_tokens": 35935492.0, "step": 15675 }, { "entropy": 5.098133087158203, "epoch": 1.506243996157541, "grad_norm": 1.125, "learning_rate": 0.0004778610028016405, "loss": 4.9889, "mean_token_accuracy": 0.21588987559080125, "num_tokens": 35947901.0, "step": 15680 }, { "entropy": 5.133358764648437, "epoch": 1.5067243035542748, "grad_norm": 1.3828125, "learning_rate": 0.0004778461707024967, "loss": 4.8208, "mean_token_accuracy": 0.22946203052997588, "num_tokens": 35959690.0, "step": 15685 }, { "entropy": 5.206504774093628, "epoch": 1.5072046109510087, "grad_norm": 1.296875, "learning_rate": 0.0004778313338938672, "loss": 4.9199, "mean_token_accuracy": 0.22398556172847747, "num_tokens": 35971209.0, "step": 15690 }, { "entropy": 5.120486497879028, "epoch": 1.5076849183477425, "grad_norm": 1.3984375, "learning_rate": 0.00047781649237609643, "loss": 4.8075, "mean_token_accuracy": 0.2294871136546135, "num_tokens": 35981795.0, "step": 15695 }, { "entropy": 5.0691118240356445, "epoch": 1.5081652257444764, "grad_norm": 1.40625, "learning_rate": 0.0004778016461495289, "loss": 4.9587, "mean_token_accuracy": 0.21469815373420714, "num_tokens": 35993358.0, "step": 15700 }, { "entropy": 5.153229188919068, "epoch": 1.5086455331412103, "grad_norm": 1.4453125, "learning_rate": 0.0004777867952145094, "loss": 4.9904, "mean_token_accuracy": 0.22090202271938325, "num_tokens": 36005353.0, "step": 15705 }, { "entropy": 5.269125986099243, "epoch": 1.5091258405379442, "grad_norm": 1.2734375, "learning_rate": 0.0004777719395713826, "loss": 4.9591, "mean_token_accuracy": 0.21391933113336564, "num_tokens": 36017510.0, "step": 15710 }, { "entropy": 5.196733570098877, "epoch": 1.509606147934678, "grad_norm": 1.3515625, "learning_rate": 0.00047775707922049354, "loss": 4.9517, "mean_token_accuracy": 0.21790158450603486, "num_tokens": 36028687.0, "step": 15715 }, { "entropy": 5.108423948287964, "epoch": 1.510086455331412, "grad_norm": 1.2890625, "learning_rate": 0.0004777422141621871, "loss": 4.8758, "mean_token_accuracy": 0.21638176292181016, "num_tokens": 36039851.0, "step": 15720 }, { "entropy": 5.204169845581054, "epoch": 1.510566762728146, "grad_norm": 1.234375, "learning_rate": 0.0004777273443968085, "loss": 5.0531, "mean_token_accuracy": 0.21372610628604888, "num_tokens": 36050776.0, "step": 15725 }, { "entropy": 5.132400417327881, "epoch": 1.51104707012488, "grad_norm": 1.328125, "learning_rate": 0.0004777124699247029, "loss": 4.8719, "mean_token_accuracy": 0.21945572644472122, "num_tokens": 36062151.0, "step": 15730 }, { "entropy": 5.187724494934082, "epoch": 1.5115273775216138, "grad_norm": 1.2734375, "learning_rate": 0.0004776975907462157, "loss": 4.9771, "mean_token_accuracy": 0.21894902735948563, "num_tokens": 36074601.0, "step": 15735 }, { "entropy": 5.149637079238891, "epoch": 1.5120076849183477, "grad_norm": 1.7890625, "learning_rate": 0.0004776827068616924, "loss": 4.8867, "mean_token_accuracy": 0.22149800211191178, "num_tokens": 36085436.0, "step": 15740 }, { "entropy": 5.249966144561768, "epoch": 1.5124879923150818, "grad_norm": 1.3515625, "learning_rate": 0.0004776678182714785, "loss": 5.0655, "mean_token_accuracy": 0.20393786877393721, "num_tokens": 36097589.0, "step": 15745 }, { "entropy": 5.1826738834381105, "epoch": 1.5129682997118157, "grad_norm": 1.296875, "learning_rate": 0.00047765292497591955, "loss": 4.9858, "mean_token_accuracy": 0.21298189610242843, "num_tokens": 36109993.0, "step": 15750 }, { "entropy": 5.207590389251709, "epoch": 1.5134486071085496, "grad_norm": 1.265625, "learning_rate": 0.00047763802697536146, "loss": 4.924, "mean_token_accuracy": 0.21502208560705185, "num_tokens": 36122439.0, "step": 15755 }, { "entropy": 5.2367846965789795, "epoch": 1.5139289145052834, "grad_norm": 1.328125, "learning_rate": 0.00047762312427015015, "loss": 4.9541, "mean_token_accuracy": 0.21168054342269899, "num_tokens": 36133867.0, "step": 15760 }, { "entropy": 5.259883260726928, "epoch": 1.5144092219020173, "grad_norm": 1.1875, "learning_rate": 0.00047760821686063153, "loss": 5.053, "mean_token_accuracy": 0.20947152823209764, "num_tokens": 36144202.0, "step": 15765 }, { "entropy": 5.237866592407227, "epoch": 1.5148895292987512, "grad_norm": 1.53125, "learning_rate": 0.00047759330474715173, "loss": 4.9472, "mean_token_accuracy": 0.22170411497354509, "num_tokens": 36154863.0, "step": 15770 }, { "entropy": 5.215115213394165, "epoch": 1.515369836695485, "grad_norm": 1.2109375, "learning_rate": 0.00047757838793005704, "loss": 4.9774, "mean_token_accuracy": 0.21179027259349822, "num_tokens": 36166360.0, "step": 15775 }, { "entropy": 5.117362546920776, "epoch": 1.515850144092219, "grad_norm": 1.5703125, "learning_rate": 0.00047756346640969366, "loss": 4.8669, "mean_token_accuracy": 0.22667350769042968, "num_tokens": 36177477.0, "step": 15780 }, { "entropy": 5.210712575912476, "epoch": 1.5163304514889528, "grad_norm": 1.28125, "learning_rate": 0.00047754854018640803, "loss": 4.9971, "mean_token_accuracy": 0.21386642158031463, "num_tokens": 36188510.0, "step": 15785 }, { "entropy": 5.14784517288208, "epoch": 1.5168107588856867, "grad_norm": 1.109375, "learning_rate": 0.00047753360926054684, "loss": 4.8942, "mean_token_accuracy": 0.22038694620132446, "num_tokens": 36199084.0, "step": 15790 }, { "entropy": 5.111806440353393, "epoch": 1.5172910662824206, "grad_norm": 1.203125, "learning_rate": 0.00047751867363245653, "loss": 4.9112, "mean_token_accuracy": 0.21519201546907424, "num_tokens": 36211265.0, "step": 15795 }, { "entropy": 5.292361068725586, "epoch": 1.5177713736791547, "grad_norm": 1.328125, "learning_rate": 0.0004775037333024841, "loss": 5.0752, "mean_token_accuracy": 0.20719213485717775, "num_tokens": 36223070.0, "step": 15800 }, { "entropy": 5.223334217071534, "epoch": 1.5182516810758886, "grad_norm": 1.328125, "learning_rate": 0.0004774887882709762, "loss": 5.0089, "mean_token_accuracy": 0.21609985679388047, "num_tokens": 36235021.0, "step": 15805 }, { "entropy": 5.188208532333374, "epoch": 1.5187319884726225, "grad_norm": 1.5546875, "learning_rate": 0.00047747383853827995, "loss": 4.9597, "mean_token_accuracy": 0.21417346149682998, "num_tokens": 36245647.0, "step": 15810 }, { "entropy": 5.244234657287597, "epoch": 1.5192122958693564, "grad_norm": 1.234375, "learning_rate": 0.0004774588841047424, "loss": 5.0398, "mean_token_accuracy": 0.21030631810426711, "num_tokens": 36257470.0, "step": 15815 }, { "entropy": 5.176792812347412, "epoch": 1.5196926032660905, "grad_norm": 1.2421875, "learning_rate": 0.0004774439249707108, "loss": 4.8951, "mean_token_accuracy": 0.21645855009555817, "num_tokens": 36268839.0, "step": 15820 }, { "entropy": 5.1920037269592285, "epoch": 1.5201729106628243, "grad_norm": 1.6484375, "learning_rate": 0.0004774289611365323, "loss": 5.038, "mean_token_accuracy": 0.20925631374120712, "num_tokens": 36280624.0, "step": 15825 }, { "entropy": 5.211209154129028, "epoch": 1.5206532180595582, "grad_norm": 1.3359375, "learning_rate": 0.00047741399260255434, "loss": 4.9448, "mean_token_accuracy": 0.21624568998813629, "num_tokens": 36292696.0, "step": 15830 }, { "entropy": 5.22960147857666, "epoch": 1.521133525456292, "grad_norm": 1.28125, "learning_rate": 0.00047739901936912467, "loss": 4.9583, "mean_token_accuracy": 0.21953330487012862, "num_tokens": 36303612.0, "step": 15835 }, { "entropy": 5.146418714523316, "epoch": 1.521613832853026, "grad_norm": 1.1640625, "learning_rate": 0.0004773840414365907, "loss": 4.9303, "mean_token_accuracy": 0.21528706550598145, "num_tokens": 36314511.0, "step": 15840 }, { "entropy": 5.289036989212036, "epoch": 1.5220941402497599, "grad_norm": 1.2265625, "learning_rate": 0.00047736905880530026, "loss": 5.0616, "mean_token_accuracy": 0.20672106891870498, "num_tokens": 36327276.0, "step": 15845 }, { "entropy": 5.214120817184448, "epoch": 1.5225744476464937, "grad_norm": 1.21875, "learning_rate": 0.0004773540714756012, "loss": 4.9296, "mean_token_accuracy": 0.21783770322799684, "num_tokens": 36339373.0, "step": 15850 }, { "entropy": 5.239572238922119, "epoch": 1.5230547550432276, "grad_norm": 1.1484375, "learning_rate": 0.00047733907944784144, "loss": 5.0491, "mean_token_accuracy": 0.20820102244615554, "num_tokens": 36351863.0, "step": 15855 }, { "entropy": 5.161839580535888, "epoch": 1.5235350624399615, "grad_norm": 1.2109375, "learning_rate": 0.0004773240827223691, "loss": 4.9524, "mean_token_accuracy": 0.22128331512212754, "num_tokens": 36363961.0, "step": 15860 }, { "entropy": 5.185322666168213, "epoch": 1.5240153698366954, "grad_norm": 1.1875, "learning_rate": 0.0004773090812995323, "loss": 4.9384, "mean_token_accuracy": 0.2248750001192093, "num_tokens": 36374738.0, "step": 15865 }, { "entropy": 5.185145139694214, "epoch": 1.5244956772334293, "grad_norm": 1.359375, "learning_rate": 0.00047729407517967945, "loss": 4.8691, "mean_token_accuracy": 0.22020863592624665, "num_tokens": 36386472.0, "step": 15870 }, { "entropy": 5.170194339752197, "epoch": 1.5249759846301632, "grad_norm": 1.2578125, "learning_rate": 0.00047727906436315884, "loss": 4.956, "mean_token_accuracy": 0.22778922319412231, "num_tokens": 36397042.0, "step": 15875 }, { "entropy": 5.166629409790039, "epoch": 1.5254562920268973, "grad_norm": 1.296875, "learning_rate": 0.00047726404885031895, "loss": 4.9269, "mean_token_accuracy": 0.21769467294216155, "num_tokens": 36408720.0, "step": 15880 }, { "entropy": 5.233714628219604, "epoch": 1.5259365994236311, "grad_norm": 1.359375, "learning_rate": 0.00047724902864150845, "loss": 5.0013, "mean_token_accuracy": 0.2127738893032074, "num_tokens": 36420885.0, "step": 15885 }, { "entropy": 5.250354194641114, "epoch": 1.526416906820365, "grad_norm": 1.25, "learning_rate": 0.00047723400373707607, "loss": 5.0181, "mean_token_accuracy": 0.20683068931102752, "num_tokens": 36433678.0, "step": 15890 }, { "entropy": 5.2336162567138675, "epoch": 1.526897214217099, "grad_norm": 1.6953125, "learning_rate": 0.0004772189741373707, "loss": 5.0423, "mean_token_accuracy": 0.21756250262260438, "num_tokens": 36445143.0, "step": 15895 }, { "entropy": 5.102718687057495, "epoch": 1.527377521613833, "grad_norm": 1.203125, "learning_rate": 0.00047720393984274117, "loss": 4.9456, "mean_token_accuracy": 0.21202410906553268, "num_tokens": 36456214.0, "step": 15900 }, { "entropy": 5.2174307346344, "epoch": 1.5278578290105669, "grad_norm": 1.2734375, "learning_rate": 0.00047718890085353654, "loss": 4.9635, "mean_token_accuracy": 0.2163314238190651, "num_tokens": 36466767.0, "step": 15905 }, { "entropy": 5.3040376663208, "epoch": 1.5283381364073008, "grad_norm": 1.4375, "learning_rate": 0.000477173857170106, "loss": 4.9536, "mean_token_accuracy": 0.2135412722826004, "num_tokens": 36478150.0, "step": 15910 }, { "entropy": 5.167091703414917, "epoch": 1.5288184438040346, "grad_norm": 1.1640625, "learning_rate": 0.00047715880879279894, "loss": 4.9546, "mean_token_accuracy": 0.2112107068300247, "num_tokens": 36488280.0, "step": 15915 }, { "entropy": 5.194524145126342, "epoch": 1.5292987512007685, "grad_norm": 1.21875, "learning_rate": 0.0004771437557219646, "loss": 5.0048, "mean_token_accuracy": 0.21566450595855713, "num_tokens": 36498505.0, "step": 15920 }, { "entropy": 5.1693662166595455, "epoch": 1.5297790585975024, "grad_norm": 1.4140625, "learning_rate": 0.0004771286979579524, "loss": 4.9538, "mean_token_accuracy": 0.2125968560576439, "num_tokens": 36509271.0, "step": 15925 }, { "entropy": 5.234210538864136, "epoch": 1.5302593659942363, "grad_norm": 1.3359375, "learning_rate": 0.0004771136355011121, "loss": 5.0366, "mean_token_accuracy": 0.20775097012519836, "num_tokens": 36520022.0, "step": 15930 }, { "entropy": 5.213120317459106, "epoch": 1.5307396733909702, "grad_norm": 1.4765625, "learning_rate": 0.00047709856835179333, "loss": 4.9638, "mean_token_accuracy": 0.21767441034317017, "num_tokens": 36532769.0, "step": 15935 }, { "entropy": 5.239807176589966, "epoch": 1.531219980787704, "grad_norm": 1.3046875, "learning_rate": 0.00047708349651034586, "loss": 4.9947, "mean_token_accuracy": 0.22149415910243989, "num_tokens": 36544454.0, "step": 15940 }, { "entropy": 5.220270252227783, "epoch": 1.531700288184438, "grad_norm": 1.21875, "learning_rate": 0.00047706841997711974, "loss": 4.9688, "mean_token_accuracy": 0.20992875397205352, "num_tokens": 36555916.0, "step": 15945 }, { "entropy": 5.154624176025391, "epoch": 1.5321805955811718, "grad_norm": 1.34375, "learning_rate": 0.00047705333875246495, "loss": 4.9463, "mean_token_accuracy": 0.21742784678936006, "num_tokens": 36567829.0, "step": 15950 }, { "entropy": 5.20950345993042, "epoch": 1.532660902977906, "grad_norm": 1.234375, "learning_rate": 0.00047703825283673153, "loss": 5.0589, "mean_token_accuracy": 0.21559867709875108, "num_tokens": 36578216.0, "step": 15955 }, { "entropy": 5.240422534942627, "epoch": 1.5331412103746398, "grad_norm": 1.15625, "learning_rate": 0.0004770231622302699, "loss": 4.9537, "mean_token_accuracy": 0.21943466514348983, "num_tokens": 36589945.0, "step": 15960 }, { "entropy": 5.250634670257568, "epoch": 1.5336215177713737, "grad_norm": 1.3359375, "learning_rate": 0.00047700806693343016, "loss": 4.9771, "mean_token_accuracy": 0.21575426161289216, "num_tokens": 36600724.0, "step": 15965 }, { "entropy": 5.060350513458252, "epoch": 1.5341018251681076, "grad_norm": 1.2734375, "learning_rate": 0.00047699296694656316, "loss": 4.8074, "mean_token_accuracy": 0.22741931974887847, "num_tokens": 36611154.0, "step": 15970 }, { "entropy": 5.169518995285034, "epoch": 1.5345821325648417, "grad_norm": 1.265625, "learning_rate": 0.0004769778622700192, "loss": 4.9076, "mean_token_accuracy": 0.21358391046524047, "num_tokens": 36621750.0, "step": 15975 }, { "entropy": 5.205866909027099, "epoch": 1.5350624399615755, "grad_norm": 1.2421875, "learning_rate": 0.00047696275290414885, "loss": 4.9543, "mean_token_accuracy": 0.21394149214029312, "num_tokens": 36633294.0, "step": 15980 }, { "entropy": 5.125529336929321, "epoch": 1.5355427473583094, "grad_norm": 1.234375, "learning_rate": 0.00047694763884930324, "loss": 4.8902, "mean_token_accuracy": 0.2196623682975769, "num_tokens": 36646377.0, "step": 15985 }, { "entropy": 5.141584873199463, "epoch": 1.5360230547550433, "grad_norm": 1.2265625, "learning_rate": 0.00047693252010583314, "loss": 4.8424, "mean_token_accuracy": 0.22554460167884827, "num_tokens": 36656159.0, "step": 15990 }, { "entropy": 5.14939513206482, "epoch": 1.5365033621517772, "grad_norm": 1.1796875, "learning_rate": 0.0004769173966740895, "loss": 4.9904, "mean_token_accuracy": 0.2079702839255333, "num_tokens": 36667522.0, "step": 15995 }, { "entropy": 5.26382999420166, "epoch": 1.536983669548511, "grad_norm": 1.109375, "learning_rate": 0.00047690226855442346, "loss": 4.9977, "mean_token_accuracy": 0.21323842704296112, "num_tokens": 36678662.0, "step": 16000 }, { "entropy": 5.19386396408081, "epoch": 1.537463976945245, "grad_norm": 1.25, "learning_rate": 0.0004768871357471863, "loss": 4.9681, "mean_token_accuracy": 0.21223879903554915, "num_tokens": 36689876.0, "step": 16005 }, { "entropy": 5.211331748962403, "epoch": 1.5379442843419788, "grad_norm": 1.453125, "learning_rate": 0.00047687199825272936, "loss": 4.9179, "mean_token_accuracy": 0.22314226925373076, "num_tokens": 36701140.0, "step": 16010 }, { "entropy": 5.2290960311889645, "epoch": 1.5384245917387127, "grad_norm": 1.2265625, "learning_rate": 0.00047685685607140403, "loss": 5.067, "mean_token_accuracy": 0.20884881168603897, "num_tokens": 36711077.0, "step": 16015 }, { "entropy": 5.214303207397461, "epoch": 1.5389048991354466, "grad_norm": 1.3203125, "learning_rate": 0.00047684170920356185, "loss": 4.9477, "mean_token_accuracy": 0.2187011405825615, "num_tokens": 36722705.0, "step": 16020 }, { "entropy": 5.1465880393981935, "epoch": 1.5393852065321805, "grad_norm": 1.421875, "learning_rate": 0.0004768265576495546, "loss": 4.9004, "mean_token_accuracy": 0.2221095785498619, "num_tokens": 36733446.0, "step": 16025 }, { "entropy": 5.114369249343872, "epoch": 1.5398655139289144, "grad_norm": 1.1953125, "learning_rate": 0.00047681140140973396, "loss": 4.9272, "mean_token_accuracy": 0.2224670261144638, "num_tokens": 36744529.0, "step": 16030 }, { "entropy": 5.095712089538575, "epoch": 1.5403458213256485, "grad_norm": 1.4375, "learning_rate": 0.0004767962404844517, "loss": 4.9387, "mean_token_accuracy": 0.21873563379049302, "num_tokens": 36756248.0, "step": 16035 }, { "entropy": 5.287334442138672, "epoch": 1.5408261287223823, "grad_norm": 1.3046875, "learning_rate": 0.00047678107487406015, "loss": 5.0196, "mean_token_accuracy": 0.21320114731788636, "num_tokens": 36768097.0, "step": 16040 }, { "entropy": 5.204667949676514, "epoch": 1.5413064361191162, "grad_norm": 1.1171875, "learning_rate": 0.00047676590457891116, "loss": 4.9646, "mean_token_accuracy": 0.21519066542387008, "num_tokens": 36780395.0, "step": 16045 }, { "entropy": 5.173113012313843, "epoch": 1.54178674351585, "grad_norm": 1.296875, "learning_rate": 0.0004767507295993569, "loss": 4.9343, "mean_token_accuracy": 0.211539426445961, "num_tokens": 36791897.0, "step": 16050 }, { "entropy": 5.160366153717041, "epoch": 1.5422670509125842, "grad_norm": 1.828125, "learning_rate": 0.0004767355499357498, "loss": 4.9177, "mean_token_accuracy": 0.22163355052471162, "num_tokens": 36802716.0, "step": 16055 }, { "entropy": 5.138392639160156, "epoch": 1.542747358309318, "grad_norm": 1.4375, "learning_rate": 0.0004767203655884423, "loss": 4.9724, "mean_token_accuracy": 0.21543466299772263, "num_tokens": 36814471.0, "step": 16060 }, { "entropy": 5.1336760997772215, "epoch": 1.543227665706052, "grad_norm": 1.25, "learning_rate": 0.0004767051765577869, "loss": 4.9174, "mean_token_accuracy": 0.2216467648744583, "num_tokens": 36825975.0, "step": 16065 }, { "entropy": 5.2050800800323485, "epoch": 1.5437079731027858, "grad_norm": 1.203125, "learning_rate": 0.00047668998284413624, "loss": 5.0166, "mean_token_accuracy": 0.20977095812559127, "num_tokens": 36837990.0, "step": 16070 }, { "entropy": 5.227380561828613, "epoch": 1.5441882804995197, "grad_norm": 1.1328125, "learning_rate": 0.00047667478444784306, "loss": 5.0358, "mean_token_accuracy": 0.21031395345926285, "num_tokens": 36849115.0, "step": 16075 }, { "entropy": 5.22444372177124, "epoch": 1.5446685878962536, "grad_norm": 1.4140625, "learning_rate": 0.0004766595813692602, "loss": 4.8866, "mean_token_accuracy": 0.21476306468248368, "num_tokens": 36860626.0, "step": 16080 }, { "entropy": 5.200777339935303, "epoch": 1.5451488952929875, "grad_norm": 1.34375, "learning_rate": 0.00047664437360874076, "loss": 5.0325, "mean_token_accuracy": 0.20902796387672423, "num_tokens": 36871926.0, "step": 16085 }, { "entropy": 5.180306005477905, "epoch": 1.5456292026897214, "grad_norm": 1.1796875, "learning_rate": 0.00047662916116663766, "loss": 4.949, "mean_token_accuracy": 0.22320764660835266, "num_tokens": 36883511.0, "step": 16090 }, { "entropy": 5.253410387039184, "epoch": 1.5461095100864553, "grad_norm": 1.265625, "learning_rate": 0.00047661394404330417, "loss": 5.0173, "mean_token_accuracy": 0.21642861217260362, "num_tokens": 36895468.0, "step": 16095 }, { "entropy": 5.182759952545166, "epoch": 1.5465898174831891, "grad_norm": 1.2890625, "learning_rate": 0.00047659872223909357, "loss": 4.911, "mean_token_accuracy": 0.21871508955955504, "num_tokens": 36906957.0, "step": 16100 }, { "entropy": 5.156119346618652, "epoch": 1.547070124879923, "grad_norm": 1.3984375, "learning_rate": 0.0004765834957543592, "loss": 4.9463, "mean_token_accuracy": 0.21861688941717147, "num_tokens": 36916495.0, "step": 16105 }, { "entropy": 5.1166833400726315, "epoch": 1.547550432276657, "grad_norm": 1.21875, "learning_rate": 0.00047656826458945475, "loss": 4.9244, "mean_token_accuracy": 0.21573103368282318, "num_tokens": 36927301.0, "step": 16110 }, { "entropy": 5.178887462615966, "epoch": 1.548030739673391, "grad_norm": 1.4765625, "learning_rate": 0.00047655302874473365, "loss": 4.8872, "mean_token_accuracy": 0.22074204683303833, "num_tokens": 36938116.0, "step": 16115 }, { "entropy": 5.199589490890503, "epoch": 1.5485110470701249, "grad_norm": 1.390625, "learning_rate": 0.0004765377882205498, "loss": 4.9571, "mean_token_accuracy": 0.21698470413684845, "num_tokens": 36950292.0, "step": 16120 }, { "entropy": 5.1331400871276855, "epoch": 1.5489913544668588, "grad_norm": 1.21875, "learning_rate": 0.0004765225430172568, "loss": 4.9833, "mean_token_accuracy": 0.220487479865551, "num_tokens": 36962017.0, "step": 16125 }, { "entropy": 5.2799131870269775, "epoch": 1.5494716618635929, "grad_norm": 1.203125, "learning_rate": 0.0004765072931352089, "loss": 5.0086, "mean_token_accuracy": 0.21049043387174607, "num_tokens": 36972831.0, "step": 16130 }, { "entropy": 5.199352645874024, "epoch": 1.5499519692603267, "grad_norm": 1.3828125, "learning_rate": 0.00047649203857476, "loss": 4.8725, "mean_token_accuracy": 0.2225062444806099, "num_tokens": 36983324.0, "step": 16135 }, { "entropy": 5.153161334991455, "epoch": 1.5504322766570606, "grad_norm": 1.1953125, "learning_rate": 0.00047647677933626423, "loss": 5.1157, "mean_token_accuracy": 0.20301380157470703, "num_tokens": 36996093.0, "step": 16140 }, { "entropy": 5.1830164909362795, "epoch": 1.5509125840537945, "grad_norm": 1.3125, "learning_rate": 0.00047646151542007583, "loss": 4.9454, "mean_token_accuracy": 0.2194085642695427, "num_tokens": 37006548.0, "step": 16145 }, { "entropy": 5.234643363952637, "epoch": 1.5513928914505284, "grad_norm": 1.234375, "learning_rate": 0.0004764462468265494, "loss": 4.946, "mean_token_accuracy": 0.2132670909166336, "num_tokens": 37019181.0, "step": 16150 }, { "entropy": 5.261085796356201, "epoch": 1.5518731988472623, "grad_norm": 1.5703125, "learning_rate": 0.00047643097355603913, "loss": 4.985, "mean_token_accuracy": 0.21415400505065918, "num_tokens": 37029808.0, "step": 16155 }, { "entropy": 5.180344581604004, "epoch": 1.5523535062439962, "grad_norm": 1.34375, "learning_rate": 0.0004764156956088999, "loss": 4.9579, "mean_token_accuracy": 0.21201344579458237, "num_tokens": 37040352.0, "step": 16160 }, { "entropy": 5.1573163032531735, "epoch": 1.55283381364073, "grad_norm": 1.15625, "learning_rate": 0.0004764004129854863, "loss": 4.9926, "mean_token_accuracy": 0.2195364996790886, "num_tokens": 37050780.0, "step": 16165 }, { "entropy": 5.190805196762085, "epoch": 1.553314121037464, "grad_norm": 1.171875, "learning_rate": 0.00047638512568615307, "loss": 4.9335, "mean_token_accuracy": 0.21873989403247834, "num_tokens": 37062608.0, "step": 16170 }, { "entropy": 5.184975290298462, "epoch": 1.5537944284341978, "grad_norm": 1.1953125, "learning_rate": 0.0004763698337112553, "loss": 5.0266, "mean_token_accuracy": 0.2099449321627617, "num_tokens": 37074578.0, "step": 16175 }, { "entropy": 5.243215465545655, "epoch": 1.5542747358309317, "grad_norm": 1.265625, "learning_rate": 0.0004763545370611479, "loss": 5.0469, "mean_token_accuracy": 0.2158788859844208, "num_tokens": 37086487.0, "step": 16180 }, { "entropy": 5.2162518978118895, "epoch": 1.5547550432276656, "grad_norm": 1.3671875, "learning_rate": 0.00047633923573618605, "loss": 4.9287, "mean_token_accuracy": 0.22581578940153121, "num_tokens": 37097249.0, "step": 16185 }, { "entropy": 5.257421016693115, "epoch": 1.5552353506243997, "grad_norm": 1.296875, "learning_rate": 0.000476323929736725, "loss": 5.1366, "mean_token_accuracy": 0.20327619165182115, "num_tokens": 37109274.0, "step": 16190 }, { "entropy": 5.191748714447021, "epoch": 1.5557156580211335, "grad_norm": 1.2421875, "learning_rate": 0.00047630861906312004, "loss": 4.9506, "mean_token_accuracy": 0.21775271743535995, "num_tokens": 37120100.0, "step": 16195 }, { "entropy": 5.18410849571228, "epoch": 1.5561959654178674, "grad_norm": 1.171875, "learning_rate": 0.0004762933037157268, "loss": 4.9254, "mean_token_accuracy": 0.21204567849636077, "num_tokens": 37131889.0, "step": 16200 }, { "entropy": 5.114263343811035, "epoch": 1.5566762728146013, "grad_norm": 1.359375, "learning_rate": 0.00047627798369490076, "loss": 4.8947, "mean_token_accuracy": 0.22275954782962798, "num_tokens": 37142538.0, "step": 16205 }, { "entropy": 5.235247564315796, "epoch": 1.5571565802113354, "grad_norm": 1.359375, "learning_rate": 0.00047626265900099757, "loss": 5.0412, "mean_token_accuracy": 0.2095339596271515, "num_tokens": 37153639.0, "step": 16210 }, { "entropy": 5.216181850433349, "epoch": 1.5576368876080693, "grad_norm": 1.453125, "learning_rate": 0.00047624732963437314, "loss": 4.9877, "mean_token_accuracy": 0.21557213515043258, "num_tokens": 37164029.0, "step": 16215 }, { "entropy": 5.211888360977173, "epoch": 1.5581171950048032, "grad_norm": 1.2109375, "learning_rate": 0.00047623199559538324, "loss": 5.032, "mean_token_accuracy": 0.211012165248394, "num_tokens": 37175880.0, "step": 16220 }, { "entropy": 5.221793174743652, "epoch": 1.558597502401537, "grad_norm": 1.46875, "learning_rate": 0.000476216656884384, "loss": 4.9699, "mean_token_accuracy": 0.21573802679777146, "num_tokens": 37187483.0, "step": 16225 }, { "entropy": 5.2223461151123045, "epoch": 1.559077809798271, "grad_norm": 1.25, "learning_rate": 0.00047620131350173135, "loss": 4.9836, "mean_token_accuracy": 0.21847614794969558, "num_tokens": 37198978.0, "step": 16230 }, { "entropy": 5.254594516754151, "epoch": 1.5595581171950048, "grad_norm": 1.3828125, "learning_rate": 0.0004761859654477817, "loss": 5.0051, "mean_token_accuracy": 0.20368780344724655, "num_tokens": 37210054.0, "step": 16235 }, { "entropy": 5.1702179431915285, "epoch": 1.5600384245917387, "grad_norm": 1.265625, "learning_rate": 0.0004761706127228914, "loss": 4.9572, "mean_token_accuracy": 0.21917274296283723, "num_tokens": 37221744.0, "step": 16240 }, { "entropy": 5.151251411437988, "epoch": 1.5605187319884726, "grad_norm": 1.2890625, "learning_rate": 0.0004761552553274168, "loss": 4.9985, "mean_token_accuracy": 0.21715299040079117, "num_tokens": 37234629.0, "step": 16245 }, { "entropy": 5.2933587551116945, "epoch": 1.5609990393852065, "grad_norm": 1.21875, "learning_rate": 0.0004761398932617144, "loss": 5.0695, "mean_token_accuracy": 0.21107089519500732, "num_tokens": 37246960.0, "step": 16250 }, { "entropy": 5.151790046691895, "epoch": 1.5614793467819403, "grad_norm": 1.2578125, "learning_rate": 0.000476124526526141, "loss": 4.9236, "mean_token_accuracy": 0.2184425637125969, "num_tokens": 37259326.0, "step": 16255 }, { "entropy": 5.091758918762207, "epoch": 1.5619596541786742, "grad_norm": 1.609375, "learning_rate": 0.00047610915512105327, "loss": 4.9732, "mean_token_accuracy": 0.2134696900844574, "num_tokens": 37271436.0, "step": 16260 }, { "entropy": 5.234701013565063, "epoch": 1.562439961575408, "grad_norm": 1.1953125, "learning_rate": 0.0004760937790468082, "loss": 5.038, "mean_token_accuracy": 0.20630017220973967, "num_tokens": 37283230.0, "step": 16265 }, { "entropy": 5.2092828273773195, "epoch": 1.5629202689721422, "grad_norm": 1.171875, "learning_rate": 0.0004760783983037627, "loss": 4.9399, "mean_token_accuracy": 0.21871796250343323, "num_tokens": 37295185.0, "step": 16270 }, { "entropy": 5.198478031158447, "epoch": 1.563400576368876, "grad_norm": 1.328125, "learning_rate": 0.0004760630128922738, "loss": 5.0027, "mean_token_accuracy": 0.21369778662919997, "num_tokens": 37307573.0, "step": 16275 }, { "entropy": 5.271113395690918, "epoch": 1.56388088376561, "grad_norm": 1.46875, "learning_rate": 0.0004760476228126989, "loss": 5.0284, "mean_token_accuracy": 0.21126459836959838, "num_tokens": 37319084.0, "step": 16280 }, { "entropy": 5.120991039276123, "epoch": 1.5643611911623438, "grad_norm": 1.203125, "learning_rate": 0.0004760322280653951, "loss": 4.9307, "mean_token_accuracy": 0.21826708912849427, "num_tokens": 37330447.0, "step": 16285 }, { "entropy": 5.164800930023193, "epoch": 1.564841498559078, "grad_norm": 1.3046875, "learning_rate": 0.0004760168286507199, "loss": 4.8447, "mean_token_accuracy": 0.21647296249866485, "num_tokens": 37341184.0, "step": 16290 }, { "entropy": 5.217311954498291, "epoch": 1.5653218059558118, "grad_norm": 1.1796875, "learning_rate": 0.00047600142456903085, "loss": 4.9875, "mean_token_accuracy": 0.21339131146669388, "num_tokens": 37352712.0, "step": 16295 }, { "entropy": 5.191257572174072, "epoch": 1.5658021133525457, "grad_norm": 1.21875, "learning_rate": 0.00047598601582068555, "loss": 5.0424, "mean_token_accuracy": 0.2111809030175209, "num_tokens": 37365001.0, "step": 16300 }, { "entropy": 5.306282901763916, "epoch": 1.5662824207492796, "grad_norm": 1.1875, "learning_rate": 0.0004759706024060418, "loss": 5.1008, "mean_token_accuracy": 0.20626700818538665, "num_tokens": 37377005.0, "step": 16305 }, { "entropy": 5.196134757995606, "epoch": 1.5667627281460135, "grad_norm": 1.6640625, "learning_rate": 0.0004759551843254575, "loss": 4.9281, "mean_token_accuracy": 0.2204608216881752, "num_tokens": 37387329.0, "step": 16310 }, { "entropy": 5.256574726104736, "epoch": 1.5672430355427474, "grad_norm": 1.2890625, "learning_rate": 0.00047593976157929034, "loss": 5.0302, "mean_token_accuracy": 0.21475369334220887, "num_tokens": 37398894.0, "step": 16315 }, { "entropy": 5.231758117675781, "epoch": 1.5677233429394812, "grad_norm": 1.2890625, "learning_rate": 0.0004759243341678987, "loss": 4.937, "mean_token_accuracy": 0.21920875310897828, "num_tokens": 37409913.0, "step": 16320 }, { "entropy": 5.217631864547729, "epoch": 1.5682036503362151, "grad_norm": 1.2421875, "learning_rate": 0.0004759089020916407, "loss": 4.899, "mean_token_accuracy": 0.21592830419540404, "num_tokens": 37421859.0, "step": 16325 }, { "entropy": 5.188553094863892, "epoch": 1.568683957732949, "grad_norm": 1.1875, "learning_rate": 0.00047589346535087444, "loss": 4.9689, "mean_token_accuracy": 0.21344497352838515, "num_tokens": 37432827.0, "step": 16330 }, { "entropy": 5.171142482757569, "epoch": 1.5691642651296829, "grad_norm": 1.2734375, "learning_rate": 0.0004758780239459586, "loss": 4.9727, "mean_token_accuracy": 0.22152907252311707, "num_tokens": 37444171.0, "step": 16335 }, { "entropy": 5.194261264801026, "epoch": 1.5696445725264168, "grad_norm": 1.2890625, "learning_rate": 0.0004758625778772514, "loss": 4.9252, "mean_token_accuracy": 0.2152291163802147, "num_tokens": 37455194.0, "step": 16340 }, { "entropy": 5.0763763904571535, "epoch": 1.5701248799231509, "grad_norm": 1.140625, "learning_rate": 0.00047584712714511166, "loss": 4.868, "mean_token_accuracy": 0.22146839201450347, "num_tokens": 37465839.0, "step": 16345 }, { "entropy": 5.157925367355347, "epoch": 1.5706051873198847, "grad_norm": 1.09375, "learning_rate": 0.00047583167174989797, "loss": 4.9428, "mean_token_accuracy": 0.22237591743469237, "num_tokens": 37476948.0, "step": 16350 }, { "entropy": 5.198430585861206, "epoch": 1.5710854947166186, "grad_norm": 1.234375, "learning_rate": 0.0004758162116919692, "loss": 4.9383, "mean_token_accuracy": 0.21987285912036897, "num_tokens": 37487382.0, "step": 16355 }, { "entropy": 5.106133604049683, "epoch": 1.5715658021133525, "grad_norm": 1.2890625, "learning_rate": 0.00047580074697168434, "loss": 4.9045, "mean_token_accuracy": 0.22711621075868607, "num_tokens": 37498347.0, "step": 16360 }, { "entropy": 5.189422130584717, "epoch": 1.5720461095100866, "grad_norm": 1.2109375, "learning_rate": 0.00047578527758940236, "loss": 5.0108, "mean_token_accuracy": 0.20658042430877685, "num_tokens": 37509620.0, "step": 16365 }, { "entropy": 5.200649690628052, "epoch": 1.5725264169068205, "grad_norm": 1.4375, "learning_rate": 0.0004757698035454825, "loss": 4.8942, "mean_token_accuracy": 0.22583490014076232, "num_tokens": 37519768.0, "step": 16370 }, { "entropy": 5.118372964859009, "epoch": 1.5730067243035544, "grad_norm": 1.7734375, "learning_rate": 0.0004757543248402839, "loss": 4.8915, "mean_token_accuracy": 0.21790195405483245, "num_tokens": 37530476.0, "step": 16375 }, { "entropy": 5.189021921157837, "epoch": 1.5734870317002883, "grad_norm": 1.1640625, "learning_rate": 0.00047573884147416597, "loss": 5.0071, "mean_token_accuracy": 0.21477911174297332, "num_tokens": 37541702.0, "step": 16380 }, { "entropy": 5.234851312637329, "epoch": 1.5739673390970221, "grad_norm": 1.1640625, "learning_rate": 0.0004757233534474883, "loss": 4.9618, "mean_token_accuracy": 0.21515962332487107, "num_tokens": 37553323.0, "step": 16385 }, { "entropy": 5.239250135421753, "epoch": 1.574447646493756, "grad_norm": 1.265625, "learning_rate": 0.0004757078607606103, "loss": 5.0426, "mean_token_accuracy": 0.20883728861808776, "num_tokens": 37564882.0, "step": 16390 }, { "entropy": 5.2088868618011475, "epoch": 1.57492795389049, "grad_norm": 1.3125, "learning_rate": 0.0004756923634138918, "loss": 4.9551, "mean_token_accuracy": 0.21754217594861985, "num_tokens": 37575612.0, "step": 16395 }, { "entropy": 5.184630298614502, "epoch": 1.5754082612872238, "grad_norm": 1.2265625, "learning_rate": 0.00047567686140769264, "loss": 4.9768, "mean_token_accuracy": 0.21821277886629104, "num_tokens": 37587089.0, "step": 16400 }, { "entropy": 5.175845193862915, "epoch": 1.5758885686839577, "grad_norm": 1.203125, "learning_rate": 0.00047566135474237247, "loss": 4.9184, "mean_token_accuracy": 0.21809831261634827, "num_tokens": 37598429.0, "step": 16405 }, { "entropy": 5.239535188674926, "epoch": 1.5763688760806915, "grad_norm": 1.2578125, "learning_rate": 0.00047564584341829166, "loss": 5.0381, "mean_token_accuracy": 0.21282682567834854, "num_tokens": 37609769.0, "step": 16410 }, { "entropy": 5.221446514129639, "epoch": 1.5768491834774254, "grad_norm": 1.2421875, "learning_rate": 0.00047563032743581, "loss": 5.087, "mean_token_accuracy": 0.20481704473495482, "num_tokens": 37622425.0, "step": 16415 }, { "entropy": 5.187086963653565, "epoch": 1.5773294908741593, "grad_norm": 1.1484375, "learning_rate": 0.00047561480679528804, "loss": 4.9807, "mean_token_accuracy": 0.21880702823400497, "num_tokens": 37634819.0, "step": 16420 }, { "entropy": 5.198013353347778, "epoch": 1.5778097982708934, "grad_norm": 1.2265625, "learning_rate": 0.0004755992814970859, "loss": 4.8433, "mean_token_accuracy": 0.2195771813392639, "num_tokens": 37645531.0, "step": 16425 }, { "entropy": 5.127077627182007, "epoch": 1.5782901056676273, "grad_norm": 1.1875, "learning_rate": 0.000475583751541564, "loss": 4.9271, "mean_token_accuracy": 0.21498081386089324, "num_tokens": 37657243.0, "step": 16430 }, { "entropy": 5.225577545166016, "epoch": 1.5787704130643612, "grad_norm": 1.15625, "learning_rate": 0.00047556821692908315, "loss": 4.9839, "mean_token_accuracy": 0.21665328592061997, "num_tokens": 37668218.0, "step": 16435 }, { "entropy": 5.249110507965088, "epoch": 1.579250720461095, "grad_norm": 1.203125, "learning_rate": 0.0004755526776600038, "loss": 4.9283, "mean_token_accuracy": 0.21837957799434662, "num_tokens": 37680173.0, "step": 16440 }, { "entropy": 5.161304950714111, "epoch": 1.5797310278578292, "grad_norm": 1.203125, "learning_rate": 0.00047553713373468684, "loss": 4.9597, "mean_token_accuracy": 0.21027158498764037, "num_tokens": 37691281.0, "step": 16445 }, { "entropy": 5.209333801269532, "epoch": 1.580211335254563, "grad_norm": 1.640625, "learning_rate": 0.00047552158515349306, "loss": 5.0076, "mean_token_accuracy": 0.20411573201417924, "num_tokens": 37702320.0, "step": 16450 }, { "entropy": 5.214414644241333, "epoch": 1.580691642651297, "grad_norm": 1.4296875, "learning_rate": 0.00047550603191678356, "loss": 4.9337, "mean_token_accuracy": 0.21256616711616516, "num_tokens": 37713724.0, "step": 16455 }, { "entropy": 5.193028688430786, "epoch": 1.5811719500480308, "grad_norm": 1.2734375, "learning_rate": 0.0004754904740249194, "loss": 4.9276, "mean_token_accuracy": 0.21470242887735366, "num_tokens": 37724532.0, "step": 16460 }, { "entropy": 5.131993532180786, "epoch": 1.5816522574447647, "grad_norm": 1.3984375, "learning_rate": 0.00047547491147826156, "loss": 4.8838, "mean_token_accuracy": 0.22787191569805146, "num_tokens": 37735174.0, "step": 16465 }, { "entropy": 5.177940845489502, "epoch": 1.5821325648414986, "grad_norm": 1.2734375, "learning_rate": 0.0004754593442771718, "loss": 5.0392, "mean_token_accuracy": 0.21157672852277756, "num_tokens": 37746981.0, "step": 16470 }, { "entropy": 5.217215061187744, "epoch": 1.5826128722382324, "grad_norm": 1.1640625, "learning_rate": 0.00047544377242201115, "loss": 4.989, "mean_token_accuracy": 0.2165716901421547, "num_tokens": 37757576.0, "step": 16475 }, { "entropy": 5.216026020050049, "epoch": 1.5830931796349663, "grad_norm": 1.1953125, "learning_rate": 0.00047542819591314136, "loss": 5.006, "mean_token_accuracy": 0.2124703660607338, "num_tokens": 37769053.0, "step": 16480 }, { "entropy": 5.248029994964599, "epoch": 1.5835734870317002, "grad_norm": 1.15625, "learning_rate": 0.0004754126147509241, "loss": 4.9664, "mean_token_accuracy": 0.2157026171684265, "num_tokens": 37780517.0, "step": 16485 }, { "entropy": 5.269943332672119, "epoch": 1.584053794428434, "grad_norm": 1.1796875, "learning_rate": 0.00047539702893572086, "loss": 5.0513, "mean_token_accuracy": 0.20178954899311066, "num_tokens": 37791333.0, "step": 16490 }, { "entropy": 5.14649658203125, "epoch": 1.584534101825168, "grad_norm": 1.2265625, "learning_rate": 0.00047538143846789376, "loss": 4.9061, "mean_token_accuracy": 0.22500312328338623, "num_tokens": 37802635.0, "step": 16495 }, { "entropy": 5.0807657718658445, "epoch": 1.585014409221902, "grad_norm": 1.2734375, "learning_rate": 0.0004753658433478047, "loss": 4.8146, "mean_token_accuracy": 0.22304627895355225, "num_tokens": 37814788.0, "step": 16500 }, { "entropy": 5.194208145141602, "epoch": 1.585494716618636, "grad_norm": 1.171875, "learning_rate": 0.00047535024357581564, "loss": 4.9254, "mean_token_accuracy": 0.2166296660900116, "num_tokens": 37826650.0, "step": 16505 }, { "entropy": 5.219819498062134, "epoch": 1.5859750240153698, "grad_norm": 1.3125, "learning_rate": 0.0004753346391522889, "loss": 5.0167, "mean_token_accuracy": 0.21236117631196977, "num_tokens": 37839362.0, "step": 16510 }, { "entropy": 5.205073118209839, "epoch": 1.5864553314121037, "grad_norm": 1.5, "learning_rate": 0.00047531903007758667, "loss": 4.9954, "mean_token_accuracy": 0.209988933801651, "num_tokens": 37851057.0, "step": 16515 }, { "entropy": 5.181534385681152, "epoch": 1.5869356388088378, "grad_norm": 1.328125, "learning_rate": 0.0004753034163520714, "loss": 4.8959, "mean_token_accuracy": 0.22577075511217118, "num_tokens": 37863507.0, "step": 16520 }, { "entropy": 5.188526678085327, "epoch": 1.5874159462055717, "grad_norm": 1.3046875, "learning_rate": 0.00047528779797610557, "loss": 4.9664, "mean_token_accuracy": 0.2138543888926506, "num_tokens": 37874439.0, "step": 16525 }, { "entropy": 5.17763729095459, "epoch": 1.5878962536023056, "grad_norm": 1.1171875, "learning_rate": 0.00047527217495005184, "loss": 4.9292, "mean_token_accuracy": 0.21783537715673446, "num_tokens": 37886720.0, "step": 16530 }, { "entropy": 5.214607858657837, "epoch": 1.5883765609990395, "grad_norm": 1.265625, "learning_rate": 0.00047525654727427285, "loss": 5.0085, "mean_token_accuracy": 0.21364359855651854, "num_tokens": 37897919.0, "step": 16535 }, { "entropy": 5.20990104675293, "epoch": 1.5888568683957733, "grad_norm": 1.2265625, "learning_rate": 0.0004752409149491315, "loss": 5.0482, "mean_token_accuracy": 0.21286925077438354, "num_tokens": 37911061.0, "step": 16540 }, { "entropy": 5.240172100067139, "epoch": 1.5893371757925072, "grad_norm": 1.15625, "learning_rate": 0.00047522527797499075, "loss": 5.0121, "mean_token_accuracy": 0.21240307092666627, "num_tokens": 37922677.0, "step": 16545 }, { "entropy": 5.096889591217041, "epoch": 1.589817483189241, "grad_norm": 1.34375, "learning_rate": 0.0004752096363522135, "loss": 4.8004, "mean_token_accuracy": 0.2269800528883934, "num_tokens": 37932802.0, "step": 16550 }, { "entropy": 5.175290727615357, "epoch": 1.590297790585975, "grad_norm": 1.3671875, "learning_rate": 0.00047519399008116305, "loss": 4.9299, "mean_token_accuracy": 0.21652191430330275, "num_tokens": 37944782.0, "step": 16555 }, { "entropy": 5.169613695144653, "epoch": 1.5907780979827089, "grad_norm": 1.203125, "learning_rate": 0.0004751783391622026, "loss": 4.9413, "mean_token_accuracy": 0.21697622388601304, "num_tokens": 37956577.0, "step": 16560 }, { "entropy": 5.214099884033203, "epoch": 1.5912584053794427, "grad_norm": 1.046875, "learning_rate": 0.0004751626835956955, "loss": 4.9855, "mean_token_accuracy": 0.21233401596546173, "num_tokens": 37967937.0, "step": 16565 }, { "entropy": 5.126390886306763, "epoch": 1.5917387127761766, "grad_norm": 1.25, "learning_rate": 0.0004751470233820053, "loss": 4.9299, "mean_token_accuracy": 0.22897855043411255, "num_tokens": 37978113.0, "step": 16570 }, { "entropy": 5.143404293060303, "epoch": 1.5922190201729105, "grad_norm": 1.3046875, "learning_rate": 0.0004751313585214955, "loss": 4.8748, "mean_token_accuracy": 0.22316106110811235, "num_tokens": 37988852.0, "step": 16575 }, { "entropy": 5.231024122238159, "epoch": 1.5926993275696446, "grad_norm": 1.1875, "learning_rate": 0.0004751156890145298, "loss": 4.9576, "mean_token_accuracy": 0.2123672142624855, "num_tokens": 38000531.0, "step": 16580 }, { "entropy": 5.1332155704498295, "epoch": 1.5931796349663785, "grad_norm": 1.3828125, "learning_rate": 0.000475100014861472, "loss": 4.932, "mean_token_accuracy": 0.22418509423732758, "num_tokens": 38011818.0, "step": 16585 }, { "entropy": 5.2205602645874025, "epoch": 1.5936599423631124, "grad_norm": 1.2265625, "learning_rate": 0.0004750843360626861, "loss": 4.9578, "mean_token_accuracy": 0.22010722607374192, "num_tokens": 38022455.0, "step": 16590 }, { "entropy": 5.125461912155151, "epoch": 1.5941402497598463, "grad_norm": 1.21875, "learning_rate": 0.0004750686526185359, "loss": 4.9286, "mean_token_accuracy": 0.22543989717960358, "num_tokens": 38033126.0, "step": 16595 }, { "entropy": 5.033788013458252, "epoch": 1.5946205571565804, "grad_norm": 1.171875, "learning_rate": 0.00047505296452938584, "loss": 4.8884, "mean_token_accuracy": 0.21922594010829927, "num_tokens": 38044935.0, "step": 16600 }, { "entropy": 5.212667560577392, "epoch": 1.5951008645533142, "grad_norm": 1.171875, "learning_rate": 0.00047503727179559995, "loss": 4.9732, "mean_token_accuracy": 0.21320051848888397, "num_tokens": 38056080.0, "step": 16605 }, { "entropy": 5.208511447906494, "epoch": 1.5955811719500481, "grad_norm": 1.2109375, "learning_rate": 0.00047502157441754256, "loss": 4.9921, "mean_token_accuracy": 0.2140924945473671, "num_tokens": 38066788.0, "step": 16610 }, { "entropy": 5.246877956390381, "epoch": 1.596061479346782, "grad_norm": 1.2265625, "learning_rate": 0.0004750058723955781, "loss": 5.0093, "mean_token_accuracy": 0.21564434170722963, "num_tokens": 38079529.0, "step": 16615 }, { "entropy": 5.1925897121429445, "epoch": 1.5965417867435159, "grad_norm": 1.2109375, "learning_rate": 0.0004749901657300713, "loss": 4.9602, "mean_token_accuracy": 0.21586501747369766, "num_tokens": 38091484.0, "step": 16620 }, { "entropy": 5.188087844848633, "epoch": 1.5970220941402498, "grad_norm": 1.296875, "learning_rate": 0.00047497445442138667, "loss": 4.9166, "mean_token_accuracy": 0.21774567365646363, "num_tokens": 38103102.0, "step": 16625 }, { "entropy": 5.196709108352661, "epoch": 1.5975024015369836, "grad_norm": 1.2578125, "learning_rate": 0.00047495873846988896, "loss": 5.0333, "mean_token_accuracy": 0.2103504180908203, "num_tokens": 38115565.0, "step": 16630 }, { "entropy": 5.170992374420166, "epoch": 1.5979827089337175, "grad_norm": 1.21875, "learning_rate": 0.0004749430178759431, "loss": 4.9409, "mean_token_accuracy": 0.217051962018013, "num_tokens": 38126741.0, "step": 16635 }, { "entropy": 5.17347526550293, "epoch": 1.5984630163304514, "grad_norm": 1.5078125, "learning_rate": 0.00047492729263991413, "loss": 4.9496, "mean_token_accuracy": 0.2166967958211899, "num_tokens": 38137149.0, "step": 16640 }, { "entropy": 5.188420677185059, "epoch": 1.5989433237271853, "grad_norm": 1.2890625, "learning_rate": 0.00047491156276216695, "loss": 4.8864, "mean_token_accuracy": 0.2285622701048851, "num_tokens": 38148537.0, "step": 16645 }, { "entropy": 5.168776988983154, "epoch": 1.5994236311239192, "grad_norm": 1.28125, "learning_rate": 0.00047489582824306704, "loss": 4.9282, "mean_token_accuracy": 0.21946836411952972, "num_tokens": 38159097.0, "step": 16650 }, { "entropy": 5.191212558746338, "epoch": 1.5999039385206533, "grad_norm": 1.21875, "learning_rate": 0.00047488008908297955, "loss": 4.9028, "mean_token_accuracy": 0.22523313760757446, "num_tokens": 38171400.0, "step": 16655 }, { "entropy": 5.212293004989624, "epoch": 1.6003842459173871, "grad_norm": 1.609375, "learning_rate": 0.0004748643452822699, "loss": 4.955, "mean_token_accuracy": 0.21418403089046478, "num_tokens": 38181876.0, "step": 16660 }, { "entropy": 5.211285972595215, "epoch": 1.600864553314121, "grad_norm": 1.0703125, "learning_rate": 0.0004748485968413036, "loss": 4.9318, "mean_token_accuracy": 0.22244168519973756, "num_tokens": 38193680.0, "step": 16665 }, { "entropy": 5.134846878051758, "epoch": 1.601344860710855, "grad_norm": 1.28125, "learning_rate": 0.00047483284376044634, "loss": 4.9106, "mean_token_accuracy": 0.2231910213828087, "num_tokens": 38204944.0, "step": 16670 }, { "entropy": 5.129866218566894, "epoch": 1.601825168107589, "grad_norm": 1.2734375, "learning_rate": 0.0004748170860400638, "loss": 4.8625, "mean_token_accuracy": 0.2263529285788536, "num_tokens": 38216381.0, "step": 16675 }, { "entropy": 5.1508636474609375, "epoch": 1.602305475504323, "grad_norm": 1.203125, "learning_rate": 0.00047480132368052185, "loss": 4.8845, "mean_token_accuracy": 0.22613088488578797, "num_tokens": 38227420.0, "step": 16680 }, { "entropy": 5.208236646652222, "epoch": 1.6027857829010568, "grad_norm": 1.1484375, "learning_rate": 0.00047478555668218643, "loss": 5.0062, "mean_token_accuracy": 0.21762621849775315, "num_tokens": 38237869.0, "step": 16685 }, { "entropy": 5.164920616149902, "epoch": 1.6032660902977907, "grad_norm": 1.25, "learning_rate": 0.0004747697850454237, "loss": 4.9765, "mean_token_accuracy": 0.2174433395266533, "num_tokens": 38250362.0, "step": 16690 }, { "entropy": 5.280116271972656, "epoch": 1.6037463976945245, "grad_norm": 1.2890625, "learning_rate": 0.0004747540087705997, "loss": 5.0409, "mean_token_accuracy": 0.21639316529035568, "num_tokens": 38262887.0, "step": 16695 }, { "entropy": 5.230034446716308, "epoch": 1.6042267050912584, "grad_norm": 1.3671875, "learning_rate": 0.0004747382278580808, "loss": 4.9034, "mean_token_accuracy": 0.2294904425740242, "num_tokens": 38273206.0, "step": 16700 }, { "entropy": 5.185104942321777, "epoch": 1.6047070124879923, "grad_norm": 1.3359375, "learning_rate": 0.0004747224423082333, "loss": 4.9173, "mean_token_accuracy": 0.21850554943084716, "num_tokens": 38283307.0, "step": 16705 }, { "entropy": 5.211791467666626, "epoch": 1.6051873198847262, "grad_norm": 1.2578125, "learning_rate": 0.00047470665212142384, "loss": 4.9883, "mean_token_accuracy": 0.21574016958475112, "num_tokens": 38293830.0, "step": 16710 }, { "entropy": 5.214894819259643, "epoch": 1.60566762728146, "grad_norm": 1.234375, "learning_rate": 0.00047469085729801887, "loss": 4.9774, "mean_token_accuracy": 0.219608137011528, "num_tokens": 38306523.0, "step": 16715 }, { "entropy": 5.200537395477295, "epoch": 1.606147934678194, "grad_norm": 1.34375, "learning_rate": 0.00047467505783838515, "loss": 4.9473, "mean_token_accuracy": 0.2235700950026512, "num_tokens": 38318468.0, "step": 16720 }, { "entropy": 5.2180544376373295, "epoch": 1.6066282420749278, "grad_norm": 1.3125, "learning_rate": 0.0004746592537428895, "loss": 4.8867, "mean_token_accuracy": 0.22414906024932862, "num_tokens": 38329852.0, "step": 16725 }, { "entropy": 5.076629400253296, "epoch": 1.6071085494716617, "grad_norm": 1.28125, "learning_rate": 0.00047464344501189877, "loss": 4.8605, "mean_token_accuracy": 0.23030537664890288, "num_tokens": 38340951.0, "step": 16730 }, { "entropy": 5.25096173286438, "epoch": 1.6075888568683958, "grad_norm": 1.1875, "learning_rate": 0.00047462763164578015, "loss": 5.0228, "mean_token_accuracy": 0.21357613205909728, "num_tokens": 38351490.0, "step": 16735 }, { "entropy": 5.198782014846802, "epoch": 1.6080691642651297, "grad_norm": 1.3125, "learning_rate": 0.0004746118136449007, "loss": 4.9292, "mean_token_accuracy": 0.2180192857980728, "num_tokens": 38362364.0, "step": 16740 }, { "entropy": 5.180115175247193, "epoch": 1.6085494716618636, "grad_norm": 1.3203125, "learning_rate": 0.0004745959910096276, "loss": 4.9898, "mean_token_accuracy": 0.22322781383991241, "num_tokens": 38374132.0, "step": 16745 }, { "entropy": 5.1993663787841795, "epoch": 1.6090297790585975, "grad_norm": 1.2734375, "learning_rate": 0.00047458016374032837, "loss": 4.9685, "mean_token_accuracy": 0.2143290311098099, "num_tokens": 38384608.0, "step": 16750 }, { "entropy": 5.17359938621521, "epoch": 1.6095100864553316, "grad_norm": 1.3203125, "learning_rate": 0.0004745643318373703, "loss": 4.9836, "mean_token_accuracy": 0.21605729013681413, "num_tokens": 38396639.0, "step": 16755 }, { "entropy": 5.18921127319336, "epoch": 1.6099903938520654, "grad_norm": 1.1953125, "learning_rate": 0.00047454849530112106, "loss": 4.9898, "mean_token_accuracy": 0.2078189730644226, "num_tokens": 38407954.0, "step": 16760 }, { "entropy": 5.28509259223938, "epoch": 1.6104707012487993, "grad_norm": 1.3203125, "learning_rate": 0.00047453265413194826, "loss": 4.9396, "mean_token_accuracy": 0.2211918741464615, "num_tokens": 38418939.0, "step": 16765 }, { "entropy": 5.21851601600647, "epoch": 1.6109510086455332, "grad_norm": 1.171875, "learning_rate": 0.00047451680833021973, "loss": 4.9328, "mean_token_accuracy": 0.21415177136659622, "num_tokens": 38429717.0, "step": 16770 }, { "entropy": 5.12396125793457, "epoch": 1.611431316042267, "grad_norm": 1.2265625, "learning_rate": 0.0004745009578963034, "loss": 4.8761, "mean_token_accuracy": 0.22479525655508042, "num_tokens": 38441054.0, "step": 16775 }, { "entropy": 5.115128135681152, "epoch": 1.611911623439001, "grad_norm": 1.4609375, "learning_rate": 0.00047448510283056716, "loss": 4.9116, "mean_token_accuracy": 0.22436713427305222, "num_tokens": 38451329.0, "step": 16780 }, { "entropy": 5.247094392776489, "epoch": 1.6123919308357348, "grad_norm": 1.1953125, "learning_rate": 0.00047446924313337925, "loss": 4.9893, "mean_token_accuracy": 0.21566009074449538, "num_tokens": 38462118.0, "step": 16785 }, { "entropy": 5.180738306045532, "epoch": 1.6128722382324687, "grad_norm": 1.2890625, "learning_rate": 0.00047445337880510773, "loss": 4.9299, "mean_token_accuracy": 0.22173037976026536, "num_tokens": 38472642.0, "step": 16790 }, { "entropy": 5.087857055664062, "epoch": 1.6133525456292026, "grad_norm": 1.703125, "learning_rate": 0.0004744375098461211, "loss": 4.8573, "mean_token_accuracy": 0.22567004710435867, "num_tokens": 38483866.0, "step": 16795 }, { "entropy": 5.136823225021362, "epoch": 1.6138328530259365, "grad_norm": 1.2734375, "learning_rate": 0.0004744216362567876, "loss": 4.856, "mean_token_accuracy": 0.2275155559182167, "num_tokens": 38494164.0, "step": 16800 }, { "entropy": 5.250975799560547, "epoch": 1.6143131604226704, "grad_norm": 1.375, "learning_rate": 0.00047440575803747595, "loss": 5.0822, "mean_token_accuracy": 0.20916907489299774, "num_tokens": 38505340.0, "step": 16805 }, { "entropy": 5.361870002746582, "epoch": 1.6147934678194045, "grad_norm": 1.265625, "learning_rate": 0.00047438987518855463, "loss": 5.082, "mean_token_accuracy": 0.20903967767953874, "num_tokens": 38516164.0, "step": 16810 }, { "entropy": 5.22707691192627, "epoch": 1.6152737752161384, "grad_norm": 1.234375, "learning_rate": 0.0004743739877103926, "loss": 4.9441, "mean_token_accuracy": 0.21966830492019654, "num_tokens": 38526545.0, "step": 16815 }, { "entropy": 5.091162443161011, "epoch": 1.6157540826128722, "grad_norm": 1.09375, "learning_rate": 0.0004743580956033585, "loss": 4.9081, "mean_token_accuracy": 0.2183246672153473, "num_tokens": 38538102.0, "step": 16820 }, { "entropy": 5.256732702255249, "epoch": 1.6162343900096061, "grad_norm": 1.140625, "learning_rate": 0.00047434219886782135, "loss": 5.0344, "mean_token_accuracy": 0.21077128499746323, "num_tokens": 38550594.0, "step": 16825 }, { "entropy": 5.209319257736206, "epoch": 1.6167146974063402, "grad_norm": 1.296875, "learning_rate": 0.0004743262975041504, "loss": 4.9029, "mean_token_accuracy": 0.2212669938802719, "num_tokens": 38561666.0, "step": 16830 }, { "entropy": 5.135405111312866, "epoch": 1.617195004803074, "grad_norm": 1.1953125, "learning_rate": 0.0004743103915127146, "loss": 4.8723, "mean_token_accuracy": 0.22310091853141784, "num_tokens": 38572923.0, "step": 16835 }, { "entropy": 5.170195627212524, "epoch": 1.617675312199808, "grad_norm": 1.3828125, "learning_rate": 0.00047429448089388336, "loss": 4.9256, "mean_token_accuracy": 0.22085566222667694, "num_tokens": 38584108.0, "step": 16840 }, { "entropy": 5.3052619934082035, "epoch": 1.6181556195965419, "grad_norm": 1.296875, "learning_rate": 0.00047427856564802605, "loss": 5.052, "mean_token_accuracy": 0.20214477479457854, "num_tokens": 38594974.0, "step": 16845 }, { "entropy": 5.246438884735108, "epoch": 1.6186359269932757, "grad_norm": 1.1171875, "learning_rate": 0.0004742626457755122, "loss": 5.0071, "mean_token_accuracy": 0.2198152020573616, "num_tokens": 38606436.0, "step": 16850 }, { "entropy": 5.152284097671509, "epoch": 1.6191162343900096, "grad_norm": 1.171875, "learning_rate": 0.0004742467212767114, "loss": 4.8889, "mean_token_accuracy": 0.2204935997724533, "num_tokens": 38618760.0, "step": 16855 }, { "entropy": 5.138833665847779, "epoch": 1.6195965417867435, "grad_norm": 1.1953125, "learning_rate": 0.0004742307921519933, "loss": 4.8611, "mean_token_accuracy": 0.2265857771039009, "num_tokens": 38629901.0, "step": 16860 }, { "entropy": 5.268418407440185, "epoch": 1.6200768491834774, "grad_norm": 1.171875, "learning_rate": 0.00047421485840172794, "loss": 5.0478, "mean_token_accuracy": 0.20666339248418808, "num_tokens": 38640798.0, "step": 16865 }, { "entropy": 5.245220804214478, "epoch": 1.6205571565802113, "grad_norm": 1.1015625, "learning_rate": 0.0004741989200262851, "loss": 4.9676, "mean_token_accuracy": 0.21834530234336852, "num_tokens": 38653048.0, "step": 16870 }, { "entropy": 5.228298997879028, "epoch": 1.6210374639769451, "grad_norm": 1.265625, "learning_rate": 0.0004741829770260347, "loss": 5.0116, "mean_token_accuracy": 0.21510974317789078, "num_tokens": 38666219.0, "step": 16875 }, { "entropy": 5.2042152881622314, "epoch": 1.621517771373679, "grad_norm": 1.2734375, "learning_rate": 0.00047416702940134714, "loss": 4.909, "mean_token_accuracy": 0.22228912860155106, "num_tokens": 38678584.0, "step": 16880 }, { "entropy": 5.181402587890625, "epoch": 1.621998078770413, "grad_norm": 1.109375, "learning_rate": 0.00047415107715259255, "loss": 4.9377, "mean_token_accuracy": 0.21005474478006364, "num_tokens": 38690164.0, "step": 16885 }, { "entropy": 5.208297920227051, "epoch": 1.622478386167147, "grad_norm": 1.28125, "learning_rate": 0.00047413512028014125, "loss": 5.0103, "mean_token_accuracy": 0.21610539108514787, "num_tokens": 38702035.0, "step": 16890 }, { "entropy": 5.2519388675689695, "epoch": 1.622958693563881, "grad_norm": 1.2734375, "learning_rate": 0.0004741191587843638, "loss": 5.0296, "mean_token_accuracy": 0.21492973119020461, "num_tokens": 38713802.0, "step": 16895 }, { "entropy": 5.190322732925415, "epoch": 1.6234390009606148, "grad_norm": 1.21875, "learning_rate": 0.0004741031926656308, "loss": 4.8923, "mean_token_accuracy": 0.22572966963052749, "num_tokens": 38725682.0, "step": 16900 }, { "entropy": 5.178541278839111, "epoch": 1.6239193083573487, "grad_norm": 1.140625, "learning_rate": 0.0004740872219243128, "loss": 4.9072, "mean_token_accuracy": 0.21873684823513032, "num_tokens": 38737897.0, "step": 16905 }, { "entropy": 5.229857921600342, "epoch": 1.6243996157540828, "grad_norm": 1.2578125, "learning_rate": 0.0004740712465607807, "loss": 5.018, "mean_token_accuracy": 0.2116595149040222, "num_tokens": 38749446.0, "step": 16910 }, { "entropy": 5.176464700698853, "epoch": 1.6248799231508166, "grad_norm": 1.203125, "learning_rate": 0.0004740552665754054, "loss": 4.9725, "mean_token_accuracy": 0.21386126130819322, "num_tokens": 38761406.0, "step": 16915 }, { "entropy": 5.204869890213013, "epoch": 1.6253602305475505, "grad_norm": 1.1875, "learning_rate": 0.00047403928196855776, "loss": 4.976, "mean_token_accuracy": 0.22327034771442414, "num_tokens": 38772574.0, "step": 16920 }, { "entropy": 5.201022005081176, "epoch": 1.6258405379442844, "grad_norm": 1.1875, "learning_rate": 0.00047402329274060916, "loss": 4.9489, "mean_token_accuracy": 0.22577953338623047, "num_tokens": 38782932.0, "step": 16925 }, { "entropy": 5.209049367904663, "epoch": 1.6263208453410183, "grad_norm": 1.171875, "learning_rate": 0.0004740072988919306, "loss": 5.009, "mean_token_accuracy": 0.21278314143419266, "num_tokens": 38793799.0, "step": 16930 }, { "entropy": 5.073943376541138, "epoch": 1.6268011527377522, "grad_norm": 1.171875, "learning_rate": 0.0004739913004228936, "loss": 4.8296, "mean_token_accuracy": 0.2272112175822258, "num_tokens": 38804899.0, "step": 16935 }, { "entropy": 5.17745361328125, "epoch": 1.627281460134486, "grad_norm": 1.28125, "learning_rate": 0.0004739752973338694, "loss": 5.0203, "mean_token_accuracy": 0.20754062682390212, "num_tokens": 38816455.0, "step": 16940 }, { "entropy": 5.232742404937744, "epoch": 1.62776176753122, "grad_norm": 1.2265625, "learning_rate": 0.00047395928962522965, "loss": 4.9653, "mean_token_accuracy": 0.21127762645483017, "num_tokens": 38827735.0, "step": 16945 }, { "entropy": 5.1770717144012455, "epoch": 1.6282420749279538, "grad_norm": 1.234375, "learning_rate": 0.00047394327729734595, "loss": 4.9796, "mean_token_accuracy": 0.2126038447022438, "num_tokens": 38838536.0, "step": 16950 }, { "entropy": 5.273472642898559, "epoch": 1.6287223823246877, "grad_norm": 1.1484375, "learning_rate": 0.0004739272603505901, "loss": 5.0236, "mean_token_accuracy": 0.21635421216487885, "num_tokens": 38849577.0, "step": 16955 }, { "entropy": 5.277729892730713, "epoch": 1.6292026897214216, "grad_norm": 1.2421875, "learning_rate": 0.000473911238785334, "loss": 4.981, "mean_token_accuracy": 0.21457867622375487, "num_tokens": 38861370.0, "step": 16960 }, { "entropy": 5.16580753326416, "epoch": 1.6296829971181557, "grad_norm": 1.3671875, "learning_rate": 0.0004738952126019496, "loss": 4.927, "mean_token_accuracy": 0.21482086628675462, "num_tokens": 38872885.0, "step": 16965 }, { "entropy": 5.1759748458862305, "epoch": 1.6301633045148896, "grad_norm": 1.1875, "learning_rate": 0.0004738791818008089, "loss": 4.9798, "mean_token_accuracy": 0.21594414860010147, "num_tokens": 38885440.0, "step": 16970 }, { "entropy": 5.167402601242065, "epoch": 1.6306436119116234, "grad_norm": 1.3203125, "learning_rate": 0.0004738631463822841, "loss": 4.8927, "mean_token_accuracy": 0.22061660438776015, "num_tokens": 38897108.0, "step": 16975 }, { "entropy": 5.139171504974366, "epoch": 1.6311239193083573, "grad_norm": 1.25, "learning_rate": 0.00047384710634674766, "loss": 4.9746, "mean_token_accuracy": 0.21460178643465042, "num_tokens": 38908020.0, "step": 16980 }, { "entropy": 5.231308460235596, "epoch": 1.6316042267050914, "grad_norm": 1.390625, "learning_rate": 0.00047383106169457184, "loss": 4.982, "mean_token_accuracy": 0.21495762020349501, "num_tokens": 38919558.0, "step": 16985 }, { "entropy": 5.212464714050293, "epoch": 1.6320845341018253, "grad_norm": 1.2734375, "learning_rate": 0.0004738150124261292, "loss": 4.9595, "mean_token_accuracy": 0.22091935575008392, "num_tokens": 38931008.0, "step": 16990 }, { "entropy": 5.120176219940186, "epoch": 1.6325648414985592, "grad_norm": 1.1640625, "learning_rate": 0.00047379895854179226, "loss": 4.8583, "mean_token_accuracy": 0.22508623749017714, "num_tokens": 38943474.0, "step": 16995 }, { "entropy": 5.216991710662842, "epoch": 1.633045148895293, "grad_norm": 1.203125, "learning_rate": 0.0004737829000419338, "loss": 5.051, "mean_token_accuracy": 0.2126992627978325, "num_tokens": 38954592.0, "step": 17000 }, { "entropy": 5.210864067077637, "epoch": 1.633525456292027, "grad_norm": 1.2109375, "learning_rate": 0.00047376683692692666, "loss": 4.9369, "mean_token_accuracy": 0.21950011253356932, "num_tokens": 38965627.0, "step": 17005 }, { "entropy": 5.096020603179932, "epoch": 1.6340057636887608, "grad_norm": 1.2578125, "learning_rate": 0.0004737507691971439, "loss": 4.9038, "mean_token_accuracy": 0.2222321853041649, "num_tokens": 38975731.0, "step": 17010 }, { "entropy": 5.133788967132569, "epoch": 1.6344860710854947, "grad_norm": 1.3125, "learning_rate": 0.00047373469685295833, "loss": 4.9094, "mean_token_accuracy": 0.22627927511930465, "num_tokens": 38987101.0, "step": 17015 }, { "entropy": 5.1970940113067625, "epoch": 1.6349663784822286, "grad_norm": 1.1875, "learning_rate": 0.00047371861989474326, "loss": 4.957, "mean_token_accuracy": 0.22381552755832673, "num_tokens": 38999584.0, "step": 17020 }, { "entropy": 5.131382656097412, "epoch": 1.6354466858789625, "grad_norm": 1.2265625, "learning_rate": 0.0004737025383228719, "loss": 4.9378, "mean_token_accuracy": 0.2247656613588333, "num_tokens": 39012025.0, "step": 17025 }, { "entropy": 5.053452491760254, "epoch": 1.6359269932756964, "grad_norm": 1.453125, "learning_rate": 0.00047368645213771764, "loss": 4.8227, "mean_token_accuracy": 0.2244688794016838, "num_tokens": 39024102.0, "step": 17030 }, { "entropy": 5.165864324569702, "epoch": 1.6364073006724302, "grad_norm": 1.4296875, "learning_rate": 0.000473670361339654, "loss": 4.957, "mean_token_accuracy": 0.2133333921432495, "num_tokens": 39035295.0, "step": 17035 }, { "entropy": 5.164595079421997, "epoch": 1.6368876080691641, "grad_norm": 1.15625, "learning_rate": 0.0004736542659290544, "loss": 4.8585, "mean_token_accuracy": 0.22263574600219727, "num_tokens": 39046493.0, "step": 17040 }, { "entropy": 5.168615007400513, "epoch": 1.6373679154658982, "grad_norm": 1.2578125, "learning_rate": 0.0004736381659062927, "loss": 4.9059, "mean_token_accuracy": 0.21746231317520143, "num_tokens": 39056746.0, "step": 17045 }, { "entropy": 5.183399057388305, "epoch": 1.637848222862632, "grad_norm": 1.34375, "learning_rate": 0.00047362206127174255, "loss": 4.9483, "mean_token_accuracy": 0.2127215713262558, "num_tokens": 39068263.0, "step": 17050 }, { "entropy": 5.211107921600342, "epoch": 1.638328530259366, "grad_norm": 1.1953125, "learning_rate": 0.00047360595202577786, "loss": 4.9733, "mean_token_accuracy": 0.21741271317005156, "num_tokens": 39080666.0, "step": 17055 }, { "entropy": 5.103889846801758, "epoch": 1.6388088376560999, "grad_norm": 1.234375, "learning_rate": 0.00047358983816877284, "loss": 4.9431, "mean_token_accuracy": 0.21217281520366668, "num_tokens": 39091851.0, "step": 17060 }, { "entropy": 5.235210275650024, "epoch": 1.639289145052834, "grad_norm": 1.3046875, "learning_rate": 0.0004735737197011015, "loss": 4.9498, "mean_token_accuracy": 0.2229301705956459, "num_tokens": 39102893.0, "step": 17065 }, { "entropy": 5.150740432739258, "epoch": 1.6397694524495678, "grad_norm": 1.328125, "learning_rate": 0.00047355759662313793, "loss": 4.8814, "mean_token_accuracy": 0.2147470995783806, "num_tokens": 39113582.0, "step": 17070 }, { "entropy": 5.24199595451355, "epoch": 1.6402497598463017, "grad_norm": 1.203125, "learning_rate": 0.0004735414689352566, "loss": 5.0776, "mean_token_accuracy": 0.21010097116231918, "num_tokens": 39124616.0, "step": 17075 }, { "entropy": 5.12250599861145, "epoch": 1.6407300672430356, "grad_norm": 1.296875, "learning_rate": 0.0004735253366378318, "loss": 4.8727, "mean_token_accuracy": 0.22840944528579712, "num_tokens": 39135537.0, "step": 17080 }, { "entropy": 5.1794140338897705, "epoch": 1.6412103746397695, "grad_norm": 1.25, "learning_rate": 0.0004735091997312383, "loss": 4.9501, "mean_token_accuracy": 0.22090162485837936, "num_tokens": 39146363.0, "step": 17085 }, { "entropy": 5.157111310958863, "epoch": 1.6416906820365034, "grad_norm": 1.296875, "learning_rate": 0.00047349305821585067, "loss": 4.91, "mean_token_accuracy": 0.22369770109653472, "num_tokens": 39157319.0, "step": 17090 }, { "entropy": 5.173314619064331, "epoch": 1.6421709894332372, "grad_norm": 1.2109375, "learning_rate": 0.0004734769120920435, "loss": 4.9141, "mean_token_accuracy": 0.22113776504993438, "num_tokens": 39169352.0, "step": 17095 }, { "entropy": 5.206317377090454, "epoch": 1.6426512968299711, "grad_norm": 1.21875, "learning_rate": 0.0004734607613601919, "loss": 4.9054, "mean_token_accuracy": 0.22213377356529235, "num_tokens": 39180989.0, "step": 17100 }, { "entropy": 5.174010324478149, "epoch": 1.643131604226705, "grad_norm": 1.2734375, "learning_rate": 0.00047344460602067077, "loss": 4.9605, "mean_token_accuracy": 0.21831177175045013, "num_tokens": 39192309.0, "step": 17105 }, { "entropy": 5.108195209503174, "epoch": 1.643611911623439, "grad_norm": 1.203125, "learning_rate": 0.000473428446073855, "loss": 4.9048, "mean_token_accuracy": 0.21822543889284135, "num_tokens": 39203586.0, "step": 17110 }, { "entropy": 5.1728309154510494, "epoch": 1.6440922190201728, "grad_norm": 1.265625, "learning_rate": 0.00047341228152012003, "loss": 4.9358, "mean_token_accuracy": 0.2138598531484604, "num_tokens": 39216248.0, "step": 17115 }, { "entropy": 5.217025470733643, "epoch": 1.6445725264169067, "grad_norm": 1.234375, "learning_rate": 0.000473396112359841, "loss": 4.9216, "mean_token_accuracy": 0.21717059910297393, "num_tokens": 39227244.0, "step": 17120 }, { "entropy": 5.171856927871704, "epoch": 1.6450528338136408, "grad_norm": 1.171875, "learning_rate": 0.00047337993859339334, "loss": 4.8631, "mean_token_accuracy": 0.22649723738431932, "num_tokens": 39238769.0, "step": 17125 }, { "entropy": 5.140405130386353, "epoch": 1.6455331412103746, "grad_norm": 1.1953125, "learning_rate": 0.00047336376022115255, "loss": 4.9605, "mean_token_accuracy": 0.21623745262622834, "num_tokens": 39251424.0, "step": 17130 }, { "entropy": 5.150231647491455, "epoch": 1.6460134486071085, "grad_norm": 1.1484375, "learning_rate": 0.00047334757724349437, "loss": 4.9106, "mean_token_accuracy": 0.2248495638370514, "num_tokens": 39262451.0, "step": 17135 }, { "entropy": 5.156459474563599, "epoch": 1.6464937560038426, "grad_norm": 1.1484375, "learning_rate": 0.0004733313896607943, "loss": 4.9223, "mean_token_accuracy": 0.2204621374607086, "num_tokens": 39275129.0, "step": 17140 }, { "entropy": 5.195129108428955, "epoch": 1.6469740634005765, "grad_norm": 1.1640625, "learning_rate": 0.0004733151974734284, "loss": 5.0455, "mean_token_accuracy": 0.21298815310001373, "num_tokens": 39287399.0, "step": 17145 }, { "entropy": 5.2080107688903805, "epoch": 1.6474543707973104, "grad_norm": 1.1796875, "learning_rate": 0.00047329900068177245, "loss": 4.8904, "mean_token_accuracy": 0.22203465551137924, "num_tokens": 39297755.0, "step": 17150 }, { "entropy": 5.27492470741272, "epoch": 1.6479346781940443, "grad_norm": 1.2265625, "learning_rate": 0.00047328279928620244, "loss": 5.0159, "mean_token_accuracy": 0.20743546783924102, "num_tokens": 39311012.0, "step": 17155 }, { "entropy": 5.191003847122192, "epoch": 1.6484149855907781, "grad_norm": 1.15625, "learning_rate": 0.0004732665932870947, "loss": 4.916, "mean_token_accuracy": 0.22007073909044267, "num_tokens": 39321797.0, "step": 17160 }, { "entropy": 5.178836727142334, "epoch": 1.648895292987512, "grad_norm": 1.1171875, "learning_rate": 0.00047325038268482544, "loss": 4.8881, "mean_token_accuracy": 0.23106451481580734, "num_tokens": 39333598.0, "step": 17165 }, { "entropy": 5.215980386734008, "epoch": 1.649375600384246, "grad_norm": 1.2421875, "learning_rate": 0.0004732341674797709, "loss": 4.9832, "mean_token_accuracy": 0.21182733327150344, "num_tokens": 39345578.0, "step": 17170 }, { "entropy": 5.085534715652466, "epoch": 1.6498559077809798, "grad_norm": 1.25, "learning_rate": 0.00047321794767230766, "loss": 4.849, "mean_token_accuracy": 0.2237042009830475, "num_tokens": 39356744.0, "step": 17175 }, { "entropy": 5.203284645080567, "epoch": 1.6503362151777137, "grad_norm": 1.2734375, "learning_rate": 0.00047320172326281224, "loss": 4.9813, "mean_token_accuracy": 0.21088655143976212, "num_tokens": 39368138.0, "step": 17180 }, { "entropy": 5.088240718841552, "epoch": 1.6508165225744476, "grad_norm": 1.5625, "learning_rate": 0.00047318549425166134, "loss": 4.825, "mean_token_accuracy": 0.2208867460489273, "num_tokens": 39380117.0, "step": 17185 }, { "entropy": 5.144798612594604, "epoch": 1.6512968299711814, "grad_norm": 1.296875, "learning_rate": 0.0004731692606392318, "loss": 4.9032, "mean_token_accuracy": 0.21989178657531738, "num_tokens": 39390913.0, "step": 17190 }, { "entropy": 5.259878635406494, "epoch": 1.6517771373679153, "grad_norm": 1.265625, "learning_rate": 0.0004731530224259004, "loss": 5.0399, "mean_token_accuracy": 0.209253753721714, "num_tokens": 39401001.0, "step": 17195 }, { "entropy": 5.167004823684692, "epoch": 1.6522574447646494, "grad_norm": 1.1796875, "learning_rate": 0.0004731367796120442, "loss": 4.9199, "mean_token_accuracy": 0.2191713660955429, "num_tokens": 39411982.0, "step": 17200 }, { "entropy": 5.236986684799194, "epoch": 1.6527377521613833, "grad_norm": 1.1953125, "learning_rate": 0.0004731205321980404, "loss": 4.9493, "mean_token_accuracy": 0.21665553301572799, "num_tokens": 39423363.0, "step": 17205 }, { "entropy": 5.178996753692627, "epoch": 1.6532180595581172, "grad_norm": 1.2578125, "learning_rate": 0.00047310428018426616, "loss": 4.9931, "mean_token_accuracy": 0.21703283488750458, "num_tokens": 39434002.0, "step": 17210 }, { "entropy": 5.136433792114258, "epoch": 1.653698366954851, "grad_norm": 1.5390625, "learning_rate": 0.0004730880235710987, "loss": 4.8818, "mean_token_accuracy": 0.2200036182999611, "num_tokens": 39444629.0, "step": 17215 }, { "entropy": 5.1588235855102536, "epoch": 1.6541786743515852, "grad_norm": 1.21875, "learning_rate": 0.0004730717623589155, "loss": 4.8502, "mean_token_accuracy": 0.23025956898927688, "num_tokens": 39454803.0, "step": 17220 }, { "entropy": 5.160949420928955, "epoch": 1.654658981748319, "grad_norm": 1.1875, "learning_rate": 0.0004730554965480942, "loss": 4.9935, "mean_token_accuracy": 0.2163141682744026, "num_tokens": 39466931.0, "step": 17225 }, { "entropy": 5.18949818611145, "epoch": 1.655139289145053, "grad_norm": 1.4609375, "learning_rate": 0.0004730392261390124, "loss": 4.9649, "mean_token_accuracy": 0.22342453449964522, "num_tokens": 39478127.0, "step": 17230 }, { "entropy": 5.222326755523682, "epoch": 1.6556195965417868, "grad_norm": 1.203125, "learning_rate": 0.0004730229511320478, "loss": 4.9707, "mean_token_accuracy": 0.21511962711811067, "num_tokens": 39488188.0, "step": 17235 }, { "entropy": 5.210654878616333, "epoch": 1.6560999039385207, "grad_norm": 1.28125, "learning_rate": 0.00047300667152757827, "loss": 4.9296, "mean_token_accuracy": 0.22033513486385345, "num_tokens": 39500035.0, "step": 17240 }, { "entropy": 5.2070694923400875, "epoch": 1.6565802113352546, "grad_norm": 1.3984375, "learning_rate": 0.00047299038732598184, "loss": 4.999, "mean_token_accuracy": 0.21806135773658752, "num_tokens": 39510922.0, "step": 17245 }, { "entropy": 5.182500553131104, "epoch": 1.6570605187319885, "grad_norm": 1.28125, "learning_rate": 0.00047297409852763644, "loss": 4.997, "mean_token_accuracy": 0.22009943872690202, "num_tokens": 39521319.0, "step": 17250 }, { "entropy": 5.2111945152282715, "epoch": 1.6575408261287223, "grad_norm": 1.359375, "learning_rate": 0.0004729578051329204, "loss": 4.9534, "mean_token_accuracy": 0.21612063497304917, "num_tokens": 39532408.0, "step": 17255 }, { "entropy": 5.219672155380249, "epoch": 1.6580211335254562, "grad_norm": 1.4296875, "learning_rate": 0.00047294150714221185, "loss": 5.0105, "mean_token_accuracy": 0.21210616379976271, "num_tokens": 39543323.0, "step": 17260 }, { "entropy": 5.242244911193848, "epoch": 1.65850144092219, "grad_norm": 1.15625, "learning_rate": 0.0004729252045558894, "loss": 4.9603, "mean_token_accuracy": 0.21582386493682862, "num_tokens": 39554133.0, "step": 17265 }, { "entropy": 5.154227447509766, "epoch": 1.658981748318924, "grad_norm": 1.3515625, "learning_rate": 0.00047290889737433133, "loss": 4.84, "mean_token_accuracy": 0.22423603981733323, "num_tokens": 39565990.0, "step": 17270 }, { "entropy": 5.206848955154419, "epoch": 1.6594620557156579, "grad_norm": 1.21875, "learning_rate": 0.00047289258559791633, "loss": 4.9429, "mean_token_accuracy": 0.2176191046833992, "num_tokens": 39577591.0, "step": 17275 }, { "entropy": 5.2025104522705075, "epoch": 1.659942363112392, "grad_norm": 1.203125, "learning_rate": 0.00047287626922702317, "loss": 4.9147, "mean_token_accuracy": 0.21797798275947572, "num_tokens": 39589922.0, "step": 17280 }, { "entropy": 5.126957130432129, "epoch": 1.6604226705091258, "grad_norm": 1.2109375, "learning_rate": 0.00047285994826203054, "loss": 4.9458, "mean_token_accuracy": 0.22071049809455873, "num_tokens": 39602183.0, "step": 17285 }, { "entropy": 5.20958571434021, "epoch": 1.6609029779058597, "grad_norm": 1.203125, "learning_rate": 0.0004728436227033175, "loss": 4.9578, "mean_token_accuracy": 0.21979390680789948, "num_tokens": 39612922.0, "step": 17290 }, { "entropy": 5.232578420639038, "epoch": 1.6613832853025938, "grad_norm": 1.3125, "learning_rate": 0.00047282729255126294, "loss": 4.9504, "mean_token_accuracy": 0.2208220601081848, "num_tokens": 39624475.0, "step": 17295 }, { "entropy": 5.023839282989502, "epoch": 1.6618635926993277, "grad_norm": 1.2265625, "learning_rate": 0.0004728109578062461, "loss": 4.8012, "mean_token_accuracy": 0.2280938968062401, "num_tokens": 39635230.0, "step": 17300 }, { "entropy": 5.088182401657105, "epoch": 1.6623439000960616, "grad_norm": 1.125, "learning_rate": 0.00047279461846864626, "loss": 4.9072, "mean_token_accuracy": 0.22009400725364686, "num_tokens": 39646788.0, "step": 17305 }, { "entropy": 5.137494707107544, "epoch": 1.6628242074927955, "grad_norm": 1.1484375, "learning_rate": 0.00047277827453884265, "loss": 4.8803, "mean_token_accuracy": 0.2225083142518997, "num_tokens": 39657531.0, "step": 17310 }, { "entropy": 5.292917203903198, "epoch": 1.6633045148895294, "grad_norm": 1.359375, "learning_rate": 0.00047276192601721477, "loss": 5.0619, "mean_token_accuracy": 0.20813206434249878, "num_tokens": 39669008.0, "step": 17315 }, { "entropy": 5.207920122146606, "epoch": 1.6637848222862632, "grad_norm": 1.1796875, "learning_rate": 0.0004727455729041422, "loss": 4.9061, "mean_token_accuracy": 0.22048480212688445, "num_tokens": 39680772.0, "step": 17320 }, { "entropy": 5.23115291595459, "epoch": 1.6642651296829971, "grad_norm": 1.3671875, "learning_rate": 0.00047272921520000465, "loss": 5.0517, "mean_token_accuracy": 0.21050333827733994, "num_tokens": 39694107.0, "step": 17325 }, { "entropy": 5.18481068611145, "epoch": 1.664745437079731, "grad_norm": 1.2109375, "learning_rate": 0.0004727128529051819, "loss": 4.9103, "mean_token_accuracy": 0.21743627935647963, "num_tokens": 39705440.0, "step": 17330 }, { "entropy": 5.171319675445557, "epoch": 1.6652257444764649, "grad_norm": 1.171875, "learning_rate": 0.0004726964860200537, "loss": 4.9121, "mean_token_accuracy": 0.2184045359492302, "num_tokens": 39716246.0, "step": 17335 }, { "entropy": 5.164891290664673, "epoch": 1.6657060518731988, "grad_norm": 1.3125, "learning_rate": 0.0004726801145450002, "loss": 4.9312, "mean_token_accuracy": 0.21594095528125762, "num_tokens": 39727233.0, "step": 17340 }, { "entropy": 5.196539783477784, "epoch": 1.6661863592699326, "grad_norm": 1.25, "learning_rate": 0.0004726637384804014, "loss": 4.9045, "mean_token_accuracy": 0.2156997725367546, "num_tokens": 39738929.0, "step": 17345 }, { "entropy": 5.152793312072754, "epoch": 1.6666666666666665, "grad_norm": 1.1953125, "learning_rate": 0.0004726473578266375, "loss": 4.9228, "mean_token_accuracy": 0.21952597051858902, "num_tokens": 39750714.0, "step": 17350 }, { "entropy": 5.183078670501709, "epoch": 1.6671469740634006, "grad_norm": 1.46875, "learning_rate": 0.00047263097258408893, "loss": 4.983, "mean_token_accuracy": 0.21496337354183198, "num_tokens": 39762810.0, "step": 17355 }, { "entropy": 5.259543704986572, "epoch": 1.6676272814601345, "grad_norm": 1.359375, "learning_rate": 0.0004726145827531359, "loss": 5.0234, "mean_token_accuracy": 0.21382358223199843, "num_tokens": 39774611.0, "step": 17360 }, { "entropy": 5.12020697593689, "epoch": 1.6681075888568684, "grad_norm": 1.2265625, "learning_rate": 0.00047259818833415916, "loss": 4.8302, "mean_token_accuracy": 0.2306036338210106, "num_tokens": 39784900.0, "step": 17365 }, { "entropy": 5.213765668869018, "epoch": 1.6685878962536023, "grad_norm": 1.3359375, "learning_rate": 0.00047258178932753917, "loss": 4.9686, "mean_token_accuracy": 0.2193788021802902, "num_tokens": 39794805.0, "step": 17370 }, { "entropy": 5.206267833709717, "epoch": 1.6690682036503364, "grad_norm": 1.28125, "learning_rate": 0.00047256538573365675, "loss": 4.9778, "mean_token_accuracy": 0.212169349193573, "num_tokens": 39806444.0, "step": 17375 }, { "entropy": 5.318323183059692, "epoch": 1.6695485110470702, "grad_norm": 1.25, "learning_rate": 0.0004725489775528928, "loss": 5.116, "mean_token_accuracy": 0.20811543017625808, "num_tokens": 39817724.0, "step": 17380 }, { "entropy": 5.151338577270508, "epoch": 1.6700288184438041, "grad_norm": 1.3203125, "learning_rate": 0.00047253256478562805, "loss": 4.9153, "mean_token_accuracy": 0.2115355148911476, "num_tokens": 39828944.0, "step": 17385 }, { "entropy": 5.144230937957763, "epoch": 1.670509125840538, "grad_norm": 1.2890625, "learning_rate": 0.00047251614743224374, "loss": 4.9378, "mean_token_accuracy": 0.2210545301437378, "num_tokens": 39839978.0, "step": 17390 }, { "entropy": 5.192106771469116, "epoch": 1.670989433237272, "grad_norm": 1.1875, "learning_rate": 0.00047249972549312107, "loss": 4.9251, "mean_token_accuracy": 0.22135266661643982, "num_tokens": 39850787.0, "step": 17395 }, { "entropy": 5.250539684295655, "epoch": 1.6714697406340058, "grad_norm": 1.265625, "learning_rate": 0.0004724832989686411, "loss": 5.0037, "mean_token_accuracy": 0.21468748897314072, "num_tokens": 39862245.0, "step": 17400 }, { "entropy": 5.163530015945435, "epoch": 1.6719500480307397, "grad_norm": 1.3359375, "learning_rate": 0.00047246686785918545, "loss": 4.9126, "mean_token_accuracy": 0.22578096389770508, "num_tokens": 39872295.0, "step": 17405 }, { "entropy": 5.088949918746948, "epoch": 1.6724303554274735, "grad_norm": 1.109375, "learning_rate": 0.00047245043216513546, "loss": 4.8559, "mean_token_accuracy": 0.21463808417320251, "num_tokens": 39882839.0, "step": 17410 }, { "entropy": 5.105159568786621, "epoch": 1.6729106628242074, "grad_norm": 1.2734375, "learning_rate": 0.0004724339918868727, "loss": 4.856, "mean_token_accuracy": 0.221045646071434, "num_tokens": 39894767.0, "step": 17415 }, { "entropy": 5.254346513748169, "epoch": 1.6733909702209413, "grad_norm": 1.28125, "learning_rate": 0.000472417547024779, "loss": 5.0189, "mean_token_accuracy": 0.20958801060914994, "num_tokens": 39907492.0, "step": 17420 }, { "entropy": 5.103402757644654, "epoch": 1.6738712776176752, "grad_norm": 1.2890625, "learning_rate": 0.00047240109757923593, "loss": 4.8019, "mean_token_accuracy": 0.22363511472940445, "num_tokens": 39919005.0, "step": 17425 }, { "entropy": 5.133781242370605, "epoch": 1.674351585014409, "grad_norm": 1.3984375, "learning_rate": 0.0004723846435506256, "loss": 4.9053, "mean_token_accuracy": 0.22093903720378877, "num_tokens": 39930417.0, "step": 17430 }, { "entropy": 5.136504316329956, "epoch": 1.6748318924111432, "grad_norm": 1.1796875, "learning_rate": 0.00047236818493932994, "loss": 4.9295, "mean_token_accuracy": 0.21972116082906723, "num_tokens": 39941873.0, "step": 17435 }, { "entropy": 5.218835639953613, "epoch": 1.675312199807877, "grad_norm": 1.1171875, "learning_rate": 0.0004723517217457311, "loss": 4.9594, "mean_token_accuracy": 0.2127823770046234, "num_tokens": 39953280.0, "step": 17440 }, { "entropy": 5.266405916213989, "epoch": 1.675792507204611, "grad_norm": 1.265625, "learning_rate": 0.0004723352539702113, "loss": 5.033, "mean_token_accuracy": 0.2113511174917221, "num_tokens": 39964898.0, "step": 17445 }, { "entropy": 5.169444513320923, "epoch": 1.6762728146013448, "grad_norm": 1.21875, "learning_rate": 0.0004723187816131529, "loss": 4.8971, "mean_token_accuracy": 0.22143382728099822, "num_tokens": 39976945.0, "step": 17450 }, { "entropy": 5.21337661743164, "epoch": 1.676753121998079, "grad_norm": 1.328125, "learning_rate": 0.0004723023046749383, "loss": 5.0068, "mean_token_accuracy": 0.21935284435749053, "num_tokens": 39987760.0, "step": 17455 }, { "entropy": 5.109252262115478, "epoch": 1.6772334293948128, "grad_norm": 1.2265625, "learning_rate": 0.00047228582315595, "loss": 4.7994, "mean_token_accuracy": 0.2287563070654869, "num_tokens": 39998171.0, "step": 17460 }, { "entropy": 5.1659423351287845, "epoch": 1.6777137367915467, "grad_norm": 1.15625, "learning_rate": 0.0004722693370565708, "loss": 4.9056, "mean_token_accuracy": 0.21699397414922714, "num_tokens": 40010124.0, "step": 17465 }, { "entropy": 5.1289918422698975, "epoch": 1.6781940441882806, "grad_norm": 1.203125, "learning_rate": 0.00047225284637718323, "loss": 4.9364, "mean_token_accuracy": 0.21586138755083084, "num_tokens": 40022599.0, "step": 17470 }, { "entropy": 5.150423860549926, "epoch": 1.6786743515850144, "grad_norm": 1.1953125, "learning_rate": 0.0004722363511181703, "loss": 4.8901, "mean_token_accuracy": 0.2244936302304268, "num_tokens": 40033738.0, "step": 17475 }, { "entropy": 5.188096809387207, "epoch": 1.6791546589817483, "grad_norm": 1.171875, "learning_rate": 0.000472219851279915, "loss": 4.9457, "mean_token_accuracy": 0.2202860251069069, "num_tokens": 40044148.0, "step": 17480 }, { "entropy": 5.223975753784179, "epoch": 1.6796349663784822, "grad_norm": 1.265625, "learning_rate": 0.0004722033468628004, "loss": 4.9322, "mean_token_accuracy": 0.22158615589141845, "num_tokens": 40055430.0, "step": 17485 }, { "entropy": 5.1668178081512455, "epoch": 1.680115273775216, "grad_norm": 1.4140625, "learning_rate": 0.0004721868378672098, "loss": 4.9149, "mean_token_accuracy": 0.2159278705716133, "num_tokens": 40066147.0, "step": 17490 }, { "entropy": 5.178054475784302, "epoch": 1.68059558117195, "grad_norm": 1.2109375, "learning_rate": 0.0004721703242935261, "loss": 4.9165, "mean_token_accuracy": 0.21332445442676545, "num_tokens": 40077567.0, "step": 17495 }, { "entropy": 5.231701517105103, "epoch": 1.6810758885686838, "grad_norm": 1.1953125, "learning_rate": 0.0004721538061421331, "loss": 5.005, "mean_token_accuracy": 0.21199633330106735, "num_tokens": 40089544.0, "step": 17500 }, { "entropy": 5.082833242416382, "epoch": 1.6815561959654177, "grad_norm": 1.2578125, "learning_rate": 0.00047213728341341407, "loss": 4.8582, "mean_token_accuracy": 0.22495235800743102, "num_tokens": 40100557.0, "step": 17505 }, { "entropy": 5.198224449157715, "epoch": 1.6820365033621518, "grad_norm": 1.140625, "learning_rate": 0.0004721207561077527, "loss": 4.9242, "mean_token_accuracy": 0.21465859711170196, "num_tokens": 40112052.0, "step": 17510 }, { "entropy": 5.35629358291626, "epoch": 1.6825168107588857, "grad_norm": 1.140625, "learning_rate": 0.0004721042242255327, "loss": 5.1065, "mean_token_accuracy": 0.20359711796045304, "num_tokens": 40123830.0, "step": 17515 }, { "entropy": 5.199491548538208, "epoch": 1.6829971181556196, "grad_norm": 1.2734375, "learning_rate": 0.00047208768776713805, "loss": 4.9982, "mean_token_accuracy": 0.22124958634376526, "num_tokens": 40134222.0, "step": 17520 }, { "entropy": 5.147200632095337, "epoch": 1.6834774255523535, "grad_norm": 1.25, "learning_rate": 0.0004720711467329523, "loss": 4.9883, "mean_token_accuracy": 0.21965805292129517, "num_tokens": 40145552.0, "step": 17525 }, { "entropy": 5.155480766296387, "epoch": 1.6839577329490876, "grad_norm": 1.2578125, "learning_rate": 0.0004720546011233599, "loss": 4.8324, "mean_token_accuracy": 0.2238232597708702, "num_tokens": 40156615.0, "step": 17530 }, { "entropy": 5.144622659683227, "epoch": 1.6844380403458215, "grad_norm": 1.3203125, "learning_rate": 0.0004720380509387446, "loss": 4.932, "mean_token_accuracy": 0.22399861961603165, "num_tokens": 40168825.0, "step": 17535 }, { "entropy": 5.190985679626465, "epoch": 1.6849183477425553, "grad_norm": 1.3046875, "learning_rate": 0.000472021496179491, "loss": 4.9261, "mean_token_accuracy": 0.22324578315019608, "num_tokens": 40179523.0, "step": 17540 }, { "entropy": 5.129342079162598, "epoch": 1.6853986551392892, "grad_norm": 1.1171875, "learning_rate": 0.00047200493684598316, "loss": 4.8848, "mean_token_accuracy": 0.22110755145549774, "num_tokens": 40191362.0, "step": 17545 }, { "entropy": 5.168604230880737, "epoch": 1.685878962536023, "grad_norm": 1.1796875, "learning_rate": 0.00047198837293860573, "loss": 4.9654, "mean_token_accuracy": 0.21929350346326829, "num_tokens": 40202274.0, "step": 17550 }, { "entropy": 5.244691181182861, "epoch": 1.686359269932757, "grad_norm": 1.3203125, "learning_rate": 0.0004719718044577432, "loss": 4.9938, "mean_token_accuracy": 0.20958704501390457, "num_tokens": 40213907.0, "step": 17555 }, { "entropy": 5.152917718887329, "epoch": 1.6868395773294909, "grad_norm": 1.265625, "learning_rate": 0.00047195523140378034, "loss": 4.9344, "mean_token_accuracy": 0.21637397557497023, "num_tokens": 40225300.0, "step": 17560 }, { "entropy": 5.109961271286011, "epoch": 1.6873198847262247, "grad_norm": 1.125, "learning_rate": 0.00047193865377710177, "loss": 4.8457, "mean_token_accuracy": 0.22791109681129457, "num_tokens": 40236197.0, "step": 17565 }, { "entropy": 5.227808856964112, "epoch": 1.6878001921229586, "grad_norm": 1.3046875, "learning_rate": 0.00047192207157809246, "loss": 5.0596, "mean_token_accuracy": 0.2109197899699211, "num_tokens": 40247887.0, "step": 17570 }, { "entropy": 5.263893365859985, "epoch": 1.6882804995196925, "grad_norm": 1.2734375, "learning_rate": 0.00047190548480713736, "loss": 4.8982, "mean_token_accuracy": 0.2192056208848953, "num_tokens": 40258262.0, "step": 17575 }, { "entropy": 5.223889493942261, "epoch": 1.6887608069164264, "grad_norm": 1.1875, "learning_rate": 0.00047188889346462163, "loss": 4.9589, "mean_token_accuracy": 0.21418242901563644, "num_tokens": 40268735.0, "step": 17580 }, { "entropy": 5.200381278991699, "epoch": 1.6892411143131603, "grad_norm": 1.140625, "learning_rate": 0.00047187229755093037, "loss": 5.0426, "mean_token_accuracy": 0.20822969675064087, "num_tokens": 40279905.0, "step": 17585 }, { "entropy": 5.207263040542602, "epoch": 1.6897214217098944, "grad_norm": 1.3203125, "learning_rate": 0.000471855697066449, "loss": 4.914, "mean_token_accuracy": 0.21707093566656113, "num_tokens": 40290580.0, "step": 17590 }, { "entropy": 5.2004670143127445, "epoch": 1.6902017291066282, "grad_norm": 1.1171875, "learning_rate": 0.00047183909201156297, "loss": 5.0006, "mean_token_accuracy": 0.2152295872569084, "num_tokens": 40302472.0, "step": 17595 }, { "entropy": 5.251315307617188, "epoch": 1.6906820365033621, "grad_norm": 1.2890625, "learning_rate": 0.0004718224823866576, "loss": 4.9872, "mean_token_accuracy": 0.21021779626607895, "num_tokens": 40314238.0, "step": 17600 }, { "entropy": 5.131517028808593, "epoch": 1.691162343900096, "grad_norm": 1.25, "learning_rate": 0.0004718058681921186, "loss": 4.9242, "mean_token_accuracy": 0.22052521407604217, "num_tokens": 40326147.0, "step": 17605 }, { "entropy": 5.177646541595459, "epoch": 1.6916426512968301, "grad_norm": 1.1328125, "learning_rate": 0.00047178924942833185, "loss": 4.8935, "mean_token_accuracy": 0.221788227558136, "num_tokens": 40338210.0, "step": 17610 }, { "entropy": 5.096933364868164, "epoch": 1.692122958693564, "grad_norm": 1.1875, "learning_rate": 0.0004717726260956831, "loss": 4.859, "mean_token_accuracy": 0.22617415189743043, "num_tokens": 40349293.0, "step": 17615 }, { "entropy": 5.1479727268219, "epoch": 1.6926032660902979, "grad_norm": 1.109375, "learning_rate": 0.0004717559981945581, "loss": 4.8868, "mean_token_accuracy": 0.21825706362724304, "num_tokens": 40360916.0, "step": 17620 }, { "entropy": 5.165114021301269, "epoch": 1.6930835734870318, "grad_norm": 1.1796875, "learning_rate": 0.0004717393657253432, "loss": 4.9631, "mean_token_accuracy": 0.215592922270298, "num_tokens": 40373525.0, "step": 17625 }, { "entropy": 5.215253448486328, "epoch": 1.6935638808837656, "grad_norm": 1.1640625, "learning_rate": 0.0004717227286884243, "loss": 4.9601, "mean_token_accuracy": 0.21485102623701097, "num_tokens": 40385286.0, "step": 17630 }, { "entropy": 5.161527442932129, "epoch": 1.6940441882804995, "grad_norm": 1.375, "learning_rate": 0.0004717060870841879, "loss": 4.8639, "mean_token_accuracy": 0.22210344523191453, "num_tokens": 40396371.0, "step": 17635 }, { "entropy": 5.174149513244629, "epoch": 1.6945244956772334, "grad_norm": 1.2421875, "learning_rate": 0.0004716894409130202, "loss": 4.9296, "mean_token_accuracy": 0.21754053086042405, "num_tokens": 40407304.0, "step": 17640 }, { "entropy": 5.169687795639038, "epoch": 1.6950048030739673, "grad_norm": 1.3125, "learning_rate": 0.0004716727901753078, "loss": 4.8853, "mean_token_accuracy": 0.21763041615486145, "num_tokens": 40418384.0, "step": 17645 }, { "entropy": 5.20654559135437, "epoch": 1.6954851104707012, "grad_norm": 1.2109375, "learning_rate": 0.0004716561348714371, "loss": 4.9963, "mean_token_accuracy": 0.2188402831554413, "num_tokens": 40430603.0, "step": 17650 }, { "entropy": 5.123339128494263, "epoch": 1.695965417867435, "grad_norm": 1.34375, "learning_rate": 0.00047163947500179494, "loss": 4.8871, "mean_token_accuracy": 0.22057809680700302, "num_tokens": 40442597.0, "step": 17655 }, { "entropy": 5.097166204452515, "epoch": 1.696445725264169, "grad_norm": 1.28125, "learning_rate": 0.0004716228105667681, "loss": 4.8132, "mean_token_accuracy": 0.22695180177688598, "num_tokens": 40454078.0, "step": 17660 }, { "entropy": 5.142997455596924, "epoch": 1.696926032660903, "grad_norm": 1.390625, "learning_rate": 0.0004716061415667435, "loss": 4.8731, "mean_token_accuracy": 0.2260493054986, "num_tokens": 40465561.0, "step": 17665 }, { "entropy": 5.214082384109497, "epoch": 1.697406340057637, "grad_norm": 1.4453125, "learning_rate": 0.000471589468002108, "loss": 4.9655, "mean_token_accuracy": 0.2226713106036186, "num_tokens": 40476883.0, "step": 17670 }, { "entropy": 5.1789998531341555, "epoch": 1.6978866474543708, "grad_norm": 1.109375, "learning_rate": 0.0004715727898732488, "loss": 4.9351, "mean_token_accuracy": 0.2229066714644432, "num_tokens": 40488139.0, "step": 17675 }, { "entropy": 5.166921186447143, "epoch": 1.6983669548511047, "grad_norm": 1.2578125, "learning_rate": 0.00047155610718055315, "loss": 4.931, "mean_token_accuracy": 0.22183982133865357, "num_tokens": 40499367.0, "step": 17680 }, { "entropy": 5.078970956802368, "epoch": 1.6988472622478388, "grad_norm": 1.25, "learning_rate": 0.00047153941992440833, "loss": 4.8881, "mean_token_accuracy": 0.2115646108984947, "num_tokens": 40510628.0, "step": 17685 }, { "entropy": 5.121505689620972, "epoch": 1.6993275696445727, "grad_norm": 1.1640625, "learning_rate": 0.0004715227281052018, "loss": 4.8719, "mean_token_accuracy": 0.2255760669708252, "num_tokens": 40522680.0, "step": 17690 }, { "entropy": 5.144559717178344, "epoch": 1.6998078770413065, "grad_norm": 1.15625, "learning_rate": 0.0004715060317233211, "loss": 4.8053, "mean_token_accuracy": 0.23404240906238555, "num_tokens": 40533139.0, "step": 17695 }, { "entropy": 5.200980138778687, "epoch": 1.7002881844380404, "grad_norm": 1.203125, "learning_rate": 0.0004714893307791538, "loss": 5.0023, "mean_token_accuracy": 0.21619048565626145, "num_tokens": 40544578.0, "step": 17700 }, { "entropy": 5.229176378250122, "epoch": 1.7007684918347743, "grad_norm": 2.125, "learning_rate": 0.00047147262527308766, "loss": 4.9251, "mean_token_accuracy": 0.22029948830604554, "num_tokens": 40555667.0, "step": 17705 }, { "entropy": 5.145949172973633, "epoch": 1.7012487992315082, "grad_norm": 1.1875, "learning_rate": 0.0004714559152055106, "loss": 4.9556, "mean_token_accuracy": 0.224330173432827, "num_tokens": 40567123.0, "step": 17710 }, { "entropy": 5.191706800460816, "epoch": 1.701729106628242, "grad_norm": 1.3046875, "learning_rate": 0.0004714392005768106, "loss": 4.9387, "mean_token_accuracy": 0.21762551963329316, "num_tokens": 40579692.0, "step": 17715 }, { "entropy": 5.1885899066925045, "epoch": 1.702209414024976, "grad_norm": 1.234375, "learning_rate": 0.0004714224813873756, "loss": 4.9025, "mean_token_accuracy": 0.22344619333744048, "num_tokens": 40590989.0, "step": 17720 }, { "entropy": 5.131214809417725, "epoch": 1.7026897214217098, "grad_norm": 1.1171875, "learning_rate": 0.00047140575763759393, "loss": 4.9276, "mean_token_accuracy": 0.22275308072566985, "num_tokens": 40602050.0, "step": 17725 }, { "entropy": 5.1476117134094235, "epoch": 1.7031700288184437, "grad_norm": 1.328125, "learning_rate": 0.00047138902932785363, "loss": 4.9118, "mean_token_accuracy": 0.22158618420362472, "num_tokens": 40614552.0, "step": 17730 }, { "entropy": 5.210604238510132, "epoch": 1.7036503362151776, "grad_norm": 1.3671875, "learning_rate": 0.00047137229645854333, "loss": 4.8718, "mean_token_accuracy": 0.2272538051009178, "num_tokens": 40625903.0, "step": 17735 }, { "entropy": 5.17086706161499, "epoch": 1.7041306436119115, "grad_norm": 1.484375, "learning_rate": 0.0004713555590300513, "loss": 4.9524, "mean_token_accuracy": 0.21819649636745453, "num_tokens": 40638634.0, "step": 17740 }, { "entropy": 5.219864559173584, "epoch": 1.7046109510086456, "grad_norm": 1.2265625, "learning_rate": 0.0004713388170427664, "loss": 4.9558, "mean_token_accuracy": 0.21615031808614732, "num_tokens": 40651279.0, "step": 17745 }, { "entropy": 5.184417057037353, "epoch": 1.7050912584053795, "grad_norm": 1.4375, "learning_rate": 0.0004713220704970771, "loss": 4.9216, "mean_token_accuracy": 0.22155367732048034, "num_tokens": 40662306.0, "step": 17750 }, { "entropy": 5.0326759815216064, "epoch": 1.7055715658021133, "grad_norm": 1.3125, "learning_rate": 0.00047130531939337236, "loss": 4.775, "mean_token_accuracy": 0.22894255667924882, "num_tokens": 40672290.0, "step": 17755 }, { "entropy": 5.187178373336792, "epoch": 1.7060518731988472, "grad_norm": 1.3046875, "learning_rate": 0.00047128856373204086, "loss": 4.9134, "mean_token_accuracy": 0.22119101732969285, "num_tokens": 40683447.0, "step": 17760 }, { "entropy": 5.292993640899658, "epoch": 1.7065321805955813, "grad_norm": 1.3125, "learning_rate": 0.00047127180351347184, "loss": 5.0599, "mean_token_accuracy": 0.21206379681825638, "num_tokens": 40695230.0, "step": 17765 }, { "entropy": 5.193490266799927, "epoch": 1.7070124879923152, "grad_norm": 1.4453125, "learning_rate": 0.0004712550387380544, "loss": 4.9057, "mean_token_accuracy": 0.21839701384305954, "num_tokens": 40707311.0, "step": 17770 }, { "entropy": 5.201669216156006, "epoch": 1.707492795389049, "grad_norm": 1.296875, "learning_rate": 0.0004712382694061776, "loss": 4.9201, "mean_token_accuracy": 0.21827106177806854, "num_tokens": 40717928.0, "step": 17775 }, { "entropy": 5.137501668930054, "epoch": 1.707973102785783, "grad_norm": 1.1953125, "learning_rate": 0.00047122149551823096, "loss": 4.93, "mean_token_accuracy": 0.2145393192768097, "num_tokens": 40730355.0, "step": 17780 }, { "entropy": 5.221040725708008, "epoch": 1.7084534101825168, "grad_norm": 1.2421875, "learning_rate": 0.0004712047170746039, "loss": 5.0179, "mean_token_accuracy": 0.21589890420436858, "num_tokens": 40741412.0, "step": 17785 }, { "entropy": 5.209814357757568, "epoch": 1.7089337175792507, "grad_norm": 1.15625, "learning_rate": 0.00047118793407568586, "loss": 4.9045, "mean_token_accuracy": 0.22416329383850098, "num_tokens": 40753491.0, "step": 17790 }, { "entropy": 5.131305885314942, "epoch": 1.7094140249759846, "grad_norm": 1.7734375, "learning_rate": 0.00047117114652186657, "loss": 4.8506, "mean_token_accuracy": 0.22409170120954514, "num_tokens": 40765209.0, "step": 17795 }, { "entropy": 5.116810369491577, "epoch": 1.7098943323727185, "grad_norm": 1.140625, "learning_rate": 0.00047115435441353573, "loss": 4.9496, "mean_token_accuracy": 0.21688321977853775, "num_tokens": 40778065.0, "step": 17800 }, { "entropy": 5.138747310638427, "epoch": 1.7103746397694524, "grad_norm": 1.21875, "learning_rate": 0.00047113755775108333, "loss": 4.9235, "mean_token_accuracy": 0.22174129486083985, "num_tokens": 40789149.0, "step": 17805 }, { "entropy": 5.201202821731568, "epoch": 1.7108549471661862, "grad_norm": 1.15625, "learning_rate": 0.00047112075653489913, "loss": 4.9227, "mean_token_accuracy": 0.22810300290584565, "num_tokens": 40800340.0, "step": 17810 }, { "entropy": 5.178883695602417, "epoch": 1.7113352545629201, "grad_norm": 1.1953125, "learning_rate": 0.0004711039507653734, "loss": 4.9677, "mean_token_accuracy": 0.22021741718053817, "num_tokens": 40811866.0, "step": 17815 }, { "entropy": 5.158439302444458, "epoch": 1.7118155619596542, "grad_norm": 1.3671875, "learning_rate": 0.0004710871404428961, "loss": 4.9635, "mean_token_accuracy": 0.21318840384483337, "num_tokens": 40825850.0, "step": 17820 }, { "entropy": 5.2235781192779545, "epoch": 1.7122958693563881, "grad_norm": 1.09375, "learning_rate": 0.00047107032556785786, "loss": 4.9129, "mean_token_accuracy": 0.22048740983009338, "num_tokens": 40836688.0, "step": 17825 }, { "entropy": 5.180397367477417, "epoch": 1.712776176753122, "grad_norm": 1.1875, "learning_rate": 0.00047105350614064874, "loss": 4.9461, "mean_token_accuracy": 0.2169654995203018, "num_tokens": 40847803.0, "step": 17830 }, { "entropy": 5.188759469985962, "epoch": 1.7132564841498559, "grad_norm": 1.3046875, "learning_rate": 0.00047103668216165944, "loss": 4.975, "mean_token_accuracy": 0.2155713826417923, "num_tokens": 40859099.0, "step": 17835 }, { "entropy": 5.144463443756104, "epoch": 1.71373679154659, "grad_norm": 1.21875, "learning_rate": 0.00047101985363128045, "loss": 4.8284, "mean_token_accuracy": 0.23141866326332092, "num_tokens": 40870440.0, "step": 17840 }, { "entropy": 5.26697793006897, "epoch": 1.7142170989433239, "grad_norm": 1.1875, "learning_rate": 0.00047100302054990255, "loss": 5.0215, "mean_token_accuracy": 0.2102995663881302, "num_tokens": 40882329.0, "step": 17845 }, { "entropy": 5.247942304611206, "epoch": 1.7146974063400577, "grad_norm": 1.40625, "learning_rate": 0.0004709861829179165, "loss": 5.0207, "mean_token_accuracy": 0.21234164237976075, "num_tokens": 40893458.0, "step": 17850 }, { "entropy": 5.1152942180633545, "epoch": 1.7151777137367916, "grad_norm": 1.234375, "learning_rate": 0.00047096934073571325, "loss": 4.843, "mean_token_accuracy": 0.22821006327867507, "num_tokens": 40904626.0, "step": 17855 }, { "entropy": 5.088890552520752, "epoch": 1.7156580211335255, "grad_norm": 1.34375, "learning_rate": 0.00047095249400368384, "loss": 4.8521, "mean_token_accuracy": 0.22707059532403945, "num_tokens": 40916005.0, "step": 17860 }, { "entropy": 5.091427659988403, "epoch": 1.7161383285302594, "grad_norm": 1.1875, "learning_rate": 0.00047093564272221927, "loss": 4.8326, "mean_token_accuracy": 0.2260905146598816, "num_tokens": 40927470.0, "step": 17865 }, { "entropy": 5.2040282726287845, "epoch": 1.7166186359269933, "grad_norm": 1.3515625, "learning_rate": 0.00047091878689171105, "loss": 4.981, "mean_token_accuracy": 0.2205181822180748, "num_tokens": 40938968.0, "step": 17870 }, { "entropy": 5.121591472625733, "epoch": 1.7170989433237271, "grad_norm": 1.46875, "learning_rate": 0.0004709019265125502, "loss": 4.8641, "mean_token_accuracy": 0.22871831506490709, "num_tokens": 40952636.0, "step": 17875 }, { "entropy": 5.232312250137329, "epoch": 1.717579250720461, "grad_norm": 1.1875, "learning_rate": 0.00047088506158512837, "loss": 4.9736, "mean_token_accuracy": 0.2218574747443199, "num_tokens": 40964816.0, "step": 17880 }, { "entropy": 5.210276556015015, "epoch": 1.718059558117195, "grad_norm": 1.2578125, "learning_rate": 0.00047086819210983714, "loss": 5.0101, "mean_token_accuracy": 0.2185376450419426, "num_tokens": 40977152.0, "step": 17885 }, { "entropy": 5.152036380767822, "epoch": 1.7185398655139288, "grad_norm": 1.1796875, "learning_rate": 0.00047085131808706813, "loss": 4.9234, "mean_token_accuracy": 0.22430746555328368, "num_tokens": 40987506.0, "step": 17890 }, { "entropy": 5.213549518585205, "epoch": 1.7190201729106627, "grad_norm": 1.140625, "learning_rate": 0.0004708344395172129, "loss": 4.9523, "mean_token_accuracy": 0.21810881644487382, "num_tokens": 40998598.0, "step": 17895 }, { "entropy": 5.188005638122559, "epoch": 1.7195004803073968, "grad_norm": 1.2421875, "learning_rate": 0.0004708175564006636, "loss": 4.9545, "mean_token_accuracy": 0.21332263350486755, "num_tokens": 41009644.0, "step": 17900 }, { "entropy": 5.1379045963287355, "epoch": 1.7199807877041307, "grad_norm": 1.359375, "learning_rate": 0.0004708006687378121, "loss": 4.8313, "mean_token_accuracy": 0.22353375256061553, "num_tokens": 41021816.0, "step": 17905 }, { "entropy": 5.118629121780396, "epoch": 1.7204610951008645, "grad_norm": 1.125, "learning_rate": 0.0004707837765290505, "loss": 4.8747, "mean_token_accuracy": 0.22374353557825089, "num_tokens": 41033482.0, "step": 17910 }, { "entropy": 5.131730937957764, "epoch": 1.7209414024975984, "grad_norm": 1.171875, "learning_rate": 0.0004707668797747709, "loss": 4.8753, "mean_token_accuracy": 0.2280108168721199, "num_tokens": 41043854.0, "step": 17915 }, { "entropy": 5.200505256652832, "epoch": 1.7214217098943325, "grad_norm": 1.125, "learning_rate": 0.0004707499784753657, "loss": 5.002, "mean_token_accuracy": 0.2113771140575409, "num_tokens": 41056956.0, "step": 17920 }, { "entropy": 5.264665651321411, "epoch": 1.7219020172910664, "grad_norm": 1.2265625, "learning_rate": 0.0004707330726312273, "loss": 5.0186, "mean_token_accuracy": 0.20672281384468078, "num_tokens": 41068170.0, "step": 17925 }, { "entropy": 5.199624300003052, "epoch": 1.7223823246878003, "grad_norm": 1.1875, "learning_rate": 0.00047071616224274803, "loss": 4.9427, "mean_token_accuracy": 0.21977581828832626, "num_tokens": 41079149.0, "step": 17930 }, { "entropy": 5.2467875480651855, "epoch": 1.7228626320845342, "grad_norm": 1.2734375, "learning_rate": 0.0004706992473103207, "loss": 4.9797, "mean_token_accuracy": 0.21803479194641112, "num_tokens": 41091039.0, "step": 17935 }, { "entropy": 5.147163963317871, "epoch": 1.723342939481268, "grad_norm": 1.1328125, "learning_rate": 0.00047068232783433806, "loss": 4.9624, "mean_token_accuracy": 0.221414914727211, "num_tokens": 41103318.0, "step": 17940 }, { "entropy": 5.137469387054443, "epoch": 1.723823246878002, "grad_norm": 1.265625, "learning_rate": 0.0004706654038151927, "loss": 4.8392, "mean_token_accuracy": 0.22574034184217454, "num_tokens": 41114235.0, "step": 17945 }, { "entropy": 5.139921188354492, "epoch": 1.7243035542747358, "grad_norm": 1.203125, "learning_rate": 0.0004706484752532777, "loss": 4.9196, "mean_token_accuracy": 0.22196324169635773, "num_tokens": 41126008.0, "step": 17950 }, { "entropy": 5.084310674667359, "epoch": 1.7247838616714697, "grad_norm": 1.203125, "learning_rate": 0.0004706315421489861, "loss": 4.9158, "mean_token_accuracy": 0.22012482583522797, "num_tokens": 41138819.0, "step": 17955 }, { "entropy": 5.177040863037109, "epoch": 1.7252641690682036, "grad_norm": 1.1796875, "learning_rate": 0.0004706146045027109, "loss": 4.8906, "mean_token_accuracy": 0.2244026854634285, "num_tokens": 41149389.0, "step": 17960 }, { "entropy": 5.206911277770996, "epoch": 1.7257444764649374, "grad_norm": 1.2734375, "learning_rate": 0.0004705976623148455, "loss": 4.8801, "mean_token_accuracy": 0.2156105950474739, "num_tokens": 41161810.0, "step": 17965 }, { "entropy": 5.18083701133728, "epoch": 1.7262247838616713, "grad_norm": 1.1796875, "learning_rate": 0.00047058071558578324, "loss": 4.9052, "mean_token_accuracy": 0.21902025789022445, "num_tokens": 41172903.0, "step": 17970 }, { "entropy": 5.162399435043335, "epoch": 1.7267050912584054, "grad_norm": 1.2890625, "learning_rate": 0.0004705637643159175, "loss": 4.9808, "mean_token_accuracy": 0.22272872775793076, "num_tokens": 41183905.0, "step": 17975 }, { "entropy": 5.145203065872193, "epoch": 1.7271853986551393, "grad_norm": 1.34375, "learning_rate": 0.00047054680850564185, "loss": 4.8865, "mean_token_accuracy": 0.21936126351356505, "num_tokens": 41195921.0, "step": 17980 }, { "entropy": 5.071988487243653, "epoch": 1.7276657060518732, "grad_norm": 1.1015625, "learning_rate": 0.0004705298481553499, "loss": 4.9223, "mean_token_accuracy": 0.22444438189268112, "num_tokens": 41208287.0, "step": 17985 }, { "entropy": 5.1723504066467285, "epoch": 1.728146013448607, "grad_norm": 1.2109375, "learning_rate": 0.00047051288326543553, "loss": 4.9596, "mean_token_accuracy": 0.21251793950796127, "num_tokens": 41219864.0, "step": 17990 }, { "entropy": 5.212101888656616, "epoch": 1.7286263208453412, "grad_norm": 1.25, "learning_rate": 0.00047049591383629247, "loss": 4.8862, "mean_token_accuracy": 0.2253048986196518, "num_tokens": 41230640.0, "step": 17995 }, { "entropy": 5.166407299041748, "epoch": 1.729106628242075, "grad_norm": 1.3984375, "learning_rate": 0.00047047893986831493, "loss": 4.9322, "mean_token_accuracy": 0.2219822809100151, "num_tokens": 41242413.0, "step": 18000 }, { "epoch": 1.729106628242075, "eval_entropy": 5.004426007965958, "eval_loss": 5.013918876647949, "eval_mean_token_accuracy": 0.22329990555514567, "eval_num_tokens": 41242413.0, "eval_runtime": 26.6347, "eval_samples_per_second": 1232.042, "eval_steps_per_second": 154.01, "step": 18000 }, { "entropy": 5.205334949493408, "epoch": 1.729586935638809, "grad_norm": 1.203125, "learning_rate": 0.00047046196136189686, "loss": 4.9423, "mean_token_accuracy": 0.21605349332094193, "num_tokens": 41254400.0, "step": 18005 }, { "entropy": 5.1552910804748535, "epoch": 1.7300672430355428, "grad_norm": 1.28125, "learning_rate": 0.0004704449783174323, "loss": 4.9063, "mean_token_accuracy": 0.21709170937538147, "num_tokens": 41266849.0, "step": 18010 }, { "entropy": 5.202658462524414, "epoch": 1.7305475504322767, "grad_norm": 1.234375, "learning_rate": 0.0004704279907353158, "loss": 4.9789, "mean_token_accuracy": 0.2149608999490738, "num_tokens": 41278179.0, "step": 18015 }, { "entropy": 5.187939310073853, "epoch": 1.7310278578290106, "grad_norm": 1.484375, "learning_rate": 0.00047041099861594167, "loss": 4.9223, "mean_token_accuracy": 0.22421054244041444, "num_tokens": 41289904.0, "step": 18020 }, { "entropy": 5.29556884765625, "epoch": 1.7315081652257445, "grad_norm": 1.3828125, "learning_rate": 0.0004703940019597044, "loss": 5.0694, "mean_token_accuracy": 0.21076448261737823, "num_tokens": 41301861.0, "step": 18025 }, { "entropy": 5.164215087890625, "epoch": 1.7319884726224783, "grad_norm": 1.390625, "learning_rate": 0.00047037700076699857, "loss": 4.9078, "mean_token_accuracy": 0.22254907786846162, "num_tokens": 41312420.0, "step": 18030 }, { "entropy": 5.1473808765411375, "epoch": 1.7324687800192122, "grad_norm": 1.3203125, "learning_rate": 0.000470359995038219, "loss": 4.9099, "mean_token_accuracy": 0.22299474775791167, "num_tokens": 41323264.0, "step": 18035 }, { "entropy": 5.2324567317962645, "epoch": 1.732949087415946, "grad_norm": 1.3359375, "learning_rate": 0.0004703429847737604, "loss": 5.0112, "mean_token_accuracy": 0.21593133956193925, "num_tokens": 41335438.0, "step": 18040 }, { "entropy": 5.196746826171875, "epoch": 1.73342939481268, "grad_norm": 1.4765625, "learning_rate": 0.0004703259699740177, "loss": 4.9825, "mean_token_accuracy": 0.20940714031457902, "num_tokens": 41347295.0, "step": 18045 }, { "entropy": 5.165305757522583, "epoch": 1.7339097022094139, "grad_norm": 1.1875, "learning_rate": 0.00047030895063938607, "loss": 4.9752, "mean_token_accuracy": 0.21267576068639754, "num_tokens": 41358886.0, "step": 18050 }, { "entropy": 5.155602979660034, "epoch": 1.734390009606148, "grad_norm": 1.1640625, "learning_rate": 0.00047029192677026043, "loss": 4.8144, "mean_token_accuracy": 0.22844478636980056, "num_tokens": 41369402.0, "step": 18055 }, { "entropy": 5.24388952255249, "epoch": 1.7348703170028819, "grad_norm": 1.234375, "learning_rate": 0.0004702748983670363, "loss": 5.0246, "mean_token_accuracy": 0.21362147182226182, "num_tokens": 41379772.0, "step": 18060 }, { "entropy": 5.218845558166504, "epoch": 1.7353506243996157, "grad_norm": 1.203125, "learning_rate": 0.0004702578654301088, "loss": 4.9436, "mean_token_accuracy": 0.2147745117545128, "num_tokens": 41389776.0, "step": 18065 }, { "entropy": 5.055250024795532, "epoch": 1.7358309317963496, "grad_norm": 1.25, "learning_rate": 0.0004702408279598734, "loss": 4.8347, "mean_token_accuracy": 0.2225829392671585, "num_tokens": 41400640.0, "step": 18070 }, { "entropy": 5.136556816101074, "epoch": 1.7363112391930837, "grad_norm": 1.171875, "learning_rate": 0.0004702237859567258, "loss": 4.87, "mean_token_accuracy": 0.21867617815732956, "num_tokens": 41412653.0, "step": 18075 }, { "entropy": 5.3015899658203125, "epoch": 1.7367915465898176, "grad_norm": 1.25, "learning_rate": 0.0004702067394210616, "loss": 5.0337, "mean_token_accuracy": 0.20988011807203294, "num_tokens": 41423815.0, "step": 18080 }, { "entropy": 5.196315050125122, "epoch": 1.7372718539865515, "grad_norm": 1.4296875, "learning_rate": 0.00047018968835327643, "loss": 4.9292, "mean_token_accuracy": 0.21484117954969406, "num_tokens": 41436016.0, "step": 18085 }, { "entropy": 5.100410509109497, "epoch": 1.7377521613832854, "grad_norm": 1.1875, "learning_rate": 0.0004701726327537664, "loss": 4.884, "mean_token_accuracy": 0.2229089468717575, "num_tokens": 41447837.0, "step": 18090 }, { "entropy": 5.191250038146973, "epoch": 1.7382324687800192, "grad_norm": 1.3203125, "learning_rate": 0.0004701555726229274, "loss": 4.9544, "mean_token_accuracy": 0.21401021778583526, "num_tokens": 41459816.0, "step": 18095 }, { "entropy": 5.251665163040161, "epoch": 1.7387127761767531, "grad_norm": 1.1640625, "learning_rate": 0.0004701385079611555, "loss": 4.8759, "mean_token_accuracy": 0.21814749985933304, "num_tokens": 41471461.0, "step": 18100 }, { "entropy": 5.142739725112915, "epoch": 1.739193083573487, "grad_norm": 1.796875, "learning_rate": 0.00047012143876884677, "loss": 4.8584, "mean_token_accuracy": 0.22734878063201905, "num_tokens": 41481886.0, "step": 18105 }, { "entropy": 5.112404155731201, "epoch": 1.739673390970221, "grad_norm": 1.4921875, "learning_rate": 0.0004701043650463977, "loss": 4.909, "mean_token_accuracy": 0.221170374751091, "num_tokens": 41493698.0, "step": 18110 }, { "entropy": 5.152590131759643, "epoch": 1.7401536983669548, "grad_norm": 1.1953125, "learning_rate": 0.0004700872867942046, "loss": 4.8657, "mean_token_accuracy": 0.22904037982225417, "num_tokens": 41504744.0, "step": 18115 }, { "entropy": 5.22120509147644, "epoch": 1.7406340057636887, "grad_norm": 1.6875, "learning_rate": 0.000470070204012664, "loss": 4.9395, "mean_token_accuracy": 0.22211443781852722, "num_tokens": 41517129.0, "step": 18120 }, { "entropy": 5.2185193538665775, "epoch": 1.7411143131604225, "grad_norm": 1.375, "learning_rate": 0.00047005311670217256, "loss": 4.9401, "mean_token_accuracy": 0.21087878495454787, "num_tokens": 41529969.0, "step": 18125 }, { "entropy": 5.125943803787232, "epoch": 1.7415946205571564, "grad_norm": 1.2734375, "learning_rate": 0.00047003602486312687, "loss": 4.8841, "mean_token_accuracy": 0.22201181203126907, "num_tokens": 41541515.0, "step": 18130 }, { "entropy": 5.139989805221558, "epoch": 1.7420749279538905, "grad_norm": 1.5390625, "learning_rate": 0.0004700189284959238, "loss": 4.949, "mean_token_accuracy": 0.21680050939321518, "num_tokens": 41553045.0, "step": 18135 }, { "entropy": 5.198971176147461, "epoch": 1.7425552353506244, "grad_norm": 1.171875, "learning_rate": 0.00047000182760096037, "loss": 4.9805, "mean_token_accuracy": 0.2192812144756317, "num_tokens": 41565376.0, "step": 18140 }, { "entropy": 5.197419261932373, "epoch": 1.7430355427473583, "grad_norm": 1.5234375, "learning_rate": 0.0004699847221786335, "loss": 4.9399, "mean_token_accuracy": 0.21375814825296402, "num_tokens": 41577515.0, "step": 18145 }, { "entropy": 5.120383930206299, "epoch": 1.7435158501440924, "grad_norm": 1.390625, "learning_rate": 0.0004699676122293403, "loss": 4.8522, "mean_token_accuracy": 0.22192323207855225, "num_tokens": 41588589.0, "step": 18150 }, { "entropy": 5.194192266464233, "epoch": 1.7439961575408263, "grad_norm": 1.2265625, "learning_rate": 0.0004699504977534782, "loss": 4.9965, "mean_token_accuracy": 0.21457838714122773, "num_tokens": 41600727.0, "step": 18155 }, { "entropy": 5.205707025527954, "epoch": 1.7444764649375601, "grad_norm": 1.171875, "learning_rate": 0.0004699333787514444, "loss": 4.9063, "mean_token_accuracy": 0.2258935034275055, "num_tokens": 41612177.0, "step": 18160 }, { "entropy": 5.194634437561035, "epoch": 1.744956772334294, "grad_norm": 1.28125, "learning_rate": 0.0004699162552236363, "loss": 4.9316, "mean_token_accuracy": 0.2233037084341049, "num_tokens": 41623749.0, "step": 18165 }, { "entropy": 5.256892395019531, "epoch": 1.745437079731028, "grad_norm": 1.15625, "learning_rate": 0.00046989912717045165, "loss": 4.9596, "mean_token_accuracy": 0.21816200911998748, "num_tokens": 41635549.0, "step": 18170 }, { "entropy": 5.236298227310181, "epoch": 1.7459173871277618, "grad_norm": 1.2421875, "learning_rate": 0.00046988199459228793, "loss": 5.0126, "mean_token_accuracy": 0.21799473017454146, "num_tokens": 41647219.0, "step": 18175 }, { "entropy": 5.22831883430481, "epoch": 1.7463976945244957, "grad_norm": 1.2734375, "learning_rate": 0.0004698648574895429, "loss": 4.9618, "mean_token_accuracy": 0.21960075348615646, "num_tokens": 41658850.0, "step": 18180 }, { "entropy": 5.239414978027344, "epoch": 1.7468780019212296, "grad_norm": 1.15625, "learning_rate": 0.00046984771586261465, "loss": 4.9616, "mean_token_accuracy": 0.2154267430305481, "num_tokens": 41669304.0, "step": 18185 }, { "entropy": 5.238613176345825, "epoch": 1.7473583093179634, "grad_norm": 1.6328125, "learning_rate": 0.000469830569711901, "loss": 4.9846, "mean_token_accuracy": 0.22659707218408584, "num_tokens": 41681652.0, "step": 18190 }, { "entropy": 5.21958441734314, "epoch": 1.7478386167146973, "grad_norm": 1.296875, "learning_rate": 0.0004698134190377999, "loss": 4.9527, "mean_token_accuracy": 0.21284282505512236, "num_tokens": 41692548.0, "step": 18195 }, { "entropy": 5.184736871719361, "epoch": 1.7483189241114312, "grad_norm": 1.296875, "learning_rate": 0.00046979626384070983, "loss": 4.9079, "mean_token_accuracy": 0.21957321614027023, "num_tokens": 41704528.0, "step": 18200 }, { "entropy": 5.1673534393310545, "epoch": 1.748799231508165, "grad_norm": 1.2734375, "learning_rate": 0.000469779104121029, "loss": 4.9325, "mean_token_accuracy": 0.2225162521004677, "num_tokens": 41717314.0, "step": 18205 }, { "entropy": 5.260458278656006, "epoch": 1.7492795389048992, "grad_norm": 1.40625, "learning_rate": 0.00046976193987915553, "loss": 4.9965, "mean_token_accuracy": 0.21652406752109526, "num_tokens": 41729193.0, "step": 18210 }, { "entropy": 5.199534273147583, "epoch": 1.749759846301633, "grad_norm": 1.546875, "learning_rate": 0.0004697447711154883, "loss": 4.8802, "mean_token_accuracy": 0.2234228655695915, "num_tokens": 41741208.0, "step": 18215 }, { "entropy": 5.170067024230957, "epoch": 1.750240153698367, "grad_norm": 1.40625, "learning_rate": 0.00046972759783042576, "loss": 4.8981, "mean_token_accuracy": 0.22203250229358673, "num_tokens": 41752792.0, "step": 18220 }, { "entropy": 5.188149499893188, "epoch": 1.7507204610951008, "grad_norm": 1.203125, "learning_rate": 0.0004697104200243666, "loss": 4.903, "mean_token_accuracy": 0.2209831014275551, "num_tokens": 41763629.0, "step": 18225 }, { "entropy": 5.1191747188568115, "epoch": 1.751200768491835, "grad_norm": 1.390625, "learning_rate": 0.0004696932376977096, "loss": 4.8712, "mean_token_accuracy": 0.22894158214330673, "num_tokens": 41774502.0, "step": 18230 }, { "entropy": 5.180744075775147, "epoch": 1.7516810758885688, "grad_norm": 1.2734375, "learning_rate": 0.0004696760508508538, "loss": 4.9337, "mean_token_accuracy": 0.2156649187207222, "num_tokens": 41785700.0, "step": 18235 }, { "entropy": 5.124039554595948, "epoch": 1.7521613832853027, "grad_norm": 1.28125, "learning_rate": 0.00046965885948419814, "loss": 4.8808, "mean_token_accuracy": 0.2154080703854561, "num_tokens": 41797347.0, "step": 18240 }, { "entropy": 5.144995784759521, "epoch": 1.7526416906820366, "grad_norm": 1.4375, "learning_rate": 0.0004696416635981418, "loss": 4.9778, "mean_token_accuracy": 0.21650518029928206, "num_tokens": 41810866.0, "step": 18245 }, { "entropy": 5.219710350036621, "epoch": 1.7531219980787704, "grad_norm": 1.296875, "learning_rate": 0.000469624463193084, "loss": 4.9528, "mean_token_accuracy": 0.21880155354738234, "num_tokens": 41821786.0, "step": 18250 }, { "entropy": 5.1789576530456545, "epoch": 1.7536023054755043, "grad_norm": 1.2734375, "learning_rate": 0.000469607258269424, "loss": 4.9279, "mean_token_accuracy": 0.2151848182082176, "num_tokens": 41833591.0, "step": 18255 }, { "entropy": 5.148035192489624, "epoch": 1.7540826128722382, "grad_norm": 1.21875, "learning_rate": 0.0004695900488275614, "loss": 4.8489, "mean_token_accuracy": 0.22885044515132905, "num_tokens": 41844659.0, "step": 18260 }, { "entropy": 5.111026906967163, "epoch": 1.754562920268972, "grad_norm": 1.375, "learning_rate": 0.0004695728348678957, "loss": 4.882, "mean_token_accuracy": 0.22123366296291352, "num_tokens": 41856378.0, "step": 18265 }, { "entropy": 5.129684257507324, "epoch": 1.755043227665706, "grad_norm": 1.2421875, "learning_rate": 0.0004695556163908265, "loss": 4.8941, "mean_token_accuracy": 0.21303319483995437, "num_tokens": 41867624.0, "step": 18270 }, { "entropy": 5.219369792938233, "epoch": 1.7555235350624399, "grad_norm": 1.3125, "learning_rate": 0.0004695383933967536, "loss": 4.9177, "mean_token_accuracy": 0.2267256498336792, "num_tokens": 41879250.0, "step": 18275 }, { "entropy": 5.2192254066467285, "epoch": 1.7560038424591737, "grad_norm": 1.375, "learning_rate": 0.00046952116588607694, "loss": 4.9981, "mean_token_accuracy": 0.21951815187931062, "num_tokens": 41890713.0, "step": 18280 }, { "entropy": 5.2056262493133545, "epoch": 1.7564841498559076, "grad_norm": 1.296875, "learning_rate": 0.0004695039338591963, "loss": 4.9828, "mean_token_accuracy": 0.2174760267138481, "num_tokens": 41902924.0, "step": 18285 }, { "entropy": 5.23514404296875, "epoch": 1.7569644572526417, "grad_norm": 1.2734375, "learning_rate": 0.000469486697316512, "loss": 4.9828, "mean_token_accuracy": 0.21504862755537033, "num_tokens": 41914498.0, "step": 18290 }, { "entropy": 5.156249570846557, "epoch": 1.7574447646493756, "grad_norm": 1.1484375, "learning_rate": 0.000469469456258424, "loss": 4.8904, "mean_token_accuracy": 0.22041202187538148, "num_tokens": 41925258.0, "step": 18295 }, { "entropy": 5.142882680892944, "epoch": 1.7579250720461095, "grad_norm": 1.15625, "learning_rate": 0.0004694522106853327, "loss": 4.9446, "mean_token_accuracy": 0.21827390491962434, "num_tokens": 41938176.0, "step": 18300 }, { "entropy": 5.182098150253296, "epoch": 1.7584053794428436, "grad_norm": 1.40625, "learning_rate": 0.00046943496059763845, "loss": 4.7877, "mean_token_accuracy": 0.23012328147888184, "num_tokens": 41949160.0, "step": 18305 }, { "entropy": 5.17469048500061, "epoch": 1.7588856868395775, "grad_norm": 1.2734375, "learning_rate": 0.00046941770599574176, "loss": 4.9748, "mean_token_accuracy": 0.22138626724481583, "num_tokens": 41960981.0, "step": 18310 }, { "entropy": 5.136798906326294, "epoch": 1.7593659942363113, "grad_norm": 1.640625, "learning_rate": 0.0004694004468800433, "loss": 4.9173, "mean_token_accuracy": 0.2229830577969551, "num_tokens": 41971755.0, "step": 18315 }, { "entropy": 5.1424705505371096, "epoch": 1.7598463016330452, "grad_norm": 1.4921875, "learning_rate": 0.0004693831832509437, "loss": 4.9102, "mean_token_accuracy": 0.22170276194810867, "num_tokens": 41984337.0, "step": 18320 }, { "entropy": 5.138628149032593, "epoch": 1.760326609029779, "grad_norm": 1.203125, "learning_rate": 0.00046936591510884375, "loss": 4.9208, "mean_token_accuracy": 0.2221353381872177, "num_tokens": 41995648.0, "step": 18325 }, { "entropy": 5.172077274322509, "epoch": 1.760806916426513, "grad_norm": 1.359375, "learning_rate": 0.00046934864245414443, "loss": 4.8644, "mean_token_accuracy": 0.22787528187036515, "num_tokens": 42006995.0, "step": 18330 }, { "entropy": 5.261659097671509, "epoch": 1.7612872238232469, "grad_norm": 1.1953125, "learning_rate": 0.00046933136528724676, "loss": 5.0065, "mean_token_accuracy": 0.21258640736341478, "num_tokens": 42020074.0, "step": 18335 }, { "entropy": 5.150201749801636, "epoch": 1.7617675312199808, "grad_norm": 1.2109375, "learning_rate": 0.0004693140836085518, "loss": 4.8376, "mean_token_accuracy": 0.22797610759735107, "num_tokens": 42032053.0, "step": 18340 }, { "entropy": 5.1066876411437985, "epoch": 1.7622478386167146, "grad_norm": 1.203125, "learning_rate": 0.00046929679741846076, "loss": 4.8674, "mean_token_accuracy": 0.22894890010356903, "num_tokens": 42042223.0, "step": 18345 }, { "entropy": 5.165579700469971, "epoch": 1.7627281460134485, "grad_norm": 1.4375, "learning_rate": 0.00046927950671737505, "loss": 4.9458, "mean_token_accuracy": 0.22484788596630095, "num_tokens": 42053988.0, "step": 18350 }, { "entropy": 5.1154531955719, "epoch": 1.7632084534101824, "grad_norm": 1.3515625, "learning_rate": 0.00046926221150569617, "loss": 4.8331, "mean_token_accuracy": 0.22157266587018967, "num_tokens": 42064451.0, "step": 18355 }, { "entropy": 5.160877132415772, "epoch": 1.7636887608069163, "grad_norm": 1.359375, "learning_rate": 0.0004692449117838255, "loss": 5.0066, "mean_token_accuracy": 0.21049903929233552, "num_tokens": 42076503.0, "step": 18360 }, { "entropy": 5.189673995971679, "epoch": 1.7641690682036504, "grad_norm": 1.453125, "learning_rate": 0.0004692276075521648, "loss": 4.9317, "mean_token_accuracy": 0.21206413209438324, "num_tokens": 42088341.0, "step": 18365 }, { "entropy": 5.185642528533935, "epoch": 1.7646493756003843, "grad_norm": 1.3359375, "learning_rate": 0.0004692102988111158, "loss": 4.891, "mean_token_accuracy": 0.2198364794254303, "num_tokens": 42099862.0, "step": 18370 }, { "entropy": 5.2197236061096195, "epoch": 1.7651296829971181, "grad_norm": 1.34375, "learning_rate": 0.00046919298556108023, "loss": 5.0118, "mean_token_accuracy": 0.21568115353584288, "num_tokens": 42112117.0, "step": 18375 }, { "entropy": 5.237560224533081, "epoch": 1.765609990393852, "grad_norm": 1.3671875, "learning_rate": 0.00046917566780246036, "loss": 4.9319, "mean_token_accuracy": 0.2160506397485733, "num_tokens": 42123093.0, "step": 18380 }, { "entropy": 5.233705615997314, "epoch": 1.7660902977905861, "grad_norm": 1.2265625, "learning_rate": 0.00046915834553565793, "loss": 5.0065, "mean_token_accuracy": 0.218734946846962, "num_tokens": 42135266.0, "step": 18385 }, { "entropy": 5.329632234573364, "epoch": 1.76657060518732, "grad_norm": 1.3828125, "learning_rate": 0.0004691410187610753, "loss": 5.0506, "mean_token_accuracy": 0.21428396850824355, "num_tokens": 42145743.0, "step": 18390 }, { "entropy": 5.15242190361023, "epoch": 1.767050912584054, "grad_norm": 1.1875, "learning_rate": 0.00046912368747911465, "loss": 4.8459, "mean_token_accuracy": 0.22501615881919862, "num_tokens": 42157604.0, "step": 18395 }, { "entropy": 5.172465991973877, "epoch": 1.7675312199807878, "grad_norm": 1.125, "learning_rate": 0.00046910635169017845, "loss": 4.8578, "mean_token_accuracy": 0.2229616954922676, "num_tokens": 42168738.0, "step": 18400 }, { "entropy": 5.202865123748779, "epoch": 1.7680115273775217, "grad_norm": 1.296875, "learning_rate": 0.0004690890113946691, "loss": 4.9664, "mean_token_accuracy": 0.21324502676725388, "num_tokens": 42179699.0, "step": 18405 }, { "entropy": 5.189118957519531, "epoch": 1.7684918347742555, "grad_norm": 1.359375, "learning_rate": 0.0004690716665929893, "loss": 4.9179, "mean_token_accuracy": 0.2131508007645607, "num_tokens": 42190512.0, "step": 18410 }, { "entropy": 5.244009923934937, "epoch": 1.7689721421709894, "grad_norm": 1.21875, "learning_rate": 0.00046905431728554164, "loss": 4.968, "mean_token_accuracy": 0.213258358836174, "num_tokens": 42200996.0, "step": 18415 }, { "entropy": 5.138740158081054, "epoch": 1.7694524495677233, "grad_norm": 1.265625, "learning_rate": 0.00046903696347272894, "loss": 4.8941, "mean_token_accuracy": 0.22915413826704026, "num_tokens": 42213032.0, "step": 18420 }, { "entropy": 5.151826953887939, "epoch": 1.7699327569644572, "grad_norm": 1.3203125, "learning_rate": 0.00046901960515495413, "loss": 4.9574, "mean_token_accuracy": 0.213711653649807, "num_tokens": 42224366.0, "step": 18425 }, { "entropy": 5.080650377273559, "epoch": 1.770413064361191, "grad_norm": 1.2578125, "learning_rate": 0.0004690022423326202, "loss": 4.8162, "mean_token_accuracy": 0.2286729708313942, "num_tokens": 42235540.0, "step": 18430 }, { "entropy": 5.1232880592346195, "epoch": 1.770893371757925, "grad_norm": 1.2734375, "learning_rate": 0.0004689848750061303, "loss": 4.8721, "mean_token_accuracy": 0.22594636976718901, "num_tokens": 42246755.0, "step": 18435 }, { "entropy": 5.144126129150391, "epoch": 1.7713736791546588, "grad_norm": 1.2578125, "learning_rate": 0.0004689675031758876, "loss": 4.8891, "mean_token_accuracy": 0.22070587277412415, "num_tokens": 42259536.0, "step": 18440 }, { "entropy": 5.184299182891846, "epoch": 1.771853986551393, "grad_norm": 1.328125, "learning_rate": 0.0004689501268422954, "loss": 4.88, "mean_token_accuracy": 0.2211346685886383, "num_tokens": 42270813.0, "step": 18445 }, { "entropy": 5.265690517425537, "epoch": 1.7723342939481268, "grad_norm": 1.3671875, "learning_rate": 0.00046893274600575725, "loss": 5.0345, "mean_token_accuracy": 0.21505656242370605, "num_tokens": 42281953.0, "step": 18450 }, { "entropy": 5.187439250946045, "epoch": 1.7728146013448607, "grad_norm": 1.2421875, "learning_rate": 0.0004689153606666765, "loss": 5.0476, "mean_token_accuracy": 0.20950031727552415, "num_tokens": 42294041.0, "step": 18455 }, { "entropy": 5.180276155471802, "epoch": 1.7732949087415946, "grad_norm": 1.1171875, "learning_rate": 0.000468897970825457, "loss": 4.8725, "mean_token_accuracy": 0.22297078520059585, "num_tokens": 42305812.0, "step": 18460 }, { "entropy": 5.162807846069336, "epoch": 1.7737752161383287, "grad_norm": 1.2109375, "learning_rate": 0.00046888057648250233, "loss": 4.8246, "mean_token_accuracy": 0.22508549243211745, "num_tokens": 42317585.0, "step": 18465 }, { "entropy": 5.214176034927368, "epoch": 1.7742555235350626, "grad_norm": 1.5703125, "learning_rate": 0.0004688631776382164, "loss": 4.9353, "mean_token_accuracy": 0.2175424426794052, "num_tokens": 42328723.0, "step": 18470 }, { "entropy": 5.142759513854981, "epoch": 1.7747358309317964, "grad_norm": 1.2421875, "learning_rate": 0.00046884577429300305, "loss": 4.9217, "mean_token_accuracy": 0.22638531923294067, "num_tokens": 42340741.0, "step": 18475 }, { "entropy": 5.177513170242309, "epoch": 1.7752161383285303, "grad_norm": 1.265625, "learning_rate": 0.0004688283664472665, "loss": 4.9653, "mean_token_accuracy": 0.22027941346168517, "num_tokens": 42351504.0, "step": 18480 }, { "entropy": 5.203497743606567, "epoch": 1.7756964457252642, "grad_norm": 1.28125, "learning_rate": 0.00046881095410141084, "loss": 4.9315, "mean_token_accuracy": 0.2178051844239235, "num_tokens": 42362920.0, "step": 18485 }, { "entropy": 5.117229461669922, "epoch": 1.776176753121998, "grad_norm": 1.203125, "learning_rate": 0.00046879353725584036, "loss": 4.8404, "mean_token_accuracy": 0.22348118722438812, "num_tokens": 42374523.0, "step": 18490 }, { "entropy": 5.152104187011719, "epoch": 1.776657060518732, "grad_norm": 1.296875, "learning_rate": 0.00046877611591095923, "loss": 4.9442, "mean_token_accuracy": 0.21657546162605285, "num_tokens": 42384988.0, "step": 18495 }, { "entropy": 5.063777208328247, "epoch": 1.7771373679154658, "grad_norm": 1.296875, "learning_rate": 0.00046875869006717224, "loss": 4.8488, "mean_token_accuracy": 0.22488499134778978, "num_tokens": 42396503.0, "step": 18500 }, { "entropy": 5.220582485198975, "epoch": 1.7776176753121997, "grad_norm": 1.328125, "learning_rate": 0.00046874125972488375, "loss": 4.9486, "mean_token_accuracy": 0.2133714646100998, "num_tokens": 42406877.0, "step": 18505 }, { "entropy": 5.186308240890503, "epoch": 1.7780979827089336, "grad_norm": 1.171875, "learning_rate": 0.00046872382488449853, "loss": 4.89, "mean_token_accuracy": 0.2266918882727623, "num_tokens": 42418010.0, "step": 18510 }, { "entropy": 5.1456788063049315, "epoch": 1.7785782901056675, "grad_norm": 1.2578125, "learning_rate": 0.00046870638554642133, "loss": 4.8567, "mean_token_accuracy": 0.22165956050157548, "num_tokens": 42429056.0, "step": 18515 }, { "entropy": 5.1068305492401125, "epoch": 1.7790585975024016, "grad_norm": 1.265625, "learning_rate": 0.00046868894171105704, "loss": 4.8752, "mean_token_accuracy": 0.2224562093615532, "num_tokens": 42442294.0, "step": 18520 }, { "entropy": 5.205835819244385, "epoch": 1.7795389048991355, "grad_norm": 1.1875, "learning_rate": 0.0004686714933788107, "loss": 4.9486, "mean_token_accuracy": 0.21565259397029876, "num_tokens": 42454064.0, "step": 18525 }, { "entropy": 5.19152512550354, "epoch": 1.7800192122958693, "grad_norm": 1.1640625, "learning_rate": 0.0004686540405500873, "loss": 4.9295, "mean_token_accuracy": 0.22629985958337784, "num_tokens": 42465289.0, "step": 18530 }, { "entropy": 5.114503574371338, "epoch": 1.7804995196926032, "grad_norm": 1.296875, "learning_rate": 0.0004686365832252922, "loss": 4.8308, "mean_token_accuracy": 0.22398976534605025, "num_tokens": 42476662.0, "step": 18535 }, { "entropy": 5.182208681106568, "epoch": 1.7809798270893373, "grad_norm": 1.203125, "learning_rate": 0.00046861912140483056, "loss": 4.8944, "mean_token_accuracy": 0.22320135533809662, "num_tokens": 42486893.0, "step": 18540 }, { "entropy": 5.151525783538818, "epoch": 1.7814601344860712, "grad_norm": 1.15625, "learning_rate": 0.00046860165508910787, "loss": 4.8671, "mean_token_accuracy": 0.2246992588043213, "num_tokens": 42498202.0, "step": 18545 }, { "entropy": 5.105840873718262, "epoch": 1.781940441882805, "grad_norm": 1.265625, "learning_rate": 0.0004685841842785296, "loss": 4.7625, "mean_token_accuracy": 0.22927410155534744, "num_tokens": 42509378.0, "step": 18550 }, { "entropy": 5.1813897609710695, "epoch": 1.782420749279539, "grad_norm": 1.390625, "learning_rate": 0.0004685667089735014, "loss": 4.9547, "mean_token_accuracy": 0.22281541526317597, "num_tokens": 42519955.0, "step": 18555 }, { "entropy": 5.25247893333435, "epoch": 1.7829010566762729, "grad_norm": 1.359375, "learning_rate": 0.00046854922917442907, "loss": 5.0055, "mean_token_accuracy": 0.21443188935518265, "num_tokens": 42531896.0, "step": 18560 }, { "entropy": 5.18243556022644, "epoch": 1.7833813640730067, "grad_norm": 1.4765625, "learning_rate": 0.0004685317448817182, "loss": 4.9004, "mean_token_accuracy": 0.22561157047748565, "num_tokens": 42542123.0, "step": 18565 }, { "entropy": 5.176688098907471, "epoch": 1.7838616714697406, "grad_norm": 1.09375, "learning_rate": 0.0004685142560957751, "loss": 4.9098, "mean_token_accuracy": 0.21898285746574403, "num_tokens": 42554034.0, "step": 18570 }, { "entropy": 5.121625852584839, "epoch": 1.7843419788664745, "grad_norm": 1.2578125, "learning_rate": 0.0004684967628170054, "loss": 4.8628, "mean_token_accuracy": 0.22410739213228226, "num_tokens": 42564532.0, "step": 18575 }, { "entropy": 5.121355819702148, "epoch": 1.7848222862632084, "grad_norm": 0.984375, "learning_rate": 0.00046847926504581553, "loss": 4.8422, "mean_token_accuracy": 0.21688616573810576, "num_tokens": 42576864.0, "step": 18580 }, { "entropy": 5.106497955322266, "epoch": 1.7853025936599423, "grad_norm": 1.2109375, "learning_rate": 0.0004684617627826116, "loss": 4.8606, "mean_token_accuracy": 0.22770557105541228, "num_tokens": 42588571.0, "step": 18585 }, { "entropy": 5.218504762649536, "epoch": 1.7857829010566761, "grad_norm": 1.3984375, "learning_rate": 0.0004684442560278001, "loss": 4.9925, "mean_token_accuracy": 0.20908130556344987, "num_tokens": 42600912.0, "step": 18590 }, { "entropy": 5.226399898529053, "epoch": 1.78626320845341, "grad_norm": 1.1796875, "learning_rate": 0.00046842674478178727, "loss": 4.8523, "mean_token_accuracy": 0.22524797022342682, "num_tokens": 42612929.0, "step": 18595 }, { "entropy": 5.167687702178955, "epoch": 1.7867435158501441, "grad_norm": 1.125, "learning_rate": 0.0004684092290449798, "loss": 4.9383, "mean_token_accuracy": 0.22270502150058746, "num_tokens": 42623688.0, "step": 18600 }, { "entropy": 5.221596956253052, "epoch": 1.787223823246878, "grad_norm": 1.125, "learning_rate": 0.0004683917088177844, "loss": 5.0482, "mean_token_accuracy": 0.20961541533470154, "num_tokens": 42635773.0, "step": 18605 }, { "entropy": 5.200922203063965, "epoch": 1.7877041306436119, "grad_norm": 1.2734375, "learning_rate": 0.0004683741841006077, "loss": 4.8933, "mean_token_accuracy": 0.2215485990047455, "num_tokens": 42647210.0, "step": 18610 }, { "entropy": 5.1452563285827635, "epoch": 1.7881844380403458, "grad_norm": 1.25, "learning_rate": 0.0004683566548938567, "loss": 4.9286, "mean_token_accuracy": 0.21941764205694197, "num_tokens": 42658029.0, "step": 18615 }, { "entropy": 5.139180374145508, "epoch": 1.7886647454370799, "grad_norm": 1.0546875, "learning_rate": 0.0004683391211979383, "loss": 4.9548, "mean_token_accuracy": 0.21801802963018418, "num_tokens": 42669327.0, "step": 18620 }, { "entropy": 5.2122523307800295, "epoch": 1.7891450528338138, "grad_norm": 1.2265625, "learning_rate": 0.0004683215830132597, "loss": 4.8596, "mean_token_accuracy": 0.22672210782766342, "num_tokens": 42679794.0, "step": 18625 }, { "entropy": 5.142408990859986, "epoch": 1.7896253602305476, "grad_norm": 1.3046875, "learning_rate": 0.00046830404034022786, "loss": 4.8864, "mean_token_accuracy": 0.23015541732311248, "num_tokens": 42689838.0, "step": 18630 }, { "entropy": 5.161951875686645, "epoch": 1.7901056676272815, "grad_norm": 1.21875, "learning_rate": 0.0004682864931792502, "loss": 4.9635, "mean_token_accuracy": 0.2196178674697876, "num_tokens": 42700935.0, "step": 18635 }, { "entropy": 5.285586929321289, "epoch": 1.7905859750240154, "grad_norm": 1.171875, "learning_rate": 0.0004682689415307342, "loss": 5.0287, "mean_token_accuracy": 0.20989848375320436, "num_tokens": 42712533.0, "step": 18640 }, { "entropy": 5.190979862213135, "epoch": 1.7910662824207493, "grad_norm": 1.1640625, "learning_rate": 0.0004682513853950872, "loss": 4.9081, "mean_token_accuracy": 0.22632770985364914, "num_tokens": 42723604.0, "step": 18645 }, { "entropy": 5.207202386856079, "epoch": 1.7915465898174832, "grad_norm": 1.3515625, "learning_rate": 0.000468233824772717, "loss": 4.9157, "mean_token_accuracy": 0.21897136121988298, "num_tokens": 42735957.0, "step": 18650 }, { "entropy": 5.185697603225708, "epoch": 1.792026897214217, "grad_norm": 1.1875, "learning_rate": 0.000468216259664031, "loss": 5.0322, "mean_token_accuracy": 0.2145911380648613, "num_tokens": 42747287.0, "step": 18655 }, { "entropy": 5.173118257522583, "epoch": 1.792507204610951, "grad_norm": 1.21875, "learning_rate": 0.00046819869006943727, "loss": 4.9094, "mean_token_accuracy": 0.22061461806297303, "num_tokens": 42759270.0, "step": 18660 }, { "entropy": 5.142519521713257, "epoch": 1.7929875120076848, "grad_norm": 1.125, "learning_rate": 0.0004681811159893436, "loss": 4.8533, "mean_token_accuracy": 0.22513288110494614, "num_tokens": 42771955.0, "step": 18665 }, { "entropy": 5.2744043350219725, "epoch": 1.7934678194044187, "grad_norm": 1.078125, "learning_rate": 0.00046816353742415814, "loss": 5.0293, "mean_token_accuracy": 0.21436074376106262, "num_tokens": 42784610.0, "step": 18670 }, { "entropy": 5.184707164764404, "epoch": 1.7939481268011528, "grad_norm": 1.28125, "learning_rate": 0.00046814595437428885, "loss": 4.9121, "mean_token_accuracy": 0.2221132293343544, "num_tokens": 42795221.0, "step": 18675 }, { "entropy": 5.069939041137696, "epoch": 1.7944284341978867, "grad_norm": 1.15625, "learning_rate": 0.000468128366840144, "loss": 4.8812, "mean_token_accuracy": 0.22387212961912156, "num_tokens": 42807475.0, "step": 18680 }, { "entropy": 5.158438444137573, "epoch": 1.7949087415946205, "grad_norm": 1.140625, "learning_rate": 0.000468110774822132, "loss": 4.901, "mean_token_accuracy": 0.22224035561084748, "num_tokens": 42819193.0, "step": 18685 }, { "entropy": 5.226526880264283, "epoch": 1.7953890489913544, "grad_norm": 1.125, "learning_rate": 0.0004680931783206612, "loss": 4.9806, "mean_token_accuracy": 0.22079339921474456, "num_tokens": 42831267.0, "step": 18690 }, { "entropy": 5.199577331542969, "epoch": 1.7958693563880885, "grad_norm": 1.2109375, "learning_rate": 0.00046807557733614014, "loss": 4.994, "mean_token_accuracy": 0.21518171280622483, "num_tokens": 42843066.0, "step": 18695 }, { "entropy": 5.143007516860962, "epoch": 1.7963496637848224, "grad_norm": 1.1875, "learning_rate": 0.00046805797186897757, "loss": 4.8699, "mean_token_accuracy": 0.22779001146554947, "num_tokens": 42854630.0, "step": 18700 }, { "entropy": 5.2355574607849125, "epoch": 1.7968299711815563, "grad_norm": 1.3046875, "learning_rate": 0.00046804036191958206, "loss": 4.9618, "mean_token_accuracy": 0.21646393537521363, "num_tokens": 42865986.0, "step": 18705 }, { "entropy": 5.242122411727905, "epoch": 1.7973102785782902, "grad_norm": 1.2109375, "learning_rate": 0.00046802274748836267, "loss": 4.9731, "mean_token_accuracy": 0.2180660679936409, "num_tokens": 42877533.0, "step": 18710 }, { "entropy": 5.145714998245239, "epoch": 1.797790585975024, "grad_norm": 1.3125, "learning_rate": 0.0004680051285757281, "loss": 4.8409, "mean_token_accuracy": 0.2281106159090996, "num_tokens": 42889114.0, "step": 18715 }, { "entropy": 5.230079174041748, "epoch": 1.798270893371758, "grad_norm": 1.3515625, "learning_rate": 0.0004679875051820877, "loss": 5.0519, "mean_token_accuracy": 0.20809556990861894, "num_tokens": 42899483.0, "step": 18720 }, { "entropy": 5.178544616699218, "epoch": 1.7987512007684918, "grad_norm": 1.078125, "learning_rate": 0.0004679698773078503, "loss": 4.8805, "mean_token_accuracy": 0.2276952013373375, "num_tokens": 42910267.0, "step": 18725 }, { "entropy": 5.102669811248779, "epoch": 1.7992315081652257, "grad_norm": 1.3359375, "learning_rate": 0.00046795224495342554, "loss": 4.8994, "mean_token_accuracy": 0.2255684345960617, "num_tokens": 42922440.0, "step": 18730 }, { "entropy": 5.137301397323609, "epoch": 1.7997118155619596, "grad_norm": 1.3828125, "learning_rate": 0.00046793460811922255, "loss": 4.8559, "mean_token_accuracy": 0.22756927013397216, "num_tokens": 42933967.0, "step": 18735 }, { "entropy": 5.228575134277344, "epoch": 1.8001921229586935, "grad_norm": 1.1796875, "learning_rate": 0.00046791696680565075, "loss": 4.8842, "mean_token_accuracy": 0.22466631084680558, "num_tokens": 42945049.0, "step": 18740 }, { "entropy": 5.231136322021484, "epoch": 1.8006724303554273, "grad_norm": 1.1640625, "learning_rate": 0.00046789932101312003, "loss": 5.0187, "mean_token_accuracy": 0.2174960657954216, "num_tokens": 42956062.0, "step": 18745 }, { "entropy": 5.230274868011475, "epoch": 1.8011527377521612, "grad_norm": 1.265625, "learning_rate": 0.0004678816707420397, "loss": 5.0032, "mean_token_accuracy": 0.21903230547904967, "num_tokens": 42967687.0, "step": 18750 }, { "entropy": 5.197688245773316, "epoch": 1.8016330451488953, "grad_norm": 1.265625, "learning_rate": 0.0004678640159928198, "loss": 4.964, "mean_token_accuracy": 0.21139907091856003, "num_tokens": 42979916.0, "step": 18755 }, { "entropy": 5.182051801681519, "epoch": 1.8021133525456292, "grad_norm": 1.171875, "learning_rate": 0.0004678463567658701, "loss": 4.957, "mean_token_accuracy": 0.2161658376455307, "num_tokens": 42991639.0, "step": 18760 }, { "entropy": 5.224612426757813, "epoch": 1.802593659942363, "grad_norm": 1.2109375, "learning_rate": 0.0004678286930616006, "loss": 4.9208, "mean_token_accuracy": 0.2289966121315956, "num_tokens": 43001843.0, "step": 18765 }, { "entropy": 5.08487868309021, "epoch": 1.803073967339097, "grad_norm": 1.375, "learning_rate": 0.0004678110248804215, "loss": 4.8759, "mean_token_accuracy": 0.21512030959129333, "num_tokens": 43013249.0, "step": 18770 }, { "entropy": 5.132952308654785, "epoch": 1.803554274735831, "grad_norm": 1.34375, "learning_rate": 0.00046779335222274293, "loss": 4.9273, "mean_token_accuracy": 0.22092564702033995, "num_tokens": 43025867.0, "step": 18775 }, { "entropy": 5.224576234817505, "epoch": 1.804034582132565, "grad_norm": 1.34375, "learning_rate": 0.00046777567508897515, "loss": 4.9973, "mean_token_accuracy": 0.21442876756191254, "num_tokens": 43037530.0, "step": 18780 }, { "entropy": 5.17865571975708, "epoch": 1.8045148895292988, "grad_norm": 1.25, "learning_rate": 0.00046775799347952864, "loss": 4.9551, "mean_token_accuracy": 0.21771474480628966, "num_tokens": 43048675.0, "step": 18785 }, { "entropy": 5.216196775436401, "epoch": 1.8049951969260327, "grad_norm": 1.234375, "learning_rate": 0.0004677403073948139, "loss": 4.9717, "mean_token_accuracy": 0.2206488221883774, "num_tokens": 43059592.0, "step": 18790 }, { "entropy": 5.095714521408081, "epoch": 1.8054755043227666, "grad_norm": 1.4921875, "learning_rate": 0.0004677226168352416, "loss": 4.791, "mean_token_accuracy": 0.22456269711256027, "num_tokens": 43071755.0, "step": 18795 }, { "entropy": 5.183867025375366, "epoch": 1.8059558117195005, "grad_norm": 1.34375, "learning_rate": 0.0004677049218012223, "loss": 4.9514, "mean_token_accuracy": 0.22125904858112336, "num_tokens": 43081801.0, "step": 18800 }, { "entropy": 5.173227787017822, "epoch": 1.8064361191162344, "grad_norm": 1.1484375, "learning_rate": 0.000467687222293167, "loss": 4.8655, "mean_token_accuracy": 0.22511634826660157, "num_tokens": 43094160.0, "step": 18805 }, { "entropy": 5.154326152801514, "epoch": 1.8069164265129682, "grad_norm": 1.171875, "learning_rate": 0.0004676695183114866, "loss": 4.9346, "mean_token_accuracy": 0.22008128166198732, "num_tokens": 43106082.0, "step": 18810 }, { "entropy": 5.140604019165039, "epoch": 1.8073967339097021, "grad_norm": 1.3125, "learning_rate": 0.000467651809856592, "loss": 4.9109, "mean_token_accuracy": 0.21887325048446654, "num_tokens": 43117215.0, "step": 18815 }, { "entropy": 5.256780433654785, "epoch": 1.807877041306436, "grad_norm": 1.234375, "learning_rate": 0.00046763409692889446, "loss": 5.0994, "mean_token_accuracy": 0.20832848697900772, "num_tokens": 43127858.0, "step": 18820 }, { "entropy": 5.255016899108886, "epoch": 1.8083573487031699, "grad_norm": 1.234375, "learning_rate": 0.00046761637952880516, "loss": 4.9017, "mean_token_accuracy": 0.2215781033039093, "num_tokens": 43139952.0, "step": 18825 }, { "entropy": 5.103308439254761, "epoch": 1.808837656099904, "grad_norm": 1.2421875, "learning_rate": 0.00046759865765673555, "loss": 4.8262, "mean_token_accuracy": 0.22883502542972564, "num_tokens": 43150900.0, "step": 18830 }, { "entropy": 5.1161055088043215, "epoch": 1.8093179634966379, "grad_norm": 1.15625, "learning_rate": 0.000467580931313097, "loss": 4.8714, "mean_token_accuracy": 0.2249667078256607, "num_tokens": 43163199.0, "step": 18835 }, { "entropy": 5.187123489379883, "epoch": 1.8097982708933718, "grad_norm": 1.1640625, "learning_rate": 0.00046756320049830106, "loss": 4.8581, "mean_token_accuracy": 0.21825749725103377, "num_tokens": 43174147.0, "step": 18840 }, { "entropy": 5.133212614059448, "epoch": 1.8102785782901056, "grad_norm": 1.21875, "learning_rate": 0.0004675454652127594, "loss": 4.8336, "mean_token_accuracy": 0.22265468090772628, "num_tokens": 43186191.0, "step": 18845 }, { "entropy": 5.20296802520752, "epoch": 1.8107588856868397, "grad_norm": 1.359375, "learning_rate": 0.00046752772545688377, "loss": 5.0445, "mean_token_accuracy": 0.2127169817686081, "num_tokens": 43197541.0, "step": 18850 }, { "entropy": 5.243007707595825, "epoch": 1.8112391930835736, "grad_norm": 1.296875, "learning_rate": 0.0004675099812310861, "loss": 4.9299, "mean_token_accuracy": 0.2172359123826027, "num_tokens": 43208792.0, "step": 18855 }, { "entropy": 5.255295085906982, "epoch": 1.8117195004803075, "grad_norm": 1.1640625, "learning_rate": 0.0004674922325357782, "loss": 5.0026, "mean_token_accuracy": 0.21394715160131456, "num_tokens": 43219914.0, "step": 18860 }, { "entropy": 5.067933750152588, "epoch": 1.8121998078770414, "grad_norm": 1.2421875, "learning_rate": 0.00046747447937137235, "loss": 4.8319, "mean_token_accuracy": 0.22988979518413544, "num_tokens": 43232179.0, "step": 18865 }, { "entropy": 5.12700834274292, "epoch": 1.8126801152737753, "grad_norm": 1.4375, "learning_rate": 0.00046745672173828057, "loss": 4.8656, "mean_token_accuracy": 0.21539948731660843, "num_tokens": 43243196.0, "step": 18870 }, { "entropy": 5.245073318481445, "epoch": 1.8131604226705091, "grad_norm": 1.265625, "learning_rate": 0.0004674389596369151, "loss": 4.9478, "mean_token_accuracy": 0.21452756077051163, "num_tokens": 43255573.0, "step": 18875 }, { "entropy": 5.197284078598022, "epoch": 1.813640730067243, "grad_norm": 1.1015625, "learning_rate": 0.00046742119306768855, "loss": 4.8845, "mean_token_accuracy": 0.21725525557994843, "num_tokens": 43267679.0, "step": 18880 }, { "entropy": 5.266736459732056, "epoch": 1.814121037463977, "grad_norm": 1.25, "learning_rate": 0.0004674034220310132, "loss": 5.02, "mean_token_accuracy": 0.21002791672945023, "num_tokens": 43278857.0, "step": 18885 }, { "entropy": 5.224592781066894, "epoch": 1.8146013448607108, "grad_norm": 1.359375, "learning_rate": 0.00046738564652730176, "loss": 4.9148, "mean_token_accuracy": 0.2243320897221565, "num_tokens": 43291099.0, "step": 18890 }, { "entropy": 5.180141496658325, "epoch": 1.8150816522574447, "grad_norm": 1.234375, "learning_rate": 0.0004673678665569669, "loss": 4.9346, "mean_token_accuracy": 0.21560515463352203, "num_tokens": 43302863.0, "step": 18895 }, { "entropy": 5.17077054977417, "epoch": 1.8155619596541785, "grad_norm": 1.1328125, "learning_rate": 0.0004673500821204213, "loss": 4.8709, "mean_token_accuracy": 0.22833613753318788, "num_tokens": 43314673.0, "step": 18900 }, { "entropy": 5.162666893005371, "epoch": 1.8160422670509124, "grad_norm": 1.25, "learning_rate": 0.000467332293218078, "loss": 4.8609, "mean_token_accuracy": 0.22799091786146164, "num_tokens": 43325927.0, "step": 18905 }, { "entropy": 5.155388593673706, "epoch": 1.8165225744476465, "grad_norm": 1.28125, "learning_rate": 0.00046731449985035, "loss": 4.9259, "mean_token_accuracy": 0.22295121848583221, "num_tokens": 43337615.0, "step": 18910 }, { "entropy": 5.222216415405273, "epoch": 1.8170028818443804, "grad_norm": 1.3125, "learning_rate": 0.00046729670201765036, "loss": 4.9945, "mean_token_accuracy": 0.21877157241106032, "num_tokens": 43349195.0, "step": 18915 }, { "entropy": 5.218001508712769, "epoch": 1.8174831892411143, "grad_norm": 1.3359375, "learning_rate": 0.00046727889972039227, "loss": 4.9506, "mean_token_accuracy": 0.21841635107994078, "num_tokens": 43361274.0, "step": 18920 }, { "entropy": 5.12210259437561, "epoch": 1.8179634966378482, "grad_norm": 1.265625, "learning_rate": 0.00046726109295898904, "loss": 4.8873, "mean_token_accuracy": 0.22357902377843858, "num_tokens": 43372843.0, "step": 18925 }, { "entropy": 5.185367012023926, "epoch": 1.8184438040345823, "grad_norm": 1.171875, "learning_rate": 0.0004672432817338542, "loss": 4.8715, "mean_token_accuracy": 0.2250346526503563, "num_tokens": 43383492.0, "step": 18930 }, { "entropy": 5.152243757247925, "epoch": 1.8189241114313162, "grad_norm": 1.2421875, "learning_rate": 0.00046722546604540115, "loss": 4.897, "mean_token_accuracy": 0.2156965285539627, "num_tokens": 43395669.0, "step": 18935 }, { "entropy": 5.071681165695191, "epoch": 1.81940441882805, "grad_norm": 1.09375, "learning_rate": 0.0004672076458940436, "loss": 4.7988, "mean_token_accuracy": 0.22689439207315446, "num_tokens": 43407340.0, "step": 18940 }, { "entropy": 5.160733652114868, "epoch": 1.819884726224784, "grad_norm": 1.40625, "learning_rate": 0.00046718982128019534, "loss": 4.9413, "mean_token_accuracy": 0.21972607225179672, "num_tokens": 43418607.0, "step": 18945 }, { "entropy": 5.311149311065674, "epoch": 1.8203650336215178, "grad_norm": 1.1640625, "learning_rate": 0.00046717199220427003, "loss": 5.0566, "mean_token_accuracy": 0.21264605075120926, "num_tokens": 43429286.0, "step": 18950 }, { "entropy": 5.150112533569336, "epoch": 1.8208453410182517, "grad_norm": 1.1953125, "learning_rate": 0.00046715415866668163, "loss": 4.8894, "mean_token_accuracy": 0.22030486166477203, "num_tokens": 43440794.0, "step": 18955 }, { "entropy": 5.130909872055054, "epoch": 1.8213256484149856, "grad_norm": 1.1484375, "learning_rate": 0.0004671363206678443, "loss": 4.9403, "mean_token_accuracy": 0.2197520062327385, "num_tokens": 43452184.0, "step": 18960 }, { "entropy": 5.096556758880615, "epoch": 1.8218059558117194, "grad_norm": 1.40625, "learning_rate": 0.00046711847820817215, "loss": 4.8894, "mean_token_accuracy": 0.22419148236513137, "num_tokens": 43463361.0, "step": 18965 }, { "entropy": 5.205463838577271, "epoch": 1.8222862632084533, "grad_norm": 1.15625, "learning_rate": 0.0004671006312880794, "loss": 4.9319, "mean_token_accuracy": 0.21770550161600113, "num_tokens": 43474802.0, "step": 18970 }, { "entropy": 5.156509208679199, "epoch": 1.8227665706051872, "grad_norm": 1.140625, "learning_rate": 0.0004670827799079805, "loss": 4.8782, "mean_token_accuracy": 0.221728877723217, "num_tokens": 43486962.0, "step": 18975 }, { "entropy": 5.206051826477051, "epoch": 1.823246878001921, "grad_norm": 1.1640625, "learning_rate": 0.00046706492406828966, "loss": 4.9016, "mean_token_accuracy": 0.223107148706913, "num_tokens": 43498761.0, "step": 18980 }, { "entropy": 5.212453365325928, "epoch": 1.8237271853986552, "grad_norm": 1.1171875, "learning_rate": 0.0004670470637694217, "loss": 4.9724, "mean_token_accuracy": 0.22523002177476883, "num_tokens": 43511294.0, "step": 18985 }, { "entropy": 5.180626010894775, "epoch": 1.824207492795389, "grad_norm": 1.1875, "learning_rate": 0.0004670291990117912, "loss": 4.8959, "mean_token_accuracy": 0.222798952460289, "num_tokens": 43522858.0, "step": 18990 }, { "entropy": 5.216363430023193, "epoch": 1.824687800192123, "grad_norm": 1.1640625, "learning_rate": 0.0004670113297958128, "loss": 4.8931, "mean_token_accuracy": 0.21746156513690948, "num_tokens": 43533453.0, "step": 18995 }, { "entropy": 5.1565718173980715, "epoch": 1.8251681075888568, "grad_norm": 1.3515625, "learning_rate": 0.00046699345612190155, "loss": 4.8594, "mean_token_accuracy": 0.23229910880327226, "num_tokens": 43543889.0, "step": 19000 }, { "entropy": 5.084559917449951, "epoch": 1.825648414985591, "grad_norm": 1.2734375, "learning_rate": 0.00046697557799047233, "loss": 4.8411, "mean_token_accuracy": 0.23009685277938843, "num_tokens": 43554815.0, "step": 19005 }, { "entropy": 5.076380491256714, "epoch": 1.8261287223823248, "grad_norm": 1.2421875, "learning_rate": 0.0004669576954019403, "loss": 4.779, "mean_token_accuracy": 0.23019351810216904, "num_tokens": 43565258.0, "step": 19010 }, { "entropy": 5.193867635726929, "epoch": 1.8266090297790587, "grad_norm": 1.21875, "learning_rate": 0.0004669398083567205, "loss": 4.9275, "mean_token_accuracy": 0.21558635979890822, "num_tokens": 43576710.0, "step": 19015 }, { "entropy": 5.190999507904053, "epoch": 1.8270893371757926, "grad_norm": 1.21875, "learning_rate": 0.0004669219168552284, "loss": 4.9226, "mean_token_accuracy": 0.21981949657201766, "num_tokens": 43587590.0, "step": 19020 }, { "entropy": 5.135986948013306, "epoch": 1.8275696445725265, "grad_norm": 1.34375, "learning_rate": 0.00046690402089787916, "loss": 4.9395, "mean_token_accuracy": 0.21903201937675476, "num_tokens": 43599675.0, "step": 19025 }, { "entropy": 5.108835506439209, "epoch": 1.8280499519692603, "grad_norm": 1.21875, "learning_rate": 0.0004668861204850884, "loss": 4.8008, "mean_token_accuracy": 0.2312204658985138, "num_tokens": 43612369.0, "step": 19030 }, { "entropy": 5.134114503860474, "epoch": 1.8285302593659942, "grad_norm": 1.234375, "learning_rate": 0.00046686821561727176, "loss": 4.8287, "mean_token_accuracy": 0.22977259159088134, "num_tokens": 43624807.0, "step": 19035 }, { "entropy": 5.119396448135376, "epoch": 1.829010566762728, "grad_norm": 1.15625, "learning_rate": 0.0004668503062948449, "loss": 4.9142, "mean_token_accuracy": 0.22185174524784088, "num_tokens": 43635389.0, "step": 19040 }, { "entropy": 5.198632860183716, "epoch": 1.829490874159462, "grad_norm": 1.2265625, "learning_rate": 0.0004668323925182236, "loss": 4.8978, "mean_token_accuracy": 0.21805914491415024, "num_tokens": 43646807.0, "step": 19045 }, { "entropy": 5.205413579940796, "epoch": 1.8299711815561959, "grad_norm": 1.25, "learning_rate": 0.00046681447428782377, "loss": 4.9393, "mean_token_accuracy": 0.218313068151474, "num_tokens": 43657910.0, "step": 19050 }, { "entropy": 5.148894500732422, "epoch": 1.8304514889529298, "grad_norm": 1.171875, "learning_rate": 0.0004667965516040613, "loss": 4.8903, "mean_token_accuracy": 0.2225254535675049, "num_tokens": 43669376.0, "step": 19055 }, { "entropy": 5.132978916168213, "epoch": 1.8309317963496636, "grad_norm": 1.1640625, "learning_rate": 0.0004667786244673526, "loss": 4.8537, "mean_token_accuracy": 0.2235651895403862, "num_tokens": 43681702.0, "step": 19060 }, { "entropy": 5.235198020935059, "epoch": 1.8314121037463977, "grad_norm": 1.2109375, "learning_rate": 0.00046676069287811365, "loss": 4.9819, "mean_token_accuracy": 0.2188320890069008, "num_tokens": 43694795.0, "step": 19065 }, { "entropy": 5.248945569992065, "epoch": 1.8318924111431316, "grad_norm": 1.109375, "learning_rate": 0.0004667427568367607, "loss": 4.9883, "mean_token_accuracy": 0.22114041894674302, "num_tokens": 43707197.0, "step": 19070 }, { "entropy": 5.220323514938355, "epoch": 1.8323727185398655, "grad_norm": 1.2265625, "learning_rate": 0.00046672481634371047, "loss": 4.9703, "mean_token_accuracy": 0.21035372614860534, "num_tokens": 43719066.0, "step": 19075 }, { "entropy": 5.185680723190307, "epoch": 1.8328530259365994, "grad_norm": 1.171875, "learning_rate": 0.00046670687139937925, "loss": 4.9139, "mean_token_accuracy": 0.22479754090309143, "num_tokens": 43731080.0, "step": 19080 }, { "entropy": 5.197793245315552, "epoch": 1.8333333333333335, "grad_norm": 1.0703125, "learning_rate": 0.0004666889220041837, "loss": 4.9586, "mean_token_accuracy": 0.22060257345438003, "num_tokens": 43743048.0, "step": 19085 }, { "entropy": 5.127030611038208, "epoch": 1.8338136407300674, "grad_norm": 1.2734375, "learning_rate": 0.00046667096815854056, "loss": 4.9217, "mean_token_accuracy": 0.21732288599014282, "num_tokens": 43754701.0, "step": 19090 }, { "entropy": 5.1769365787506105, "epoch": 1.8342939481268012, "grad_norm": 1.3515625, "learning_rate": 0.0004666530098628667, "loss": 4.9569, "mean_token_accuracy": 0.21874051839113234, "num_tokens": 43765696.0, "step": 19095 }, { "entropy": 5.254151487350464, "epoch": 1.8347742555235351, "grad_norm": 1.1171875, "learning_rate": 0.0004666350471175791, "loss": 5.0066, "mean_token_accuracy": 0.21649441719055176, "num_tokens": 43777162.0, "step": 19100 }, { "entropy": 5.217176914215088, "epoch": 1.835254562920269, "grad_norm": 1.3125, "learning_rate": 0.0004666170799230947, "loss": 4.9294, "mean_token_accuracy": 0.21871206164360046, "num_tokens": 43789340.0, "step": 19105 }, { "entropy": 5.128383445739746, "epoch": 1.8357348703170029, "grad_norm": 1.3359375, "learning_rate": 0.0004665991082798307, "loss": 4.8158, "mean_token_accuracy": 0.22825666517019272, "num_tokens": 43800098.0, "step": 19110 }, { "entropy": 5.125943803787232, "epoch": 1.8362151777137368, "grad_norm": 1.46875, "learning_rate": 0.0004665811321882043, "loss": 4.9025, "mean_token_accuracy": 0.22539056986570358, "num_tokens": 43811403.0, "step": 19115 }, { "entropy": 5.256423711776733, "epoch": 1.8366954851104706, "grad_norm": 1.46875, "learning_rate": 0.00046656315164863297, "loss": 4.9268, "mean_token_accuracy": 0.22319784462451936, "num_tokens": 43821946.0, "step": 19120 }, { "entropy": 5.073716640472412, "epoch": 1.8371757925072045, "grad_norm": 1.1953125, "learning_rate": 0.00046654516666153403, "loss": 4.8323, "mean_token_accuracy": 0.2236419141292572, "num_tokens": 43832638.0, "step": 19125 }, { "entropy": 5.12318000793457, "epoch": 1.8376560999039384, "grad_norm": 1.234375, "learning_rate": 0.0004665271772273251, "loss": 4.8481, "mean_token_accuracy": 0.22164386957883836, "num_tokens": 43843616.0, "step": 19130 }, { "entropy": 5.230293321609497, "epoch": 1.8381364073006723, "grad_norm": 1.328125, "learning_rate": 0.0004665091833464239, "loss": 4.8637, "mean_token_accuracy": 0.22107865214347838, "num_tokens": 43854680.0, "step": 19135 }, { "entropy": 5.138217401504517, "epoch": 1.8386167146974062, "grad_norm": 1.2890625, "learning_rate": 0.00046649118501924805, "loss": 4.8908, "mean_token_accuracy": 0.2224344864487648, "num_tokens": 43866683.0, "step": 19140 }, { "entropy": 5.197470855712891, "epoch": 1.8390970220941403, "grad_norm": 1.1875, "learning_rate": 0.0004664731822462154, "loss": 4.9705, "mean_token_accuracy": 0.2186468482017517, "num_tokens": 43878080.0, "step": 19145 }, { "entropy": 5.240995216369629, "epoch": 1.8395773294908742, "grad_norm": 1.171875, "learning_rate": 0.00046645517502774415, "loss": 4.9098, "mean_token_accuracy": 0.2190140336751938, "num_tokens": 43888633.0, "step": 19150 }, { "entropy": 5.252007484436035, "epoch": 1.840057636887608, "grad_norm": 1.296875, "learning_rate": 0.00046643716336425224, "loss": 4.9367, "mean_token_accuracy": 0.2217061460018158, "num_tokens": 43899678.0, "step": 19155 }, { "entropy": 5.145054817199707, "epoch": 1.8405379442843421, "grad_norm": 1.1328125, "learning_rate": 0.0004664191472561578, "loss": 4.8904, "mean_token_accuracy": 0.22237008661031724, "num_tokens": 43910744.0, "step": 19160 }, { "entropy": 5.220931482315064, "epoch": 1.841018251681076, "grad_norm": 1.1796875, "learning_rate": 0.0004664011267038792, "loss": 5.0016, "mean_token_accuracy": 0.21928611844778062, "num_tokens": 43922467.0, "step": 19165 }, { "entropy": 5.142969036102295, "epoch": 1.84149855907781, "grad_norm": 1.0625, "learning_rate": 0.00046638310170783476, "loss": 4.8598, "mean_token_accuracy": 0.22431076914072037, "num_tokens": 43934919.0, "step": 19170 }, { "entropy": 5.187114095687866, "epoch": 1.8419788664745438, "grad_norm": 1.1171875, "learning_rate": 0.000466365072268443, "loss": 4.9115, "mean_token_accuracy": 0.22165709882974624, "num_tokens": 43947737.0, "step": 19175 }, { "entropy": 5.130555963516235, "epoch": 1.8424591738712777, "grad_norm": 1.1953125, "learning_rate": 0.0004663470383861225, "loss": 4.848, "mean_token_accuracy": 0.22130993008613586, "num_tokens": 43959656.0, "step": 19180 }, { "entropy": 5.1659932136535645, "epoch": 1.8429394812680115, "grad_norm": 1.25, "learning_rate": 0.000466329000061292, "loss": 4.9018, "mean_token_accuracy": 0.22112877368927003, "num_tokens": 43970732.0, "step": 19185 }, { "entropy": 5.174559164047241, "epoch": 1.8434197886647454, "grad_norm": 1.2734375, "learning_rate": 0.0004663109572943702, "loss": 4.8847, "mean_token_accuracy": 0.21701590120792388, "num_tokens": 43982369.0, "step": 19190 }, { "entropy": 5.214370965957642, "epoch": 1.8439000960614793, "grad_norm": 1.140625, "learning_rate": 0.000466292910085776, "loss": 4.9312, "mean_token_accuracy": 0.21655133664608, "num_tokens": 43993751.0, "step": 19195 }, { "entropy": 5.124353647232056, "epoch": 1.8443804034582132, "grad_norm": 1.1953125, "learning_rate": 0.00046627485843592854, "loss": 4.8756, "mean_token_accuracy": 0.2187720462679863, "num_tokens": 44005021.0, "step": 19200 }, { "entropy": 5.108295059204101, "epoch": 1.844860710854947, "grad_norm": 1.1796875, "learning_rate": 0.00046625680234524674, "loss": 4.8729, "mean_token_accuracy": 0.2216949701309204, "num_tokens": 44015571.0, "step": 19205 }, { "entropy": 5.198014593124389, "epoch": 1.845341018251681, "grad_norm": 1.2109375, "learning_rate": 0.00046623874181414993, "loss": 4.9338, "mean_token_accuracy": 0.21955927908420564, "num_tokens": 44026363.0, "step": 19210 }, { "entropy": 5.117824935913086, "epoch": 1.8458213256484148, "grad_norm": 1.15625, "learning_rate": 0.0004662206768430574, "loss": 4.8308, "mean_token_accuracy": 0.23248151242733, "num_tokens": 44038042.0, "step": 19215 }, { "entropy": 5.17954683303833, "epoch": 1.846301633045149, "grad_norm": 1.2734375, "learning_rate": 0.0004662026074323886, "loss": 4.9171, "mean_token_accuracy": 0.21451413333415986, "num_tokens": 44049086.0, "step": 19220 }, { "entropy": 5.197770690917968, "epoch": 1.8467819404418828, "grad_norm": 1.1875, "learning_rate": 0.00046618453358256303, "loss": 4.927, "mean_token_accuracy": 0.22353516668081283, "num_tokens": 44060816.0, "step": 19225 }, { "entropy": 5.115513134002685, "epoch": 1.8472622478386167, "grad_norm": 1.15625, "learning_rate": 0.00046616645529400026, "loss": 4.7785, "mean_token_accuracy": 0.22394300550222396, "num_tokens": 44071877.0, "step": 19230 }, { "entropy": 5.157533788681031, "epoch": 1.8477425552353506, "grad_norm": 1.09375, "learning_rate": 0.0004661483725671201, "loss": 4.9054, "mean_token_accuracy": 0.22069804072380067, "num_tokens": 44082618.0, "step": 19235 }, { "entropy": 5.154093313217163, "epoch": 1.8482228626320847, "grad_norm": 1.1796875, "learning_rate": 0.00046613028540234226, "loss": 4.8624, "mean_token_accuracy": 0.22440769523382187, "num_tokens": 44093411.0, "step": 19240 }, { "entropy": 5.08169150352478, "epoch": 1.8487031700288186, "grad_norm": 1.203125, "learning_rate": 0.0004661121938000867, "loss": 4.7732, "mean_token_accuracy": 0.23668147772550582, "num_tokens": 44103595.0, "step": 19245 }, { "entropy": 5.101725435256958, "epoch": 1.8491834774255524, "grad_norm": 1.453125, "learning_rate": 0.0004660940977607736, "loss": 4.8774, "mean_token_accuracy": 0.23302264213562013, "num_tokens": 44114740.0, "step": 19250 }, { "entropy": 5.108340311050415, "epoch": 1.8496637848222863, "grad_norm": 1.265625, "learning_rate": 0.00046607599728482285, "loss": 4.8246, "mean_token_accuracy": 0.22913457453250885, "num_tokens": 44125415.0, "step": 19255 }, { "entropy": 5.1172998428344725, "epoch": 1.8501440922190202, "grad_norm": 1.1640625, "learning_rate": 0.00046605789237265496, "loss": 4.8727, "mean_token_accuracy": 0.22260245233774184, "num_tokens": 44137399.0, "step": 19260 }, { "entropy": 5.148006868362427, "epoch": 1.850624399615754, "grad_norm": 1.25, "learning_rate": 0.00046603978302469, "loss": 4.8204, "mean_token_accuracy": 0.21915482729673386, "num_tokens": 44148354.0, "step": 19265 }, { "entropy": 5.109054517745972, "epoch": 1.851104707012488, "grad_norm": 1.28125, "learning_rate": 0.0004660216692413486, "loss": 4.8581, "mean_token_accuracy": 0.22766265720129014, "num_tokens": 44160496.0, "step": 19270 }, { "entropy": 5.156183767318725, "epoch": 1.8515850144092219, "grad_norm": 1.484375, "learning_rate": 0.0004660035510230513, "loss": 4.9271, "mean_token_accuracy": 0.2220014289021492, "num_tokens": 44171832.0, "step": 19275 }, { "entropy": 5.232931613922119, "epoch": 1.8520653218059557, "grad_norm": 1.2578125, "learning_rate": 0.0004659854283702186, "loss": 4.8992, "mean_token_accuracy": 0.21975383013486863, "num_tokens": 44182850.0, "step": 19280 }, { "entropy": 5.176450967788696, "epoch": 1.8525456292026896, "grad_norm": 1.265625, "learning_rate": 0.0004659673012832715, "loss": 4.9137, "mean_token_accuracy": 0.2231437310576439, "num_tokens": 44194343.0, "step": 19285 }, { "entropy": 5.230204057693482, "epoch": 1.8530259365994235, "grad_norm": 1.4453125, "learning_rate": 0.0004659491697626306, "loss": 5.011, "mean_token_accuracy": 0.21255818009376526, "num_tokens": 44206294.0, "step": 19290 }, { "entropy": 5.189047527313233, "epoch": 1.8535062439961574, "grad_norm": 1.03125, "learning_rate": 0.00046593103380871705, "loss": 4.8994, "mean_token_accuracy": 0.22048285007476806, "num_tokens": 44218576.0, "step": 19295 }, { "entropy": 5.099210739135742, "epoch": 1.8539865513928915, "grad_norm": 1.3125, "learning_rate": 0.00046591289342195184, "loss": 4.8999, "mean_token_accuracy": 0.21878990679979324, "num_tokens": 44230480.0, "step": 19300 }, { "entropy": 5.1187114238739015, "epoch": 1.8544668587896254, "grad_norm": 1.078125, "learning_rate": 0.0004658947486027562, "loss": 4.7986, "mean_token_accuracy": 0.2290068194270134, "num_tokens": 44240907.0, "step": 19305 }, { "entropy": 5.228351020812989, "epoch": 1.8549471661863592, "grad_norm": 1.453125, "learning_rate": 0.00046587659935155124, "loss": 4.9873, "mean_token_accuracy": 0.21489476263523102, "num_tokens": 44251469.0, "step": 19310 }, { "entropy": 5.287296009063721, "epoch": 1.8554274735830933, "grad_norm": 1.2265625, "learning_rate": 0.00046585844566875845, "loss": 4.9903, "mean_token_accuracy": 0.2172749251127243, "num_tokens": 44263875.0, "step": 19315 }, { "entropy": 5.1942919254302975, "epoch": 1.8559077809798272, "grad_norm": 1.2890625, "learning_rate": 0.0004658402875547993, "loss": 4.9816, "mean_token_accuracy": 0.2130007728934288, "num_tokens": 44276163.0, "step": 19320 }, { "entropy": 5.15506649017334, "epoch": 1.856388088376561, "grad_norm": 1.15625, "learning_rate": 0.00046582212501009533, "loss": 4.8997, "mean_token_accuracy": 0.21825883835554122, "num_tokens": 44287985.0, "step": 19325 }, { "entropy": 5.2520318031311035, "epoch": 1.856868395773295, "grad_norm": 1.2421875, "learning_rate": 0.00046580395803506825, "loss": 4.9908, "mean_token_accuracy": 0.21863823086023332, "num_tokens": 44299360.0, "step": 19330 }, { "entropy": 5.217044162750244, "epoch": 1.8573487031700289, "grad_norm": 1.2421875, "learning_rate": 0.0004657857866301399, "loss": 4.8603, "mean_token_accuracy": 0.231880284845829, "num_tokens": 44310459.0, "step": 19335 }, { "entropy": 5.279342746734619, "epoch": 1.8578290105667628, "grad_norm": 1.8828125, "learning_rate": 0.00046576761079573204, "loss": 5.0231, "mean_token_accuracy": 0.2130425050854683, "num_tokens": 44321832.0, "step": 19340 }, { "entropy": 5.147693634033203, "epoch": 1.8583093179634966, "grad_norm": 1.2890625, "learning_rate": 0.0004657494305322667, "loss": 4.9793, "mean_token_accuracy": 0.2211272895336151, "num_tokens": 44332829.0, "step": 19345 }, { "entropy": 5.161108541488647, "epoch": 1.8587896253602305, "grad_norm": 1.265625, "learning_rate": 0.0004657312458401661, "loss": 4.8809, "mean_token_accuracy": 0.2182894691824913, "num_tokens": 44344321.0, "step": 19350 }, { "entropy": 5.287922000885009, "epoch": 1.8592699327569644, "grad_norm": 1.2421875, "learning_rate": 0.0004657130567198522, "loss": 4.9975, "mean_token_accuracy": 0.20884452909231185, "num_tokens": 44357183.0, "step": 19355 }, { "entropy": 5.24537878036499, "epoch": 1.8597502401536983, "grad_norm": 1.2421875, "learning_rate": 0.00046569486317174746, "loss": 4.97, "mean_token_accuracy": 0.2225006863474846, "num_tokens": 44367118.0, "step": 19360 }, { "entropy": 5.225019359588623, "epoch": 1.8602305475504322, "grad_norm": 1.3359375, "learning_rate": 0.0004656766651962742, "loss": 4.9776, "mean_token_accuracy": 0.22012574672698976, "num_tokens": 44377699.0, "step": 19365 }, { "entropy": 5.138026428222656, "epoch": 1.860710854947166, "grad_norm": 1.328125, "learning_rate": 0.0004656584627938551, "loss": 4.9285, "mean_token_accuracy": 0.21567461490631104, "num_tokens": 44389457.0, "step": 19370 }, { "entropy": 5.127941083908081, "epoch": 1.8611911623439001, "grad_norm": 1.2265625, "learning_rate": 0.00046564025596491254, "loss": 4.8416, "mean_token_accuracy": 0.22531868368387223, "num_tokens": 44400140.0, "step": 19375 }, { "entropy": 5.152888202667237, "epoch": 1.861671469740634, "grad_norm": 1.2109375, "learning_rate": 0.0004656220447098693, "loss": 4.8384, "mean_token_accuracy": 0.2336040586233139, "num_tokens": 44411088.0, "step": 19380 }, { "entropy": 5.154614400863648, "epoch": 1.862151777137368, "grad_norm": 1.25, "learning_rate": 0.0004656038290291483, "loss": 4.8516, "mean_token_accuracy": 0.21973695307970048, "num_tokens": 44422515.0, "step": 19385 }, { "entropy": 5.126431226730347, "epoch": 1.8626320845341018, "grad_norm": 1.5546875, "learning_rate": 0.0004655856089231723, "loss": 4.8983, "mean_token_accuracy": 0.2270813450217247, "num_tokens": 44433711.0, "step": 19390 }, { "entropy": 5.23093090057373, "epoch": 1.8631123919308359, "grad_norm": 1.3125, "learning_rate": 0.0004655673843923644, "loss": 4.9393, "mean_token_accuracy": 0.2212497591972351, "num_tokens": 44446157.0, "step": 19395 }, { "entropy": 5.242627716064453, "epoch": 1.8635926993275698, "grad_norm": 1.328125, "learning_rate": 0.0004655491554371477, "loss": 5.0405, "mean_token_accuracy": 0.21081701517105103, "num_tokens": 44457506.0, "step": 19400 }, { "entropy": 5.200697708129883, "epoch": 1.8640730067243036, "grad_norm": 1.15625, "learning_rate": 0.00046553092205794543, "loss": 5.033, "mean_token_accuracy": 0.21556743383407592, "num_tokens": 44469572.0, "step": 19405 }, { "entropy": 5.1919454574584964, "epoch": 1.8645533141210375, "grad_norm": 1.46875, "learning_rate": 0.00046551268425518096, "loss": 4.9358, "mean_token_accuracy": 0.22127741873264312, "num_tokens": 44481932.0, "step": 19410 }, { "entropy": 5.158847665786743, "epoch": 1.8650336215177714, "grad_norm": 1.3046875, "learning_rate": 0.0004654944420292776, "loss": 4.8473, "mean_token_accuracy": 0.2273672789335251, "num_tokens": 44493081.0, "step": 19415 }, { "entropy": 5.1619750499725345, "epoch": 1.8655139289145053, "grad_norm": 1.2109375, "learning_rate": 0.000465476195380659, "loss": 4.9037, "mean_token_accuracy": 0.2256894126534462, "num_tokens": 44503919.0, "step": 19420 }, { "entropy": 5.163426876068115, "epoch": 1.8659942363112392, "grad_norm": 1.171875, "learning_rate": 0.0004654579443097487, "loss": 4.8935, "mean_token_accuracy": 0.22945554107427596, "num_tokens": 44515493.0, "step": 19425 }, { "entropy": 5.217961978912354, "epoch": 1.866474543707973, "grad_norm": 1.203125, "learning_rate": 0.0004654396888169705, "loss": 4.9388, "mean_token_accuracy": 0.21313630491495134, "num_tokens": 44527432.0, "step": 19430 }, { "entropy": 5.127003288269043, "epoch": 1.866954851104707, "grad_norm": 1.375, "learning_rate": 0.00046542142890274816, "loss": 4.8843, "mean_token_accuracy": 0.22532198578119278, "num_tokens": 44537634.0, "step": 19435 }, { "entropy": 5.097131729125977, "epoch": 1.8674351585014408, "grad_norm": 1.15625, "learning_rate": 0.0004654031645675057, "loss": 4.951, "mean_token_accuracy": 0.22149149626493453, "num_tokens": 44549856.0, "step": 19440 }, { "entropy": 5.095395898818969, "epoch": 1.8679154658981747, "grad_norm": 1.203125, "learning_rate": 0.0004653848958116672, "loss": 4.8544, "mean_token_accuracy": 0.22061508595943452, "num_tokens": 44563214.0, "step": 19445 }, { "entropy": 5.1080015182495115, "epoch": 1.8683957732949086, "grad_norm": 1.1875, "learning_rate": 0.00046536662263565667, "loss": 4.877, "mean_token_accuracy": 0.21556110978126525, "num_tokens": 44572982.0, "step": 19450 }, { "entropy": 5.159835386276245, "epoch": 1.8688760806916427, "grad_norm": 1.328125, "learning_rate": 0.0004653483450398985, "loss": 4.8434, "mean_token_accuracy": 0.22941485792398453, "num_tokens": 44583664.0, "step": 19455 }, { "entropy": 5.096084451675415, "epoch": 1.8693563880883766, "grad_norm": 1.2109375, "learning_rate": 0.00046533006302481694, "loss": 4.75, "mean_token_accuracy": 0.23159895092248917, "num_tokens": 44594987.0, "step": 19460 }, { "entropy": 5.144198322296143, "epoch": 1.8698366954851104, "grad_norm": 1.3359375, "learning_rate": 0.0004653117765908365, "loss": 4.9304, "mean_token_accuracy": 0.22142930179834366, "num_tokens": 44607123.0, "step": 19465 }, { "entropy": 5.126646852493286, "epoch": 1.8703170028818443, "grad_norm": 1.203125, "learning_rate": 0.0004652934857383816, "loss": 4.8467, "mean_token_accuracy": 0.22166724801063536, "num_tokens": 44618736.0, "step": 19470 }, { "entropy": 5.101489782333374, "epoch": 1.8707973102785784, "grad_norm": 1.1875, "learning_rate": 0.0004652751904678772, "loss": 4.8425, "mean_token_accuracy": 0.23013574928045272, "num_tokens": 44630681.0, "step": 19475 }, { "entropy": 5.146808242797851, "epoch": 1.8712776176753123, "grad_norm": 1.1875, "learning_rate": 0.00046525689077974775, "loss": 4.9051, "mean_token_accuracy": 0.2216115802526474, "num_tokens": 44641641.0, "step": 19480 }, { "entropy": 5.13797926902771, "epoch": 1.8717579250720462, "grad_norm": 1.2578125, "learning_rate": 0.00046523858667441834, "loss": 4.8518, "mean_token_accuracy": 0.223634971678257, "num_tokens": 44652489.0, "step": 19485 }, { "entropy": 5.1899360656738285, "epoch": 1.87223823246878, "grad_norm": 1.3984375, "learning_rate": 0.0004652202781523138, "loss": 4.941, "mean_token_accuracy": 0.21827635020017624, "num_tokens": 44665659.0, "step": 19490 }, { "entropy": 5.164613914489746, "epoch": 1.872718539865514, "grad_norm": 1.25, "learning_rate": 0.0004652019652138592, "loss": 4.8517, "mean_token_accuracy": 0.22854416221380233, "num_tokens": 44677300.0, "step": 19495 }, { "entropy": 5.118680858612061, "epoch": 1.8731988472622478, "grad_norm": 1.09375, "learning_rate": 0.0004651836478594798, "loss": 4.919, "mean_token_accuracy": 0.216032674908638, "num_tokens": 44690084.0, "step": 19500 }, { "entropy": 5.161847019195557, "epoch": 1.8736791546589817, "grad_norm": 1.171875, "learning_rate": 0.0004651653260896008, "loss": 4.8971, "mean_token_accuracy": 0.22766951471567154, "num_tokens": 44701208.0, "step": 19505 }, { "entropy": 5.2724034786224365, "epoch": 1.8741594620557156, "grad_norm": 1.265625, "learning_rate": 0.00046514699990464763, "loss": 5.0115, "mean_token_accuracy": 0.21198325604200363, "num_tokens": 44713546.0, "step": 19510 }, { "entropy": 5.362866449356079, "epoch": 1.8746397694524495, "grad_norm": 1.1328125, "learning_rate": 0.0004651286693050458, "loss": 5.1058, "mean_token_accuracy": 0.20230617225170136, "num_tokens": 44724802.0, "step": 19515 }, { "entropy": 5.175036287307739, "epoch": 1.8751200768491834, "grad_norm": 1.28125, "learning_rate": 0.0004651103342912207, "loss": 4.8891, "mean_token_accuracy": 0.22242380380630494, "num_tokens": 44736625.0, "step": 19520 }, { "entropy": 5.085053825378418, "epoch": 1.8756003842459172, "grad_norm": 1.1796875, "learning_rate": 0.00046509199486359824, "loss": 4.8602, "mean_token_accuracy": 0.21328776627779006, "num_tokens": 44750451.0, "step": 19525 }, { "entropy": 5.08069429397583, "epoch": 1.8760806916426513, "grad_norm": 1.109375, "learning_rate": 0.00046507365102260403, "loss": 4.8825, "mean_token_accuracy": 0.22342679798603057, "num_tokens": 44762043.0, "step": 19530 }, { "entropy": 5.201959800720215, "epoch": 1.8765609990393852, "grad_norm": 1.1640625, "learning_rate": 0.00046505530276866417, "loss": 4.9845, "mean_token_accuracy": 0.21430910676717757, "num_tokens": 44774329.0, "step": 19535 }, { "entropy": 5.232240343093872, "epoch": 1.877041306436119, "grad_norm": 1.125, "learning_rate": 0.00046503695010220443, "loss": 4.9481, "mean_token_accuracy": 0.2223116397857666, "num_tokens": 44786352.0, "step": 19540 }, { "entropy": 5.121707820892334, "epoch": 1.877521613832853, "grad_norm": 1.1484375, "learning_rate": 0.000465018593023651, "loss": 4.7645, "mean_token_accuracy": 0.22961812168359758, "num_tokens": 44796829.0, "step": 19545 }, { "entropy": 5.183013200759888, "epoch": 1.878001921229587, "grad_norm": 1.125, "learning_rate": 0.0004650002315334302, "loss": 4.9705, "mean_token_accuracy": 0.21601766496896743, "num_tokens": 44808900.0, "step": 19550 }, { "entropy": 5.1030841827392575, "epoch": 1.878482228626321, "grad_norm": 1.1875, "learning_rate": 0.000464981865631968, "loss": 4.8785, "mean_token_accuracy": 0.22552091330289842, "num_tokens": 44820638.0, "step": 19555 }, { "entropy": 5.112918424606323, "epoch": 1.8789625360230549, "grad_norm": 1.1171875, "learning_rate": 0.0004649634953196912, "loss": 4.8595, "mean_token_accuracy": 0.22413474321365356, "num_tokens": 44833370.0, "step": 19560 }, { "entropy": 5.111734342575073, "epoch": 1.8794428434197887, "grad_norm": 1.1796875, "learning_rate": 0.00046494512059702605, "loss": 4.9102, "mean_token_accuracy": 0.22396851181983948, "num_tokens": 44844291.0, "step": 19565 }, { "entropy": 5.137105417251587, "epoch": 1.8799231508165226, "grad_norm": 1.3046875, "learning_rate": 0.0004649267414643992, "loss": 4.8452, "mean_token_accuracy": 0.22659626305103303, "num_tokens": 44855303.0, "step": 19570 }, { "entropy": 5.119134569168091, "epoch": 1.8804034582132565, "grad_norm": 1.1015625, "learning_rate": 0.0004649083579222374, "loss": 4.8031, "mean_token_accuracy": 0.2253907725214958, "num_tokens": 44866593.0, "step": 19575 }, { "entropy": 5.115862894058227, "epoch": 1.8808837656099904, "grad_norm": 1.2421875, "learning_rate": 0.00046488996997096744, "loss": 4.9113, "mean_token_accuracy": 0.22246016263961793, "num_tokens": 44879124.0, "step": 19580 }, { "entropy": 5.138719701766968, "epoch": 1.8813640730067243, "grad_norm": 1.1796875, "learning_rate": 0.0004648715776110162, "loss": 4.8542, "mean_token_accuracy": 0.23651068955659865, "num_tokens": 44889942.0, "step": 19585 }, { "entropy": 5.114886236190796, "epoch": 1.8818443804034581, "grad_norm": 1.2109375, "learning_rate": 0.0004648531808428108, "loss": 4.8155, "mean_token_accuracy": 0.23198310881853104, "num_tokens": 44901002.0, "step": 19590 }, { "entropy": 5.169087839126587, "epoch": 1.882324687800192, "grad_norm": 1.15625, "learning_rate": 0.0004648347796667782, "loss": 4.9631, "mean_token_accuracy": 0.2160460963845253, "num_tokens": 44913495.0, "step": 19595 }, { "entropy": 5.137406253814698, "epoch": 1.882804995196926, "grad_norm": 1.0859375, "learning_rate": 0.0004648163740833458, "loss": 4.8565, "mean_token_accuracy": 0.21922577768564225, "num_tokens": 44924624.0, "step": 19600 }, { "entropy": 5.1774355411529545, "epoch": 1.8832853025936598, "grad_norm": 1.234375, "learning_rate": 0.00046479796409294076, "loss": 4.9293, "mean_token_accuracy": 0.22170649766921996, "num_tokens": 44935539.0, "step": 19605 }, { "entropy": 5.150081396102905, "epoch": 1.8837656099903939, "grad_norm": 1.1328125, "learning_rate": 0.0004647795496959907, "loss": 4.8425, "mean_token_accuracy": 0.22837162464857103, "num_tokens": 44945979.0, "step": 19610 }, { "entropy": 5.108224296569825, "epoch": 1.8842459173871278, "grad_norm": 1.1484375, "learning_rate": 0.00046476113089292286, "loss": 4.9229, "mean_token_accuracy": 0.21530138850212097, "num_tokens": 44958294.0, "step": 19615 }, { "entropy": 5.206214857101441, "epoch": 1.8847262247838616, "grad_norm": 1.09375, "learning_rate": 0.0004647427076841651, "loss": 5.0064, "mean_token_accuracy": 0.21917444765567778, "num_tokens": 44970261.0, "step": 19620 }, { "entropy": 5.231733274459839, "epoch": 1.8852065321805955, "grad_norm": 1.1796875, "learning_rate": 0.00046472428007014515, "loss": 4.9209, "mean_token_accuracy": 0.2197403684258461, "num_tokens": 44981515.0, "step": 19625 }, { "entropy": 5.0884459018707275, "epoch": 1.8856868395773296, "grad_norm": 1.234375, "learning_rate": 0.0004647058480512907, "loss": 4.8162, "mean_token_accuracy": 0.23149994909763336, "num_tokens": 44992473.0, "step": 19630 }, { "entropy": 5.197129726409912, "epoch": 1.8861671469740635, "grad_norm": 1.1796875, "learning_rate": 0.00046468741162802987, "loss": 4.9509, "mean_token_accuracy": 0.2180525004863739, "num_tokens": 45004264.0, "step": 19635 }, { "entropy": 5.163276147842407, "epoch": 1.8866474543707974, "grad_norm": 1.09375, "learning_rate": 0.0004646689708007905, "loss": 4.8297, "mean_token_accuracy": 0.22839758545160294, "num_tokens": 45015601.0, "step": 19640 }, { "entropy": 5.204332256317139, "epoch": 1.8871277617675313, "grad_norm": 1.1484375, "learning_rate": 0.00046465052557000087, "loss": 5.0055, "mean_token_accuracy": 0.22162444591522218, "num_tokens": 45028044.0, "step": 19645 }, { "entropy": 5.240250301361084, "epoch": 1.8876080691642652, "grad_norm": 1.5390625, "learning_rate": 0.00046463207593608916, "loss": 4.9686, "mean_token_accuracy": 0.2230614274740219, "num_tokens": 45040144.0, "step": 19650 }, { "entropy": 5.266805171966553, "epoch": 1.888088376560999, "grad_norm": 1.1484375, "learning_rate": 0.0004646136218994837, "loss": 5.0224, "mean_token_accuracy": 0.21359916925430297, "num_tokens": 45051844.0, "step": 19655 }, { "entropy": 5.174513053894043, "epoch": 1.888568683957733, "grad_norm": 1.2578125, "learning_rate": 0.00046459516346061304, "loss": 4.9785, "mean_token_accuracy": 0.21726988703012468, "num_tokens": 45062684.0, "step": 19660 }, { "entropy": 5.155341243743896, "epoch": 1.8890489913544668, "grad_norm": 1.1953125, "learning_rate": 0.00046457670061990564, "loss": 4.9112, "mean_token_accuracy": 0.2235699400305748, "num_tokens": 45073327.0, "step": 19665 }, { "entropy": 5.172566366195679, "epoch": 1.8895292987512007, "grad_norm": 1.203125, "learning_rate": 0.00046455823337779024, "loss": 4.928, "mean_token_accuracy": 0.2199169397354126, "num_tokens": 45085309.0, "step": 19670 }, { "entropy": 5.26616792678833, "epoch": 1.8900096061479346, "grad_norm": 1.0625, "learning_rate": 0.0004645397617346954, "loss": 4.981, "mean_token_accuracy": 0.2198196455836296, "num_tokens": 45095934.0, "step": 19675 }, { "entropy": 5.195982265472412, "epoch": 1.8904899135446684, "grad_norm": 1.2421875, "learning_rate": 0.0004645212856910502, "loss": 4.8771, "mean_token_accuracy": 0.22742779403924943, "num_tokens": 45106220.0, "step": 19680 }, { "entropy": 5.119170188903809, "epoch": 1.8909702209414025, "grad_norm": 1.1640625, "learning_rate": 0.0004645028052472835, "loss": 4.8547, "mean_token_accuracy": 0.2206563949584961, "num_tokens": 45117861.0, "step": 19685 }, { "entropy": 5.113675260543824, "epoch": 1.8914505283381364, "grad_norm": 1.15625, "learning_rate": 0.00046448432040382444, "loss": 4.8622, "mean_token_accuracy": 0.23194930851459503, "num_tokens": 45129248.0, "step": 19690 }, { "entropy": 5.2112102031707765, "epoch": 1.8919308357348703, "grad_norm": 1.3515625, "learning_rate": 0.000464465831161102, "loss": 4.8966, "mean_token_accuracy": 0.21587489694356918, "num_tokens": 45140610.0, "step": 19695 }, { "entropy": 5.147746658325195, "epoch": 1.8924111431316042, "grad_norm": 1.1328125, "learning_rate": 0.0004644473375195456, "loss": 4.8274, "mean_token_accuracy": 0.22959212362766265, "num_tokens": 45150697.0, "step": 19700 }, { "entropy": 5.177985334396363, "epoch": 1.8928914505283383, "grad_norm": 1.171875, "learning_rate": 0.00046442883947958466, "loss": 4.9042, "mean_token_accuracy": 0.22117964476346968, "num_tokens": 45162627.0, "step": 19705 }, { "entropy": 5.038029146194458, "epoch": 1.8933717579250722, "grad_norm": 1.234375, "learning_rate": 0.00046441033704164845, "loss": 4.6972, "mean_token_accuracy": 0.2338120698928833, "num_tokens": 45173030.0, "step": 19710 }, { "entropy": 5.172407436370849, "epoch": 1.893852065321806, "grad_norm": 1.1328125, "learning_rate": 0.0004643918302061666, "loss": 4.9106, "mean_token_accuracy": 0.22116362750530244, "num_tokens": 45183588.0, "step": 19715 }, { "entropy": 5.237579727172852, "epoch": 1.89433237271854, "grad_norm": 1.453125, "learning_rate": 0.0004643733189735689, "loss": 4.9034, "mean_token_accuracy": 0.2231490433216095, "num_tokens": 45195289.0, "step": 19720 }, { "entropy": 5.165693855285644, "epoch": 1.8948126801152738, "grad_norm": 1.15625, "learning_rate": 0.000464354803344285, "loss": 4.8877, "mean_token_accuracy": 0.22597122192382812, "num_tokens": 45207001.0, "step": 19725 }, { "entropy": 5.0735241889953615, "epoch": 1.8952929875120077, "grad_norm": 1.2578125, "learning_rate": 0.00046433628331874496, "loss": 4.9137, "mean_token_accuracy": 0.22631023377180098, "num_tokens": 45217447.0, "step": 19730 }, { "entropy": 5.171666955947876, "epoch": 1.8957732949087416, "grad_norm": 1.2578125, "learning_rate": 0.0004643177588973785, "loss": 4.8659, "mean_token_accuracy": 0.22453079670667647, "num_tokens": 45227588.0, "step": 19735 }, { "entropy": 5.181704807281494, "epoch": 1.8962536023054755, "grad_norm": 1.1953125, "learning_rate": 0.0004642992300806159, "loss": 4.8899, "mean_token_accuracy": 0.2267356261610985, "num_tokens": 45239969.0, "step": 19740 }, { "entropy": 5.118777227401734, "epoch": 1.8967339097022093, "grad_norm": 1.25, "learning_rate": 0.0004642806968688873, "loss": 4.8654, "mean_token_accuracy": 0.21952651739120482, "num_tokens": 45250877.0, "step": 19745 }, { "entropy": 5.194926071166992, "epoch": 1.8972142170989432, "grad_norm": 1.3203125, "learning_rate": 0.00046426215926262295, "loss": 4.9448, "mean_token_accuracy": 0.22451459765434265, "num_tokens": 45262846.0, "step": 19750 }, { "entropy": 5.164976167678833, "epoch": 1.897694524495677, "grad_norm": 1.1796875, "learning_rate": 0.0004642436172622532, "loss": 4.8732, "mean_token_accuracy": 0.22321307361125947, "num_tokens": 45274198.0, "step": 19755 }, { "entropy": 5.107458639144897, "epoch": 1.898174831892411, "grad_norm": 1.2578125, "learning_rate": 0.0004642250708682086, "loss": 4.876, "mean_token_accuracy": 0.22586560100317002, "num_tokens": 45285803.0, "step": 19760 }, { "entropy": 5.195139932632446, "epoch": 1.898655139289145, "grad_norm": 1.234375, "learning_rate": 0.00046420652008091984, "loss": 4.9029, "mean_token_accuracy": 0.22238381803035737, "num_tokens": 45298242.0, "step": 19765 }, { "entropy": 5.137690496444702, "epoch": 1.899135446685879, "grad_norm": 1.203125, "learning_rate": 0.0004641879649008174, "loss": 4.8659, "mean_token_accuracy": 0.22768788039684296, "num_tokens": 45309567.0, "step": 19770 }, { "entropy": 5.216307497024536, "epoch": 1.8996157540826129, "grad_norm": 1.2109375, "learning_rate": 0.0004641694053283323, "loss": 4.9775, "mean_token_accuracy": 0.22154446691274643, "num_tokens": 45321693.0, "step": 19775 }, { "entropy": 5.222153615951538, "epoch": 1.9000960614793467, "grad_norm": 1.109375, "learning_rate": 0.00046415084136389525, "loss": 4.9213, "mean_token_accuracy": 0.2227088153362274, "num_tokens": 45333094.0, "step": 19780 }, { "entropy": 5.142860698699951, "epoch": 1.9005763688760808, "grad_norm": 1.28125, "learning_rate": 0.0004641322730079374, "loss": 4.8623, "mean_token_accuracy": 0.22830192744731903, "num_tokens": 45344549.0, "step": 19785 }, { "entropy": 5.189712476730347, "epoch": 1.9010566762728147, "grad_norm": 1.21875, "learning_rate": 0.0004641137002608897, "loss": 4.9291, "mean_token_accuracy": 0.215300115942955, "num_tokens": 45356433.0, "step": 19790 }, { "entropy": 5.16183123588562, "epoch": 1.9015369836695486, "grad_norm": 1.234375, "learning_rate": 0.00046409512312318345, "loss": 4.875, "mean_token_accuracy": 0.22419418394565582, "num_tokens": 45368224.0, "step": 19795 }, { "entropy": 5.074509048461914, "epoch": 1.9020172910662825, "grad_norm": 1.15625, "learning_rate": 0.00046407654159524994, "loss": 4.814, "mean_token_accuracy": 0.22955633252859114, "num_tokens": 45379673.0, "step": 19800 }, { "entropy": 5.221636247634888, "epoch": 1.9024975984630164, "grad_norm": 1.140625, "learning_rate": 0.00046405795567752055, "loss": 4.9505, "mean_token_accuracy": 0.22039366215467454, "num_tokens": 45391613.0, "step": 19805 }, { "entropy": 5.164040994644165, "epoch": 1.9029779058597502, "grad_norm": 1.125, "learning_rate": 0.00046403936537042686, "loss": 4.9312, "mean_token_accuracy": 0.21759382635354996, "num_tokens": 45403693.0, "step": 19810 }, { "entropy": 5.153062391281128, "epoch": 1.9034582132564841, "grad_norm": 1.234375, "learning_rate": 0.00046402077067440043, "loss": 4.9291, "mean_token_accuracy": 0.22086333185434343, "num_tokens": 45416151.0, "step": 19815 }, { "entropy": 5.180631017684936, "epoch": 1.903938520653218, "grad_norm": 1.171875, "learning_rate": 0.00046400217158987293, "loss": 4.9654, "mean_token_accuracy": 0.21587077677249908, "num_tokens": 45428276.0, "step": 19820 }, { "entropy": 5.128487300872803, "epoch": 1.9044188280499519, "grad_norm": 1.296875, "learning_rate": 0.00046398356811727626, "loss": 4.8451, "mean_token_accuracy": 0.22816976755857468, "num_tokens": 45439877.0, "step": 19825 }, { "entropy": 5.178048086166382, "epoch": 1.9048991354466858, "grad_norm": 1.4375, "learning_rate": 0.0004639649602570423, "loss": 4.9352, "mean_token_accuracy": 0.23003823608160018, "num_tokens": 45451459.0, "step": 19830 }, { "entropy": 5.043431615829467, "epoch": 1.9053794428434196, "grad_norm": 1.28125, "learning_rate": 0.00046394634800960314, "loss": 4.7977, "mean_token_accuracy": 0.22761821001768112, "num_tokens": 45463341.0, "step": 19835 }, { "entropy": 5.19926815032959, "epoch": 1.9058597502401537, "grad_norm": 1.1484375, "learning_rate": 0.00046392773137539074, "loss": 4.8938, "mean_token_accuracy": 0.22187106013298036, "num_tokens": 45475585.0, "step": 19840 }, { "entropy": 5.246723794937134, "epoch": 1.9063400576368876, "grad_norm": 1.1015625, "learning_rate": 0.00046390911035483744, "loss": 4.8847, "mean_token_accuracy": 0.22155932635068892, "num_tokens": 45487178.0, "step": 19845 }, { "entropy": 5.1914482593536375, "epoch": 1.9068203650336215, "grad_norm": 1.1875, "learning_rate": 0.0004638904849483756, "loss": 4.9501, "mean_token_accuracy": 0.2210107535123825, "num_tokens": 45498832.0, "step": 19850 }, { "entropy": 5.174285650253296, "epoch": 1.9073006724303554, "grad_norm": 1.171875, "learning_rate": 0.00046387185515643756, "loss": 4.9425, "mean_token_accuracy": 0.2178223967552185, "num_tokens": 45511549.0, "step": 19855 }, { "entropy": 5.192152786254883, "epoch": 1.9077809798270895, "grad_norm": 1.2109375, "learning_rate": 0.0004638532209794559, "loss": 4.9193, "mean_token_accuracy": 0.22249827682971954, "num_tokens": 45523421.0, "step": 19860 }, { "entropy": 5.224132394790649, "epoch": 1.9082612872238234, "grad_norm": 1.1171875, "learning_rate": 0.0004638345824178631, "loss": 4.9337, "mean_token_accuracy": 0.2218421757221222, "num_tokens": 45535247.0, "step": 19865 }, { "entropy": 5.100824499130249, "epoch": 1.9087415946205573, "grad_norm": 1.1640625, "learning_rate": 0.00046381593947209215, "loss": 4.7771, "mean_token_accuracy": 0.23252510279417038, "num_tokens": 45546296.0, "step": 19870 }, { "entropy": 5.161165046691894, "epoch": 1.9092219020172911, "grad_norm": 1.1328125, "learning_rate": 0.0004637972921425757, "loss": 4.9398, "mean_token_accuracy": 0.22113918364048005, "num_tokens": 45557360.0, "step": 19875 }, { "entropy": 5.156943368911743, "epoch": 1.909702209414025, "grad_norm": 1.171875, "learning_rate": 0.00046377864042974675, "loss": 4.9039, "mean_token_accuracy": 0.22512820065021516, "num_tokens": 45568500.0, "step": 19880 }, { "entropy": 5.203023481369018, "epoch": 1.910182516810759, "grad_norm": 1.3984375, "learning_rate": 0.0004637599843340384, "loss": 4.9682, "mean_token_accuracy": 0.2143253818154335, "num_tokens": 45579513.0, "step": 19885 }, { "entropy": 5.187750816345215, "epoch": 1.9106628242074928, "grad_norm": 1.3203125, "learning_rate": 0.00046374132385588356, "loss": 4.879, "mean_token_accuracy": 0.22665640264749526, "num_tokens": 45591144.0, "step": 19890 }, { "entropy": 5.166464710235596, "epoch": 1.9111431316042267, "grad_norm": 1.2890625, "learning_rate": 0.00046372265899571576, "loss": 4.9192, "mean_token_accuracy": 0.22032105475664138, "num_tokens": 45602487.0, "step": 19895 }, { "entropy": 5.213956642150879, "epoch": 1.9116234390009605, "grad_norm": 1.2265625, "learning_rate": 0.00046370398975396817, "loss": 4.9565, "mean_token_accuracy": 0.2110735148191452, "num_tokens": 45614448.0, "step": 19900 }, { "entropy": 5.190959644317627, "epoch": 1.9121037463976944, "grad_norm": 1.3671875, "learning_rate": 0.0004636853161310743, "loss": 4.9115, "mean_token_accuracy": 0.21847577691078185, "num_tokens": 45625660.0, "step": 19905 }, { "entropy": 5.06601505279541, "epoch": 1.9125840537944283, "grad_norm": 1.3125, "learning_rate": 0.00046366663812746764, "loss": 4.7971, "mean_token_accuracy": 0.23348991572856903, "num_tokens": 45638602.0, "step": 19910 }, { "entropy": 5.091584253311157, "epoch": 1.9130643611911622, "grad_norm": 1.21875, "learning_rate": 0.0004636479557435818, "loss": 4.7884, "mean_token_accuracy": 0.2402698814868927, "num_tokens": 45649460.0, "step": 19915 }, { "entropy": 5.059809160232544, "epoch": 1.9135446685878963, "grad_norm": 1.1640625, "learning_rate": 0.00046362926897985067, "loss": 4.7924, "mean_token_accuracy": 0.23128978610038758, "num_tokens": 45659995.0, "step": 19920 }, { "entropy": 5.077728748321533, "epoch": 1.9140249759846302, "grad_norm": 1.1484375, "learning_rate": 0.000463610577836708, "loss": 4.8649, "mean_token_accuracy": 0.2222018852829933, "num_tokens": 45672262.0, "step": 19925 }, { "entropy": 5.184646224975586, "epoch": 1.914505283381364, "grad_norm": 1.2734375, "learning_rate": 0.00046359188231458783, "loss": 4.933, "mean_token_accuracy": 0.2203192874789238, "num_tokens": 45685210.0, "step": 19930 }, { "entropy": 5.165609121322632, "epoch": 1.914985590778098, "grad_norm": 1.1640625, "learning_rate": 0.00046357318241392414, "loss": 4.9574, "mean_token_accuracy": 0.21842219084501266, "num_tokens": 45696418.0, "step": 19935 }, { "entropy": 5.18068208694458, "epoch": 1.915465898174832, "grad_norm": 1.1953125, "learning_rate": 0.000463554478135151, "loss": 4.9414, "mean_token_accuracy": 0.21887536495923995, "num_tokens": 45708393.0, "step": 19940 }, { "entropy": 5.234921455383301, "epoch": 1.915946205571566, "grad_norm": 1.2578125, "learning_rate": 0.0004635357694787029, "loss": 4.9527, "mean_token_accuracy": 0.2121457889676094, "num_tokens": 45721287.0, "step": 19945 }, { "entropy": 5.099327230453492, "epoch": 1.9164265129682998, "grad_norm": 1.140625, "learning_rate": 0.000463517056445014, "loss": 4.7687, "mean_token_accuracy": 0.23359038829803466, "num_tokens": 45732846.0, "step": 19950 }, { "entropy": 5.162627077102661, "epoch": 1.9169068203650337, "grad_norm": 1.15625, "learning_rate": 0.00046349833903451884, "loss": 4.9632, "mean_token_accuracy": 0.21162394881248475, "num_tokens": 45743931.0, "step": 19955 }, { "entropy": 5.217878150939941, "epoch": 1.9173871277617676, "grad_norm": 1.1640625, "learning_rate": 0.00046347961724765196, "loss": 4.8885, "mean_token_accuracy": 0.22479778081178664, "num_tokens": 45755512.0, "step": 19960 }, { "entropy": 5.19102373123169, "epoch": 1.9178674351585014, "grad_norm": 1.2265625, "learning_rate": 0.00046346089108484806, "loss": 4.857, "mean_token_accuracy": 0.22538246363401412, "num_tokens": 45766793.0, "step": 19965 }, { "entropy": 5.146558666229248, "epoch": 1.9183477425552353, "grad_norm": 1.1875, "learning_rate": 0.00046344216054654193, "loss": 4.9382, "mean_token_accuracy": 0.2223276048898697, "num_tokens": 45778214.0, "step": 19970 }, { "entropy": 5.272023105621338, "epoch": 1.9188280499519692, "grad_norm": 1.2109375, "learning_rate": 0.00046342342563316833, "loss": 5.1017, "mean_token_accuracy": 0.20328507870435714, "num_tokens": 45791255.0, "step": 19975 }, { "entropy": 5.253413915634155, "epoch": 1.919308357348703, "grad_norm": 1.1484375, "learning_rate": 0.00046340468634516223, "loss": 4.9458, "mean_token_accuracy": 0.21117972880601882, "num_tokens": 45803351.0, "step": 19980 }, { "entropy": 5.097596788406372, "epoch": 1.919788664745437, "grad_norm": 1.1171875, "learning_rate": 0.00046338594268295884, "loss": 4.8919, "mean_token_accuracy": 0.23192906975746155, "num_tokens": 45814986.0, "step": 19985 }, { "entropy": 5.0635106563568115, "epoch": 1.9202689721421708, "grad_norm": 1.1484375, "learning_rate": 0.0004633671946469933, "loss": 4.8551, "mean_token_accuracy": 0.22398319989442825, "num_tokens": 45827073.0, "step": 19990 }, { "entropy": 5.206949281692505, "epoch": 1.920749279538905, "grad_norm": 1.03125, "learning_rate": 0.00046334844223770076, "loss": 4.917, "mean_token_accuracy": 0.231214801967144, "num_tokens": 45837808.0, "step": 19995 }, { "entropy": 5.1531201839447025, "epoch": 1.9212295869356388, "grad_norm": 1.1171875, "learning_rate": 0.00046332968545551674, "loss": 4.9168, "mean_token_accuracy": 0.22213385105133057, "num_tokens": 45849932.0, "step": 20000 }, { "entropy": 5.133365106582642, "epoch": 1.9217098943323727, "grad_norm": 1.09375, "learning_rate": 0.0004633109243008765, "loss": 4.8827, "mean_token_accuracy": 0.22360755801200866, "num_tokens": 45860961.0, "step": 20005 }, { "entropy": 5.0825090408325195, "epoch": 1.9221902017291066, "grad_norm": 1.171875, "learning_rate": 0.0004632921587742159, "loss": 4.8216, "mean_token_accuracy": 0.22799782902002336, "num_tokens": 45872251.0, "step": 20010 }, { "entropy": 5.222860527038574, "epoch": 1.9226705091258407, "grad_norm": 1.171875, "learning_rate": 0.00046327338887597043, "loss": 4.9614, "mean_token_accuracy": 0.21876602619886398, "num_tokens": 45883234.0, "step": 20015 }, { "entropy": 5.1399219036102295, "epoch": 1.9231508165225746, "grad_norm": 1.140625, "learning_rate": 0.000463254614606576, "loss": 4.8616, "mean_token_accuracy": 0.22610373198986053, "num_tokens": 45894003.0, "step": 20020 }, { "entropy": 5.038976621627808, "epoch": 1.9236311239193085, "grad_norm": 1.1328125, "learning_rate": 0.0004632358359664683, "loss": 4.8077, "mean_token_accuracy": 0.22781697660684586, "num_tokens": 45906109.0, "step": 20025 }, { "entropy": 5.154880475997925, "epoch": 1.9241114313160423, "grad_norm": 1.21875, "learning_rate": 0.00046321705295608356, "loss": 4.9599, "mean_token_accuracy": 0.21925552040338517, "num_tokens": 45918372.0, "step": 20030 }, { "entropy": 5.219087791442871, "epoch": 1.9245917387127762, "grad_norm": 1.2734375, "learning_rate": 0.00046319826557585764, "loss": 4.9844, "mean_token_accuracy": 0.21151201874017717, "num_tokens": 45928750.0, "step": 20035 }, { "entropy": 5.254603242874145, "epoch": 1.92507204610951, "grad_norm": 1.203125, "learning_rate": 0.0004631794738262269, "loss": 4.9662, "mean_token_accuracy": 0.2226362034678459, "num_tokens": 45939297.0, "step": 20040 }, { "entropy": 5.169375038146972, "epoch": 1.925552353506244, "grad_norm": 1.296875, "learning_rate": 0.0004631606777076275, "loss": 4.8812, "mean_token_accuracy": 0.22293281108140944, "num_tokens": 45949986.0, "step": 20045 }, { "entropy": 5.1302508354187015, "epoch": 1.9260326609029779, "grad_norm": 1.2109375, "learning_rate": 0.00046314187722049587, "loss": 4.9219, "mean_token_accuracy": 0.21803017556667328, "num_tokens": 45960747.0, "step": 20050 }, { "entropy": 5.203114032745361, "epoch": 1.9265129682997117, "grad_norm": 1.1171875, "learning_rate": 0.00046312307236526863, "loss": 4.9201, "mean_token_accuracy": 0.22686970084905625, "num_tokens": 45972260.0, "step": 20055 }, { "entropy": 5.126967477798462, "epoch": 1.9269932756964456, "grad_norm": 1.1796875, "learning_rate": 0.00046310426314238217, "loss": 4.8274, "mean_token_accuracy": 0.22837868332862854, "num_tokens": 45983589.0, "step": 20060 }, { "entropy": 5.058112907409668, "epoch": 1.9274735830931795, "grad_norm": 1.359375, "learning_rate": 0.0004630854495522733, "loss": 4.8587, "mean_token_accuracy": 0.22899621576070786, "num_tokens": 45994776.0, "step": 20065 }, { "entropy": 5.203479146957397, "epoch": 1.9279538904899134, "grad_norm": 1.140625, "learning_rate": 0.00046306663159537874, "loss": 4.9497, "mean_token_accuracy": 0.2172028511762619, "num_tokens": 46005530.0, "step": 20070 }, { "entropy": 5.165763235092163, "epoch": 1.9284341978866475, "grad_norm": 1.171875, "learning_rate": 0.00046304780927213554, "loss": 4.8475, "mean_token_accuracy": 0.23035346120595931, "num_tokens": 46017538.0, "step": 20075 }, { "entropy": 5.1419881820678714, "epoch": 1.9289145052833814, "grad_norm": 1.1015625, "learning_rate": 0.00046302898258298046, "loss": 4.845, "mean_token_accuracy": 0.2168242171406746, "num_tokens": 46029011.0, "step": 20080 }, { "entropy": 5.077986097335815, "epoch": 1.9293948126801153, "grad_norm": 1.109375, "learning_rate": 0.0004630101515283509, "loss": 4.8498, "mean_token_accuracy": 0.22713633477687836, "num_tokens": 46040354.0, "step": 20085 }, { "entropy": 5.12605299949646, "epoch": 1.9298751200768491, "grad_norm": 1.1796875, "learning_rate": 0.00046299131610868377, "loss": 4.8651, "mean_token_accuracy": 0.22488067746162416, "num_tokens": 46051127.0, "step": 20090 }, { "entropy": 5.192133140563965, "epoch": 1.9303554274735832, "grad_norm": 1.265625, "learning_rate": 0.0004629724763244165, "loss": 4.8896, "mean_token_accuracy": 0.22987643331289292, "num_tokens": 46062343.0, "step": 20095 }, { "entropy": 5.099241828918457, "epoch": 1.9308357348703171, "grad_norm": 1.1640625, "learning_rate": 0.0004629536321759866, "loss": 4.8436, "mean_token_accuracy": 0.22163994908332824, "num_tokens": 46074193.0, "step": 20100 }, { "entropy": 5.14430193901062, "epoch": 1.931316042267051, "grad_norm": 1.21875, "learning_rate": 0.00046293478366383133, "loss": 4.8598, "mean_token_accuracy": 0.22932973504066467, "num_tokens": 46085930.0, "step": 20105 }, { "entropy": 5.226075792312622, "epoch": 1.9317963496637849, "grad_norm": 1.0546875, "learning_rate": 0.0004629159307883885, "loss": 4.9729, "mean_token_accuracy": 0.21434650868177413, "num_tokens": 46099759.0, "step": 20110 }, { "entropy": 5.315010738372803, "epoch": 1.9322766570605188, "grad_norm": 1.1328125, "learning_rate": 0.0004628970735500958, "loss": 4.9977, "mean_token_accuracy": 0.20691378712654113, "num_tokens": 46110907.0, "step": 20115 }, { "entropy": 5.166486072540283, "epoch": 1.9327569644572526, "grad_norm": 1.2421875, "learning_rate": 0.00046287821194939094, "loss": 4.9242, "mean_token_accuracy": 0.22367848455905914, "num_tokens": 46122167.0, "step": 20120 }, { "entropy": 5.162136554718018, "epoch": 1.9332372718539865, "grad_norm": 1.1796875, "learning_rate": 0.00046285934598671186, "loss": 4.9868, "mean_token_accuracy": 0.21337527185678482, "num_tokens": 46132948.0, "step": 20125 }, { "entropy": 5.2009600639343265, "epoch": 1.9337175792507204, "grad_norm": 1.0625, "learning_rate": 0.00046284047566249665, "loss": 4.9534, "mean_token_accuracy": 0.21956295073032378, "num_tokens": 46144275.0, "step": 20130 }, { "entropy": 5.173071098327637, "epoch": 1.9341978866474543, "grad_norm": 1.15625, "learning_rate": 0.00046282160097718336, "loss": 4.9259, "mean_token_accuracy": 0.22100828140974044, "num_tokens": 46156138.0, "step": 20135 }, { "entropy": 5.237935400009155, "epoch": 1.9346781940441882, "grad_norm": 1.2109375, "learning_rate": 0.0004628027219312102, "loss": 4.9515, "mean_token_accuracy": 0.2174043759703636, "num_tokens": 46169437.0, "step": 20140 }, { "entropy": 5.155817985534668, "epoch": 1.935158501440922, "grad_norm": 1.109375, "learning_rate": 0.0004627838385250155, "loss": 4.8613, "mean_token_accuracy": 0.22143658697605134, "num_tokens": 46182150.0, "step": 20145 }, { "entropy": 5.1375514507293705, "epoch": 1.9356388088376562, "grad_norm": 1.171875, "learning_rate": 0.00046276495075903764, "loss": 4.8953, "mean_token_accuracy": 0.227192685008049, "num_tokens": 46193945.0, "step": 20150 }, { "entropy": 5.250063371658325, "epoch": 1.93611911623439, "grad_norm": 1.1328125, "learning_rate": 0.00046274605863371517, "loss": 4.9954, "mean_token_accuracy": 0.21560989171266556, "num_tokens": 46206130.0, "step": 20155 }, { "entropy": 5.096949529647827, "epoch": 1.936599423631124, "grad_norm": 1.2734375, "learning_rate": 0.0004627271621494868, "loss": 4.8381, "mean_token_accuracy": 0.2306928291916847, "num_tokens": 46217284.0, "step": 20160 }, { "entropy": 5.12322769165039, "epoch": 1.9370797310278578, "grad_norm": 1.359375, "learning_rate": 0.000462708261306791, "loss": 4.877, "mean_token_accuracy": 0.2295483872294426, "num_tokens": 46227949.0, "step": 20165 }, { "entropy": 5.13996729850769, "epoch": 1.937560038424592, "grad_norm": 1.1484375, "learning_rate": 0.0004626893561060669, "loss": 4.8539, "mean_token_accuracy": 0.21847130507230758, "num_tokens": 46239101.0, "step": 20170 }, { "entropy": 5.225299978256226, "epoch": 1.9380403458213258, "grad_norm": 1.203125, "learning_rate": 0.00046267044654775324, "loss": 5.0556, "mean_token_accuracy": 0.2185836911201477, "num_tokens": 46251272.0, "step": 20175 }, { "entropy": 5.208836698532105, "epoch": 1.9385206532180597, "grad_norm": 1.265625, "learning_rate": 0.000462651532632289, "loss": 4.9816, "mean_token_accuracy": 0.2192826822400093, "num_tokens": 46262603.0, "step": 20180 }, { "entropy": 5.184619903564453, "epoch": 1.9390009606147935, "grad_norm": 1.1875, "learning_rate": 0.00046263261436011344, "loss": 4.8913, "mean_token_accuracy": 0.22006487399339675, "num_tokens": 46275142.0, "step": 20185 }, { "entropy": 5.155868911743164, "epoch": 1.9394812680115274, "grad_norm": 1.2578125, "learning_rate": 0.0004626136917316657, "loss": 4.9209, "mean_token_accuracy": 0.2221606343984604, "num_tokens": 46286339.0, "step": 20190 }, { "entropy": 5.146366453170776, "epoch": 1.9399615754082613, "grad_norm": 1.078125, "learning_rate": 0.00046259476474738514, "loss": 4.8876, "mean_token_accuracy": 0.2205871284008026, "num_tokens": 46297514.0, "step": 20195 }, { "entropy": 5.10622353553772, "epoch": 1.9404418828049952, "grad_norm": 1.2734375, "learning_rate": 0.00046257583340771123, "loss": 4.772, "mean_token_accuracy": 0.23131893575191498, "num_tokens": 46309249.0, "step": 20200 }, { "entropy": 5.167063570022583, "epoch": 1.940922190201729, "grad_norm": 1.0625, "learning_rate": 0.0004625568977130835, "loss": 4.9408, "mean_token_accuracy": 0.21829203367233277, "num_tokens": 46320759.0, "step": 20205 }, { "entropy": 5.2694789409637455, "epoch": 1.941402497598463, "grad_norm": 1.3671875, "learning_rate": 0.0004625379576639414, "loss": 4.9495, "mean_token_accuracy": 0.22133799344301225, "num_tokens": 46332113.0, "step": 20210 }, { "entropy": 5.178436660766602, "epoch": 1.9418828049951968, "grad_norm": 1.234375, "learning_rate": 0.00046251901326072487, "loss": 4.9182, "mean_token_accuracy": 0.22542383521795273, "num_tokens": 46342648.0, "step": 20215 }, { "entropy": 5.069062662124634, "epoch": 1.9423631123919307, "grad_norm": 1.1484375, "learning_rate": 0.00046250006450387367, "loss": 4.836, "mean_token_accuracy": 0.2282964691519737, "num_tokens": 46354066.0, "step": 20220 }, { "entropy": 5.241002178192138, "epoch": 1.9428434197886646, "grad_norm": 1.1640625, "learning_rate": 0.0004624811113938277, "loss": 4.9557, "mean_token_accuracy": 0.21603974103927612, "num_tokens": 46366586.0, "step": 20225 }, { "entropy": 5.172988367080689, "epoch": 1.9433237271853987, "grad_norm": 1.1953125, "learning_rate": 0.0004624621539310271, "loss": 4.8469, "mean_token_accuracy": 0.22970083802938462, "num_tokens": 46378155.0, "step": 20230 }, { "entropy": 5.133848571777344, "epoch": 1.9438040345821326, "grad_norm": 1.2109375, "learning_rate": 0.00046244319211591193, "loss": 4.9357, "mean_token_accuracy": 0.21522724479436875, "num_tokens": 46389631.0, "step": 20235 }, { "entropy": 5.231821441650391, "epoch": 1.9442843419788665, "grad_norm": 1.1484375, "learning_rate": 0.0004624242259489223, "loss": 4.9675, "mean_token_accuracy": 0.21257461309432985, "num_tokens": 46401915.0, "step": 20240 }, { "entropy": 5.232212734222412, "epoch": 1.9447646493756003, "grad_norm": 1.28125, "learning_rate": 0.00046240525543049884, "loss": 4.9201, "mean_token_accuracy": 0.2200825333595276, "num_tokens": 46412396.0, "step": 20245 }, { "entropy": 5.070683145523072, "epoch": 1.9452449567723344, "grad_norm": 1.1953125, "learning_rate": 0.00046238628056108176, "loss": 4.83, "mean_token_accuracy": 0.23035591542720796, "num_tokens": 46423089.0, "step": 20250 }, { "entropy": 5.243849849700927, "epoch": 1.9457252641690683, "grad_norm": 1.2421875, "learning_rate": 0.00046236730134111166, "loss": 4.938, "mean_token_accuracy": 0.21500004231929778, "num_tokens": 46434858.0, "step": 20255 }, { "entropy": 5.151482677459716, "epoch": 1.9462055715658022, "grad_norm": 1.2109375, "learning_rate": 0.0004623483177710291, "loss": 4.8766, "mean_token_accuracy": 0.23079614639282225, "num_tokens": 46445628.0, "step": 20260 }, { "entropy": 5.125904130935669, "epoch": 1.946685878962536, "grad_norm": 1.3046875, "learning_rate": 0.0004623293298512751, "loss": 4.8532, "mean_token_accuracy": 0.22425605952739716, "num_tokens": 46457382.0, "step": 20265 }, { "entropy": 5.156096410751343, "epoch": 1.94716618635927, "grad_norm": 1.265625, "learning_rate": 0.00046231033758229026, "loss": 4.9302, "mean_token_accuracy": 0.21903278380632402, "num_tokens": 46469765.0, "step": 20270 }, { "entropy": 5.3018150806427, "epoch": 1.9476464937560038, "grad_norm": 1.09375, "learning_rate": 0.0004622913409645154, "loss": 5.006, "mean_token_accuracy": 0.2137613371014595, "num_tokens": 46481451.0, "step": 20275 }, { "entropy": 5.180299520492554, "epoch": 1.9481268011527377, "grad_norm": 1.1640625, "learning_rate": 0.0004622723399983919, "loss": 4.7839, "mean_token_accuracy": 0.22959625124931335, "num_tokens": 46491585.0, "step": 20280 }, { "entropy": 5.159808111190796, "epoch": 1.9486071085494716, "grad_norm": 1.1640625, "learning_rate": 0.00046225333468436077, "loss": 5.02, "mean_token_accuracy": 0.21366383731365204, "num_tokens": 46503789.0, "step": 20285 }, { "entropy": 5.278623628616333, "epoch": 1.9490874159462055, "grad_norm": 1.328125, "learning_rate": 0.00046223432502286323, "loss": 4.9789, "mean_token_accuracy": 0.2195223718881607, "num_tokens": 46516647.0, "step": 20290 }, { "entropy": 5.185705709457397, "epoch": 1.9495677233429394, "grad_norm": 1.234375, "learning_rate": 0.00046221531101434056, "loss": 4.9046, "mean_token_accuracy": 0.21597968637943268, "num_tokens": 46527432.0, "step": 20295 }, { "entropy": 5.209407901763916, "epoch": 1.9500480307396733, "grad_norm": 1.125, "learning_rate": 0.0004621962926592343, "loss": 4.9442, "mean_token_accuracy": 0.2232288047671318, "num_tokens": 46538990.0, "step": 20300 }, { "entropy": 5.1626382827758786, "epoch": 1.9505283381364071, "grad_norm": 1.34375, "learning_rate": 0.000462177269957986, "loss": 4.922, "mean_token_accuracy": 0.2165737271308899, "num_tokens": 46551291.0, "step": 20305 }, { "entropy": 5.165114831924439, "epoch": 1.9510086455331412, "grad_norm": 1.296875, "learning_rate": 0.0004621582429110373, "loss": 4.8466, "mean_token_accuracy": 0.2309819519519806, "num_tokens": 46562415.0, "step": 20310 }, { "entropy": 5.232455444335938, "epoch": 1.9514889529298751, "grad_norm": 1.1953125, "learning_rate": 0.00046213921151883, "loss": 4.9708, "mean_token_accuracy": 0.22538121789693832, "num_tokens": 46572544.0, "step": 20315 }, { "entropy": 5.136361789703369, "epoch": 1.951969260326609, "grad_norm": 1.15625, "learning_rate": 0.0004621201757818059, "loss": 4.8627, "mean_token_accuracy": 0.2275417312979698, "num_tokens": 46584694.0, "step": 20320 }, { "entropy": 5.199646091461181, "epoch": 1.952449567723343, "grad_norm": 1.28125, "learning_rate": 0.00046210113570040683, "loss": 4.9058, "mean_token_accuracy": 0.22562002390623093, "num_tokens": 46595086.0, "step": 20325 }, { "entropy": 5.240507936477661, "epoch": 1.952929875120077, "grad_norm": 1.0625, "learning_rate": 0.000462082091275075, "loss": 4.9261, "mean_token_accuracy": 0.2216821476817131, "num_tokens": 46606627.0, "step": 20330 }, { "entropy": 5.210877895355225, "epoch": 1.9534101825168109, "grad_norm": 1.3671875, "learning_rate": 0.0004620630425062526, "loss": 4.9342, "mean_token_accuracy": 0.2145678550004959, "num_tokens": 46618560.0, "step": 20335 }, { "entropy": 5.178671979904175, "epoch": 1.9538904899135447, "grad_norm": 1.1328125, "learning_rate": 0.0004620439893943817, "loss": 4.89, "mean_token_accuracy": 0.22025668919086455, "num_tokens": 46630109.0, "step": 20340 }, { "entropy": 5.146572542190552, "epoch": 1.9543707973102786, "grad_norm": 1.1171875, "learning_rate": 0.0004620249319399049, "loss": 4.8815, "mean_token_accuracy": 0.22778512686491012, "num_tokens": 46642247.0, "step": 20345 }, { "entropy": 5.222317838668824, "epoch": 1.9548511047070125, "grad_norm": 1.2734375, "learning_rate": 0.00046200587014326455, "loss": 4.9358, "mean_token_accuracy": 0.22239342778921128, "num_tokens": 46653119.0, "step": 20350 }, { "entropy": 5.147661209106445, "epoch": 1.9553314121037464, "grad_norm": 1.125, "learning_rate": 0.0004619868040049031, "loss": 4.8709, "mean_token_accuracy": 0.2261478677392006, "num_tokens": 46664392.0, "step": 20355 }, { "entropy": 5.09419617652893, "epoch": 1.9558117195004803, "grad_norm": 1.234375, "learning_rate": 0.0004619677335252633, "loss": 4.8705, "mean_token_accuracy": 0.23083866387605667, "num_tokens": 46674981.0, "step": 20360 }, { "entropy": 5.161249685287475, "epoch": 1.9562920268972142, "grad_norm": 1.078125, "learning_rate": 0.00046194865870478793, "loss": 4.9732, "mean_token_accuracy": 0.22053551971912383, "num_tokens": 46688692.0, "step": 20365 }, { "entropy": 5.051489639282226, "epoch": 1.956772334293948, "grad_norm": 1.171875, "learning_rate": 0.00046192957954391983, "loss": 4.7946, "mean_token_accuracy": 0.23118489980697632, "num_tokens": 46699799.0, "step": 20370 }, { "entropy": 5.137979364395141, "epoch": 1.957252641690682, "grad_norm": 1.2109375, "learning_rate": 0.000461910496043102, "loss": 4.9096, "mean_token_accuracy": 0.22183059453964232, "num_tokens": 46711994.0, "step": 20375 }, { "entropy": 5.189553737640381, "epoch": 1.9577329490874158, "grad_norm": 1.296875, "learning_rate": 0.0004618914082027773, "loss": 4.9327, "mean_token_accuracy": 0.22325861006975173, "num_tokens": 46723247.0, "step": 20380 }, { "entropy": 5.254940795898437, "epoch": 1.95821325648415, "grad_norm": 1.1796875, "learning_rate": 0.00046187231602338926, "loss": 4.9589, "mean_token_accuracy": 0.2160505548119545, "num_tokens": 46732824.0, "step": 20385 }, { "entropy": 5.099370813369751, "epoch": 1.9586935638808838, "grad_norm": 1.2421875, "learning_rate": 0.00046185321950538086, "loss": 4.8828, "mean_token_accuracy": 0.22503511756658554, "num_tokens": 46744640.0, "step": 20390 }, { "entropy": 5.152339124679566, "epoch": 1.9591738712776177, "grad_norm": 1.3515625, "learning_rate": 0.0004618341186491955, "loss": 4.9108, "mean_token_accuracy": 0.22266879081726074, "num_tokens": 46757513.0, "step": 20395 }, { "entropy": 5.143527030944824, "epoch": 1.9596541786743515, "grad_norm": 1.109375, "learning_rate": 0.0004618150134552768, "loss": 4.8554, "mean_token_accuracy": 0.2254609391093254, "num_tokens": 46769021.0, "step": 20400 }, { "entropy": 5.09747953414917, "epoch": 1.9601344860710856, "grad_norm": 1.328125, "learning_rate": 0.000461795903924068, "loss": 4.7974, "mean_token_accuracy": 0.22561995834112167, "num_tokens": 46781175.0, "step": 20405 }, { "entropy": 5.138916158676148, "epoch": 1.9606147934678195, "grad_norm": 1.34375, "learning_rate": 0.00046177679005601313, "loss": 4.8219, "mean_token_accuracy": 0.2253713935613632, "num_tokens": 46792256.0, "step": 20410 }, { "entropy": 5.13484263420105, "epoch": 1.9610951008645534, "grad_norm": 1.265625, "learning_rate": 0.0004617576718515558, "loss": 4.8069, "mean_token_accuracy": 0.23279052674770356, "num_tokens": 46803215.0, "step": 20415 }, { "entropy": 5.221542453765869, "epoch": 1.9615754082612873, "grad_norm": 1.21875, "learning_rate": 0.0004617385493111399, "loss": 4.9427, "mean_token_accuracy": 0.2263939142227173, "num_tokens": 46813770.0, "step": 20420 }, { "entropy": 5.142890548706054, "epoch": 1.9620557156580212, "grad_norm": 1.484375, "learning_rate": 0.0004617194224352093, "loss": 4.928, "mean_token_accuracy": 0.22260897755622863, "num_tokens": 46826923.0, "step": 20425 }, { "entropy": 5.1240606784820555, "epoch": 1.962536023054755, "grad_norm": 1.1796875, "learning_rate": 0.0004617002912242083, "loss": 4.8204, "mean_token_accuracy": 0.23046000003814698, "num_tokens": 46837761.0, "step": 20430 }, { "entropy": 5.143495988845825, "epoch": 1.963016330451489, "grad_norm": 1.125, "learning_rate": 0.00046168115567858084, "loss": 4.9023, "mean_token_accuracy": 0.2167316809296608, "num_tokens": 46848190.0, "step": 20435 }, { "entropy": 5.088902616500855, "epoch": 1.9634966378482228, "grad_norm": 1.1796875, "learning_rate": 0.00046166201579877125, "loss": 4.7869, "mean_token_accuracy": 0.23102391064167022, "num_tokens": 46858643.0, "step": 20440 }, { "entropy": 5.146438980102539, "epoch": 1.9639769452449567, "grad_norm": 1.3515625, "learning_rate": 0.0004616428715852239, "loss": 4.8859, "mean_token_accuracy": 0.22377959191799163, "num_tokens": 46868711.0, "step": 20445 }, { "entropy": 5.192721700668335, "epoch": 1.9644572526416906, "grad_norm": 1.1875, "learning_rate": 0.0004616237230383833, "loss": 4.9867, "mean_token_accuracy": 0.2198183998465538, "num_tokens": 46880045.0, "step": 20450 }, { "entropy": 5.1650725364685055, "epoch": 1.9649375600384245, "grad_norm": 1.2265625, "learning_rate": 0.00046160457015869414, "loss": 4.9148, "mean_token_accuracy": 0.22214917838573456, "num_tokens": 46890680.0, "step": 20455 }, { "entropy": 5.133912897109985, "epoch": 1.9654178674351583, "grad_norm": 1.171875, "learning_rate": 0.00046158541294660083, "loss": 4.8629, "mean_token_accuracy": 0.22117667347192765, "num_tokens": 46904807.0, "step": 20460 }, { "entropy": 5.1891176223754885, "epoch": 1.9658981748318924, "grad_norm": 1.3984375, "learning_rate": 0.0004615662514025483, "loss": 4.8667, "mean_token_accuracy": 0.22456077635288238, "num_tokens": 46916039.0, "step": 20465 }, { "entropy": 5.184952592849731, "epoch": 1.9663784822286263, "grad_norm": 1.1796875, "learning_rate": 0.00046154708552698147, "loss": 4.8863, "mean_token_accuracy": 0.2231151282787323, "num_tokens": 46927175.0, "step": 20470 }, { "entropy": 5.097032785415649, "epoch": 1.9668587896253602, "grad_norm": 1.2734375, "learning_rate": 0.00046152791532034517, "loss": 4.7743, "mean_token_accuracy": 0.23669862747192383, "num_tokens": 46938711.0, "step": 20475 }, { "entropy": 5.152624464035034, "epoch": 1.967339097022094, "grad_norm": 1.140625, "learning_rate": 0.00046150874078308463, "loss": 4.8935, "mean_token_accuracy": 0.21999486535787582, "num_tokens": 46951002.0, "step": 20480 }, { "entropy": 5.23142032623291, "epoch": 1.9678194044188282, "grad_norm": 1.2578125, "learning_rate": 0.0004614895619156449, "loss": 4.9591, "mean_token_accuracy": 0.22308919131755828, "num_tokens": 46962673.0, "step": 20485 }, { "entropy": 5.1750617027282715, "epoch": 1.968299711815562, "grad_norm": 1.265625, "learning_rate": 0.00046147037871847125, "loss": 4.8769, "mean_token_accuracy": 0.22022126466035843, "num_tokens": 46973955.0, "step": 20490 }, { "entropy": 5.196652221679687, "epoch": 1.968780019212296, "grad_norm": 1.140625, "learning_rate": 0.0004614511911920092, "loss": 5.0136, "mean_token_accuracy": 0.21228874027729033, "num_tokens": 46985791.0, "step": 20495 }, { "entropy": 5.277962875366211, "epoch": 1.9692603266090298, "grad_norm": 1.2265625, "learning_rate": 0.00046143199933670404, "loss": 5.053, "mean_token_accuracy": 0.21041589677333833, "num_tokens": 46998437.0, "step": 20500 }, { "entropy": 5.223962783813477, "epoch": 1.9697406340057637, "grad_norm": 1.375, "learning_rate": 0.0004614128031530015, "loss": 4.8518, "mean_token_accuracy": 0.21939769387245178, "num_tokens": 47009666.0, "step": 20505 }, { "entropy": 5.241269969940186, "epoch": 1.9702209414024976, "grad_norm": 1.109375, "learning_rate": 0.00046139360264134724, "loss": 4.936, "mean_token_accuracy": 0.21502902060747148, "num_tokens": 47021170.0, "step": 20510 }, { "entropy": 5.196494102478027, "epoch": 1.9707012487992315, "grad_norm": 1.171875, "learning_rate": 0.0004613743978021869, "loss": 4.8961, "mean_token_accuracy": 0.22281893640756606, "num_tokens": 47033077.0, "step": 20515 }, { "entropy": 5.200880479812622, "epoch": 1.9711815561959654, "grad_norm": 1.109375, "learning_rate": 0.00046135518863596654, "loss": 4.9337, "mean_token_accuracy": 0.2225465178489685, "num_tokens": 47044858.0, "step": 20520 }, { "entropy": 5.217056846618652, "epoch": 1.9716618635926992, "grad_norm": 1.1640625, "learning_rate": 0.00046133597514313204, "loss": 5.0065, "mean_token_accuracy": 0.2121262475848198, "num_tokens": 47057587.0, "step": 20525 }, { "entropy": 5.242043828964233, "epoch": 1.9721421709894331, "grad_norm": 1.21875, "learning_rate": 0.0004613167573241295, "loss": 4.9383, "mean_token_accuracy": 0.21319063603878022, "num_tokens": 47068690.0, "step": 20530 }, { "entropy": 5.134704160690307, "epoch": 1.972622478386167, "grad_norm": 1.1484375, "learning_rate": 0.0004612975351794051, "loss": 4.9088, "mean_token_accuracy": 0.22375574558973313, "num_tokens": 47081279.0, "step": 20535 }, { "entropy": 5.199207830429077, "epoch": 1.973102785782901, "grad_norm": 1.203125, "learning_rate": 0.0004612783087094051, "loss": 4.9617, "mean_token_accuracy": 0.21864840090274812, "num_tokens": 47093372.0, "step": 20540 }, { "entropy": 5.259644794464111, "epoch": 1.973583093179635, "grad_norm": 1.2890625, "learning_rate": 0.00046125907791457594, "loss": 5.0322, "mean_token_accuracy": 0.21205034255981445, "num_tokens": 47104332.0, "step": 20545 }, { "entropy": 5.201462650299073, "epoch": 1.9740634005763689, "grad_norm": 1.4296875, "learning_rate": 0.00046123984279536405, "loss": 4.9045, "mean_token_accuracy": 0.21743421554565429, "num_tokens": 47116941.0, "step": 20550 }, { "entropy": 5.270235967636109, "epoch": 1.9745437079731027, "grad_norm": 1.34375, "learning_rate": 0.00046122060335221604, "loss": 4.9614, "mean_token_accuracy": 0.21613819301128387, "num_tokens": 47128473.0, "step": 20555 }, { "entropy": 5.208339738845825, "epoch": 1.9750240153698368, "grad_norm": 1.2109375, "learning_rate": 0.00046120135958557855, "loss": 4.8999, "mean_token_accuracy": 0.21493572890758514, "num_tokens": 47140586.0, "step": 20560 }, { "entropy": 5.112038850784302, "epoch": 1.9755043227665707, "grad_norm": 1.1953125, "learning_rate": 0.00046118211149589843, "loss": 4.8393, "mean_token_accuracy": 0.23156896084547043, "num_tokens": 47150875.0, "step": 20565 }, { "entropy": 5.145128870010376, "epoch": 1.9759846301633046, "grad_norm": 1.375, "learning_rate": 0.0004611628590836225, "loss": 4.7982, "mean_token_accuracy": 0.23350438922643663, "num_tokens": 47162074.0, "step": 20570 }, { "entropy": 5.13048825263977, "epoch": 1.9764649375600385, "grad_norm": 1.21875, "learning_rate": 0.0004611436023491979, "loss": 4.8319, "mean_token_accuracy": 0.22945202738046647, "num_tokens": 47174544.0, "step": 20575 }, { "entropy": 5.209021663665771, "epoch": 1.9769452449567724, "grad_norm": 1.1484375, "learning_rate": 0.0004611243412930714, "loss": 4.9348, "mean_token_accuracy": 0.21808502674102784, "num_tokens": 47186260.0, "step": 20580 }, { "entropy": 5.1533670902252195, "epoch": 1.9774255523535063, "grad_norm": 1.484375, "learning_rate": 0.00046110507591569047, "loss": 4.8547, "mean_token_accuracy": 0.2219648018479347, "num_tokens": 47198059.0, "step": 20585 }, { "entropy": 5.1115552425384525, "epoch": 1.9779058597502401, "grad_norm": 1.265625, "learning_rate": 0.0004610858062175023, "loss": 4.8177, "mean_token_accuracy": 0.23198753446340561, "num_tokens": 47208300.0, "step": 20590 }, { "entropy": 5.135996913909912, "epoch": 1.978386167146974, "grad_norm": 1.125, "learning_rate": 0.00046106653219895417, "loss": 4.8764, "mean_token_accuracy": 0.23086352050304412, "num_tokens": 47220135.0, "step": 20595 }, { "entropy": 5.274822854995728, "epoch": 1.978866474543708, "grad_norm": 1.3046875, "learning_rate": 0.0004610472538604938, "loss": 4.9403, "mean_token_accuracy": 0.21703655570745467, "num_tokens": 47230385.0, "step": 20600 }, { "entropy": 5.115185976028442, "epoch": 1.9793467819404418, "grad_norm": 1.1796875, "learning_rate": 0.00046102797120256854, "loss": 4.8057, "mean_token_accuracy": 0.22790150493383407, "num_tokens": 47242593.0, "step": 20605 }, { "entropy": 5.228035879135132, "epoch": 1.9798270893371757, "grad_norm": 1.4140625, "learning_rate": 0.0004610086842256262, "loss": 4.9663, "mean_token_accuracy": 0.21830164194107055, "num_tokens": 47254223.0, "step": 20610 }, { "entropy": 5.213165378570556, "epoch": 1.9803073967339095, "grad_norm": 1.1796875, "learning_rate": 0.0004609893929301146, "loss": 4.9534, "mean_token_accuracy": 0.22537720054388047, "num_tokens": 47266559.0, "step": 20615 }, { "entropy": 5.259535551071167, "epoch": 1.9807877041306436, "grad_norm": 1.1328125, "learning_rate": 0.0004609700973164816, "loss": 4.9239, "mean_token_accuracy": 0.22068007290363312, "num_tokens": 47276438.0, "step": 20620 }, { "entropy": 5.153802680969238, "epoch": 1.9812680115273775, "grad_norm": 1.484375, "learning_rate": 0.0004609507973851751, "loss": 4.936, "mean_token_accuracy": 0.2177566260099411, "num_tokens": 47287791.0, "step": 20625 }, { "entropy": 5.166331243515015, "epoch": 1.9817483189241114, "grad_norm": 1.1484375, "learning_rate": 0.00046093149313664316, "loss": 4.8766, "mean_token_accuracy": 0.22625285685062407, "num_tokens": 47300282.0, "step": 20630 }, { "entropy": 5.292104244232178, "epoch": 1.9822286263208453, "grad_norm": 1.2109375, "learning_rate": 0.0004609121845713342, "loss": 4.9625, "mean_token_accuracy": 0.22391965836286545, "num_tokens": 47310473.0, "step": 20635 }, { "entropy": 5.091473150253296, "epoch": 1.9827089337175794, "grad_norm": 1.625, "learning_rate": 0.0004608928716896963, "loss": 4.8727, "mean_token_accuracy": 0.22272167056798936, "num_tokens": 47322245.0, "step": 20640 }, { "entropy": 5.215969657897949, "epoch": 1.9831892411143133, "grad_norm": 1.1953125, "learning_rate": 0.0004608735544921778, "loss": 4.8979, "mean_token_accuracy": 0.22845628559589387, "num_tokens": 47334316.0, "step": 20645 }, { "entropy": 5.24719614982605, "epoch": 1.9836695485110472, "grad_norm": 1.25, "learning_rate": 0.00046085423297922745, "loss": 4.9315, "mean_token_accuracy": 0.2245178133249283, "num_tokens": 47344909.0, "step": 20650 }, { "entropy": 5.075035095214844, "epoch": 1.984149855907781, "grad_norm": 1.3125, "learning_rate": 0.00046083490715129367, "loss": 4.806, "mean_token_accuracy": 0.23430878520011902, "num_tokens": 47356682.0, "step": 20655 }, { "entropy": 5.1300498962402346, "epoch": 1.984630163304515, "grad_norm": 1.3203125, "learning_rate": 0.0004608155770088251, "loss": 4.7656, "mean_token_accuracy": 0.2302561417222023, "num_tokens": 47368362.0, "step": 20660 }, { "entropy": 5.2220391750335695, "epoch": 1.9851104707012488, "grad_norm": 1.4765625, "learning_rate": 0.00046079624255227066, "loss": 4.8974, "mean_token_accuracy": 0.22234228402376174, "num_tokens": 47379932.0, "step": 20665 }, { "entropy": 5.178265142440796, "epoch": 1.9855907780979827, "grad_norm": 1.21875, "learning_rate": 0.0004607769037820791, "loss": 5.0239, "mean_token_accuracy": 0.2156722739338875, "num_tokens": 47391244.0, "step": 20670 }, { "entropy": 5.200816106796265, "epoch": 1.9860710854947166, "grad_norm": 1.125, "learning_rate": 0.0004607575606986995, "loss": 4.9086, "mean_token_accuracy": 0.221403868496418, "num_tokens": 47404005.0, "step": 20675 }, { "entropy": 5.120086431503296, "epoch": 1.9865513928914504, "grad_norm": 1.2265625, "learning_rate": 0.0004607382133025809, "loss": 4.8111, "mean_token_accuracy": 0.22623786628246306, "num_tokens": 47414555.0, "step": 20680 }, { "entropy": 5.157238340377807, "epoch": 1.9870317002881843, "grad_norm": 1.1171875, "learning_rate": 0.00046071886159417257, "loss": 4.9429, "mean_token_accuracy": 0.22154468148946763, "num_tokens": 47425902.0, "step": 20685 }, { "entropy": 5.058484172821045, "epoch": 1.9875120076849182, "grad_norm": 1.109375, "learning_rate": 0.0004606995055739238, "loss": 4.8014, "mean_token_accuracy": 0.22854122519493103, "num_tokens": 47438068.0, "step": 20690 }, { "entropy": 5.215109252929688, "epoch": 1.9879923150816523, "grad_norm": 1.1171875, "learning_rate": 0.00046068014524228374, "loss": 4.9531, "mean_token_accuracy": 0.2229817181825638, "num_tokens": 47450322.0, "step": 20695 }, { "entropy": 5.151501083374024, "epoch": 1.9884726224783862, "grad_norm": 1.28125, "learning_rate": 0.00046066078059970217, "loss": 4.8988, "mean_token_accuracy": 0.22711405605077745, "num_tokens": 47462615.0, "step": 20700 }, { "entropy": 5.158354616165161, "epoch": 1.98895292987512, "grad_norm": 1.40625, "learning_rate": 0.0004606414116466286, "loss": 4.8534, "mean_token_accuracy": 0.2186825007200241, "num_tokens": 47472530.0, "step": 20705 }, { "entropy": 5.094603776931763, "epoch": 1.989433237271854, "grad_norm": 1.1640625, "learning_rate": 0.00046062203838351267, "loss": 4.8324, "mean_token_accuracy": 0.22771522551774978, "num_tokens": 47484217.0, "step": 20710 }, { "entropy": 5.1945013999938965, "epoch": 1.989913544668588, "grad_norm": 1.1953125, "learning_rate": 0.00046060266081080414, "loss": 4.9128, "mean_token_accuracy": 0.22298493534326552, "num_tokens": 47496365.0, "step": 20715 }, { "entropy": 5.161307954788208, "epoch": 1.990393852065322, "grad_norm": 1.0703125, "learning_rate": 0.0004605832789289531, "loss": 4.8889, "mean_token_accuracy": 0.23147787898778915, "num_tokens": 47509437.0, "step": 20720 }, { "entropy": 5.249230623245239, "epoch": 1.9908741594620558, "grad_norm": 1.265625, "learning_rate": 0.0004605638927384093, "loss": 4.9501, "mean_token_accuracy": 0.21548073440790178, "num_tokens": 47520913.0, "step": 20725 }, { "entropy": 5.247469091415406, "epoch": 1.9913544668587897, "grad_norm": 1.1484375, "learning_rate": 0.00046054450223962284, "loss": 4.9293, "mean_token_accuracy": 0.21719965785741807, "num_tokens": 47532015.0, "step": 20730 }, { "entropy": 5.224554872512817, "epoch": 1.9918347742555236, "grad_norm": 1.2109375, "learning_rate": 0.00046052510743304405, "loss": 5.011, "mean_token_accuracy": 0.21654557585716247, "num_tokens": 47543675.0, "step": 20735 }, { "entropy": 5.162395858764649, "epoch": 1.9923150816522575, "grad_norm": 1.171875, "learning_rate": 0.0004605057083191232, "loss": 4.8745, "mean_token_accuracy": 0.22160074561834336, "num_tokens": 47555757.0, "step": 20740 }, { "entropy": 5.2275580883026125, "epoch": 1.9927953890489913, "grad_norm": 1.1875, "learning_rate": 0.0004604863048983106, "loss": 5.0157, "mean_token_accuracy": 0.21975622475147247, "num_tokens": 47568043.0, "step": 20745 }, { "entropy": 5.137474584579468, "epoch": 1.9932756964457252, "grad_norm": 1.25, "learning_rate": 0.0004604668971710568, "loss": 4.8694, "mean_token_accuracy": 0.22627678513526917, "num_tokens": 47577811.0, "step": 20750 }, { "entropy": 5.268559169769287, "epoch": 1.993756003842459, "grad_norm": 1.2265625, "learning_rate": 0.0004604474851378124, "loss": 4.9726, "mean_token_accuracy": 0.21728340685367584, "num_tokens": 47588309.0, "step": 20755 }, { "entropy": 5.331043815612793, "epoch": 1.994236311239193, "grad_norm": 1.2578125, "learning_rate": 0.00046042806879902803, "loss": 5.0637, "mean_token_accuracy": 0.20983712524175643, "num_tokens": 47599814.0, "step": 20760 }, { "entropy": 5.226640224456787, "epoch": 1.9947166186359269, "grad_norm": 1.2109375, "learning_rate": 0.0004604086481551546, "loss": 4.9663, "mean_token_accuracy": 0.2197731092572212, "num_tokens": 47611851.0, "step": 20765 }, { "entropy": 5.173228645324707, "epoch": 1.9951969260326607, "grad_norm": 1.2890625, "learning_rate": 0.0004603892232066428, "loss": 4.8209, "mean_token_accuracy": 0.22686802744865417, "num_tokens": 47624056.0, "step": 20770 }, { "entropy": 5.171985721588134, "epoch": 1.9956772334293948, "grad_norm": 1.265625, "learning_rate": 0.00046036979395394374, "loss": 4.9178, "mean_token_accuracy": 0.22749678641557694, "num_tokens": 47637214.0, "step": 20775 }, { "entropy": 5.163872909545899, "epoch": 1.9961575408261287, "grad_norm": 1.4375, "learning_rate": 0.0004603503603975085, "loss": 4.8755, "mean_token_accuracy": 0.2249054953455925, "num_tokens": 47647498.0, "step": 20780 }, { "entropy": 5.165285444259643, "epoch": 1.9966378482228626, "grad_norm": 1.1484375, "learning_rate": 0.0004603309225377883, "loss": 4.8804, "mean_token_accuracy": 0.2229431003332138, "num_tokens": 47659576.0, "step": 20785 }, { "entropy": 5.208536052703858, "epoch": 1.9971181556195965, "grad_norm": 1.15625, "learning_rate": 0.0004603114803752344, "loss": 4.8832, "mean_token_accuracy": 0.21672031581401824, "num_tokens": 47670631.0, "step": 20790 }, { "entropy": 5.1253608703613285, "epoch": 1.9975984630163306, "grad_norm": 1.671875, "learning_rate": 0.00046029203391029813, "loss": 4.8264, "mean_token_accuracy": 0.22523149996995925, "num_tokens": 47681647.0, "step": 20795 }, { "entropy": 5.123491811752319, "epoch": 1.9980787704130645, "grad_norm": 1.375, "learning_rate": 0.00046027258314343107, "loss": 4.8585, "mean_token_accuracy": 0.22700741440057753, "num_tokens": 47694386.0, "step": 20800 }, { "entropy": 5.194536399841309, "epoch": 1.9985590778097984, "grad_norm": 1.2734375, "learning_rate": 0.00046025312807508487, "loss": 4.902, "mean_token_accuracy": 0.2210228532552719, "num_tokens": 47704897.0, "step": 20805 }, { "entropy": 5.185521221160888, "epoch": 1.9990393852065322, "grad_norm": 1.1875, "learning_rate": 0.00046023366870571097, "loss": 4.9504, "mean_token_accuracy": 0.22025657594203948, "num_tokens": 47715771.0, "step": 20810 }, { "entropy": 5.205077028274536, "epoch": 1.9995196926032661, "grad_norm": 1.125, "learning_rate": 0.00046021420503576145, "loss": 4.832, "mean_token_accuracy": 0.22895766347646712, "num_tokens": 47726295.0, "step": 20815 }, { "entropy": 5.0892219066619875, "epoch": 2.0, "grad_norm": 1.4609375, "learning_rate": 0.0004601947370656879, "loss": 4.9096, "mean_token_accuracy": 0.21949992924928666, "num_tokens": 47737072.0, "step": 20820 }, { "entropy": 5.194896125793457, "epoch": 2.000480307396734, "grad_norm": 1.1796875, "learning_rate": 0.0004601752647959426, "loss": 4.8771, "mean_token_accuracy": 0.2225003719329834, "num_tokens": 47750379.0, "step": 20825 }, { "entropy": 5.293734169006347, "epoch": 2.0009606147934678, "grad_norm": 1.3671875, "learning_rate": 0.0004601557882269775, "loss": 4.9057, "mean_token_accuracy": 0.21733027547597886, "num_tokens": 47763059.0, "step": 20830 }, { "entropy": 5.149887371063232, "epoch": 2.0014409221902016, "grad_norm": 1.171875, "learning_rate": 0.0004601363073592447, "loss": 4.8075, "mean_token_accuracy": 0.22715968489646912, "num_tokens": 47775513.0, "step": 20835 }, { "entropy": 5.059220695495606, "epoch": 2.0019212295869355, "grad_norm": 1.21875, "learning_rate": 0.0004601168221931967, "loss": 4.7368, "mean_token_accuracy": 0.2307532474398613, "num_tokens": 47786215.0, "step": 20840 }, { "entropy": 5.1203209400177006, "epoch": 2.0024015369836694, "grad_norm": 1.234375, "learning_rate": 0.0004600973327292857, "loss": 4.8167, "mean_token_accuracy": 0.22654520273208617, "num_tokens": 47796894.0, "step": 20845 }, { "entropy": 5.16290979385376, "epoch": 2.0028818443804033, "grad_norm": 1.3046875, "learning_rate": 0.00046007783896796436, "loss": 4.8404, "mean_token_accuracy": 0.22516684383153915, "num_tokens": 47808505.0, "step": 20850 }, { "entropy": 5.139503717422485, "epoch": 2.003362151777137, "grad_norm": 1.1484375, "learning_rate": 0.0004600583409096851, "loss": 4.729, "mean_token_accuracy": 0.23259917199611663, "num_tokens": 47819617.0, "step": 20855 }, { "entropy": 5.103642559051513, "epoch": 2.0038424591738715, "grad_norm": 1.1484375, "learning_rate": 0.00046003883855490066, "loss": 4.7818, "mean_token_accuracy": 0.23402840942144393, "num_tokens": 47830655.0, "step": 20860 }, { "entropy": 5.106681871414184, "epoch": 2.0043227665706054, "grad_norm": 1.203125, "learning_rate": 0.0004600193319040638, "loss": 4.7267, "mean_token_accuracy": 0.23414026349782943, "num_tokens": 47842439.0, "step": 20865 }, { "entropy": 5.0792402744293215, "epoch": 2.0048030739673393, "grad_norm": 1.1875, "learning_rate": 0.00045999982095762756, "loss": 4.6937, "mean_token_accuracy": 0.23838399052619935, "num_tokens": 47853327.0, "step": 20870 }, { "entropy": 5.178643083572387, "epoch": 2.005283381364073, "grad_norm": 1.4921875, "learning_rate": 0.00045998030571604473, "loss": 4.8768, "mean_token_accuracy": 0.22799644619226456, "num_tokens": 47867155.0, "step": 20875 }, { "entropy": 5.142761993408203, "epoch": 2.005763688760807, "grad_norm": 1.2421875, "learning_rate": 0.0004599607861797685, "loss": 4.8213, "mean_token_accuracy": 0.22531704753637313, "num_tokens": 47879641.0, "step": 20880 }, { "entropy": 5.163343143463135, "epoch": 2.006243996157541, "grad_norm": 1.2265625, "learning_rate": 0.00045994126234925203, "loss": 4.8515, "mean_token_accuracy": 0.22218380719423295, "num_tokens": 47890512.0, "step": 20885 }, { "entropy": 5.1860779285430905, "epoch": 2.006724303554275, "grad_norm": 1.15625, "learning_rate": 0.00045992173422494865, "loss": 4.8701, "mean_token_accuracy": 0.21841327995061874, "num_tokens": 47902575.0, "step": 20890 }, { "entropy": 5.16825647354126, "epoch": 2.0072046109510087, "grad_norm": 1.234375, "learning_rate": 0.0004599022018073117, "loss": 4.7529, "mean_token_accuracy": 0.2366969734430313, "num_tokens": 47915447.0, "step": 20895 }, { "entropy": 5.090538883209229, "epoch": 2.0076849183477425, "grad_norm": 1.3515625, "learning_rate": 0.0004598826650967946, "loss": 4.8135, "mean_token_accuracy": 0.2228596404194832, "num_tokens": 47925702.0, "step": 20900 }, { "entropy": 5.163640880584717, "epoch": 2.0081652257444764, "grad_norm": 1.2421875, "learning_rate": 0.00045986312409385105, "loss": 4.8047, "mean_token_accuracy": 0.22682830542325974, "num_tokens": 47936621.0, "step": 20905 }, { "entropy": 5.110746955871582, "epoch": 2.0086455331412103, "grad_norm": 1.1328125, "learning_rate": 0.0004598435787989347, "loss": 4.7464, "mean_token_accuracy": 0.23588199466466903, "num_tokens": 47948026.0, "step": 20910 }, { "entropy": 5.19895453453064, "epoch": 2.009125840537944, "grad_norm": 1.3046875, "learning_rate": 0.00045982402921249934, "loss": 4.8678, "mean_token_accuracy": 0.22533251196146012, "num_tokens": 47959976.0, "step": 20915 }, { "entropy": 5.225091600418091, "epoch": 2.009606147934678, "grad_norm": 1.3046875, "learning_rate": 0.0004598044753349988, "loss": 4.8494, "mean_token_accuracy": 0.22337938696146012, "num_tokens": 47972071.0, "step": 20920 }, { "entropy": 5.185384368896484, "epoch": 2.010086455331412, "grad_norm": 1.1875, "learning_rate": 0.00045978491716688706, "loss": 4.7873, "mean_token_accuracy": 0.2292557254433632, "num_tokens": 47984789.0, "step": 20925 }, { "entropy": 5.139200592041016, "epoch": 2.010566762728146, "grad_norm": 1.28125, "learning_rate": 0.0004597653547086184, "loss": 4.7904, "mean_token_accuracy": 0.231342613697052, "num_tokens": 47995661.0, "step": 20930 }, { "entropy": 5.208558177947998, "epoch": 2.0110470701248797, "grad_norm": 1.1640625, "learning_rate": 0.0004597457879606467, "loss": 4.8689, "mean_token_accuracy": 0.22292304635047913, "num_tokens": 48006043.0, "step": 20935 }, { "entropy": 5.165970993041992, "epoch": 2.011527377521614, "grad_norm": 1.203125, "learning_rate": 0.00045972621692342636, "loss": 4.9192, "mean_token_accuracy": 0.22206807434558867, "num_tokens": 48016904.0, "step": 20940 }, { "entropy": 5.181275033950806, "epoch": 2.012007684918348, "grad_norm": 1.2265625, "learning_rate": 0.00045970664159741186, "loss": 4.8304, "mean_token_accuracy": 0.23026852905750275, "num_tokens": 48026533.0, "step": 20945 }, { "entropy": 5.155037689208984, "epoch": 2.012487992315082, "grad_norm": 1.2421875, "learning_rate": 0.00045968706198305765, "loss": 4.769, "mean_token_accuracy": 0.23366015553474426, "num_tokens": 48038280.0, "step": 20950 }, { "entropy": 5.073322820663452, "epoch": 2.0129682997118157, "grad_norm": 1.2578125, "learning_rate": 0.00045966747808081824, "loss": 4.7476, "mean_token_accuracy": 0.23707432001829148, "num_tokens": 48049663.0, "step": 20955 }, { "entropy": 5.147378587722779, "epoch": 2.0134486071085496, "grad_norm": 1.2578125, "learning_rate": 0.0004596478898911483, "loss": 4.8248, "mean_token_accuracy": 0.22687099874019623, "num_tokens": 48059826.0, "step": 20960 }, { "entropy": 5.163741254806519, "epoch": 2.0139289145052834, "grad_norm": 1.234375, "learning_rate": 0.00045962829741450265, "loss": 4.7405, "mean_token_accuracy": 0.23313153833150863, "num_tokens": 48071097.0, "step": 20965 }, { "entropy": 5.1321446895599365, "epoch": 2.0144092219020173, "grad_norm": 1.2421875, "learning_rate": 0.0004596087006513361, "loss": 4.7963, "mean_token_accuracy": 0.22714407742023468, "num_tokens": 48082297.0, "step": 20970 }, { "entropy": 5.073679256439209, "epoch": 2.014889529298751, "grad_norm": 1.3203125, "learning_rate": 0.00045958909960210385, "loss": 4.7386, "mean_token_accuracy": 0.22801399379968643, "num_tokens": 48092946.0, "step": 20975 }, { "entropy": 5.0920733451843265, "epoch": 2.015369836695485, "grad_norm": 1.109375, "learning_rate": 0.00045956949426726075, "loss": 4.7988, "mean_token_accuracy": 0.2351382240653038, "num_tokens": 48105000.0, "step": 20980 }, { "entropy": 5.10857367515564, "epoch": 2.015850144092219, "grad_norm": 1.1796875, "learning_rate": 0.00045954988464726203, "loss": 4.7492, "mean_token_accuracy": 0.22923020124435425, "num_tokens": 48117513.0, "step": 20985 }, { "entropy": 5.245584154129029, "epoch": 2.016330451488953, "grad_norm": 1.25, "learning_rate": 0.000459530270742563, "loss": 4.9732, "mean_token_accuracy": 0.21183091551065444, "num_tokens": 48130895.0, "step": 20990 }, { "entropy": 5.161238050460815, "epoch": 2.0168107588856867, "grad_norm": 1.25, "learning_rate": 0.00045951065255361905, "loss": 4.7578, "mean_token_accuracy": 0.22671116292476653, "num_tokens": 48143348.0, "step": 20995 }, { "entropy": 5.128641080856323, "epoch": 2.0172910662824206, "grad_norm": 1.1796875, "learning_rate": 0.0004594910300808856, "loss": 4.7912, "mean_token_accuracy": 0.23204947561025618, "num_tokens": 48154959.0, "step": 21000 }, { "epoch": 2.0172910662824206, "eval_entropy": 4.9797098409949365, "eval_loss": 4.967945575714111, "eval_mean_token_accuracy": 0.22761427323133668, "eval_num_tokens": 48154959.0, "eval_runtime": 26.6315, "eval_samples_per_second": 1232.188, "eval_steps_per_second": 154.028, "step": 21000 }, { "entropy": 5.058965253829956, "epoch": 2.0177713736791545, "grad_norm": 1.328125, "learning_rate": 0.0004594714033248183, "loss": 4.6705, "mean_token_accuracy": 0.23726486414670944, "num_tokens": 48165227.0, "step": 21005 }, { "entropy": 5.185387229919433, "epoch": 2.0182516810758884, "grad_norm": 1.2890625, "learning_rate": 0.0004594517722858728, "loss": 4.8974, "mean_token_accuracy": 0.22563222348690032, "num_tokens": 48175133.0, "step": 21010 }, { "entropy": 5.174899244308472, "epoch": 2.0187319884726227, "grad_norm": 1.1484375, "learning_rate": 0.00045943213696450475, "loss": 4.8064, "mean_token_accuracy": 0.227556312084198, "num_tokens": 48187554.0, "step": 21015 }, { "entropy": 5.197635507583618, "epoch": 2.0192122958693566, "grad_norm": 1.296875, "learning_rate": 0.00045941249736117023, "loss": 4.8612, "mean_token_accuracy": 0.22105547934770584, "num_tokens": 48199410.0, "step": 21020 }, { "entropy": 5.099879741668701, "epoch": 2.0196926032660905, "grad_norm": 1.140625, "learning_rate": 0.0004593928534763251, "loss": 4.778, "mean_token_accuracy": 0.22858137935400008, "num_tokens": 48211124.0, "step": 21025 }, { "entropy": 5.104985618591309, "epoch": 2.0201729106628243, "grad_norm": 1.1953125, "learning_rate": 0.0004593732053104254, "loss": 4.7302, "mean_token_accuracy": 0.2319321408867836, "num_tokens": 48222690.0, "step": 21030 }, { "entropy": 5.139070272445679, "epoch": 2.020653218059558, "grad_norm": 1.1171875, "learning_rate": 0.00045935355286392735, "loss": 4.7768, "mean_token_accuracy": 0.22877870500087738, "num_tokens": 48235054.0, "step": 21035 }, { "entropy": 5.045724630355835, "epoch": 2.021133525456292, "grad_norm": 1.1953125, "learning_rate": 0.0004593338961372873, "loss": 4.7274, "mean_token_accuracy": 0.2347530335187912, "num_tokens": 48246645.0, "step": 21040 }, { "entropy": 5.0374504089355465, "epoch": 2.021613832853026, "grad_norm": 1.1875, "learning_rate": 0.0004593142351309614, "loss": 4.7458, "mean_token_accuracy": 0.23782979398965837, "num_tokens": 48258149.0, "step": 21045 }, { "entropy": 5.162465238571167, "epoch": 2.02209414024976, "grad_norm": 1.15625, "learning_rate": 0.0004592945698454064, "loss": 4.8985, "mean_token_accuracy": 0.2223748430609703, "num_tokens": 48269892.0, "step": 21050 }, { "entropy": 5.217119455337524, "epoch": 2.0225744476464937, "grad_norm": 1.234375, "learning_rate": 0.00045927490028107866, "loss": 4.8687, "mean_token_accuracy": 0.22514340579509734, "num_tokens": 48281087.0, "step": 21055 }, { "entropy": 5.139309453964233, "epoch": 2.0230547550432276, "grad_norm": 1.1015625, "learning_rate": 0.000459255226438435, "loss": 4.7274, "mean_token_accuracy": 0.23785278648138047, "num_tokens": 48292177.0, "step": 21060 }, { "entropy": 5.058869981765747, "epoch": 2.0235350624399615, "grad_norm": 1.2734375, "learning_rate": 0.000459235548317932, "loss": 4.7745, "mean_token_accuracy": 0.2236475557088852, "num_tokens": 48304281.0, "step": 21065 }, { "entropy": 5.087431287765503, "epoch": 2.0240153698366954, "grad_norm": 1.2890625, "learning_rate": 0.00045921586592002667, "loss": 4.751, "mean_token_accuracy": 0.23001312762498854, "num_tokens": 48316758.0, "step": 21070 }, { "entropy": 5.2216087818145756, "epoch": 2.0244956772334293, "grad_norm": 1.21875, "learning_rate": 0.000459196179245176, "loss": 4.9064, "mean_token_accuracy": 0.22211889773607255, "num_tokens": 48327455.0, "step": 21075 }, { "entropy": 5.122845983505249, "epoch": 2.024975984630163, "grad_norm": 1.4140625, "learning_rate": 0.0004591764882938369, "loss": 4.8134, "mean_token_accuracy": 0.2234889015555382, "num_tokens": 48338105.0, "step": 21080 }, { "entropy": 5.1362800121307375, "epoch": 2.025456292026897, "grad_norm": 1.359375, "learning_rate": 0.0004591567930664667, "loss": 4.8668, "mean_token_accuracy": 0.22312761843204498, "num_tokens": 48349599.0, "step": 21085 }, { "entropy": 5.173755311965943, "epoch": 2.025936599423631, "grad_norm": 1.2890625, "learning_rate": 0.0004591370935635226, "loss": 4.8962, "mean_token_accuracy": 0.22018368989229203, "num_tokens": 48361488.0, "step": 21090 }, { "entropy": 5.229612350463867, "epoch": 2.0264169068203652, "grad_norm": 1.28125, "learning_rate": 0.0004591173897854619, "loss": 4.8223, "mean_token_accuracy": 0.22342414259910584, "num_tokens": 48372049.0, "step": 21095 }, { "entropy": 5.111845207214356, "epoch": 2.026897214217099, "grad_norm": 1.3046875, "learning_rate": 0.0004590976817327422, "loss": 4.7524, "mean_token_accuracy": 0.23141625225543977, "num_tokens": 48385599.0, "step": 21100 }, { "entropy": 5.187682151794434, "epoch": 2.027377521613833, "grad_norm": 1.40625, "learning_rate": 0.0004590779694058209, "loss": 4.8404, "mean_token_accuracy": 0.230179800093174, "num_tokens": 48397390.0, "step": 21105 }, { "entropy": 5.152220296859741, "epoch": 2.027857829010567, "grad_norm": 1.3125, "learning_rate": 0.00045905825280515586, "loss": 4.8487, "mean_token_accuracy": 0.22240738272666932, "num_tokens": 48409837.0, "step": 21110 }, { "entropy": 5.095877361297608, "epoch": 2.0283381364073008, "grad_norm": 1.3515625, "learning_rate": 0.00045903853193120464, "loss": 4.6985, "mean_token_accuracy": 0.2371117353439331, "num_tokens": 48420765.0, "step": 21115 }, { "entropy": 5.13346791267395, "epoch": 2.0288184438040346, "grad_norm": 1.1796875, "learning_rate": 0.00045901880678442524, "loss": 4.8352, "mean_token_accuracy": 0.22843140363693237, "num_tokens": 48431939.0, "step": 21120 }, { "entropy": 5.203976631164551, "epoch": 2.0292987512007685, "grad_norm": 1.125, "learning_rate": 0.00045899907736527556, "loss": 4.8942, "mean_token_accuracy": 0.21955382823944092, "num_tokens": 48444820.0, "step": 21125 }, { "entropy": 5.116818284988403, "epoch": 2.0297790585975024, "grad_norm": 1.1484375, "learning_rate": 0.00045897934367421364, "loss": 4.8264, "mean_token_accuracy": 0.2259441375732422, "num_tokens": 48457315.0, "step": 21130 }, { "entropy": 5.199477338790894, "epoch": 2.0302593659942363, "grad_norm": 1.328125, "learning_rate": 0.0004589596057116977, "loss": 4.8979, "mean_token_accuracy": 0.21822259724140167, "num_tokens": 48468612.0, "step": 21135 }, { "entropy": 5.089222574234009, "epoch": 2.03073967339097, "grad_norm": 1.1953125, "learning_rate": 0.00045893986347818593, "loss": 4.7565, "mean_token_accuracy": 0.22949785143136978, "num_tokens": 48479453.0, "step": 21140 }, { "entropy": 5.087232303619385, "epoch": 2.031219980787704, "grad_norm": 1.1171875, "learning_rate": 0.0004589201169741368, "loss": 4.7952, "mean_token_accuracy": 0.22973719537258147, "num_tokens": 48491656.0, "step": 21145 }, { "entropy": 5.103940820693969, "epoch": 2.031700288184438, "grad_norm": 1.1328125, "learning_rate": 0.00045890036620000856, "loss": 4.7292, "mean_token_accuracy": 0.2226880133152008, "num_tokens": 48503431.0, "step": 21150 }, { "entropy": 5.117399597167969, "epoch": 2.032180595581172, "grad_norm": 1.140625, "learning_rate": 0.00045888061115626, "loss": 4.7687, "mean_token_accuracy": 0.22986358106136323, "num_tokens": 48513922.0, "step": 21155 }, { "entropy": 5.036969900131226, "epoch": 2.0326609029779057, "grad_norm": 1.203125, "learning_rate": 0.0004588608518433496, "loss": 4.7416, "mean_token_accuracy": 0.2294941857457161, "num_tokens": 48524715.0, "step": 21160 }, { "entropy": 5.133905744552612, "epoch": 2.0331412103746396, "grad_norm": 1.1484375, "learning_rate": 0.0004588410882617362, "loss": 4.884, "mean_token_accuracy": 0.22323887348175048, "num_tokens": 48536696.0, "step": 21165 }, { "entropy": 5.264414978027344, "epoch": 2.0336215177713735, "grad_norm": 1.21875, "learning_rate": 0.0004588213204118786, "loss": 4.9228, "mean_token_accuracy": 0.22241499423980712, "num_tokens": 48550290.0, "step": 21170 }, { "entropy": 5.195666217803955, "epoch": 2.034101825168108, "grad_norm": 1.140625, "learning_rate": 0.00045880154829423586, "loss": 4.8509, "mean_token_accuracy": 0.22739229947328568, "num_tokens": 48561047.0, "step": 21175 }, { "entropy": 5.034094524383545, "epoch": 2.0345821325648417, "grad_norm": 1.2578125, "learning_rate": 0.0004587817719092668, "loss": 4.636, "mean_token_accuracy": 0.23849904984235765, "num_tokens": 48571268.0, "step": 21180 }, { "entropy": 5.0633808135986325, "epoch": 2.0350624399615755, "grad_norm": 1.2109375, "learning_rate": 0.00045876199125743087, "loss": 4.7628, "mean_token_accuracy": 0.22590662389993668, "num_tokens": 48583158.0, "step": 21185 }, { "entropy": 5.175520372390747, "epoch": 2.0355427473583094, "grad_norm": 1.1796875, "learning_rate": 0.0004587422063391871, "loss": 4.8415, "mean_token_accuracy": 0.22876403331756592, "num_tokens": 48593419.0, "step": 21190 }, { "entropy": 5.183585357666016, "epoch": 2.0360230547550433, "grad_norm": 1.171875, "learning_rate": 0.0004587224171549949, "loss": 4.8184, "mean_token_accuracy": 0.23225459605455398, "num_tokens": 48605845.0, "step": 21195 }, { "entropy": 5.118991184234619, "epoch": 2.036503362151777, "grad_norm": 1.1171875, "learning_rate": 0.00045870262370531376, "loss": 4.7669, "mean_token_accuracy": 0.23297881484031677, "num_tokens": 48616503.0, "step": 21200 }, { "entropy": 5.188354969024658, "epoch": 2.036983669548511, "grad_norm": 1.2734375, "learning_rate": 0.00045868282599060314, "loss": 4.8723, "mean_token_accuracy": 0.22089865952730178, "num_tokens": 48628410.0, "step": 21205 }, { "entropy": 5.106374835968017, "epoch": 2.037463976945245, "grad_norm": 1.265625, "learning_rate": 0.0004586630240113227, "loss": 4.7639, "mean_token_accuracy": 0.23033759295940398, "num_tokens": 48639457.0, "step": 21210 }, { "entropy": 5.113154745101928, "epoch": 2.037944284341979, "grad_norm": 1.1640625, "learning_rate": 0.0004586432177679322, "loss": 4.81, "mean_token_accuracy": 0.22925937473773955, "num_tokens": 48652250.0, "step": 21215 }, { "entropy": 5.124432039260864, "epoch": 2.0384245917387127, "grad_norm": 1.1171875, "learning_rate": 0.00045862340726089153, "loss": 4.8795, "mean_token_accuracy": 0.22818073034286498, "num_tokens": 48663386.0, "step": 21220 }, { "entropy": 5.189832639694214, "epoch": 2.0389048991354466, "grad_norm": 1.140625, "learning_rate": 0.0004586035924906606, "loss": 4.8491, "mean_token_accuracy": 0.22109754979610444, "num_tokens": 48675111.0, "step": 21225 }, { "entropy": 5.126422500610351, "epoch": 2.0393852065321805, "grad_norm": 1.2265625, "learning_rate": 0.00045858377345769946, "loss": 4.7458, "mean_token_accuracy": 0.22514686733484268, "num_tokens": 48686264.0, "step": 21230 }, { "entropy": 5.126074361801147, "epoch": 2.0398655139289144, "grad_norm": 1.09375, "learning_rate": 0.0004585639501624682, "loss": 4.7946, "mean_token_accuracy": 0.22376175075769425, "num_tokens": 48697249.0, "step": 21235 }, { "entropy": 5.101242637634277, "epoch": 2.0403458213256482, "grad_norm": 1.15625, "learning_rate": 0.000458544122605427, "loss": 4.7369, "mean_token_accuracy": 0.23103302717208862, "num_tokens": 48708808.0, "step": 21240 }, { "entropy": 5.159991884231568, "epoch": 2.040826128722382, "grad_norm": 1.1328125, "learning_rate": 0.00045852429078703646, "loss": 4.8868, "mean_token_accuracy": 0.22002633064985275, "num_tokens": 48719879.0, "step": 21245 }, { "entropy": 5.2014281272888185, "epoch": 2.0413064361191164, "grad_norm": 1.0546875, "learning_rate": 0.00045850445470775673, "loss": 4.8326, "mean_token_accuracy": 0.22262947410345077, "num_tokens": 48731110.0, "step": 21250 }, { "entropy": 5.137082576751709, "epoch": 2.0417867435158503, "grad_norm": 1.2109375, "learning_rate": 0.0004584846143680485, "loss": 4.7875, "mean_token_accuracy": 0.22981350421905516, "num_tokens": 48742799.0, "step": 21255 }, { "entropy": 5.246884727478028, "epoch": 2.042267050912584, "grad_norm": 1.375, "learning_rate": 0.0004584647697683723, "loss": 4.9331, "mean_token_accuracy": 0.2172001451253891, "num_tokens": 48753773.0, "step": 21260 }, { "entropy": 5.141643142700195, "epoch": 2.042747358309318, "grad_norm": 1.171875, "learning_rate": 0.00045844492090918904, "loss": 4.7857, "mean_token_accuracy": 0.23033898323774338, "num_tokens": 48764561.0, "step": 21265 }, { "entropy": 5.087370872497559, "epoch": 2.043227665706052, "grad_norm": 1.1015625, "learning_rate": 0.00045842506779095936, "loss": 4.6829, "mean_token_accuracy": 0.22804915010929108, "num_tokens": 48775319.0, "step": 21270 }, { "entropy": 5.2386863231658936, "epoch": 2.043707973102786, "grad_norm": 1.171875, "learning_rate": 0.0004584052104141443, "loss": 4.9516, "mean_token_accuracy": 0.208183716237545, "num_tokens": 48787595.0, "step": 21275 }, { "entropy": 5.194898891448974, "epoch": 2.0441882804995197, "grad_norm": 1.265625, "learning_rate": 0.0004583853487792048, "loss": 4.8145, "mean_token_accuracy": 0.2251271814107895, "num_tokens": 48800015.0, "step": 21280 }, { "entropy": 5.183368492126465, "epoch": 2.0446685878962536, "grad_norm": 1.1171875, "learning_rate": 0.0004583654828866021, "loss": 4.836, "mean_token_accuracy": 0.22027584314346313, "num_tokens": 48811651.0, "step": 21285 }, { "entropy": 5.174183654785156, "epoch": 2.0451488952929875, "grad_norm": 1.171875, "learning_rate": 0.0004583456127367973, "loss": 4.86, "mean_token_accuracy": 0.22795891016721725, "num_tokens": 48823561.0, "step": 21290 }, { "entropy": 5.126351070404053, "epoch": 2.0456292026897214, "grad_norm": 1.2265625, "learning_rate": 0.0004583257383302519, "loss": 4.8773, "mean_token_accuracy": 0.2161845326423645, "num_tokens": 48835385.0, "step": 21295 }, { "entropy": 5.1982903480529785, "epoch": 2.0461095100864553, "grad_norm": 1.1953125, "learning_rate": 0.0004583058596674271, "loss": 4.8693, "mean_token_accuracy": 0.22223322540521623, "num_tokens": 48846899.0, "step": 21300 }, { "entropy": 5.138012409210205, "epoch": 2.046589817483189, "grad_norm": 1.25, "learning_rate": 0.0004582859767487846, "loss": 4.8044, "mean_token_accuracy": 0.22934290319681166, "num_tokens": 48857850.0, "step": 21305 }, { "entropy": 5.0926210403442385, "epoch": 2.047070124879923, "grad_norm": 1.1796875, "learning_rate": 0.00045826608957478604, "loss": 4.8101, "mean_token_accuracy": 0.226822829246521, "num_tokens": 48869192.0, "step": 21310 }, { "entropy": 5.1192957878112795, "epoch": 2.047550432276657, "grad_norm": 1.2578125, "learning_rate": 0.00045824619814589297, "loss": 4.782, "mean_token_accuracy": 0.22894603312015532, "num_tokens": 48881314.0, "step": 21315 }, { "entropy": 5.085238838195801, "epoch": 2.0480307396733908, "grad_norm": 1.1796875, "learning_rate": 0.0004582263024625674, "loss": 4.8282, "mean_token_accuracy": 0.22456549853086472, "num_tokens": 48893292.0, "step": 21320 }, { "entropy": 5.155279207229614, "epoch": 2.048511047070125, "grad_norm": 1.234375, "learning_rate": 0.0004582064025252711, "loss": 4.8899, "mean_token_accuracy": 0.22350900620222092, "num_tokens": 48905607.0, "step": 21325 }, { "entropy": 5.1824195861816404, "epoch": 2.048991354466859, "grad_norm": 1.4140625, "learning_rate": 0.0004581864983344661, "loss": 4.7898, "mean_token_accuracy": 0.2279469147324562, "num_tokens": 48916194.0, "step": 21330 }, { "entropy": 5.151317453384399, "epoch": 2.049471661863593, "grad_norm": 1.171875, "learning_rate": 0.0004581665898906147, "loss": 4.793, "mean_token_accuracy": 0.22845213562250138, "num_tokens": 48927560.0, "step": 21335 }, { "entropy": 5.038725471496582, "epoch": 2.0499519692603267, "grad_norm": 1.125, "learning_rate": 0.00045814667719417887, "loss": 4.7988, "mean_token_accuracy": 0.23149679154157637, "num_tokens": 48938284.0, "step": 21340 }, { "entropy": 5.110639238357544, "epoch": 2.0504322766570606, "grad_norm": 1.1875, "learning_rate": 0.000458126760245621, "loss": 4.789, "mean_token_accuracy": 0.22833970189094543, "num_tokens": 48948325.0, "step": 21345 }, { "entropy": 5.164987468719483, "epoch": 2.0509125840537945, "grad_norm": 1.1640625, "learning_rate": 0.0004581068390454036, "loss": 4.79, "mean_token_accuracy": 0.23049985021352767, "num_tokens": 48960020.0, "step": 21350 }, { "entropy": 5.129503297805786, "epoch": 2.0513928914505284, "grad_norm": 1.2421875, "learning_rate": 0.00045808691359398905, "loss": 4.7458, "mean_token_accuracy": 0.2291984051465988, "num_tokens": 48970833.0, "step": 21355 }, { "entropy": 5.103326511383057, "epoch": 2.0518731988472623, "grad_norm": 1.15625, "learning_rate": 0.0004580669838918401, "loss": 4.8077, "mean_token_accuracy": 0.23423267751932145, "num_tokens": 48981821.0, "step": 21360 }, { "entropy": 5.16456298828125, "epoch": 2.052353506243996, "grad_norm": 1.1953125, "learning_rate": 0.00045804704993941935, "loss": 4.8188, "mean_token_accuracy": 0.22716680616140367, "num_tokens": 48992341.0, "step": 21365 }, { "entropy": 5.176882934570313, "epoch": 2.05283381364073, "grad_norm": 1.203125, "learning_rate": 0.00045802711173718966, "loss": 4.8875, "mean_token_accuracy": 0.22166429013013839, "num_tokens": 49003063.0, "step": 21370 }, { "entropy": 5.161245679855346, "epoch": 2.053314121037464, "grad_norm": 1.21875, "learning_rate": 0.0004580071692856138, "loss": 4.8891, "mean_token_accuracy": 0.2255357474088669, "num_tokens": 49015353.0, "step": 21375 }, { "entropy": 5.0843805313110355, "epoch": 2.053794428434198, "grad_norm": 1.1875, "learning_rate": 0.00045798722258515504, "loss": 4.7591, "mean_token_accuracy": 0.23294135332107543, "num_tokens": 49026405.0, "step": 21380 }, { "entropy": 5.180259132385254, "epoch": 2.0542747358309317, "grad_norm": 1.1640625, "learning_rate": 0.00045796727163627623, "loss": 4.8378, "mean_token_accuracy": 0.22284193336963654, "num_tokens": 49038433.0, "step": 21385 }, { "entropy": 5.066757488250732, "epoch": 2.0547550432276656, "grad_norm": 1.203125, "learning_rate": 0.0004579473164394408, "loss": 4.7192, "mean_token_accuracy": 0.2332998186349869, "num_tokens": 49049463.0, "step": 21390 }, { "entropy": 5.036176252365112, "epoch": 2.0552353506243994, "grad_norm": 1.140625, "learning_rate": 0.00045792735699511176, "loss": 4.7009, "mean_token_accuracy": 0.2337260901927948, "num_tokens": 49061607.0, "step": 21395 }, { "entropy": 5.158068370819092, "epoch": 2.0557156580211333, "grad_norm": 1.1796875, "learning_rate": 0.00045790739330375276, "loss": 4.8613, "mean_token_accuracy": 0.2220068097114563, "num_tokens": 49074687.0, "step": 21400 }, { "entropy": 5.102404451370239, "epoch": 2.0561959654178676, "grad_norm": 1.1640625, "learning_rate": 0.00045788742536582717, "loss": 4.7332, "mean_token_accuracy": 0.23337887227535248, "num_tokens": 49086151.0, "step": 21405 }, { "entropy": 5.126197957992554, "epoch": 2.0566762728146015, "grad_norm": 1.109375, "learning_rate": 0.00045786745318179866, "loss": 4.8397, "mean_token_accuracy": 0.2232088029384613, "num_tokens": 49097418.0, "step": 21410 }, { "entropy": 5.067611217498779, "epoch": 2.0571565802113354, "grad_norm": 1.0859375, "learning_rate": 0.0004578474767521309, "loss": 4.7617, "mean_token_accuracy": 0.23839059025049208, "num_tokens": 49108570.0, "step": 21415 }, { "entropy": 5.0432921886444095, "epoch": 2.0576368876080693, "grad_norm": 1.2890625, "learning_rate": 0.00045782749607728765, "loss": 4.8426, "mean_token_accuracy": 0.22151308357715607, "num_tokens": 49120906.0, "step": 21420 }, { "entropy": 5.13737416267395, "epoch": 2.058117195004803, "grad_norm": 1.2421875, "learning_rate": 0.00045780751115773286, "loss": 4.7463, "mean_token_accuracy": 0.2359360083937645, "num_tokens": 49132040.0, "step": 21425 }, { "entropy": 5.133153438568115, "epoch": 2.058597502401537, "grad_norm": 1.1484375, "learning_rate": 0.0004577875219939304, "loss": 4.8587, "mean_token_accuracy": 0.23113487362861634, "num_tokens": 49143427.0, "step": 21430 }, { "entropy": 5.129691934585571, "epoch": 2.059077809798271, "grad_norm": 1.125, "learning_rate": 0.0004577675285863446, "loss": 4.8194, "mean_token_accuracy": 0.22555259466171265, "num_tokens": 49155351.0, "step": 21435 }, { "entropy": 5.158038377761841, "epoch": 2.059558117195005, "grad_norm": 1.1875, "learning_rate": 0.00045774753093543943, "loss": 4.8271, "mean_token_accuracy": 0.22593292891979216, "num_tokens": 49167898.0, "step": 21440 }, { "entropy": 5.149058246612549, "epoch": 2.0600384245917387, "grad_norm": 1.15625, "learning_rate": 0.0004577275290416791, "loss": 4.8146, "mean_token_accuracy": 0.22242112308740616, "num_tokens": 49179209.0, "step": 21445 }, { "entropy": 5.105331230163574, "epoch": 2.0605187319884726, "grad_norm": 1.1484375, "learning_rate": 0.0004577075229055283, "loss": 4.7651, "mean_token_accuracy": 0.2339042067527771, "num_tokens": 49190382.0, "step": 21450 }, { "entropy": 5.0633519172668455, "epoch": 2.0609990393852065, "grad_norm": 1.2109375, "learning_rate": 0.00045768751252745133, "loss": 4.8038, "mean_token_accuracy": 0.23498952239751816, "num_tokens": 49203511.0, "step": 21455 }, { "entropy": 5.231168937683106, "epoch": 2.0614793467819403, "grad_norm": 1.2265625, "learning_rate": 0.00045766749790791274, "loss": 4.8476, "mean_token_accuracy": 0.22552503943443297, "num_tokens": 49214276.0, "step": 21460 }, { "entropy": 5.007623672485352, "epoch": 2.061959654178674, "grad_norm": 1.0859375, "learning_rate": 0.0004576474790473773, "loss": 4.7003, "mean_token_accuracy": 0.2353790283203125, "num_tokens": 49225803.0, "step": 21465 }, { "entropy": 5.100078535079956, "epoch": 2.062439961575408, "grad_norm": 1.1015625, "learning_rate": 0.00045762745594630973, "loss": 4.8239, "mean_token_accuracy": 0.22536432445049287, "num_tokens": 49237187.0, "step": 21470 }, { "entropy": 5.23383059501648, "epoch": 2.062920268972142, "grad_norm": 1.390625, "learning_rate": 0.0004576074286051749, "loss": 4.8905, "mean_token_accuracy": 0.22259025722742082, "num_tokens": 49250151.0, "step": 21475 }, { "entropy": 5.116559600830078, "epoch": 2.063400576368876, "grad_norm": 1.2734375, "learning_rate": 0.00045758739702443787, "loss": 4.7177, "mean_token_accuracy": 0.24015939831733704, "num_tokens": 49261970.0, "step": 21480 }, { "entropy": 5.085567474365234, "epoch": 2.06388088376561, "grad_norm": 1.203125, "learning_rate": 0.0004575673612045636, "loss": 4.7807, "mean_token_accuracy": 0.23618687838315963, "num_tokens": 49271427.0, "step": 21485 }, { "entropy": 5.068835973739624, "epoch": 2.064361191162344, "grad_norm": 1.09375, "learning_rate": 0.0004575473211460173, "loss": 4.8175, "mean_token_accuracy": 0.22350717782974244, "num_tokens": 49285102.0, "step": 21490 }, { "entropy": 5.153846311569214, "epoch": 2.064841498559078, "grad_norm": 1.2578125, "learning_rate": 0.0004575272768492644, "loss": 4.815, "mean_token_accuracy": 0.2282892346382141, "num_tokens": 49297569.0, "step": 21495 }, { "entropy": 5.153155851364136, "epoch": 2.065321805955812, "grad_norm": 1.15625, "learning_rate": 0.00045750722831476993, "loss": 4.7356, "mean_token_accuracy": 0.2330961436033249, "num_tokens": 49308795.0, "step": 21500 }, { "entropy": 5.150883054733276, "epoch": 2.0658021133525457, "grad_norm": 1.109375, "learning_rate": 0.00045748717554299964, "loss": 4.8588, "mean_token_accuracy": 0.22739875316619873, "num_tokens": 49320971.0, "step": 21505 }, { "entropy": 5.160495758056641, "epoch": 2.0662824207492796, "grad_norm": 1.1796875, "learning_rate": 0.000457467118534419, "loss": 4.9338, "mean_token_accuracy": 0.21607262045145034, "num_tokens": 49333624.0, "step": 21510 }, { "entropy": 5.244060420989991, "epoch": 2.0667627281460135, "grad_norm": 1.2265625, "learning_rate": 0.0004574470572894938, "loss": 4.9336, "mean_token_accuracy": 0.21895478069782257, "num_tokens": 49344863.0, "step": 21515 }, { "entropy": 5.147519111633301, "epoch": 2.0672430355427474, "grad_norm": 1.1328125, "learning_rate": 0.0004574269918086895, "loss": 4.7947, "mean_token_accuracy": 0.22707038521766662, "num_tokens": 49355501.0, "step": 21520 }, { "entropy": 5.124402475357056, "epoch": 2.0677233429394812, "grad_norm": 1.1796875, "learning_rate": 0.0004574069220924722, "loss": 4.8635, "mean_token_accuracy": 0.23370101898908616, "num_tokens": 49366384.0, "step": 21525 }, { "entropy": 5.225724935531616, "epoch": 2.068203650336215, "grad_norm": 1.3125, "learning_rate": 0.0004573868481413079, "loss": 4.9503, "mean_token_accuracy": 0.21145387589931489, "num_tokens": 49378670.0, "step": 21530 }, { "entropy": 5.120734691619873, "epoch": 2.068683957732949, "grad_norm": 1.296875, "learning_rate": 0.00045736676995566244, "loss": 4.7891, "mean_token_accuracy": 0.22941143959760665, "num_tokens": 49390273.0, "step": 21535 }, { "entropy": 5.159012079238892, "epoch": 2.069164265129683, "grad_norm": 1.1953125, "learning_rate": 0.00045734668753600217, "loss": 4.8598, "mean_token_accuracy": 0.22018487006425858, "num_tokens": 49402503.0, "step": 21540 }, { "entropy": 5.173748779296875, "epoch": 2.0696445725264168, "grad_norm": 1.1875, "learning_rate": 0.00045732660088279326, "loss": 4.7962, "mean_token_accuracy": 0.2246626928448677, "num_tokens": 49413948.0, "step": 21545 }, { "entropy": 5.095520544052124, "epoch": 2.0701248799231506, "grad_norm": 1.1640625, "learning_rate": 0.00045730650999650216, "loss": 4.7589, "mean_token_accuracy": 0.23621760606765746, "num_tokens": 49424164.0, "step": 21550 }, { "entropy": 5.136573696136475, "epoch": 2.0706051873198845, "grad_norm": 1.390625, "learning_rate": 0.00045728641487759506, "loss": 4.9694, "mean_token_accuracy": 0.21803467869758605, "num_tokens": 49435102.0, "step": 21555 }, { "entropy": 5.109771871566773, "epoch": 2.071085494716619, "grad_norm": 1.09375, "learning_rate": 0.0004572663155265388, "loss": 4.7947, "mean_token_accuracy": 0.23301592767238616, "num_tokens": 49445445.0, "step": 21560 }, { "entropy": 5.155466461181641, "epoch": 2.0715658021133527, "grad_norm": 1.21875, "learning_rate": 0.0004572462119437999, "loss": 4.7728, "mean_token_accuracy": 0.229949714243412, "num_tokens": 49457578.0, "step": 21565 }, { "entropy": 5.060719394683838, "epoch": 2.0720461095100866, "grad_norm": 1.5546875, "learning_rate": 0.00045722610412984513, "loss": 4.7735, "mean_token_accuracy": 0.2294953465461731, "num_tokens": 49470253.0, "step": 21570 }, { "entropy": 5.164776754379273, "epoch": 2.0725264169068205, "grad_norm": 1.21875, "learning_rate": 0.0004572059920851412, "loss": 4.847, "mean_token_accuracy": 0.22464604824781417, "num_tokens": 49481854.0, "step": 21575 }, { "entropy": 5.15602068901062, "epoch": 2.0730067243035544, "grad_norm": 1.1328125, "learning_rate": 0.00045718587581015534, "loss": 4.8702, "mean_token_accuracy": 0.2171690970659256, "num_tokens": 49494311.0, "step": 21580 }, { "entropy": 5.178106212615967, "epoch": 2.0734870317002883, "grad_norm": 1.125, "learning_rate": 0.0004571657553053544, "loss": 4.8043, "mean_token_accuracy": 0.23305046260356904, "num_tokens": 49506186.0, "step": 21585 }, { "entropy": 5.129500436782837, "epoch": 2.073967339097022, "grad_norm": 1.2109375, "learning_rate": 0.0004571456305712055, "loss": 4.7951, "mean_token_accuracy": 0.22732842117547988, "num_tokens": 49517841.0, "step": 21590 }, { "entropy": 5.098537969589233, "epoch": 2.074447646493756, "grad_norm": 1.0703125, "learning_rate": 0.0004571255016081759, "loss": 4.7894, "mean_token_accuracy": 0.2313293009996414, "num_tokens": 49529260.0, "step": 21595 }, { "entropy": 5.134753608703614, "epoch": 2.07492795389049, "grad_norm": 1.1796875, "learning_rate": 0.000457105368416733, "loss": 4.7866, "mean_token_accuracy": 0.23531247079372405, "num_tokens": 49540892.0, "step": 21600 }, { "entropy": 5.188248872756958, "epoch": 2.0754082612872238, "grad_norm": 1.3671875, "learning_rate": 0.00045708523099734417, "loss": 4.8153, "mean_token_accuracy": 0.22493548393249513, "num_tokens": 49550924.0, "step": 21605 }, { "entropy": 5.116950845718383, "epoch": 2.0758885686839577, "grad_norm": 1.2421875, "learning_rate": 0.00045706508935047693, "loss": 4.8371, "mean_token_accuracy": 0.23132913410663605, "num_tokens": 49562424.0, "step": 21610 }, { "entropy": 5.108313131332397, "epoch": 2.0763688760806915, "grad_norm": 1.25, "learning_rate": 0.000457044943476599, "loss": 4.7706, "mean_token_accuracy": 0.23251599222421646, "num_tokens": 49573341.0, "step": 21615 }, { "entropy": 5.13428955078125, "epoch": 2.0768491834774254, "grad_norm": 1.09375, "learning_rate": 0.00045702479337617795, "loss": 4.7639, "mean_token_accuracy": 0.22919657826423645, "num_tokens": 49584047.0, "step": 21620 }, { "entropy": 5.056707668304443, "epoch": 2.0773294908741593, "grad_norm": 1.328125, "learning_rate": 0.0004570046390496818, "loss": 4.7705, "mean_token_accuracy": 0.22819400131702422, "num_tokens": 49595898.0, "step": 21625 }, { "entropy": 5.098710060119629, "epoch": 2.077809798270893, "grad_norm": 1.078125, "learning_rate": 0.0004569844804975783, "loss": 4.8347, "mean_token_accuracy": 0.22691741287708284, "num_tokens": 49607865.0, "step": 21630 }, { "entropy": 5.2283299446105955, "epoch": 2.0782901056676275, "grad_norm": 1.21875, "learning_rate": 0.0004569643177203356, "loss": 4.8626, "mean_token_accuracy": 0.22496672421693803, "num_tokens": 49619513.0, "step": 21635 }, { "entropy": 5.142520713806152, "epoch": 2.0787704130643614, "grad_norm": 1.296875, "learning_rate": 0.0004569441507184218, "loss": 4.7594, "mean_token_accuracy": 0.23643842935562134, "num_tokens": 49631232.0, "step": 21640 }, { "entropy": 5.1284263134002686, "epoch": 2.0792507204610953, "grad_norm": 1.1484375, "learning_rate": 0.00045692397949230495, "loss": 4.7922, "mean_token_accuracy": 0.22983661592006682, "num_tokens": 49642653.0, "step": 21645 }, { "entropy": 5.0891499519348145, "epoch": 2.079731027857829, "grad_norm": 1.0625, "learning_rate": 0.00045690380404245364, "loss": 4.746, "mean_token_accuracy": 0.2293478086590767, "num_tokens": 49654479.0, "step": 21650 }, { "entropy": 5.111334705352784, "epoch": 2.080211335254563, "grad_norm": 1.203125, "learning_rate": 0.00045688362436933607, "loss": 4.8206, "mean_token_accuracy": 0.2245978146791458, "num_tokens": 49666461.0, "step": 21655 }, { "entropy": 5.1269042015075685, "epoch": 2.080691642651297, "grad_norm": 1.171875, "learning_rate": 0.0004568634404734208, "loss": 4.8667, "mean_token_accuracy": 0.23144145607948302, "num_tokens": 49676602.0, "step": 21660 }, { "entropy": 5.069075059890747, "epoch": 2.081171950048031, "grad_norm": 1.125, "learning_rate": 0.0004568432523551765, "loss": 4.7765, "mean_token_accuracy": 0.2366631269454956, "num_tokens": 49689225.0, "step": 21665 }, { "entropy": 5.090816020965576, "epoch": 2.0816522574447647, "grad_norm": 1.171875, "learning_rate": 0.0004568230600150718, "loss": 4.8148, "mean_token_accuracy": 0.22771646976470947, "num_tokens": 49700827.0, "step": 21670 }, { "entropy": 5.17385630607605, "epoch": 2.0821325648414986, "grad_norm": 1.34375, "learning_rate": 0.0004568028634535757, "loss": 4.8407, "mean_token_accuracy": 0.23026363849639891, "num_tokens": 49712777.0, "step": 21675 }, { "entropy": 5.157884550094605, "epoch": 2.0826128722382324, "grad_norm": 1.1171875, "learning_rate": 0.0004567826626711568, "loss": 4.825, "mean_token_accuracy": 0.23216718733310698, "num_tokens": 49725234.0, "step": 21680 }, { "entropy": 5.0832091808319095, "epoch": 2.0830931796349663, "grad_norm": 1.1640625, "learning_rate": 0.0004567624576682843, "loss": 4.7638, "mean_token_accuracy": 0.23463573008775712, "num_tokens": 49736933.0, "step": 21685 }, { "entropy": 5.1686598777771, "epoch": 2.0835734870317, "grad_norm": 1.3515625, "learning_rate": 0.0004567422484454273, "loss": 4.8665, "mean_token_accuracy": 0.22514163851737976, "num_tokens": 49748279.0, "step": 21690 }, { "entropy": 5.040525722503662, "epoch": 2.084053794428434, "grad_norm": 1.265625, "learning_rate": 0.00045672203500305493, "loss": 4.7803, "mean_token_accuracy": 0.23218834549188613, "num_tokens": 49760874.0, "step": 21695 }, { "entropy": 5.063557481765747, "epoch": 2.084534101825168, "grad_norm": 1.1484375, "learning_rate": 0.00045670181734163654, "loss": 4.7046, "mean_token_accuracy": 0.2362649843096733, "num_tokens": 49772520.0, "step": 21700 }, { "entropy": 5.07045087814331, "epoch": 2.085014409221902, "grad_norm": 1.1640625, "learning_rate": 0.0004566815954616414, "loss": 4.7643, "mean_token_accuracy": 0.2305879309773445, "num_tokens": 49783019.0, "step": 21705 }, { "entropy": 5.038512563705444, "epoch": 2.0854947166186357, "grad_norm": 1.7890625, "learning_rate": 0.00045666136936353913, "loss": 4.7758, "mean_token_accuracy": 0.22893125116825103, "num_tokens": 49795111.0, "step": 21710 }, { "entropy": 5.105225610733032, "epoch": 2.08597502401537, "grad_norm": 1.0859375, "learning_rate": 0.0004566411390477993, "loss": 4.7396, "mean_token_accuracy": 0.2333238035440445, "num_tokens": 49807022.0, "step": 21715 }, { "entropy": 5.177683877944946, "epoch": 2.086455331412104, "grad_norm": 1.359375, "learning_rate": 0.00045662090451489156, "loss": 4.838, "mean_token_accuracy": 0.2248334839940071, "num_tokens": 49819225.0, "step": 21720 }, { "entropy": 5.13505277633667, "epoch": 2.086935638808838, "grad_norm": 1.0859375, "learning_rate": 0.00045660066576528577, "loss": 4.7901, "mean_token_accuracy": 0.2331462487578392, "num_tokens": 49830989.0, "step": 21725 }, { "entropy": 5.134175109863281, "epoch": 2.0874159462055717, "grad_norm": 1.3515625, "learning_rate": 0.0004565804227994518, "loss": 4.7844, "mean_token_accuracy": 0.23327937126159667, "num_tokens": 49841618.0, "step": 21730 }, { "entropy": 5.0149389743804935, "epoch": 2.0878962536023056, "grad_norm": 1.1796875, "learning_rate": 0.0004565601756178595, "loss": 4.6533, "mean_token_accuracy": 0.23993728905916215, "num_tokens": 49851985.0, "step": 21735 }, { "entropy": 5.093489313125611, "epoch": 2.0883765609990395, "grad_norm": 1.09375, "learning_rate": 0.0004565399242209791, "loss": 4.866, "mean_token_accuracy": 0.22467585802078247, "num_tokens": 49863383.0, "step": 21740 }, { "entropy": 5.182189702987671, "epoch": 2.0888568683957733, "grad_norm": 1.0546875, "learning_rate": 0.0004565196686092807, "loss": 4.8218, "mean_token_accuracy": 0.23218757808208465, "num_tokens": 49875757.0, "step": 21745 }, { "entropy": 5.112382221221924, "epoch": 2.089337175792507, "grad_norm": 1.1171875, "learning_rate": 0.0004564994087832346, "loss": 4.7662, "mean_token_accuracy": 0.23386679738759994, "num_tokens": 49888419.0, "step": 21750 }, { "entropy": 5.132735776901245, "epoch": 2.089817483189241, "grad_norm": 1.4296875, "learning_rate": 0.00045647914474331123, "loss": 4.8519, "mean_token_accuracy": 0.22820329815149307, "num_tokens": 49899629.0, "step": 21755 }, { "entropy": 5.226576328277588, "epoch": 2.090297790585975, "grad_norm": 1.2890625, "learning_rate": 0.00045645887648998094, "loss": 4.8461, "mean_token_accuracy": 0.22697775065898895, "num_tokens": 49909858.0, "step": 21760 }, { "entropy": 5.0463744640350345, "epoch": 2.090778097982709, "grad_norm": 1.140625, "learning_rate": 0.00045643860402371433, "loss": 4.809, "mean_token_accuracy": 0.2285969987511635, "num_tokens": 49921305.0, "step": 21765 }, { "entropy": 5.13976731300354, "epoch": 2.0912584053794427, "grad_norm": 1.2109375, "learning_rate": 0.0004564183273449821, "loss": 4.8182, "mean_token_accuracy": 0.22704905718564988, "num_tokens": 49931966.0, "step": 21770 }, { "entropy": 5.118189096450806, "epoch": 2.0917387127761766, "grad_norm": 1.3359375, "learning_rate": 0.0004563980464542551, "loss": 4.7414, "mean_token_accuracy": 0.22773682326078415, "num_tokens": 49943781.0, "step": 21775 }, { "entropy": 5.141473865509033, "epoch": 2.0922190201729105, "grad_norm": 1.1640625, "learning_rate": 0.00045637776135200406, "loss": 4.7968, "mean_token_accuracy": 0.23308294266462326, "num_tokens": 49954149.0, "step": 21780 }, { "entropy": 5.181213665008545, "epoch": 2.0926993275696444, "grad_norm": 1.234375, "learning_rate": 0.0004563574720386999, "loss": 4.8447, "mean_token_accuracy": 0.2268218591809273, "num_tokens": 49965542.0, "step": 21785 }, { "entropy": 5.112164497375488, "epoch": 2.0931796349663783, "grad_norm": 1.09375, "learning_rate": 0.0004563371785148139, "loss": 4.7811, "mean_token_accuracy": 0.225405690073967, "num_tokens": 49977444.0, "step": 21790 }, { "entropy": 5.08275146484375, "epoch": 2.0936599423631126, "grad_norm": 1.234375, "learning_rate": 0.00045631688078081695, "loss": 4.7678, "mean_token_accuracy": 0.22847483456134796, "num_tokens": 49988409.0, "step": 21795 }, { "entropy": 5.13219313621521, "epoch": 2.0941402497598465, "grad_norm": 1.1484375, "learning_rate": 0.0004562965788371805, "loss": 4.8285, "mean_token_accuracy": 0.22423352152109147, "num_tokens": 49999639.0, "step": 21800 }, { "entropy": 5.11357626914978, "epoch": 2.0946205571565804, "grad_norm": 1.53125, "learning_rate": 0.0004562762726843758, "loss": 4.7884, "mean_token_accuracy": 0.22604774087667465, "num_tokens": 50010964.0, "step": 21805 }, { "entropy": 5.1882233142852785, "epoch": 2.0951008645533142, "grad_norm": 1.1015625, "learning_rate": 0.00045625596232287436, "loss": 4.9171, "mean_token_accuracy": 0.21435359567403794, "num_tokens": 50024487.0, "step": 21810 }, { "entropy": 5.275296449661255, "epoch": 2.095581171950048, "grad_norm": 1.328125, "learning_rate": 0.0004562356477531477, "loss": 4.9596, "mean_token_accuracy": 0.21690075546503068, "num_tokens": 50037202.0, "step": 21815 }, { "entropy": 5.1064393520355225, "epoch": 2.096061479346782, "grad_norm": 1.15625, "learning_rate": 0.0004562153289756674, "loss": 4.7115, "mean_token_accuracy": 0.23600296229124068, "num_tokens": 50048520.0, "step": 21820 }, { "entropy": 5.138644599914551, "epoch": 2.096541786743516, "grad_norm": 1.1875, "learning_rate": 0.0004561950059909053, "loss": 4.8322, "mean_token_accuracy": 0.22320448607206345, "num_tokens": 50058523.0, "step": 21825 }, { "entropy": 5.159434032440186, "epoch": 2.0970220941402498, "grad_norm": 1.1484375, "learning_rate": 0.0004561746787993332, "loss": 4.8459, "mean_token_accuracy": 0.22186700254678726, "num_tokens": 50070241.0, "step": 21830 }, { "entropy": 5.162479114532471, "epoch": 2.0975024015369836, "grad_norm": 1.2421875, "learning_rate": 0.00045615434740142307, "loss": 4.8536, "mean_token_accuracy": 0.22856236398220062, "num_tokens": 50081326.0, "step": 21835 }, { "entropy": 5.051312732696533, "epoch": 2.0979827089337175, "grad_norm": 1.1875, "learning_rate": 0.00045613401179764686, "loss": 4.7083, "mean_token_accuracy": 0.23097764253616332, "num_tokens": 50091767.0, "step": 21840 }, { "entropy": 5.1112017154693605, "epoch": 2.0984630163304514, "grad_norm": 1.1171875, "learning_rate": 0.00045611367198847676, "loss": 4.8492, "mean_token_accuracy": 0.22863307744264602, "num_tokens": 50103435.0, "step": 21845 }, { "entropy": 5.160291004180908, "epoch": 2.0989433237271853, "grad_norm": 1.125, "learning_rate": 0.000456093327974385, "loss": 4.8023, "mean_token_accuracy": 0.22593716233968736, "num_tokens": 50114127.0, "step": 21850 }, { "entropy": 5.141428422927857, "epoch": 2.099423631123919, "grad_norm": 1.3515625, "learning_rate": 0.0004560729797558438, "loss": 4.8195, "mean_token_accuracy": 0.23141436874866486, "num_tokens": 50126328.0, "step": 21855 }, { "entropy": 5.143086194992065, "epoch": 2.099903938520653, "grad_norm": 1.078125, "learning_rate": 0.0004560526273333259, "loss": 4.7925, "mean_token_accuracy": 0.2326791599392891, "num_tokens": 50137600.0, "step": 21860 }, { "entropy": 5.146692323684692, "epoch": 2.100384245917387, "grad_norm": 1.1640625, "learning_rate": 0.00045603227070730346, "loss": 4.861, "mean_token_accuracy": 0.222840116918087, "num_tokens": 50148711.0, "step": 21865 }, { "entropy": 5.104106807708741, "epoch": 2.1008645533141213, "grad_norm": 1.0625, "learning_rate": 0.00045601190987824933, "loss": 4.8259, "mean_token_accuracy": 0.21673232913017274, "num_tokens": 50161141.0, "step": 21870 }, { "entropy": 5.149733543395996, "epoch": 2.101344860710855, "grad_norm": 1.1796875, "learning_rate": 0.00045599154484663606, "loss": 4.8356, "mean_token_accuracy": 0.22281887978315354, "num_tokens": 50173145.0, "step": 21875 }, { "entropy": 5.15035605430603, "epoch": 2.101825168107589, "grad_norm": 1.0859375, "learning_rate": 0.00045597117561293663, "loss": 4.8121, "mean_token_accuracy": 0.22988341897726058, "num_tokens": 50184074.0, "step": 21880 }, { "entropy": 5.278981351852417, "epoch": 2.102305475504323, "grad_norm": 1.2109375, "learning_rate": 0.0004559508021776238, "loss": 5.0788, "mean_token_accuracy": 0.20862277299165727, "num_tokens": 50195004.0, "step": 21885 }, { "entropy": 5.090446996688843, "epoch": 2.1027857829010568, "grad_norm": 1.140625, "learning_rate": 0.0004559304245411707, "loss": 4.7646, "mean_token_accuracy": 0.23295176327228545, "num_tokens": 50205329.0, "step": 21890 }, { "entropy": 5.181223297119141, "epoch": 2.1032660902977907, "grad_norm": 1.0703125, "learning_rate": 0.00045591004270405044, "loss": 4.8294, "mean_token_accuracy": 0.22295380681753157, "num_tokens": 50218346.0, "step": 21895 }, { "entropy": 5.123899793624878, "epoch": 2.1037463976945245, "grad_norm": 1.1640625, "learning_rate": 0.0004558896566667361, "loss": 4.7986, "mean_token_accuracy": 0.23100828528404235, "num_tokens": 50229790.0, "step": 21900 }, { "entropy": 5.232024002075195, "epoch": 2.1042267050912584, "grad_norm": 1.125, "learning_rate": 0.00045586926642970113, "loss": 4.9623, "mean_token_accuracy": 0.21864116042852402, "num_tokens": 50240547.0, "step": 21905 }, { "entropy": 5.128908777236939, "epoch": 2.1047070124879923, "grad_norm": 1.1015625, "learning_rate": 0.0004558488719934188, "loss": 4.7698, "mean_token_accuracy": 0.23230497986078263, "num_tokens": 50251889.0, "step": 21910 }, { "entropy": 5.117974853515625, "epoch": 2.105187319884726, "grad_norm": 1.09375, "learning_rate": 0.0004558284733583627, "loss": 4.8416, "mean_token_accuracy": 0.2239149734377861, "num_tokens": 50263599.0, "step": 21915 }, { "entropy": 5.144348382949829, "epoch": 2.10566762728146, "grad_norm": 1.1484375, "learning_rate": 0.00045580807052500645, "loss": 4.8838, "mean_token_accuracy": 0.22369515597820283, "num_tokens": 50275645.0, "step": 21920 }, { "entropy": 5.09762921333313, "epoch": 2.106147934678194, "grad_norm": 1.1015625, "learning_rate": 0.0004557876634938236, "loss": 4.7573, "mean_token_accuracy": 0.23217075318098068, "num_tokens": 50287640.0, "step": 21925 }, { "entropy": 5.126234912872315, "epoch": 2.106628242074928, "grad_norm": 1.234375, "learning_rate": 0.0004557672522652881, "loss": 4.8627, "mean_token_accuracy": 0.22643891870975494, "num_tokens": 50299119.0, "step": 21930 }, { "entropy": 5.120940732955932, "epoch": 2.1071085494716617, "grad_norm": 1.1328125, "learning_rate": 0.0004557468368398738, "loss": 4.8007, "mean_token_accuracy": 0.22507584393024443, "num_tokens": 50311377.0, "step": 21935 }, { "entropy": 5.157046747207642, "epoch": 2.1075888568683956, "grad_norm": 1.1796875, "learning_rate": 0.0004557264172180546, "loss": 4.8418, "mean_token_accuracy": 0.2272740438580513, "num_tokens": 50323185.0, "step": 21940 }, { "entropy": 5.1661652565002445, "epoch": 2.1080691642651295, "grad_norm": 1.109375, "learning_rate": 0.0004557059934003046, "loss": 4.9028, "mean_token_accuracy": 0.22802554219961166, "num_tokens": 50333992.0, "step": 21945 }, { "entropy": 5.253039216995239, "epoch": 2.108549471661864, "grad_norm": 1.2265625, "learning_rate": 0.0004556855653870981, "loss": 4.8791, "mean_token_accuracy": 0.21829527467489243, "num_tokens": 50346087.0, "step": 21950 }, { "entropy": 5.206522464752197, "epoch": 2.1090297790585977, "grad_norm": 1.140625, "learning_rate": 0.0004556651331789092, "loss": 4.879, "mean_token_accuracy": 0.22323226183652878, "num_tokens": 50358274.0, "step": 21955 }, { "entropy": 5.15128870010376, "epoch": 2.1095100864553316, "grad_norm": 1.125, "learning_rate": 0.0004556446967762125, "loss": 4.7805, "mean_token_accuracy": 0.2334059163928032, "num_tokens": 50367784.0, "step": 21960 }, { "entropy": 5.178620862960815, "epoch": 2.1099903938520654, "grad_norm": 1.140625, "learning_rate": 0.00045562425617948226, "loss": 4.9087, "mean_token_accuracy": 0.2221836417913437, "num_tokens": 50377877.0, "step": 21965 }, { "entropy": 5.145533895492553, "epoch": 2.1104707012487993, "grad_norm": 1.1640625, "learning_rate": 0.00045560381138919315, "loss": 4.8704, "mean_token_accuracy": 0.22770289629697799, "num_tokens": 50391023.0, "step": 21970 }, { "entropy": 5.199602508544922, "epoch": 2.110951008645533, "grad_norm": 1.1171875, "learning_rate": 0.00045558336240581984, "loss": 4.852, "mean_token_accuracy": 0.2230261117219925, "num_tokens": 50404417.0, "step": 21975 }, { "entropy": 5.131056404113769, "epoch": 2.111431316042267, "grad_norm": 1.203125, "learning_rate": 0.00045556290922983705, "loss": 4.8492, "mean_token_accuracy": 0.22247645556926726, "num_tokens": 50416153.0, "step": 21980 }, { "entropy": 5.08565092086792, "epoch": 2.111911623439001, "grad_norm": 1.09375, "learning_rate": 0.0004555424518617197, "loss": 4.8037, "mean_token_accuracy": 0.23319306671619416, "num_tokens": 50426860.0, "step": 21985 }, { "entropy": 5.109177160263061, "epoch": 2.112391930835735, "grad_norm": 1.09375, "learning_rate": 0.00045552199030194274, "loss": 4.7496, "mean_token_accuracy": 0.2333011209964752, "num_tokens": 50437262.0, "step": 21990 }, { "entropy": 5.113007116317749, "epoch": 2.1128722382324687, "grad_norm": 1.1796875, "learning_rate": 0.00045550152455098113, "loss": 4.7431, "mean_token_accuracy": 0.23262507021427153, "num_tokens": 50448564.0, "step": 21995 }, { "entropy": 5.070738649368286, "epoch": 2.1133525456292026, "grad_norm": 1.1484375, "learning_rate": 0.0004554810546093102, "loss": 4.7446, "mean_token_accuracy": 0.2425445109605789, "num_tokens": 50459416.0, "step": 22000 }, { "entropy": 5.15314302444458, "epoch": 2.1138328530259365, "grad_norm": 1.296875, "learning_rate": 0.000455460580477405, "loss": 4.8402, "mean_token_accuracy": 0.2306630253791809, "num_tokens": 50470934.0, "step": 22005 }, { "entropy": 5.205999040603638, "epoch": 2.1143131604226704, "grad_norm": 1.21875, "learning_rate": 0.000455440102155741, "loss": 4.8454, "mean_token_accuracy": 0.22617195397615433, "num_tokens": 50483050.0, "step": 22010 }, { "entropy": 5.158279943466186, "epoch": 2.1147934678194042, "grad_norm": 1.1875, "learning_rate": 0.0004554196196447937, "loss": 4.87, "mean_token_accuracy": 0.21700138747692108, "num_tokens": 50494655.0, "step": 22015 }, { "entropy": 5.155607461929321, "epoch": 2.115273775216138, "grad_norm": 1.171875, "learning_rate": 0.0004553991329450385, "loss": 4.8006, "mean_token_accuracy": 0.22141497135162352, "num_tokens": 50506302.0, "step": 22020 }, { "entropy": 5.169172048568726, "epoch": 2.115754082612872, "grad_norm": 1.0625, "learning_rate": 0.00045537864205695116, "loss": 4.8307, "mean_token_accuracy": 0.2262236014008522, "num_tokens": 50517979.0, "step": 22025 }, { "entropy": 5.175928783416748, "epoch": 2.1162343900096063, "grad_norm": 1.2578125, "learning_rate": 0.0004553581469810073, "loss": 4.8986, "mean_token_accuracy": 0.21773719787597656, "num_tokens": 50530054.0, "step": 22030 }, { "entropy": 5.101979446411133, "epoch": 2.11671469740634, "grad_norm": 1.1484375, "learning_rate": 0.00045533764771768287, "loss": 4.7949, "mean_token_accuracy": 0.22100536227226258, "num_tokens": 50540664.0, "step": 22035 }, { "entropy": 5.193927669525147, "epoch": 2.117195004803074, "grad_norm": 1.1328125, "learning_rate": 0.00045531714426745373, "loss": 4.8578, "mean_token_accuracy": 0.2234180748462677, "num_tokens": 50551444.0, "step": 22040 }, { "entropy": 5.0862926006317135, "epoch": 2.117675312199808, "grad_norm": 1.2109375, "learning_rate": 0.0004552966366307959, "loss": 4.7849, "mean_token_accuracy": 0.23260476291179658, "num_tokens": 50562371.0, "step": 22045 }, { "entropy": 5.075074005126953, "epoch": 2.118155619596542, "grad_norm": 1.1328125, "learning_rate": 0.0004552761248081856, "loss": 4.7629, "mean_token_accuracy": 0.22897567898035048, "num_tokens": 50573545.0, "step": 22050 }, { "entropy": 5.192044448852539, "epoch": 2.1186359269932757, "grad_norm": 1.0546875, "learning_rate": 0.000455255608800099, "loss": 4.9045, "mean_token_accuracy": 0.21907887905836104, "num_tokens": 50584799.0, "step": 22055 }, { "entropy": 5.175659608840943, "epoch": 2.1191162343900096, "grad_norm": 1.1953125, "learning_rate": 0.00045523508860701237, "loss": 4.8346, "mean_token_accuracy": 0.22237366437911987, "num_tokens": 50596435.0, "step": 22060 }, { "entropy": 5.097451782226562, "epoch": 2.1195965417867435, "grad_norm": 1.1171875, "learning_rate": 0.0004552145642294021, "loss": 4.7367, "mean_token_accuracy": 0.2358742281794548, "num_tokens": 50606921.0, "step": 22065 }, { "entropy": 5.129500150680542, "epoch": 2.1200768491834774, "grad_norm": 1.21875, "learning_rate": 0.00045519403566774493, "loss": 4.9222, "mean_token_accuracy": 0.21600277125835418, "num_tokens": 50618107.0, "step": 22070 }, { "entropy": 5.139066696166992, "epoch": 2.1205571565802113, "grad_norm": 1.09375, "learning_rate": 0.0004551735029225172, "loss": 4.8115, "mean_token_accuracy": 0.2253260374069214, "num_tokens": 50628958.0, "step": 22075 }, { "entropy": 5.1106373310089115, "epoch": 2.121037463976945, "grad_norm": 1.078125, "learning_rate": 0.00045515296599419583, "loss": 4.79, "mean_token_accuracy": 0.22253842353820802, "num_tokens": 50640557.0, "step": 22080 }, { "entropy": 5.1263810157775875, "epoch": 2.121517771373679, "grad_norm": 1.1328125, "learning_rate": 0.0004551324248832574, "loss": 4.8494, "mean_token_accuracy": 0.2251705527305603, "num_tokens": 50652699.0, "step": 22085 }, { "entropy": 5.118668031692505, "epoch": 2.121998078770413, "grad_norm": 1.203125, "learning_rate": 0.0004551118795901791, "loss": 4.7231, "mean_token_accuracy": 0.2325097680091858, "num_tokens": 50663735.0, "step": 22090 }, { "entropy": 5.100133562088013, "epoch": 2.122478386167147, "grad_norm": 1.078125, "learning_rate": 0.0004550913301154376, "loss": 4.8066, "mean_token_accuracy": 0.2283138006925583, "num_tokens": 50676384.0, "step": 22095 }, { "entropy": 5.165930652618409, "epoch": 2.1229586935638807, "grad_norm": 1.1953125, "learning_rate": 0.0004550707764595103, "loss": 4.8467, "mean_token_accuracy": 0.22226257771253585, "num_tokens": 50687775.0, "step": 22100 }, { "entropy": 5.177178525924683, "epoch": 2.123439000960615, "grad_norm": 1.1484375, "learning_rate": 0.00045505021862287434, "loss": 4.8929, "mean_token_accuracy": 0.2158343955874443, "num_tokens": 50698522.0, "step": 22105 }, { "entropy": 5.11175765991211, "epoch": 2.123919308357349, "grad_norm": 1.1953125, "learning_rate": 0.00045502965660600684, "loss": 4.7975, "mean_token_accuracy": 0.2320536717772484, "num_tokens": 50709994.0, "step": 22110 }, { "entropy": 5.087244272232056, "epoch": 2.1243996157540828, "grad_norm": 1.1796875, "learning_rate": 0.0004550090904093853, "loss": 4.7694, "mean_token_accuracy": 0.22834665477275848, "num_tokens": 50720777.0, "step": 22115 }, { "entropy": 5.1074995517730715, "epoch": 2.1248799231508166, "grad_norm": 1.1015625, "learning_rate": 0.0004549885200334872, "loss": 4.7568, "mean_token_accuracy": 0.23058853149414063, "num_tokens": 50732836.0, "step": 22120 }, { "entropy": 5.137260675430298, "epoch": 2.1253602305475505, "grad_norm": 1.1640625, "learning_rate": 0.0004549679454787901, "loss": 4.8156, "mean_token_accuracy": 0.2311272978782654, "num_tokens": 50743854.0, "step": 22125 }, { "entropy": 5.062818193435669, "epoch": 2.1258405379442844, "grad_norm": 1.2265625, "learning_rate": 0.00045494736674577175, "loss": 4.7465, "mean_token_accuracy": 0.230141381919384, "num_tokens": 50755524.0, "step": 22130 }, { "entropy": 5.133535289764405, "epoch": 2.1263208453410183, "grad_norm": 1.1796875, "learning_rate": 0.0004549267838349099, "loss": 4.7783, "mean_token_accuracy": 0.23211053013801575, "num_tokens": 50766909.0, "step": 22135 }, { "entropy": 5.0793849468231205, "epoch": 2.126801152737752, "grad_norm": 1.1328125, "learning_rate": 0.0004549061967466823, "loss": 4.7333, "mean_token_accuracy": 0.23264259546995164, "num_tokens": 50778065.0, "step": 22140 }, { "entropy": 5.14521746635437, "epoch": 2.127281460134486, "grad_norm": 1.125, "learning_rate": 0.0004548856054815671, "loss": 4.8307, "mean_token_accuracy": 0.23031747192144394, "num_tokens": 50790122.0, "step": 22145 }, { "entropy": 5.094038772583008, "epoch": 2.12776176753122, "grad_norm": 1.1796875, "learning_rate": 0.00045486501004004225, "loss": 4.8313, "mean_token_accuracy": 0.2208801105618477, "num_tokens": 50802661.0, "step": 22150 }, { "entropy": 5.153134393692016, "epoch": 2.128242074927954, "grad_norm": 1.09375, "learning_rate": 0.000454844410422586, "loss": 4.8698, "mean_token_accuracy": 0.22763815373182297, "num_tokens": 50814279.0, "step": 22155 }, { "entropy": 5.065900707244873, "epoch": 2.1287223823246877, "grad_norm": 1.078125, "learning_rate": 0.00045482380662967655, "loss": 4.7076, "mean_token_accuracy": 0.23367461413145066, "num_tokens": 50826186.0, "step": 22160 }, { "entropy": 5.157932567596435, "epoch": 2.1292026897214216, "grad_norm": 1.078125, "learning_rate": 0.0004548031986617923, "loss": 4.8323, "mean_token_accuracy": 0.21744155138731003, "num_tokens": 50838185.0, "step": 22165 }, { "entropy": 5.084433650970459, "epoch": 2.1296829971181555, "grad_norm": 1.1328125, "learning_rate": 0.0004547825865194117, "loss": 4.7825, "mean_token_accuracy": 0.23202351927757264, "num_tokens": 50850461.0, "step": 22170 }, { "entropy": 5.170603656768799, "epoch": 2.1301633045148893, "grad_norm": 1.2109375, "learning_rate": 0.00045476197020301323, "loss": 4.8724, "mean_token_accuracy": 0.22438560724258422, "num_tokens": 50861859.0, "step": 22175 }, { "entropy": 5.137531900405884, "epoch": 2.1306436119116237, "grad_norm": 1.2421875, "learning_rate": 0.00045474134971307554, "loss": 4.7638, "mean_token_accuracy": 0.2304867058992386, "num_tokens": 50872776.0, "step": 22180 }, { "entropy": 5.075492095947266, "epoch": 2.1311239193083575, "grad_norm": 1.171875, "learning_rate": 0.0004547207250500775, "loss": 4.7869, "mean_token_accuracy": 0.22913870215415955, "num_tokens": 50885333.0, "step": 22185 }, { "entropy": 5.1370524883270265, "epoch": 2.1316042267050914, "grad_norm": 1.140625, "learning_rate": 0.000454700096214498, "loss": 4.7838, "mean_token_accuracy": 0.22217728048563004, "num_tokens": 50897256.0, "step": 22190 }, { "entropy": 5.099877500534058, "epoch": 2.1320845341018253, "grad_norm": 1.1953125, "learning_rate": 0.00045467946320681567, "loss": 4.7826, "mean_token_accuracy": 0.22619348019361496, "num_tokens": 50909558.0, "step": 22195 }, { "entropy": 5.117943239212036, "epoch": 2.132564841498559, "grad_norm": 1.09375, "learning_rate": 0.0004546588260275098, "loss": 4.818, "mean_token_accuracy": 0.2298060894012451, "num_tokens": 50920617.0, "step": 22200 }, { "entropy": 5.107833623886108, "epoch": 2.133045148895293, "grad_norm": 1.1015625, "learning_rate": 0.00045463818467705955, "loss": 4.7844, "mean_token_accuracy": 0.22848994582891463, "num_tokens": 50931323.0, "step": 22205 }, { "entropy": 5.100920295715332, "epoch": 2.133525456292027, "grad_norm": 1.1953125, "learning_rate": 0.000454617539155944, "loss": 4.6748, "mean_token_accuracy": 0.23774074912071227, "num_tokens": 50942798.0, "step": 22210 }, { "entropy": 5.047171878814697, "epoch": 2.134005763688761, "grad_norm": 1.1171875, "learning_rate": 0.00045459688946464255, "loss": 4.6721, "mean_token_accuracy": 0.2314303919672966, "num_tokens": 50954877.0, "step": 22215 }, { "entropy": 5.193651580810547, "epoch": 2.1344860710854947, "grad_norm": 1.140625, "learning_rate": 0.0004545762356036346, "loss": 4.7615, "mean_token_accuracy": 0.22914791703224183, "num_tokens": 50966133.0, "step": 22220 }, { "entropy": 5.151372289657592, "epoch": 2.1349663784822286, "grad_norm": 1.1171875, "learning_rate": 0.0004545555775733998, "loss": 4.7618, "mean_token_accuracy": 0.23861754089593887, "num_tokens": 50977334.0, "step": 22225 }, { "entropy": 5.105147123336792, "epoch": 2.1354466858789625, "grad_norm": 1.125, "learning_rate": 0.00045453491537441747, "loss": 4.8307, "mean_token_accuracy": 0.2180767059326172, "num_tokens": 50988743.0, "step": 22230 }, { "entropy": 5.126622343063355, "epoch": 2.1359269932756964, "grad_norm": 1.1015625, "learning_rate": 0.00045451424900716763, "loss": 4.7816, "mean_token_accuracy": 0.2273872137069702, "num_tokens": 51000591.0, "step": 22235 }, { "entropy": 5.193524265289307, "epoch": 2.1364073006724302, "grad_norm": 1.1484375, "learning_rate": 0.00045449357847212994, "loss": 4.9755, "mean_token_accuracy": 0.2240106552839279, "num_tokens": 51011565.0, "step": 22240 }, { "entropy": 5.134286689758301, "epoch": 2.136887608069164, "grad_norm": 1.234375, "learning_rate": 0.0004544729037697844, "loss": 4.8427, "mean_token_accuracy": 0.22441424876451493, "num_tokens": 51022755.0, "step": 22245 }, { "entropy": 5.170749378204346, "epoch": 2.137367915465898, "grad_norm": 1.2421875, "learning_rate": 0.00045445222490061093, "loss": 4.8751, "mean_token_accuracy": 0.22253476232290267, "num_tokens": 51032839.0, "step": 22250 }, { "entropy": 5.129157400131225, "epoch": 2.1378482228626323, "grad_norm": 1.1640625, "learning_rate": 0.0004544315418650897, "loss": 4.8233, "mean_token_accuracy": 0.22642728239297866, "num_tokens": 51045020.0, "step": 22255 }, { "entropy": 5.132080316543579, "epoch": 2.138328530259366, "grad_norm": 1.15625, "learning_rate": 0.0004544108546637008, "loss": 4.738, "mean_token_accuracy": 0.23691660314798355, "num_tokens": 51056765.0, "step": 22260 }, { "entropy": 5.070805454254151, "epoch": 2.1388088376561, "grad_norm": 1.125, "learning_rate": 0.0004543901632969247, "loss": 4.7495, "mean_token_accuracy": 0.2298893377184868, "num_tokens": 51069559.0, "step": 22265 }, { "entropy": 5.124291658401489, "epoch": 2.139289145052834, "grad_norm": 1.15625, "learning_rate": 0.00045436946776524157, "loss": 4.7954, "mean_token_accuracy": 0.22728511691093445, "num_tokens": 51080972.0, "step": 22270 }, { "entropy": 5.047629261016846, "epoch": 2.139769452449568, "grad_norm": 1.1328125, "learning_rate": 0.00045434876806913204, "loss": 4.7466, "mean_token_accuracy": 0.2311826914548874, "num_tokens": 51092101.0, "step": 22275 }, { "entropy": 5.180647182464599, "epoch": 2.1402497598463017, "grad_norm": 1.328125, "learning_rate": 0.0004543280642090767, "loss": 4.9278, "mean_token_accuracy": 0.2261410266160965, "num_tokens": 51103372.0, "step": 22280 }, { "entropy": 5.1281579494476315, "epoch": 2.1407300672430356, "grad_norm": 1.2109375, "learning_rate": 0.0004543073561855562, "loss": 4.8322, "mean_token_accuracy": 0.22201440036296843, "num_tokens": 51114856.0, "step": 22285 }, { "entropy": 5.22743821144104, "epoch": 2.1412103746397695, "grad_norm": 1.1484375, "learning_rate": 0.0004542866439990513, "loss": 4.9103, "mean_token_accuracy": 0.21878052353858948, "num_tokens": 51126354.0, "step": 22290 }, { "entropy": 5.209173345565796, "epoch": 2.1416906820365034, "grad_norm": 1.171875, "learning_rate": 0.0004542659276500429, "loss": 4.8647, "mean_token_accuracy": 0.22599587440490723, "num_tokens": 51138341.0, "step": 22295 }, { "entropy": 5.069614505767822, "epoch": 2.1421709894332372, "grad_norm": 1.1796875, "learning_rate": 0.00045424520713901204, "loss": 4.7487, "mean_token_accuracy": 0.23675140142440795, "num_tokens": 51150969.0, "step": 22300 }, { "entropy": 5.048203372955323, "epoch": 2.142651296829971, "grad_norm": 1.0703125, "learning_rate": 0.0004542244824664396, "loss": 4.7701, "mean_token_accuracy": 0.22264178842306137, "num_tokens": 51162199.0, "step": 22305 }, { "entropy": 5.094993686676025, "epoch": 2.143131604226705, "grad_norm": 1.078125, "learning_rate": 0.00045420375363280696, "loss": 4.8432, "mean_token_accuracy": 0.220883372426033, "num_tokens": 51173129.0, "step": 22310 }, { "entropy": 5.074619722366333, "epoch": 2.143611911623439, "grad_norm": 1.1484375, "learning_rate": 0.00045418302063859526, "loss": 4.7323, "mean_token_accuracy": 0.23448609113693236, "num_tokens": 51184593.0, "step": 22315 }, { "entropy": 5.133390474319458, "epoch": 2.1440922190201728, "grad_norm": 1.109375, "learning_rate": 0.00045416228348428583, "loss": 4.7951, "mean_token_accuracy": 0.23045875877141953, "num_tokens": 51196524.0, "step": 22320 }, { "entropy": 5.139809799194336, "epoch": 2.1445725264169067, "grad_norm": 1.0546875, "learning_rate": 0.00045414154217036023, "loss": 4.8641, "mean_token_accuracy": 0.2281458020210266, "num_tokens": 51209307.0, "step": 22325 }, { "entropy": 5.067409896850586, "epoch": 2.1450528338136405, "grad_norm": 1.1796875, "learning_rate": 0.00045412079669730006, "loss": 4.7651, "mean_token_accuracy": 0.23632195293903352, "num_tokens": 51220919.0, "step": 22330 }, { "entropy": 5.133350276947022, "epoch": 2.1455331412103744, "grad_norm": 1.1484375, "learning_rate": 0.0004541000470655867, "loss": 4.7202, "mean_token_accuracy": 0.23960795998573303, "num_tokens": 51232930.0, "step": 22335 }, { "entropy": 5.068939113616944, "epoch": 2.1460134486071087, "grad_norm": 1.15625, "learning_rate": 0.00045407929327570215, "loss": 4.8146, "mean_token_accuracy": 0.23344950377941132, "num_tokens": 51244697.0, "step": 22340 }, { "entropy": 5.100889825820923, "epoch": 2.1464937560038426, "grad_norm": 1.1328125, "learning_rate": 0.0004540585353281282, "loss": 4.8058, "mean_token_accuracy": 0.23310038298368455, "num_tokens": 51255333.0, "step": 22345 }, { "entropy": 5.168522882461548, "epoch": 2.1469740634005765, "grad_norm": 1.234375, "learning_rate": 0.0004540377732233467, "loss": 4.8088, "mean_token_accuracy": 0.22492084354162217, "num_tokens": 51267649.0, "step": 22350 }, { "entropy": 5.118089246749878, "epoch": 2.1474543707973104, "grad_norm": 1.1484375, "learning_rate": 0.0004540170069618397, "loss": 4.7318, "mean_token_accuracy": 0.22790632545948028, "num_tokens": 51279576.0, "step": 22355 }, { "entropy": 5.149295806884766, "epoch": 2.1479346781940443, "grad_norm": 1.1953125, "learning_rate": 0.00045399623654408946, "loss": 4.8046, "mean_token_accuracy": 0.22683072388172149, "num_tokens": 51290974.0, "step": 22360 }, { "entropy": 5.23675651550293, "epoch": 2.148414985590778, "grad_norm": 1.21875, "learning_rate": 0.000453975461970578, "loss": 4.8662, "mean_token_accuracy": 0.22433996796607972, "num_tokens": 51301405.0, "step": 22365 }, { "entropy": 5.199128198623657, "epoch": 2.148895292987512, "grad_norm": 1.1484375, "learning_rate": 0.0004539546832417879, "loss": 4.9317, "mean_token_accuracy": 0.21655915826559066, "num_tokens": 51313158.0, "step": 22370 }, { "entropy": 5.159457111358643, "epoch": 2.149375600384246, "grad_norm": 1.125, "learning_rate": 0.00045393390035820136, "loss": 4.8091, "mean_token_accuracy": 0.238103286921978, "num_tokens": 51323789.0, "step": 22375 }, { "entropy": 5.081663799285889, "epoch": 2.14985590778098, "grad_norm": 1.1328125, "learning_rate": 0.000453913113320301, "loss": 4.83, "mean_token_accuracy": 0.22566504627466202, "num_tokens": 51334864.0, "step": 22380 }, { "entropy": 5.1508691787719725, "epoch": 2.1503362151777137, "grad_norm": 1.1875, "learning_rate": 0.0004538923221285694, "loss": 4.7237, "mean_token_accuracy": 0.22757074534893035, "num_tokens": 51345473.0, "step": 22385 }, { "entropy": 5.209955358505249, "epoch": 2.1508165225744476, "grad_norm": 1.1484375, "learning_rate": 0.0004538715267834893, "loss": 4.883, "mean_token_accuracy": 0.22349812388420104, "num_tokens": 51356755.0, "step": 22390 }, { "entropy": 5.011656999588013, "epoch": 2.1512968299711814, "grad_norm": 1.125, "learning_rate": 0.0004538507272855434, "loss": 4.6891, "mean_token_accuracy": 0.2348538041114807, "num_tokens": 51368351.0, "step": 22395 }, { "entropy": 5.162692880630493, "epoch": 2.1517771373679153, "grad_norm": 1.1875, "learning_rate": 0.00045382992363521486, "loss": 4.9466, "mean_token_accuracy": 0.22023718655109406, "num_tokens": 51379144.0, "step": 22400 }, { "entropy": 5.127411413192749, "epoch": 2.152257444764649, "grad_norm": 1.21875, "learning_rate": 0.00045380911583298633, "loss": 4.7643, "mean_token_accuracy": 0.23007805794477462, "num_tokens": 51390732.0, "step": 22405 }, { "entropy": 5.080131578445434, "epoch": 2.152737752161383, "grad_norm": 1.3515625, "learning_rate": 0.00045378830387934123, "loss": 4.7175, "mean_token_accuracy": 0.2360814943909645, "num_tokens": 51401185.0, "step": 22410 }, { "entropy": 5.140558958053589, "epoch": 2.1532180595581174, "grad_norm": 1.125, "learning_rate": 0.0004537674877747626, "loss": 4.8385, "mean_token_accuracy": 0.22309084683656694, "num_tokens": 51412598.0, "step": 22415 }, { "entropy": 5.099089097976685, "epoch": 2.1536983669548513, "grad_norm": 1.3515625, "learning_rate": 0.00045374666751973365, "loss": 4.7378, "mean_token_accuracy": 0.23137754797935486, "num_tokens": 51423248.0, "step": 22420 }, { "entropy": 5.034801387786866, "epoch": 2.154178674351585, "grad_norm": 1.09375, "learning_rate": 0.00045372584311473784, "loss": 4.6803, "mean_token_accuracy": 0.24379053264856337, "num_tokens": 51433862.0, "step": 22425 }, { "entropy": 5.033099889755249, "epoch": 2.154658981748319, "grad_norm": 1.375, "learning_rate": 0.0004537050145602587, "loss": 4.7913, "mean_token_accuracy": 0.23027238994836807, "num_tokens": 51445311.0, "step": 22430 }, { "entropy": 5.064621496200561, "epoch": 2.155139289145053, "grad_norm": 1.1640625, "learning_rate": 0.0004536841818567798, "loss": 4.6955, "mean_token_accuracy": 0.24127983748912812, "num_tokens": 51456550.0, "step": 22435 }, { "entropy": 5.14633994102478, "epoch": 2.155619596541787, "grad_norm": 1.171875, "learning_rate": 0.0004536633450047847, "loss": 4.8075, "mean_token_accuracy": 0.22890851646661758, "num_tokens": 51467645.0, "step": 22440 }, { "entropy": 5.107261705398559, "epoch": 2.1560999039385207, "grad_norm": 1.28125, "learning_rate": 0.00045364250400475734, "loss": 4.7765, "mean_token_accuracy": 0.23856985867023467, "num_tokens": 51478275.0, "step": 22445 }, { "entropy": 5.05497145652771, "epoch": 2.1565802113352546, "grad_norm": 1.25, "learning_rate": 0.0004536216588571814, "loss": 4.746, "mean_token_accuracy": 0.23648817390203475, "num_tokens": 51489634.0, "step": 22450 }, { "entropy": 5.086511611938477, "epoch": 2.1570605187319885, "grad_norm": 1.3984375, "learning_rate": 0.000453600809562541, "loss": 4.7761, "mean_token_accuracy": 0.2348331943154335, "num_tokens": 51500245.0, "step": 22455 }, { "entropy": 5.160480165481568, "epoch": 2.1575408261287223, "grad_norm": 1.3046875, "learning_rate": 0.0004535799561213202, "loss": 4.7828, "mean_token_accuracy": 0.23081537187099457, "num_tokens": 51511996.0, "step": 22460 }, { "entropy": 5.074454116821289, "epoch": 2.158021133525456, "grad_norm": 1.078125, "learning_rate": 0.000453559098534003, "loss": 4.7398, "mean_token_accuracy": 0.23446240425109863, "num_tokens": 51523522.0, "step": 22465 }, { "entropy": 5.158469390869141, "epoch": 2.15850144092219, "grad_norm": 1.25, "learning_rate": 0.0004535382368010738, "loss": 4.7873, "mean_token_accuracy": 0.23328575044870375, "num_tokens": 51535261.0, "step": 22470 }, { "entropy": 5.031622505187988, "epoch": 2.158981748318924, "grad_norm": 1.1875, "learning_rate": 0.00045351737092301676, "loss": 4.732, "mean_token_accuracy": 0.23541125059127807, "num_tokens": 51546965.0, "step": 22475 }, { "entropy": 5.092594337463379, "epoch": 2.159462055715658, "grad_norm": 1.1484375, "learning_rate": 0.0004534965009003165, "loss": 4.7498, "mean_token_accuracy": 0.23316184133291246, "num_tokens": 51558976.0, "step": 22480 }, { "entropy": 5.18133511543274, "epoch": 2.1599423631123917, "grad_norm": 1.1171875, "learning_rate": 0.0004534756267334576, "loss": 4.8635, "mean_token_accuracy": 0.22220516204833984, "num_tokens": 51571625.0, "step": 22485 }, { "entropy": 5.218547773361206, "epoch": 2.160422670509126, "grad_norm": 1.234375, "learning_rate": 0.00045345474842292455, "loss": 4.8676, "mean_token_accuracy": 0.2245472252368927, "num_tokens": 51582732.0, "step": 22490 }, { "entropy": 5.110941934585571, "epoch": 2.16090297790586, "grad_norm": 1.078125, "learning_rate": 0.0004534338659692022, "loss": 4.7882, "mean_token_accuracy": 0.22675420343875885, "num_tokens": 51594346.0, "step": 22495 }, { "entropy": 5.1038251399993895, "epoch": 2.161383285302594, "grad_norm": 1.375, "learning_rate": 0.0004534129793727753, "loss": 4.7964, "mean_token_accuracy": 0.23224691152572632, "num_tokens": 51606909.0, "step": 22500 }, { "entropy": 5.046190071105957, "epoch": 2.1618635926993277, "grad_norm": 1.1953125, "learning_rate": 0.0004533920886341288, "loss": 4.7774, "mean_token_accuracy": 0.2357572838664055, "num_tokens": 51617596.0, "step": 22505 }, { "entropy": 5.081631660461426, "epoch": 2.1623439000960616, "grad_norm": 1.1640625, "learning_rate": 0.0004533711937537477, "loss": 4.6884, "mean_token_accuracy": 0.23987720012664795, "num_tokens": 51627573.0, "step": 22510 }, { "entropy": 5.03942461013794, "epoch": 2.1628242074927955, "grad_norm": 1.1796875, "learning_rate": 0.0004533502947321171, "loss": 4.7468, "mean_token_accuracy": 0.22461321353912353, "num_tokens": 51640070.0, "step": 22515 }, { "entropy": 5.15150785446167, "epoch": 2.1633045148895294, "grad_norm": 1.1875, "learning_rate": 0.0004533293915697223, "loss": 4.6963, "mean_token_accuracy": 0.2359900563955307, "num_tokens": 51651139.0, "step": 22520 }, { "entropy": 5.16046838760376, "epoch": 2.1637848222862632, "grad_norm": 1.25, "learning_rate": 0.00045330848426704853, "loss": 4.8447, "mean_token_accuracy": 0.2310606837272644, "num_tokens": 51661893.0, "step": 22525 }, { "entropy": 5.088311004638672, "epoch": 2.164265129682997, "grad_norm": 1.1953125, "learning_rate": 0.0004532875728245813, "loss": 4.7932, "mean_token_accuracy": 0.22725322842597961, "num_tokens": 51674266.0, "step": 22530 }, { "entropy": 5.105494022369385, "epoch": 2.164745437079731, "grad_norm": 1.2265625, "learning_rate": 0.00045326665724280594, "loss": 4.812, "mean_token_accuracy": 0.23540204167366027, "num_tokens": 51685450.0, "step": 22535 }, { "entropy": 5.1548017978668215, "epoch": 2.165225744476465, "grad_norm": 1.1484375, "learning_rate": 0.00045324573752220814, "loss": 4.8023, "mean_token_accuracy": 0.2320707470178604, "num_tokens": 51696779.0, "step": 22540 }, { "entropy": 5.172539901733399, "epoch": 2.1657060518731988, "grad_norm": 1.140625, "learning_rate": 0.00045322481366327365, "loss": 4.8312, "mean_token_accuracy": 0.2289508491754532, "num_tokens": 51709387.0, "step": 22545 }, { "entropy": 5.096820926666259, "epoch": 2.1661863592699326, "grad_norm": 1.296875, "learning_rate": 0.0004532038856664882, "loss": 4.7993, "mean_token_accuracy": 0.2279914915561676, "num_tokens": 51720978.0, "step": 22550 }, { "entropy": 5.249212408065796, "epoch": 2.1666666666666665, "grad_norm": 1.25, "learning_rate": 0.0004531829535323376, "loss": 4.9049, "mean_token_accuracy": 0.220039102435112, "num_tokens": 51733510.0, "step": 22555 }, { "entropy": 5.079677438735962, "epoch": 2.1671469740634004, "grad_norm": 1.1640625, "learning_rate": 0.000453162017261308, "loss": 4.7079, "mean_token_accuracy": 0.2436806619167328, "num_tokens": 51743660.0, "step": 22560 }, { "entropy": 5.074145793914795, "epoch": 2.1676272814601343, "grad_norm": 1.125, "learning_rate": 0.0004531410768538854, "loss": 4.7453, "mean_token_accuracy": 0.23445709496736528, "num_tokens": 51754592.0, "step": 22565 }, { "entropy": 5.062536191940308, "epoch": 2.168107588856868, "grad_norm": 1.078125, "learning_rate": 0.00045312013231055596, "loss": 4.7404, "mean_token_accuracy": 0.23631888031959533, "num_tokens": 51766241.0, "step": 22570 }, { "entropy": 5.134042263031006, "epoch": 2.1685878962536025, "grad_norm": 1.1484375, "learning_rate": 0.00045309918363180593, "loss": 4.7631, "mean_token_accuracy": 0.2316461443901062, "num_tokens": 51777718.0, "step": 22575 }, { "entropy": 4.987406969070435, "epoch": 2.1690682036503364, "grad_norm": 1.1953125, "learning_rate": 0.00045307823081812166, "loss": 4.6724, "mean_token_accuracy": 0.24229834973812103, "num_tokens": 51788290.0, "step": 22580 }, { "entropy": 5.032321119308472, "epoch": 2.1695485110470702, "grad_norm": 1.296875, "learning_rate": 0.00045305727386998977, "loss": 4.7373, "mean_token_accuracy": 0.22974208891391754, "num_tokens": 51800080.0, "step": 22585 }, { "entropy": 5.1141222476959225, "epoch": 2.170028818443804, "grad_norm": 1.1875, "learning_rate": 0.0004530363127878966, "loss": 4.7809, "mean_token_accuracy": 0.2325947716832161, "num_tokens": 51810155.0, "step": 22590 }, { "entropy": 5.146018838882446, "epoch": 2.170509125840538, "grad_norm": 1.1015625, "learning_rate": 0.00045301534757232885, "loss": 4.8255, "mean_token_accuracy": 0.23102711737155915, "num_tokens": 51823082.0, "step": 22595 }, { "entropy": 5.156763553619385, "epoch": 2.170989433237272, "grad_norm": 1.078125, "learning_rate": 0.0004529943782237735, "loss": 4.8598, "mean_token_accuracy": 0.22403838038444518, "num_tokens": 51833280.0, "step": 22600 }, { "entropy": 5.114382219314575, "epoch": 2.1714697406340058, "grad_norm": 1.0859375, "learning_rate": 0.00045297340474271717, "loss": 4.7748, "mean_token_accuracy": 0.22729934453964235, "num_tokens": 51845105.0, "step": 22605 }, { "entropy": 5.13478045463562, "epoch": 2.1719500480307397, "grad_norm": 1.0625, "learning_rate": 0.0004529524271296468, "loss": 4.758, "mean_token_accuracy": 0.23644569963216783, "num_tokens": 51856232.0, "step": 22610 }, { "entropy": 5.076405763626099, "epoch": 2.1724303554274735, "grad_norm": 1.1875, "learning_rate": 0.00045293144538504943, "loss": 4.779, "mean_token_accuracy": 0.23823365718126296, "num_tokens": 51866218.0, "step": 22615 }, { "entropy": 5.115446138381958, "epoch": 2.1729106628242074, "grad_norm": 1.1796875, "learning_rate": 0.0004529104595094124, "loss": 4.7947, "mean_token_accuracy": 0.22927693277597427, "num_tokens": 51877425.0, "step": 22620 }, { "entropy": 5.124441528320313, "epoch": 2.1733909702209413, "grad_norm": 1.234375, "learning_rate": 0.00045288946950322264, "loss": 4.7993, "mean_token_accuracy": 0.2279620125889778, "num_tokens": 51888818.0, "step": 22625 }, { "entropy": 5.106270027160645, "epoch": 2.173871277617675, "grad_norm": 1.140625, "learning_rate": 0.0004528684753669677, "loss": 4.7799, "mean_token_accuracy": 0.22617914527654648, "num_tokens": 51899875.0, "step": 22630 }, { "entropy": 5.124859189987182, "epoch": 2.174351585014409, "grad_norm": 1.203125, "learning_rate": 0.0004528474771011349, "loss": 4.7568, "mean_token_accuracy": 0.2302141085267067, "num_tokens": 51911084.0, "step": 22635 }, { "entropy": 5.080235481262207, "epoch": 2.174831892411143, "grad_norm": 1.15625, "learning_rate": 0.00045282647470621176, "loss": 4.7936, "mean_token_accuracy": 0.2341061756014824, "num_tokens": 51922165.0, "step": 22640 }, { "entropy": 5.127397632598877, "epoch": 2.175312199807877, "grad_norm": 1.0859375, "learning_rate": 0.00045280546818268595, "loss": 4.8127, "mean_token_accuracy": 0.2303827852010727, "num_tokens": 51934827.0, "step": 22645 }, { "entropy": 5.158881187438965, "epoch": 2.175792507204611, "grad_norm": 1.109375, "learning_rate": 0.0004527844575310452, "loss": 4.8226, "mean_token_accuracy": 0.22183109372854232, "num_tokens": 51947997.0, "step": 22650 }, { "entropy": 5.118029928207397, "epoch": 2.176272814601345, "grad_norm": 1.2421875, "learning_rate": 0.00045276344275177715, "loss": 4.842, "mean_token_accuracy": 0.22817557156085969, "num_tokens": 51959639.0, "step": 22655 }, { "entropy": 5.144334506988526, "epoch": 2.176753121998079, "grad_norm": 1.21875, "learning_rate": 0.00045274242384536984, "loss": 4.8592, "mean_token_accuracy": 0.2271733269095421, "num_tokens": 51970716.0, "step": 22660 }, { "entropy": 5.128339672088623, "epoch": 2.177233429394813, "grad_norm": 1.1328125, "learning_rate": 0.0004527214008123113, "loss": 4.7914, "mean_token_accuracy": 0.23347052782773972, "num_tokens": 51982556.0, "step": 22665 }, { "entropy": 5.134899616241455, "epoch": 2.1777137367915467, "grad_norm": 1.1640625, "learning_rate": 0.0004527003736530895, "loss": 4.7718, "mean_token_accuracy": 0.2311878204345703, "num_tokens": 51994894.0, "step": 22670 }, { "entropy": 5.13347806930542, "epoch": 2.1781940441882806, "grad_norm": 1.1953125, "learning_rate": 0.00045267934236819265, "loss": 4.8463, "mean_token_accuracy": 0.21979733407497407, "num_tokens": 52007128.0, "step": 22675 }, { "entropy": 5.114097976684571, "epoch": 2.1786743515850144, "grad_norm": 1.09375, "learning_rate": 0.0004526583069581091, "loss": 4.9296, "mean_token_accuracy": 0.21860868930816652, "num_tokens": 52019598.0, "step": 22680 }, { "entropy": 5.116789150238037, "epoch": 2.1791546589817483, "grad_norm": 1.359375, "learning_rate": 0.0004526372674233272, "loss": 4.8043, "mean_token_accuracy": 0.23327067494392395, "num_tokens": 52031445.0, "step": 22685 }, { "entropy": 5.212763261795044, "epoch": 2.179634966378482, "grad_norm": 1.1015625, "learning_rate": 0.00045261622376433543, "loss": 4.8476, "mean_token_accuracy": 0.2272602990269661, "num_tokens": 52042548.0, "step": 22690 }, { "entropy": 5.077626943588257, "epoch": 2.180115273775216, "grad_norm": 1.109375, "learning_rate": 0.00045259517598162237, "loss": 4.7724, "mean_token_accuracy": 0.23077887147665024, "num_tokens": 52054147.0, "step": 22695 }, { "entropy": 5.057162761688232, "epoch": 2.18059558117195, "grad_norm": 1.1796875, "learning_rate": 0.0004525741240756766, "loss": 4.7556, "mean_token_accuracy": 0.23871446251869202, "num_tokens": 52065556.0, "step": 22700 }, { "entropy": 5.189388418197632, "epoch": 2.181075888568684, "grad_norm": 1.140625, "learning_rate": 0.0004525530680469871, "loss": 4.85, "mean_token_accuracy": 0.23103740066289902, "num_tokens": 52077543.0, "step": 22705 }, { "entropy": 5.084654235839844, "epoch": 2.1815561959654177, "grad_norm": 1.2109375, "learning_rate": 0.00045253200789604245, "loss": 4.7745, "mean_token_accuracy": 0.23199355304241182, "num_tokens": 52088376.0, "step": 22710 }, { "entropy": 5.065414905548096, "epoch": 2.1820365033621516, "grad_norm": 1.1953125, "learning_rate": 0.00045251094362333186, "loss": 4.7859, "mean_token_accuracy": 0.22604561150074004, "num_tokens": 52100019.0, "step": 22715 }, { "entropy": 5.075776672363281, "epoch": 2.1825168107588855, "grad_norm": 1.09375, "learning_rate": 0.0004524898752293441, "loss": 4.7523, "mean_token_accuracy": 0.23174404501914977, "num_tokens": 52110643.0, "step": 22720 }, { "entropy": 5.025632047653199, "epoch": 2.18299711815562, "grad_norm": 1.140625, "learning_rate": 0.00045246880271456857, "loss": 4.7199, "mean_token_accuracy": 0.235849928855896, "num_tokens": 52123092.0, "step": 22725 }, { "entropy": 5.192577219009399, "epoch": 2.1834774255523537, "grad_norm": 1.0859375, "learning_rate": 0.0004524477260794944, "loss": 4.8897, "mean_token_accuracy": 0.22173816263675689, "num_tokens": 52135371.0, "step": 22730 }, { "entropy": 5.149078893661499, "epoch": 2.1839577329490876, "grad_norm": 1.1328125, "learning_rate": 0.00045242664532461094, "loss": 4.7368, "mean_token_accuracy": 0.23526560813188552, "num_tokens": 52145877.0, "step": 22735 }, { "entropy": 5.034790945053101, "epoch": 2.1844380403458215, "grad_norm": 1.1953125, "learning_rate": 0.00045240556045040767, "loss": 4.8035, "mean_token_accuracy": 0.22363627403974534, "num_tokens": 52158228.0, "step": 22740 }, { "entropy": 5.198207521438599, "epoch": 2.1849183477425553, "grad_norm": 1.2578125, "learning_rate": 0.00045238447145737397, "loss": 4.9068, "mean_token_accuracy": 0.2178325742483139, "num_tokens": 52169515.0, "step": 22745 }, { "entropy": 5.153679275512696, "epoch": 2.185398655139289, "grad_norm": 1.125, "learning_rate": 0.00045236337834599966, "loss": 4.8384, "mean_token_accuracy": 0.2262999877333641, "num_tokens": 52179846.0, "step": 22750 }, { "entropy": 5.009027528762817, "epoch": 2.185878962536023, "grad_norm": 1.1171875, "learning_rate": 0.00045234228111677434, "loss": 4.6796, "mean_token_accuracy": 0.23403288871049882, "num_tokens": 52191095.0, "step": 22755 }, { "entropy": 5.104574346542359, "epoch": 2.186359269932757, "grad_norm": 1.421875, "learning_rate": 0.0004523211797701878, "loss": 4.8442, "mean_token_accuracy": 0.22490133345127106, "num_tokens": 52202289.0, "step": 22760 }, { "entropy": 5.049128198623658, "epoch": 2.186839577329491, "grad_norm": 1.1953125, "learning_rate": 0.00045230007430673014, "loss": 4.7937, "mean_token_accuracy": 0.23295473158359528, "num_tokens": 52214424.0, "step": 22765 }, { "entropy": 5.171049499511719, "epoch": 2.1873198847262247, "grad_norm": 1.234375, "learning_rate": 0.0004522789647268911, "loss": 4.8374, "mean_token_accuracy": 0.2265281304717064, "num_tokens": 52225446.0, "step": 22770 }, { "entropy": 5.092030334472656, "epoch": 2.1878001921229586, "grad_norm": 1.203125, "learning_rate": 0.0004522578510311609, "loss": 4.7353, "mean_token_accuracy": 0.2287060409784317, "num_tokens": 52236063.0, "step": 22775 }, { "entropy": 5.024506282806397, "epoch": 2.1882804995196925, "grad_norm": 1.125, "learning_rate": 0.00045223673322002984, "loss": 4.7728, "mean_token_accuracy": 0.2300436779856682, "num_tokens": 52248618.0, "step": 22780 }, { "entropy": 5.129857873916626, "epoch": 2.1887608069164264, "grad_norm": 1.171875, "learning_rate": 0.00045221561129398804, "loss": 4.901, "mean_token_accuracy": 0.2197287231683731, "num_tokens": 52261913.0, "step": 22785 }, { "entropy": 5.165701150894165, "epoch": 2.1892411143131603, "grad_norm": 1.296875, "learning_rate": 0.000452194485253526, "loss": 4.7645, "mean_token_accuracy": 0.23000348508358, "num_tokens": 52273838.0, "step": 22790 }, { "entropy": 5.074887895584107, "epoch": 2.189721421709894, "grad_norm": 1.21875, "learning_rate": 0.0004521733550991342, "loss": 4.8224, "mean_token_accuracy": 0.2346891850233078, "num_tokens": 52285464.0, "step": 22795 }, { "entropy": 5.058576774597168, "epoch": 2.1902017291066285, "grad_norm": 1.1171875, "learning_rate": 0.00045215222083130316, "loss": 4.765, "mean_token_accuracy": 0.23533908724784852, "num_tokens": 52295800.0, "step": 22800 }, { "entropy": 5.189198160171509, "epoch": 2.1906820365033624, "grad_norm": 1.2265625, "learning_rate": 0.0004521310824505236, "loss": 4.888, "mean_token_accuracy": 0.22185751646757126, "num_tokens": 52307243.0, "step": 22805 }, { "entropy": 5.105354499816895, "epoch": 2.1911623439000962, "grad_norm": 1.1875, "learning_rate": 0.00045210993995728623, "loss": 4.7606, "mean_token_accuracy": 0.23480006754398347, "num_tokens": 52317065.0, "step": 22810 }, { "entropy": 5.121937274932861, "epoch": 2.19164265129683, "grad_norm": 1.1640625, "learning_rate": 0.000452088793352082, "loss": 4.7907, "mean_token_accuracy": 0.22747430503368377, "num_tokens": 52328385.0, "step": 22815 }, { "entropy": 5.000249195098877, "epoch": 2.192122958693564, "grad_norm": 1.1953125, "learning_rate": 0.0004520676426354018, "loss": 4.7284, "mean_token_accuracy": 0.23484476506710053, "num_tokens": 52340046.0, "step": 22820 }, { "entropy": 5.029712343215943, "epoch": 2.192603266090298, "grad_norm": 1.203125, "learning_rate": 0.0004520464878077368, "loss": 4.7419, "mean_token_accuracy": 0.23461335599422456, "num_tokens": 52352071.0, "step": 22825 }, { "entropy": 5.2037135601043705, "epoch": 2.1930835734870318, "grad_norm": 1.09375, "learning_rate": 0.00045202532886957805, "loss": 4.8642, "mean_token_accuracy": 0.2289625346660614, "num_tokens": 52364173.0, "step": 22830 }, { "entropy": 5.04330883026123, "epoch": 2.1935638808837656, "grad_norm": 1.2109375, "learning_rate": 0.00045200416582141676, "loss": 4.689, "mean_token_accuracy": 0.23917343467473984, "num_tokens": 52374690.0, "step": 22835 }, { "entropy": 5.183155298233032, "epoch": 2.1940441882804995, "grad_norm": 1.2109375, "learning_rate": 0.0004519829986637444, "loss": 4.9656, "mean_token_accuracy": 0.22324798554182052, "num_tokens": 52385661.0, "step": 22840 }, { "entropy": 5.145434188842773, "epoch": 2.1945244956772334, "grad_norm": 1.0703125, "learning_rate": 0.0004519618273970523, "loss": 4.8464, "mean_token_accuracy": 0.2259630724787712, "num_tokens": 52396759.0, "step": 22845 }, { "entropy": 5.199276876449585, "epoch": 2.1950048030739673, "grad_norm": 1.0703125, "learning_rate": 0.00045194065202183205, "loss": 4.9162, "mean_token_accuracy": 0.21793469041585922, "num_tokens": 52409122.0, "step": 22850 }, { "entropy": 5.250739097595215, "epoch": 2.195485110470701, "grad_norm": 1.203125, "learning_rate": 0.0004519194725385752, "loss": 4.8925, "mean_token_accuracy": 0.21719120740890502, "num_tokens": 52421318.0, "step": 22855 }, { "entropy": 5.042360639572143, "epoch": 2.195965417867435, "grad_norm": 1.1953125, "learning_rate": 0.00045189828894777364, "loss": 4.746, "mean_token_accuracy": 0.23408232480287552, "num_tokens": 52432595.0, "step": 22860 }, { "entropy": 5.068605136871338, "epoch": 2.196445725264169, "grad_norm": 1.1328125, "learning_rate": 0.00045187710124991904, "loss": 4.7719, "mean_token_accuracy": 0.2310498610138893, "num_tokens": 52444423.0, "step": 22865 }, { "entropy": 5.16375584602356, "epoch": 2.196926032660903, "grad_norm": 1.0703125, "learning_rate": 0.0004518559094455034, "loss": 4.8846, "mean_token_accuracy": 0.2241843119263649, "num_tokens": 52457183.0, "step": 22870 }, { "entropy": 5.163765239715576, "epoch": 2.1974063400576367, "grad_norm": 1.1171875, "learning_rate": 0.0004518347135350187, "loss": 4.8152, "mean_token_accuracy": 0.22464211732149125, "num_tokens": 52469624.0, "step": 22875 }, { "entropy": 5.1679223537445065, "epoch": 2.1978866474543706, "grad_norm": 1.0390625, "learning_rate": 0.00045181351351895703, "loss": 4.916, "mean_token_accuracy": 0.21683948189020158, "num_tokens": 52481369.0, "step": 22880 }, { "entropy": 5.0861540794372555, "epoch": 2.198366954851105, "grad_norm": 1.1953125, "learning_rate": 0.0004517923093978106, "loss": 4.7479, "mean_token_accuracy": 0.23464581966400147, "num_tokens": 52493315.0, "step": 22885 }, { "entropy": 5.099695634841919, "epoch": 2.1988472622478388, "grad_norm": 1.0546875, "learning_rate": 0.0004517711011720717, "loss": 4.8213, "mean_token_accuracy": 0.23163434565067292, "num_tokens": 52504929.0, "step": 22890 }, { "entropy": 5.067135286331177, "epoch": 2.1993275696445727, "grad_norm": 1.03125, "learning_rate": 0.0004517498888422327, "loss": 4.7238, "mean_token_accuracy": 0.2288893863558769, "num_tokens": 52518364.0, "step": 22895 }, { "entropy": 5.086096143722534, "epoch": 2.1998078770413065, "grad_norm": 1.0859375, "learning_rate": 0.0004517286724087862, "loss": 4.7689, "mean_token_accuracy": 0.23164378404617308, "num_tokens": 52530359.0, "step": 22900 }, { "entropy": 5.117498397827148, "epoch": 2.2002881844380404, "grad_norm": 1.09375, "learning_rate": 0.0004517074518722247, "loss": 4.7983, "mean_token_accuracy": 0.23343625366687776, "num_tokens": 52542242.0, "step": 22905 }, { "entropy": 5.153238010406494, "epoch": 2.2007684918347743, "grad_norm": 1.1640625, "learning_rate": 0.00045168622723304084, "loss": 4.8348, "mean_token_accuracy": 0.23051410913467407, "num_tokens": 52552838.0, "step": 22910 }, { "entropy": 5.107896041870117, "epoch": 2.201248799231508, "grad_norm": 1.09375, "learning_rate": 0.0004516649984917274, "loss": 4.7863, "mean_token_accuracy": 0.22998663187026977, "num_tokens": 52564568.0, "step": 22915 }, { "entropy": 5.170346450805664, "epoch": 2.201729106628242, "grad_norm": 1.0546875, "learning_rate": 0.00045164376564877734, "loss": 4.8308, "mean_token_accuracy": 0.22436516731977463, "num_tokens": 52577563.0, "step": 22920 }, { "entropy": 5.161020660400391, "epoch": 2.202209414024976, "grad_norm": 1.140625, "learning_rate": 0.00045162252870468354, "loss": 4.8411, "mean_token_accuracy": 0.22507598847150803, "num_tokens": 52590295.0, "step": 22925 }, { "entropy": 5.116861581802368, "epoch": 2.20268972142171, "grad_norm": 1.125, "learning_rate": 0.0004516012876599391, "loss": 4.7759, "mean_token_accuracy": 0.23662513345479966, "num_tokens": 52602727.0, "step": 22930 }, { "entropy": 5.012200355529785, "epoch": 2.2031700288184437, "grad_norm": 1.140625, "learning_rate": 0.00045158004251503715, "loss": 4.6927, "mean_token_accuracy": 0.2406057357788086, "num_tokens": 52613422.0, "step": 22935 }, { "entropy": 4.982286357879639, "epoch": 2.2036503362151776, "grad_norm": 1.25, "learning_rate": 0.00045155879327047087, "loss": 4.7271, "mean_token_accuracy": 0.2413996621966362, "num_tokens": 52624907.0, "step": 22940 }, { "entropy": 5.147170066833496, "epoch": 2.2041306436119115, "grad_norm": 1.09375, "learning_rate": 0.0004515375399267338, "loss": 4.8421, "mean_token_accuracy": 0.22663878351449968, "num_tokens": 52635864.0, "step": 22945 }, { "entropy": 5.1448752880096436, "epoch": 2.2046109510086453, "grad_norm": 1.234375, "learning_rate": 0.00045151628248431925, "loss": 4.8956, "mean_token_accuracy": 0.22156819105148315, "num_tokens": 52647023.0, "step": 22950 }, { "entropy": 5.151683616638183, "epoch": 2.2050912584053792, "grad_norm": 1.2421875, "learning_rate": 0.00045149502094372077, "loss": 4.8621, "mean_token_accuracy": 0.22776360362768172, "num_tokens": 52658846.0, "step": 22955 }, { "entropy": 5.090791654586792, "epoch": 2.2055715658021136, "grad_norm": 0.99609375, "learning_rate": 0.00045147375530543195, "loss": 4.652, "mean_token_accuracy": 0.2444545805454254, "num_tokens": 52671363.0, "step": 22960 }, { "entropy": 5.138968563079834, "epoch": 2.2060518731988474, "grad_norm": 1.0234375, "learning_rate": 0.00045145248556994653, "loss": 4.8832, "mean_token_accuracy": 0.22517444640398027, "num_tokens": 52682734.0, "step": 22965 }, { "entropy": 5.081032848358154, "epoch": 2.2065321805955813, "grad_norm": 1.2109375, "learning_rate": 0.0004514312117377584, "loss": 4.7326, "mean_token_accuracy": 0.2331282079219818, "num_tokens": 52693272.0, "step": 22970 }, { "entropy": 5.096043682098388, "epoch": 2.207012487992315, "grad_norm": 1.0859375, "learning_rate": 0.0004514099338093613, "loss": 4.8028, "mean_token_accuracy": 0.2290824383497238, "num_tokens": 52703395.0, "step": 22975 }, { "entropy": 5.066891288757324, "epoch": 2.207492795389049, "grad_norm": 1.09375, "learning_rate": 0.0004513886517852496, "loss": 4.7096, "mean_token_accuracy": 0.23548574298620223, "num_tokens": 52715870.0, "step": 22980 }, { "entropy": 5.105998754501343, "epoch": 2.207973102785783, "grad_norm": 1.0234375, "learning_rate": 0.000451367365665917, "loss": 4.8257, "mean_token_accuracy": 0.23159873336553574, "num_tokens": 52726922.0, "step": 22985 }, { "entropy": 5.127434110641479, "epoch": 2.208453410182517, "grad_norm": 1.1171875, "learning_rate": 0.00045134607545185785, "loss": 4.7961, "mean_token_accuracy": 0.23253277987241744, "num_tokens": 52738211.0, "step": 22990 }, { "entropy": 5.1540055751800535, "epoch": 2.2089337175792507, "grad_norm": 1.1328125, "learning_rate": 0.0004513247811435666, "loss": 4.7853, "mean_token_accuracy": 0.230952388048172, "num_tokens": 52749267.0, "step": 22995 }, { "entropy": 5.015000057220459, "epoch": 2.2094140249759846, "grad_norm": 1.15625, "learning_rate": 0.00045130348274153735, "loss": 4.7892, "mean_token_accuracy": 0.22779532968997956, "num_tokens": 52760520.0, "step": 23000 }, { "entropy": 5.087809896469116, "epoch": 2.2098943323727185, "grad_norm": 1.078125, "learning_rate": 0.00045128218024626486, "loss": 4.7342, "mean_token_accuracy": 0.24323214888572692, "num_tokens": 52772865.0, "step": 23005 }, { "entropy": 5.111327123641968, "epoch": 2.2103746397694524, "grad_norm": 1.1015625, "learning_rate": 0.0004512608736582436, "loss": 4.7896, "mean_token_accuracy": 0.23642992824316025, "num_tokens": 52783935.0, "step": 23010 }, { "entropy": 4.973553323745728, "epoch": 2.2108549471661862, "grad_norm": 1.0546875, "learning_rate": 0.0004512395629779682, "loss": 4.7307, "mean_token_accuracy": 0.2389068379998207, "num_tokens": 52794980.0, "step": 23015 }, { "entropy": 5.190898704528808, "epoch": 2.21133525456292, "grad_norm": 1.09375, "learning_rate": 0.0004512182482059335, "loss": 4.8966, "mean_token_accuracy": 0.21514176428318024, "num_tokens": 52806426.0, "step": 23020 }, { "entropy": 5.059572219848633, "epoch": 2.211815561959654, "grad_norm": 1.203125, "learning_rate": 0.0004511969293426343, "loss": 4.7195, "mean_token_accuracy": 0.2370661735534668, "num_tokens": 52816832.0, "step": 23025 }, { "entropy": 5.115974092483521, "epoch": 2.212295869356388, "grad_norm": 1.0703125, "learning_rate": 0.00045117560638856567, "loss": 4.7834, "mean_token_accuracy": 0.2324156790971756, "num_tokens": 52829913.0, "step": 23030 }, { "entropy": 5.101070880889893, "epoch": 2.212776176753122, "grad_norm": 1.109375, "learning_rate": 0.0004511542793442225, "loss": 4.8067, "mean_token_accuracy": 0.23004190921783446, "num_tokens": 52841300.0, "step": 23035 }, { "entropy": 5.085496854782105, "epoch": 2.213256484149856, "grad_norm": 1.0625, "learning_rate": 0.0004511329482101001, "loss": 4.7434, "mean_token_accuracy": 0.23557529896497725, "num_tokens": 52852862.0, "step": 23040 }, { "entropy": 5.185792589187622, "epoch": 2.21373679154659, "grad_norm": 1.1953125, "learning_rate": 0.0004511116129866936, "loss": 4.8093, "mean_token_accuracy": 0.22784344851970673, "num_tokens": 52863499.0, "step": 23045 }, { "entropy": 5.111684799194336, "epoch": 2.214217098943324, "grad_norm": 1.125, "learning_rate": 0.00045109027367449845, "loss": 4.8609, "mean_token_accuracy": 0.2328698828816414, "num_tokens": 52875459.0, "step": 23050 }, { "entropy": 5.057990264892578, "epoch": 2.2146974063400577, "grad_norm": 1.1015625, "learning_rate": 0.00045106893027400995, "loss": 4.7438, "mean_token_accuracy": 0.22991834282875062, "num_tokens": 52886758.0, "step": 23055 }, { "entropy": 5.102661371231079, "epoch": 2.2151777137367916, "grad_norm": 1.171875, "learning_rate": 0.00045104758278572375, "loss": 4.7455, "mean_token_accuracy": 0.23097250014543533, "num_tokens": 52898703.0, "step": 23060 }, { "entropy": 5.148905897140503, "epoch": 2.2156580211335255, "grad_norm": 1.1015625, "learning_rate": 0.0004510262312101355, "loss": 4.7997, "mean_token_accuracy": 0.2216897800564766, "num_tokens": 52910455.0, "step": 23065 }, { "entropy": 4.989671325683593, "epoch": 2.2161383285302594, "grad_norm": 1.1953125, "learning_rate": 0.0004510048755477407, "loss": 4.7085, "mean_token_accuracy": 0.23190236687660218, "num_tokens": 52921992.0, "step": 23070 }, { "entropy": 5.081862878799439, "epoch": 2.2166186359269933, "grad_norm": 1.0546875, "learning_rate": 0.0004509835157990354, "loss": 4.8352, "mean_token_accuracy": 0.2297719180583954, "num_tokens": 52934994.0, "step": 23075 }, { "entropy": 5.142678594589233, "epoch": 2.217098943323727, "grad_norm": 1.140625, "learning_rate": 0.00045096215196451547, "loss": 4.895, "mean_token_accuracy": 0.2210488885641098, "num_tokens": 52945745.0, "step": 23080 }, { "entropy": 5.145126438140869, "epoch": 2.217579250720461, "grad_norm": 1.0546875, "learning_rate": 0.00045094078404467683, "loss": 4.8814, "mean_token_accuracy": 0.22445845007896423, "num_tokens": 52957470.0, "step": 23085 }, { "entropy": 5.179237985610962, "epoch": 2.218059558117195, "grad_norm": 1.109375, "learning_rate": 0.00045091941204001564, "loss": 4.8069, "mean_token_accuracy": 0.23417698442935944, "num_tokens": 52968584.0, "step": 23090 }, { "entropy": 5.103171491622925, "epoch": 2.218539865513929, "grad_norm": 1.0625, "learning_rate": 0.000450898035951028, "loss": 4.8118, "mean_token_accuracy": 0.22522962391376494, "num_tokens": 52980187.0, "step": 23095 }, { "entropy": 5.015201663970947, "epoch": 2.2190201729106627, "grad_norm": 1.078125, "learning_rate": 0.00045087665577821034, "loss": 4.7333, "mean_token_accuracy": 0.2403994545340538, "num_tokens": 52990478.0, "step": 23100 }, { "entropy": 5.108901691436768, "epoch": 2.2195004803073966, "grad_norm": 1.203125, "learning_rate": 0.000450855271522059, "loss": 4.8785, "mean_token_accuracy": 0.21689791679382325, "num_tokens": 53001130.0, "step": 23105 }, { "entropy": 5.1791112422943115, "epoch": 2.219980787704131, "grad_norm": 1.1875, "learning_rate": 0.00045083388318307044, "loss": 4.8656, "mean_token_accuracy": 0.22352044582366942, "num_tokens": 53013057.0, "step": 23110 }, { "entropy": 5.1998707294464115, "epoch": 2.2204610951008648, "grad_norm": 1.078125, "learning_rate": 0.0004508124907617411, "loss": 4.8818, "mean_token_accuracy": 0.2214874416589737, "num_tokens": 53024333.0, "step": 23115 }, { "entropy": 5.098461532592774, "epoch": 2.2209414024975986, "grad_norm": 1.015625, "learning_rate": 0.0004507910942585679, "loss": 4.801, "mean_token_accuracy": 0.22544850260019303, "num_tokens": 53036723.0, "step": 23120 }, { "entropy": 5.149092721939087, "epoch": 2.2214217098943325, "grad_norm": 1.1875, "learning_rate": 0.0004507696936740475, "loss": 4.8479, "mean_token_accuracy": 0.23135247081518173, "num_tokens": 53048183.0, "step": 23125 }, { "entropy": 5.089809226989746, "epoch": 2.2219020172910664, "grad_norm": 1.4140625, "learning_rate": 0.0004507482890086767, "loss": 4.8052, "mean_token_accuracy": 0.23036097437143327, "num_tokens": 53061467.0, "step": 23130 }, { "entropy": 5.070451068878174, "epoch": 2.2223823246878003, "grad_norm": 1.2109375, "learning_rate": 0.0004507268802629525, "loss": 4.8054, "mean_token_accuracy": 0.23150533586740493, "num_tokens": 53072833.0, "step": 23135 }, { "entropy": 5.148936319351196, "epoch": 2.222862632084534, "grad_norm": 1.046875, "learning_rate": 0.0004507054674373719, "loss": 4.7687, "mean_token_accuracy": 0.22679489552974702, "num_tokens": 53084109.0, "step": 23140 }, { "entropy": 5.129045248031616, "epoch": 2.223342939481268, "grad_norm": 1.1171875, "learning_rate": 0.00045068405053243216, "loss": 4.8048, "mean_token_accuracy": 0.2320725902915001, "num_tokens": 53095997.0, "step": 23145 }, { "entropy": 5.0516626834869385, "epoch": 2.223823246878002, "grad_norm": 1.1484375, "learning_rate": 0.0004506626295486304, "loss": 4.7096, "mean_token_accuracy": 0.2448731392621994, "num_tokens": 53107518.0, "step": 23150 }, { "entropy": 5.1101579666137695, "epoch": 2.224303554274736, "grad_norm": 1.1953125, "learning_rate": 0.00045064120448646405, "loss": 4.7798, "mean_token_accuracy": 0.2291765719652176, "num_tokens": 53118908.0, "step": 23155 }, { "entropy": 5.082378721237182, "epoch": 2.2247838616714697, "grad_norm": 1.1484375, "learning_rate": 0.0004506197753464304, "loss": 4.7706, "mean_token_accuracy": 0.23491850942373277, "num_tokens": 53129404.0, "step": 23160 }, { "entropy": 5.156738233566284, "epoch": 2.2252641690682036, "grad_norm": 1.0859375, "learning_rate": 0.00045059834212902707, "loss": 4.8714, "mean_token_accuracy": 0.22582435458898545, "num_tokens": 53141364.0, "step": 23165 }, { "entropy": 5.176635265350342, "epoch": 2.2257444764649374, "grad_norm": 1.34375, "learning_rate": 0.00045057690483475167, "loss": 4.7811, "mean_token_accuracy": 0.23388808816671372, "num_tokens": 53153617.0, "step": 23170 }, { "entropy": 5.103824949264526, "epoch": 2.2262247838616713, "grad_norm": 1.1328125, "learning_rate": 0.0004505554634641019, "loss": 4.8897, "mean_token_accuracy": 0.22136110216379165, "num_tokens": 53166537.0, "step": 23175 }, { "entropy": 5.135892486572265, "epoch": 2.226705091258405, "grad_norm": 1.0625, "learning_rate": 0.00045053401801757554, "loss": 4.7672, "mean_token_accuracy": 0.23385286778211595, "num_tokens": 53178979.0, "step": 23180 }, { "entropy": 5.119091510772705, "epoch": 2.227185398655139, "grad_norm": 1.390625, "learning_rate": 0.00045051256849567054, "loss": 4.8117, "mean_token_accuracy": 0.23698574751615525, "num_tokens": 53190177.0, "step": 23185 }, { "entropy": 5.080839538574219, "epoch": 2.227665706051873, "grad_norm": 1.1328125, "learning_rate": 0.00045049111489888486, "loss": 4.7824, "mean_token_accuracy": 0.2351214364171028, "num_tokens": 53201398.0, "step": 23190 }, { "entropy": 5.166760444641113, "epoch": 2.2281460134486073, "grad_norm": 1.1796875, "learning_rate": 0.0004504696572277165, "loss": 4.8306, "mean_token_accuracy": 0.22560259848833084, "num_tokens": 53213795.0, "step": 23195 }, { "entropy": 5.140866422653199, "epoch": 2.228626320845341, "grad_norm": 1.109375, "learning_rate": 0.00045044819548266385, "loss": 4.861, "mean_token_accuracy": 0.22336962670087815, "num_tokens": 53226831.0, "step": 23200 }, { "entropy": 5.182083988189698, "epoch": 2.229106628242075, "grad_norm": 1.0859375, "learning_rate": 0.00045042672966422506, "loss": 4.872, "mean_token_accuracy": 0.22910162359476088, "num_tokens": 53238558.0, "step": 23205 }, { "entropy": 5.104340553283691, "epoch": 2.229586935638809, "grad_norm": 1.0703125, "learning_rate": 0.00045040525977289847, "loss": 4.7725, "mean_token_accuracy": 0.22808566242456435, "num_tokens": 53250376.0, "step": 23210 }, { "entropy": 5.119502019882202, "epoch": 2.230067243035543, "grad_norm": 1.125, "learning_rate": 0.0004503837858091826, "loss": 4.7659, "mean_token_accuracy": 0.23049113750457764, "num_tokens": 53260872.0, "step": 23215 }, { "entropy": 5.084338140487671, "epoch": 2.2305475504322767, "grad_norm": 1.203125, "learning_rate": 0.00045036230777357604, "loss": 4.7652, "mean_token_accuracy": 0.23104382902383805, "num_tokens": 53271363.0, "step": 23220 }, { "entropy": 5.171426820755005, "epoch": 2.2310278578290106, "grad_norm": 1.1015625, "learning_rate": 0.0004503408256665774, "loss": 4.9026, "mean_token_accuracy": 0.21932850778102875, "num_tokens": 53282356.0, "step": 23225 }, { "entropy": 5.074285173416138, "epoch": 2.2315081652257445, "grad_norm": 1.09375, "learning_rate": 0.00045031933948868545, "loss": 4.8338, "mean_token_accuracy": 0.2266443893313408, "num_tokens": 53293675.0, "step": 23230 }, { "entropy": 5.120794582366943, "epoch": 2.2319884726224783, "grad_norm": 1.1328125, "learning_rate": 0.00045029784924039903, "loss": 4.7167, "mean_token_accuracy": 0.2397169515490532, "num_tokens": 53305193.0, "step": 23235 }, { "entropy": 5.133155250549317, "epoch": 2.2324687800192122, "grad_norm": 1.1484375, "learning_rate": 0.00045027635492221716, "loss": 4.7169, "mean_token_accuracy": 0.24129508286714554, "num_tokens": 53315729.0, "step": 23240 }, { "entropy": 5.038401126861572, "epoch": 2.232949087415946, "grad_norm": 1.1796875, "learning_rate": 0.00045025485653463866, "loss": 4.794, "mean_token_accuracy": 0.2298990473151207, "num_tokens": 53327220.0, "step": 23245 }, { "entropy": 5.081314659118652, "epoch": 2.23342939481268, "grad_norm": 1.0859375, "learning_rate": 0.0004502333540781628, "loss": 4.8294, "mean_token_accuracy": 0.2268001616001129, "num_tokens": 53337565.0, "step": 23250 }, { "entropy": 5.1318401336669925, "epoch": 2.233909702209414, "grad_norm": 1.2890625, "learning_rate": 0.0004502118475532888, "loss": 4.8018, "mean_token_accuracy": 0.23005928546190263, "num_tokens": 53348791.0, "step": 23255 }, { "entropy": 5.074340200424194, "epoch": 2.2343900096061478, "grad_norm": 1.140625, "learning_rate": 0.000450190336960516, "loss": 4.7488, "mean_token_accuracy": 0.23026071041822432, "num_tokens": 53361341.0, "step": 23260 }, { "entropy": 5.094004583358765, "epoch": 2.2348703170028816, "grad_norm": 1.1796875, "learning_rate": 0.0004501688223003438, "loss": 4.828, "mean_token_accuracy": 0.22650541067123414, "num_tokens": 53372085.0, "step": 23265 }, { "entropy": 5.079095840454102, "epoch": 2.235350624399616, "grad_norm": 1.1171875, "learning_rate": 0.0004501473035732717, "loss": 4.7845, "mean_token_accuracy": 0.2256512776017189, "num_tokens": 53383665.0, "step": 23270 }, { "entropy": 5.098389530181885, "epoch": 2.23583093179635, "grad_norm": 1.1953125, "learning_rate": 0.0004501257807797993, "loss": 4.7929, "mean_token_accuracy": 0.22722006291151048, "num_tokens": 53394697.0, "step": 23275 }, { "entropy": 5.0698864459991455, "epoch": 2.2363112391930837, "grad_norm": 1.3046875, "learning_rate": 0.00045010425392042624, "loss": 4.8061, "mean_token_accuracy": 0.22553065419197083, "num_tokens": 53405538.0, "step": 23280 }, { "entropy": 5.212454700469971, "epoch": 2.2367915465898176, "grad_norm": 1.1328125, "learning_rate": 0.0004500827229956524, "loss": 4.903, "mean_token_accuracy": 0.22110868543386458, "num_tokens": 53416515.0, "step": 23285 }, { "entropy": 5.109039974212647, "epoch": 2.2372718539865515, "grad_norm": 1.1171875, "learning_rate": 0.00045006118800597757, "loss": 4.7505, "mean_token_accuracy": 0.2293446272611618, "num_tokens": 53426530.0, "step": 23290 }, { "entropy": 5.094769811630249, "epoch": 2.2377521613832854, "grad_norm": 1.21875, "learning_rate": 0.00045003964895190177, "loss": 4.8575, "mean_token_accuracy": 0.2245904505252838, "num_tokens": 53438514.0, "step": 23295 }, { "entropy": 5.1014293193817135, "epoch": 2.2382324687800192, "grad_norm": 1.21875, "learning_rate": 0.0004500181058339251, "loss": 4.7228, "mean_token_accuracy": 0.23853760361671447, "num_tokens": 53448845.0, "step": 23300 }, { "entropy": 5.093882274627686, "epoch": 2.238712776176753, "grad_norm": 1.1953125, "learning_rate": 0.0004499965586525478, "loss": 4.8414, "mean_token_accuracy": 0.22516778856515884, "num_tokens": 53460526.0, "step": 23305 }, { "entropy": 5.183574533462524, "epoch": 2.239193083573487, "grad_norm": 1.1796875, "learning_rate": 0.00044997500740826993, "loss": 4.8775, "mean_token_accuracy": 0.22742329388856888, "num_tokens": 53471997.0, "step": 23310 }, { "entropy": 5.103569459915161, "epoch": 2.239673390970221, "grad_norm": 1.390625, "learning_rate": 0.000449953452101592, "loss": 4.8401, "mean_token_accuracy": 0.22768707275390626, "num_tokens": 53483097.0, "step": 23315 }, { "entropy": 5.116818475723266, "epoch": 2.2401536983669548, "grad_norm": 1.140625, "learning_rate": 0.00044993189273301445, "loss": 4.7605, "mean_token_accuracy": 0.23938217461109162, "num_tokens": 53494877.0, "step": 23320 }, { "entropy": 5.0951494693756105, "epoch": 2.2406340057636887, "grad_norm": 1.2421875, "learning_rate": 0.0004499103293030377, "loss": 4.7768, "mean_token_accuracy": 0.2359197422862053, "num_tokens": 53507065.0, "step": 23325 }, { "entropy": 5.169115924835205, "epoch": 2.2411143131604225, "grad_norm": 1.203125, "learning_rate": 0.0004498887618121625, "loss": 4.8494, "mean_token_accuracy": 0.23051951229572296, "num_tokens": 53518188.0, "step": 23330 }, { "entropy": 5.06104588508606, "epoch": 2.2415946205571564, "grad_norm": 1.1171875, "learning_rate": 0.0004498671902608897, "loss": 4.7008, "mean_token_accuracy": 0.2351858526468277, "num_tokens": 53529872.0, "step": 23335 }, { "entropy": 5.143391942977905, "epoch": 2.2420749279538903, "grad_norm": 1.1328125, "learning_rate": 0.0004498456146497198, "loss": 4.7767, "mean_token_accuracy": 0.22685908675193786, "num_tokens": 53540175.0, "step": 23340 }, { "entropy": 5.0613484382629395, "epoch": 2.2425552353506246, "grad_norm": 1.2109375, "learning_rate": 0.00044982403497915405, "loss": 4.7051, "mean_token_accuracy": 0.23086913526058198, "num_tokens": 53552520.0, "step": 23345 }, { "entropy": 5.091251564025879, "epoch": 2.2430355427473585, "grad_norm": 1.125, "learning_rate": 0.00044980245124969333, "loss": 4.7983, "mean_token_accuracy": 0.22655535042285918, "num_tokens": 53563937.0, "step": 23350 }, { "entropy": 5.100781488418579, "epoch": 2.2435158501440924, "grad_norm": 1.171875, "learning_rate": 0.0004497808634618386, "loss": 4.7599, "mean_token_accuracy": 0.22517484724521636, "num_tokens": 53574654.0, "step": 23355 }, { "entropy": 5.123874092102051, "epoch": 2.2439961575408263, "grad_norm": 1.1484375, "learning_rate": 0.0004497592716160913, "loss": 4.8158, "mean_token_accuracy": 0.22852961719036102, "num_tokens": 53586848.0, "step": 23360 }, { "entropy": 5.088836193084717, "epoch": 2.24447646493756, "grad_norm": 1.0859375, "learning_rate": 0.00044973767571295273, "loss": 4.796, "mean_token_accuracy": 0.2306663915514946, "num_tokens": 53598200.0, "step": 23365 }, { "entropy": 5.068975734710693, "epoch": 2.244956772334294, "grad_norm": 1.09375, "learning_rate": 0.0004497160757529241, "loss": 4.7346, "mean_token_accuracy": 0.2349630042910576, "num_tokens": 53609957.0, "step": 23370 }, { "entropy": 5.128179311752319, "epoch": 2.245437079731028, "grad_norm": 1.1171875, "learning_rate": 0.00044969447173650695, "loss": 4.8083, "mean_token_accuracy": 0.23668235242366792, "num_tokens": 53621029.0, "step": 23375 }, { "entropy": 5.086098146438599, "epoch": 2.245917387127762, "grad_norm": 1.09375, "learning_rate": 0.000449672863664203, "loss": 4.7745, "mean_token_accuracy": 0.22932577580213548, "num_tokens": 53633279.0, "step": 23380 }, { "entropy": 5.0923463821411135, "epoch": 2.2463976945244957, "grad_norm": 1.0703125, "learning_rate": 0.00044965125153651375, "loss": 4.793, "mean_token_accuracy": 0.22933094650506974, "num_tokens": 53643959.0, "step": 23385 }, { "entropy": 5.102643966674805, "epoch": 2.2468780019212296, "grad_norm": 1.125, "learning_rate": 0.0004496296353539411, "loss": 4.7822, "mean_token_accuracy": 0.22918579429388047, "num_tokens": 53655215.0, "step": 23390 }, { "entropy": 5.177783489227295, "epoch": 2.2473583093179634, "grad_norm": 1.21875, "learning_rate": 0.0004496080151169869, "loss": 4.7781, "mean_token_accuracy": 0.23483146280050277, "num_tokens": 53666557.0, "step": 23395 }, { "entropy": 5.139918661117553, "epoch": 2.2478386167146973, "grad_norm": 1.1328125, "learning_rate": 0.00044958639082615294, "loss": 4.8471, "mean_token_accuracy": 0.2253044903278351, "num_tokens": 53679626.0, "step": 23400 }, { "entropy": 5.118360996246338, "epoch": 2.248318924111431, "grad_norm": 1.09375, "learning_rate": 0.0004495647624819415, "loss": 4.7777, "mean_token_accuracy": 0.2323940321803093, "num_tokens": 53691681.0, "step": 23405 }, { "entropy": 5.150889730453491, "epoch": 2.248799231508165, "grad_norm": 1.109375, "learning_rate": 0.00044954313008485457, "loss": 4.7347, "mean_token_accuracy": 0.23787372261285783, "num_tokens": 53703320.0, "step": 23410 }, { "entropy": 5.032420969009399, "epoch": 2.249279538904899, "grad_norm": 0.9921875, "learning_rate": 0.00044952149363539453, "loss": 4.7102, "mean_token_accuracy": 0.23317102640867232, "num_tokens": 53714867.0, "step": 23415 }, { "entropy": 5.101826238632202, "epoch": 2.249759846301633, "grad_norm": 1.0859375, "learning_rate": 0.0004494998531340635, "loss": 4.8373, "mean_token_accuracy": 0.2332456275820732, "num_tokens": 53726121.0, "step": 23420 }, { "entropy": 5.173738813400268, "epoch": 2.2502401536983667, "grad_norm": 1.1640625, "learning_rate": 0.0004494782085813641, "loss": 4.8415, "mean_token_accuracy": 0.22598159462213516, "num_tokens": 53736966.0, "step": 23425 }, { "entropy": 5.074115467071533, "epoch": 2.250720461095101, "grad_norm": 1.078125, "learning_rate": 0.0004494565599777988, "loss": 4.7125, "mean_token_accuracy": 0.24225749671459199, "num_tokens": 53747446.0, "step": 23430 }, { "entropy": 5.092739295959473, "epoch": 2.251200768491835, "grad_norm": 1.15625, "learning_rate": 0.00044943490732387025, "loss": 4.8496, "mean_token_accuracy": 0.2204935595393181, "num_tokens": 53759588.0, "step": 23435 }, { "entropy": 5.169422292709351, "epoch": 2.251681075888569, "grad_norm": 1.171875, "learning_rate": 0.0004494132506200811, "loss": 4.8258, "mean_token_accuracy": 0.23271931260824202, "num_tokens": 53770314.0, "step": 23440 }, { "entropy": 5.100150728225708, "epoch": 2.2521613832853027, "grad_norm": 1.078125, "learning_rate": 0.0004493915898669341, "loss": 4.7553, "mean_token_accuracy": 0.23632270991802215, "num_tokens": 53781251.0, "step": 23445 }, { "entropy": 5.1533149719238285, "epoch": 2.2526416906820366, "grad_norm": 1.0546875, "learning_rate": 0.0004493699250649323, "loss": 4.8618, "mean_token_accuracy": 0.2235618367791176, "num_tokens": 53792676.0, "step": 23450 }, { "entropy": 5.1419731140136715, "epoch": 2.2531219980787704, "grad_norm": 1.2265625, "learning_rate": 0.0004493482562145786, "loss": 4.8624, "mean_token_accuracy": 0.22552858293056488, "num_tokens": 53802714.0, "step": 23455 }, { "entropy": 5.067761516571045, "epoch": 2.2536023054755043, "grad_norm": 1.1796875, "learning_rate": 0.00044932658331637605, "loss": 4.8241, "mean_token_accuracy": 0.2310011625289917, "num_tokens": 53815043.0, "step": 23460 }, { "entropy": 5.161076307296753, "epoch": 2.254082612872238, "grad_norm": 1.0625, "learning_rate": 0.0004493049063708279, "loss": 4.8072, "mean_token_accuracy": 0.22329722046852113, "num_tokens": 53827903.0, "step": 23465 }, { "entropy": 5.223559808731079, "epoch": 2.254562920268972, "grad_norm": 1.1484375, "learning_rate": 0.00044928322537843746, "loss": 4.8834, "mean_token_accuracy": 0.22471548467874528, "num_tokens": 53839574.0, "step": 23470 }, { "entropy": 5.110046672821045, "epoch": 2.255043227665706, "grad_norm": 1.34375, "learning_rate": 0.00044926154033970793, "loss": 4.7297, "mean_token_accuracy": 0.2362123802304268, "num_tokens": 53850488.0, "step": 23475 }, { "entropy": 5.1028900146484375, "epoch": 2.25552353506244, "grad_norm": 1.21875, "learning_rate": 0.000449239851255143, "loss": 4.7653, "mean_token_accuracy": 0.23202899992465972, "num_tokens": 53860766.0, "step": 23480 }, { "entropy": 5.041772985458374, "epoch": 2.2560038424591737, "grad_norm": 1.1875, "learning_rate": 0.00044921815812524606, "loss": 4.7769, "mean_token_accuracy": 0.22772487998008728, "num_tokens": 53872088.0, "step": 23485 }, { "entropy": 5.1281756401062015, "epoch": 2.2564841498559076, "grad_norm": 1.1171875, "learning_rate": 0.00044919646095052077, "loss": 4.8177, "mean_token_accuracy": 0.22846906781196594, "num_tokens": 53883302.0, "step": 23490 }, { "entropy": 5.061516618728637, "epoch": 2.2569644572526415, "grad_norm": 1.2265625, "learning_rate": 0.000449174759731471, "loss": 4.7135, "mean_token_accuracy": 0.2427988812327385, "num_tokens": 53894310.0, "step": 23495 }, { "entropy": 5.1370398044586185, "epoch": 2.2574447646493754, "grad_norm": 1.1953125, "learning_rate": 0.00044915305446860046, "loss": 4.8432, "mean_token_accuracy": 0.22062044441699982, "num_tokens": 53906760.0, "step": 23500 }, { "entropy": 5.098146247863769, "epoch": 2.2579250720461097, "grad_norm": 1.2421875, "learning_rate": 0.00044913134516241305, "loss": 4.7964, "mean_token_accuracy": 0.2295202597975731, "num_tokens": 53918758.0, "step": 23505 }, { "entropy": 5.026399803161621, "epoch": 2.2584053794428436, "grad_norm": 1.1328125, "learning_rate": 0.0004491096318134129, "loss": 4.7536, "mean_token_accuracy": 0.2342957004904747, "num_tokens": 53932574.0, "step": 23510 }, { "entropy": 5.158463478088379, "epoch": 2.2588856868395775, "grad_norm": 1.15625, "learning_rate": 0.0004490879144221042, "loss": 4.8358, "mean_token_accuracy": 0.2323979079723358, "num_tokens": 53943801.0, "step": 23515 }, { "entropy": 5.069459533691406, "epoch": 2.2593659942363113, "grad_norm": 1.2265625, "learning_rate": 0.00044906619298899097, "loss": 4.7704, "mean_token_accuracy": 0.24293200224637984, "num_tokens": 53955458.0, "step": 23520 }, { "entropy": 5.143928813934326, "epoch": 2.2598463016330452, "grad_norm": 1.171875, "learning_rate": 0.0004490444675145777, "loss": 4.8139, "mean_token_accuracy": 0.2232506200671196, "num_tokens": 53967242.0, "step": 23525 }, { "entropy": 5.126056480407715, "epoch": 2.260326609029779, "grad_norm": 1.125, "learning_rate": 0.0004490227379993686, "loss": 4.8327, "mean_token_accuracy": 0.2304875761270523, "num_tokens": 53978953.0, "step": 23530 }, { "entropy": 5.167446851730347, "epoch": 2.260806916426513, "grad_norm": 1.15625, "learning_rate": 0.0004490010044438683, "loss": 4.8344, "mean_token_accuracy": 0.22517274022102357, "num_tokens": 53990553.0, "step": 23535 }, { "entropy": 5.10265703201294, "epoch": 2.261287223823247, "grad_norm": 1.1328125, "learning_rate": 0.00044897926684858133, "loss": 4.819, "mean_token_accuracy": 0.23733688592910768, "num_tokens": 54002187.0, "step": 23540 }, { "entropy": 5.160494375228882, "epoch": 2.2617675312199808, "grad_norm": 1.078125, "learning_rate": 0.00044895752521401246, "loss": 4.7928, "mean_token_accuracy": 0.22392333447933196, "num_tokens": 54014202.0, "step": 23545 }, { "entropy": 5.08729305267334, "epoch": 2.2622478386167146, "grad_norm": 1.1015625, "learning_rate": 0.0004489357795406663, "loss": 4.7955, "mean_token_accuracy": 0.2341983512043953, "num_tokens": 54026286.0, "step": 23550 }, { "entropy": 5.094353675842285, "epoch": 2.2627281460134485, "grad_norm": 1.1484375, "learning_rate": 0.0004489140298290479, "loss": 4.8109, "mean_token_accuracy": 0.22887043058872222, "num_tokens": 54038845.0, "step": 23555 }, { "entropy": 5.211902523040772, "epoch": 2.2632084534101824, "grad_norm": 1.234375, "learning_rate": 0.00044889227607966217, "loss": 4.8762, "mean_token_accuracy": 0.22510544210672379, "num_tokens": 54049991.0, "step": 23560 }, { "entropy": 5.100750732421875, "epoch": 2.2636887608069163, "grad_norm": 1.1875, "learning_rate": 0.00044887051829301406, "loss": 4.7642, "mean_token_accuracy": 0.2320538729429245, "num_tokens": 54061408.0, "step": 23565 }, { "entropy": 5.088111400604248, "epoch": 2.26416906820365, "grad_norm": 1.109375, "learning_rate": 0.00044884875646960886, "loss": 4.7678, "mean_token_accuracy": 0.22809007316827773, "num_tokens": 54072670.0, "step": 23570 }, { "entropy": 5.122489023208618, "epoch": 2.264649375600384, "grad_norm": 1.1484375, "learning_rate": 0.00044882699060995175, "loss": 4.7954, "mean_token_accuracy": 0.2249316856265068, "num_tokens": 54084555.0, "step": 23575 }, { "entropy": 5.021838665008545, "epoch": 2.2651296829971184, "grad_norm": 1.1015625, "learning_rate": 0.0004488052207145481, "loss": 4.7386, "mean_token_accuracy": 0.23619322329759598, "num_tokens": 54096764.0, "step": 23580 }, { "entropy": 5.170163249969482, "epoch": 2.2656099903938522, "grad_norm": 1.0390625, "learning_rate": 0.00044878344678390324, "loss": 4.831, "mean_token_accuracy": 0.2345489665865898, "num_tokens": 54109101.0, "step": 23585 }, { "entropy": 5.196159839630127, "epoch": 2.266090297790586, "grad_norm": 1.1171875, "learning_rate": 0.00044876166881852286, "loss": 4.8298, "mean_token_accuracy": 0.22761160433292388, "num_tokens": 54120217.0, "step": 23590 }, { "entropy": 5.139959335327148, "epoch": 2.26657060518732, "grad_norm": 1.125, "learning_rate": 0.0004487398868189125, "loss": 4.8119, "mean_token_accuracy": 0.2269844725728035, "num_tokens": 54132059.0, "step": 23595 }, { "entropy": 5.175757598876953, "epoch": 2.267050912584054, "grad_norm": 1.1015625, "learning_rate": 0.00044871810078557777, "loss": 4.8604, "mean_token_accuracy": 0.22221217453479766, "num_tokens": 54141928.0, "step": 23600 }, { "entropy": 5.15883059501648, "epoch": 2.2675312199807878, "grad_norm": 1.203125, "learning_rate": 0.0004486963107190247, "loss": 4.8823, "mean_token_accuracy": 0.22258317917585374, "num_tokens": 54153052.0, "step": 23605 }, { "entropy": 5.18836932182312, "epoch": 2.2680115273775217, "grad_norm": 1.1171875, "learning_rate": 0.00044867451661975894, "loss": 4.8245, "mean_token_accuracy": 0.22355309426784514, "num_tokens": 54163539.0, "step": 23610 }, { "entropy": 5.065151262283325, "epoch": 2.2684918347742555, "grad_norm": 1.125, "learning_rate": 0.00044865271848828673, "loss": 4.7707, "mean_token_accuracy": 0.2353520065546036, "num_tokens": 54176313.0, "step": 23615 }, { "entropy": 5.156122493743896, "epoch": 2.2689721421709894, "grad_norm": 1.109375, "learning_rate": 0.000448630916325114, "loss": 4.8559, "mean_token_accuracy": 0.22095520347356795, "num_tokens": 54187080.0, "step": 23620 }, { "entropy": 5.154760360717773, "epoch": 2.2694524495677233, "grad_norm": 1.265625, "learning_rate": 0.000448609110130747, "loss": 4.8292, "mean_token_accuracy": 0.23027612417936325, "num_tokens": 54197856.0, "step": 23625 }, { "entropy": 5.1559305667877195, "epoch": 2.269932756964457, "grad_norm": 1.1640625, "learning_rate": 0.00044858729990569193, "loss": 4.8263, "mean_token_accuracy": 0.22898895889520646, "num_tokens": 54210865.0, "step": 23630 }, { "entropy": 5.0686627388000485, "epoch": 2.270413064361191, "grad_norm": 1.1640625, "learning_rate": 0.00044856548565045523, "loss": 4.7199, "mean_token_accuracy": 0.22956303358078003, "num_tokens": 54222719.0, "step": 23635 }, { "entropy": 5.11489691734314, "epoch": 2.270893371757925, "grad_norm": 1.125, "learning_rate": 0.00044854366736554323, "loss": 4.7648, "mean_token_accuracy": 0.23259628117084502, "num_tokens": 54233874.0, "step": 23640 }, { "entropy": 5.0634620666503904, "epoch": 2.271373679154659, "grad_norm": 1.1796875, "learning_rate": 0.00044852184505146274, "loss": 4.6935, "mean_token_accuracy": 0.2324426457285881, "num_tokens": 54245966.0, "step": 23645 }, { "entropy": 5.093727016448975, "epoch": 2.2718539865513927, "grad_norm": 1.1484375, "learning_rate": 0.0004485000187087202, "loss": 4.7653, "mean_token_accuracy": 0.23520055264234543, "num_tokens": 54256468.0, "step": 23650 }, { "entropy": 5.049545001983643, "epoch": 2.272334293948127, "grad_norm": 1.0859375, "learning_rate": 0.0004484781883378224, "loss": 4.7044, "mean_token_accuracy": 0.2259022116661072, "num_tokens": 54266362.0, "step": 23655 }, { "entropy": 5.10016827583313, "epoch": 2.2728146013448605, "grad_norm": 1.15625, "learning_rate": 0.00044845635393927623, "loss": 4.8972, "mean_token_accuracy": 0.21926648616790773, "num_tokens": 54278365.0, "step": 23660 }, { "entropy": 5.151840925216675, "epoch": 2.273294908741595, "grad_norm": 1.1015625, "learning_rate": 0.0004484345155135886, "loss": 4.8911, "mean_token_accuracy": 0.22861198633909224, "num_tokens": 54289310.0, "step": 23665 }, { "entropy": 5.096772718429565, "epoch": 2.2737752161383287, "grad_norm": 1.09375, "learning_rate": 0.0004484126730612664, "loss": 4.7457, "mean_token_accuracy": 0.2337497740983963, "num_tokens": 54300421.0, "step": 23670 }, { "entropy": 5.0949928760528564, "epoch": 2.2742555235350626, "grad_norm": 1.59375, "learning_rate": 0.000448390826582817, "loss": 4.8159, "mean_token_accuracy": 0.2337231382727623, "num_tokens": 54312803.0, "step": 23675 }, { "entropy": 5.162675762176514, "epoch": 2.2747358309317964, "grad_norm": 1.1640625, "learning_rate": 0.00044836897607874744, "loss": 4.8348, "mean_token_accuracy": 0.23068183213472365, "num_tokens": 54324651.0, "step": 23680 }, { "entropy": 5.091884756088257, "epoch": 2.2752161383285303, "grad_norm": 1.1171875, "learning_rate": 0.000448347121549565, "loss": 4.7145, "mean_token_accuracy": 0.24177918434143067, "num_tokens": 54334841.0, "step": 23685 }, { "entropy": 5.127256727218628, "epoch": 2.275696445725264, "grad_norm": 1.140625, "learning_rate": 0.0004483252629957771, "loss": 4.7806, "mean_token_accuracy": 0.23010292500257493, "num_tokens": 54346685.0, "step": 23690 }, { "entropy": 5.068298482894898, "epoch": 2.276176753121998, "grad_norm": 1.1953125, "learning_rate": 0.00044830340041789133, "loss": 4.8236, "mean_token_accuracy": 0.2222338065505028, "num_tokens": 54358710.0, "step": 23695 }, { "entropy": 5.092076301574707, "epoch": 2.276657060518732, "grad_norm": 1.1171875, "learning_rate": 0.0004482815338164152, "loss": 4.8428, "mean_token_accuracy": 0.23226935118436814, "num_tokens": 54369695.0, "step": 23700 }, { "entropy": 5.104079532623291, "epoch": 2.277137367915466, "grad_norm": 1.125, "learning_rate": 0.0004482596631918564, "loss": 4.7529, "mean_token_accuracy": 0.23143748939037323, "num_tokens": 54381605.0, "step": 23705 }, { "entropy": 5.129797792434692, "epoch": 2.2776176753121997, "grad_norm": 1.15625, "learning_rate": 0.00044823778854472267, "loss": 4.7956, "mean_token_accuracy": 0.23360486328601837, "num_tokens": 54392309.0, "step": 23710 }, { "entropy": 5.115341234207153, "epoch": 2.2780979827089336, "grad_norm": 1.203125, "learning_rate": 0.0004482159098755219, "loss": 4.8227, "mean_token_accuracy": 0.2333234429359436, "num_tokens": 54403195.0, "step": 23715 }, { "entropy": 5.061098051071167, "epoch": 2.2785782901056675, "grad_norm": 1.2734375, "learning_rate": 0.0004481940271847621, "loss": 4.7589, "mean_token_accuracy": 0.23784710019826888, "num_tokens": 54415105.0, "step": 23720 }, { "entropy": 5.095363569259644, "epoch": 2.2790585975024014, "grad_norm": 1.15625, "learning_rate": 0.0004481721404729513, "loss": 4.7834, "mean_token_accuracy": 0.2298209086060524, "num_tokens": 54425798.0, "step": 23725 }, { "entropy": 5.179401540756226, "epoch": 2.2795389048991357, "grad_norm": 1.0859375, "learning_rate": 0.0004481502497405975, "loss": 4.8219, "mean_token_accuracy": 0.2277689814567566, "num_tokens": 54437254.0, "step": 23730 }, { "entropy": 5.160029888153076, "epoch": 2.280019212295869, "grad_norm": 1.1640625, "learning_rate": 0.0004481283549882091, "loss": 4.7782, "mean_token_accuracy": 0.2246842786669731, "num_tokens": 54448152.0, "step": 23735 }, { "entropy": 5.111397838592529, "epoch": 2.2804995196926034, "grad_norm": 1.1328125, "learning_rate": 0.00044810645621629443, "loss": 4.8327, "mean_token_accuracy": 0.22581015527248383, "num_tokens": 54460066.0, "step": 23740 }, { "entropy": 5.1661521911621096, "epoch": 2.2809798270893373, "grad_norm": 1.0625, "learning_rate": 0.00044808455342536176, "loss": 4.8551, "mean_token_accuracy": 0.22923391908407212, "num_tokens": 54471769.0, "step": 23745 }, { "entropy": 5.098841714859009, "epoch": 2.281460134486071, "grad_norm": 1.09375, "learning_rate": 0.00044806264661591976, "loss": 4.6924, "mean_token_accuracy": 0.24215531200170518, "num_tokens": 54483630.0, "step": 23750 }, { "entropy": 5.086545419692993, "epoch": 2.281940441882805, "grad_norm": 1.109375, "learning_rate": 0.0004480407357884771, "loss": 4.7871, "mean_token_accuracy": 0.22741892635822297, "num_tokens": 54495696.0, "step": 23755 }, { "entropy": 5.096376657485962, "epoch": 2.282420749279539, "grad_norm": 1.09375, "learning_rate": 0.00044801882094354226, "loss": 4.7937, "mean_token_accuracy": 0.22762938886880874, "num_tokens": 54507153.0, "step": 23760 }, { "entropy": 5.191448068618774, "epoch": 2.282901056676273, "grad_norm": 1.1484375, "learning_rate": 0.0004479969020816242, "loss": 4.8577, "mean_token_accuracy": 0.227578766644001, "num_tokens": 54519922.0, "step": 23765 }, { "entropy": 5.086835479736328, "epoch": 2.2833813640730067, "grad_norm": 1.0078125, "learning_rate": 0.00044797497920323175, "loss": 4.7584, "mean_token_accuracy": 0.2371147409081459, "num_tokens": 54531857.0, "step": 23770 }, { "entropy": 5.093832492828369, "epoch": 2.2838616714697406, "grad_norm": 1.046875, "learning_rate": 0.0004479530523088739, "loss": 4.7632, "mean_token_accuracy": 0.24112324118614198, "num_tokens": 54542079.0, "step": 23775 }, { "entropy": 5.015680027008057, "epoch": 2.2843419788664745, "grad_norm": 1.1328125, "learning_rate": 0.0004479311213990598, "loss": 4.7534, "mean_token_accuracy": 0.23263996243476867, "num_tokens": 54552722.0, "step": 23780 }, { "entropy": 5.055504846572876, "epoch": 2.2848222862632084, "grad_norm": 1.1328125, "learning_rate": 0.00044790918647429854, "loss": 4.753, "mean_token_accuracy": 0.23083293735980986, "num_tokens": 54564224.0, "step": 23785 }, { "entropy": 5.142749786376953, "epoch": 2.2853025936599423, "grad_norm": 1.1796875, "learning_rate": 0.00044788724753509935, "loss": 4.731, "mean_token_accuracy": 0.23213585019111632, "num_tokens": 54575553.0, "step": 23790 }, { "entropy": 5.122098827362061, "epoch": 2.285782901056676, "grad_norm": 1.0859375, "learning_rate": 0.0004478653045819717, "loss": 4.9048, "mean_token_accuracy": 0.22266788631677628, "num_tokens": 54587315.0, "step": 23795 }, { "entropy": 5.168012189865112, "epoch": 2.28626320845341, "grad_norm": 1.0625, "learning_rate": 0.0004478433576154249, "loss": 4.817, "mean_token_accuracy": 0.23212073296308516, "num_tokens": 54598776.0, "step": 23800 }, { "entropy": 5.197915554046631, "epoch": 2.286743515850144, "grad_norm": 1.0546875, "learning_rate": 0.0004478214066359687, "loss": 4.8842, "mean_token_accuracy": 0.2148657962679863, "num_tokens": 54610272.0, "step": 23805 }, { "entropy": 5.140944910049439, "epoch": 2.287223823246878, "grad_norm": 1.0625, "learning_rate": 0.00044779945164411254, "loss": 4.8522, "mean_token_accuracy": 0.22315765023231507, "num_tokens": 54622105.0, "step": 23810 }, { "entropy": 5.1300232887268065, "epoch": 2.287704130643612, "grad_norm": 1.171875, "learning_rate": 0.0004477774926403662, "loss": 4.8058, "mean_token_accuracy": 0.23476217389106752, "num_tokens": 54631771.0, "step": 23815 }, { "entropy": 5.058514976501465, "epoch": 2.288184438040346, "grad_norm": 1.1171875, "learning_rate": 0.0004477555296252396, "loss": 4.7358, "mean_token_accuracy": 0.23375988900661468, "num_tokens": 54642724.0, "step": 23820 }, { "entropy": 5.0570252418518065, "epoch": 2.28866474543708, "grad_norm": 1.1796875, "learning_rate": 0.00044773356259924255, "loss": 4.7505, "mean_token_accuracy": 0.2327996626496315, "num_tokens": 54655126.0, "step": 23825 }, { "entropy": 5.049631500244141, "epoch": 2.2891450528338138, "grad_norm": 1.21875, "learning_rate": 0.00044771159156288505, "loss": 4.7099, "mean_token_accuracy": 0.23850573152303695, "num_tokens": 54665334.0, "step": 23830 }, { "entropy": 5.051522731781006, "epoch": 2.2896253602305476, "grad_norm": 1.2109375, "learning_rate": 0.0004476896165166772, "loss": 4.7049, "mean_token_accuracy": 0.23670212179422379, "num_tokens": 54676205.0, "step": 23835 }, { "entropy": 5.1268744468688965, "epoch": 2.2901056676272815, "grad_norm": 1.1171875, "learning_rate": 0.00044766763746112936, "loss": 4.8099, "mean_token_accuracy": 0.23005885928869246, "num_tokens": 54686586.0, "step": 23840 }, { "entropy": 5.132952785491943, "epoch": 2.2905859750240154, "grad_norm": 1.21875, "learning_rate": 0.0004476456543967517, "loss": 4.7902, "mean_token_accuracy": 0.23009070456027986, "num_tokens": 54698879.0, "step": 23845 }, { "entropy": 5.073030281066894, "epoch": 2.2910662824207493, "grad_norm": 1.03125, "learning_rate": 0.00044762366732405454, "loss": 4.765, "mean_token_accuracy": 0.23329982608556749, "num_tokens": 54710140.0, "step": 23850 }, { "entropy": 5.046080446243286, "epoch": 2.291546589817483, "grad_norm": 1.09375, "learning_rate": 0.0004476016762435484, "loss": 4.6823, "mean_token_accuracy": 0.23559171855449676, "num_tokens": 54721317.0, "step": 23855 }, { "entropy": 5.207999563217163, "epoch": 2.292026897214217, "grad_norm": 1.2109375, "learning_rate": 0.0004475796811557439, "loss": 4.9528, "mean_token_accuracy": 0.21821181774139403, "num_tokens": 54733329.0, "step": 23860 }, { "entropy": 5.176177167892456, "epoch": 2.292507204610951, "grad_norm": 1.0859375, "learning_rate": 0.00044755768206115155, "loss": 4.8303, "mean_token_accuracy": 0.220322947204113, "num_tokens": 54745230.0, "step": 23865 }, { "entropy": 5.09772367477417, "epoch": 2.292987512007685, "grad_norm": 1.1328125, "learning_rate": 0.0004475356789602823, "loss": 4.8181, "mean_token_accuracy": 0.2298712059855461, "num_tokens": 54757428.0, "step": 23870 }, { "entropy": 5.073602104187012, "epoch": 2.2934678194044187, "grad_norm": 1.1875, "learning_rate": 0.00044751367185364696, "loss": 4.6947, "mean_token_accuracy": 0.2402465119957924, "num_tokens": 54768886.0, "step": 23875 }, { "entropy": 5.1595619201660154, "epoch": 2.2939481268011526, "grad_norm": 1.1171875, "learning_rate": 0.00044749166074175634, "loss": 4.9136, "mean_token_accuracy": 0.2256806194782257, "num_tokens": 54780243.0, "step": 23880 }, { "entropy": 5.073761415481568, "epoch": 2.2944284341978864, "grad_norm": 1.1015625, "learning_rate": 0.00044746964562512154, "loss": 4.69, "mean_token_accuracy": 0.2334916412830353, "num_tokens": 54792462.0, "step": 23885 }, { "entropy": 5.176160907745361, "epoch": 2.2949087415946208, "grad_norm": 1.2109375, "learning_rate": 0.00044744762650425376, "loss": 4.8299, "mean_token_accuracy": 0.22740929424762726, "num_tokens": 54804412.0, "step": 23890 }, { "entropy": 5.115084457397461, "epoch": 2.2953890489913547, "grad_norm": 1.140625, "learning_rate": 0.00044742560337966415, "loss": 4.7857, "mean_token_accuracy": 0.23885392695665358, "num_tokens": 54814910.0, "step": 23895 }, { "entropy": 5.088543176651001, "epoch": 2.2958693563880885, "grad_norm": 1.0625, "learning_rate": 0.000447403576251864, "loss": 4.7708, "mean_token_accuracy": 0.23376950323581697, "num_tokens": 54826500.0, "step": 23900 }, { "entropy": 5.084391689300537, "epoch": 2.2963496637848224, "grad_norm": 1.171875, "learning_rate": 0.0004473815451213648, "loss": 4.777, "mean_token_accuracy": 0.22784235030412675, "num_tokens": 54837682.0, "step": 23905 }, { "entropy": 5.085320568084716, "epoch": 2.2968299711815563, "grad_norm": 1.09375, "learning_rate": 0.0004473595099886779, "loss": 4.7319, "mean_token_accuracy": 0.23291932046413422, "num_tokens": 54848937.0, "step": 23910 }, { "entropy": 5.1188312530517575, "epoch": 2.29731027857829, "grad_norm": 1.171875, "learning_rate": 0.0004473374708543151, "loss": 4.6921, "mean_token_accuracy": 0.2435552567243576, "num_tokens": 54859384.0, "step": 23915 }, { "entropy": 5.03520393371582, "epoch": 2.297790585975024, "grad_norm": 1.09375, "learning_rate": 0.0004473154277187878, "loss": 4.7073, "mean_token_accuracy": 0.24123682230710983, "num_tokens": 54871280.0, "step": 23920 }, { "entropy": 5.054597043991089, "epoch": 2.298270893371758, "grad_norm": 1.0859375, "learning_rate": 0.00044729338058260805, "loss": 4.6948, "mean_token_accuracy": 0.2395238071680069, "num_tokens": 54882699.0, "step": 23925 }, { "entropy": 5.184109544754028, "epoch": 2.298751200768492, "grad_norm": 1.0546875, "learning_rate": 0.0004472713294462876, "loss": 4.841, "mean_token_accuracy": 0.2287002757191658, "num_tokens": 54894041.0, "step": 23930 }, { "entropy": 5.16756534576416, "epoch": 2.2992315081652257, "grad_norm": 1.109375, "learning_rate": 0.00044724927431033843, "loss": 4.8109, "mean_token_accuracy": 0.22440232038497926, "num_tokens": 54905007.0, "step": 23935 }, { "entropy": 5.154220914840698, "epoch": 2.2997118155619596, "grad_norm": 1.140625, "learning_rate": 0.0004472272151752725, "loss": 4.8183, "mean_token_accuracy": 0.2311574250459671, "num_tokens": 54915735.0, "step": 23940 }, { "entropy": 5.02349214553833, "epoch": 2.3001921229586935, "grad_norm": 1.0546875, "learning_rate": 0.0004472051520416022, "loss": 4.6565, "mean_token_accuracy": 0.235342113673687, "num_tokens": 54926350.0, "step": 23945 }, { "entropy": 5.0691643238067625, "epoch": 2.3006724303554273, "grad_norm": 1.0703125, "learning_rate": 0.0004471830849098395, "loss": 4.8175, "mean_token_accuracy": 0.22720725387334822, "num_tokens": 54938354.0, "step": 23950 }, { "entropy": 5.146879768371582, "epoch": 2.3011527377521612, "grad_norm": 1.1875, "learning_rate": 0.00044716101378049683, "loss": 4.8074, "mean_token_accuracy": 0.2259829819202423, "num_tokens": 54949440.0, "step": 23955 }, { "entropy": 5.15836272239685, "epoch": 2.301633045148895, "grad_norm": 1.09375, "learning_rate": 0.00044713893865408667, "loss": 4.7659, "mean_token_accuracy": 0.2332813635468483, "num_tokens": 54961751.0, "step": 23960 }, { "entropy": 5.211078453063965, "epoch": 2.3021133525456294, "grad_norm": 1.078125, "learning_rate": 0.0004471168595311215, "loss": 4.9402, "mean_token_accuracy": 0.21601903587579727, "num_tokens": 54975415.0, "step": 23965 }, { "entropy": 5.160834455490113, "epoch": 2.302593659942363, "grad_norm": 1.1328125, "learning_rate": 0.00044709477641211395, "loss": 4.7611, "mean_token_accuracy": 0.22833193838596344, "num_tokens": 54986599.0, "step": 23970 }, { "entropy": 5.120666885375977, "epoch": 2.303073967339097, "grad_norm": 1.140625, "learning_rate": 0.0004470726892975767, "loss": 4.7707, "mean_token_accuracy": 0.23147224336862565, "num_tokens": 54997428.0, "step": 23975 }, { "entropy": 5.139890432357788, "epoch": 2.303554274735831, "grad_norm": 1.2734375, "learning_rate": 0.00044705059818802255, "loss": 4.8604, "mean_token_accuracy": 0.21991382539272308, "num_tokens": 55008434.0, "step": 23980 }, { "entropy": 5.044625473022461, "epoch": 2.304034582132565, "grad_norm": 1.203125, "learning_rate": 0.0004470285030839644, "loss": 4.7534, "mean_token_accuracy": 0.2294306129217148, "num_tokens": 55019834.0, "step": 23985 }, { "entropy": 5.145654344558716, "epoch": 2.304514889529299, "grad_norm": 1.09375, "learning_rate": 0.00044700640398591526, "loss": 4.7596, "mean_token_accuracy": 0.23586667478084564, "num_tokens": 55030432.0, "step": 23990 }, { "entropy": 5.18192458152771, "epoch": 2.3049951969260327, "grad_norm": 1.125, "learning_rate": 0.0004469843008943881, "loss": 4.8233, "mean_token_accuracy": 0.2283098965883255, "num_tokens": 55041482.0, "step": 23995 }, { "entropy": 5.060428190231323, "epoch": 2.3054755043227666, "grad_norm": 1.0625, "learning_rate": 0.0004469621938098962, "loss": 4.683, "mean_token_accuracy": 0.23807364255189895, "num_tokens": 55051894.0, "step": 24000 }, { "epoch": 2.3054755043227666, "eval_entropy": 4.882824030810598, "eval_loss": 4.924381732940674, "eval_mean_token_accuracy": 0.23270893815826754, "eval_num_tokens": 55051894.0, "eval_runtime": 26.627, "eval_samples_per_second": 1232.394, "eval_steps_per_second": 154.054, "step": 24000 }, { "entropy": 5.088639545440674, "epoch": 2.3059558117195005, "grad_norm": 1.15625, "learning_rate": 0.0004469400827329528, "loss": 4.8239, "mean_token_accuracy": 0.23225966691970826, "num_tokens": 55063907.0, "step": 24005 }, { "entropy": 5.140566349029541, "epoch": 2.3064361191162344, "grad_norm": 1.21875, "learning_rate": 0.0004469179676640712, "loss": 4.7989, "mean_token_accuracy": 0.22386384904384612, "num_tokens": 55076004.0, "step": 24010 }, { "entropy": 5.029444885253906, "epoch": 2.3069164265129682, "grad_norm": 1.1484375, "learning_rate": 0.0004468958486037649, "loss": 4.6965, "mean_token_accuracy": 0.24416131228208543, "num_tokens": 55087506.0, "step": 24015 }, { "entropy": 5.067629337310791, "epoch": 2.307396733909702, "grad_norm": 1.1171875, "learning_rate": 0.0004468737255525474, "loss": 4.7896, "mean_token_accuracy": 0.22917503416538237, "num_tokens": 55099406.0, "step": 24020 }, { "entropy": 5.218100070953369, "epoch": 2.307877041306436, "grad_norm": 1.0546875, "learning_rate": 0.0004468515985109324, "loss": 4.8918, "mean_token_accuracy": 0.219009730219841, "num_tokens": 55111733.0, "step": 24025 }, { "entropy": 5.145392227172851, "epoch": 2.30835734870317, "grad_norm": 1.09375, "learning_rate": 0.0004468294674794335, "loss": 4.7831, "mean_token_accuracy": 0.22559967041015624, "num_tokens": 55123455.0, "step": 24030 }, { "entropy": 5.125941610336303, "epoch": 2.3088376560999038, "grad_norm": 1.0625, "learning_rate": 0.0004468073324585647, "loss": 4.7914, "mean_token_accuracy": 0.22934773564338684, "num_tokens": 55134284.0, "step": 24035 }, { "entropy": 5.141989803314209, "epoch": 2.309317963496638, "grad_norm": 1.0625, "learning_rate": 0.0004467851934488398, "loss": 4.8032, "mean_token_accuracy": 0.22890194356441498, "num_tokens": 55147224.0, "step": 24040 }, { "entropy": 5.089644145965576, "epoch": 2.3097982708933715, "grad_norm": 1.09375, "learning_rate": 0.0004467630504507727, "loss": 4.7432, "mean_token_accuracy": 0.2306762605905533, "num_tokens": 55158144.0, "step": 24045 }, { "entropy": 5.090526151657104, "epoch": 2.310278578290106, "grad_norm": 1.0703125, "learning_rate": 0.0004467409034648776, "loss": 4.8517, "mean_token_accuracy": 0.23368075489997864, "num_tokens": 55171522.0, "step": 24050 }, { "entropy": 5.121523189544678, "epoch": 2.3107588856868397, "grad_norm": 1.21875, "learning_rate": 0.0004467187524916688, "loss": 4.7947, "mean_token_accuracy": 0.23201913088560105, "num_tokens": 55182097.0, "step": 24055 }, { "entropy": 5.181788063049316, "epoch": 2.3112391930835736, "grad_norm": 1.125, "learning_rate": 0.0004466965975316604, "loss": 4.8179, "mean_token_accuracy": 0.21985867619514465, "num_tokens": 55192287.0, "step": 24060 }, { "entropy": 5.124038362503052, "epoch": 2.3117195004803075, "grad_norm": 1.1328125, "learning_rate": 0.00044667443858536685, "loss": 4.8373, "mean_token_accuracy": 0.23194748163223267, "num_tokens": 55204049.0, "step": 24065 }, { "entropy": 5.010155916213989, "epoch": 2.3121998078770414, "grad_norm": 1.09375, "learning_rate": 0.0004466522756533026, "loss": 4.6349, "mean_token_accuracy": 0.24204261302948, "num_tokens": 55214969.0, "step": 24070 }, { "entropy": 5.138979721069336, "epoch": 2.3126801152737753, "grad_norm": 1.1875, "learning_rate": 0.0004466301087359822, "loss": 4.8743, "mean_token_accuracy": 0.2311398297548294, "num_tokens": 55226362.0, "step": 24075 }, { "entropy": 5.224458885192871, "epoch": 2.313160422670509, "grad_norm": 1.1640625, "learning_rate": 0.00044660793783392035, "loss": 4.8783, "mean_token_accuracy": 0.225470569729805, "num_tokens": 55237345.0, "step": 24080 }, { "entropy": 5.187859106063843, "epoch": 2.313640730067243, "grad_norm": 1.171875, "learning_rate": 0.0004465857629476317, "loss": 4.8906, "mean_token_accuracy": 0.22839065492153168, "num_tokens": 55248426.0, "step": 24085 }, { "entropy": 5.087028169631958, "epoch": 2.314121037463977, "grad_norm": 1.1484375, "learning_rate": 0.0004465635840776312, "loss": 4.8218, "mean_token_accuracy": 0.229251691699028, "num_tokens": 55259017.0, "step": 24090 }, { "entropy": 5.154250049591065, "epoch": 2.314601344860711, "grad_norm": 1.0859375, "learning_rate": 0.00044654140122443373, "loss": 4.8562, "mean_token_accuracy": 0.22954557836055756, "num_tokens": 55270074.0, "step": 24095 }, { "entropy": 5.123119354248047, "epoch": 2.3150816522574447, "grad_norm": 1.09375, "learning_rate": 0.0004465192143885542, "loss": 4.7823, "mean_token_accuracy": 0.23124384135007858, "num_tokens": 55282291.0, "step": 24100 }, { "entropy": 5.168704271316528, "epoch": 2.3155619596541785, "grad_norm": 1.1015625, "learning_rate": 0.00044649702357050787, "loss": 4.9123, "mean_token_accuracy": 0.22692507654428482, "num_tokens": 55293113.0, "step": 24105 }, { "entropy": 5.142979717254638, "epoch": 2.3160422670509124, "grad_norm": 1.296875, "learning_rate": 0.0004464748287708099, "loss": 4.8226, "mean_token_accuracy": 0.22785865962505342, "num_tokens": 55304182.0, "step": 24110 }, { "entropy": 5.087536096572876, "epoch": 2.3165225744476463, "grad_norm": 1.1953125, "learning_rate": 0.00044645262998997557, "loss": 4.7536, "mean_token_accuracy": 0.2385023683309555, "num_tokens": 55315292.0, "step": 24115 }, { "entropy": 5.09465913772583, "epoch": 2.31700288184438, "grad_norm": 1.234375, "learning_rate": 0.00044643042722852024, "loss": 4.7532, "mean_token_accuracy": 0.2335251748561859, "num_tokens": 55327000.0, "step": 24120 }, { "entropy": 5.059976387023926, "epoch": 2.3174831892411145, "grad_norm": 1.09375, "learning_rate": 0.0004464082204869595, "loss": 4.7052, "mean_token_accuracy": 0.24020502716302872, "num_tokens": 55337506.0, "step": 24125 }, { "entropy": 5.041950273513794, "epoch": 2.3179634966378484, "grad_norm": 1.109375, "learning_rate": 0.0004463860097658088, "loss": 4.6782, "mean_token_accuracy": 0.23825192600488662, "num_tokens": 55348538.0, "step": 24130 }, { "entropy": 5.19771637916565, "epoch": 2.3184438040345823, "grad_norm": 1.25, "learning_rate": 0.0004463637950655839, "loss": 4.8208, "mean_token_accuracy": 0.2281821385025978, "num_tokens": 55360501.0, "step": 24135 }, { "entropy": 5.0973663330078125, "epoch": 2.318924111431316, "grad_norm": 1.140625, "learning_rate": 0.00044634157638680054, "loss": 4.7656, "mean_token_accuracy": 0.2324952080845833, "num_tokens": 55371616.0, "step": 24140 }, { "entropy": 5.11321268081665, "epoch": 2.31940441882805, "grad_norm": 1.1484375, "learning_rate": 0.00044631935372997455, "loss": 4.8792, "mean_token_accuracy": 0.21913430839776993, "num_tokens": 55382702.0, "step": 24145 }, { "entropy": 5.1108448028564455, "epoch": 2.319884726224784, "grad_norm": 1.265625, "learning_rate": 0.0004462971270956219, "loss": 4.7452, "mean_token_accuracy": 0.23138088285923003, "num_tokens": 55394826.0, "step": 24150 }, { "entropy": 5.062004709243775, "epoch": 2.320365033621518, "grad_norm": 1.1015625, "learning_rate": 0.0004462748964842586, "loss": 4.7143, "mean_token_accuracy": 0.2407672330737114, "num_tokens": 55406584.0, "step": 24155 }, { "entropy": 5.088969039916992, "epoch": 2.3208453410182517, "grad_norm": 1.1796875, "learning_rate": 0.0004462526618964008, "loss": 4.8449, "mean_token_accuracy": 0.22762546092271804, "num_tokens": 55419640.0, "step": 24160 }, { "entropy": 5.054949760437012, "epoch": 2.3213256484149856, "grad_norm": 1.0625, "learning_rate": 0.0004462304233325647, "loss": 4.783, "mean_token_accuracy": 0.2326088547706604, "num_tokens": 55430197.0, "step": 24165 }, { "entropy": 5.0983367443084715, "epoch": 2.3218059558117194, "grad_norm": 1.1171875, "learning_rate": 0.0004462081807932666, "loss": 4.8065, "mean_token_accuracy": 0.23061417937278747, "num_tokens": 55441832.0, "step": 24170 }, { "entropy": 5.076362228393554, "epoch": 2.3222862632084533, "grad_norm": 1.1796875, "learning_rate": 0.0004461859342790229, "loss": 4.6935, "mean_token_accuracy": 0.2416341170668602, "num_tokens": 55452656.0, "step": 24175 }, { "entropy": 4.971002101898193, "epoch": 2.322766570605187, "grad_norm": 1.15625, "learning_rate": 0.0004461636837903501, "loss": 4.6705, "mean_token_accuracy": 0.23961927890777587, "num_tokens": 55463936.0, "step": 24180 }, { "entropy": 5.142798280715942, "epoch": 2.323246878001921, "grad_norm": 1.1328125, "learning_rate": 0.0004461414293277649, "loss": 4.8605, "mean_token_accuracy": 0.22794176787137985, "num_tokens": 55475885.0, "step": 24185 }, { "entropy": 5.1128401279449465, "epoch": 2.323727185398655, "grad_norm": 1.078125, "learning_rate": 0.0004461191708917839, "loss": 4.7341, "mean_token_accuracy": 0.2254903048276901, "num_tokens": 55489022.0, "step": 24190 }, { "entropy": 5.06376371383667, "epoch": 2.324207492795389, "grad_norm": 1.1953125, "learning_rate": 0.00044609690848292376, "loss": 4.7527, "mean_token_accuracy": 0.2291923373937607, "num_tokens": 55500008.0, "step": 24195 }, { "entropy": 5.1281633377075195, "epoch": 2.324687800192123, "grad_norm": 1.140625, "learning_rate": 0.0004460746421017016, "loss": 4.8874, "mean_token_accuracy": 0.2207140639424324, "num_tokens": 55511024.0, "step": 24200 }, { "entropy": 5.200454950332642, "epoch": 2.325168107588857, "grad_norm": 1.1015625, "learning_rate": 0.00044605237174863405, "loss": 4.809, "mean_token_accuracy": 0.23201199173927306, "num_tokens": 55522199.0, "step": 24205 }, { "entropy": 5.098220157623291, "epoch": 2.325648414985591, "grad_norm": 1.109375, "learning_rate": 0.0004460300974242386, "loss": 4.7585, "mean_token_accuracy": 0.23540398478507996, "num_tokens": 55533197.0, "step": 24210 }, { "entropy": 5.101766872406006, "epoch": 2.326128722382325, "grad_norm": 1.140625, "learning_rate": 0.0004460078191290319, "loss": 4.7875, "mean_token_accuracy": 0.2317366361618042, "num_tokens": 55544317.0, "step": 24215 }, { "entropy": 5.124845695495606, "epoch": 2.3266090297790587, "grad_norm": 1.2109375, "learning_rate": 0.00044598553686353153, "loss": 4.8222, "mean_token_accuracy": 0.22728889137506486, "num_tokens": 55555388.0, "step": 24220 }, { "entropy": 5.128252983093262, "epoch": 2.3270893371757926, "grad_norm": 1.1953125, "learning_rate": 0.00044596325062825476, "loss": 4.7615, "mean_token_accuracy": 0.22990836650133134, "num_tokens": 55566189.0, "step": 24225 }, { "entropy": 5.047675561904907, "epoch": 2.3275696445725265, "grad_norm": 0.9921875, "learning_rate": 0.0004459409604237189, "loss": 4.7352, "mean_token_accuracy": 0.23851094841957093, "num_tokens": 55578235.0, "step": 24230 }, { "entropy": 4.9796771049499515, "epoch": 2.3280499519692603, "grad_norm": 1.15625, "learning_rate": 0.00044591866625044154, "loss": 4.7211, "mean_token_accuracy": 0.23492682725191116, "num_tokens": 55590300.0, "step": 24235 }, { "entropy": 5.157256269454956, "epoch": 2.3285302593659942, "grad_norm": 1.1796875, "learning_rate": 0.0004458963681089403, "loss": 4.8027, "mean_token_accuracy": 0.23687927275896073, "num_tokens": 55601017.0, "step": 24240 }, { "entropy": 5.188142204284668, "epoch": 2.329010566762728, "grad_norm": 1.1484375, "learning_rate": 0.0004458740659997328, "loss": 4.8333, "mean_token_accuracy": 0.23082994669675827, "num_tokens": 55612411.0, "step": 24245 }, { "entropy": 5.150514221191406, "epoch": 2.329490874159462, "grad_norm": 1.1015625, "learning_rate": 0.0004458517599233369, "loss": 4.8387, "mean_token_accuracy": 0.22728616893291473, "num_tokens": 55623872.0, "step": 24250 }, { "entropy": 5.202387571334839, "epoch": 2.329971181556196, "grad_norm": 1.1640625, "learning_rate": 0.0004458294498802706, "loss": 4.8477, "mean_token_accuracy": 0.21881984323263168, "num_tokens": 55634887.0, "step": 24255 }, { "entropy": 5.094072580337524, "epoch": 2.3304514889529298, "grad_norm": 1.0703125, "learning_rate": 0.0004458071358710516, "loss": 4.7988, "mean_token_accuracy": 0.23121218085289003, "num_tokens": 55646286.0, "step": 24260 }, { "entropy": 5.139373064041138, "epoch": 2.3309317963496636, "grad_norm": 1.0859375, "learning_rate": 0.0004457848178961981, "loss": 4.8043, "mean_token_accuracy": 0.22870695888996123, "num_tokens": 55657647.0, "step": 24265 }, { "entropy": 5.083712530136109, "epoch": 2.3314121037463975, "grad_norm": 1.140625, "learning_rate": 0.00044576249595622833, "loss": 4.7764, "mean_token_accuracy": 0.2308934897184372, "num_tokens": 55669000.0, "step": 24270 }, { "entropy": 5.115909576416016, "epoch": 2.331892411143132, "grad_norm": 1.046875, "learning_rate": 0.0004457401700516603, "loss": 4.7746, "mean_token_accuracy": 0.22995221614837646, "num_tokens": 55680678.0, "step": 24275 }, { "entropy": 5.134755945205688, "epoch": 2.3323727185398653, "grad_norm": 1.203125, "learning_rate": 0.00044571784018301267, "loss": 4.795, "mean_token_accuracy": 0.23035201877355577, "num_tokens": 55692429.0, "step": 24280 }, { "entropy": 5.1184654712677, "epoch": 2.3328530259365996, "grad_norm": 1.1796875, "learning_rate": 0.00044569550635080365, "loss": 4.8635, "mean_token_accuracy": 0.22763997316360474, "num_tokens": 55704659.0, "step": 24285 }, { "entropy": 5.0660813331604, "epoch": 2.3333333333333335, "grad_norm": 1.15625, "learning_rate": 0.00044567316855555184, "loss": 4.6613, "mean_token_accuracy": 0.23994368612766265, "num_tokens": 55715584.0, "step": 24290 }, { "entropy": 5.162021112442017, "epoch": 2.3338136407300674, "grad_norm": 1.1015625, "learning_rate": 0.0004456508267977759, "loss": 4.8847, "mean_token_accuracy": 0.22551312744617463, "num_tokens": 55726715.0, "step": 24295 }, { "entropy": 5.064092540740967, "epoch": 2.3342939481268012, "grad_norm": 1.125, "learning_rate": 0.00044562848107799444, "loss": 4.7438, "mean_token_accuracy": 0.2381514459848404, "num_tokens": 55737069.0, "step": 24300 }, { "entropy": 5.0794895648956295, "epoch": 2.334774255523535, "grad_norm": 1.1484375, "learning_rate": 0.00044560613139672627, "loss": 4.7254, "mean_token_accuracy": 0.2401443362236023, "num_tokens": 55747774.0, "step": 24305 }, { "entropy": 5.0981381893157955, "epoch": 2.335254562920269, "grad_norm": 1.1875, "learning_rate": 0.00044558377775449036, "loss": 4.775, "mean_token_accuracy": 0.2361172690987587, "num_tokens": 55758209.0, "step": 24310 }, { "entropy": 5.187936162948608, "epoch": 2.335734870317003, "grad_norm": 1.1875, "learning_rate": 0.00044556142015180573, "loss": 4.7946, "mean_token_accuracy": 0.22627351880073548, "num_tokens": 55768668.0, "step": 24315 }, { "entropy": 5.277261114120483, "epoch": 2.3362151777137368, "grad_norm": 1.125, "learning_rate": 0.00044553905858919134, "loss": 4.9289, "mean_token_accuracy": 0.22501867413520812, "num_tokens": 55780254.0, "step": 24320 }, { "entropy": 5.141206741333008, "epoch": 2.3366954851104706, "grad_norm": 1.109375, "learning_rate": 0.0004455166930671664, "loss": 4.7969, "mean_token_accuracy": 0.2337252080440521, "num_tokens": 55790241.0, "step": 24325 }, { "entropy": 5.156115913391114, "epoch": 2.3371757925072045, "grad_norm": 1.0859375, "learning_rate": 0.00044549432358625014, "loss": 4.8835, "mean_token_accuracy": 0.22898427098989488, "num_tokens": 55802040.0, "step": 24330 }, { "entropy": 5.128282070159912, "epoch": 2.3376560999039384, "grad_norm": 1.1328125, "learning_rate": 0.000445471950146962, "loss": 4.7985, "mean_token_accuracy": 0.23188591599464417, "num_tokens": 55813926.0, "step": 24335 }, { "entropy": 5.106117057800293, "epoch": 2.3381364073006723, "grad_norm": 1.1953125, "learning_rate": 0.0004454495727498214, "loss": 4.7634, "mean_token_accuracy": 0.23359781950712205, "num_tokens": 55825281.0, "step": 24340 }, { "entropy": 5.084489583969116, "epoch": 2.338616714697406, "grad_norm": 1.2265625, "learning_rate": 0.0004454271913953478, "loss": 4.7665, "mean_token_accuracy": 0.23407331258058547, "num_tokens": 55836295.0, "step": 24345 }, { "entropy": 5.046988296508789, "epoch": 2.3390970220941405, "grad_norm": 1.09375, "learning_rate": 0.00044540480608406093, "loss": 4.7572, "mean_token_accuracy": 0.24078552424907684, "num_tokens": 55847150.0, "step": 24350 }, { "entropy": 5.127749443054199, "epoch": 2.339577329490874, "grad_norm": 1.2109375, "learning_rate": 0.0004453824168164804, "loss": 4.84, "mean_token_accuracy": 0.2280465394258499, "num_tokens": 55858135.0, "step": 24355 }, { "entropy": 5.231775188446045, "epoch": 2.3400576368876083, "grad_norm": 1.1171875, "learning_rate": 0.0004453600235931261, "loss": 4.8786, "mean_token_accuracy": 0.2284584030508995, "num_tokens": 55869319.0, "step": 24360 }, { "entropy": 5.1386302471160885, "epoch": 2.340537944284342, "grad_norm": 1.09375, "learning_rate": 0.000445337626414518, "loss": 4.851, "mean_token_accuracy": 0.2288869395852089, "num_tokens": 55881383.0, "step": 24365 }, { "entropy": 5.093533611297607, "epoch": 2.341018251681076, "grad_norm": 1.1796875, "learning_rate": 0.00044531522528117593, "loss": 4.7657, "mean_token_accuracy": 0.23314369469881058, "num_tokens": 55893726.0, "step": 24370 }, { "entropy": 5.150975704193115, "epoch": 2.34149855907781, "grad_norm": 1.0390625, "learning_rate": 0.00044529282019362007, "loss": 4.8339, "mean_token_accuracy": 0.23597938418388367, "num_tokens": 55905260.0, "step": 24375 }, { "entropy": 5.138191604614258, "epoch": 2.341978866474544, "grad_norm": 1.1796875, "learning_rate": 0.00044527041115237056, "loss": 4.8237, "mean_token_accuracy": 0.23421450853347778, "num_tokens": 55915404.0, "step": 24380 }, { "entropy": 5.051256847381592, "epoch": 2.3424591738712777, "grad_norm": 1.1328125, "learning_rate": 0.0004452479981579477, "loss": 4.7269, "mean_token_accuracy": 0.23279765248298645, "num_tokens": 55925972.0, "step": 24385 }, { "entropy": 5.081080436706543, "epoch": 2.3429394812680115, "grad_norm": 1.1328125, "learning_rate": 0.0004452255812108719, "loss": 4.7605, "mean_token_accuracy": 0.22747449278831483, "num_tokens": 55937545.0, "step": 24390 }, { "entropy": 5.030919933319092, "epoch": 2.3434197886647454, "grad_norm": 1.0234375, "learning_rate": 0.0004452031603116636, "loss": 4.7176, "mean_token_accuracy": 0.2428242087364197, "num_tokens": 55949934.0, "step": 24395 }, { "entropy": 5.092579746246338, "epoch": 2.3439000960614793, "grad_norm": 1.09375, "learning_rate": 0.00044518073546084326, "loss": 4.81, "mean_token_accuracy": 0.2370229333639145, "num_tokens": 55960414.0, "step": 24400 }, { "entropy": 5.066870450973511, "epoch": 2.344380403458213, "grad_norm": 1.171875, "learning_rate": 0.0004451583066589316, "loss": 4.7419, "mean_token_accuracy": 0.22898071706295015, "num_tokens": 55970999.0, "step": 24405 }, { "entropy": 5.101868677139282, "epoch": 2.344860710854947, "grad_norm": 1.203125, "learning_rate": 0.00044513587390644925, "loss": 4.7664, "mean_token_accuracy": 0.23451221138238906, "num_tokens": 55982801.0, "step": 24410 }, { "entropy": 5.186482763290405, "epoch": 2.345341018251681, "grad_norm": 1.0390625, "learning_rate": 0.00044511343720391724, "loss": 4.8305, "mean_token_accuracy": 0.22371030896902083, "num_tokens": 55995143.0, "step": 24415 }, { "entropy": 5.081795644760132, "epoch": 2.345821325648415, "grad_norm": 1.265625, "learning_rate": 0.0004450909965518563, "loss": 4.8215, "mean_token_accuracy": 0.22740914970636367, "num_tokens": 56006368.0, "step": 24420 }, { "entropy": 5.138456726074219, "epoch": 2.3463016330451487, "grad_norm": 1.046875, "learning_rate": 0.00044506855195078755, "loss": 4.8737, "mean_token_accuracy": 0.22875082045793532, "num_tokens": 56019245.0, "step": 24425 }, { "entropy": 5.166050815582276, "epoch": 2.3467819404418826, "grad_norm": 1.1328125, "learning_rate": 0.00044504610340123185, "loss": 4.8532, "mean_token_accuracy": 0.21877577304840087, "num_tokens": 56030982.0, "step": 24430 }, { "entropy": 5.135728549957276, "epoch": 2.347262247838617, "grad_norm": 1.21875, "learning_rate": 0.00044502365090371066, "loss": 4.8184, "mean_token_accuracy": 0.22757762521505356, "num_tokens": 56042871.0, "step": 24435 }, { "entropy": 5.1946179389953615, "epoch": 2.347742555235351, "grad_norm": 1.171875, "learning_rate": 0.0004450011944587452, "loss": 4.8264, "mean_token_accuracy": 0.22652811259031297, "num_tokens": 56054099.0, "step": 24440 }, { "entropy": 5.047242975234985, "epoch": 2.3482228626320847, "grad_norm": 1.0625, "learning_rate": 0.00044497873406685673, "loss": 4.767, "mean_token_accuracy": 0.2269476056098938, "num_tokens": 56064315.0, "step": 24445 }, { "entropy": 5.022474241256714, "epoch": 2.3487031700288186, "grad_norm": 1.0390625, "learning_rate": 0.0004449562697285669, "loss": 4.7471, "mean_token_accuracy": 0.22951697558164597, "num_tokens": 56076446.0, "step": 24450 }, { "entropy": 5.190836668014526, "epoch": 2.3491834774255524, "grad_norm": 1.078125, "learning_rate": 0.00044493380144439707, "loss": 4.8706, "mean_token_accuracy": 0.2270712062716484, "num_tokens": 56089356.0, "step": 24455 }, { "entropy": 5.1657195568084715, "epoch": 2.3496637848222863, "grad_norm": 1.078125, "learning_rate": 0.000444911329214869, "loss": 4.8631, "mean_token_accuracy": 0.22868295907974243, "num_tokens": 56101245.0, "step": 24460 }, { "entropy": 5.008936548233033, "epoch": 2.35014409221902, "grad_norm": 1.0625, "learning_rate": 0.00044488885304050434, "loss": 4.6899, "mean_token_accuracy": 0.23889461904764175, "num_tokens": 56112756.0, "step": 24465 }, { "entropy": 5.117912292480469, "epoch": 2.350624399615754, "grad_norm": 0.99609375, "learning_rate": 0.000444866372921825, "loss": 4.8119, "mean_token_accuracy": 0.22884083539247513, "num_tokens": 56124316.0, "step": 24470 }, { "entropy": 5.052361536026001, "epoch": 2.351104707012488, "grad_norm": 1.109375, "learning_rate": 0.00044484388885935287, "loss": 4.732, "mean_token_accuracy": 0.24278282523155212, "num_tokens": 56135972.0, "step": 24475 }, { "entropy": 5.156489706039428, "epoch": 2.351585014409222, "grad_norm": 1.1484375, "learning_rate": 0.00044482140085361005, "loss": 4.8235, "mean_token_accuracy": 0.2255215510725975, "num_tokens": 56147335.0, "step": 24480 }, { "entropy": 5.050457763671875, "epoch": 2.3520653218059557, "grad_norm": 1.0234375, "learning_rate": 0.00044479890890511853, "loss": 4.7099, "mean_token_accuracy": 0.23369956463575364, "num_tokens": 56158294.0, "step": 24485 }, { "entropy": 5.035678100585938, "epoch": 2.3525456292026896, "grad_norm": 1.0625, "learning_rate": 0.00044477641301440054, "loss": 4.6703, "mean_token_accuracy": 0.2425915777683258, "num_tokens": 56169468.0, "step": 24490 }, { "entropy": 5.119379329681396, "epoch": 2.3530259365994235, "grad_norm": 1.125, "learning_rate": 0.0004447539131819784, "loss": 4.7564, "mean_token_accuracy": 0.22898904383182525, "num_tokens": 56180112.0, "step": 24495 }, { "entropy": 5.158781814575195, "epoch": 2.3535062439961574, "grad_norm": 1.1015625, "learning_rate": 0.00044473140940837436, "loss": 4.7991, "mean_token_accuracy": 0.23098965287208556, "num_tokens": 56191290.0, "step": 24500 }, { "entropy": 5.157944536209106, "epoch": 2.3539865513928913, "grad_norm": 1.125, "learning_rate": 0.00044470890169411107, "loss": 4.859, "mean_token_accuracy": 0.22956420928239823, "num_tokens": 56202447.0, "step": 24505 }, { "entropy": 5.150196647644043, "epoch": 2.3544668587896256, "grad_norm": 1.109375, "learning_rate": 0.000444686390039711, "loss": 4.8725, "mean_token_accuracy": 0.2220148727297783, "num_tokens": 56213802.0, "step": 24510 }, { "entropy": 5.100703859329224, "epoch": 2.3549471661863595, "grad_norm": 1.140625, "learning_rate": 0.0004446638744456968, "loss": 4.7799, "mean_token_accuracy": 0.23059052973985672, "num_tokens": 56225243.0, "step": 24515 }, { "entropy": 5.120981311798095, "epoch": 2.3554274735830933, "grad_norm": 1.1171875, "learning_rate": 0.00044464135491259135, "loss": 4.7158, "mean_token_accuracy": 0.23078160136938095, "num_tokens": 56235116.0, "step": 24520 }, { "entropy": 5.068836307525634, "epoch": 2.3559077809798272, "grad_norm": 1.125, "learning_rate": 0.0004446188314409172, "loss": 4.7882, "mean_token_accuracy": 0.23106331676244735, "num_tokens": 56246654.0, "step": 24525 }, { "entropy": 5.145525598526001, "epoch": 2.356388088376561, "grad_norm": 1.0625, "learning_rate": 0.0004445963040311975, "loss": 4.7791, "mean_token_accuracy": 0.2322086364030838, "num_tokens": 56257787.0, "step": 24530 }, { "entropy": 5.057206106185913, "epoch": 2.356868395773295, "grad_norm": 1.28125, "learning_rate": 0.00044457377268395526, "loss": 4.6896, "mean_token_accuracy": 0.2340852975845337, "num_tokens": 56268881.0, "step": 24535 }, { "entropy": 4.9917152404785154, "epoch": 2.357348703170029, "grad_norm": 1.15625, "learning_rate": 0.00044455123739971355, "loss": 4.7075, "mean_token_accuracy": 0.2374115616083145, "num_tokens": 56281515.0, "step": 24540 }, { "entropy": 4.995164155960083, "epoch": 2.3578290105667628, "grad_norm": 1.109375, "learning_rate": 0.00044452869817899554, "loss": 4.6698, "mean_token_accuracy": 0.2395367980003357, "num_tokens": 56293226.0, "step": 24545 }, { "entropy": 5.118550491333008, "epoch": 2.3583093179634966, "grad_norm": 1.1328125, "learning_rate": 0.0004445061550223246, "loss": 4.7668, "mean_token_accuracy": 0.2326399102807045, "num_tokens": 56303408.0, "step": 24550 }, { "entropy": 5.112800025939942, "epoch": 2.3587896253602305, "grad_norm": 1.0625, "learning_rate": 0.00044448360793022403, "loss": 4.7617, "mean_token_accuracy": 0.2299429401755333, "num_tokens": 56315379.0, "step": 24555 }, { "entropy": 5.091808128356933, "epoch": 2.3592699327569644, "grad_norm": 1.1953125, "learning_rate": 0.0004444610569032174, "loss": 4.7931, "mean_token_accuracy": 0.23082323074340821, "num_tokens": 56326496.0, "step": 24560 }, { "entropy": 5.028980159759522, "epoch": 2.3597502401536983, "grad_norm": 1.0625, "learning_rate": 0.0004444385019418281, "loss": 4.731, "mean_token_accuracy": 0.2351315811276436, "num_tokens": 56339416.0, "step": 24565 }, { "entropy": 5.138836240768432, "epoch": 2.360230547550432, "grad_norm": 1.2578125, "learning_rate": 0.00044441594304658004, "loss": 4.8032, "mean_token_accuracy": 0.22849867790937423, "num_tokens": 56352305.0, "step": 24570 }, { "entropy": 5.0952881336212155, "epoch": 2.360710854947166, "grad_norm": 1.0859375, "learning_rate": 0.0004443933802179968, "loss": 4.7269, "mean_token_accuracy": 0.23680901676416397, "num_tokens": 56364598.0, "step": 24575 }, { "entropy": 5.13135256767273, "epoch": 2.3611911623439, "grad_norm": 1.15625, "learning_rate": 0.00044437081345660224, "loss": 4.8741, "mean_token_accuracy": 0.2198468491435051, "num_tokens": 56376651.0, "step": 24580 }, { "entropy": 5.090953397750854, "epoch": 2.3616714697406342, "grad_norm": 1.09375, "learning_rate": 0.0004443482427629204, "loss": 4.7517, "mean_token_accuracy": 0.23561461567878722, "num_tokens": 56388038.0, "step": 24585 }, { "entropy": 5.10401463508606, "epoch": 2.3621517771373677, "grad_norm": 1.09375, "learning_rate": 0.0004443256681374751, "loss": 4.7729, "mean_token_accuracy": 0.23340231776237488, "num_tokens": 56398889.0, "step": 24590 }, { "entropy": 5.080159282684326, "epoch": 2.362632084534102, "grad_norm": 0.98828125, "learning_rate": 0.0004443030895807907, "loss": 4.804, "mean_token_accuracy": 0.2332967445254326, "num_tokens": 56411548.0, "step": 24595 }, { "entropy": 5.1313213348388675, "epoch": 2.363112391930836, "grad_norm": 1.1484375, "learning_rate": 0.00044428050709339117, "loss": 4.6843, "mean_token_accuracy": 0.23506833761930465, "num_tokens": 56423292.0, "step": 24600 }, { "entropy": 5.024478149414063, "epoch": 2.3635926993275698, "grad_norm": 1.0546875, "learning_rate": 0.000444257920675801, "loss": 4.7636, "mean_token_accuracy": 0.23502004146575928, "num_tokens": 56434834.0, "step": 24605 }, { "entropy": 5.061081123352051, "epoch": 2.3640730067243036, "grad_norm": 1.203125, "learning_rate": 0.00044423533032854454, "loss": 4.8158, "mean_token_accuracy": 0.22861055731773378, "num_tokens": 56445554.0, "step": 24610 }, { "entropy": 5.078392887115479, "epoch": 2.3645533141210375, "grad_norm": 1.078125, "learning_rate": 0.0004442127360521462, "loss": 4.7748, "mean_token_accuracy": 0.2413931518793106, "num_tokens": 56457748.0, "step": 24615 }, { "entropy": 5.069872283935547, "epoch": 2.3650336215177714, "grad_norm": 1.03125, "learning_rate": 0.0004441901378471306, "loss": 4.7002, "mean_token_accuracy": 0.2420249953866005, "num_tokens": 56469650.0, "step": 24620 }, { "entropy": 5.040999507904052, "epoch": 2.3655139289145053, "grad_norm": 1.09375, "learning_rate": 0.00044416753571402233, "loss": 4.7438, "mean_token_accuracy": 0.2333931416273117, "num_tokens": 56482061.0, "step": 24625 }, { "entropy": 5.136098337173462, "epoch": 2.365994236311239, "grad_norm": 1.1171875, "learning_rate": 0.0004441449296533462, "loss": 4.8222, "mean_token_accuracy": 0.22758017480373383, "num_tokens": 56493599.0, "step": 24630 }, { "entropy": 5.165631055831909, "epoch": 2.366474543707973, "grad_norm": 1.09375, "learning_rate": 0.00044412231966562717, "loss": 4.8648, "mean_token_accuracy": 0.22515485137701036, "num_tokens": 56505842.0, "step": 24635 }, { "entropy": 5.052130842208863, "epoch": 2.366954851104707, "grad_norm": 1.109375, "learning_rate": 0.00044409970575139, "loss": 4.7477, "mean_token_accuracy": 0.23796017169952394, "num_tokens": 56517688.0, "step": 24640 }, { "entropy": 5.12772536277771, "epoch": 2.367435158501441, "grad_norm": 1.1640625, "learning_rate": 0.0004440770879111598, "loss": 4.7834, "mean_token_accuracy": 0.23488107621669768, "num_tokens": 56528665.0, "step": 24645 }, { "entropy": 4.977132081985474, "epoch": 2.3679154658981747, "grad_norm": 1.140625, "learning_rate": 0.00044405446614546163, "loss": 4.6826, "mean_token_accuracy": 0.2373212084174156, "num_tokens": 56539097.0, "step": 24650 }, { "entropy": 5.1884829044342045, "epoch": 2.3683957732949086, "grad_norm": 1.1328125, "learning_rate": 0.0004440318404548208, "loss": 4.8751, "mean_token_accuracy": 0.23188122361898422, "num_tokens": 56550373.0, "step": 24655 }, { "entropy": 5.152935409545899, "epoch": 2.3688760806916425, "grad_norm": 1.109375, "learning_rate": 0.00044400921083976246, "loss": 4.8138, "mean_token_accuracy": 0.22929610162973404, "num_tokens": 56561289.0, "step": 24660 }, { "entropy": 5.109140205383301, "epoch": 2.3693563880883763, "grad_norm": 1.1171875, "learning_rate": 0.0004439865773008122, "loss": 4.8364, "mean_token_accuracy": 0.23006531596183777, "num_tokens": 56572331.0, "step": 24665 }, { "entropy": 5.076235580444336, "epoch": 2.3698366954851107, "grad_norm": 1.09375, "learning_rate": 0.0004439639398384953, "loss": 4.7167, "mean_token_accuracy": 0.23638332933187484, "num_tokens": 56584375.0, "step": 24670 }, { "entropy": 5.078734445571899, "epoch": 2.3703170028818445, "grad_norm": 1.4296875, "learning_rate": 0.00044394129845333756, "loss": 4.7333, "mean_token_accuracy": 0.22900070548057555, "num_tokens": 56595825.0, "step": 24675 }, { "entropy": 5.089966487884522, "epoch": 2.3707973102785784, "grad_norm": 1.0546875, "learning_rate": 0.0004439186531458645, "loss": 4.8249, "mean_token_accuracy": 0.2364494889974594, "num_tokens": 56607842.0, "step": 24680 }, { "entropy": 5.140970182418823, "epoch": 2.3712776176753123, "grad_norm": 1.3046875, "learning_rate": 0.00044389600391660185, "loss": 4.8352, "mean_token_accuracy": 0.23416105657815933, "num_tokens": 56618509.0, "step": 24685 }, { "entropy": 5.044953441619873, "epoch": 2.371757925072046, "grad_norm": 1.125, "learning_rate": 0.00044387335076607554, "loss": 4.7198, "mean_token_accuracy": 0.2367233455181122, "num_tokens": 56630293.0, "step": 24690 }, { "entropy": 5.189683341979981, "epoch": 2.37223823246878, "grad_norm": 1.3515625, "learning_rate": 0.0004438506936948115, "loss": 4.8887, "mean_token_accuracy": 0.22379377484321594, "num_tokens": 56641852.0, "step": 24695 }, { "entropy": 5.250722169876099, "epoch": 2.372718539865514, "grad_norm": 1.1484375, "learning_rate": 0.00044382803270333565, "loss": 4.8611, "mean_token_accuracy": 0.22589135318994522, "num_tokens": 56654409.0, "step": 24700 }, { "entropy": 5.074213266372681, "epoch": 2.373198847262248, "grad_norm": 1.1328125, "learning_rate": 0.0004438053677921743, "loss": 4.7654, "mean_token_accuracy": 0.2345658928155899, "num_tokens": 56665227.0, "step": 24705 }, { "entropy": 5.101580572128296, "epoch": 2.3736791546589817, "grad_norm": 1.078125, "learning_rate": 0.00044378269896185344, "loss": 4.8164, "mean_token_accuracy": 0.23337904661893843, "num_tokens": 56677924.0, "step": 24710 }, { "entropy": 5.117094945907593, "epoch": 2.3741594620557156, "grad_norm": 1.203125, "learning_rate": 0.0004437600262128996, "loss": 4.7135, "mean_token_accuracy": 0.23515274077653886, "num_tokens": 56689789.0, "step": 24715 }, { "entropy": 5.135672187805175, "epoch": 2.3746397694524495, "grad_norm": 1.203125, "learning_rate": 0.000443737349545839, "loss": 4.8667, "mean_token_accuracy": 0.2295097976922989, "num_tokens": 56700595.0, "step": 24720 }, { "entropy": 5.15782356262207, "epoch": 2.3751200768491834, "grad_norm": 1.0703125, "learning_rate": 0.00044371466896119823, "loss": 4.8917, "mean_token_accuracy": 0.2281707540154457, "num_tokens": 56712554.0, "step": 24725 }, { "entropy": 5.170312452316284, "epoch": 2.3756003842459172, "grad_norm": 1.1875, "learning_rate": 0.00044369198445950384, "loss": 4.7679, "mean_token_accuracy": 0.22951631993055344, "num_tokens": 56723698.0, "step": 24730 }, { "entropy": 5.124538946151733, "epoch": 2.376080691642651, "grad_norm": 1.1171875, "learning_rate": 0.0004436692960412824, "loss": 4.7966, "mean_token_accuracy": 0.2397472620010376, "num_tokens": 56734660.0, "step": 24735 }, { "entropy": 5.09040675163269, "epoch": 2.376560999039385, "grad_norm": 1.1328125, "learning_rate": 0.0004436466037070608, "loss": 4.8102, "mean_token_accuracy": 0.22980419397354127, "num_tokens": 56746150.0, "step": 24740 }, { "entropy": 5.187903594970703, "epoch": 2.3770413064361193, "grad_norm": 1.078125, "learning_rate": 0.00044362390745736585, "loss": 4.9064, "mean_token_accuracy": 0.22432048469781876, "num_tokens": 56758018.0, "step": 24745 }, { "entropy": 5.102304887771607, "epoch": 2.377521613832853, "grad_norm": 1.21875, "learning_rate": 0.0004436012072927245, "loss": 4.7469, "mean_token_accuracy": 0.23562378585338592, "num_tokens": 56769129.0, "step": 24750 }, { "entropy": 5.136471176147461, "epoch": 2.378001921229587, "grad_norm": 1.2421875, "learning_rate": 0.00044357850321366375, "loss": 4.8011, "mean_token_accuracy": 0.23276554346084594, "num_tokens": 56780445.0, "step": 24755 }, { "entropy": 5.076611423492432, "epoch": 2.378482228626321, "grad_norm": 1.140625, "learning_rate": 0.0004435557952207107, "loss": 4.811, "mean_token_accuracy": 0.23137751519680022, "num_tokens": 56792338.0, "step": 24760 }, { "entropy": 5.113924932479859, "epoch": 2.378962536023055, "grad_norm": 1.1171875, "learning_rate": 0.00044353308331439257, "loss": 4.7922, "mean_token_accuracy": 0.2352191373705864, "num_tokens": 56802345.0, "step": 24765 }, { "entropy": 5.175993299484253, "epoch": 2.3794428434197887, "grad_norm": 1.2265625, "learning_rate": 0.0004435103674952367, "loss": 4.8465, "mean_token_accuracy": 0.23275587558746338, "num_tokens": 56813865.0, "step": 24770 }, { "entropy": 5.101814031600952, "epoch": 2.3799231508165226, "grad_norm": 1.046875, "learning_rate": 0.00044348764776377047, "loss": 4.7193, "mean_token_accuracy": 0.23715719431638718, "num_tokens": 56824135.0, "step": 24775 }, { "entropy": 5.057256031036377, "epoch": 2.3804034582132565, "grad_norm": 1.0546875, "learning_rate": 0.0004434649241205214, "loss": 4.7761, "mean_token_accuracy": 0.23238783925771714, "num_tokens": 56836104.0, "step": 24780 }, { "entropy": 5.077281188964844, "epoch": 2.3808837656099904, "grad_norm": 1.1875, "learning_rate": 0.00044344219656601704, "loss": 4.7555, "mean_token_accuracy": 0.23776894956827163, "num_tokens": 56848308.0, "step": 24785 }, { "entropy": 5.078270006179809, "epoch": 2.3813640730067243, "grad_norm": 1.109375, "learning_rate": 0.000443419465100785, "loss": 4.7897, "mean_token_accuracy": 0.23689354062080384, "num_tokens": 56859351.0, "step": 24790 }, { "entropy": 5.093816423416138, "epoch": 2.381844380403458, "grad_norm": 1.109375, "learning_rate": 0.0004433967297253531, "loss": 4.8158, "mean_token_accuracy": 0.23118849694728852, "num_tokens": 56870645.0, "step": 24795 }, { "entropy": 5.073959541320801, "epoch": 2.382324687800192, "grad_norm": 1.1015625, "learning_rate": 0.00044337399044024924, "loss": 4.7549, "mean_token_accuracy": 0.2352516159415245, "num_tokens": 56881246.0, "step": 24800 }, { "entropy": 5.077259492874146, "epoch": 2.382804995196926, "grad_norm": 1.1015625, "learning_rate": 0.0004433512472460012, "loss": 4.7521, "mean_token_accuracy": 0.23205724507570266, "num_tokens": 56893477.0, "step": 24805 }, { "entropy": 5.086012125015259, "epoch": 2.38328530259366, "grad_norm": 1.1015625, "learning_rate": 0.00044332850014313713, "loss": 4.7813, "mean_token_accuracy": 0.22985356599092482, "num_tokens": 56904813.0, "step": 24810 }, { "entropy": 5.07104344367981, "epoch": 2.3837656099903937, "grad_norm": 1.171875, "learning_rate": 0.0004433057491321851, "loss": 4.759, "mean_token_accuracy": 0.22867830097675323, "num_tokens": 56916160.0, "step": 24815 }, { "entropy": 5.059772109985351, "epoch": 2.384245917387128, "grad_norm": 1.09375, "learning_rate": 0.00044328299421367333, "loss": 4.7429, "mean_token_accuracy": 0.2354225590825081, "num_tokens": 56927791.0, "step": 24820 }, { "entropy": 5.142007541656494, "epoch": 2.3847262247838614, "grad_norm": 1.0625, "learning_rate": 0.0004432602353881302, "loss": 4.8027, "mean_token_accuracy": 0.23089745938777922, "num_tokens": 56938848.0, "step": 24825 }, { "entropy": 5.1220542907714846, "epoch": 2.3852065321805958, "grad_norm": 1.0703125, "learning_rate": 0.00044323747265608395, "loss": 4.7882, "mean_token_accuracy": 0.23613769859075545, "num_tokens": 56950116.0, "step": 24830 }, { "entropy": 5.070098209381103, "epoch": 2.3856868395773296, "grad_norm": 1.09375, "learning_rate": 0.0004432147060180632, "loss": 4.7478, "mean_token_accuracy": 0.23370686769485474, "num_tokens": 56961911.0, "step": 24835 }, { "entropy": 5.168145179748535, "epoch": 2.3861671469740635, "grad_norm": 1.0625, "learning_rate": 0.00044319193547459645, "loss": 4.8208, "mean_token_accuracy": 0.2195179507136345, "num_tokens": 56974105.0, "step": 24840 }, { "entropy": 5.104530096054077, "epoch": 2.3866474543707974, "grad_norm": 1.078125, "learning_rate": 0.0004431691610262124, "loss": 4.7759, "mean_token_accuracy": 0.23273722231388091, "num_tokens": 56985564.0, "step": 24845 }, { "entropy": 5.078565549850464, "epoch": 2.3871277617675313, "grad_norm": 1.1328125, "learning_rate": 0.00044314638267343976, "loss": 4.7083, "mean_token_accuracy": 0.2425155222415924, "num_tokens": 56995835.0, "step": 24850 }, { "entropy": 4.960423183441162, "epoch": 2.387608069164265, "grad_norm": 1.0859375, "learning_rate": 0.0004431236004168075, "loss": 4.6584, "mean_token_accuracy": 0.23421775847673415, "num_tokens": 57008114.0, "step": 24855 }, { "entropy": 5.015463972091675, "epoch": 2.388088376560999, "grad_norm": 1.2109375, "learning_rate": 0.0004431008142568444, "loss": 4.7401, "mean_token_accuracy": 0.2357071578502655, "num_tokens": 57018968.0, "step": 24860 }, { "entropy": 5.0624267578125, "epoch": 2.388568683957733, "grad_norm": 1.109375, "learning_rate": 0.00044307802419407954, "loss": 4.7803, "mean_token_accuracy": 0.238409586250782, "num_tokens": 57031297.0, "step": 24865 }, { "entropy": 5.12186484336853, "epoch": 2.389048991354467, "grad_norm": 1.1484375, "learning_rate": 0.0004430552302290421, "loss": 4.7949, "mean_token_accuracy": 0.228073151409626, "num_tokens": 57043457.0, "step": 24870 }, { "entropy": 5.0508099555969235, "epoch": 2.3895292987512007, "grad_norm": 0.98828125, "learning_rate": 0.0004430324323622611, "loss": 4.7178, "mean_token_accuracy": 0.23357915431261062, "num_tokens": 57054474.0, "step": 24875 }, { "entropy": 5.072915172576904, "epoch": 2.3900096061479346, "grad_norm": 1.125, "learning_rate": 0.00044300963059426605, "loss": 4.7986, "mean_token_accuracy": 0.23037643283605574, "num_tokens": 57065120.0, "step": 24880 }, { "entropy": 5.044278001785278, "epoch": 2.3904899135446684, "grad_norm": 1.1328125, "learning_rate": 0.00044298682492558637, "loss": 4.7529, "mean_token_accuracy": 0.2331436961889267, "num_tokens": 57076712.0, "step": 24885 }, { "entropy": 5.173734998703003, "epoch": 2.3909702209414023, "grad_norm": 1.09375, "learning_rate": 0.00044296401535675136, "loss": 4.8632, "mean_token_accuracy": 0.22800443917512894, "num_tokens": 57087894.0, "step": 24890 }, { "entropy": 5.182175540924073, "epoch": 2.3914505283381366, "grad_norm": 1.0703125, "learning_rate": 0.00044294120188829056, "loss": 4.9352, "mean_token_accuracy": 0.2199179157614708, "num_tokens": 57101029.0, "step": 24895 }, { "entropy": 5.202096891403198, "epoch": 2.39193083573487, "grad_norm": 1.0390625, "learning_rate": 0.0004429183845207339, "loss": 4.8049, "mean_token_accuracy": 0.22310205698013305, "num_tokens": 57112893.0, "step": 24900 }, { "entropy": 5.1200279712677, "epoch": 2.3924111431316044, "grad_norm": 1.1640625, "learning_rate": 0.0004428955632546108, "loss": 4.838, "mean_token_accuracy": 0.23471231013536453, "num_tokens": 57123899.0, "step": 24905 }, { "entropy": 5.041331338882446, "epoch": 2.3928914505283383, "grad_norm": 1.0703125, "learning_rate": 0.0004428727380904514, "loss": 4.8432, "mean_token_accuracy": 0.23019375056028366, "num_tokens": 57134896.0, "step": 24910 }, { "entropy": 4.98831615447998, "epoch": 2.393371757925072, "grad_norm": 1.140625, "learning_rate": 0.00044284990902878545, "loss": 4.6332, "mean_token_accuracy": 0.2411554917693138, "num_tokens": 57146278.0, "step": 24915 }, { "entropy": 5.117031049728394, "epoch": 2.393852065321806, "grad_norm": 1.078125, "learning_rate": 0.00044282707607014304, "loss": 4.7572, "mean_token_accuracy": 0.23582173883914948, "num_tokens": 57157663.0, "step": 24920 }, { "entropy": 5.070637035369873, "epoch": 2.39433237271854, "grad_norm": 1.0703125, "learning_rate": 0.00044280423921505427, "loss": 4.7232, "mean_token_accuracy": 0.2379809319972992, "num_tokens": 57169131.0, "step": 24925 }, { "entropy": 5.091719150543213, "epoch": 2.394812680115274, "grad_norm": 1.0546875, "learning_rate": 0.0004427813984640493, "loss": 4.7648, "mean_token_accuracy": 0.22879979461431504, "num_tokens": 57181653.0, "step": 24930 }, { "entropy": 5.034427452087402, "epoch": 2.3952929875120077, "grad_norm": 1.0234375, "learning_rate": 0.0004427585538176585, "loss": 4.752, "mean_token_accuracy": 0.23759771287441253, "num_tokens": 57192817.0, "step": 24935 }, { "entropy": 5.11380443572998, "epoch": 2.3957732949087416, "grad_norm": 1.1171875, "learning_rate": 0.00044273570527641223, "loss": 4.8208, "mean_token_accuracy": 0.228305621445179, "num_tokens": 57203570.0, "step": 24940 }, { "entropy": 5.065595197677612, "epoch": 2.3962536023054755, "grad_norm": 1.15625, "learning_rate": 0.00044271285284084097, "loss": 4.7001, "mean_token_accuracy": 0.23602611124515532, "num_tokens": 57214388.0, "step": 24945 }, { "entropy": 5.124724578857422, "epoch": 2.3967339097022093, "grad_norm": 1.15625, "learning_rate": 0.0004426899965114752, "loss": 4.8045, "mean_token_accuracy": 0.22568911015987397, "num_tokens": 57225004.0, "step": 24950 }, { "entropy": 5.051713514328003, "epoch": 2.3972142170989432, "grad_norm": 1.09375, "learning_rate": 0.00044266713628884566, "loss": 4.7179, "mean_token_accuracy": 0.2410505473613739, "num_tokens": 57236643.0, "step": 24955 }, { "entropy": 5.09238977432251, "epoch": 2.397694524495677, "grad_norm": 1.078125, "learning_rate": 0.00044264427217348315, "loss": 4.7375, "mean_token_accuracy": 0.2336786285042763, "num_tokens": 57247502.0, "step": 24960 }, { "entropy": 5.191510391235352, "epoch": 2.398174831892411, "grad_norm": 1.1328125, "learning_rate": 0.0004426214041659184, "loss": 4.9253, "mean_token_accuracy": 0.2243089497089386, "num_tokens": 57259539.0, "step": 24965 }, { "entropy": 5.225309705734253, "epoch": 2.398655139289145, "grad_norm": 1.1953125, "learning_rate": 0.0004425985322666824, "loss": 4.8844, "mean_token_accuracy": 0.22500480264425277, "num_tokens": 57270946.0, "step": 24970 }, { "entropy": 5.12941083908081, "epoch": 2.3991354466858787, "grad_norm": 1.1484375, "learning_rate": 0.0004425756564763061, "loss": 4.8302, "mean_token_accuracy": 0.22922021597623826, "num_tokens": 57282855.0, "step": 24975 }, { "entropy": 5.125387191772461, "epoch": 2.399615754082613, "grad_norm": 1.015625, "learning_rate": 0.00044255277679532075, "loss": 4.827, "mean_token_accuracy": 0.2260574668645859, "num_tokens": 57292638.0, "step": 24980 }, { "entropy": 5.144128751754761, "epoch": 2.400096061479347, "grad_norm": 1.0234375, "learning_rate": 0.00044252989322425735, "loss": 4.7697, "mean_token_accuracy": 0.2287430688738823, "num_tokens": 57304088.0, "step": 24985 }, { "entropy": 5.150278091430664, "epoch": 2.400576368876081, "grad_norm": 1.078125, "learning_rate": 0.00044250700576364734, "loss": 4.7633, "mean_token_accuracy": 0.23506819903850557, "num_tokens": 57314534.0, "step": 24990 }, { "entropy": 5.115130281448364, "epoch": 2.4010566762728147, "grad_norm": 1.1015625, "learning_rate": 0.000442484114414022, "loss": 4.8511, "mean_token_accuracy": 0.2235242545604706, "num_tokens": 57326787.0, "step": 24995 }, { "entropy": 5.050136089324951, "epoch": 2.4015369836695486, "grad_norm": 1.1484375, "learning_rate": 0.0004424612191759129, "loss": 4.7492, "mean_token_accuracy": 0.22763265818357467, "num_tokens": 57338935.0, "step": 25000 }, { "entropy": 5.17376184463501, "epoch": 2.4020172910662825, "grad_norm": 1.171875, "learning_rate": 0.0004424383200498515, "loss": 4.7813, "mean_token_accuracy": 0.2324720099568367, "num_tokens": 57349963.0, "step": 25005 }, { "entropy": 5.074791526794433, "epoch": 2.4024975984630164, "grad_norm": 1.15625, "learning_rate": 0.0004424154170363696, "loss": 4.7924, "mean_token_accuracy": 0.23403090387582778, "num_tokens": 57361305.0, "step": 25010 }, { "entropy": 5.015868616104126, "epoch": 2.4029779058597502, "grad_norm": 1.078125, "learning_rate": 0.0004423925101359987, "loss": 4.7577, "mean_token_accuracy": 0.22880287021398543, "num_tokens": 57371582.0, "step": 25015 }, { "entropy": 5.091523838043213, "epoch": 2.403458213256484, "grad_norm": 1.125, "learning_rate": 0.0004423695993492709, "loss": 4.7413, "mean_token_accuracy": 0.23197826892137527, "num_tokens": 57382222.0, "step": 25020 }, { "entropy": 5.099827480316162, "epoch": 2.403938520653218, "grad_norm": 1.140625, "learning_rate": 0.0004423466846767179, "loss": 4.7438, "mean_token_accuracy": 0.23061240166425706, "num_tokens": 57393939.0, "step": 25025 }, { "entropy": 5.127017211914063, "epoch": 2.404418828049952, "grad_norm": 1.0546875, "learning_rate": 0.00044232376611887185, "loss": 4.7366, "mean_token_accuracy": 0.23424165695905685, "num_tokens": 57404889.0, "step": 25030 }, { "entropy": 5.120872449874878, "epoch": 2.4048991354466858, "grad_norm": 1.1015625, "learning_rate": 0.00044230084367626477, "loss": 4.834, "mean_token_accuracy": 0.2335745483636856, "num_tokens": 57416296.0, "step": 25035 }, { "entropy": 5.063665580749512, "epoch": 2.4053794428434196, "grad_norm": 1.1171875, "learning_rate": 0.0004422779173494288, "loss": 4.7934, "mean_token_accuracy": 0.23263732492923736, "num_tokens": 57427926.0, "step": 25040 }, { "entropy": 5.122995281219483, "epoch": 2.4058597502401535, "grad_norm": 1.109375, "learning_rate": 0.0004422549871388965, "loss": 4.7749, "mean_token_accuracy": 0.23277383893728257, "num_tokens": 57439397.0, "step": 25045 }, { "entropy": 5.136260223388672, "epoch": 2.4063400576368874, "grad_norm": 1.0546875, "learning_rate": 0.00044223205304519994, "loss": 4.8298, "mean_token_accuracy": 0.2284989833831787, "num_tokens": 57449976.0, "step": 25050 }, { "entropy": 5.135150289535522, "epoch": 2.4068203650336217, "grad_norm": 1.078125, "learning_rate": 0.0004422091150688717, "loss": 4.8585, "mean_token_accuracy": 0.22531607747077942, "num_tokens": 57461578.0, "step": 25055 }, { "entropy": 5.116857385635376, "epoch": 2.4073006724303556, "grad_norm": 1.0546875, "learning_rate": 0.0004421861732104443, "loss": 4.7996, "mean_token_accuracy": 0.22948758900165558, "num_tokens": 57471832.0, "step": 25060 }, { "entropy": 5.1676966667175295, "epoch": 2.4077809798270895, "grad_norm": 1.0390625, "learning_rate": 0.0004421632274704504, "loss": 4.825, "mean_token_accuracy": 0.22644471675157546, "num_tokens": 57482854.0, "step": 25065 }, { "entropy": 5.165962553024292, "epoch": 2.4082612872238234, "grad_norm": 1.0625, "learning_rate": 0.0004421402778494227, "loss": 4.8661, "mean_token_accuracy": 0.2232041284441948, "num_tokens": 57494243.0, "step": 25070 }, { "entropy": 5.136231994628906, "epoch": 2.4087415946205573, "grad_norm": 1.2109375, "learning_rate": 0.0004421173243478941, "loss": 4.7417, "mean_token_accuracy": 0.23099250942468644, "num_tokens": 57506185.0, "step": 25075 }, { "entropy": 5.1206300258636475, "epoch": 2.409221902017291, "grad_norm": 1.1015625, "learning_rate": 0.00044209436696639745, "loss": 4.7652, "mean_token_accuracy": 0.23854574412107468, "num_tokens": 57517980.0, "step": 25080 }, { "entropy": 5.012642097473145, "epoch": 2.409702209414025, "grad_norm": 1.125, "learning_rate": 0.00044207140570546574, "loss": 4.6386, "mean_token_accuracy": 0.2411160036921501, "num_tokens": 57529331.0, "step": 25085 }, { "entropy": 5.095395565032959, "epoch": 2.410182516810759, "grad_norm": 1.1484375, "learning_rate": 0.00044204844056563216, "loss": 4.7346, "mean_token_accuracy": 0.23751559257507324, "num_tokens": 57540463.0, "step": 25090 }, { "entropy": 5.154835271835327, "epoch": 2.410662824207493, "grad_norm": 1.1015625, "learning_rate": 0.0004420254715474297, "loss": 4.8798, "mean_token_accuracy": 0.2264431521296501, "num_tokens": 57550896.0, "step": 25095 }, { "entropy": 5.081275081634521, "epoch": 2.4111431316042267, "grad_norm": 1.015625, "learning_rate": 0.00044200249865139187, "loss": 4.7653, "mean_token_accuracy": 0.23305115401744841, "num_tokens": 57562041.0, "step": 25100 }, { "entropy": 5.226250839233399, "epoch": 2.4116234390009605, "grad_norm": 1.1328125, "learning_rate": 0.00044197952187805185, "loss": 4.9266, "mean_token_accuracy": 0.2227129802107811, "num_tokens": 57573771.0, "step": 25105 }, { "entropy": 5.150281715393066, "epoch": 2.4121037463976944, "grad_norm": 1.046875, "learning_rate": 0.00044195654122794324, "loss": 4.7519, "mean_token_accuracy": 0.23074692040681838, "num_tokens": 57585651.0, "step": 25110 }, { "entropy": 5.072700929641724, "epoch": 2.4125840537944283, "grad_norm": 1.140625, "learning_rate": 0.0004419335567015994, "loss": 4.7639, "mean_token_accuracy": 0.23477055728435517, "num_tokens": 57596328.0, "step": 25115 }, { "entropy": 5.054577207565307, "epoch": 2.413064361191162, "grad_norm": 1.09375, "learning_rate": 0.0004419105682995542, "loss": 4.6737, "mean_token_accuracy": 0.23865675032138825, "num_tokens": 57608012.0, "step": 25120 }, { "entropy": 5.1676277160644535, "epoch": 2.413544668587896, "grad_norm": 1.0390625, "learning_rate": 0.0004418875760223411, "loss": 4.7777, "mean_token_accuracy": 0.22592634558677674, "num_tokens": 57619711.0, "step": 25125 }, { "entropy": 5.076362609863281, "epoch": 2.4140249759846304, "grad_norm": 1.1796875, "learning_rate": 0.00044186457987049405, "loss": 4.755, "mean_token_accuracy": 0.2357165664434433, "num_tokens": 57630263.0, "step": 25130 }, { "entropy": 5.052877140045166, "epoch": 2.414505283381364, "grad_norm": 1.1015625, "learning_rate": 0.000441841579844547, "loss": 4.7552, "mean_token_accuracy": 0.23254823684692383, "num_tokens": 57640973.0, "step": 25135 }, { "entropy": 5.1252655506134035, "epoch": 2.414985590778098, "grad_norm": 1.140625, "learning_rate": 0.0004418185759450338, "loss": 4.7086, "mean_token_accuracy": 0.232559834420681, "num_tokens": 57651698.0, "step": 25140 }, { "entropy": 5.069209814071655, "epoch": 2.415465898174832, "grad_norm": 1.1171875, "learning_rate": 0.0004417955681724887, "loss": 4.7524, "mean_token_accuracy": 0.23003358244895936, "num_tokens": 57662547.0, "step": 25145 }, { "entropy": 5.152061462402344, "epoch": 2.415946205571566, "grad_norm": 1.125, "learning_rate": 0.00044177255652744576, "loss": 4.8545, "mean_token_accuracy": 0.22393652498722078, "num_tokens": 57675665.0, "step": 25150 }, { "entropy": 5.0961161136627195, "epoch": 2.4164265129683, "grad_norm": 1.171875, "learning_rate": 0.00044174954101043926, "loss": 4.7158, "mean_token_accuracy": 0.24122230857610702, "num_tokens": 57687765.0, "step": 25155 }, { "entropy": 5.111287069320679, "epoch": 2.4169068203650337, "grad_norm": 1.1328125, "learning_rate": 0.00044172652162200354, "loss": 4.806, "mean_token_accuracy": 0.22566018700599672, "num_tokens": 57699575.0, "step": 25160 }, { "entropy": 5.1426841735839846, "epoch": 2.4173871277617676, "grad_norm": 1.125, "learning_rate": 0.0004417034983626731, "loss": 4.8203, "mean_token_accuracy": 0.23345723748207092, "num_tokens": 57710014.0, "step": 25165 }, { "entropy": 5.054921293258667, "epoch": 2.4178674351585014, "grad_norm": 1.1171875, "learning_rate": 0.0004416804712329825, "loss": 4.6687, "mean_token_accuracy": 0.24694691896438598, "num_tokens": 57720763.0, "step": 25170 }, { "entropy": 5.112240743637085, "epoch": 2.4183477425552353, "grad_norm": 1.109375, "learning_rate": 0.00044165744023346614, "loss": 4.8264, "mean_token_accuracy": 0.22955900579690933, "num_tokens": 57731581.0, "step": 25175 }, { "entropy": 5.151104831695557, "epoch": 2.418828049951969, "grad_norm": 1.0703125, "learning_rate": 0.00044163440536465904, "loss": 4.8684, "mean_token_accuracy": 0.23169025778770447, "num_tokens": 57742473.0, "step": 25180 }, { "entropy": 5.178462362289428, "epoch": 2.419308357348703, "grad_norm": 1.078125, "learning_rate": 0.00044161136662709577, "loss": 4.876, "mean_token_accuracy": 0.22610796988010406, "num_tokens": 57754317.0, "step": 25185 }, { "entropy": 5.075574684143066, "epoch": 2.419788664745437, "grad_norm": 1.1875, "learning_rate": 0.00044158832402131133, "loss": 4.7637, "mean_token_accuracy": 0.2303366556763649, "num_tokens": 57765882.0, "step": 25190 }, { "entropy": 5.116954517364502, "epoch": 2.420268972142171, "grad_norm": 1.21875, "learning_rate": 0.00044156527754784066, "loss": 4.7507, "mean_token_accuracy": 0.23588968962430953, "num_tokens": 57776977.0, "step": 25195 }, { "entropy": 5.129793405532837, "epoch": 2.4207492795389047, "grad_norm": 1.015625, "learning_rate": 0.00044154222720721887, "loss": 4.8295, "mean_token_accuracy": 0.2288350611925125, "num_tokens": 57788652.0, "step": 25200 }, { "entropy": 5.172764730453491, "epoch": 2.421229586935639, "grad_norm": 1.109375, "learning_rate": 0.000441519172999981, "loss": 4.8086, "mean_token_accuracy": 0.22164968103170396, "num_tokens": 57799660.0, "step": 25205 }, { "entropy": 5.151558542251587, "epoch": 2.4217098943323725, "grad_norm": 1.078125, "learning_rate": 0.0004414961149266625, "loss": 4.8751, "mean_token_accuracy": 0.22082775533199311, "num_tokens": 57811754.0, "step": 25210 }, { "entropy": 5.142359209060669, "epoch": 2.422190201729107, "grad_norm": 1.09375, "learning_rate": 0.00044147305298779856, "loss": 4.7975, "mean_token_accuracy": 0.23699275106191636, "num_tokens": 57823470.0, "step": 25215 }, { "entropy": 5.211223363876343, "epoch": 2.4226705091258407, "grad_norm": 1.0546875, "learning_rate": 0.0004414499871839247, "loss": 4.8946, "mean_token_accuracy": 0.22818945497274398, "num_tokens": 57834917.0, "step": 25220 }, { "entropy": 5.136872911453247, "epoch": 2.4231508165225746, "grad_norm": 1.09375, "learning_rate": 0.0004414269175155763, "loss": 4.8005, "mean_token_accuracy": 0.2274375304579735, "num_tokens": 57844655.0, "step": 25225 }, { "entropy": 5.085866117477417, "epoch": 2.4236311239193085, "grad_norm": 1.078125, "learning_rate": 0.0004414038439832891, "loss": 4.8088, "mean_token_accuracy": 0.23270874172449113, "num_tokens": 57856019.0, "step": 25230 }, { "entropy": 5.147326040267944, "epoch": 2.4241114313160423, "grad_norm": 1.09375, "learning_rate": 0.0004413807665875988, "loss": 4.7681, "mean_token_accuracy": 0.23173436522483826, "num_tokens": 57867661.0, "step": 25235 }, { "entropy": 5.150324010848999, "epoch": 2.4245917387127762, "grad_norm": 1.0859375, "learning_rate": 0.00044135768532904104, "loss": 4.7815, "mean_token_accuracy": 0.22542063891887665, "num_tokens": 57879899.0, "step": 25240 }, { "entropy": 5.136937427520752, "epoch": 2.42507204610951, "grad_norm": 0.98046875, "learning_rate": 0.000441334600208152, "loss": 4.8528, "mean_token_accuracy": 0.22068443894386292, "num_tokens": 57891168.0, "step": 25245 }, { "entropy": 5.083675765991211, "epoch": 2.425552353506244, "grad_norm": 1.0546875, "learning_rate": 0.00044131151122546724, "loss": 4.7149, "mean_token_accuracy": 0.23319132924079894, "num_tokens": 57901997.0, "step": 25250 }, { "entropy": 5.115674448013306, "epoch": 2.426032660902978, "grad_norm": 1.0859375, "learning_rate": 0.00044128841838152313, "loss": 4.7814, "mean_token_accuracy": 0.23135359287261964, "num_tokens": 57913380.0, "step": 25255 }, { "entropy": 5.047495317459107, "epoch": 2.4265129682997117, "grad_norm": 1.1328125, "learning_rate": 0.0004412653216768558, "loss": 4.6851, "mean_token_accuracy": 0.23554279059171676, "num_tokens": 57923675.0, "step": 25260 }, { "entropy": 5.1381189823150635, "epoch": 2.4269932756964456, "grad_norm": 1.1328125, "learning_rate": 0.0004412422211120013, "loss": 4.8304, "mean_token_accuracy": 0.2330308437347412, "num_tokens": 57935885.0, "step": 25265 }, { "entropy": 5.08838529586792, "epoch": 2.4274735830931795, "grad_norm": 1.0390625, "learning_rate": 0.0004412191166874961, "loss": 4.7604, "mean_token_accuracy": 0.23321136087179184, "num_tokens": 57946551.0, "step": 25270 }, { "entropy": 5.0506280899047855, "epoch": 2.4279538904899134, "grad_norm": 1.0078125, "learning_rate": 0.0004411960084038766, "loss": 4.6814, "mean_token_accuracy": 0.24472323954105377, "num_tokens": 57957778.0, "step": 25275 }, { "entropy": 5.0799188137054445, "epoch": 2.4284341978866473, "grad_norm": 1.0625, "learning_rate": 0.00044117289626167917, "loss": 4.7983, "mean_token_accuracy": 0.2373049721121788, "num_tokens": 57969256.0, "step": 25280 }, { "entropy": 5.145950269699097, "epoch": 2.428914505283381, "grad_norm": 1.1875, "learning_rate": 0.0004411497802614406, "loss": 4.7595, "mean_token_accuracy": 0.2394431099295616, "num_tokens": 57981368.0, "step": 25285 }, { "entropy": 5.103071928024292, "epoch": 2.4293948126801155, "grad_norm": 1.03125, "learning_rate": 0.0004411266604036975, "loss": 4.8485, "mean_token_accuracy": 0.23120342344045638, "num_tokens": 57992862.0, "step": 25290 }, { "entropy": 5.055536603927612, "epoch": 2.4298751200768494, "grad_norm": 1.15625, "learning_rate": 0.00044110353668898674, "loss": 4.6424, "mean_token_accuracy": 0.24648849815130233, "num_tokens": 58003492.0, "step": 25295 }, { "entropy": 5.051392936706543, "epoch": 2.4303554274735832, "grad_norm": 1.09375, "learning_rate": 0.0004410804091178449, "loss": 4.6773, "mean_token_accuracy": 0.23831250369548798, "num_tokens": 58014623.0, "step": 25300 }, { "entropy": 5.129943037033081, "epoch": 2.430835734870317, "grad_norm": 1.140625, "learning_rate": 0.0004410572776908092, "loss": 4.8595, "mean_token_accuracy": 0.22626224607229234, "num_tokens": 58024886.0, "step": 25305 }, { "entropy": 5.164258241653442, "epoch": 2.431316042267051, "grad_norm": 1.1015625, "learning_rate": 0.00044103414240841664, "loss": 4.837, "mean_token_accuracy": 0.23164584636688232, "num_tokens": 58035998.0, "step": 25310 }, { "entropy": 5.151918363571167, "epoch": 2.431796349663785, "grad_norm": 1.0859375, "learning_rate": 0.0004410110032712043, "loss": 4.7838, "mean_token_accuracy": 0.23683720380067824, "num_tokens": 58046577.0, "step": 25315 }, { "entropy": 5.03354926109314, "epoch": 2.4322766570605188, "grad_norm": 1.1015625, "learning_rate": 0.0004409878602797094, "loss": 4.7273, "mean_token_accuracy": 0.2367846444249153, "num_tokens": 58057711.0, "step": 25320 }, { "entropy": 5.1005795955657955, "epoch": 2.4327569644572526, "grad_norm": 1.28125, "learning_rate": 0.00044096471343446923, "loss": 4.7228, "mean_token_accuracy": 0.2318343847990036, "num_tokens": 58069657.0, "step": 25325 }, { "entropy": 5.083721780776978, "epoch": 2.4332372718539865, "grad_norm": 1.1328125, "learning_rate": 0.0004409415627360213, "loss": 4.7779, "mean_token_accuracy": 0.2350016176700592, "num_tokens": 58081081.0, "step": 25330 }, { "entropy": 5.1245338916778564, "epoch": 2.4337175792507204, "grad_norm": 1.1328125, "learning_rate": 0.00044091840818490303, "loss": 4.7303, "mean_token_accuracy": 0.2299958735704422, "num_tokens": 58091561.0, "step": 25335 }, { "entropy": 5.00467004776001, "epoch": 2.4341978866474543, "grad_norm": 1.0703125, "learning_rate": 0.00044089524978165197, "loss": 4.7628, "mean_token_accuracy": 0.23996685147285463, "num_tokens": 58102640.0, "step": 25340 }, { "entropy": 5.1055091381072994, "epoch": 2.434678194044188, "grad_norm": 1.078125, "learning_rate": 0.00044087208752680577, "loss": 4.8772, "mean_token_accuracy": 0.22194685488939286, "num_tokens": 58113356.0, "step": 25345 }, { "entropy": 5.228708505630493, "epoch": 2.435158501440922, "grad_norm": 1.140625, "learning_rate": 0.0004408489214209023, "loss": 4.8862, "mean_token_accuracy": 0.22105642706155776, "num_tokens": 58125437.0, "step": 25350 }, { "entropy": 5.217840385437012, "epoch": 2.435638808837656, "grad_norm": 1.140625, "learning_rate": 0.0004408257514644793, "loss": 4.8646, "mean_token_accuracy": 0.22782525420188904, "num_tokens": 58138491.0, "step": 25355 }, { "entropy": 5.073573255538941, "epoch": 2.43611911623439, "grad_norm": 1.1015625, "learning_rate": 0.00044080257765807476, "loss": 4.7787, "mean_token_accuracy": 0.22393606305122377, "num_tokens": 58150200.0, "step": 25360 }, { "entropy": 5.111953592300415, "epoch": 2.436599423631124, "grad_norm": 1.1171875, "learning_rate": 0.0004407794000022267, "loss": 4.8759, "mean_token_accuracy": 0.23408314138650893, "num_tokens": 58162467.0, "step": 25365 }, { "entropy": 5.198561382293701, "epoch": 2.437079731027858, "grad_norm": 1.1171875, "learning_rate": 0.0004407562184974732, "loss": 4.9004, "mean_token_accuracy": 0.22066261023283004, "num_tokens": 58175052.0, "step": 25370 }, { "entropy": 5.134199714660644, "epoch": 2.437560038424592, "grad_norm": 1.109375, "learning_rate": 0.0004407330331443526, "loss": 4.7854, "mean_token_accuracy": 0.2339022383093834, "num_tokens": 58186999.0, "step": 25375 }, { "entropy": 5.0688145637512205, "epoch": 2.438040345821326, "grad_norm": 1.1015625, "learning_rate": 0.000440709843943403, "loss": 4.6838, "mean_token_accuracy": 0.23762887567281724, "num_tokens": 58197811.0, "step": 25380 }, { "entropy": 5.206927680969239, "epoch": 2.4385206532180597, "grad_norm": 1.03125, "learning_rate": 0.000440686650895163, "loss": 4.9039, "mean_token_accuracy": 0.22684629559516906, "num_tokens": 58209545.0, "step": 25385 }, { "entropy": 5.0824668407440186, "epoch": 2.4390009606147935, "grad_norm": 1.0234375, "learning_rate": 0.00044066345400017084, "loss": 4.7038, "mean_token_accuracy": 0.23633986413478852, "num_tokens": 58220994.0, "step": 25390 }, { "entropy": 5.027984094619751, "epoch": 2.4394812680115274, "grad_norm": 1.1015625, "learning_rate": 0.00044064025325896524, "loss": 4.6381, "mean_token_accuracy": 0.24538477808237075, "num_tokens": 58232778.0, "step": 25395 }, { "entropy": 5.163522481918335, "epoch": 2.4399615754082613, "grad_norm": 1.0859375, "learning_rate": 0.00044061704867208484, "loss": 4.8416, "mean_token_accuracy": 0.23376935720443726, "num_tokens": 58243889.0, "step": 25400 }, { "entropy": 5.054913663864136, "epoch": 2.440441882804995, "grad_norm": 1.0234375, "learning_rate": 0.00044059384024006825, "loss": 4.7418, "mean_token_accuracy": 0.23509032428264617, "num_tokens": 58256144.0, "step": 25405 }, { "entropy": 5.045040321350098, "epoch": 2.440922190201729, "grad_norm": 1.1796875, "learning_rate": 0.0004405706279634545, "loss": 4.7683, "mean_token_accuracy": 0.2350820556282997, "num_tokens": 58267509.0, "step": 25410 }, { "entropy": 5.059934377670288, "epoch": 2.441402497598463, "grad_norm": 1.140625, "learning_rate": 0.00044054741184278243, "loss": 4.6942, "mean_token_accuracy": 0.24670542627573014, "num_tokens": 58278880.0, "step": 25415 }, { "entropy": 5.173495578765869, "epoch": 2.441882804995197, "grad_norm": 1.0625, "learning_rate": 0.00044052419187859095, "loss": 4.7589, "mean_token_accuracy": 0.2268371656537056, "num_tokens": 58290046.0, "step": 25420 }, { "entropy": 5.016854190826416, "epoch": 2.4423631123919307, "grad_norm": 1.0859375, "learning_rate": 0.0004405009680714193, "loss": 4.6244, "mean_token_accuracy": 0.2436349555850029, "num_tokens": 58301406.0, "step": 25425 }, { "entropy": 5.0364861488342285, "epoch": 2.4428434197886646, "grad_norm": 1.15625, "learning_rate": 0.0004404777404218065, "loss": 4.7178, "mean_token_accuracy": 0.23964912444353104, "num_tokens": 58313064.0, "step": 25430 }, { "entropy": 5.1077882766723635, "epoch": 2.4433237271853985, "grad_norm": 1.109375, "learning_rate": 0.000440454508930292, "loss": 4.8441, "mean_token_accuracy": 0.22159260958433152, "num_tokens": 58324779.0, "step": 25435 }, { "entropy": 5.182451915740967, "epoch": 2.443804034582133, "grad_norm": 1.21875, "learning_rate": 0.0004404312735974152, "loss": 4.8368, "mean_token_accuracy": 0.2312808156013489, "num_tokens": 58336064.0, "step": 25440 }, { "entropy": 5.032112169265747, "epoch": 2.4442843419788662, "grad_norm": 1.15625, "learning_rate": 0.00044040803442371533, "loss": 4.6852, "mean_token_accuracy": 0.23814561814069748, "num_tokens": 58347497.0, "step": 25445 }, { "entropy": 4.990532445907593, "epoch": 2.4447646493756006, "grad_norm": 1.15625, "learning_rate": 0.0004403847914097321, "loss": 4.6822, "mean_token_accuracy": 0.2447928160429001, "num_tokens": 58357716.0, "step": 25450 }, { "entropy": 5.114711761474609, "epoch": 2.4452449567723344, "grad_norm": 1.15625, "learning_rate": 0.00044036154455600517, "loss": 4.8153, "mean_token_accuracy": 0.23038498014211656, "num_tokens": 58368451.0, "step": 25455 }, { "entropy": 5.123874950408935, "epoch": 2.4457252641690683, "grad_norm": 1.0859375, "learning_rate": 0.0004403382938630741, "loss": 4.7933, "mean_token_accuracy": 0.22636810839176177, "num_tokens": 58378915.0, "step": 25460 }, { "entropy": 5.079519605636596, "epoch": 2.446205571565802, "grad_norm": 1.0234375, "learning_rate": 0.00044031503933147887, "loss": 4.7004, "mean_token_accuracy": 0.24838587641716003, "num_tokens": 58390251.0, "step": 25465 }, { "entropy": 5.059087228775025, "epoch": 2.446685878962536, "grad_norm": 1.09375, "learning_rate": 0.00044029178096175934, "loss": 4.7154, "mean_token_accuracy": 0.23726015239953996, "num_tokens": 58401390.0, "step": 25470 }, { "entropy": 5.05997052192688, "epoch": 2.44716618635927, "grad_norm": 1.125, "learning_rate": 0.0004402685187544554, "loss": 4.7455, "mean_token_accuracy": 0.24133506268262864, "num_tokens": 58412530.0, "step": 25475 }, { "entropy": 5.161045837402344, "epoch": 2.447646493756004, "grad_norm": 1.109375, "learning_rate": 0.0004402452527101072, "loss": 4.848, "mean_token_accuracy": 0.22996888011693956, "num_tokens": 58424437.0, "step": 25480 }, { "entropy": 5.153648948669433, "epoch": 2.4481268011527377, "grad_norm": 1.1875, "learning_rate": 0.0004402219828292549, "loss": 4.8021, "mean_token_accuracy": 0.22747268825769423, "num_tokens": 58435661.0, "step": 25485 }, { "entropy": 5.1356048583984375, "epoch": 2.4486071085494716, "grad_norm": 1.1171875, "learning_rate": 0.0004401987091124388, "loss": 4.9027, "mean_token_accuracy": 0.21983251720666885, "num_tokens": 58445840.0, "step": 25490 }, { "entropy": 5.135798072814941, "epoch": 2.4490874159462055, "grad_norm": 1.0078125, "learning_rate": 0.0004401754315601992, "loss": 4.8442, "mean_token_accuracy": 0.22019636034965515, "num_tokens": 58460160.0, "step": 25495 }, { "entropy": 5.105370426177979, "epoch": 2.4495677233429394, "grad_norm": 1.0546875, "learning_rate": 0.0004401521501730765, "loss": 4.6796, "mean_token_accuracy": 0.24194978177547455, "num_tokens": 58471257.0, "step": 25500 }, { "entropy": 4.981129550933838, "epoch": 2.4500480307396733, "grad_norm": 1.09375, "learning_rate": 0.00044012886495161144, "loss": 4.7116, "mean_token_accuracy": 0.23703904449939728, "num_tokens": 58482830.0, "step": 25505 }, { "entropy": 5.105340099334716, "epoch": 2.450528338136407, "grad_norm": 1.0546875, "learning_rate": 0.0004401055758963443, "loss": 4.7269, "mean_token_accuracy": 0.24424219131469727, "num_tokens": 58494278.0, "step": 25510 }, { "entropy": 5.112192296981812, "epoch": 2.451008645533141, "grad_norm": 1.0703125, "learning_rate": 0.000440082283007816, "loss": 4.7334, "mean_token_accuracy": 0.22664321213960648, "num_tokens": 58505653.0, "step": 25515 }, { "entropy": 5.135557985305786, "epoch": 2.451488952929875, "grad_norm": 1.203125, "learning_rate": 0.00044005898628656734, "loss": 4.7579, "mean_token_accuracy": 0.23311397582292556, "num_tokens": 58516906.0, "step": 25520 }, { "entropy": 5.103575658798218, "epoch": 2.4519692603266092, "grad_norm": 1.21875, "learning_rate": 0.000440035685733139, "loss": 4.7533, "mean_token_accuracy": 0.23946669548749924, "num_tokens": 58527326.0, "step": 25525 }, { "entropy": 5.078144502639771, "epoch": 2.452449567723343, "grad_norm": 1.0390625, "learning_rate": 0.0004400123813480722, "loss": 4.7108, "mean_token_accuracy": 0.24113842248916625, "num_tokens": 58537749.0, "step": 25530 }, { "entropy": 5.087492990493774, "epoch": 2.452929875120077, "grad_norm": 1.0390625, "learning_rate": 0.00043998907313190787, "loss": 4.7554, "mean_token_accuracy": 0.2300448402762413, "num_tokens": 58549479.0, "step": 25535 }, { "entropy": 5.093681192398071, "epoch": 2.453410182516811, "grad_norm": 1.1015625, "learning_rate": 0.0004399657610851873, "loss": 4.7711, "mean_token_accuracy": 0.23146681785583495, "num_tokens": 58561051.0, "step": 25540 }, { "entropy": 5.136078310012818, "epoch": 2.4538904899135447, "grad_norm": 1.078125, "learning_rate": 0.00043994244520845146, "loss": 4.8589, "mean_token_accuracy": 0.22926601022481918, "num_tokens": 58571482.0, "step": 25545 }, { "entropy": 5.14628643989563, "epoch": 2.4543707973102786, "grad_norm": 1.15625, "learning_rate": 0.0004399191255022418, "loss": 4.7427, "mean_token_accuracy": 0.23322267979383468, "num_tokens": 58584369.0, "step": 25550 }, { "entropy": 5.113161659240722, "epoch": 2.4548511047070125, "grad_norm": 1.0703125, "learning_rate": 0.0004398958019670998, "loss": 4.7312, "mean_token_accuracy": 0.2340619221329689, "num_tokens": 58595786.0, "step": 25555 }, { "entropy": 5.0093278884887695, "epoch": 2.4553314121037464, "grad_norm": 1.078125, "learning_rate": 0.00043987247460356696, "loss": 4.7354, "mean_token_accuracy": 0.233761328458786, "num_tokens": 58608536.0, "step": 25560 }, { "entropy": 5.031365156173706, "epoch": 2.4558117195004803, "grad_norm": 1.15625, "learning_rate": 0.0004398491434121847, "loss": 4.6684, "mean_token_accuracy": 0.2444691464304924, "num_tokens": 58620981.0, "step": 25565 }, { "entropy": 5.117167186737061, "epoch": 2.456292026897214, "grad_norm": 1.15625, "learning_rate": 0.000439825808393495, "loss": 4.7756, "mean_token_accuracy": 0.2298066183924675, "num_tokens": 58632802.0, "step": 25570 }, { "entropy": 5.1339428424835205, "epoch": 2.456772334293948, "grad_norm": 1.2890625, "learning_rate": 0.0004398024695480394, "loss": 4.819, "mean_token_accuracy": 0.23125423789024352, "num_tokens": 58644577.0, "step": 25575 }, { "entropy": 5.117880201339721, "epoch": 2.457252641690682, "grad_norm": 1.203125, "learning_rate": 0.0004397791268763598, "loss": 4.7427, "mean_token_accuracy": 0.2311472088098526, "num_tokens": 58654970.0, "step": 25580 }, { "entropy": 5.114732074737549, "epoch": 2.457732949087416, "grad_norm": 1.078125, "learning_rate": 0.00043975578037899814, "loss": 4.8628, "mean_token_accuracy": 0.2292526826262474, "num_tokens": 58666534.0, "step": 25585 }, { "entropy": 5.08922929763794, "epoch": 2.4582132564841497, "grad_norm": 1.0234375, "learning_rate": 0.0004397324300564966, "loss": 4.7486, "mean_token_accuracy": 0.23776388466358184, "num_tokens": 58677451.0, "step": 25590 }, { "entropy": 5.102073621749878, "epoch": 2.4586935638808836, "grad_norm": 1.1171875, "learning_rate": 0.0004397090759093971, "loss": 4.7518, "mean_token_accuracy": 0.23649442344903945, "num_tokens": 58689179.0, "step": 25595 }, { "entropy": 5.10036883354187, "epoch": 2.459173871277618, "grad_norm": 1.0859375, "learning_rate": 0.00043968571793824194, "loss": 4.8596, "mean_token_accuracy": 0.22949687093496324, "num_tokens": 58701939.0, "step": 25600 }, { "entropy": 5.045658206939697, "epoch": 2.4596541786743518, "grad_norm": 1.015625, "learning_rate": 0.0004396623561435734, "loss": 4.6265, "mean_token_accuracy": 0.24700823426246643, "num_tokens": 58714302.0, "step": 25605 }, { "entropy": 5.052271366119385, "epoch": 2.4601344860710856, "grad_norm": 1.140625, "learning_rate": 0.0004396389905259339, "loss": 4.7653, "mean_token_accuracy": 0.22990579009056092, "num_tokens": 58727619.0, "step": 25610 }, { "entropy": 5.145890998840332, "epoch": 2.4606147934678195, "grad_norm": 1.0625, "learning_rate": 0.00043961562108586603, "loss": 4.8212, "mean_token_accuracy": 0.22512982934713363, "num_tokens": 58738929.0, "step": 25615 }, { "entropy": 5.2230690002441404, "epoch": 2.4610951008645534, "grad_norm": 1.1171875, "learning_rate": 0.00043959224782391215, "loss": 4.8429, "mean_token_accuracy": 0.23013273477554322, "num_tokens": 58751771.0, "step": 25620 }, { "entropy": 5.004738235473633, "epoch": 2.4615754082612873, "grad_norm": 1.2109375, "learning_rate": 0.000439568870740615, "loss": 4.6611, "mean_token_accuracy": 0.2486381411552429, "num_tokens": 58762304.0, "step": 25625 }, { "entropy": 5.052642393112182, "epoch": 2.462055715658021, "grad_norm": 1.1484375, "learning_rate": 0.0004395454898365174, "loss": 4.7486, "mean_token_accuracy": 0.22977619022130966, "num_tokens": 58775222.0, "step": 25630 }, { "entropy": 5.007144784927368, "epoch": 2.462536023054755, "grad_norm": 1.1015625, "learning_rate": 0.00043952210511216205, "loss": 4.7316, "mean_token_accuracy": 0.23972392976284027, "num_tokens": 58785904.0, "step": 25635 }, { "entropy": 5.0395995616912845, "epoch": 2.463016330451489, "grad_norm": 1.1953125, "learning_rate": 0.00043949871656809205, "loss": 4.7426, "mean_token_accuracy": 0.2365034982562065, "num_tokens": 58796186.0, "step": 25640 }, { "entropy": 5.159918355941772, "epoch": 2.463496637848223, "grad_norm": 1.171875, "learning_rate": 0.00043947532420485024, "loss": 4.8109, "mean_token_accuracy": 0.23558780550956726, "num_tokens": 58807317.0, "step": 25645 }, { "entropy": 5.176284885406494, "epoch": 2.4639769452449567, "grad_norm": 1.1328125, "learning_rate": 0.0004394519280229798, "loss": 4.8786, "mean_token_accuracy": 0.22497029304504396, "num_tokens": 58819108.0, "step": 25650 }, { "entropy": 5.087363910675049, "epoch": 2.4644572526416906, "grad_norm": 1.1015625, "learning_rate": 0.00043942852802302397, "loss": 4.8114, "mean_token_accuracy": 0.22998191565275192, "num_tokens": 58830962.0, "step": 25655 }, { "entropy": 5.166617107391358, "epoch": 2.4649375600384245, "grad_norm": 1.0234375, "learning_rate": 0.0004394051242055259, "loss": 4.8907, "mean_token_accuracy": 0.22262165695428848, "num_tokens": 58843236.0, "step": 25660 }, { "entropy": 5.128620433807373, "epoch": 2.4654178674351583, "grad_norm": 1.0625, "learning_rate": 0.0004393817165710291, "loss": 4.7161, "mean_token_accuracy": 0.2342569798231125, "num_tokens": 58853995.0, "step": 25665 }, { "entropy": 5.0780110359191895, "epoch": 2.465898174831892, "grad_norm": 1.0703125, "learning_rate": 0.00043935830512007687, "loss": 4.7098, "mean_token_accuracy": 0.24568893015384674, "num_tokens": 58866563.0, "step": 25670 }, { "entropy": 5.14299054145813, "epoch": 2.4663784822286265, "grad_norm": 0.94921875, "learning_rate": 0.00043933488985321286, "loss": 4.8673, "mean_token_accuracy": 0.22950955629348754, "num_tokens": 58878660.0, "step": 25675 }, { "entropy": 5.0542590618133545, "epoch": 2.46685878962536, "grad_norm": 1.1015625, "learning_rate": 0.0004393114707709807, "loss": 4.7065, "mean_token_accuracy": 0.2350746512413025, "num_tokens": 58890332.0, "step": 25680 }, { "entropy": 5.098819255828857, "epoch": 2.4673390970220943, "grad_norm": 1.15625, "learning_rate": 0.0004392880478739241, "loss": 4.7921, "mean_token_accuracy": 0.2282637909054756, "num_tokens": 58902228.0, "step": 25685 }, { "entropy": 5.1141167163848875, "epoch": 2.467819404418828, "grad_norm": 1.0390625, "learning_rate": 0.0004392646211625869, "loss": 4.7941, "mean_token_accuracy": 0.23558991700410842, "num_tokens": 58914452.0, "step": 25690 }, { "entropy": 5.116840028762818, "epoch": 2.468299711815562, "grad_norm": 1.171875, "learning_rate": 0.0004392411906375129, "loss": 4.7676, "mean_token_accuracy": 0.22971803247928618, "num_tokens": 58926079.0, "step": 25695 }, { "entropy": 5.092357540130616, "epoch": 2.468780019212296, "grad_norm": 1.21875, "learning_rate": 0.00043921775629924615, "loss": 4.7433, "mean_token_accuracy": 0.23688182830810547, "num_tokens": 58937292.0, "step": 25700 }, { "entropy": 5.097441005706787, "epoch": 2.46926032660903, "grad_norm": 1.0859375, "learning_rate": 0.00043919431814833077, "loss": 4.8116, "mean_token_accuracy": 0.23292779326438903, "num_tokens": 58949425.0, "step": 25705 }, { "entropy": 5.1808027744293215, "epoch": 2.4697406340057637, "grad_norm": 1.0859375, "learning_rate": 0.00043917087618531084, "loss": 4.7755, "mean_token_accuracy": 0.23483059853315352, "num_tokens": 58961620.0, "step": 25710 }, { "entropy": 5.1284150123596195, "epoch": 2.4702209414024976, "grad_norm": 1.109375, "learning_rate": 0.0004391474304107307, "loss": 4.7137, "mean_token_accuracy": 0.2397887110710144, "num_tokens": 58973043.0, "step": 25715 }, { "entropy": 5.0684206008911135, "epoch": 2.4707012487992315, "grad_norm": 1.0703125, "learning_rate": 0.00043912398082513463, "loss": 4.8725, "mean_token_accuracy": 0.22367147654294967, "num_tokens": 58984516.0, "step": 25720 }, { "entropy": 5.085205459594727, "epoch": 2.4711815561959654, "grad_norm": 1.1875, "learning_rate": 0.000439100527429067, "loss": 4.7149, "mean_token_accuracy": 0.23901582807302474, "num_tokens": 58996473.0, "step": 25725 }, { "entropy": 5.107437419891357, "epoch": 2.4716618635926992, "grad_norm": 1.0703125, "learning_rate": 0.00043907707022307243, "loss": 4.7239, "mean_token_accuracy": 0.2298218384385109, "num_tokens": 59009001.0, "step": 25730 }, { "entropy": 5.174347257614135, "epoch": 2.472142170989433, "grad_norm": 1.046875, "learning_rate": 0.00043905360920769553, "loss": 4.8174, "mean_token_accuracy": 0.23512738794088364, "num_tokens": 59021238.0, "step": 25735 }, { "entropy": 5.152868318557739, "epoch": 2.472622478386167, "grad_norm": 1.0625, "learning_rate": 0.000439030144383481, "loss": 4.8156, "mean_token_accuracy": 0.23765633702278138, "num_tokens": 59031311.0, "step": 25740 }, { "entropy": 5.0604266166687015, "epoch": 2.473102785782901, "grad_norm": 1.25, "learning_rate": 0.00043900667575097355, "loss": 4.7536, "mean_token_accuracy": 0.2329123303294182, "num_tokens": 59043151.0, "step": 25745 }, { "entropy": 5.134230995178223, "epoch": 2.473583093179635, "grad_norm": 1.1015625, "learning_rate": 0.0004389832033107181, "loss": 4.8443, "mean_token_accuracy": 0.2332065299153328, "num_tokens": 59054513.0, "step": 25750 }, { "entropy": 5.129967975616455, "epoch": 2.4740634005763686, "grad_norm": 1.15625, "learning_rate": 0.00043895972706325953, "loss": 4.7195, "mean_token_accuracy": 0.237641379237175, "num_tokens": 59064909.0, "step": 25755 }, { "entropy": 5.071166515350342, "epoch": 2.474543707973103, "grad_norm": 1.1171875, "learning_rate": 0.000438936247009143, "loss": 4.7855, "mean_token_accuracy": 0.22626807391643525, "num_tokens": 59077089.0, "step": 25760 }, { "entropy": 5.099506616592407, "epoch": 2.475024015369837, "grad_norm": 1.2734375, "learning_rate": 0.00043891276314891365, "loss": 4.7394, "mean_token_accuracy": 0.23274567127227783, "num_tokens": 59089188.0, "step": 25765 }, { "entropy": 5.167963123321533, "epoch": 2.4755043227665707, "grad_norm": 1.0703125, "learning_rate": 0.0004388892754831166, "loss": 4.8073, "mean_token_accuracy": 0.22330971658229828, "num_tokens": 59101384.0, "step": 25770 }, { "entropy": 5.132992935180664, "epoch": 2.4759846301633046, "grad_norm": 1.109375, "learning_rate": 0.0004388657840122973, "loss": 4.7529, "mean_token_accuracy": 0.23194568306207658, "num_tokens": 59112327.0, "step": 25775 }, { "entropy": 5.120018386840821, "epoch": 2.4764649375600385, "grad_norm": 1.1171875, "learning_rate": 0.000438842288737001, "loss": 4.8109, "mean_token_accuracy": 0.22848654985427858, "num_tokens": 59123577.0, "step": 25780 }, { "entropy": 5.031744861602784, "epoch": 2.4769452449567724, "grad_norm": 1.09375, "learning_rate": 0.00043881878965777325, "loss": 4.6937, "mean_token_accuracy": 0.24387203007936478, "num_tokens": 59135359.0, "step": 25785 }, { "entropy": 5.032623624801635, "epoch": 2.4774255523535063, "grad_norm": 1.0234375, "learning_rate": 0.00043879528677515973, "loss": 4.6516, "mean_token_accuracy": 0.24132218658924104, "num_tokens": 59146679.0, "step": 25790 }, { "entropy": 5.063320732116699, "epoch": 2.47790585975024, "grad_norm": 1.2890625, "learning_rate": 0.00043877178008970596, "loss": 4.788, "mean_token_accuracy": 0.232273106276989, "num_tokens": 59159893.0, "step": 25795 }, { "entropy": 5.072348546981812, "epoch": 2.478386167146974, "grad_norm": 1.3671875, "learning_rate": 0.0004387482696019578, "loss": 4.7349, "mean_token_accuracy": 0.2393971264362335, "num_tokens": 59172592.0, "step": 25800 }, { "entropy": 5.076195192337036, "epoch": 2.478866474543708, "grad_norm": 1.0390625, "learning_rate": 0.00043872475531246105, "loss": 4.7512, "mean_token_accuracy": 0.23732175081968307, "num_tokens": 59183650.0, "step": 25805 }, { "entropy": 5.10286431312561, "epoch": 2.479346781940442, "grad_norm": 1.1640625, "learning_rate": 0.00043870123722176166, "loss": 4.7915, "mean_token_accuracy": 0.23892272710800172, "num_tokens": 59195423.0, "step": 25810 }, { "entropy": 5.10826735496521, "epoch": 2.4798270893371757, "grad_norm": 1.0078125, "learning_rate": 0.0004386777153304056, "loss": 4.7901, "mean_token_accuracy": 0.22936685085296632, "num_tokens": 59208486.0, "step": 25815 }, { "entropy": 5.073663139343262, "epoch": 2.4803073967339095, "grad_norm": 1.140625, "learning_rate": 0.00043865418963893896, "loss": 4.7637, "mean_token_accuracy": 0.23651630729436873, "num_tokens": 59220230.0, "step": 25820 }, { "entropy": 5.084012842178344, "epoch": 2.4807877041306434, "grad_norm": 1.2890625, "learning_rate": 0.0004386306601479081, "loss": 4.7628, "mean_token_accuracy": 0.2342966765165329, "num_tokens": 59231524.0, "step": 25825 }, { "entropy": 5.099392652511597, "epoch": 2.4812680115273773, "grad_norm": 1.0390625, "learning_rate": 0.0004386071268578591, "loss": 4.8511, "mean_token_accuracy": 0.22831531316041948, "num_tokens": 59243525.0, "step": 25830 }, { "entropy": 5.136034917831421, "epoch": 2.4817483189241116, "grad_norm": 1.1640625, "learning_rate": 0.00043858358976933844, "loss": 4.6946, "mean_token_accuracy": 0.23946435898542404, "num_tokens": 59253774.0, "step": 25835 }, { "entropy": 5.108441257476807, "epoch": 2.4822286263208455, "grad_norm": 1.1015625, "learning_rate": 0.00043856004888289264, "loss": 4.771, "mean_token_accuracy": 0.23220686763525009, "num_tokens": 59265640.0, "step": 25840 }, { "entropy": 5.0565966129302975, "epoch": 2.4827089337175794, "grad_norm": 1.09375, "learning_rate": 0.0004385365041990681, "loss": 4.8102, "mean_token_accuracy": 0.23166382163763047, "num_tokens": 59277513.0, "step": 25845 }, { "entropy": 5.199204635620117, "epoch": 2.4831892411143133, "grad_norm": 1.0390625, "learning_rate": 0.0004385129557184116, "loss": 4.7662, "mean_token_accuracy": 0.23623427748680115, "num_tokens": 59289446.0, "step": 25850 }, { "entropy": 5.113775300979614, "epoch": 2.483669548511047, "grad_norm": 1.1328125, "learning_rate": 0.00043848940344146976, "loss": 4.8157, "mean_token_accuracy": 0.23519984036684036, "num_tokens": 59300156.0, "step": 25855 }, { "entropy": 5.048851490020752, "epoch": 2.484149855907781, "grad_norm": 1.0546875, "learning_rate": 0.0004384658473687894, "loss": 4.7629, "mean_token_accuracy": 0.23632101267576217, "num_tokens": 59311500.0, "step": 25860 }, { "entropy": 5.210094690322876, "epoch": 2.484630163304515, "grad_norm": 1.1015625, "learning_rate": 0.0004384422875009176, "loss": 4.8639, "mean_token_accuracy": 0.22739754617214203, "num_tokens": 59323299.0, "step": 25865 }, { "entropy": 5.152326250076294, "epoch": 2.485110470701249, "grad_norm": 1.1171875, "learning_rate": 0.0004384187238384011, "loss": 4.8075, "mean_token_accuracy": 0.2332732543349266, "num_tokens": 59335955.0, "step": 25870 }, { "entropy": 5.007804298400879, "epoch": 2.4855907780979827, "grad_norm": 1.1328125, "learning_rate": 0.0004383951563817871, "loss": 4.6885, "mean_token_accuracy": 0.23482925742864608, "num_tokens": 59347571.0, "step": 25875 }, { "entropy": 5.103703784942627, "epoch": 2.4860710854947166, "grad_norm": 1.125, "learning_rate": 0.0004383715851316227, "loss": 4.7639, "mean_token_accuracy": 0.23006821870803834, "num_tokens": 59358932.0, "step": 25880 }, { "entropy": 5.0206766605377195, "epoch": 2.4865513928914504, "grad_norm": 1.1171875, "learning_rate": 0.00043834801008845527, "loss": 4.6671, "mean_token_accuracy": 0.2419501304626465, "num_tokens": 59371425.0, "step": 25885 }, { "entropy": 5.026921224594116, "epoch": 2.4870317002881843, "grad_norm": 1.234375, "learning_rate": 0.0004383244312528321, "loss": 4.7173, "mean_token_accuracy": 0.23288854360580444, "num_tokens": 59382917.0, "step": 25890 }, { "entropy": 5.1574970245361325, "epoch": 2.487512007684918, "grad_norm": 1.1484375, "learning_rate": 0.0004383008486253006, "loss": 4.833, "mean_token_accuracy": 0.22575733363628386, "num_tokens": 59395014.0, "step": 25895 }, { "entropy": 5.147827816009522, "epoch": 2.487992315081652, "grad_norm": 1.09375, "learning_rate": 0.00043827726220640827, "loss": 4.8096, "mean_token_accuracy": 0.22929120808839798, "num_tokens": 59407115.0, "step": 25900 }, { "entropy": 5.234961748123169, "epoch": 2.488472622478386, "grad_norm": 1.15625, "learning_rate": 0.00043825367199670274, "loss": 4.8872, "mean_token_accuracy": 0.22529138028621673, "num_tokens": 59418744.0, "step": 25905 }, { "entropy": 5.047393321990967, "epoch": 2.4889529298751203, "grad_norm": 1.1796875, "learning_rate": 0.0004382300779967318, "loss": 4.6953, "mean_token_accuracy": 0.24463900476694106, "num_tokens": 59429950.0, "step": 25910 }, { "entropy": 5.096230125427246, "epoch": 2.489433237271854, "grad_norm": 1.1875, "learning_rate": 0.00043820648020704303, "loss": 4.8733, "mean_token_accuracy": 0.23028983771800995, "num_tokens": 59441335.0, "step": 25915 }, { "entropy": 5.147711133956909, "epoch": 2.489913544668588, "grad_norm": 1.2109375, "learning_rate": 0.00043818287862818444, "loss": 4.7661, "mean_token_accuracy": 0.23482993692159654, "num_tokens": 59452325.0, "step": 25920 }, { "entropy": 5.134023904800415, "epoch": 2.490393852065322, "grad_norm": 1.0546875, "learning_rate": 0.000438159273260704, "loss": 4.8731, "mean_token_accuracy": 0.23030917197465897, "num_tokens": 59464352.0, "step": 25925 }, { "entropy": 5.108134984970093, "epoch": 2.490874159462056, "grad_norm": 1.0234375, "learning_rate": 0.0004381356641051497, "loss": 4.7492, "mean_token_accuracy": 0.23972258418798448, "num_tokens": 59476048.0, "step": 25930 }, { "entropy": 5.152993965148926, "epoch": 2.4913544668587897, "grad_norm": 1.15625, "learning_rate": 0.0004381120511620697, "loss": 4.8179, "mean_token_accuracy": 0.2259794533252716, "num_tokens": 59488139.0, "step": 25935 }, { "entropy": 5.102795743942261, "epoch": 2.4918347742555236, "grad_norm": 1.0859375, "learning_rate": 0.00043808843443201217, "loss": 4.8476, "mean_token_accuracy": 0.22818621397018432, "num_tokens": 59498279.0, "step": 25940 }, { "entropy": 5.15069465637207, "epoch": 2.4923150816522575, "grad_norm": 1.1640625, "learning_rate": 0.0004380648139155255, "loss": 4.7699, "mean_token_accuracy": 0.23187788128852843, "num_tokens": 59509914.0, "step": 25945 }, { "entropy": 5.200746536254883, "epoch": 2.4927953890489913, "grad_norm": 1.234375, "learning_rate": 0.0004380411896131581, "loss": 4.9313, "mean_token_accuracy": 0.2188297063112259, "num_tokens": 59521441.0, "step": 25950 }, { "entropy": 5.111665678024292, "epoch": 2.493275696445725, "grad_norm": 1.109375, "learning_rate": 0.00043801756152545836, "loss": 4.844, "mean_token_accuracy": 0.22982463389635086, "num_tokens": 59533086.0, "step": 25955 }, { "entropy": 5.176836347579956, "epoch": 2.493756003842459, "grad_norm": 1.046875, "learning_rate": 0.00043799392965297496, "loss": 4.8271, "mean_token_accuracy": 0.22817039489746094, "num_tokens": 59545165.0, "step": 25960 }, { "entropy": 5.115931177139283, "epoch": 2.494236311239193, "grad_norm": 1.1015625, "learning_rate": 0.0004379702939962564, "loss": 4.8065, "mean_token_accuracy": 0.22424955815076827, "num_tokens": 59556016.0, "step": 25965 }, { "entropy": 5.119119167327881, "epoch": 2.494716618635927, "grad_norm": 1.2109375, "learning_rate": 0.0004379466545558516, "loss": 4.7896, "mean_token_accuracy": 0.22895194143056868, "num_tokens": 59566431.0, "step": 25970 }, { "entropy": 5.152138757705688, "epoch": 2.4951969260326607, "grad_norm": 1.09375, "learning_rate": 0.00043792301133230933, "loss": 4.8451, "mean_token_accuracy": 0.2285153165459633, "num_tokens": 59579371.0, "step": 25975 }, { "entropy": 5.109066534042358, "epoch": 2.4956772334293946, "grad_norm": 1.0234375, "learning_rate": 0.0004378993643261785, "loss": 4.7384, "mean_token_accuracy": 0.24127317667007447, "num_tokens": 59590088.0, "step": 25980 }, { "entropy": 5.071279907226563, "epoch": 2.496157540826129, "grad_norm": 1.1328125, "learning_rate": 0.00043787571353800814, "loss": 4.7119, "mean_token_accuracy": 0.24527304023504257, "num_tokens": 59600973.0, "step": 25985 }, { "entropy": 5.117540597915649, "epoch": 2.4966378482228624, "grad_norm": 1.0390625, "learning_rate": 0.0004378520589683475, "loss": 4.7536, "mean_token_accuracy": 0.23426486998796464, "num_tokens": 59613194.0, "step": 25990 }, { "entropy": 5.076265668869018, "epoch": 2.4971181556195967, "grad_norm": 1.109375, "learning_rate": 0.00043782840061774544, "loss": 4.7693, "mean_token_accuracy": 0.23121773898601533, "num_tokens": 59623932.0, "step": 25995 }, { "entropy": 5.0959716796875, "epoch": 2.4975984630163306, "grad_norm": 1.1171875, "learning_rate": 0.00043780473848675143, "loss": 4.8548, "mean_token_accuracy": 0.227916020154953, "num_tokens": 59635095.0, "step": 26000 }, { "entropy": 5.101946401596069, "epoch": 2.4980787704130645, "grad_norm": 1.0390625, "learning_rate": 0.0004377810725759149, "loss": 4.7575, "mean_token_accuracy": 0.2337815672159195, "num_tokens": 59646930.0, "step": 26005 }, { "entropy": 5.095354413986206, "epoch": 2.4985590778097984, "grad_norm": 0.9765625, "learning_rate": 0.00043775740288578516, "loss": 4.7307, "mean_token_accuracy": 0.23744526356458664, "num_tokens": 59658729.0, "step": 26010 }, { "entropy": 5.054209232330322, "epoch": 2.4990393852065322, "grad_norm": 1.09375, "learning_rate": 0.0004377337294169118, "loss": 4.8073, "mean_token_accuracy": 0.23229757994413375, "num_tokens": 59670619.0, "step": 26015 }, { "entropy": 5.174720811843872, "epoch": 2.499519692603266, "grad_norm": 1.2578125, "learning_rate": 0.00043771005216984457, "loss": 4.8931, "mean_token_accuracy": 0.22655860781669618, "num_tokens": 59682307.0, "step": 26020 }, { "entropy": 5.136973333358765, "epoch": 2.5, "grad_norm": 1.03125, "learning_rate": 0.000437686371145133, "loss": 4.7471, "mean_token_accuracy": 0.23237128108739852, "num_tokens": 59693515.0, "step": 26025 }, { "entropy": 5.1173011302948, "epoch": 2.500480307396734, "grad_norm": 1.03125, "learning_rate": 0.000437662686343327, "loss": 4.799, "mean_token_accuracy": 0.22752099484205246, "num_tokens": 59705012.0, "step": 26030 }, { "entropy": 5.171797037124634, "epoch": 2.5009606147934678, "grad_norm": 1.125, "learning_rate": 0.0004376389977649764, "loss": 4.8797, "mean_token_accuracy": 0.23065385967493057, "num_tokens": 59716776.0, "step": 26035 }, { "entropy": 5.087970304489136, "epoch": 2.5014409221902016, "grad_norm": 1.09375, "learning_rate": 0.0004376153054106313, "loss": 4.7667, "mean_token_accuracy": 0.23524891585111618, "num_tokens": 59727526.0, "step": 26040 }, { "entropy": 5.122954797744751, "epoch": 2.5019212295869355, "grad_norm": 1.046875, "learning_rate": 0.0004375916092808416, "loss": 4.7422, "mean_token_accuracy": 0.23872457444667816, "num_tokens": 59738861.0, "step": 26045 }, { "entropy": 5.249390029907227, "epoch": 2.5024015369836694, "grad_norm": 1.109375, "learning_rate": 0.0004375679093761575, "loss": 4.9846, "mean_token_accuracy": 0.2129766032099724, "num_tokens": 59751019.0, "step": 26050 }, { "entropy": 5.14567813873291, "epoch": 2.5028818443804033, "grad_norm": 1.03125, "learning_rate": 0.00043754420569712925, "loss": 4.7858, "mean_token_accuracy": 0.23589466214179994, "num_tokens": 59762936.0, "step": 26055 }, { "entropy": 5.094907569885254, "epoch": 2.5033621517771376, "grad_norm": 1.03125, "learning_rate": 0.00043752049824430736, "loss": 4.7202, "mean_token_accuracy": 0.23375146836042404, "num_tokens": 59772909.0, "step": 26060 }, { "entropy": 5.0118269443511965, "epoch": 2.503842459173871, "grad_norm": 0.99609375, "learning_rate": 0.00043749678701824197, "loss": 4.6903, "mean_token_accuracy": 0.23832600712776184, "num_tokens": 59784003.0, "step": 26065 }, { "entropy": 5.098705244064331, "epoch": 2.5043227665706054, "grad_norm": 1.1875, "learning_rate": 0.0004374730720194837, "loss": 4.835, "mean_token_accuracy": 0.2284349739551544, "num_tokens": 59796200.0, "step": 26070 }, { "entropy": 5.090223264694214, "epoch": 2.5048030739673393, "grad_norm": 1.09375, "learning_rate": 0.0004374493532485832, "loss": 4.7573, "mean_token_accuracy": 0.23184773176908494, "num_tokens": 59807559.0, "step": 26075 }, { "entropy": 5.136758613586426, "epoch": 2.505283381364073, "grad_norm": 1.203125, "learning_rate": 0.000437425630706091, "loss": 4.819, "mean_token_accuracy": 0.22647526264190673, "num_tokens": 59818213.0, "step": 26080 }, { "entropy": 5.130527019500732, "epoch": 2.505763688760807, "grad_norm": 1.046875, "learning_rate": 0.000437401904392558, "loss": 4.812, "mean_token_accuracy": 0.2289934679865837, "num_tokens": 59829485.0, "step": 26085 }, { "entropy": 5.078642749786377, "epoch": 2.506243996157541, "grad_norm": 1.0546875, "learning_rate": 0.00043737817430853504, "loss": 4.8662, "mean_token_accuracy": 0.22903375029563905, "num_tokens": 59842187.0, "step": 26090 }, { "entropy": 5.200610637664795, "epoch": 2.506724303554275, "grad_norm": 1.8203125, "learning_rate": 0.00043735444045457303, "loss": 4.9089, "mean_token_accuracy": 0.2234987273812294, "num_tokens": 59852783.0, "step": 26095 }, { "entropy": 5.155863475799561, "epoch": 2.5072046109510087, "grad_norm": 1.078125, "learning_rate": 0.00043733070283122306, "loss": 4.755, "mean_token_accuracy": 0.23283324986696244, "num_tokens": 59863661.0, "step": 26100 }, { "entropy": 5.051503658294678, "epoch": 2.5076849183477425, "grad_norm": 1.1796875, "learning_rate": 0.00043730696143903607, "loss": 4.7325, "mean_token_accuracy": 0.23420979529619218, "num_tokens": 59876109.0, "step": 26105 }, { "entropy": 5.024322080612182, "epoch": 2.5081652257444764, "grad_norm": 1.1015625, "learning_rate": 0.0004372832162785635, "loss": 4.5933, "mean_token_accuracy": 0.24501541554927825, "num_tokens": 59886322.0, "step": 26110 }, { "entropy": 5.029271793365479, "epoch": 2.5086455331412103, "grad_norm": 1.078125, "learning_rate": 0.0004372594673503565, "loss": 4.7192, "mean_token_accuracy": 0.23893527537584305, "num_tokens": 59897307.0, "step": 26115 }, { "entropy": 5.052051687240601, "epoch": 2.509125840537944, "grad_norm": 1.0625, "learning_rate": 0.0004372357146549664, "loss": 4.6937, "mean_token_accuracy": 0.2373049482703209, "num_tokens": 59908709.0, "step": 26120 }, { "entropy": 5.067273283004761, "epoch": 2.509606147934678, "grad_norm": 1.078125, "learning_rate": 0.00043721195819294487, "loss": 4.7122, "mean_token_accuracy": 0.24751894921064377, "num_tokens": 59920082.0, "step": 26125 }, { "entropy": 5.077537918090821, "epoch": 2.510086455331412, "grad_norm": 1.109375, "learning_rate": 0.0004371881979648433, "loss": 4.7497, "mean_token_accuracy": 0.2332884654402733, "num_tokens": 59931632.0, "step": 26130 }, { "entropy": 5.0393143653869625, "epoch": 2.5105667627281463, "grad_norm": 1.0703125, "learning_rate": 0.0004371644339712133, "loss": 4.696, "mean_token_accuracy": 0.2380808562040329, "num_tokens": 59944093.0, "step": 26135 }, { "entropy": 5.0620362758636475, "epoch": 2.5110470701248797, "grad_norm": 1.1640625, "learning_rate": 0.0004371406662126067, "loss": 4.7571, "mean_token_accuracy": 0.22995698899030687, "num_tokens": 59954879.0, "step": 26140 }, { "entropy": 5.057656860351562, "epoch": 2.511527377521614, "grad_norm": 1.0859375, "learning_rate": 0.00043711689468957534, "loss": 4.7668, "mean_token_accuracy": 0.23091911375522614, "num_tokens": 59967074.0, "step": 26145 }, { "entropy": 5.194906091690063, "epoch": 2.5120076849183475, "grad_norm": 1.078125, "learning_rate": 0.00043709311940267107, "loss": 4.8049, "mean_token_accuracy": 0.23293969184160232, "num_tokens": 59978939.0, "step": 26150 }, { "entropy": 5.159637403488159, "epoch": 2.512487992315082, "grad_norm": 1.0859375, "learning_rate": 0.0004370693403524458, "loss": 4.7721, "mean_token_accuracy": 0.23222009539604188, "num_tokens": 59989251.0, "step": 26155 }, { "entropy": 5.043718004226685, "epoch": 2.5129682997118157, "grad_norm": 1.109375, "learning_rate": 0.0004370455575394518, "loss": 4.7492, "mean_token_accuracy": 0.23366657197475432, "num_tokens": 60000267.0, "step": 26160 }, { "entropy": 5.143046808242798, "epoch": 2.5134486071085496, "grad_norm": 1.09375, "learning_rate": 0.0004370217709642411, "loss": 4.7776, "mean_token_accuracy": 0.23386308401823044, "num_tokens": 60011190.0, "step": 26165 }, { "entropy": 5.128286361694336, "epoch": 2.5139289145052834, "grad_norm": 1.03125, "learning_rate": 0.000436997980627366, "loss": 4.8399, "mean_token_accuracy": 0.22566935122013093, "num_tokens": 60023496.0, "step": 26170 }, { "entropy": 5.190862083435059, "epoch": 2.5144092219020173, "grad_norm": 1.125, "learning_rate": 0.00043697418652937877, "loss": 4.8595, "mean_token_accuracy": 0.22487357556819915, "num_tokens": 60034585.0, "step": 26175 }, { "entropy": 5.187910270690918, "epoch": 2.514889529298751, "grad_norm": 1.0859375, "learning_rate": 0.0004369503886708319, "loss": 4.8479, "mean_token_accuracy": 0.22261265069246292, "num_tokens": 60046333.0, "step": 26180 }, { "entropy": 5.111104536056518, "epoch": 2.515369836695485, "grad_norm": 1.0859375, "learning_rate": 0.00043692658705227796, "loss": 4.8294, "mean_token_accuracy": 0.22929517477750777, "num_tokens": 60058744.0, "step": 26185 }, { "entropy": 5.182344436645508, "epoch": 2.515850144092219, "grad_norm": 1.2265625, "learning_rate": 0.00043690278167426945, "loss": 4.87, "mean_token_accuracy": 0.22710851728916168, "num_tokens": 60069303.0, "step": 26190 }, { "entropy": 5.06790828704834, "epoch": 2.516330451488953, "grad_norm": 1.1328125, "learning_rate": 0.0004368789725373591, "loss": 4.7214, "mean_token_accuracy": 0.2350516840815544, "num_tokens": 60080604.0, "step": 26195 }, { "entropy": 5.1351783752441404, "epoch": 2.5168107588856867, "grad_norm": 1.1875, "learning_rate": 0.00043685515964209977, "loss": 4.7637, "mean_token_accuracy": 0.23912479281425475, "num_tokens": 60090151.0, "step": 26200 }, { "entropy": 5.189052581787109, "epoch": 2.5172910662824206, "grad_norm": 1.1875, "learning_rate": 0.0004368313429890441, "loss": 4.8757, "mean_token_accuracy": 0.22785695642232895, "num_tokens": 60102528.0, "step": 26205 }, { "entropy": 5.059284973144531, "epoch": 2.517771373679155, "grad_norm": 1.046875, "learning_rate": 0.0004368075225787453, "loss": 4.7217, "mean_token_accuracy": 0.23929833620786667, "num_tokens": 60113078.0, "step": 26210 }, { "entropy": 5.032156848907471, "epoch": 2.5182516810758884, "grad_norm": 0.984375, "learning_rate": 0.0004367836984117562, "loss": 4.7139, "mean_token_accuracy": 0.2332184448838234, "num_tokens": 60124728.0, "step": 26215 }, { "entropy": 5.013386535644531, "epoch": 2.5187319884726227, "grad_norm": 1.1171875, "learning_rate": 0.00043675987048863, "loss": 4.6951, "mean_token_accuracy": 0.23805242478847505, "num_tokens": 60136224.0, "step": 26220 }, { "entropy": 5.131964063644409, "epoch": 2.519212295869356, "grad_norm": 1.109375, "learning_rate": 0.0004367360388099201, "loss": 4.8383, "mean_token_accuracy": 0.2287626013159752, "num_tokens": 60148654.0, "step": 26225 }, { "entropy": 5.223377132415772, "epoch": 2.5196926032660905, "grad_norm": 1.2421875, "learning_rate": 0.0004367122033761796, "loss": 4.8275, "mean_token_accuracy": 0.22268654704093932, "num_tokens": 60159740.0, "step": 26230 }, { "entropy": 5.142727422714233, "epoch": 2.5201729106628243, "grad_norm": 1.15625, "learning_rate": 0.0004366883641879618, "loss": 4.8051, "mean_token_accuracy": 0.22963927835226058, "num_tokens": 60171387.0, "step": 26235 }, { "entropy": 5.037440156936645, "epoch": 2.520653218059558, "grad_norm": 1.1484375, "learning_rate": 0.00043666452124582034, "loss": 4.7227, "mean_token_accuracy": 0.23929983526468276, "num_tokens": 60181562.0, "step": 26240 }, { "entropy": 4.980976724624634, "epoch": 2.521133525456292, "grad_norm": 1.15625, "learning_rate": 0.0004366406745503088, "loss": 4.6635, "mean_token_accuracy": 0.2430158495903015, "num_tokens": 60191187.0, "step": 26245 }, { "entropy": 5.0385167598724365, "epoch": 2.521613832853026, "grad_norm": 1.0390625, "learning_rate": 0.0004366168241019807, "loss": 4.714, "mean_token_accuracy": 0.2383538380265236, "num_tokens": 60203792.0, "step": 26250 }, { "entropy": 5.156890964508056, "epoch": 2.52209414024976, "grad_norm": 1.0, "learning_rate": 0.0004365929699013899, "loss": 4.7941, "mean_token_accuracy": 0.23458444625139235, "num_tokens": 60215523.0, "step": 26255 }, { "entropy": 5.120486068725586, "epoch": 2.5225744476464937, "grad_norm": 1.1953125, "learning_rate": 0.0004365691119490902, "loss": 4.831, "mean_token_accuracy": 0.2312985271215439, "num_tokens": 60226113.0, "step": 26260 }, { "entropy": 5.1754053115844725, "epoch": 2.5230547550432276, "grad_norm": 0.95703125, "learning_rate": 0.0004365452502456354, "loss": 4.8934, "mean_token_accuracy": 0.2248413920402527, "num_tokens": 60238521.0, "step": 26265 }, { "entropy": 5.02523455619812, "epoch": 2.5235350624399615, "grad_norm": 1.1171875, "learning_rate": 0.0004365213847915796, "loss": 4.6587, "mean_token_accuracy": 0.24070182889699937, "num_tokens": 60249717.0, "step": 26270 }, { "entropy": 5.152025556564331, "epoch": 2.5240153698366954, "grad_norm": 1.0703125, "learning_rate": 0.00043649751558747695, "loss": 4.8645, "mean_token_accuracy": 0.22075033336877822, "num_tokens": 60261749.0, "step": 26275 }, { "entropy": 5.1482549667358395, "epoch": 2.5244956772334293, "grad_norm": 1.171875, "learning_rate": 0.00043647364263388143, "loss": 4.8425, "mean_token_accuracy": 0.22865833938121796, "num_tokens": 60272925.0, "step": 26280 }, { "entropy": 5.139992952346802, "epoch": 2.524975984630163, "grad_norm": 1.046875, "learning_rate": 0.0004364497659313475, "loss": 4.7977, "mean_token_accuracy": 0.2323908507823944, "num_tokens": 60285081.0, "step": 26285 }, { "entropy": 5.155612897872925, "epoch": 2.525456292026897, "grad_norm": 0.9765625, "learning_rate": 0.0004364258854804294, "loss": 4.7932, "mean_token_accuracy": 0.237948477268219, "num_tokens": 60296796.0, "step": 26290 }, { "entropy": 5.1394867420196535, "epoch": 2.5259365994236314, "grad_norm": 1.0703125, "learning_rate": 0.0004364020012816815, "loss": 4.8584, "mean_token_accuracy": 0.22993704825639724, "num_tokens": 60308600.0, "step": 26295 }, { "entropy": 5.062632894515991, "epoch": 2.526416906820365, "grad_norm": 1.078125, "learning_rate": 0.0004363781133356584, "loss": 4.7279, "mean_token_accuracy": 0.2410830244421959, "num_tokens": 60319188.0, "step": 26300 }, { "entropy": 5.1485496997833256, "epoch": 2.526897214217099, "grad_norm": 1.171875, "learning_rate": 0.0004363542216429147, "loss": 4.8249, "mean_token_accuracy": 0.2361416146159172, "num_tokens": 60329912.0, "step": 26305 }, { "entropy": 5.187692260742187, "epoch": 2.527377521613833, "grad_norm": 1.15625, "learning_rate": 0.0004363303262040051, "loss": 4.796, "mean_token_accuracy": 0.2305053174495697, "num_tokens": 60339760.0, "step": 26310 }, { "entropy": 5.007799482345581, "epoch": 2.527857829010567, "grad_norm": 1.078125, "learning_rate": 0.00043630642701948446, "loss": 4.6331, "mean_token_accuracy": 0.2434243828058243, "num_tokens": 60349947.0, "step": 26315 }, { "entropy": 5.076165342330933, "epoch": 2.5283381364073008, "grad_norm": 1.0390625, "learning_rate": 0.00043628252408990756, "loss": 4.8564, "mean_token_accuracy": 0.22621034681797028, "num_tokens": 60362620.0, "step": 26320 }, { "entropy": 5.204525279998779, "epoch": 2.5288184438040346, "grad_norm": 1.2265625, "learning_rate": 0.00043625861741582926, "loss": 4.8204, "mean_token_accuracy": 0.23000132739543916, "num_tokens": 60373729.0, "step": 26325 }, { "entropy": 5.181565809249878, "epoch": 2.5292987512007685, "grad_norm": 1.0703125, "learning_rate": 0.00043623470699780483, "loss": 4.7641, "mean_token_accuracy": 0.23317065536975862, "num_tokens": 60386423.0, "step": 26330 }, { "entropy": 5.090573835372925, "epoch": 2.5297790585975024, "grad_norm": 1.2109375, "learning_rate": 0.0004362107928363892, "loss": 4.7975, "mean_token_accuracy": 0.22879107743501664, "num_tokens": 60398364.0, "step": 26335 }, { "entropy": 5.127240180969238, "epoch": 2.5302593659942363, "grad_norm": 1.0625, "learning_rate": 0.0004361868749321377, "loss": 4.8279, "mean_token_accuracy": 0.23116668611764907, "num_tokens": 60408423.0, "step": 26340 }, { "entropy": 5.175243282318116, "epoch": 2.53073967339097, "grad_norm": 0.9921875, "learning_rate": 0.0004361629532856055, "loss": 4.8457, "mean_token_accuracy": 0.2255500078201294, "num_tokens": 60420229.0, "step": 26345 }, { "entropy": 5.2243876457214355, "epoch": 2.531219980787704, "grad_norm": 1.1328125, "learning_rate": 0.00043613902789734816, "loss": 4.873, "mean_token_accuracy": 0.2286778062582016, "num_tokens": 60430711.0, "step": 26350 }, { "entropy": 5.182257080078125, "epoch": 2.531700288184438, "grad_norm": 1.078125, "learning_rate": 0.000436115098767921, "loss": 4.8509, "mean_token_accuracy": 0.226052425801754, "num_tokens": 60442293.0, "step": 26355 }, { "entropy": 5.0641368389129635, "epoch": 2.532180595581172, "grad_norm": 1.0390625, "learning_rate": 0.00043609116589787974, "loss": 4.7563, "mean_token_accuracy": 0.23511586487293243, "num_tokens": 60452616.0, "step": 26360 }, { "entropy": 5.088418579101562, "epoch": 2.5326609029779057, "grad_norm": 1.0546875, "learning_rate": 0.0004360672292877799, "loss": 4.7419, "mean_token_accuracy": 0.23013172149658204, "num_tokens": 60463254.0, "step": 26365 }, { "entropy": 5.086653232574463, "epoch": 2.53314121037464, "grad_norm": 1.1328125, "learning_rate": 0.00043604328893817726, "loss": 4.7792, "mean_token_accuracy": 0.23134986758232118, "num_tokens": 60473725.0, "step": 26370 }, { "entropy": 5.126363468170166, "epoch": 2.5336215177713735, "grad_norm": 1.09375, "learning_rate": 0.00043601934484962775, "loss": 4.7947, "mean_token_accuracy": 0.2362975835800171, "num_tokens": 60484502.0, "step": 26375 }, { "entropy": 5.046994161605835, "epoch": 2.534101825168108, "grad_norm": 1.0, "learning_rate": 0.0004359953970226871, "loss": 4.7355, "mean_token_accuracy": 0.2410357877612114, "num_tokens": 60496134.0, "step": 26380 }, { "entropy": 5.132125663757324, "epoch": 2.5345821325648417, "grad_norm": 1.1171875, "learning_rate": 0.00043597144545791134, "loss": 4.8178, "mean_token_accuracy": 0.23338208943605424, "num_tokens": 60507236.0, "step": 26385 }, { "entropy": 5.057839679718017, "epoch": 2.5350624399615755, "grad_norm": 1.0859375, "learning_rate": 0.0004359474901558567, "loss": 4.7484, "mean_token_accuracy": 0.24036937803030015, "num_tokens": 60517499.0, "step": 26390 }, { "entropy": 5.162707138061523, "epoch": 2.5355427473583094, "grad_norm": 1.0390625, "learning_rate": 0.0004359235311170792, "loss": 4.8445, "mean_token_accuracy": 0.2248290151357651, "num_tokens": 60529434.0, "step": 26395 }, { "entropy": 5.166756963729858, "epoch": 2.5360230547550433, "grad_norm": 1.0625, "learning_rate": 0.0004358995683421352, "loss": 4.8376, "mean_token_accuracy": 0.22776952087879182, "num_tokens": 60542571.0, "step": 26400 }, { "entropy": 5.0918957710266115, "epoch": 2.536503362151777, "grad_norm": 1.0390625, "learning_rate": 0.00043587560183158095, "loss": 4.7352, "mean_token_accuracy": 0.2310478702187538, "num_tokens": 60553826.0, "step": 26405 }, { "entropy": 5.049466800689697, "epoch": 2.536983669548511, "grad_norm": 1.140625, "learning_rate": 0.000435851631585973, "loss": 4.7643, "mean_token_accuracy": 0.23094182163476945, "num_tokens": 60565173.0, "step": 26410 }, { "entropy": 5.071749114990235, "epoch": 2.537463976945245, "grad_norm": 1.0078125, "learning_rate": 0.0004358276576058677, "loss": 4.7518, "mean_token_accuracy": 0.24238502383232116, "num_tokens": 60577267.0, "step": 26415 }, { "entropy": 5.134958410263062, "epoch": 2.537944284341979, "grad_norm": 1.1796875, "learning_rate": 0.0004358036798918218, "loss": 4.7841, "mean_token_accuracy": 0.22933975905179976, "num_tokens": 60587441.0, "step": 26420 }, { "entropy": 5.195329713821411, "epoch": 2.5384245917387127, "grad_norm": 1.0, "learning_rate": 0.0004357796984443919, "loss": 4.8349, "mean_token_accuracy": 0.23350724577903748, "num_tokens": 60597173.0, "step": 26425 }, { "entropy": 5.05770468711853, "epoch": 2.5389048991354466, "grad_norm": 1.0703125, "learning_rate": 0.00043575571326413484, "loss": 4.7077, "mean_token_accuracy": 0.23081723600625992, "num_tokens": 60608299.0, "step": 26430 }, { "entropy": 5.198756313323974, "epoch": 2.5393852065321805, "grad_norm": 1.140625, "learning_rate": 0.0004357317243516075, "loss": 4.8884, "mean_token_accuracy": 0.22811190336942672, "num_tokens": 60619106.0, "step": 26435 }, { "entropy": 5.089833211898804, "epoch": 2.5398655139289144, "grad_norm": 1.140625, "learning_rate": 0.00043570773170736676, "loss": 4.6762, "mean_token_accuracy": 0.23719825744628906, "num_tokens": 60629339.0, "step": 26440 }, { "entropy": 5.109752893447876, "epoch": 2.5403458213256487, "grad_norm": 1.0703125, "learning_rate": 0.00043568373533196976, "loss": 4.716, "mean_token_accuracy": 0.2442714810371399, "num_tokens": 60640162.0, "step": 26445 }, { "entropy": 5.031629228591919, "epoch": 2.540826128722382, "grad_norm": 1.0078125, "learning_rate": 0.00043565973522597344, "loss": 4.7499, "mean_token_accuracy": 0.2293874904513359, "num_tokens": 60652529.0, "step": 26450 }, { "entropy": 5.047484922409057, "epoch": 2.5413064361191164, "grad_norm": 1.109375, "learning_rate": 0.00043563573138993524, "loss": 4.6916, "mean_token_accuracy": 0.233546245098114, "num_tokens": 60663035.0, "step": 26455 }, { "entropy": 5.098540163040161, "epoch": 2.54178674351585, "grad_norm": 1.03125, "learning_rate": 0.0004356117238244123, "loss": 4.7277, "mean_token_accuracy": 0.23866922706365584, "num_tokens": 60674430.0, "step": 26460 }, { "entropy": 5.027708196640015, "epoch": 2.542267050912584, "grad_norm": 1.0546875, "learning_rate": 0.00043558771252996204, "loss": 4.7001, "mean_token_accuracy": 0.23966480642557145, "num_tokens": 60685923.0, "step": 26465 }, { "entropy": 5.07547516822815, "epoch": 2.542747358309318, "grad_norm": 1.0703125, "learning_rate": 0.000435563697507142, "loss": 4.7643, "mean_token_accuracy": 0.23587315380573273, "num_tokens": 60696344.0, "step": 26470 }, { "entropy": 5.108949947357178, "epoch": 2.543227665706052, "grad_norm": 1.078125, "learning_rate": 0.0004355396787565096, "loss": 4.7781, "mean_token_accuracy": 0.23361520916223527, "num_tokens": 60709941.0, "step": 26475 }, { "entropy": 5.096314716339111, "epoch": 2.543707973102786, "grad_norm": 1.03125, "learning_rate": 0.00043551565627862257, "loss": 4.7535, "mean_token_accuracy": 0.2340693786740303, "num_tokens": 60721245.0, "step": 26480 }, { "entropy": 5.066954851150513, "epoch": 2.5441882804995197, "grad_norm": 1.015625, "learning_rate": 0.0004354916300740387, "loss": 4.7433, "mean_token_accuracy": 0.23161177486181259, "num_tokens": 60732763.0, "step": 26485 }, { "entropy": 5.016753816604615, "epoch": 2.5446685878962536, "grad_norm": 1.0859375, "learning_rate": 0.0004354676001433157, "loss": 4.7248, "mean_token_accuracy": 0.24093882292509078, "num_tokens": 60743192.0, "step": 26490 }, { "entropy": 4.984168529510498, "epoch": 2.5451488952929875, "grad_norm": 1.1015625, "learning_rate": 0.0004354435664870116, "loss": 4.6551, "mean_token_accuracy": 0.23839119523763658, "num_tokens": 60754115.0, "step": 26495 }, { "entropy": 5.142358589172363, "epoch": 2.5456292026897214, "grad_norm": 1.109375, "learning_rate": 0.00043541952910568417, "loss": 4.8166, "mean_token_accuracy": 0.23036390393972397, "num_tokens": 60765516.0, "step": 26500 }, { "entropy": 5.143162775039673, "epoch": 2.5461095100864553, "grad_norm": 1.1875, "learning_rate": 0.0004353954879998916, "loss": 4.7868, "mean_token_accuracy": 0.23051732331514357, "num_tokens": 60776206.0, "step": 26505 }, { "entropy": 5.050413990020752, "epoch": 2.546589817483189, "grad_norm": 1.0078125, "learning_rate": 0.0004353714431701922, "loss": 4.7071, "mean_token_accuracy": 0.23476076275110244, "num_tokens": 60786992.0, "step": 26510 }, { "entropy": 5.112949514389038, "epoch": 2.547070124879923, "grad_norm": 1.171875, "learning_rate": 0.0004353473946171441, "loss": 4.798, "mean_token_accuracy": 0.22756085842847823, "num_tokens": 60797765.0, "step": 26515 }, { "entropy": 5.15986328125, "epoch": 2.547550432276657, "grad_norm": 1.078125, "learning_rate": 0.00043532334234130547, "loss": 4.8078, "mean_token_accuracy": 0.22541076242923735, "num_tokens": 60808873.0, "step": 26520 }, { "entropy": 5.143631410598755, "epoch": 2.5480307396733908, "grad_norm": 1.0703125, "learning_rate": 0.00043529928634323503, "loss": 4.8367, "mean_token_accuracy": 0.22586060762405397, "num_tokens": 60820245.0, "step": 26525 }, { "entropy": 5.073045969009399, "epoch": 2.548511047070125, "grad_norm": 1.0625, "learning_rate": 0.00043527522662349113, "loss": 4.7861, "mean_token_accuracy": 0.23567369878292083, "num_tokens": 60832159.0, "step": 26530 }, { "entropy": 5.082254457473755, "epoch": 2.5489913544668585, "grad_norm": 1.09375, "learning_rate": 0.0004352511631826324, "loss": 4.7432, "mean_token_accuracy": 0.2327280730009079, "num_tokens": 60842957.0, "step": 26535 }, { "entropy": 5.082061862945556, "epoch": 2.549471661863593, "grad_norm": 1.1796875, "learning_rate": 0.0004352270960212175, "loss": 4.7523, "mean_token_accuracy": 0.2365626201033592, "num_tokens": 60852537.0, "step": 26540 }, { "entropy": 5.237741470336914, "epoch": 2.5499519692603267, "grad_norm": 0.96484375, "learning_rate": 0.0004352030251398052, "loss": 4.9144, "mean_token_accuracy": 0.2198659136891365, "num_tokens": 60865761.0, "step": 26545 }, { "entropy": 5.127560138702393, "epoch": 2.5504322766570606, "grad_norm": 1.1171875, "learning_rate": 0.00043517895053895434, "loss": 4.77, "mean_token_accuracy": 0.2361322596669197, "num_tokens": 60877805.0, "step": 26550 }, { "entropy": 5.085631465911865, "epoch": 2.5509125840537945, "grad_norm": 1.0546875, "learning_rate": 0.0004351548722192239, "loss": 4.7979, "mean_token_accuracy": 0.2329874664545059, "num_tokens": 60889868.0, "step": 26555 }, { "entropy": 5.120602083206177, "epoch": 2.5513928914505284, "grad_norm": 1.078125, "learning_rate": 0.0004351307901811729, "loss": 4.7695, "mean_token_accuracy": 0.23612948954105378, "num_tokens": 60902046.0, "step": 26560 }, { "entropy": 5.121815824508667, "epoch": 2.5518731988472623, "grad_norm": 1.0546875, "learning_rate": 0.0004351067044253605, "loss": 4.7806, "mean_token_accuracy": 0.2298535168170929, "num_tokens": 60913447.0, "step": 26565 }, { "entropy": 5.1224446296691895, "epoch": 2.552353506243996, "grad_norm": 1.1171875, "learning_rate": 0.00043508261495234577, "loss": 4.757, "mean_token_accuracy": 0.2374941736459732, "num_tokens": 60924920.0, "step": 26570 }, { "entropy": 5.0311089038848875, "epoch": 2.55283381364073, "grad_norm": 1.0859375, "learning_rate": 0.000435058521762688, "loss": 4.6902, "mean_token_accuracy": 0.23946462124586104, "num_tokens": 60936937.0, "step": 26575 }, { "entropy": 5.035173082351685, "epoch": 2.553314121037464, "grad_norm": 1.1171875, "learning_rate": 0.0004350344248569467, "loss": 4.7392, "mean_token_accuracy": 0.24265369772911072, "num_tokens": 60947836.0, "step": 26580 }, { "entropy": 5.0782520294189455, "epoch": 2.553794428434198, "grad_norm": 1.0703125, "learning_rate": 0.0004350103242356813, "loss": 4.8239, "mean_token_accuracy": 0.23157395124435426, "num_tokens": 60959380.0, "step": 26585 }, { "entropy": 5.1494324684143065, "epoch": 2.5542747358309317, "grad_norm": 1.0546875, "learning_rate": 0.0004349862198994513, "loss": 4.8105, "mean_token_accuracy": 0.2270434394478798, "num_tokens": 60970148.0, "step": 26590 }, { "entropy": 5.107781076431275, "epoch": 2.5547550432276656, "grad_norm": 1.1171875, "learning_rate": 0.0004349621118488163, "loss": 4.7167, "mean_token_accuracy": 0.2449997827410698, "num_tokens": 60981139.0, "step": 26595 }, { "entropy": 4.997005796432495, "epoch": 2.5552353506243994, "grad_norm": 0.99609375, "learning_rate": 0.0004349380000843361, "loss": 4.7246, "mean_token_accuracy": 0.23980281502008438, "num_tokens": 60992204.0, "step": 26600 }, { "entropy": 5.0700541019439695, "epoch": 2.5557156580211338, "grad_norm": 1.046875, "learning_rate": 0.0004349138846065704, "loss": 4.7374, "mean_token_accuracy": 0.23640399128198625, "num_tokens": 61002198.0, "step": 26605 }, { "entropy": 5.165579319000244, "epoch": 2.556195965417867, "grad_norm": 1.1015625, "learning_rate": 0.0004348897654160791, "loss": 4.8376, "mean_token_accuracy": 0.2286193385720253, "num_tokens": 61012987.0, "step": 26610 }, { "entropy": 5.0254762172698975, "epoch": 2.5566762728146015, "grad_norm": 1.0625, "learning_rate": 0.0004348656425134223, "loss": 4.6344, "mean_token_accuracy": 0.24285491108894347, "num_tokens": 61023668.0, "step": 26615 }, { "entropy": 5.082610177993774, "epoch": 2.5571565802113354, "grad_norm": 1.109375, "learning_rate": 0.00043484151589916, "loss": 4.6805, "mean_token_accuracy": 0.24342512488365173, "num_tokens": 61034375.0, "step": 26620 }, { "entropy": 5.10274305343628, "epoch": 2.5576368876080693, "grad_norm": 1.03125, "learning_rate": 0.0004348173855738523, "loss": 4.775, "mean_token_accuracy": 0.231153304874897, "num_tokens": 61045625.0, "step": 26625 }, { "entropy": 5.083966779708862, "epoch": 2.558117195004803, "grad_norm": 1.0703125, "learning_rate": 0.0004347932515380595, "loss": 4.8058, "mean_token_accuracy": 0.23799928277730942, "num_tokens": 61056359.0, "step": 26630 }, { "entropy": 5.089589023590088, "epoch": 2.558597502401537, "grad_norm": 1.078125, "learning_rate": 0.0004347691137923418, "loss": 4.7357, "mean_token_accuracy": 0.23540275543928146, "num_tokens": 61067194.0, "step": 26635 }, { "entropy": 5.176966857910156, "epoch": 2.559077809798271, "grad_norm": 1.046875, "learning_rate": 0.0004347449723372598, "loss": 4.8407, "mean_token_accuracy": 0.22330044209957123, "num_tokens": 61079341.0, "step": 26640 }, { "entropy": 5.10914101600647, "epoch": 2.559558117195005, "grad_norm": 1.1015625, "learning_rate": 0.0004347208271733738, "loss": 4.824, "mean_token_accuracy": 0.23296955972909927, "num_tokens": 61090843.0, "step": 26645 }, { "entropy": 5.112281465530396, "epoch": 2.5600384245917387, "grad_norm": 1.0703125, "learning_rate": 0.0004346966783012445, "loss": 4.7308, "mean_token_accuracy": 0.2376183584332466, "num_tokens": 61101780.0, "step": 26650 }, { "entropy": 5.108506917953491, "epoch": 2.5605187319884726, "grad_norm": 1.0625, "learning_rate": 0.00043467252572143247, "loss": 4.8093, "mean_token_accuracy": 0.23573538511991501, "num_tokens": 61113026.0, "step": 26655 }, { "entropy": 5.07466197013855, "epoch": 2.5609990393852065, "grad_norm": 0.9453125, "learning_rate": 0.00043464836943449866, "loss": 4.7777, "mean_token_accuracy": 0.23345693796873093, "num_tokens": 61124772.0, "step": 26660 }, { "entropy": 5.091155910491944, "epoch": 2.5614793467819403, "grad_norm": 1.1015625, "learning_rate": 0.0004346242094410036, "loss": 4.7689, "mean_token_accuracy": 0.23651626110076904, "num_tokens": 61136683.0, "step": 26665 }, { "entropy": 5.103244400024414, "epoch": 2.561959654178674, "grad_norm": 1.1171875, "learning_rate": 0.0004346000457415085, "loss": 4.7298, "mean_token_accuracy": 0.24229811131954193, "num_tokens": 61148047.0, "step": 26670 }, { "entropy": 5.061260938644409, "epoch": 2.562439961575408, "grad_norm": 1.1015625, "learning_rate": 0.00043457587833657424, "loss": 4.6959, "mean_token_accuracy": 0.24003661572933196, "num_tokens": 61160680.0, "step": 26675 }, { "entropy": 5.118103408813477, "epoch": 2.5629202689721424, "grad_norm": 1.015625, "learning_rate": 0.00043455170722676194, "loss": 4.7777, "mean_token_accuracy": 0.23091289401054382, "num_tokens": 61172624.0, "step": 26680 }, { "entropy": 4.997263336181641, "epoch": 2.563400576368876, "grad_norm": 1.0859375, "learning_rate": 0.0004345275324126328, "loss": 4.6791, "mean_token_accuracy": 0.24024482667446137, "num_tokens": 61183971.0, "step": 26685 }, { "entropy": 5.060653734207153, "epoch": 2.56388088376561, "grad_norm": 1.09375, "learning_rate": 0.00043450335389474796, "loss": 4.6853, "mean_token_accuracy": 0.24575116485357285, "num_tokens": 61194511.0, "step": 26690 }, { "entropy": 5.046016693115234, "epoch": 2.5643611911623436, "grad_norm": 1.0546875, "learning_rate": 0.0004344791716736689, "loss": 4.6947, "mean_token_accuracy": 0.23781549483537673, "num_tokens": 61205235.0, "step": 26695 }, { "entropy": 5.028386497497559, "epoch": 2.564841498559078, "grad_norm": 1.0859375, "learning_rate": 0.00043445498574995705, "loss": 4.7266, "mean_token_accuracy": 0.23924607634544373, "num_tokens": 61217460.0, "step": 26700 }, { "entropy": 5.087894010543823, "epoch": 2.565321805955812, "grad_norm": 1.1796875, "learning_rate": 0.00043443079612417394, "loss": 4.7255, "mean_token_accuracy": 0.24343910813331604, "num_tokens": 61229917.0, "step": 26705 }, { "entropy": 5.15901665687561, "epoch": 2.5658021133525457, "grad_norm": 1.1328125, "learning_rate": 0.0004344066027968812, "loss": 4.8817, "mean_token_accuracy": 0.22483388036489488, "num_tokens": 61241826.0, "step": 26710 }, { "entropy": 5.1887860774993895, "epoch": 2.5662824207492796, "grad_norm": 1.0546875, "learning_rate": 0.00043438240576864034, "loss": 4.8689, "mean_token_accuracy": 0.23257529586553574, "num_tokens": 61254095.0, "step": 26715 }, { "entropy": 5.109234619140625, "epoch": 2.5667627281460135, "grad_norm": 1.0234375, "learning_rate": 0.0004343582050400134, "loss": 4.8407, "mean_token_accuracy": 0.23535792976617814, "num_tokens": 61266114.0, "step": 26720 }, { "entropy": 5.076544189453125, "epoch": 2.5672430355427474, "grad_norm": 1.1328125, "learning_rate": 0.0004343340006115621, "loss": 4.7512, "mean_token_accuracy": 0.2435067653656006, "num_tokens": 61277289.0, "step": 26725 }, { "entropy": 5.213899040222168, "epoch": 2.5677233429394812, "grad_norm": 1.1171875, "learning_rate": 0.0004343097924838483, "loss": 4.8736, "mean_token_accuracy": 0.22794857025146484, "num_tokens": 61289424.0, "step": 26730 }, { "entropy": 5.176094341278076, "epoch": 2.568203650336215, "grad_norm": 1.1484375, "learning_rate": 0.0004342855806574344, "loss": 4.7694, "mean_token_accuracy": 0.23666035383939743, "num_tokens": 61300821.0, "step": 26735 }, { "entropy": 5.061013555526733, "epoch": 2.568683957732949, "grad_norm": 1.0234375, "learning_rate": 0.0004342613651328822, "loss": 4.7296, "mean_token_accuracy": 0.2397848516702652, "num_tokens": 61312097.0, "step": 26740 }, { "entropy": 5.012730407714844, "epoch": 2.569164265129683, "grad_norm": 1.0625, "learning_rate": 0.000434237145910754, "loss": 4.7368, "mean_token_accuracy": 0.24157094657421113, "num_tokens": 61324144.0, "step": 26745 }, { "entropy": 5.09266152381897, "epoch": 2.5696445725264168, "grad_norm": 1.0234375, "learning_rate": 0.00043421292299161213, "loss": 4.7273, "mean_token_accuracy": 0.23389368057250975, "num_tokens": 61335809.0, "step": 26750 }, { "entropy": 5.086174964904785, "epoch": 2.570124879923151, "grad_norm": 1.1171875, "learning_rate": 0.00043418869637601887, "loss": 4.7346, "mean_token_accuracy": 0.23781654387712478, "num_tokens": 61346980.0, "step": 26755 }, { "entropy": 5.129696369171143, "epoch": 2.5706051873198845, "grad_norm": 1.2265625, "learning_rate": 0.00043416446606453686, "loss": 4.8818, "mean_token_accuracy": 0.22965129613876342, "num_tokens": 61358401.0, "step": 26760 }, { "entropy": 5.049072360992431, "epoch": 2.571085494716619, "grad_norm": 1.1328125, "learning_rate": 0.0004341402320577285, "loss": 4.733, "mean_token_accuracy": 0.2349224328994751, "num_tokens": 61369383.0, "step": 26765 }, { "entropy": 5.117078590393066, "epoch": 2.5715658021133523, "grad_norm": 1.0859375, "learning_rate": 0.0004341159943561566, "loss": 4.767, "mean_token_accuracy": 0.23110248148441315, "num_tokens": 61380439.0, "step": 26770 }, { "entropy": 5.0966477394104, "epoch": 2.5720461095100866, "grad_norm": 1.03125, "learning_rate": 0.0004340917529603837, "loss": 4.7259, "mean_token_accuracy": 0.2448524907231331, "num_tokens": 61391218.0, "step": 26775 }, { "entropy": 5.062763261795044, "epoch": 2.5725264169068205, "grad_norm": 1.0546875, "learning_rate": 0.0004340675078709728, "loss": 4.7023, "mean_token_accuracy": 0.23651713877916336, "num_tokens": 61402715.0, "step": 26780 }, { "entropy": 5.110429859161377, "epoch": 2.5730067243035544, "grad_norm": 1.0859375, "learning_rate": 0.0004340432590884866, "loss": 4.8037, "mean_token_accuracy": 0.23051778227090836, "num_tokens": 61414361.0, "step": 26785 }, { "entropy": 5.071790027618408, "epoch": 2.5734870317002883, "grad_norm": 0.9765625, "learning_rate": 0.00043401900661348825, "loss": 4.7136, "mean_token_accuracy": 0.2407862976193428, "num_tokens": 61426817.0, "step": 26790 }, { "entropy": 5.04749174118042, "epoch": 2.573967339097022, "grad_norm": 1.1640625, "learning_rate": 0.00043399475044654073, "loss": 4.7295, "mean_token_accuracy": 0.23616889715194703, "num_tokens": 61438142.0, "step": 26795 }, { "entropy": 5.0938348293304445, "epoch": 2.574447646493756, "grad_norm": 1.0625, "learning_rate": 0.00043397049058820726, "loss": 4.7878, "mean_token_accuracy": 0.2282659724354744, "num_tokens": 61450708.0, "step": 26800 }, { "entropy": 5.106382703781128, "epoch": 2.57492795389049, "grad_norm": 1.078125, "learning_rate": 0.000433946227039051, "loss": 4.7747, "mean_token_accuracy": 0.23768922835588455, "num_tokens": 61462266.0, "step": 26805 }, { "entropy": 5.117368936538696, "epoch": 2.5754082612872238, "grad_norm": 1.0625, "learning_rate": 0.0004339219597996353, "loss": 4.7742, "mean_token_accuracy": 0.23543445467948915, "num_tokens": 61473690.0, "step": 26810 }, { "entropy": 4.9948780059814455, "epoch": 2.5758885686839577, "grad_norm": 1.171875, "learning_rate": 0.0004338976888705236, "loss": 4.6335, "mean_token_accuracy": 0.23899848759174347, "num_tokens": 61484997.0, "step": 26815 }, { "entropy": 4.981661653518676, "epoch": 2.5763688760806915, "grad_norm": 1.0234375, "learning_rate": 0.00043387341425227944, "loss": 4.8124, "mean_token_accuracy": 0.2320055529475212, "num_tokens": 61497792.0, "step": 26820 }, { "entropy": 5.081102275848389, "epoch": 2.5768491834774254, "grad_norm": 1.0546875, "learning_rate": 0.0004338491359454664, "loss": 4.6597, "mean_token_accuracy": 0.24134160727262496, "num_tokens": 61509060.0, "step": 26825 }, { "entropy": 5.135309314727783, "epoch": 2.5773294908741593, "grad_norm": 1.0546875, "learning_rate": 0.00043382485395064796, "loss": 4.7687, "mean_token_accuracy": 0.24514102190732956, "num_tokens": 61519577.0, "step": 26830 }, { "entropy": 5.019241142272949, "epoch": 2.577809798270893, "grad_norm": 1.09375, "learning_rate": 0.0004338005682683881, "loss": 4.7197, "mean_token_accuracy": 0.2386346474289894, "num_tokens": 61531956.0, "step": 26835 }, { "entropy": 5.139713191986084, "epoch": 2.5782901056676275, "grad_norm": 1.03125, "learning_rate": 0.00043377627889925057, "loss": 4.8069, "mean_token_accuracy": 0.2325481116771698, "num_tokens": 61543801.0, "step": 26840 }, { "entropy": 5.137574863433838, "epoch": 2.578770413064361, "grad_norm": 1.0703125, "learning_rate": 0.0004337519858437994, "loss": 4.8302, "mean_token_accuracy": 0.2269312933087349, "num_tokens": 61555899.0, "step": 26845 }, { "entropy": 5.0105894088745115, "epoch": 2.5792507204610953, "grad_norm": 1.109375, "learning_rate": 0.0004337276891025984, "loss": 4.6782, "mean_token_accuracy": 0.2456341192126274, "num_tokens": 61567280.0, "step": 26850 }, { "entropy": 5.057559442520142, "epoch": 2.579731027857829, "grad_norm": 1.0546875, "learning_rate": 0.00043370338867621184, "loss": 4.7743, "mean_token_accuracy": 0.23362419605255128, "num_tokens": 61578026.0, "step": 26855 }, { "entropy": 5.123590898513794, "epoch": 2.580211335254563, "grad_norm": 1.03125, "learning_rate": 0.00043367908456520387, "loss": 4.7506, "mean_token_accuracy": 0.23303580284118652, "num_tokens": 61589399.0, "step": 26860 }, { "entropy": 5.094915103912354, "epoch": 2.580691642651297, "grad_norm": 1.0625, "learning_rate": 0.0004336547767701387, "loss": 4.7913, "mean_token_accuracy": 0.23051347732543945, "num_tokens": 61601025.0, "step": 26865 }, { "entropy": 5.1751209735870365, "epoch": 2.581171950048031, "grad_norm": 1.0390625, "learning_rate": 0.00043363046529158077, "loss": 4.8907, "mean_token_accuracy": 0.22715967744588852, "num_tokens": 61612529.0, "step": 26870 }, { "entropy": 5.114480495452881, "epoch": 2.5816522574447647, "grad_norm": 1.1875, "learning_rate": 0.0004336061501300944, "loss": 4.8035, "mean_token_accuracy": 0.23027193993330003, "num_tokens": 61624423.0, "step": 26875 }, { "entropy": 5.162697601318359, "epoch": 2.5821325648414986, "grad_norm": 1.1171875, "learning_rate": 0.0004335818312862442, "loss": 4.8424, "mean_token_accuracy": 0.2312071889638901, "num_tokens": 61635002.0, "step": 26880 }, { "entropy": 5.069757986068725, "epoch": 2.5826128722382324, "grad_norm": 1.1015625, "learning_rate": 0.00043355750876059485, "loss": 4.7046, "mean_token_accuracy": 0.24180822521448136, "num_tokens": 61645911.0, "step": 26885 }, { "entropy": 5.083767700195312, "epoch": 2.5830931796349663, "grad_norm": 1.0859375, "learning_rate": 0.00043353318255371086, "loss": 4.7485, "mean_token_accuracy": 0.2376156985759735, "num_tokens": 61657545.0, "step": 26890 }, { "entropy": 5.162339305877685, "epoch": 2.5835734870317, "grad_norm": 1.2578125, "learning_rate": 0.0004335088526661571, "loss": 4.8215, "mean_token_accuracy": 0.2307661294937134, "num_tokens": 61669567.0, "step": 26895 }, { "entropy": 5.1503373146057125, "epoch": 2.584053794428434, "grad_norm": 1.1484375, "learning_rate": 0.00043348451909849855, "loss": 4.8068, "mean_token_accuracy": 0.23385066986083985, "num_tokens": 61681725.0, "step": 26900 }, { "entropy": 5.151612710952759, "epoch": 2.584534101825168, "grad_norm": 1.1015625, "learning_rate": 0.00043346018185130006, "loss": 4.836, "mean_token_accuracy": 0.23845063894987106, "num_tokens": 61692503.0, "step": 26905 }, { "entropy": 5.039955091476441, "epoch": 2.585014409221902, "grad_norm": 1.03125, "learning_rate": 0.00043343584092512665, "loss": 4.7092, "mean_token_accuracy": 0.24210308790206908, "num_tokens": 61704567.0, "step": 26910 }, { "entropy": 5.066004133224487, "epoch": 2.585494716618636, "grad_norm": 1.015625, "learning_rate": 0.0004334114963205435, "loss": 4.7261, "mean_token_accuracy": 0.2413608655333519, "num_tokens": 61716155.0, "step": 26915 }, { "entropy": 5.088526725769043, "epoch": 2.5859750240153696, "grad_norm": 1.0703125, "learning_rate": 0.0004333871480381158, "loss": 4.8512, "mean_token_accuracy": 0.2299770951271057, "num_tokens": 61727489.0, "step": 26920 }, { "entropy": 5.126882791519165, "epoch": 2.586455331412104, "grad_norm": 1.046875, "learning_rate": 0.0004333627960784088, "loss": 4.7744, "mean_token_accuracy": 0.23732642978429794, "num_tokens": 61739792.0, "step": 26925 }, { "entropy": 5.118010234832764, "epoch": 2.586935638808838, "grad_norm": 1.0703125, "learning_rate": 0.0004333384404419879, "loss": 4.7854, "mean_token_accuracy": 0.23018379360437394, "num_tokens": 61750891.0, "step": 26930 }, { "entropy": 5.066667318344116, "epoch": 2.5874159462055717, "grad_norm": 1.0234375, "learning_rate": 0.0004333140811294186, "loss": 4.7431, "mean_token_accuracy": 0.24288289994001389, "num_tokens": 61761691.0, "step": 26935 }, { "entropy": 5.106765794754028, "epoch": 2.5878962536023056, "grad_norm": 1.046875, "learning_rate": 0.0004332897181412664, "loss": 4.7476, "mean_token_accuracy": 0.23299417197704314, "num_tokens": 61772371.0, "step": 26940 }, { "entropy": 5.064568710327149, "epoch": 2.5883765609990395, "grad_norm": 1.0390625, "learning_rate": 0.00043326535147809696, "loss": 4.7184, "mean_token_accuracy": 0.23607280999422073, "num_tokens": 61784908.0, "step": 26945 }, { "entropy": 5.105681419372559, "epoch": 2.5888568683957733, "grad_norm": 1.1171875, "learning_rate": 0.00043324098114047604, "loss": 4.8136, "mean_token_accuracy": 0.23087596744298935, "num_tokens": 61795948.0, "step": 26950 }, { "entropy": 5.14426064491272, "epoch": 2.589337175792507, "grad_norm": 1.09375, "learning_rate": 0.0004332166071289695, "loss": 4.7804, "mean_token_accuracy": 0.22809687554836272, "num_tokens": 61807528.0, "step": 26955 }, { "entropy": 5.120312786102295, "epoch": 2.589817483189241, "grad_norm": 1.0625, "learning_rate": 0.00043319222944414304, "loss": 4.7465, "mean_token_accuracy": 0.23673148155212403, "num_tokens": 61819971.0, "step": 26960 }, { "entropy": 5.0299958229064945, "epoch": 2.590297790585975, "grad_norm": 1.0546875, "learning_rate": 0.00043316784808656276, "loss": 4.7058, "mean_token_accuracy": 0.24276293814182281, "num_tokens": 61832284.0, "step": 26965 }, { "entropy": 5.098874902725219, "epoch": 2.590778097982709, "grad_norm": 1.109375, "learning_rate": 0.00043314346305679477, "loss": 4.7955, "mean_token_accuracy": 0.2335294172167778, "num_tokens": 61844350.0, "step": 26970 }, { "entropy": 5.099506902694702, "epoch": 2.5912584053794427, "grad_norm": 1.0390625, "learning_rate": 0.00043311907435540517, "loss": 4.7377, "mean_token_accuracy": 0.24343179762363434, "num_tokens": 61855180.0, "step": 26975 }, { "entropy": 5.059788990020752, "epoch": 2.5917387127761766, "grad_norm": 1.265625, "learning_rate": 0.0004330946819829601, "loss": 4.7935, "mean_token_accuracy": 0.2240220710635185, "num_tokens": 61866963.0, "step": 26980 }, { "entropy": 5.133587837219238, "epoch": 2.5922190201729105, "grad_norm": 1.0703125, "learning_rate": 0.00043307028594002597, "loss": 4.831, "mean_token_accuracy": 0.23132447004318238, "num_tokens": 61877503.0, "step": 26985 }, { "entropy": 5.092684125900268, "epoch": 2.592699327569645, "grad_norm": 1.015625, "learning_rate": 0.00043304588622716924, "loss": 4.7193, "mean_token_accuracy": 0.24377114772796632, "num_tokens": 61888426.0, "step": 26990 }, { "entropy": 5.100298738479614, "epoch": 2.5931796349663783, "grad_norm": 1.109375, "learning_rate": 0.0004330214828449563, "loss": 4.7229, "mean_token_accuracy": 0.23494363129138945, "num_tokens": 61899952.0, "step": 26995 }, { "entropy": 4.997558212280273, "epoch": 2.5936599423631126, "grad_norm": 1.0859375, "learning_rate": 0.00043299707579395365, "loss": 4.7283, "mean_token_accuracy": 0.23868339210748674, "num_tokens": 61911956.0, "step": 27000 }, { "epoch": 2.5936599423631126, "eval_entropy": 4.852371316488518, "eval_loss": 4.87723970413208, "eval_mean_token_accuracy": 0.23725959622386983, "eval_num_tokens": 61911956.0, "eval_runtime": 26.6641, "eval_samples_per_second": 1230.683, "eval_steps_per_second": 153.84, "step": 27000 }, { "entropy": 5.029786920547485, "epoch": 2.594140249759846, "grad_norm": 1.140625, "learning_rate": 0.0004329726650747282, "loss": 4.7416, "mean_token_accuracy": 0.23479766696691512, "num_tokens": 61922916.0, "step": 27005 }, { "entropy": 5.17045111656189, "epoch": 2.5946205571565804, "grad_norm": 1.0703125, "learning_rate": 0.0004329482506878465, "loss": 4.7953, "mean_token_accuracy": 0.2329192042350769, "num_tokens": 61933683.0, "step": 27010 }, { "entropy": 5.054750871658325, "epoch": 2.5951008645533142, "grad_norm": 1.15625, "learning_rate": 0.00043292383263387536, "loss": 4.8124, "mean_token_accuracy": 0.2400738313794136, "num_tokens": 61945063.0, "step": 27015 }, { "entropy": 5.075078678131104, "epoch": 2.595581171950048, "grad_norm": 1.1015625, "learning_rate": 0.00043289941091338187, "loss": 4.7742, "mean_token_accuracy": 0.23035979121923447, "num_tokens": 61955919.0, "step": 27020 }, { "entropy": 5.244627285003662, "epoch": 2.596061479346782, "grad_norm": 1.0859375, "learning_rate": 0.0004328749855269329, "loss": 4.9298, "mean_token_accuracy": 0.22583782225847243, "num_tokens": 61967083.0, "step": 27025 }, { "entropy": 5.064927673339843, "epoch": 2.596541786743516, "grad_norm": 1.0390625, "learning_rate": 0.0004328505564750955, "loss": 4.7086, "mean_token_accuracy": 0.2429245576262474, "num_tokens": 61978659.0, "step": 27030 }, { "entropy": 5.051366949081421, "epoch": 2.5970220941402498, "grad_norm": 1.0703125, "learning_rate": 0.000432826123758437, "loss": 4.6966, "mean_token_accuracy": 0.24067613929510118, "num_tokens": 61990966.0, "step": 27035 }, { "entropy": 5.026900863647461, "epoch": 2.5975024015369836, "grad_norm": 1.0625, "learning_rate": 0.0004328016873775244, "loss": 4.713, "mean_token_accuracy": 0.23913576900959016, "num_tokens": 62002417.0, "step": 27040 }, { "entropy": 5.1119975566864015, "epoch": 2.5979827089337175, "grad_norm": 1.1171875, "learning_rate": 0.00043277724733292527, "loss": 4.8071, "mean_token_accuracy": 0.22981019616127013, "num_tokens": 62014698.0, "step": 27045 }, { "entropy": 5.070964431762695, "epoch": 2.5984630163304514, "grad_norm": 1.0546875, "learning_rate": 0.0004327528036252071, "loss": 4.6792, "mean_token_accuracy": 0.243856018781662, "num_tokens": 62025298.0, "step": 27050 }, { "entropy": 5.006353187561035, "epoch": 2.5989433237271853, "grad_norm": 1.0703125, "learning_rate": 0.0004327283562549371, "loss": 4.6973, "mean_token_accuracy": 0.2428078219294548, "num_tokens": 62036419.0, "step": 27055 }, { "entropy": 5.1200206756591795, "epoch": 2.599423631123919, "grad_norm": 1.15625, "learning_rate": 0.000432703905222683, "loss": 4.8084, "mean_token_accuracy": 0.23436084985733033, "num_tokens": 62046800.0, "step": 27060 }, { "entropy": 5.110630846023559, "epoch": 2.5999039385206535, "grad_norm": 1.1015625, "learning_rate": 0.00043267945052901264, "loss": 4.7779, "mean_token_accuracy": 0.23510639518499374, "num_tokens": 62058519.0, "step": 27065 }, { "entropy": 5.109530687332153, "epoch": 2.600384245917387, "grad_norm": 1.0625, "learning_rate": 0.0004326549921744935, "loss": 4.757, "mean_token_accuracy": 0.23558929562568665, "num_tokens": 62070445.0, "step": 27070 }, { "entropy": 5.126020717620849, "epoch": 2.6008645533141213, "grad_norm": 1.0703125, "learning_rate": 0.0004326305301596937, "loss": 4.822, "mean_token_accuracy": 0.22863227427005767, "num_tokens": 62083084.0, "step": 27075 }, { "entropy": 5.143060207366943, "epoch": 2.6013448607108547, "grad_norm": 1.0703125, "learning_rate": 0.00043260606448518096, "loss": 4.763, "mean_token_accuracy": 0.2291410893201828, "num_tokens": 62094334.0, "step": 27080 }, { "entropy": 5.088135766983032, "epoch": 2.601825168107589, "grad_norm": 1.109375, "learning_rate": 0.00043258159515152347, "loss": 4.7475, "mean_token_accuracy": 0.23574207574129105, "num_tokens": 62105065.0, "step": 27085 }, { "entropy": 5.125645399093628, "epoch": 2.602305475504323, "grad_norm": 1.1171875, "learning_rate": 0.0004325571221592892, "loss": 4.8434, "mean_token_accuracy": 0.22585551887750627, "num_tokens": 62116990.0, "step": 27090 }, { "entropy": 5.089757871627808, "epoch": 2.6027857829010568, "grad_norm": 1.140625, "learning_rate": 0.00043253264550904646, "loss": 4.6787, "mean_token_accuracy": 0.2427651584148407, "num_tokens": 62128096.0, "step": 27095 }, { "entropy": 5.205277824401856, "epoch": 2.6032660902977907, "grad_norm": 1.09375, "learning_rate": 0.0004325081652013635, "loss": 4.8836, "mean_token_accuracy": 0.22276324778795242, "num_tokens": 62141139.0, "step": 27100 }, { "entropy": 5.055752944946289, "epoch": 2.6037463976945245, "grad_norm": 1.2734375, "learning_rate": 0.00043248368123680855, "loss": 4.6158, "mean_token_accuracy": 0.24648532420396804, "num_tokens": 62152522.0, "step": 27105 }, { "entropy": 4.990445327758789, "epoch": 2.6042267050912584, "grad_norm": 1.15625, "learning_rate": 0.00043245919361595026, "loss": 4.6983, "mean_token_accuracy": 0.24285367727279664, "num_tokens": 62162919.0, "step": 27110 }, { "entropy": 5.088385057449341, "epoch": 2.6047070124879923, "grad_norm": 1.1015625, "learning_rate": 0.00043243470233935696, "loss": 4.736, "mean_token_accuracy": 0.23736914843320847, "num_tokens": 62174936.0, "step": 27115 }, { "entropy": 5.158363199234008, "epoch": 2.605187319884726, "grad_norm": 1.1015625, "learning_rate": 0.0004324102074075973, "loss": 4.7903, "mean_token_accuracy": 0.23448725491762162, "num_tokens": 62186260.0, "step": 27120 }, { "entropy": 5.0671289443969725, "epoch": 2.60566762728146, "grad_norm": 0.98828125, "learning_rate": 0.0004323857088212402, "loss": 4.6928, "mean_token_accuracy": 0.2398076131939888, "num_tokens": 62198616.0, "step": 27125 }, { "entropy": 5.043847560882568, "epoch": 2.606147934678194, "grad_norm": 1.15625, "learning_rate": 0.0004323612065808541, "loss": 4.7053, "mean_token_accuracy": 0.23870875984430312, "num_tokens": 62209318.0, "step": 27130 }, { "entropy": 5.1781915664672855, "epoch": 2.606628242074928, "grad_norm": 1.0703125, "learning_rate": 0.00043233670068700827, "loss": 4.8822, "mean_token_accuracy": 0.22753420770168303, "num_tokens": 62220510.0, "step": 27135 }, { "entropy": 5.175050258636475, "epoch": 2.6071085494716617, "grad_norm": 1.1171875, "learning_rate": 0.0004323121911402713, "loss": 4.8766, "mean_token_accuracy": 0.22503857761621476, "num_tokens": 62233676.0, "step": 27140 }, { "entropy": 5.11496376991272, "epoch": 2.6075888568683956, "grad_norm": 1.0859375, "learning_rate": 0.00043228767794121245, "loss": 4.7784, "mean_token_accuracy": 0.23106486797332765, "num_tokens": 62244604.0, "step": 27145 }, { "entropy": 5.075182390213013, "epoch": 2.60806916426513, "grad_norm": 1.09375, "learning_rate": 0.0004322631610904006, "loss": 4.792, "mean_token_accuracy": 0.23115721493959426, "num_tokens": 62255431.0, "step": 27150 }, { "entropy": 5.2251488208770756, "epoch": 2.6085494716618634, "grad_norm": 1.109375, "learning_rate": 0.00043223864058840534, "loss": 4.9481, "mean_token_accuracy": 0.22021759003400804, "num_tokens": 62267050.0, "step": 27155 }, { "entropy": 5.163397312164307, "epoch": 2.6090297790585977, "grad_norm": 0.99609375, "learning_rate": 0.00043221411643579557, "loss": 4.8037, "mean_token_accuracy": 0.23190855830907822, "num_tokens": 62278939.0, "step": 27160 }, { "entropy": 5.035414218902588, "epoch": 2.6095100864553316, "grad_norm": 1.1484375, "learning_rate": 0.00043218958863314096, "loss": 4.764, "mean_token_accuracy": 0.23242587298154832, "num_tokens": 62291275.0, "step": 27165 }, { "entropy": 4.995542573928833, "epoch": 2.6099903938520654, "grad_norm": 1.0703125, "learning_rate": 0.0004321650571810109, "loss": 4.6767, "mean_token_accuracy": 0.23926113694906234, "num_tokens": 62302191.0, "step": 27170 }, { "entropy": 5.10365104675293, "epoch": 2.6104707012487993, "grad_norm": 1.1328125, "learning_rate": 0.0004321405220799747, "loss": 4.7078, "mean_token_accuracy": 0.24113160371780396, "num_tokens": 62313678.0, "step": 27175 }, { "entropy": 5.059454250335693, "epoch": 2.610951008645533, "grad_norm": 1.0390625, "learning_rate": 0.0004321159833306024, "loss": 4.723, "mean_token_accuracy": 0.2401316285133362, "num_tokens": 62324324.0, "step": 27180 }, { "entropy": 5.137457323074341, "epoch": 2.611431316042267, "grad_norm": 1.328125, "learning_rate": 0.0004320914409334634, "loss": 4.8844, "mean_token_accuracy": 0.2295592397451401, "num_tokens": 62335385.0, "step": 27185 }, { "entropy": 5.141259002685547, "epoch": 2.611911623439001, "grad_norm": 1.1328125, "learning_rate": 0.0004320668948891276, "loss": 4.83, "mean_token_accuracy": 0.22620759904384613, "num_tokens": 62346733.0, "step": 27190 }, { "entropy": 5.138140726089477, "epoch": 2.612391930835735, "grad_norm": 1.0390625, "learning_rate": 0.00043204234519816486, "loss": 4.7799, "mean_token_accuracy": 0.23260708153247833, "num_tokens": 62358674.0, "step": 27195 }, { "entropy": 5.097907829284668, "epoch": 2.6128722382324687, "grad_norm": 1.109375, "learning_rate": 0.0004320177918611453, "loss": 4.7617, "mean_token_accuracy": 0.2355603665113449, "num_tokens": 62370431.0, "step": 27200 }, { "entropy": 5.071603536605835, "epoch": 2.6133525456292026, "grad_norm": 1.125, "learning_rate": 0.00043199323487863876, "loss": 4.7055, "mean_token_accuracy": 0.23453706800937651, "num_tokens": 62381098.0, "step": 27205 }, { "entropy": 5.105896091461181, "epoch": 2.6138328530259365, "grad_norm": 1.0234375, "learning_rate": 0.00043196867425121554, "loss": 4.7289, "mean_token_accuracy": 0.2379012182354927, "num_tokens": 62393446.0, "step": 27210 }, { "entropy": 5.0916307926177975, "epoch": 2.6143131604226704, "grad_norm": 1.1796875, "learning_rate": 0.00043194410997944577, "loss": 4.8031, "mean_token_accuracy": 0.2336391270160675, "num_tokens": 62406581.0, "step": 27215 }, { "entropy": 5.140932083129883, "epoch": 2.6147934678194042, "grad_norm": 1.0390625, "learning_rate": 0.00043191954206389985, "loss": 4.8223, "mean_token_accuracy": 0.22878152132034302, "num_tokens": 62419516.0, "step": 27220 }, { "entropy": 5.123697328567505, "epoch": 2.6152737752161386, "grad_norm": 1.0234375, "learning_rate": 0.0004318949705051481, "loss": 4.7438, "mean_token_accuracy": 0.2326513096690178, "num_tokens": 62431577.0, "step": 27225 }, { "entropy": 5.015741491317749, "epoch": 2.615754082612872, "grad_norm": 1.0703125, "learning_rate": 0.0004318703953037609, "loss": 4.7388, "mean_token_accuracy": 0.23688763231039048, "num_tokens": 62444507.0, "step": 27230 }, { "entropy": 5.019672060012818, "epoch": 2.6162343900096063, "grad_norm": 1.1875, "learning_rate": 0.0004318458164603091, "loss": 4.7087, "mean_token_accuracy": 0.2319497287273407, "num_tokens": 62455692.0, "step": 27235 }, { "entropy": 5.1985191822052, "epoch": 2.61671469740634, "grad_norm": 1.1171875, "learning_rate": 0.0004318212339753631, "loss": 4.8151, "mean_token_accuracy": 0.2287910521030426, "num_tokens": 62467215.0, "step": 27240 }, { "entropy": 5.120895576477051, "epoch": 2.617195004803074, "grad_norm": 1.078125, "learning_rate": 0.00043179664784949375, "loss": 4.8341, "mean_token_accuracy": 0.230615296959877, "num_tokens": 62480016.0, "step": 27245 }, { "entropy": 5.132107305526733, "epoch": 2.617675312199808, "grad_norm": 1.15625, "learning_rate": 0.0004317720580832717, "loss": 4.8261, "mean_token_accuracy": 0.23632377982139588, "num_tokens": 62491088.0, "step": 27250 }, { "entropy": 5.117570686340332, "epoch": 2.618155619596542, "grad_norm": 1.0859375, "learning_rate": 0.0004317474646772681, "loss": 4.7609, "mean_token_accuracy": 0.23309019654989244, "num_tokens": 62502405.0, "step": 27255 }, { "entropy": 5.130024766921997, "epoch": 2.6186359269932757, "grad_norm": 1.1796875, "learning_rate": 0.0004317228676320539, "loss": 4.7606, "mean_token_accuracy": 0.2346285194158554, "num_tokens": 62512968.0, "step": 27260 }, { "entropy": 5.057028913497925, "epoch": 2.6191162343900096, "grad_norm": 1.015625, "learning_rate": 0.00043169826694819987, "loss": 4.6535, "mean_token_accuracy": 0.24227974116802214, "num_tokens": 62524242.0, "step": 27265 }, { "entropy": 5.120878648757935, "epoch": 2.6195965417867435, "grad_norm": 1.2578125, "learning_rate": 0.0004316736626262775, "loss": 4.8243, "mean_token_accuracy": 0.23332867175340652, "num_tokens": 62536556.0, "step": 27270 }, { "entropy": 5.16615777015686, "epoch": 2.6200768491834774, "grad_norm": 1.0625, "learning_rate": 0.0004316490546668579, "loss": 4.7916, "mean_token_accuracy": 0.2371727392077446, "num_tokens": 62547897.0, "step": 27275 }, { "entropy": 5.110966777801513, "epoch": 2.6205571565802113, "grad_norm": 1.125, "learning_rate": 0.0004316244430705124, "loss": 4.7821, "mean_token_accuracy": 0.2275903344154358, "num_tokens": 62560190.0, "step": 27280 }, { "entropy": 5.09627046585083, "epoch": 2.621037463976945, "grad_norm": 1.15625, "learning_rate": 0.0004315998278378123, "loss": 4.856, "mean_token_accuracy": 0.22962083518505097, "num_tokens": 62571885.0, "step": 27285 }, { "entropy": 5.106700801849366, "epoch": 2.621517771373679, "grad_norm": 1.109375, "learning_rate": 0.00043157520896932943, "loss": 4.7795, "mean_token_accuracy": 0.2367392286658287, "num_tokens": 62583698.0, "step": 27290 }, { "entropy": 5.210479211807251, "epoch": 2.621998078770413, "grad_norm": 1.015625, "learning_rate": 0.000431550586465635, "loss": 4.8258, "mean_token_accuracy": 0.2318419858813286, "num_tokens": 62594813.0, "step": 27295 }, { "entropy": 5.029856395721436, "epoch": 2.6224783861671472, "grad_norm": 1.09375, "learning_rate": 0.00043152596032730085, "loss": 4.6127, "mean_token_accuracy": 0.24580606669187546, "num_tokens": 62604868.0, "step": 27300 }, { "entropy": 4.966013050079345, "epoch": 2.6229586935638807, "grad_norm": 1.0546875, "learning_rate": 0.00043150133055489865, "loss": 4.6922, "mean_token_accuracy": 0.23778087943792342, "num_tokens": 62616553.0, "step": 27305 }, { "entropy": 5.088241481781006, "epoch": 2.623439000960615, "grad_norm": 1.0859375, "learning_rate": 0.0004314766971490003, "loss": 4.6779, "mean_token_accuracy": 0.23836185783147812, "num_tokens": 62627468.0, "step": 27310 }, { "entropy": 5.1208264350891115, "epoch": 2.6239193083573484, "grad_norm": 1.1015625, "learning_rate": 0.0004314520601101776, "loss": 4.7641, "mean_token_accuracy": 0.23261116296052933, "num_tokens": 62639255.0, "step": 27315 }, { "entropy": 5.080009984970093, "epoch": 2.6243996157540828, "grad_norm": 1.0625, "learning_rate": 0.00043142741943900275, "loss": 4.7307, "mean_token_accuracy": 0.2292654573917389, "num_tokens": 62650553.0, "step": 27320 }, { "entropy": 5.138628101348877, "epoch": 2.6248799231508166, "grad_norm": 1.15625, "learning_rate": 0.00043140277513604763, "loss": 4.883, "mean_token_accuracy": 0.22575445473194122, "num_tokens": 62662653.0, "step": 27325 }, { "entropy": 5.092043256759643, "epoch": 2.6253602305475505, "grad_norm": 1.0703125, "learning_rate": 0.0004313781272018845, "loss": 4.7271, "mean_token_accuracy": 0.23904713839292527, "num_tokens": 62673618.0, "step": 27330 }, { "entropy": 5.065589809417725, "epoch": 2.6258405379442844, "grad_norm": 1.109375, "learning_rate": 0.0004313534756370857, "loss": 4.6816, "mean_token_accuracy": 0.23946530520915985, "num_tokens": 62685221.0, "step": 27335 }, { "entropy": 5.036132669448852, "epoch": 2.6263208453410183, "grad_norm": 1.15625, "learning_rate": 0.00043132882044222336, "loss": 4.6175, "mean_token_accuracy": 0.24858939349651338, "num_tokens": 62695544.0, "step": 27340 }, { "entropy": 4.945739364624023, "epoch": 2.626801152737752, "grad_norm": 1.0234375, "learning_rate": 0.00043130416161787005, "loss": 4.5768, "mean_token_accuracy": 0.24817198663949966, "num_tokens": 62706945.0, "step": 27345 }, { "entropy": 5.086688137054443, "epoch": 2.627281460134486, "grad_norm": 1.140625, "learning_rate": 0.00043127949916459823, "loss": 4.7471, "mean_token_accuracy": 0.23233324140310288, "num_tokens": 62717821.0, "step": 27350 }, { "entropy": 5.041226196289062, "epoch": 2.62776176753122, "grad_norm": 1.109375, "learning_rate": 0.00043125483308298053, "loss": 4.664, "mean_token_accuracy": 0.2415490359067917, "num_tokens": 62730477.0, "step": 27355 }, { "entropy": 5.064918756484985, "epoch": 2.628242074927954, "grad_norm": 1.125, "learning_rate": 0.0004312301633735895, "loss": 4.7282, "mean_token_accuracy": 0.24113650470972062, "num_tokens": 62741725.0, "step": 27360 }, { "entropy": 5.099993944168091, "epoch": 2.6287223823246877, "grad_norm": 1.1015625, "learning_rate": 0.0004312054900369981, "loss": 4.8248, "mean_token_accuracy": 0.22712087631225586, "num_tokens": 62753600.0, "step": 27365 }, { "entropy": 5.102942943572998, "epoch": 2.6292026897214216, "grad_norm": 1.125, "learning_rate": 0.000431180813073779, "loss": 4.84, "mean_token_accuracy": 0.23344789445400238, "num_tokens": 62766218.0, "step": 27370 }, { "entropy": 5.087442827224732, "epoch": 2.629682997118156, "grad_norm": 1.515625, "learning_rate": 0.0004311561324845051, "loss": 4.7105, "mean_token_accuracy": 0.23643388450145722, "num_tokens": 62777723.0, "step": 27375 }, { "entropy": 5.167104578018188, "epoch": 2.6301633045148893, "grad_norm": 1.1796875, "learning_rate": 0.0004311314482697496, "loss": 4.8839, "mean_token_accuracy": 0.22849978357553483, "num_tokens": 62789711.0, "step": 27380 }, { "entropy": 5.061525678634643, "epoch": 2.6306436119116237, "grad_norm": 1.0703125, "learning_rate": 0.0004311067604300855, "loss": 4.7134, "mean_token_accuracy": 0.24252797961235045, "num_tokens": 62800600.0, "step": 27385 }, { "entropy": 5.123406314849854, "epoch": 2.631123919308357, "grad_norm": 1.109375, "learning_rate": 0.0004310820689660859, "loss": 4.7797, "mean_token_accuracy": 0.2355131432414055, "num_tokens": 62811872.0, "step": 27390 }, { "entropy": 5.110841369628906, "epoch": 2.6316042267050914, "grad_norm": 1.0703125, "learning_rate": 0.0004310573738783242, "loss": 4.7553, "mean_token_accuracy": 0.24118882417678833, "num_tokens": 62822886.0, "step": 27395 }, { "entropy": 5.041201734542847, "epoch": 2.6320845341018253, "grad_norm": 1.0234375, "learning_rate": 0.0004310326751673736, "loss": 4.6955, "mean_token_accuracy": 0.23755438774824142, "num_tokens": 62833911.0, "step": 27400 }, { "entropy": 5.047704887390137, "epoch": 2.632564841498559, "grad_norm": 1.078125, "learning_rate": 0.00043100797283380756, "loss": 4.7228, "mean_token_accuracy": 0.2393511191010475, "num_tokens": 62844756.0, "step": 27405 }, { "entropy": 5.068274259567261, "epoch": 2.633045148895293, "grad_norm": 1.1640625, "learning_rate": 0.00043098326687819973, "loss": 4.7178, "mean_token_accuracy": 0.23599186539649963, "num_tokens": 62856072.0, "step": 27410 }, { "entropy": 5.170894289016724, "epoch": 2.633525456292027, "grad_norm": 1.046875, "learning_rate": 0.0004309585573011236, "loss": 4.8316, "mean_token_accuracy": 0.22551288306713105, "num_tokens": 62868334.0, "step": 27415 }, { "entropy": 5.042424583435059, "epoch": 2.634005763688761, "grad_norm": 1.203125, "learning_rate": 0.0004309338441031528, "loss": 4.6388, "mean_token_accuracy": 0.2423781707882881, "num_tokens": 62879459.0, "step": 27420 }, { "entropy": 4.985827016830444, "epoch": 2.6344860710854947, "grad_norm": 1.078125, "learning_rate": 0.00043090912728486135, "loss": 4.6274, "mean_token_accuracy": 0.24838806539773942, "num_tokens": 62890137.0, "step": 27425 }, { "entropy": 5.121232414245606, "epoch": 2.6349663784822286, "grad_norm": 1.0859375, "learning_rate": 0.0004308844068468228, "loss": 4.7832, "mean_token_accuracy": 0.22631029188632965, "num_tokens": 62902688.0, "step": 27430 }, { "entropy": 5.035251426696777, "epoch": 2.6354466858789625, "grad_norm": 1.1640625, "learning_rate": 0.00043085968278961116, "loss": 4.748, "mean_token_accuracy": 0.24102103114128112, "num_tokens": 62913008.0, "step": 27435 }, { "entropy": 5.134324455261231, "epoch": 2.6359269932756964, "grad_norm": 1.1015625, "learning_rate": 0.00043083495511380055, "loss": 4.7854, "mean_token_accuracy": 0.2309037923812866, "num_tokens": 62924894.0, "step": 27440 }, { "entropy": 5.075713205337524, "epoch": 2.6364073006724302, "grad_norm": 1.140625, "learning_rate": 0.00043081022381996506, "loss": 4.6053, "mean_token_accuracy": 0.25516229718923567, "num_tokens": 62935709.0, "step": 27445 }, { "entropy": 5.1133420944213865, "epoch": 2.636887608069164, "grad_norm": 1.125, "learning_rate": 0.0004307854889086787, "loss": 4.8046, "mean_token_accuracy": 0.23403265476226806, "num_tokens": 62947122.0, "step": 27450 }, { "entropy": 5.044565868377686, "epoch": 2.637367915465898, "grad_norm": 1.0625, "learning_rate": 0.00043076075038051605, "loss": 4.7006, "mean_token_accuracy": 0.23542070537805557, "num_tokens": 62957993.0, "step": 27455 }, { "entropy": 5.1721728324890135, "epoch": 2.6378482228626323, "grad_norm": 1.0625, "learning_rate": 0.0004307360082360512, "loss": 4.8084, "mean_token_accuracy": 0.23182854056358337, "num_tokens": 62969645.0, "step": 27460 }, { "entropy": 5.086919450759888, "epoch": 2.6383285302593658, "grad_norm": 1.1328125, "learning_rate": 0.00043071126247585866, "loss": 4.7242, "mean_token_accuracy": 0.2411841481924057, "num_tokens": 62980770.0, "step": 27465 }, { "entropy": 5.03817572593689, "epoch": 2.6388088376561, "grad_norm": 1.3046875, "learning_rate": 0.000430686513100513, "loss": 4.6346, "mean_token_accuracy": 0.24868223518133165, "num_tokens": 62991418.0, "step": 27470 }, { "entropy": 5.026139974594116, "epoch": 2.639289145052834, "grad_norm": 1.0546875, "learning_rate": 0.00043066176011058877, "loss": 4.6808, "mean_token_accuracy": 0.24658098071813583, "num_tokens": 63002228.0, "step": 27475 }, { "entropy": 5.067081069946289, "epoch": 2.639769452449568, "grad_norm": 1.0390625, "learning_rate": 0.00043063700350666066, "loss": 4.7243, "mean_token_accuracy": 0.23191245943307875, "num_tokens": 63013502.0, "step": 27480 }, { "entropy": 5.13173246383667, "epoch": 2.6402497598463017, "grad_norm": 1.0234375, "learning_rate": 0.0004306122432893036, "loss": 4.8459, "mean_token_accuracy": 0.2285308927297592, "num_tokens": 63024530.0, "step": 27485 }, { "entropy": 5.074685430526733, "epoch": 2.6407300672430356, "grad_norm": 1.125, "learning_rate": 0.00043058747945909224, "loss": 4.7229, "mean_token_accuracy": 0.24568860083818436, "num_tokens": 63035668.0, "step": 27490 }, { "entropy": 5.076900672912598, "epoch": 2.6412103746397695, "grad_norm": 1.125, "learning_rate": 0.00043056271201660166, "loss": 4.7192, "mean_token_accuracy": 0.240654818713665, "num_tokens": 63048020.0, "step": 27495 }, { "entropy": 5.099683141708374, "epoch": 2.6416906820365034, "grad_norm": 1.1015625, "learning_rate": 0.0004305379409624068, "loss": 4.7328, "mean_token_accuracy": 0.23687127530574797, "num_tokens": 63060019.0, "step": 27500 }, { "entropy": 5.041384553909301, "epoch": 2.6421709894332372, "grad_norm": 1.09375, "learning_rate": 0.0004305131662970827, "loss": 4.6942, "mean_token_accuracy": 0.23739387094974518, "num_tokens": 63070151.0, "step": 27505 }, { "entropy": 5.056379270553589, "epoch": 2.642651296829971, "grad_norm": 0.99609375, "learning_rate": 0.0004304883880212048, "loss": 4.7149, "mean_token_accuracy": 0.2374277889728546, "num_tokens": 63082174.0, "step": 27510 }, { "entropy": 5.059743070602417, "epoch": 2.643131604226705, "grad_norm": 1.125, "learning_rate": 0.0004304636061353482, "loss": 4.7311, "mean_token_accuracy": 0.24360514134168626, "num_tokens": 63093478.0, "step": 27515 }, { "entropy": 5.002159929275512, "epoch": 2.643611911623439, "grad_norm": 1.1015625, "learning_rate": 0.0004304388206400883, "loss": 4.6349, "mean_token_accuracy": 0.2428338959813118, "num_tokens": 63104153.0, "step": 27520 }, { "entropy": 4.995828104019165, "epoch": 2.6440922190201728, "grad_norm": 1.203125, "learning_rate": 0.0004304140315360005, "loss": 4.6428, "mean_token_accuracy": 0.24678767919540406, "num_tokens": 63115469.0, "step": 27525 }, { "entropy": 5.022479581832886, "epoch": 2.6445725264169067, "grad_norm": 1.125, "learning_rate": 0.0004303892388236604, "loss": 4.7931, "mean_token_accuracy": 0.22994169294834138, "num_tokens": 63126917.0, "step": 27530 }, { "entropy": 5.1109912395477295, "epoch": 2.645052833813641, "grad_norm": 1.0625, "learning_rate": 0.0004303644425036436, "loss": 4.7511, "mean_token_accuracy": 0.23468243926763535, "num_tokens": 63138865.0, "step": 27535 }, { "entropy": 5.048393392562867, "epoch": 2.6455331412103744, "grad_norm": 1.078125, "learning_rate": 0.00043033964257652575, "loss": 4.573, "mean_token_accuracy": 0.24706293046474456, "num_tokens": 63148937.0, "step": 27540 }, { "entropy": 4.972245502471924, "epoch": 2.6460134486071087, "grad_norm": 1.0703125, "learning_rate": 0.0004303148390428827, "loss": 4.6733, "mean_token_accuracy": 0.24274368435144425, "num_tokens": 63160850.0, "step": 27545 }, { "entropy": 5.079576253890991, "epoch": 2.6464937560038426, "grad_norm": 1.09375, "learning_rate": 0.00043029003190329023, "loss": 4.779, "mean_token_accuracy": 0.23910272717475892, "num_tokens": 63171660.0, "step": 27550 }, { "entropy": 5.109176349639893, "epoch": 2.6469740634005765, "grad_norm": 1.09375, "learning_rate": 0.0004302652211583244, "loss": 4.7526, "mean_token_accuracy": 0.23146633058786392, "num_tokens": 63182291.0, "step": 27555 }, { "entropy": 5.085505628585816, "epoch": 2.6474543707973104, "grad_norm": 1.0625, "learning_rate": 0.0004302404068085612, "loss": 4.6465, "mean_token_accuracy": 0.24001386463642121, "num_tokens": 63193646.0, "step": 27560 }, { "entropy": 5.036965370178223, "epoch": 2.6479346781940443, "grad_norm": 1.09375, "learning_rate": 0.0004302155888545766, "loss": 4.807, "mean_token_accuracy": 0.23138225376605986, "num_tokens": 63205021.0, "step": 27565 }, { "entropy": 5.088585996627808, "epoch": 2.648414985590778, "grad_norm": 0.984375, "learning_rate": 0.000430190767296947, "loss": 4.7587, "mean_token_accuracy": 0.2371032327413559, "num_tokens": 63217315.0, "step": 27570 }, { "entropy": 5.120937442779541, "epoch": 2.648895292987512, "grad_norm": 1.0625, "learning_rate": 0.0004301659421362486, "loss": 4.7396, "mean_token_accuracy": 0.23861674815416337, "num_tokens": 63229082.0, "step": 27575 }, { "entropy": 5.21462049484253, "epoch": 2.649375600384246, "grad_norm": 1.0703125, "learning_rate": 0.0004301411133730578, "loss": 4.8436, "mean_token_accuracy": 0.2371189922094345, "num_tokens": 63240498.0, "step": 27580 }, { "entropy": 5.082724905014038, "epoch": 2.64985590778098, "grad_norm": 1.1875, "learning_rate": 0.00043011628100795093, "loss": 4.8199, "mean_token_accuracy": 0.22867369055747985, "num_tokens": 63251149.0, "step": 27585 }, { "entropy": 5.183015918731689, "epoch": 2.6503362151777137, "grad_norm": 1.1171875, "learning_rate": 0.0004300914450415047, "loss": 4.8088, "mean_token_accuracy": 0.23819313049316407, "num_tokens": 63261903.0, "step": 27590 }, { "entropy": 5.048046827316284, "epoch": 2.6508165225744476, "grad_norm": 1.078125, "learning_rate": 0.00043006660547429565, "loss": 4.7319, "mean_token_accuracy": 0.23319058120250702, "num_tokens": 63274281.0, "step": 27595 }, { "entropy": 4.994765424728394, "epoch": 2.6512968299711814, "grad_norm": 1.09375, "learning_rate": 0.0004300417623069005, "loss": 4.6199, "mean_token_accuracy": 0.24826875925064087, "num_tokens": 63285421.0, "step": 27600 }, { "entropy": 5.075887060165405, "epoch": 2.6517771373679153, "grad_norm": 1.109375, "learning_rate": 0.0004300169155398959, "loss": 4.721, "mean_token_accuracy": 0.23933835029602052, "num_tokens": 63296828.0, "step": 27605 }, { "entropy": 5.192564821243286, "epoch": 2.6522574447646496, "grad_norm": 1.109375, "learning_rate": 0.00042999206517385885, "loss": 4.8035, "mean_token_accuracy": 0.2297218009829521, "num_tokens": 63307132.0, "step": 27610 }, { "entropy": 5.140701389312744, "epoch": 2.652737752161383, "grad_norm": 0.99609375, "learning_rate": 0.0004299672112093663, "loss": 4.8137, "mean_token_accuracy": 0.22922593206167222, "num_tokens": 63318911.0, "step": 27615 }, { "entropy": 5.119142007827759, "epoch": 2.6532180595581174, "grad_norm": 1.125, "learning_rate": 0.00042994235364699526, "loss": 4.8356, "mean_token_accuracy": 0.22658012062311172, "num_tokens": 63329973.0, "step": 27620 }, { "entropy": 5.096360969543457, "epoch": 2.653698366954851, "grad_norm": 1.0234375, "learning_rate": 0.0004299174924873229, "loss": 4.764, "mean_token_accuracy": 0.23279329836368562, "num_tokens": 63341576.0, "step": 27625 }, { "entropy": 5.0801787853240965, "epoch": 2.654178674351585, "grad_norm": 1.03125, "learning_rate": 0.0004298926277309263, "loss": 4.7749, "mean_token_accuracy": 0.23500386327505113, "num_tokens": 63353988.0, "step": 27630 }, { "entropy": 5.150972509384156, "epoch": 2.654658981748319, "grad_norm": 1.0625, "learning_rate": 0.00042986775937838283, "loss": 4.771, "mean_token_accuracy": 0.23352182358503343, "num_tokens": 63364746.0, "step": 27635 }, { "entropy": 5.140335607528686, "epoch": 2.655139289145053, "grad_norm": 1.0703125, "learning_rate": 0.0004298428874302699, "loss": 4.786, "mean_token_accuracy": 0.2273385837674141, "num_tokens": 63375608.0, "step": 27640 }, { "entropy": 5.034888410568238, "epoch": 2.655619596541787, "grad_norm": 0.9921875, "learning_rate": 0.0004298180118871649, "loss": 4.6918, "mean_token_accuracy": 0.2379670947790146, "num_tokens": 63386581.0, "step": 27645 }, { "entropy": 5.061635303497314, "epoch": 2.6560999039385207, "grad_norm": 1.1484375, "learning_rate": 0.00042979313274964535, "loss": 4.7719, "mean_token_accuracy": 0.23388247340917587, "num_tokens": 63398844.0, "step": 27650 }, { "entropy": 5.147040510177613, "epoch": 2.6565802113352546, "grad_norm": 1.0546875, "learning_rate": 0.00042976825001828897, "loss": 4.7711, "mean_token_accuracy": 0.240594382584095, "num_tokens": 63409972.0, "step": 27655 }, { "entropy": 5.079216957092285, "epoch": 2.6570605187319885, "grad_norm": 1.09375, "learning_rate": 0.00042974336369367333, "loss": 4.6831, "mean_token_accuracy": 0.24222468584775925, "num_tokens": 63420598.0, "step": 27660 }, { "entropy": 5.138257026672363, "epoch": 2.6575408261287223, "grad_norm": 1.0859375, "learning_rate": 0.0004297184737763763, "loss": 4.8325, "mean_token_accuracy": 0.23077375292778016, "num_tokens": 63431943.0, "step": 27665 }, { "entropy": 5.031608772277832, "epoch": 2.658021133525456, "grad_norm": 1.0859375, "learning_rate": 0.00042969358026697567, "loss": 4.6484, "mean_token_accuracy": 0.24392486214637757, "num_tokens": 63442539.0, "step": 27670 }, { "entropy": 5.033156442642212, "epoch": 2.65850144092219, "grad_norm": 1.0859375, "learning_rate": 0.0004296686831660495, "loss": 4.68, "mean_token_accuracy": 0.23908789008855819, "num_tokens": 63453100.0, "step": 27675 }, { "entropy": 5.078754043579101, "epoch": 2.658981748318924, "grad_norm": 1.0078125, "learning_rate": 0.0004296437824741758, "loss": 4.7895, "mean_token_accuracy": 0.23553553819656373, "num_tokens": 63464226.0, "step": 27680 }, { "entropy": 5.083715200424194, "epoch": 2.659462055715658, "grad_norm": 1.09375, "learning_rate": 0.00042961887819193263, "loss": 4.6776, "mean_token_accuracy": 0.2441350758075714, "num_tokens": 63475159.0, "step": 27685 }, { "entropy": 5.163394260406494, "epoch": 2.6599423631123917, "grad_norm": 0.98828125, "learning_rate": 0.0004295939703198983, "loss": 4.9027, "mean_token_accuracy": 0.2212497740983963, "num_tokens": 63488015.0, "step": 27690 }, { "entropy": 5.097438478469849, "epoch": 2.660422670509126, "grad_norm": 1.015625, "learning_rate": 0.000429569058858651, "loss": 4.7295, "mean_token_accuracy": 0.23866434544324874, "num_tokens": 63501216.0, "step": 27695 }, { "entropy": 5.1523370265960695, "epoch": 2.6609029779058595, "grad_norm": 1.0859375, "learning_rate": 0.00042954414380876906, "loss": 4.813, "mean_token_accuracy": 0.2316722974181175, "num_tokens": 63511560.0, "step": 27700 }, { "entropy": 5.157254886627197, "epoch": 2.661383285302594, "grad_norm": 1.140625, "learning_rate": 0.00042951922517083104, "loss": 4.8948, "mean_token_accuracy": 0.23377819657325744, "num_tokens": 63522762.0, "step": 27705 }, { "entropy": 5.125922918319702, "epoch": 2.6618635926993277, "grad_norm": 1.078125, "learning_rate": 0.0004294943029454155, "loss": 4.8184, "mean_token_accuracy": 0.23747162520885468, "num_tokens": 63535095.0, "step": 27710 }, { "entropy": 5.068044853210449, "epoch": 2.6623439000960616, "grad_norm": 1.0546875, "learning_rate": 0.00042946937713310093, "loss": 4.6503, "mean_token_accuracy": 0.24541468024253846, "num_tokens": 63545256.0, "step": 27715 }, { "entropy": 5.084882020950317, "epoch": 2.6628242074927955, "grad_norm": 1.0859375, "learning_rate": 0.0004294444477344661, "loss": 4.7642, "mean_token_accuracy": 0.2323298290371895, "num_tokens": 63557269.0, "step": 27720 }, { "entropy": 5.0054723739624025, "epoch": 2.6633045148895294, "grad_norm": 1.0234375, "learning_rate": 0.0004294195147500898, "loss": 4.6352, "mean_token_accuracy": 0.24495234042406083, "num_tokens": 63568452.0, "step": 27725 }, { "entropy": 5.117168378829956, "epoch": 2.6637848222862632, "grad_norm": 0.984375, "learning_rate": 0.00042939457818055095, "loss": 4.8288, "mean_token_accuracy": 0.22925741076469422, "num_tokens": 63581653.0, "step": 27730 }, { "entropy": 5.056406259536743, "epoch": 2.664265129682997, "grad_norm": 1.0546875, "learning_rate": 0.00042936963802642843, "loss": 4.6295, "mean_token_accuracy": 0.23729730695486068, "num_tokens": 63592862.0, "step": 27735 }, { "entropy": 5.104184293746949, "epoch": 2.664745437079731, "grad_norm": 1.015625, "learning_rate": 0.0004293446942883013, "loss": 4.7992, "mean_token_accuracy": 0.23112255334854126, "num_tokens": 63603306.0, "step": 27740 }, { "entropy": 5.138091516494751, "epoch": 2.665225744476465, "grad_norm": 1.0859375, "learning_rate": 0.00042931974696674866, "loss": 4.7845, "mean_token_accuracy": 0.23358572274446487, "num_tokens": 63615031.0, "step": 27745 }, { "entropy": 5.125448942184448, "epoch": 2.6657060518731988, "grad_norm": 0.98828125, "learning_rate": 0.0004292947960623497, "loss": 4.8156, "mean_token_accuracy": 0.2346802145242691, "num_tokens": 63627004.0, "step": 27750 }, { "entropy": 5.10497636795044, "epoch": 2.6661863592699326, "grad_norm": 1.015625, "learning_rate": 0.00042926984157568384, "loss": 4.7762, "mean_token_accuracy": 0.23420519083738328, "num_tokens": 63637712.0, "step": 27755 }, { "entropy": 5.08453893661499, "epoch": 2.6666666666666665, "grad_norm": 0.99609375, "learning_rate": 0.00042924488350733024, "loss": 4.787, "mean_token_accuracy": 0.23482900261878967, "num_tokens": 63649631.0, "step": 27760 }, { "entropy": 5.143103504180909, "epoch": 2.6671469740634004, "grad_norm": 1.1796875, "learning_rate": 0.00042921992185786847, "loss": 4.7379, "mean_token_accuracy": 0.23483039140701295, "num_tokens": 63661929.0, "step": 27765 }, { "entropy": 5.112990140914917, "epoch": 2.6676272814601347, "grad_norm": 1.0390625, "learning_rate": 0.00042919495662787813, "loss": 4.8094, "mean_token_accuracy": 0.22674295753240586, "num_tokens": 63674411.0, "step": 27770 }, { "entropy": 5.083460283279419, "epoch": 2.668107588856868, "grad_norm": 1.2109375, "learning_rate": 0.0004291699878179387, "loss": 4.677, "mean_token_accuracy": 0.23690251559019088, "num_tokens": 63685182.0, "step": 27775 }, { "entropy": 5.082758903503418, "epoch": 2.6685878962536025, "grad_norm": 1.09375, "learning_rate": 0.0004291450154286299, "loss": 4.7682, "mean_token_accuracy": 0.22938674837350845, "num_tokens": 63696374.0, "step": 27780 }, { "entropy": 5.066847276687622, "epoch": 2.6690682036503364, "grad_norm": 1.046875, "learning_rate": 0.0004291200394605317, "loss": 4.7926, "mean_token_accuracy": 0.23720103353261948, "num_tokens": 63708092.0, "step": 27785 }, { "entropy": 5.027221441268921, "epoch": 2.6695485110470702, "grad_norm": 1.046875, "learning_rate": 0.0004290950599142237, "loss": 4.6912, "mean_token_accuracy": 0.24183435142040252, "num_tokens": 63721143.0, "step": 27790 }, { "entropy": 5.13440351486206, "epoch": 2.670028818443804, "grad_norm": 0.95703125, "learning_rate": 0.000429070076790286, "loss": 4.7732, "mean_token_accuracy": 0.23022121489048003, "num_tokens": 63732346.0, "step": 27795 }, { "entropy": 5.032449340820312, "epoch": 2.670509125840538, "grad_norm": 1.0703125, "learning_rate": 0.00042904509008929873, "loss": 4.6695, "mean_token_accuracy": 0.24931943714618682, "num_tokens": 63744008.0, "step": 27800 }, { "entropy": 5.087543678283692, "epoch": 2.670989433237272, "grad_norm": 1.0234375, "learning_rate": 0.0004290200998118417, "loss": 4.7047, "mean_token_accuracy": 0.24750794917345048, "num_tokens": 63754792.0, "step": 27805 }, { "entropy": 5.059582996368408, "epoch": 2.6714697406340058, "grad_norm": 1.0625, "learning_rate": 0.00042899510595849544, "loss": 4.6903, "mean_token_accuracy": 0.23685694187879563, "num_tokens": 63767194.0, "step": 27810 }, { "entropy": 5.0132557392120365, "epoch": 2.6719500480307397, "grad_norm": 1.078125, "learning_rate": 0.00042897010852984004, "loss": 4.7079, "mean_token_accuracy": 0.2424158573150635, "num_tokens": 63778641.0, "step": 27815 }, { "entropy": 5.095559215545654, "epoch": 2.6724303554274735, "grad_norm": 1.0078125, "learning_rate": 0.00042894510752645586, "loss": 4.7444, "mean_token_accuracy": 0.23031575381755828, "num_tokens": 63791447.0, "step": 27820 }, { "entropy": 5.129614448547363, "epoch": 2.6729106628242074, "grad_norm": 1.171875, "learning_rate": 0.0004289201029489235, "loss": 4.8211, "mean_token_accuracy": 0.2294018790125847, "num_tokens": 63802162.0, "step": 27825 }, { "entropy": 5.1430269241333, "epoch": 2.6733909702209413, "grad_norm": 1.0625, "learning_rate": 0.0004288950947978234, "loss": 4.7005, "mean_token_accuracy": 0.23975236117839813, "num_tokens": 63812689.0, "step": 27830 }, { "entropy": 5.1405720710754395, "epoch": 2.673871277617675, "grad_norm": 1.140625, "learning_rate": 0.0004288700830737361, "loss": 4.7763, "mean_token_accuracy": 0.23490793853998185, "num_tokens": 63822915.0, "step": 27835 }, { "entropy": 5.038186025619507, "epoch": 2.674351585014409, "grad_norm": 1.046875, "learning_rate": 0.00042884506777724244, "loss": 4.7465, "mean_token_accuracy": 0.23948826938867568, "num_tokens": 63834350.0, "step": 27840 }, { "entropy": 4.974474334716797, "epoch": 2.6748318924111434, "grad_norm": 1.0703125, "learning_rate": 0.0004288200489089231, "loss": 4.667, "mean_token_accuracy": 0.24383982121944428, "num_tokens": 63846150.0, "step": 27845 }, { "entropy": 5.162803077697754, "epoch": 2.675312199807877, "grad_norm": 1.1171875, "learning_rate": 0.000428795026469359, "loss": 4.7259, "mean_token_accuracy": 0.23322818130254747, "num_tokens": 63857675.0, "step": 27850 }, { "entropy": 5.184978771209717, "epoch": 2.675792507204611, "grad_norm": 1.1640625, "learning_rate": 0.000428770000459131, "loss": 4.8741, "mean_token_accuracy": 0.2263594910502434, "num_tokens": 63868073.0, "step": 27855 }, { "entropy": 5.11458010673523, "epoch": 2.6762728146013446, "grad_norm": 1.1875, "learning_rate": 0.0004287449708788202, "loss": 4.7497, "mean_token_accuracy": 0.239044252038002, "num_tokens": 63879085.0, "step": 27860 }, { "entropy": 5.038825511932373, "epoch": 2.676753121998079, "grad_norm": 1.125, "learning_rate": 0.0004287199377290077, "loss": 4.6583, "mean_token_accuracy": 0.24389497488737105, "num_tokens": 63890885.0, "step": 27865 }, { "entropy": 5.092413330078125, "epoch": 2.677233429394813, "grad_norm": 1.1015625, "learning_rate": 0.0004286949010102748, "loss": 4.7329, "mean_token_accuracy": 0.23647015541791916, "num_tokens": 63902421.0, "step": 27870 }, { "entropy": 5.077416276931762, "epoch": 2.6777137367915467, "grad_norm": 1.1328125, "learning_rate": 0.0004286698607232026, "loss": 4.7645, "mean_token_accuracy": 0.24023524522781373, "num_tokens": 63913845.0, "step": 27875 }, { "entropy": 4.969077301025391, "epoch": 2.6781940441882806, "grad_norm": 1.0859375, "learning_rate": 0.00042864481686837253, "loss": 4.6337, "mean_token_accuracy": 0.2435666501522064, "num_tokens": 63925429.0, "step": 27880 }, { "entropy": 5.053189754486084, "epoch": 2.6786743515850144, "grad_norm": 1.0, "learning_rate": 0.00042861976944636604, "loss": 4.7372, "mean_token_accuracy": 0.23916109502315522, "num_tokens": 63939451.0, "step": 27885 }, { "entropy": 5.203045845031738, "epoch": 2.6791546589817483, "grad_norm": 1.125, "learning_rate": 0.0004285947184577647, "loss": 4.8214, "mean_token_accuracy": 0.23106684535741806, "num_tokens": 63949077.0, "step": 27890 }, { "entropy": 5.1415793895721436, "epoch": 2.679634966378482, "grad_norm": 1.0703125, "learning_rate": 0.00042856966390315013, "loss": 4.7746, "mean_token_accuracy": 0.2313278779387474, "num_tokens": 63960553.0, "step": 27895 }, { "entropy": 5.140982055664063, "epoch": 2.680115273775216, "grad_norm": 1.109375, "learning_rate": 0.0004285446057831039, "loss": 4.8116, "mean_token_accuracy": 0.22729451060295106, "num_tokens": 63973086.0, "step": 27900 }, { "entropy": 5.056582450866699, "epoch": 2.68059558117195, "grad_norm": 1.1875, "learning_rate": 0.0004285195440982078, "loss": 4.7607, "mean_token_accuracy": 0.22835081964731216, "num_tokens": 63983933.0, "step": 27905 }, { "entropy": 5.026354026794434, "epoch": 2.681075888568684, "grad_norm": 1.03125, "learning_rate": 0.0004284944788490439, "loss": 4.636, "mean_token_accuracy": 0.24929089844226837, "num_tokens": 63994557.0, "step": 27910 }, { "entropy": 5.155221891403198, "epoch": 2.6815561959654177, "grad_norm": 1.109375, "learning_rate": 0.0004284694100361938, "loss": 4.8504, "mean_token_accuracy": 0.2283597931265831, "num_tokens": 64006368.0, "step": 27915 }, { "entropy": 5.095969390869141, "epoch": 2.682036503362152, "grad_norm": 1.0703125, "learning_rate": 0.0004284443376602398, "loss": 4.6959, "mean_token_accuracy": 0.2350688710808754, "num_tokens": 64018336.0, "step": 27920 }, { "entropy": 5.120890522003174, "epoch": 2.6825168107588855, "grad_norm": 1.1328125, "learning_rate": 0.0004284192617217639, "loss": 4.811, "mean_token_accuracy": 0.23156831711530684, "num_tokens": 64030329.0, "step": 27925 }, { "entropy": 5.079585886001587, "epoch": 2.68299711815562, "grad_norm": 1.1328125, "learning_rate": 0.0004283941822213484, "loss": 4.7732, "mean_token_accuracy": 0.23211552947759628, "num_tokens": 64042502.0, "step": 27930 }, { "entropy": 5.088845252990723, "epoch": 2.6834774255523532, "grad_norm": 1.0859375, "learning_rate": 0.0004283690991595754, "loss": 4.7424, "mean_token_accuracy": 0.23303174823522568, "num_tokens": 64054217.0, "step": 27935 }, { "entropy": 5.216445446014404, "epoch": 2.6839577329490876, "grad_norm": 1.0390625, "learning_rate": 0.00042834401253702734, "loss": 4.8074, "mean_token_accuracy": 0.2379058927297592, "num_tokens": 64065019.0, "step": 27940 }, { "entropy": 5.018643236160278, "epoch": 2.6844380403458215, "grad_norm": 1.015625, "learning_rate": 0.0004283189223542866, "loss": 4.6702, "mean_token_accuracy": 0.2423916980624199, "num_tokens": 64076251.0, "step": 27945 }, { "entropy": 5.0703541278839115, "epoch": 2.6849183477425553, "grad_norm": 1.1328125, "learning_rate": 0.00042829382861193585, "loss": 4.7322, "mean_token_accuracy": 0.23972439169883727, "num_tokens": 64087257.0, "step": 27950 }, { "entropy": 5.083153772354126, "epoch": 2.685398655139289, "grad_norm": 1.0078125, "learning_rate": 0.0004282687313105575, "loss": 4.7656, "mean_token_accuracy": 0.22837662547826768, "num_tokens": 64098304.0, "step": 27955 }, { "entropy": 5.092272567749023, "epoch": 2.685878962536023, "grad_norm": 1.0546875, "learning_rate": 0.00042824363045073434, "loss": 4.7113, "mean_token_accuracy": 0.24303918182849885, "num_tokens": 64108839.0, "step": 27960 }, { "entropy": 5.149320268630982, "epoch": 2.686359269932757, "grad_norm": 1.09375, "learning_rate": 0.0004282185260330491, "loss": 4.821, "mean_token_accuracy": 0.23023061752319335, "num_tokens": 64121579.0, "step": 27965 }, { "entropy": 5.015567398071289, "epoch": 2.686839577329491, "grad_norm": 1.15625, "learning_rate": 0.00042819341805808473, "loss": 4.6742, "mean_token_accuracy": 0.24303774684667587, "num_tokens": 64132220.0, "step": 27970 }, { "entropy": 5.044483709335327, "epoch": 2.6873198847262247, "grad_norm": 1.3203125, "learning_rate": 0.00042816830652642396, "loss": 4.7284, "mean_token_accuracy": 0.2366749495267868, "num_tokens": 64143580.0, "step": 27975 }, { "entropy": 5.081659555435181, "epoch": 2.6878001921229586, "grad_norm": 1.15625, "learning_rate": 0.0004281431914386501, "loss": 4.7454, "mean_token_accuracy": 0.23123384416103362, "num_tokens": 64155042.0, "step": 27980 }, { "entropy": 5.1955053329467775, "epoch": 2.6882804995196925, "grad_norm": 1.0625, "learning_rate": 0.000428118072795346, "loss": 4.8289, "mean_token_accuracy": 0.23210142105817794, "num_tokens": 64167755.0, "step": 27985 }, { "entropy": 5.125277090072632, "epoch": 2.6887608069164264, "grad_norm": 1.1328125, "learning_rate": 0.00042809295059709483, "loss": 4.7947, "mean_token_accuracy": 0.23100631237030028, "num_tokens": 64180930.0, "step": 27990 }, { "entropy": 5.084764528274536, "epoch": 2.6892411143131603, "grad_norm": 1.1328125, "learning_rate": 0.00042806782484448, "loss": 4.7767, "mean_token_accuracy": 0.2393814891576767, "num_tokens": 64192994.0, "step": 27995 }, { "entropy": 5.138259649276733, "epoch": 2.689721421709894, "grad_norm": 1.09375, "learning_rate": 0.0004280426955380848, "loss": 4.7912, "mean_token_accuracy": 0.23672110140323638, "num_tokens": 64204572.0, "step": 28000 }, { "entropy": 5.04312310218811, "epoch": 2.6902017291066285, "grad_norm": 1.2578125, "learning_rate": 0.00042801756267849266, "loss": 4.7363, "mean_token_accuracy": 0.23942153304815292, "num_tokens": 64216361.0, "step": 28005 }, { "entropy": 5.029325532913208, "epoch": 2.690682036503362, "grad_norm": 1.1875, "learning_rate": 0.0004279924262662871, "loss": 4.6864, "mean_token_accuracy": 0.24289357662200928, "num_tokens": 64227074.0, "step": 28010 }, { "entropy": 5.0360212326049805, "epoch": 2.6911623439000962, "grad_norm": 1.1796875, "learning_rate": 0.0004279672863020517, "loss": 4.6994, "mean_token_accuracy": 0.2375594422221184, "num_tokens": 64239507.0, "step": 28015 }, { "entropy": 5.150512218475342, "epoch": 2.69164265129683, "grad_norm": 1.109375, "learning_rate": 0.00042794214278637013, "loss": 4.8131, "mean_token_accuracy": 0.23091583847999572, "num_tokens": 64251307.0, "step": 28020 }, { "entropy": 5.080439472198487, "epoch": 2.692122958693564, "grad_norm": 1.015625, "learning_rate": 0.00042791699571982606, "loss": 4.7844, "mean_token_accuracy": 0.23450076282024385, "num_tokens": 64263988.0, "step": 28025 }, { "entropy": 5.104472923278808, "epoch": 2.692603266090298, "grad_norm": 1.1953125, "learning_rate": 0.0004278918451030035, "loss": 4.7919, "mean_token_accuracy": 0.2293301820755005, "num_tokens": 64275311.0, "step": 28030 }, { "entropy": 5.169442796707154, "epoch": 2.6930835734870318, "grad_norm": 1.1015625, "learning_rate": 0.0004278666909364863, "loss": 4.8172, "mean_token_accuracy": 0.23217507153749467, "num_tokens": 64286055.0, "step": 28035 }, { "entropy": 5.032286691665649, "epoch": 2.6935638808837656, "grad_norm": 1.1015625, "learning_rate": 0.0004278415332208584, "loss": 4.711, "mean_token_accuracy": 0.24192306250333787, "num_tokens": 64297286.0, "step": 28040 }, { "entropy": 5.09624924659729, "epoch": 2.6940441882804995, "grad_norm": 1.0859375, "learning_rate": 0.00042781637195670396, "loss": 4.7968, "mean_token_accuracy": 0.23089852035045624, "num_tokens": 64308277.0, "step": 28045 }, { "entropy": 5.267874479293823, "epoch": 2.6945244956772334, "grad_norm": 1.1171875, "learning_rate": 0.0004277912071446072, "loss": 4.9367, "mean_token_accuracy": 0.21548387855291368, "num_tokens": 64319260.0, "step": 28050 }, { "entropy": 5.188047170639038, "epoch": 2.6950048030739673, "grad_norm": 1.1015625, "learning_rate": 0.0004277660387851522, "loss": 4.8236, "mean_token_accuracy": 0.2330697700381279, "num_tokens": 64329840.0, "step": 28055 }, { "entropy": 5.199683332443238, "epoch": 2.695485110470701, "grad_norm": 1.046875, "learning_rate": 0.0004277408668789235, "loss": 4.894, "mean_token_accuracy": 0.22882702201604843, "num_tokens": 64342375.0, "step": 28060 }, { "entropy": 4.991396713256836, "epoch": 2.695965417867435, "grad_norm": 1.015625, "learning_rate": 0.0004277156914265054, "loss": 4.6266, "mean_token_accuracy": 0.25021957606077194, "num_tokens": 64353396.0, "step": 28065 }, { "entropy": 5.153245162963867, "epoch": 2.696445725264169, "grad_norm": 1.1328125, "learning_rate": 0.0004276905124284824, "loss": 4.809, "mean_token_accuracy": 0.23273993134498597, "num_tokens": 64364802.0, "step": 28070 }, { "entropy": 5.133913660049439, "epoch": 2.696926032660903, "grad_norm": 1.0859375, "learning_rate": 0.0004276653298854391, "loss": 4.7728, "mean_token_accuracy": 0.2290853813290596, "num_tokens": 64376379.0, "step": 28075 }, { "entropy": 5.084238815307617, "epoch": 2.697406340057637, "grad_norm": 1.109375, "learning_rate": 0.0004276401437979601, "loss": 4.7893, "mean_token_accuracy": 0.23213465213775636, "num_tokens": 64388039.0, "step": 28080 }, { "entropy": 5.1134857654571535, "epoch": 2.6978866474543706, "grad_norm": 1.046875, "learning_rate": 0.0004276149541666303, "loss": 4.7497, "mean_token_accuracy": 0.23422105759382247, "num_tokens": 64401288.0, "step": 28085 }, { "entropy": 5.13084306716919, "epoch": 2.698366954851105, "grad_norm": 1.0859375, "learning_rate": 0.00042758976099203444, "loss": 4.7295, "mean_token_accuracy": 0.2404603809118271, "num_tokens": 64411392.0, "step": 28090 }, { "entropy": 5.123020887374878, "epoch": 2.6988472622478388, "grad_norm": 1.140625, "learning_rate": 0.00042756456427475736, "loss": 4.8165, "mean_token_accuracy": 0.23579190522432328, "num_tokens": 64421955.0, "step": 28095 }, { "entropy": 5.074358367919922, "epoch": 2.6993275696445727, "grad_norm": 0.984375, "learning_rate": 0.0004275393640153842, "loss": 4.7291, "mean_token_accuracy": 0.23813998848199844, "num_tokens": 64434365.0, "step": 28100 }, { "entropy": 5.056596994400024, "epoch": 2.6998078770413065, "grad_norm": 1.140625, "learning_rate": 0.00042751416021449986, "loss": 4.691, "mean_token_accuracy": 0.2383261129260063, "num_tokens": 64444846.0, "step": 28105 }, { "entropy": 5.061080837249756, "epoch": 2.7002881844380404, "grad_norm": 1.09375, "learning_rate": 0.0004274889528726896, "loss": 4.6862, "mean_token_accuracy": 0.23701339811086655, "num_tokens": 64455215.0, "step": 28110 }, { "entropy": 5.132177495956421, "epoch": 2.7007684918347743, "grad_norm": 1.0703125, "learning_rate": 0.0004274637419905388, "loss": 4.724, "mean_token_accuracy": 0.24133433252573014, "num_tokens": 64465810.0, "step": 28115 }, { "entropy": 5.069816207885742, "epoch": 2.701248799231508, "grad_norm": 1.25, "learning_rate": 0.00042743852756863253, "loss": 4.6915, "mean_token_accuracy": 0.24327150732278824, "num_tokens": 64476615.0, "step": 28120 }, { "entropy": 5.0869025707244875, "epoch": 2.701729106628242, "grad_norm": 1.1328125, "learning_rate": 0.0004274133096075563, "loss": 4.7701, "mean_token_accuracy": 0.23665109276771545, "num_tokens": 64490137.0, "step": 28125 }, { "entropy": 5.078602361679077, "epoch": 2.702209414024976, "grad_norm": 1.140625, "learning_rate": 0.0004273880881078956, "loss": 4.7441, "mean_token_accuracy": 0.24227222949266433, "num_tokens": 64501357.0, "step": 28130 }, { "entropy": 4.969816112518311, "epoch": 2.70268972142171, "grad_norm": 1.0859375, "learning_rate": 0.0004273628630702359, "loss": 4.5719, "mean_token_accuracy": 0.24865195155143738, "num_tokens": 64512784.0, "step": 28135 }, { "entropy": 5.0676685810089115, "epoch": 2.7031700288184437, "grad_norm": 1.1015625, "learning_rate": 0.00042733763449516313, "loss": 4.7317, "mean_token_accuracy": 0.24380185604095458, "num_tokens": 64523800.0, "step": 28140 }, { "entropy": 5.086323976516724, "epoch": 2.7036503362151776, "grad_norm": 1.03125, "learning_rate": 0.00042731240238326273, "loss": 4.7152, "mean_token_accuracy": 0.23434408009052277, "num_tokens": 64535551.0, "step": 28145 }, { "entropy": 5.077157020568848, "epoch": 2.7041306436119115, "grad_norm": 1.0, "learning_rate": 0.00042728716673512065, "loss": 4.7252, "mean_token_accuracy": 0.24621228575706483, "num_tokens": 64547892.0, "step": 28150 }, { "entropy": 4.9974264144897464, "epoch": 2.704610951008646, "grad_norm": 1.125, "learning_rate": 0.00042726192755132276, "loss": 4.6678, "mean_token_accuracy": 0.24135030061006546, "num_tokens": 64559687.0, "step": 28155 }, { "entropy": 5.046790599822998, "epoch": 2.7050912584053792, "grad_norm": 1.0, "learning_rate": 0.00042723668483245496, "loss": 4.6399, "mean_token_accuracy": 0.24230584502220154, "num_tokens": 64571691.0, "step": 28160 }, { "entropy": 5.146127080917358, "epoch": 2.7055715658021136, "grad_norm": 1.0, "learning_rate": 0.0004272114385791035, "loss": 4.8018, "mean_token_accuracy": 0.23526528775691985, "num_tokens": 64582570.0, "step": 28165 }, { "entropy": 5.111129426956177, "epoch": 2.706051873198847, "grad_norm": 1.2265625, "learning_rate": 0.00042718618879185435, "loss": 4.7435, "mean_token_accuracy": 0.23832932561635972, "num_tokens": 64593610.0, "step": 28170 }, { "entropy": 5.058528900146484, "epoch": 2.7065321805955813, "grad_norm": 0.984375, "learning_rate": 0.0004271609354712938, "loss": 4.6835, "mean_token_accuracy": 0.24811802953481674, "num_tokens": 64604802.0, "step": 28175 }, { "entropy": 5.051104402542114, "epoch": 2.707012487992315, "grad_norm": 1.0859375, "learning_rate": 0.0004271356786180082, "loss": 4.7137, "mean_token_accuracy": 0.2327448919415474, "num_tokens": 64615816.0, "step": 28180 }, { "entropy": 5.00749945640564, "epoch": 2.707492795389049, "grad_norm": 1.1015625, "learning_rate": 0.0004271104182325838, "loss": 4.6554, "mean_token_accuracy": 0.24093613475561143, "num_tokens": 64628890.0, "step": 28185 }, { "entropy": 5.027282094955444, "epoch": 2.707973102785783, "grad_norm": 1.1484375, "learning_rate": 0.00042708515431560723, "loss": 4.6951, "mean_token_accuracy": 0.24452045410871506, "num_tokens": 64639494.0, "step": 28190 }, { "entropy": 5.180206060409546, "epoch": 2.708453410182517, "grad_norm": 1.078125, "learning_rate": 0.000427059886867665, "loss": 4.9158, "mean_token_accuracy": 0.22348858714103698, "num_tokens": 64651235.0, "step": 28195 }, { "entropy": 5.1558619976043705, "epoch": 2.7089337175792507, "grad_norm": 1.1015625, "learning_rate": 0.0004270346158893436, "loss": 4.7674, "mean_token_accuracy": 0.23493716567754747, "num_tokens": 64662517.0, "step": 28200 }, { "entropy": 5.112757205963135, "epoch": 2.7094140249759846, "grad_norm": 1.21875, "learning_rate": 0.00042700934138123004, "loss": 4.8213, "mean_token_accuracy": 0.2212560459971428, "num_tokens": 64674521.0, "step": 28205 }, { "entropy": 5.166494464874267, "epoch": 2.7098943323727185, "grad_norm": 1.015625, "learning_rate": 0.00042698406334391084, "loss": 4.9204, "mean_token_accuracy": 0.22131302058696747, "num_tokens": 64686894.0, "step": 28210 }, { "entropy": 5.172549533843994, "epoch": 2.7103746397694524, "grad_norm": 1.046875, "learning_rate": 0.000426958781777973, "loss": 4.8365, "mean_token_accuracy": 0.23947456032037734, "num_tokens": 64697948.0, "step": 28215 }, { "entropy": 5.080643033981323, "epoch": 2.7108549471661862, "grad_norm": 1.1640625, "learning_rate": 0.0004269334966840035, "loss": 4.7004, "mean_token_accuracy": 0.24020812660455704, "num_tokens": 64709429.0, "step": 28220 }, { "entropy": 5.072011661529541, "epoch": 2.71133525456292, "grad_norm": 1.09375, "learning_rate": 0.00042690820806258933, "loss": 4.6912, "mean_token_accuracy": 0.24333603233098983, "num_tokens": 64720529.0, "step": 28225 }, { "entropy": 5.007713651657104, "epoch": 2.7118155619596545, "grad_norm": 1.046875, "learning_rate": 0.0004268829159143176, "loss": 4.667, "mean_token_accuracy": 0.2444024607539177, "num_tokens": 64731602.0, "step": 28230 }, { "entropy": 5.138308429718018, "epoch": 2.712295869356388, "grad_norm": 1.21875, "learning_rate": 0.0004268576202397757, "loss": 4.6776, "mean_token_accuracy": 0.23795579075813295, "num_tokens": 64742746.0, "step": 28235 }, { "entropy": 5.0846727848052975, "epoch": 2.712776176753122, "grad_norm": 0.99609375, "learning_rate": 0.0004268323210395506, "loss": 4.7464, "mean_token_accuracy": 0.23865850120782853, "num_tokens": 64754643.0, "step": 28240 }, { "entropy": 5.182872915267945, "epoch": 2.7132564841498557, "grad_norm": 1.03125, "learning_rate": 0.00042680701831423004, "loss": 4.8211, "mean_token_accuracy": 0.230291485786438, "num_tokens": 64766349.0, "step": 28245 }, { "entropy": 5.03706693649292, "epoch": 2.71373679154659, "grad_norm": 1.125, "learning_rate": 0.0004267817120644012, "loss": 4.7334, "mean_token_accuracy": 0.23736280649900438, "num_tokens": 64777645.0, "step": 28250 }, { "entropy": 5.090947532653809, "epoch": 2.714217098943324, "grad_norm": 1.0859375, "learning_rate": 0.00042675640229065167, "loss": 4.7662, "mean_token_accuracy": 0.2323785498738289, "num_tokens": 64788065.0, "step": 28255 }, { "entropy": 5.138057613372803, "epoch": 2.7146974063400577, "grad_norm": 1.125, "learning_rate": 0.00042673108899356915, "loss": 4.7284, "mean_token_accuracy": 0.24340671747922898, "num_tokens": 64798938.0, "step": 28260 }, { "entropy": 5.0383988380432125, "epoch": 2.7151777137367916, "grad_norm": 1.046875, "learning_rate": 0.0004267057721737413, "loss": 4.7627, "mean_token_accuracy": 0.23495194613933562, "num_tokens": 64810572.0, "step": 28265 }, { "entropy": 5.016577291488647, "epoch": 2.7156580211335255, "grad_norm": 1.1015625, "learning_rate": 0.0004266804518317559, "loss": 4.7593, "mean_token_accuracy": 0.2415664240717888, "num_tokens": 64821877.0, "step": 28270 }, { "entropy": 5.098750972747803, "epoch": 2.7161383285302594, "grad_norm": 1.1484375, "learning_rate": 0.0004266551279682008, "loss": 4.6987, "mean_token_accuracy": 0.2440878689289093, "num_tokens": 64832737.0, "step": 28275 }, { "entropy": 5.076100206375122, "epoch": 2.7166186359269933, "grad_norm": 1.140625, "learning_rate": 0.0004266298005836639, "loss": 4.7607, "mean_token_accuracy": 0.22922112345695494, "num_tokens": 64844713.0, "step": 28280 }, { "entropy": 5.078899669647217, "epoch": 2.717098943323727, "grad_norm": 1.046875, "learning_rate": 0.00042660446967873327, "loss": 4.7614, "mean_token_accuracy": 0.23753941804170609, "num_tokens": 64855896.0, "step": 28285 }, { "entropy": 5.141567993164062, "epoch": 2.717579250720461, "grad_norm": 1.234375, "learning_rate": 0.00042657913525399703, "loss": 4.8146, "mean_token_accuracy": 0.23122312724590302, "num_tokens": 64866529.0, "step": 28290 }, { "entropy": 5.0869077205657955, "epoch": 2.718059558117195, "grad_norm": 1.171875, "learning_rate": 0.0004265537973100435, "loss": 4.7199, "mean_token_accuracy": 0.2347149908542633, "num_tokens": 64876691.0, "step": 28295 }, { "entropy": 5.036520099639892, "epoch": 2.718539865513929, "grad_norm": 1.046875, "learning_rate": 0.0004265284558474607, "loss": 4.7108, "mean_token_accuracy": 0.23933310508728028, "num_tokens": 64887271.0, "step": 28300 }, { "entropy": 5.102957010269165, "epoch": 2.7190201729106627, "grad_norm": 0.96875, "learning_rate": 0.00042650311086683715, "loss": 4.8139, "mean_token_accuracy": 0.22686078101396562, "num_tokens": 64900698.0, "step": 28305 }, { "entropy": 5.1559515476226805, "epoch": 2.7195004803073966, "grad_norm": 1.09375, "learning_rate": 0.0004264777623687612, "loss": 4.7448, "mean_token_accuracy": 0.23295564502477645, "num_tokens": 64911349.0, "step": 28310 }, { "entropy": 5.051888799667358, "epoch": 2.719980787704131, "grad_norm": 1.046875, "learning_rate": 0.0004264524103538214, "loss": 4.7408, "mean_token_accuracy": 0.24038063436746598, "num_tokens": 64922522.0, "step": 28315 }, { "entropy": 5.099909830093384, "epoch": 2.7204610951008643, "grad_norm": 1.1484375, "learning_rate": 0.0004264270548226064, "loss": 4.766, "mean_token_accuracy": 0.23822131007909775, "num_tokens": 64934796.0, "step": 28320 }, { "entropy": 5.10898756980896, "epoch": 2.7209414024975986, "grad_norm": 1.0234375, "learning_rate": 0.0004264016957757048, "loss": 4.7571, "mean_token_accuracy": 0.24004254788160323, "num_tokens": 64946717.0, "step": 28325 }, { "entropy": 5.098352909088135, "epoch": 2.7214217098943325, "grad_norm": 1.0078125, "learning_rate": 0.00042637633321370545, "loss": 4.8395, "mean_token_accuracy": 0.2257276654243469, "num_tokens": 64958818.0, "step": 28330 }, { "entropy": 5.11536021232605, "epoch": 2.7219020172910664, "grad_norm": 1.0625, "learning_rate": 0.0004263509671371971, "loss": 4.6868, "mean_token_accuracy": 0.23188695609569548, "num_tokens": 64968974.0, "step": 28335 }, { "entropy": 5.114422798156738, "epoch": 2.7223823246878003, "grad_norm": 1.1015625, "learning_rate": 0.00042632559754676865, "loss": 4.7642, "mean_token_accuracy": 0.23466452211141586, "num_tokens": 64980347.0, "step": 28340 }, { "entropy": 5.088268184661866, "epoch": 2.722862632084534, "grad_norm": 1.0078125, "learning_rate": 0.0004263002244430092, "loss": 4.7339, "mean_token_accuracy": 0.23719714879989623, "num_tokens": 64992916.0, "step": 28345 }, { "entropy": 5.093293190002441, "epoch": 2.723342939481268, "grad_norm": 1.03125, "learning_rate": 0.0004262748478265078, "loss": 4.81, "mean_token_accuracy": 0.23484614342451096, "num_tokens": 65003382.0, "step": 28350 }, { "entropy": 4.988045167922974, "epoch": 2.723823246878002, "grad_norm": 0.98828125, "learning_rate": 0.0004262494676978537, "loss": 4.5804, "mean_token_accuracy": 0.25099532306194305, "num_tokens": 65014064.0, "step": 28355 }, { "entropy": 5.103696966171265, "epoch": 2.724303554274736, "grad_norm": 1.1328125, "learning_rate": 0.00042622408405763607, "loss": 4.7906, "mean_token_accuracy": 0.22995427697896959, "num_tokens": 65025701.0, "step": 28360 }, { "entropy": 5.142344427108765, "epoch": 2.7247838616714697, "grad_norm": 1.0546875, "learning_rate": 0.0004261986969064442, "loss": 4.7664, "mean_token_accuracy": 0.23485698848962783, "num_tokens": 65038270.0, "step": 28365 }, { "entropy": 5.065335035324097, "epoch": 2.7252641690682036, "grad_norm": 1.0703125, "learning_rate": 0.00042617330624486753, "loss": 4.7015, "mean_token_accuracy": 0.24524183720350265, "num_tokens": 65049453.0, "step": 28370 }, { "entropy": 4.921413230895996, "epoch": 2.7257444764649374, "grad_norm": 1.0078125, "learning_rate": 0.0004261479120734956, "loss": 4.5774, "mean_token_accuracy": 0.2530855819582939, "num_tokens": 65061211.0, "step": 28375 }, { "entropy": 5.010696601867676, "epoch": 2.7262247838616713, "grad_norm": 1.15625, "learning_rate": 0.000426122514392918, "loss": 4.649, "mean_token_accuracy": 0.24367837458848954, "num_tokens": 65073073.0, "step": 28380 }, { "entropy": 5.078413200378418, "epoch": 2.726705091258405, "grad_norm": 1.015625, "learning_rate": 0.00042609711320372435, "loss": 4.7231, "mean_token_accuracy": 0.24840225130319596, "num_tokens": 65084647.0, "step": 28385 }, { "entropy": 5.070221567153931, "epoch": 2.7271853986551395, "grad_norm": 1.125, "learning_rate": 0.0004260717085065045, "loss": 4.6986, "mean_token_accuracy": 0.24333804994821548, "num_tokens": 65095405.0, "step": 28390 }, { "entropy": 5.0989728450775145, "epoch": 2.727665706051873, "grad_norm": 0.99609375, "learning_rate": 0.00042604630030184797, "loss": 4.7636, "mean_token_accuracy": 0.23326624184846878, "num_tokens": 65106981.0, "step": 28395 }, { "entropy": 5.173717212677002, "epoch": 2.7281460134486073, "grad_norm": 1.0625, "learning_rate": 0.000426020888590345, "loss": 4.8478, "mean_token_accuracy": 0.22634569704532623, "num_tokens": 65118957.0, "step": 28400 }, { "entropy": 5.059170246124268, "epoch": 2.728626320845341, "grad_norm": 1.2421875, "learning_rate": 0.00042599547337258536, "loss": 4.7339, "mean_token_accuracy": 0.2388172686100006, "num_tokens": 65130897.0, "step": 28405 }, { "entropy": 4.973813772201538, "epoch": 2.729106628242075, "grad_norm": 1.1875, "learning_rate": 0.00042597005464915924, "loss": 4.5451, "mean_token_accuracy": 0.2504707619547844, "num_tokens": 65142828.0, "step": 28410 }, { "entropy": 4.978707313537598, "epoch": 2.729586935638809, "grad_norm": 1.0390625, "learning_rate": 0.00042594463242065674, "loss": 4.6767, "mean_token_accuracy": 0.24309882372617722, "num_tokens": 65154787.0, "step": 28415 }, { "entropy": 5.0573992252349855, "epoch": 2.730067243035543, "grad_norm": 1.03125, "learning_rate": 0.0004259192066876681, "loss": 4.7096, "mean_token_accuracy": 0.2386387825012207, "num_tokens": 65165530.0, "step": 28420 }, { "entropy": 5.140051460266113, "epoch": 2.7305475504322767, "grad_norm": 1.1484375, "learning_rate": 0.00042589377745078354, "loss": 4.829, "mean_token_accuracy": 0.2319121852517128, "num_tokens": 65176809.0, "step": 28425 }, { "entropy": 5.166042423248291, "epoch": 2.7310278578290106, "grad_norm": 1.0625, "learning_rate": 0.00042586834471059366, "loss": 4.8491, "mean_token_accuracy": 0.2317554920911789, "num_tokens": 65189251.0, "step": 28430 }, { "entropy": 5.004992914199829, "epoch": 2.7315081652257445, "grad_norm": 1.0625, "learning_rate": 0.00042584290846768867, "loss": 4.6268, "mean_token_accuracy": 0.2488670364022255, "num_tokens": 65200918.0, "step": 28435 }, { "entropy": 5.097946310043335, "epoch": 2.7319884726224783, "grad_norm": 1.03125, "learning_rate": 0.0004258174687226593, "loss": 4.7161, "mean_token_accuracy": 0.2372460260987282, "num_tokens": 65213283.0, "step": 28440 }, { "entropy": 5.092672300338745, "epoch": 2.7324687800192122, "grad_norm": 1.09375, "learning_rate": 0.0004257920254760962, "loss": 4.7477, "mean_token_accuracy": 0.24076730161905288, "num_tokens": 65223680.0, "step": 28445 }, { "entropy": 5.094247150421142, "epoch": 2.732949087415946, "grad_norm": 1.015625, "learning_rate": 0.0004257665787285899, "loss": 4.7471, "mean_token_accuracy": 0.23675734102725982, "num_tokens": 65234570.0, "step": 28450 }, { "entropy": 5.12308988571167, "epoch": 2.73342939481268, "grad_norm": 1.046875, "learning_rate": 0.00042574112848073147, "loss": 4.7203, "mean_token_accuracy": 0.2399858608841896, "num_tokens": 65246616.0, "step": 28455 }, { "entropy": 5.042817211151123, "epoch": 2.733909702209414, "grad_norm": 1.046875, "learning_rate": 0.00042571567473311157, "loss": 4.7393, "mean_token_accuracy": 0.23980943709611893, "num_tokens": 65257853.0, "step": 28460 }, { "entropy": 5.070425033569336, "epoch": 2.734390009606148, "grad_norm": 1.015625, "learning_rate": 0.0004256902174863213, "loss": 4.7108, "mean_token_accuracy": 0.24079181402921676, "num_tokens": 65268843.0, "step": 28465 }, { "entropy": 5.209898042678833, "epoch": 2.7348703170028816, "grad_norm": 1.0703125, "learning_rate": 0.00042566475674095155, "loss": 4.857, "mean_token_accuracy": 0.225233294069767, "num_tokens": 65279824.0, "step": 28470 }, { "entropy": 5.0555259704589846, "epoch": 2.735350624399616, "grad_norm": 1.0859375, "learning_rate": 0.0004256392924975936, "loss": 4.7751, "mean_token_accuracy": 0.23760847896337509, "num_tokens": 65291318.0, "step": 28475 }, { "entropy": 5.009477043151856, "epoch": 2.7358309317963494, "grad_norm": 1.03125, "learning_rate": 0.00042561382475683854, "loss": 4.7233, "mean_token_accuracy": 0.23941340893507004, "num_tokens": 65302997.0, "step": 28480 }, { "entropy": 5.039452934265137, "epoch": 2.7363112391930837, "grad_norm": 1.3125, "learning_rate": 0.0004255883535192777, "loss": 4.6862, "mean_token_accuracy": 0.2484783872961998, "num_tokens": 65314336.0, "step": 28485 }, { "entropy": 5.114763593673706, "epoch": 2.7367915465898176, "grad_norm": 1.078125, "learning_rate": 0.0004255628787855025, "loss": 4.7705, "mean_token_accuracy": 0.23738002330064772, "num_tokens": 65325138.0, "step": 28490 }, { "entropy": 5.137652969360351, "epoch": 2.7372718539865515, "grad_norm": 1.140625, "learning_rate": 0.0004255374005561043, "loss": 4.7312, "mean_token_accuracy": 0.23959817737340927, "num_tokens": 65337067.0, "step": 28495 }, { "entropy": 5.104769468307495, "epoch": 2.7377521613832854, "grad_norm": 1.15625, "learning_rate": 0.00042551191883167464, "loss": 4.7638, "mean_token_accuracy": 0.23433667719364165, "num_tokens": 65349598.0, "step": 28500 }, { "entropy": 5.133802175521851, "epoch": 2.7382324687800192, "grad_norm": 1.140625, "learning_rate": 0.0004254864336128052, "loss": 4.8134, "mean_token_accuracy": 0.2284764528274536, "num_tokens": 65361052.0, "step": 28505 }, { "entropy": 5.139010906219482, "epoch": 2.738712776176753, "grad_norm": 1.03125, "learning_rate": 0.00042546094490008765, "loss": 4.8087, "mean_token_accuracy": 0.23835351914167405, "num_tokens": 65373246.0, "step": 28510 }, { "entropy": 5.076985883712768, "epoch": 2.739193083573487, "grad_norm": 1.15625, "learning_rate": 0.0004254354526941136, "loss": 4.6916, "mean_token_accuracy": 0.24306633770465852, "num_tokens": 65384349.0, "step": 28515 }, { "entropy": 4.981937980651855, "epoch": 2.739673390970221, "grad_norm": 1.0234375, "learning_rate": 0.0004254099569954751, "loss": 4.6957, "mean_token_accuracy": 0.23665003925561906, "num_tokens": 65396241.0, "step": 28520 }, { "entropy": 5.017926216125488, "epoch": 2.7401536983669548, "grad_norm": 1.046875, "learning_rate": 0.0004253844578047641, "loss": 4.77, "mean_token_accuracy": 0.23209561556577682, "num_tokens": 65408583.0, "step": 28525 }, { "entropy": 5.111822986602784, "epoch": 2.7406340057636887, "grad_norm": 1.0703125, "learning_rate": 0.0004253589551225725, "loss": 4.6833, "mean_token_accuracy": 0.2423287332057953, "num_tokens": 65419845.0, "step": 28530 }, { "entropy": 5.136012268066406, "epoch": 2.7411143131604225, "grad_norm": 1.0078125, "learning_rate": 0.00042533344894949245, "loss": 4.7398, "mean_token_accuracy": 0.230571748316288, "num_tokens": 65433790.0, "step": 28535 }, { "entropy": 5.122740936279297, "epoch": 2.7415946205571564, "grad_norm": 0.9921875, "learning_rate": 0.00042530793928611605, "loss": 4.801, "mean_token_accuracy": 0.23554478138685225, "num_tokens": 65445958.0, "step": 28540 }, { "entropy": 5.0635027408599855, "epoch": 2.7420749279538903, "grad_norm": 1.015625, "learning_rate": 0.0004252824261330357, "loss": 4.7747, "mean_token_accuracy": 0.23710028380155562, "num_tokens": 65457973.0, "step": 28545 }, { "entropy": 5.090057420730591, "epoch": 2.7425552353506246, "grad_norm": 1.1484375, "learning_rate": 0.00042525690949084364, "loss": 4.7557, "mean_token_accuracy": 0.2368753135204315, "num_tokens": 65470245.0, "step": 28550 }, { "entropy": 5.017141246795655, "epoch": 2.743035542747358, "grad_norm": 1.0703125, "learning_rate": 0.00042523138936013233, "loss": 4.7262, "mean_token_accuracy": 0.24332302957773208, "num_tokens": 65482541.0, "step": 28555 }, { "entropy": 5.177900648117065, "epoch": 2.7435158501440924, "grad_norm": 1.0546875, "learning_rate": 0.00042520586574149423, "loss": 4.8372, "mean_token_accuracy": 0.22966494411230087, "num_tokens": 65494499.0, "step": 28560 }, { "entropy": 5.228618240356445, "epoch": 2.7439961575408263, "grad_norm": 1.1875, "learning_rate": 0.00042518033863552185, "loss": 4.8589, "mean_token_accuracy": 0.22425288259983062, "num_tokens": 65506377.0, "step": 28565 }, { "entropy": 5.14985466003418, "epoch": 2.74447646493756, "grad_norm": 1.0703125, "learning_rate": 0.0004251548080428081, "loss": 4.7947, "mean_token_accuracy": 0.23412707149982454, "num_tokens": 65517476.0, "step": 28570 }, { "entropy": 5.153133773803711, "epoch": 2.744956772334294, "grad_norm": 1.046875, "learning_rate": 0.0004251292739639455, "loss": 4.7353, "mean_token_accuracy": 0.23334655314683914, "num_tokens": 65529829.0, "step": 28575 }, { "entropy": 4.9859757900238035, "epoch": 2.745437079731028, "grad_norm": 1.0859375, "learning_rate": 0.00042510373639952694, "loss": 4.6211, "mean_token_accuracy": 0.24574377238750458, "num_tokens": 65541030.0, "step": 28580 }, { "entropy": 5.0365828514099125, "epoch": 2.745917387127762, "grad_norm": 1.03125, "learning_rate": 0.00042507819535014547, "loss": 4.7371, "mean_token_accuracy": 0.23719628006219864, "num_tokens": 65551727.0, "step": 28585 }, { "entropy": 5.021198701858521, "epoch": 2.7463976945244957, "grad_norm": 1.0625, "learning_rate": 0.00042505265081639376, "loss": 4.7006, "mean_token_accuracy": 0.24776863306760788, "num_tokens": 65563085.0, "step": 28590 }, { "entropy": 5.107274675369263, "epoch": 2.7468780019212296, "grad_norm": 1.0390625, "learning_rate": 0.0004250271027988652, "loss": 4.7229, "mean_token_accuracy": 0.23755284249782563, "num_tokens": 65574909.0, "step": 28595 }, { "entropy": 5.060042953491211, "epoch": 2.7473583093179634, "grad_norm": 1.046875, "learning_rate": 0.00042500155129815274, "loss": 4.7173, "mean_token_accuracy": 0.24350056499242784, "num_tokens": 65585515.0, "step": 28600 }, { "entropy": 5.024354028701782, "epoch": 2.7478386167146973, "grad_norm": 1.0546875, "learning_rate": 0.00042497599631484965, "loss": 4.6989, "mean_token_accuracy": 0.23446216583251953, "num_tokens": 65598160.0, "step": 28605 }, { "entropy": 5.121377420425415, "epoch": 2.748318924111431, "grad_norm": 1.0390625, "learning_rate": 0.00042495043784954926, "loss": 4.7677, "mean_token_accuracy": 0.23277996033430098, "num_tokens": 65609886.0, "step": 28610 }, { "entropy": 5.088078260421753, "epoch": 2.748799231508165, "grad_norm": 1.0546875, "learning_rate": 0.000424924875902845, "loss": 4.7246, "mean_token_accuracy": 0.23480461686849594, "num_tokens": 65620256.0, "step": 28615 }, { "entropy": 5.12593822479248, "epoch": 2.749279538904899, "grad_norm": 1.0546875, "learning_rate": 0.0004248993104753303, "loss": 4.7698, "mean_token_accuracy": 0.23956041187047958, "num_tokens": 65631108.0, "step": 28620 }, { "entropy": 5.117194652557373, "epoch": 2.7497598463016333, "grad_norm": 1.1171875, "learning_rate": 0.0004248737415675987, "loss": 4.7618, "mean_token_accuracy": 0.23265804052352906, "num_tokens": 65642423.0, "step": 28625 }, { "entropy": 5.039339065551758, "epoch": 2.7502401536983667, "grad_norm": 1.0390625, "learning_rate": 0.0004248481691802439, "loss": 4.7433, "mean_token_accuracy": 0.23624941408634187, "num_tokens": 65654105.0, "step": 28630 }, { "entropy": 5.145820808410645, "epoch": 2.750720461095101, "grad_norm": 1.09375, "learning_rate": 0.0004248225933138595, "loss": 4.8094, "mean_token_accuracy": 0.23825272619724275, "num_tokens": 65664738.0, "step": 28635 }, { "entropy": 5.058314323425293, "epoch": 2.751200768491835, "grad_norm": 1.0546875, "learning_rate": 0.00042479701396903945, "loss": 4.6796, "mean_token_accuracy": 0.2405511423945427, "num_tokens": 65676815.0, "step": 28640 }, { "entropy": 5.125538015365601, "epoch": 2.751681075888569, "grad_norm": 1.1953125, "learning_rate": 0.0004247714311463775, "loss": 4.772, "mean_token_accuracy": 0.23657451570034027, "num_tokens": 65687234.0, "step": 28645 }, { "entropy": 5.122481727600098, "epoch": 2.7521613832853027, "grad_norm": 1.109375, "learning_rate": 0.00042474584484646766, "loss": 4.8587, "mean_token_accuracy": 0.22422019988298417, "num_tokens": 65700356.0, "step": 28650 }, { "entropy": 5.0451537609100345, "epoch": 2.7526416906820366, "grad_norm": 1.078125, "learning_rate": 0.0004247202550699039, "loss": 4.6113, "mean_token_accuracy": 0.24570255875587463, "num_tokens": 65711320.0, "step": 28655 }, { "entropy": 5.073936176300049, "epoch": 2.7531219980787704, "grad_norm": 1.125, "learning_rate": 0.0004246946618172805, "loss": 4.7644, "mean_token_accuracy": 0.23449680656194688, "num_tokens": 65722461.0, "step": 28660 }, { "entropy": 4.981145191192627, "epoch": 2.7536023054755043, "grad_norm": 1.09375, "learning_rate": 0.0004246690650891915, "loss": 4.6224, "mean_token_accuracy": 0.24322707056999207, "num_tokens": 65732311.0, "step": 28665 }, { "entropy": 5.082883024215699, "epoch": 2.754082612872238, "grad_norm": 1.046875, "learning_rate": 0.0004246434648862312, "loss": 4.7577, "mean_token_accuracy": 0.242487533390522, "num_tokens": 65743830.0, "step": 28670 }, { "entropy": 5.124186706542969, "epoch": 2.754562920268972, "grad_norm": 1.078125, "learning_rate": 0.0004246178612089941, "loss": 4.8113, "mean_token_accuracy": 0.2254979908466339, "num_tokens": 65754851.0, "step": 28675 }, { "entropy": 5.125278568267822, "epoch": 2.755043227665706, "grad_norm": 1.2109375, "learning_rate": 0.0004245922540580744, "loss": 4.787, "mean_token_accuracy": 0.2342199668288231, "num_tokens": 65765228.0, "step": 28680 }, { "entropy": 5.126844644546509, "epoch": 2.75552353506244, "grad_norm": 1.2578125, "learning_rate": 0.0004245666434340668, "loss": 4.8303, "mean_token_accuracy": 0.22773058861494064, "num_tokens": 65775334.0, "step": 28685 }, { "entropy": 4.998766899108887, "epoch": 2.7560038424591737, "grad_norm": 1.0234375, "learning_rate": 0.0004245410293375659, "loss": 4.6538, "mean_token_accuracy": 0.2525763615965843, "num_tokens": 65786256.0, "step": 28690 }, { "entropy": 5.094659471511841, "epoch": 2.7564841498559076, "grad_norm": 1.0859375, "learning_rate": 0.0004245154117691664, "loss": 4.7088, "mean_token_accuracy": 0.24570492506027222, "num_tokens": 65798831.0, "step": 28695 }, { "entropy": 5.041133260726928, "epoch": 2.756964457252642, "grad_norm": 1.1640625, "learning_rate": 0.0004244897907294628, "loss": 4.7536, "mean_token_accuracy": 0.2359408512711525, "num_tokens": 65810256.0, "step": 28700 }, { "entropy": 5.226623058319092, "epoch": 2.7574447646493754, "grad_norm": 1.0625, "learning_rate": 0.0004244641662190504, "loss": 4.8904, "mean_token_accuracy": 0.2273922637104988, "num_tokens": 65821752.0, "step": 28705 }, { "entropy": 5.13972659111023, "epoch": 2.7579250720461097, "grad_norm": 1.0625, "learning_rate": 0.00042443853823852376, "loss": 4.793, "mean_token_accuracy": 0.22990813702344895, "num_tokens": 65833056.0, "step": 28710 }, { "entropy": 5.105608987808227, "epoch": 2.7584053794428436, "grad_norm": 1.078125, "learning_rate": 0.000424412906788478, "loss": 4.6967, "mean_token_accuracy": 0.23675004094839097, "num_tokens": 65845081.0, "step": 28715 }, { "entropy": 5.074363422393799, "epoch": 2.7588856868395775, "grad_norm": 1.2578125, "learning_rate": 0.0004243872718695082, "loss": 4.7417, "mean_token_accuracy": 0.23619790077209474, "num_tokens": 65856024.0, "step": 28720 }, { "entropy": 5.0468220710754395, "epoch": 2.7593659942363113, "grad_norm": 1.1171875, "learning_rate": 0.00042436163348220956, "loss": 4.6461, "mean_token_accuracy": 0.25140986293554307, "num_tokens": 65868282.0, "step": 28725 }, { "entropy": 4.991868495941162, "epoch": 2.7598463016330452, "grad_norm": 1.09375, "learning_rate": 0.0004243359916271773, "loss": 4.6913, "mean_token_accuracy": 0.24277271181344987, "num_tokens": 65879835.0, "step": 28730 }, { "entropy": 5.024890518188476, "epoch": 2.760326609029779, "grad_norm": 1.0546875, "learning_rate": 0.0004243103463050067, "loss": 4.6609, "mean_token_accuracy": 0.2428322196006775, "num_tokens": 65890916.0, "step": 28735 }, { "entropy": 5.154018449783325, "epoch": 2.760806916426513, "grad_norm": 1.0390625, "learning_rate": 0.0004242846975162933, "loss": 4.7635, "mean_token_accuracy": 0.23299016058444977, "num_tokens": 65901716.0, "step": 28740 }, { "entropy": 5.115095472335815, "epoch": 2.761287223823247, "grad_norm": 1.0078125, "learning_rate": 0.00042425904526163246, "loss": 4.8004, "mean_token_accuracy": 0.23285145312547684, "num_tokens": 65914366.0, "step": 28745 }, { "entropy": 5.145034217834473, "epoch": 2.7617675312199808, "grad_norm": 1.1015625, "learning_rate": 0.0004242333895416198, "loss": 4.9146, "mean_token_accuracy": 0.2228606328368187, "num_tokens": 65926396.0, "step": 28750 }, { "entropy": 5.1206944465637205, "epoch": 2.7622478386167146, "grad_norm": 0.9921875, "learning_rate": 0.000424207730356851, "loss": 4.7526, "mean_token_accuracy": 0.23485395759344102, "num_tokens": 65939200.0, "step": 28755 }, { "entropy": 5.132875204086304, "epoch": 2.7627281460134485, "grad_norm": 1.1328125, "learning_rate": 0.0004241820677079218, "loss": 4.7511, "mean_token_accuracy": 0.23293102085590361, "num_tokens": 65948978.0, "step": 28760 }, { "entropy": 5.078152847290039, "epoch": 2.7632084534101824, "grad_norm": 1.0703125, "learning_rate": 0.00042415640159542783, "loss": 4.6865, "mean_token_accuracy": 0.2410816565155983, "num_tokens": 65959661.0, "step": 28765 }, { "entropy": 4.9896392822265625, "epoch": 2.7636887608069163, "grad_norm": 1.0546875, "learning_rate": 0.0004241307320199652, "loss": 4.6194, "mean_token_accuracy": 0.2544292494654655, "num_tokens": 65970208.0, "step": 28770 }, { "entropy": 5.10506763458252, "epoch": 2.7641690682036506, "grad_norm": 1.1171875, "learning_rate": 0.0004241050589821298, "loss": 4.7439, "mean_token_accuracy": 0.23854973167181015, "num_tokens": 65982606.0, "step": 28775 }, { "entropy": 5.0196840286254885, "epoch": 2.764649375600384, "grad_norm": 1.0234375, "learning_rate": 0.0004240793824825177, "loss": 4.6577, "mean_token_accuracy": 0.2405321404337883, "num_tokens": 65993984.0, "step": 28780 }, { "entropy": 5.08947868347168, "epoch": 2.7651296829971184, "grad_norm": 1.0703125, "learning_rate": 0.00042405370252172496, "loss": 4.7187, "mean_token_accuracy": 0.23867221027612687, "num_tokens": 66004740.0, "step": 28785 }, { "entropy": 5.052742671966553, "epoch": 2.765609990393852, "grad_norm": 0.99609375, "learning_rate": 0.0004240280191003479, "loss": 4.7952, "mean_token_accuracy": 0.23609665632247925, "num_tokens": 66019265.0, "step": 28790 }, { "entropy": 5.046192693710327, "epoch": 2.766090297790586, "grad_norm": 1.09375, "learning_rate": 0.0004240023322189828, "loss": 4.705, "mean_token_accuracy": 0.23975181132555007, "num_tokens": 66030434.0, "step": 28795 }, { "entropy": 5.129871892929077, "epoch": 2.76657060518732, "grad_norm": 1.125, "learning_rate": 0.0004239766418782258, "loss": 4.7147, "mean_token_accuracy": 0.2322618395090103, "num_tokens": 66041100.0, "step": 28800 }, { "entropy": 5.0403828620910645, "epoch": 2.767050912584054, "grad_norm": 1.125, "learning_rate": 0.0004239509480786737, "loss": 4.7009, "mean_token_accuracy": 0.2370862916111946, "num_tokens": 66053046.0, "step": 28805 }, { "entropy": 5.147002124786377, "epoch": 2.7675312199807878, "grad_norm": 1.0859375, "learning_rate": 0.00042392525082092286, "loss": 4.8408, "mean_token_accuracy": 0.22971219569444656, "num_tokens": 66063508.0, "step": 28810 }, { "entropy": 5.126589155197143, "epoch": 2.7680115273775217, "grad_norm": 1.078125, "learning_rate": 0.0004238995501055699, "loss": 4.845, "mean_token_accuracy": 0.22856855392456055, "num_tokens": 66076135.0, "step": 28815 }, { "entropy": 5.110952854156494, "epoch": 2.7684918347742555, "grad_norm": 1.03125, "learning_rate": 0.0004238738459332115, "loss": 4.79, "mean_token_accuracy": 0.23187243789434434, "num_tokens": 66087565.0, "step": 28820 }, { "entropy": 5.137646675109863, "epoch": 2.7689721421709894, "grad_norm": 1.109375, "learning_rate": 0.0004238481383044445, "loss": 4.7791, "mean_token_accuracy": 0.2414294421672821, "num_tokens": 66099430.0, "step": 28825 }, { "entropy": 5.0425450801849365, "epoch": 2.7694524495677233, "grad_norm": 1.0, "learning_rate": 0.00042382242721986573, "loss": 4.6932, "mean_token_accuracy": 0.23897880762815477, "num_tokens": 66111060.0, "step": 28830 }, { "entropy": 4.9416584968566895, "epoch": 2.769932756964457, "grad_norm": 0.98828125, "learning_rate": 0.00042379671268007207, "loss": 4.643, "mean_token_accuracy": 0.2510156065225601, "num_tokens": 66123367.0, "step": 28835 }, { "entropy": 5.090784740447998, "epoch": 2.770413064361191, "grad_norm": 1.109375, "learning_rate": 0.0004237709946856607, "loss": 4.7071, "mean_token_accuracy": 0.24626193791627884, "num_tokens": 66134198.0, "step": 28840 }, { "entropy": 5.100472450256348, "epoch": 2.770893371757925, "grad_norm": 1.0, "learning_rate": 0.00042374527323722836, "loss": 4.7161, "mean_token_accuracy": 0.23950215280056, "num_tokens": 66145991.0, "step": 28845 }, { "entropy": 5.112044715881348, "epoch": 2.771373679154659, "grad_norm": 1.0703125, "learning_rate": 0.00042371954833537263, "loss": 4.7642, "mean_token_accuracy": 0.2365383803844452, "num_tokens": 66157447.0, "step": 28850 }, { "entropy": 5.01291127204895, "epoch": 2.7718539865513927, "grad_norm": 1.0, "learning_rate": 0.00042369381998069055, "loss": 4.7748, "mean_token_accuracy": 0.24323766380548478, "num_tokens": 66169259.0, "step": 28855 }, { "entropy": 5.067882633209228, "epoch": 2.772334293948127, "grad_norm": 1.0078125, "learning_rate": 0.0004236680881737795, "loss": 4.7304, "mean_token_accuracy": 0.23722611963748932, "num_tokens": 66181221.0, "step": 28860 }, { "entropy": 5.1237670421600345, "epoch": 2.7728146013448605, "grad_norm": 1.0078125, "learning_rate": 0.0004236423529152369, "loss": 4.7753, "mean_token_accuracy": 0.22963873594999312, "num_tokens": 66193513.0, "step": 28865 }, { "entropy": 5.064371585845947, "epoch": 2.773294908741595, "grad_norm": 1.0546875, "learning_rate": 0.0004236166142056602, "loss": 4.6319, "mean_token_accuracy": 0.246799498796463, "num_tokens": 66204951.0, "step": 28870 }, { "entropy": 4.997515344619751, "epoch": 2.7737752161383287, "grad_norm": 1.1328125, "learning_rate": 0.0004235908720456471, "loss": 4.7317, "mean_token_accuracy": 0.2407101422548294, "num_tokens": 66215845.0, "step": 28875 }, { "entropy": 5.101406002044678, "epoch": 2.7742555235350626, "grad_norm": 1.15625, "learning_rate": 0.0004235651264357951, "loss": 4.8379, "mean_token_accuracy": 0.23241375535726547, "num_tokens": 66226510.0, "step": 28880 }, { "entropy": 5.141011476516724, "epoch": 2.7747358309317964, "grad_norm": 1.046875, "learning_rate": 0.00042353937737670206, "loss": 4.7585, "mean_token_accuracy": 0.2364454001188278, "num_tokens": 66238134.0, "step": 28885 }, { "entropy": 5.10180025100708, "epoch": 2.7752161383285303, "grad_norm": 1.1328125, "learning_rate": 0.0004235136248689658, "loss": 4.7388, "mean_token_accuracy": 0.23604300916194915, "num_tokens": 66250526.0, "step": 28890 }, { "entropy": 5.133022356033325, "epoch": 2.775696445725264, "grad_norm": 1.0390625, "learning_rate": 0.0004234878689131841, "loss": 4.7886, "mean_token_accuracy": 0.235670568048954, "num_tokens": 66262144.0, "step": 28895 }, { "entropy": 5.164633417129517, "epoch": 2.776176753121998, "grad_norm": 1.0390625, "learning_rate": 0.000423462109509955, "loss": 4.8015, "mean_token_accuracy": 0.22956809103488923, "num_tokens": 66273262.0, "step": 28900 }, { "entropy": 5.0345587730407715, "epoch": 2.776657060518732, "grad_norm": 1.0234375, "learning_rate": 0.0004234363466598765, "loss": 4.6986, "mean_token_accuracy": 0.2430693671107292, "num_tokens": 66284127.0, "step": 28905 }, { "entropy": 5.019112014770508, "epoch": 2.777137367915466, "grad_norm": 1.1015625, "learning_rate": 0.00042341058036354687, "loss": 4.7241, "mean_token_accuracy": 0.23520620614290239, "num_tokens": 66295092.0, "step": 28910 }, { "entropy": 4.953162574768067, "epoch": 2.7776176753121997, "grad_norm": 1.09375, "learning_rate": 0.00042338481062156424, "loss": 4.5774, "mean_token_accuracy": 0.24913661181926727, "num_tokens": 66306050.0, "step": 28915 }, { "entropy": 4.981451845169067, "epoch": 2.7780979827089336, "grad_norm": 1.0703125, "learning_rate": 0.00042335903743452694, "loss": 4.6705, "mean_token_accuracy": 0.2409507527947426, "num_tokens": 66317186.0, "step": 28920 }, { "entropy": 5.073168516159058, "epoch": 2.7785782901056675, "grad_norm": 1.046875, "learning_rate": 0.0004233332608030333, "loss": 4.7515, "mean_token_accuracy": 0.23477090448141097, "num_tokens": 66328486.0, "step": 28925 }, { "entropy": 5.048687314987182, "epoch": 2.7790585975024014, "grad_norm": 1.03125, "learning_rate": 0.00042330748072768183, "loss": 4.6605, "mean_token_accuracy": 0.24403852671384813, "num_tokens": 66340649.0, "step": 28930 }, { "entropy": 5.113717126846313, "epoch": 2.7795389048991357, "grad_norm": 0.9453125, "learning_rate": 0.000423281697209071, "loss": 4.7913, "mean_token_accuracy": 0.22693513035774232, "num_tokens": 66353555.0, "step": 28935 }, { "entropy": 5.196904087066651, "epoch": 2.780019212295869, "grad_norm": 1.0390625, "learning_rate": 0.0004232559102477995, "loss": 4.8463, "mean_token_accuracy": 0.22786442339420318, "num_tokens": 66365445.0, "step": 28940 }, { "entropy": 5.024850273132325, "epoch": 2.7804995196926034, "grad_norm": 1.109375, "learning_rate": 0.000423230119844466, "loss": 4.6531, "mean_token_accuracy": 0.24638040065765382, "num_tokens": 66377203.0, "step": 28945 }, { "entropy": 5.088182783126831, "epoch": 2.7809798270893373, "grad_norm": 1.0859375, "learning_rate": 0.0004232043259996692, "loss": 4.7946, "mean_token_accuracy": 0.2338147297501564, "num_tokens": 66388637.0, "step": 28950 }, { "entropy": 5.0920398235321045, "epoch": 2.781460134486071, "grad_norm": 1.09375, "learning_rate": 0.0004231785287140081, "loss": 4.7654, "mean_token_accuracy": 0.2338176667690277, "num_tokens": 66400470.0, "step": 28955 }, { "entropy": 5.170683860778809, "epoch": 2.781940441882805, "grad_norm": 1.0234375, "learning_rate": 0.0004231527279880816, "loss": 4.788, "mean_token_accuracy": 0.2422279790043831, "num_tokens": 66411079.0, "step": 28960 }, { "entropy": 5.074834871292114, "epoch": 2.782420749279539, "grad_norm": 1.0625, "learning_rate": 0.0004231269238224885, "loss": 4.7482, "mean_token_accuracy": 0.2362466499209404, "num_tokens": 66423706.0, "step": 28965 }, { "entropy": 4.980147123336792, "epoch": 2.782901056676273, "grad_norm": 1.171875, "learning_rate": 0.0004231011162178282, "loss": 4.6911, "mean_token_accuracy": 0.2411831110715866, "num_tokens": 66434994.0, "step": 28970 }, { "entropy": 5.05233211517334, "epoch": 2.7833813640730067, "grad_norm": 1.0546875, "learning_rate": 0.0004230753051746998, "loss": 4.6128, "mean_token_accuracy": 0.24962817281484603, "num_tokens": 66446474.0, "step": 28975 }, { "entropy": 5.07179479598999, "epoch": 2.7838616714697406, "grad_norm": 1.0703125, "learning_rate": 0.00042304949069370246, "loss": 4.7032, "mean_token_accuracy": 0.2386288583278656, "num_tokens": 66457633.0, "step": 28980 }, { "entropy": 5.0545814514160154, "epoch": 2.7843419788664745, "grad_norm": 1.109375, "learning_rate": 0.00042302367277543553, "loss": 4.7138, "mean_token_accuracy": 0.2398442029953003, "num_tokens": 66468160.0, "step": 28985 }, { "entropy": 5.078740501403809, "epoch": 2.7848222862632084, "grad_norm": 1.1640625, "learning_rate": 0.00042299785142049855, "loss": 4.7339, "mean_token_accuracy": 0.23528432250022888, "num_tokens": 66478609.0, "step": 28990 }, { "entropy": 5.0767899513244625, "epoch": 2.7853025936599423, "grad_norm": 1.125, "learning_rate": 0.0004229720266294908, "loss": 4.7871, "mean_token_accuracy": 0.23646434545516967, "num_tokens": 66489992.0, "step": 28995 }, { "entropy": 5.17231707572937, "epoch": 2.785782901056676, "grad_norm": 1.109375, "learning_rate": 0.000422946198403012, "loss": 4.8007, "mean_token_accuracy": 0.2307385966181755, "num_tokens": 66501331.0, "step": 29000 }, { "entropy": 5.195744562149048, "epoch": 2.78626320845341, "grad_norm": 1.0859375, "learning_rate": 0.0004229203667416619, "loss": 4.7932, "mean_token_accuracy": 0.23550139963626862, "num_tokens": 66513336.0, "step": 29005 }, { "entropy": 5.071753883361817, "epoch": 2.7867435158501443, "grad_norm": 1.1015625, "learning_rate": 0.00042289453164604, "loss": 4.6473, "mean_token_accuracy": 0.249504953622818, "num_tokens": 66523577.0, "step": 29010 }, { "entropy": 5.017402505874633, "epoch": 2.787223823246878, "grad_norm": 1.078125, "learning_rate": 0.0004228686931167463, "loss": 4.7436, "mean_token_accuracy": 0.2307626038789749, "num_tokens": 66536181.0, "step": 29015 }, { "entropy": 5.065583562850952, "epoch": 2.787704130643612, "grad_norm": 1.046875, "learning_rate": 0.0004228428511543806, "loss": 4.7102, "mean_token_accuracy": 0.23388897031545638, "num_tokens": 66546881.0, "step": 29020 }, { "entropy": 5.175033617019653, "epoch": 2.7881844380403455, "grad_norm": 1.0390625, "learning_rate": 0.00042281700575954283, "loss": 4.8451, "mean_token_accuracy": 0.2391469433903694, "num_tokens": 66559091.0, "step": 29025 }, { "entropy": 5.142588186264038, "epoch": 2.78866474543708, "grad_norm": 1.2265625, "learning_rate": 0.0004227911569328332, "loss": 4.8233, "mean_token_accuracy": 0.22944566160440444, "num_tokens": 66570584.0, "step": 29030 }, { "entropy": 5.087484645843506, "epoch": 2.7891450528338138, "grad_norm": 0.98828125, "learning_rate": 0.0004227653046748517, "loss": 4.7233, "mean_token_accuracy": 0.2394936978816986, "num_tokens": 66581921.0, "step": 29035 }, { "entropy": 5.0868641376495365, "epoch": 2.7896253602305476, "grad_norm": 1.0234375, "learning_rate": 0.00042273944898619864, "loss": 4.835, "mean_token_accuracy": 0.23168757557868958, "num_tokens": 66595132.0, "step": 29040 }, { "entropy": 5.173955488204956, "epoch": 2.7901056676272815, "grad_norm": 0.984375, "learning_rate": 0.00042271358986747427, "loss": 4.7861, "mean_token_accuracy": 0.23523377031087875, "num_tokens": 66607212.0, "step": 29045 }, { "entropy": 5.135601615905761, "epoch": 2.7905859750240154, "grad_norm": 0.9453125, "learning_rate": 0.00042268772731927895, "loss": 4.748, "mean_token_accuracy": 0.24023500680923462, "num_tokens": 66619502.0, "step": 29050 }, { "entropy": 5.078439521789551, "epoch": 2.7910662824207493, "grad_norm": 1.0859375, "learning_rate": 0.00042266186134221317, "loss": 4.7932, "mean_token_accuracy": 0.22847750633955002, "num_tokens": 66631813.0, "step": 29055 }, { "entropy": 5.136983346939087, "epoch": 2.791546589817483, "grad_norm": 1.0859375, "learning_rate": 0.0004226359919368774, "loss": 4.8381, "mean_token_accuracy": 0.22895507216453553, "num_tokens": 66643879.0, "step": 29060 }, { "entropy": 5.174340677261353, "epoch": 2.792026897214217, "grad_norm": 1.0859375, "learning_rate": 0.00042261011910387224, "loss": 4.8108, "mean_token_accuracy": 0.23358631283044815, "num_tokens": 66656819.0, "step": 29065 }, { "entropy": 5.0694104671478275, "epoch": 2.792507204610951, "grad_norm": 0.9921875, "learning_rate": 0.0004225842428437985, "loss": 4.6674, "mean_token_accuracy": 0.24253317564725876, "num_tokens": 66668745.0, "step": 29070 }, { "entropy": 5.050758314132691, "epoch": 2.792987512007685, "grad_norm": 1.015625, "learning_rate": 0.00042255836315725694, "loss": 4.7219, "mean_token_accuracy": 0.24155887365341186, "num_tokens": 66681620.0, "step": 29075 }, { "entropy": 5.057884931564331, "epoch": 2.7934678194044187, "grad_norm": 1.109375, "learning_rate": 0.0004225324800448483, "loss": 4.6954, "mean_token_accuracy": 0.2354402020573616, "num_tokens": 66692232.0, "step": 29080 }, { "entropy": 5.067146492004395, "epoch": 2.793948126801153, "grad_norm": 1.046875, "learning_rate": 0.00042250659350717343, "loss": 4.6955, "mean_token_accuracy": 0.24201476722955703, "num_tokens": 66703401.0, "step": 29085 }, { "entropy": 4.958551979064941, "epoch": 2.7944284341978864, "grad_norm": 1.0546875, "learning_rate": 0.00042248070354483354, "loss": 4.6304, "mean_token_accuracy": 0.24315544962882996, "num_tokens": 66714066.0, "step": 29090 }, { "entropy": 4.9515398979187015, "epoch": 2.7949087415946208, "grad_norm": 1.1640625, "learning_rate": 0.0004224548101584297, "loss": 4.5899, "mean_token_accuracy": 0.2479192927479744, "num_tokens": 66724504.0, "step": 29095 }, { "entropy": 5.094303226470947, "epoch": 2.795389048991354, "grad_norm": 1.046875, "learning_rate": 0.000422428913348563, "loss": 4.7101, "mean_token_accuracy": 0.23548691868782043, "num_tokens": 66735772.0, "step": 29100 }, { "entropy": 5.090736150741577, "epoch": 2.7958693563880885, "grad_norm": 1.765625, "learning_rate": 0.0004224030131158346, "loss": 4.7576, "mean_token_accuracy": 0.2333929643034935, "num_tokens": 66747956.0, "step": 29105 }, { "entropy": 5.064262390136719, "epoch": 2.7963496637848224, "grad_norm": 1.09375, "learning_rate": 0.0004223771094608461, "loss": 4.7749, "mean_token_accuracy": 0.23554529398679733, "num_tokens": 66759112.0, "step": 29110 }, { "entropy": 5.056329822540283, "epoch": 2.7968299711815563, "grad_norm": 1.078125, "learning_rate": 0.0004223512023841986, "loss": 4.7125, "mean_token_accuracy": 0.2383354589343071, "num_tokens": 66770927.0, "step": 29115 }, { "entropy": 5.102386045455932, "epoch": 2.79731027857829, "grad_norm": 1.09375, "learning_rate": 0.00042232529188649374, "loss": 4.819, "mean_token_accuracy": 0.2361222356557846, "num_tokens": 66781809.0, "step": 29120 }, { "entropy": 5.017792701721191, "epoch": 2.797790585975024, "grad_norm": 1.1328125, "learning_rate": 0.0004222993779683331, "loss": 4.697, "mean_token_accuracy": 0.23675900995731353, "num_tokens": 66793495.0, "step": 29125 }, { "entropy": 5.044356393814087, "epoch": 2.798270893371758, "grad_norm": 1.046875, "learning_rate": 0.00042227346063031837, "loss": 4.7142, "mean_token_accuracy": 0.24150240570306777, "num_tokens": 66804568.0, "step": 29130 }, { "entropy": 5.061636066436767, "epoch": 2.798751200768492, "grad_norm": 1.0390625, "learning_rate": 0.0004222475398730511, "loss": 4.6495, "mean_token_accuracy": 0.24463913440704346, "num_tokens": 66816698.0, "step": 29135 }, { "entropy": 5.104371786117554, "epoch": 2.7992315081652257, "grad_norm": 0.95703125, "learning_rate": 0.0004222216156971332, "loss": 4.7831, "mean_token_accuracy": 0.23421378880739213, "num_tokens": 66829299.0, "step": 29140 }, { "entropy": 4.958194351196289, "epoch": 2.7997118155619596, "grad_norm": 1.0546875, "learning_rate": 0.00042219568810316656, "loss": 4.589, "mean_token_accuracy": 0.2420770525932312, "num_tokens": 66839871.0, "step": 29145 }, { "entropy": 5.1890904903411865, "epoch": 2.8001921229586935, "grad_norm": 1.0859375, "learning_rate": 0.0004221697570917531, "loss": 4.7849, "mean_token_accuracy": 0.2348181426525116, "num_tokens": 66850368.0, "step": 29150 }, { "entropy": 5.111753463745117, "epoch": 2.8006724303554273, "grad_norm": 1.09375, "learning_rate": 0.000422143822663495, "loss": 4.7684, "mean_token_accuracy": 0.235346058011055, "num_tokens": 66861067.0, "step": 29155 }, { "entropy": 5.082866239547729, "epoch": 2.8011527377521612, "grad_norm": 1.0078125, "learning_rate": 0.0004221178848189941, "loss": 4.8492, "mean_token_accuracy": 0.22439933717250823, "num_tokens": 66873841.0, "step": 29160 }, { "entropy": 5.106626224517822, "epoch": 2.801633045148895, "grad_norm": 1.1171875, "learning_rate": 0.00042209194355885283, "loss": 4.7841, "mean_token_accuracy": 0.22784036695957183, "num_tokens": 66885042.0, "step": 29165 }, { "entropy": 5.114370965957642, "epoch": 2.8021133525456294, "grad_norm": 1.125, "learning_rate": 0.0004220659988836734, "loss": 4.7594, "mean_token_accuracy": 0.23598769903182984, "num_tokens": 66897076.0, "step": 29170 }, { "entropy": 5.031212520599365, "epoch": 2.802593659942363, "grad_norm": 1.0625, "learning_rate": 0.0004220400507940582, "loss": 4.6507, "mean_token_accuracy": 0.23573495745658873, "num_tokens": 66907879.0, "step": 29175 }, { "entropy": 5.082202768325805, "epoch": 2.803073967339097, "grad_norm": 0.9609375, "learning_rate": 0.00042201409929060955, "loss": 4.8137, "mean_token_accuracy": 0.23303966522216796, "num_tokens": 66920801.0, "step": 29180 }, { "entropy": 5.096289825439453, "epoch": 2.803554274735831, "grad_norm": 1.0625, "learning_rate": 0.0004219881443739301, "loss": 4.6695, "mean_token_accuracy": 0.23883183002471925, "num_tokens": 66931411.0, "step": 29185 }, { "entropy": 5.111446666717529, "epoch": 2.804034582132565, "grad_norm": 1.109375, "learning_rate": 0.0004219621860446225, "loss": 4.7507, "mean_token_accuracy": 0.23893794417381287, "num_tokens": 66942122.0, "step": 29190 }, { "entropy": 5.04069766998291, "epoch": 2.804514889529299, "grad_norm": 1.078125, "learning_rate": 0.0004219362243032892, "loss": 4.7822, "mean_token_accuracy": 0.23491215258836745, "num_tokens": 66954096.0, "step": 29195 }, { "entropy": 5.125007200241089, "epoch": 2.8049951969260327, "grad_norm": 1.0234375, "learning_rate": 0.00042191025915053323, "loss": 4.8025, "mean_token_accuracy": 0.23034960478544236, "num_tokens": 66965729.0, "step": 29200 }, { "entropy": 5.113742399215698, "epoch": 2.8054755043227666, "grad_norm": 1.1484375, "learning_rate": 0.00042188429058695714, "loss": 4.768, "mean_token_accuracy": 0.22859703600406647, "num_tokens": 66975819.0, "step": 29205 }, { "entropy": 5.046423530578613, "epoch": 2.8059558117195005, "grad_norm": 1.1015625, "learning_rate": 0.00042185831861316406, "loss": 4.7134, "mean_token_accuracy": 0.24926585853099822, "num_tokens": 66987150.0, "step": 29210 }, { "entropy": 5.07684817314148, "epoch": 2.8064361191162344, "grad_norm": 1.0390625, "learning_rate": 0.0004218323432297568, "loss": 4.6882, "mean_token_accuracy": 0.23167644441127777, "num_tokens": 66999449.0, "step": 29215 }, { "entropy": 5.0162135601043705, "epoch": 2.8069164265129682, "grad_norm": 1.09375, "learning_rate": 0.00042180636443733864, "loss": 4.6862, "mean_token_accuracy": 0.24328800439834594, "num_tokens": 67011256.0, "step": 29220 }, { "entropy": 4.9040539264678955, "epoch": 2.807396733909702, "grad_norm": 1.1484375, "learning_rate": 0.00042178038223651253, "loss": 4.587, "mean_token_accuracy": 0.24731760174036027, "num_tokens": 67021878.0, "step": 29225 }, { "entropy": 5.091517114639283, "epoch": 2.807877041306436, "grad_norm": 1.09375, "learning_rate": 0.00042175439662788195, "loss": 4.8519, "mean_token_accuracy": 0.22310329526662825, "num_tokens": 67032544.0, "step": 29230 }, { "entropy": 5.048261499404907, "epoch": 2.80835734870317, "grad_norm": 1.078125, "learning_rate": 0.00042172840761204986, "loss": 4.715, "mean_token_accuracy": 0.24167356193065642, "num_tokens": 67042119.0, "step": 29235 }, { "entropy": 5.085355520248413, "epoch": 2.8088376560999038, "grad_norm": 1.140625, "learning_rate": 0.0004217024151896199, "loss": 4.7588, "mean_token_accuracy": 0.2463191419839859, "num_tokens": 67053126.0, "step": 29240 }, { "entropy": 5.1328675746917725, "epoch": 2.809317963496638, "grad_norm": 1.1484375, "learning_rate": 0.00042167641936119557, "loss": 4.8357, "mean_token_accuracy": 0.2317552775144577, "num_tokens": 67063818.0, "step": 29245 }, { "entropy": 5.040121793746948, "epoch": 2.8097982708933715, "grad_norm": 1.1484375, "learning_rate": 0.0004216504201273802, "loss": 4.736, "mean_token_accuracy": 0.24150463789701462, "num_tokens": 67074477.0, "step": 29250 }, { "entropy": 5.108019542694092, "epoch": 2.810278578290106, "grad_norm": 1.015625, "learning_rate": 0.0004216244174887776, "loss": 4.7634, "mean_token_accuracy": 0.23590658009052276, "num_tokens": 67085920.0, "step": 29255 }, { "entropy": 5.0599141120910645, "epoch": 2.8107588856868397, "grad_norm": 1.1015625, "learning_rate": 0.00042159841144599145, "loss": 4.7853, "mean_token_accuracy": 0.2368880867958069, "num_tokens": 67097542.0, "step": 29260 }, { "entropy": 5.1055501937866214, "epoch": 2.8112391930835736, "grad_norm": 1.0859375, "learning_rate": 0.00042157240199962537, "loss": 4.8005, "mean_token_accuracy": 0.23813695907592775, "num_tokens": 67108514.0, "step": 29265 }, { "entropy": 5.073172616958618, "epoch": 2.8117195004803075, "grad_norm": 1.0, "learning_rate": 0.0004215463891502834, "loss": 4.6163, "mean_token_accuracy": 0.245599864423275, "num_tokens": 67120263.0, "step": 29270 }, { "entropy": 5.074782133102417, "epoch": 2.8121998078770414, "grad_norm": 1.0078125, "learning_rate": 0.00042152037289856954, "loss": 4.709, "mean_token_accuracy": 0.23398882746696473, "num_tokens": 67133607.0, "step": 29275 }, { "entropy": 5.077461671829224, "epoch": 2.8126801152737753, "grad_norm": 1.0546875, "learning_rate": 0.00042149435324508755, "loss": 4.7064, "mean_token_accuracy": 0.2428443506360054, "num_tokens": 67145452.0, "step": 29280 }, { "entropy": 4.991470813751221, "epoch": 2.813160422670509, "grad_norm": 1.03125, "learning_rate": 0.0004214683301904417, "loss": 4.7309, "mean_token_accuracy": 0.24190180599689484, "num_tokens": 67159178.0, "step": 29285 }, { "entropy": 5.179703235626221, "epoch": 2.813640730067243, "grad_norm": 0.98046875, "learning_rate": 0.00042144230373523624, "loss": 4.8226, "mean_token_accuracy": 0.2293036624789238, "num_tokens": 67171652.0, "step": 29290 }, { "entropy": 5.142070007324219, "epoch": 2.814121037463977, "grad_norm": 1.1015625, "learning_rate": 0.0004214162738800753, "loss": 4.7704, "mean_token_accuracy": 0.23842364102602004, "num_tokens": 67181844.0, "step": 29295 }, { "entropy": 5.066088676452637, "epoch": 2.814601344860711, "grad_norm": 1.0234375, "learning_rate": 0.0004213902406255632, "loss": 4.7882, "mean_token_accuracy": 0.2425612300634384, "num_tokens": 67193156.0, "step": 29300 }, { "entropy": 5.030477046966553, "epoch": 2.8150816522574447, "grad_norm": 1.03125, "learning_rate": 0.0004213642039723044, "loss": 4.7371, "mean_token_accuracy": 0.23682460188865662, "num_tokens": 67205447.0, "step": 29305 }, { "entropy": 5.194211769104004, "epoch": 2.8155619596541785, "grad_norm": 1.046875, "learning_rate": 0.00042133816392090343, "loss": 4.8293, "mean_token_accuracy": 0.23123976290225984, "num_tokens": 67216101.0, "step": 29310 }, { "entropy": 5.037980031967163, "epoch": 2.8160422670509124, "grad_norm": 1.1171875, "learning_rate": 0.00042131212047196484, "loss": 4.5674, "mean_token_accuracy": 0.25225337147712706, "num_tokens": 67226972.0, "step": 29315 }, { "entropy": 4.998749828338623, "epoch": 2.8165225744476468, "grad_norm": 1.0078125, "learning_rate": 0.00042128607362609317, "loss": 4.7108, "mean_token_accuracy": 0.24861632883548737, "num_tokens": 67237968.0, "step": 29320 }, { "entropy": 5.095122480392456, "epoch": 2.81700288184438, "grad_norm": 1.0234375, "learning_rate": 0.00042126002338389336, "loss": 4.8231, "mean_token_accuracy": 0.23481558561325072, "num_tokens": 67250817.0, "step": 29325 }, { "entropy": 5.1704460144042965, "epoch": 2.8174831892411145, "grad_norm": 1.0234375, "learning_rate": 0.00042123396974597007, "loss": 4.7091, "mean_token_accuracy": 0.23950430005788803, "num_tokens": 67261491.0, "step": 29330 }, { "entropy": 5.069094800949097, "epoch": 2.817963496637848, "grad_norm": 1.046875, "learning_rate": 0.00042120791271292823, "loss": 4.7515, "mean_token_accuracy": 0.241105617582798, "num_tokens": 67272684.0, "step": 29335 }, { "entropy": 5.080519819259644, "epoch": 2.8184438040345823, "grad_norm": 1.15625, "learning_rate": 0.00042118185228537283, "loss": 4.7718, "mean_token_accuracy": 0.233907251060009, "num_tokens": 67284070.0, "step": 29340 }, { "entropy": 5.135709619522094, "epoch": 2.818924111431316, "grad_norm": 1.0234375, "learning_rate": 0.00042115578846390884, "loss": 4.8109, "mean_token_accuracy": 0.23303520381450654, "num_tokens": 67296302.0, "step": 29345 }, { "entropy": 5.1688251972198485, "epoch": 2.81940441882805, "grad_norm": 1.0546875, "learning_rate": 0.0004211297212491414, "loss": 4.759, "mean_token_accuracy": 0.23793066143989564, "num_tokens": 67307123.0, "step": 29350 }, { "entropy": 5.125912284851074, "epoch": 2.819884726224784, "grad_norm": 1.0625, "learning_rate": 0.0004211036506416759, "loss": 4.742, "mean_token_accuracy": 0.2363669753074646, "num_tokens": 67318832.0, "step": 29355 }, { "entropy": 5.099205780029297, "epoch": 2.820365033621518, "grad_norm": 1.1171875, "learning_rate": 0.0004210775766421173, "loss": 4.8252, "mean_token_accuracy": 0.2343818336725235, "num_tokens": 67329821.0, "step": 29360 }, { "entropy": 5.085246849060058, "epoch": 2.8208453410182517, "grad_norm": 0.98828125, "learning_rate": 0.0004210514992510713, "loss": 4.7023, "mean_token_accuracy": 0.2376452013850212, "num_tokens": 67341569.0, "step": 29365 }, { "entropy": 5.1617189884185795, "epoch": 2.8213256484149856, "grad_norm": 1.0234375, "learning_rate": 0.0004210254184691431, "loss": 4.8206, "mean_token_accuracy": 0.23409080356359482, "num_tokens": 67352499.0, "step": 29370 }, { "entropy": 5.065718507766723, "epoch": 2.8218059558117194, "grad_norm": 1.09375, "learning_rate": 0.00042099933429693814, "loss": 4.7227, "mean_token_accuracy": 0.23651075959205628, "num_tokens": 67363810.0, "step": 29375 }, { "entropy": 5.126027631759643, "epoch": 2.8222862632084533, "grad_norm": 1.0703125, "learning_rate": 0.0004209732467350624, "loss": 4.8102, "mean_token_accuracy": 0.2424784705042839, "num_tokens": 67374690.0, "step": 29380 }, { "entropy": 5.087665176391601, "epoch": 2.822766570605187, "grad_norm": 1.0078125, "learning_rate": 0.0004209471557841212, "loss": 4.7307, "mean_token_accuracy": 0.23689046800136565, "num_tokens": 67386297.0, "step": 29385 }, { "entropy": 5.081095743179321, "epoch": 2.823246878001921, "grad_norm": 0.99609375, "learning_rate": 0.0004209210614447204, "loss": 4.6725, "mean_token_accuracy": 0.24220550954341888, "num_tokens": 67397544.0, "step": 29390 }, { "entropy": 5.125997829437256, "epoch": 2.8237271853986554, "grad_norm": 0.984375, "learning_rate": 0.000420894963717466, "loss": 4.7593, "mean_token_accuracy": 0.23870523571968078, "num_tokens": 67408609.0, "step": 29395 }, { "entropy": 5.0870969772338865, "epoch": 2.824207492795389, "grad_norm": 1.125, "learning_rate": 0.0004208688626029636, "loss": 4.7304, "mean_token_accuracy": 0.23566053956747054, "num_tokens": 67421091.0, "step": 29400 }, { "entropy": 5.089567852020264, "epoch": 2.824687800192123, "grad_norm": 1.171875, "learning_rate": 0.0004208427581018194, "loss": 4.7638, "mean_token_accuracy": 0.23751177489757538, "num_tokens": 67433141.0, "step": 29405 }, { "entropy": 5.149106502532959, "epoch": 2.8251681075888566, "grad_norm": 1.1328125, "learning_rate": 0.0004208166502146394, "loss": 4.8157, "mean_token_accuracy": 0.2274396926164627, "num_tokens": 67443646.0, "step": 29410 }, { "entropy": 5.127561283111572, "epoch": 2.825648414985591, "grad_norm": 1.0234375, "learning_rate": 0.00042079053894202977, "loss": 4.755, "mean_token_accuracy": 0.2354283645749092, "num_tokens": 67455783.0, "step": 29415 }, { "entropy": 5.097440528869629, "epoch": 2.826128722382325, "grad_norm": 1.046875, "learning_rate": 0.0004207644242845968, "loss": 4.712, "mean_token_accuracy": 0.24154630899429322, "num_tokens": 67467185.0, "step": 29420 }, { "entropy": 5.0761716842651365, "epoch": 2.8266090297790587, "grad_norm": 1.0078125, "learning_rate": 0.0004207383062429467, "loss": 4.7032, "mean_token_accuracy": 0.23896313607692718, "num_tokens": 67478740.0, "step": 29425 }, { "entropy": 5.02870888710022, "epoch": 2.8270893371757926, "grad_norm": 1.0859375, "learning_rate": 0.0004207121848176858, "loss": 4.6894, "mean_token_accuracy": 0.24051135778427124, "num_tokens": 67490492.0, "step": 29430 }, { "entropy": 4.981909322738647, "epoch": 2.8275696445725265, "grad_norm": 1.09375, "learning_rate": 0.00042068606000942075, "loss": 4.6571, "mean_token_accuracy": 0.24181026667356492, "num_tokens": 67500861.0, "step": 29435 }, { "entropy": 5.078646659851074, "epoch": 2.8280499519692603, "grad_norm": 1.0546875, "learning_rate": 0.00042065993181875794, "loss": 4.7278, "mean_token_accuracy": 0.2341497138142586, "num_tokens": 67513577.0, "step": 29440 }, { "entropy": 5.077435159683228, "epoch": 2.8285302593659942, "grad_norm": 1.015625, "learning_rate": 0.000420633800246304, "loss": 4.7257, "mean_token_accuracy": 0.2428266689181328, "num_tokens": 67525147.0, "step": 29445 }, { "entropy": 5.069917392730713, "epoch": 2.829010566762728, "grad_norm": 1.03125, "learning_rate": 0.00042060766529266577, "loss": 4.7276, "mean_token_accuracy": 0.23703081905841827, "num_tokens": 67536183.0, "step": 29450 }, { "entropy": 5.130881977081299, "epoch": 2.829490874159462, "grad_norm": 1.0859375, "learning_rate": 0.00042058152695844986, "loss": 4.7522, "mean_token_accuracy": 0.2344597414135933, "num_tokens": 67546868.0, "step": 29455 }, { "entropy": 5.037741565704346, "epoch": 2.829971181556196, "grad_norm": 1.0234375, "learning_rate": 0.00042055538524426317, "loss": 4.6772, "mean_token_accuracy": 0.24406108856201172, "num_tokens": 67557243.0, "step": 29460 }, { "entropy": 5.034839630126953, "epoch": 2.8304514889529298, "grad_norm": 1.046875, "learning_rate": 0.0004205292401507127, "loss": 4.7484, "mean_token_accuracy": 0.2283693253993988, "num_tokens": 67568977.0, "step": 29465 }, { "entropy": 5.0676023960113525, "epoch": 2.8309317963496636, "grad_norm": 0.99609375, "learning_rate": 0.0004205030916784053, "loss": 4.7028, "mean_token_accuracy": 0.24158486872911453, "num_tokens": 67581318.0, "step": 29470 }, { "entropy": 5.010189628601074, "epoch": 2.8314121037463975, "grad_norm": 0.9921875, "learning_rate": 0.00042047693982794824, "loss": 4.6528, "mean_token_accuracy": 0.2412082239985466, "num_tokens": 67592707.0, "step": 29475 }, { "entropy": 5.0895514488220215, "epoch": 2.831892411143132, "grad_norm": 1.0703125, "learning_rate": 0.00042045078459994854, "loss": 4.7254, "mean_token_accuracy": 0.23643447011709212, "num_tokens": 67602929.0, "step": 29480 }, { "entropy": 5.115679359436035, "epoch": 2.8323727185398653, "grad_norm": 1.046875, "learning_rate": 0.0004204246259950136, "loss": 4.7374, "mean_token_accuracy": 0.24406941384077072, "num_tokens": 67613049.0, "step": 29485 }, { "entropy": 5.171131896972656, "epoch": 2.8328530259365996, "grad_norm": 1.1484375, "learning_rate": 0.00042039846401375065, "loss": 4.8206, "mean_token_accuracy": 0.2334555834531784, "num_tokens": 67623355.0, "step": 29490 }, { "entropy": 5.166270017623901, "epoch": 2.8333333333333335, "grad_norm": 0.97265625, "learning_rate": 0.00042037229865676714, "loss": 4.8016, "mean_token_accuracy": 0.23626454472541808, "num_tokens": 67634607.0, "step": 29495 }, { "entropy": 5.046123886108399, "epoch": 2.8338136407300674, "grad_norm": 1.1171875, "learning_rate": 0.00042034612992467046, "loss": 4.6883, "mean_token_accuracy": 0.2428459644317627, "num_tokens": 67646549.0, "step": 29500 }, { "entropy": 4.962723445892334, "epoch": 2.8342939481268012, "grad_norm": 1.03125, "learning_rate": 0.0004203199578180683, "loss": 4.619, "mean_token_accuracy": 0.2527529805898666, "num_tokens": 67657785.0, "step": 29505 }, { "entropy": 5.064220571517945, "epoch": 2.834774255523535, "grad_norm": 1.046875, "learning_rate": 0.0004202937823375682, "loss": 4.715, "mean_token_accuracy": 0.2429947003722191, "num_tokens": 67670415.0, "step": 29510 }, { "entropy": 5.099671173095703, "epoch": 2.835254562920269, "grad_norm": 0.98046875, "learning_rate": 0.0004202676034837779, "loss": 4.7237, "mean_token_accuracy": 0.23474127650260926, "num_tokens": 67682815.0, "step": 29515 }, { "entropy": 5.086864423751831, "epoch": 2.835734870317003, "grad_norm": 0.96484375, "learning_rate": 0.0004202414212573052, "loss": 4.7447, "mean_token_accuracy": 0.23662538975477218, "num_tokens": 67693820.0, "step": 29520 }, { "entropy": 5.068673038482666, "epoch": 2.8362151777137368, "grad_norm": 0.96875, "learning_rate": 0.00042021523565875796, "loss": 4.7307, "mean_token_accuracy": 0.23210255354642867, "num_tokens": 67705375.0, "step": 29525 }, { "entropy": 5.118408823013306, "epoch": 2.8366954851104706, "grad_norm": 0.984375, "learning_rate": 0.0004201890466887442, "loss": 4.7608, "mean_token_accuracy": 0.23971533328294753, "num_tokens": 67717549.0, "step": 29530 }, { "entropy": 5.064080238342285, "epoch": 2.8371757925072045, "grad_norm": 1.1796875, "learning_rate": 0.0004201628543478718, "loss": 4.719, "mean_token_accuracy": 0.24084258824586868, "num_tokens": 67727462.0, "step": 29535 }, { "entropy": 5.07521619796753, "epoch": 2.8376560999039384, "grad_norm": 0.984375, "learning_rate": 0.000420136658636749, "loss": 4.7104, "mean_token_accuracy": 0.23347580134868623, "num_tokens": 67740059.0, "step": 29540 }, { "entropy": 5.1352294921875, "epoch": 2.8381364073006723, "grad_norm": 1.0703125, "learning_rate": 0.0004201104595559841, "loss": 4.7545, "mean_token_accuracy": 0.2352105587720871, "num_tokens": 67751010.0, "step": 29545 }, { "entropy": 5.056938123703003, "epoch": 2.838616714697406, "grad_norm": 1.0078125, "learning_rate": 0.00042008425710618507, "loss": 4.727, "mean_token_accuracy": 0.24354122281074525, "num_tokens": 67762131.0, "step": 29550 }, { "entropy": 5.065296459197998, "epoch": 2.8390970220941405, "grad_norm": 1.0078125, "learning_rate": 0.00042005805128796043, "loss": 4.7733, "mean_token_accuracy": 0.2421492114663124, "num_tokens": 67773759.0, "step": 29555 }, { "entropy": 5.059252643585205, "epoch": 2.839577329490874, "grad_norm": 0.96484375, "learning_rate": 0.0004200318421019186, "loss": 4.6424, "mean_token_accuracy": 0.24407144635915756, "num_tokens": 67785009.0, "step": 29560 }, { "entropy": 5.083651447296143, "epoch": 2.8400576368876083, "grad_norm": 1.0078125, "learning_rate": 0.000420005629548668, "loss": 4.7388, "mean_token_accuracy": 0.2332341268658638, "num_tokens": 67796588.0, "step": 29565 }, { "entropy": 5.070565891265869, "epoch": 2.840537944284342, "grad_norm": 1.3671875, "learning_rate": 0.00041997941362881735, "loss": 4.6984, "mean_token_accuracy": 0.2380545437335968, "num_tokens": 67808777.0, "step": 29570 }, { "entropy": 5.10891318321228, "epoch": 2.841018251681076, "grad_norm": 1.171875, "learning_rate": 0.0004199531943429752, "loss": 4.748, "mean_token_accuracy": 0.23936144560575484, "num_tokens": 67818562.0, "step": 29575 }, { "entropy": 5.008887243270874, "epoch": 2.84149855907781, "grad_norm": 1.1484375, "learning_rate": 0.0004199269716917502, "loss": 4.691, "mean_token_accuracy": 0.24895972162485122, "num_tokens": 67829938.0, "step": 29580 }, { "entropy": 5.102813577651977, "epoch": 2.841978866474544, "grad_norm": 1.0625, "learning_rate": 0.0004199007456757513, "loss": 4.7173, "mean_token_accuracy": 0.23261762410402298, "num_tokens": 67841735.0, "step": 29585 }, { "entropy": 5.013525819778442, "epoch": 2.8424591738712777, "grad_norm": 1.0703125, "learning_rate": 0.00041987451629558743, "loss": 4.6723, "mean_token_accuracy": 0.23993164747953416, "num_tokens": 67853545.0, "step": 29590 }, { "entropy": 5.048679161071777, "epoch": 2.8429394812680115, "grad_norm": 1.078125, "learning_rate": 0.0004198482835518674, "loss": 4.6501, "mean_token_accuracy": 0.247216035425663, "num_tokens": 67864763.0, "step": 29595 }, { "entropy": 5.145204257965088, "epoch": 2.8434197886647454, "grad_norm": 1.0234375, "learning_rate": 0.0004198220474452004, "loss": 4.8172, "mean_token_accuracy": 0.23050425350666046, "num_tokens": 67875837.0, "step": 29600 }, { "entropy": 5.084896230697632, "epoch": 2.8439000960614793, "grad_norm": 1.0546875, "learning_rate": 0.0004197958079761954, "loss": 4.7623, "mean_token_accuracy": 0.236508746445179, "num_tokens": 67887623.0, "step": 29605 }, { "entropy": 5.0404211521148685, "epoch": 2.844380403458213, "grad_norm": 1.03125, "learning_rate": 0.00041976956514546185, "loss": 4.7101, "mean_token_accuracy": 0.2442680910229683, "num_tokens": 67898883.0, "step": 29610 }, { "entropy": 5.066223621368408, "epoch": 2.844860710854947, "grad_norm": 1.0703125, "learning_rate": 0.00041974331895360873, "loss": 4.6971, "mean_token_accuracy": 0.24751601368188858, "num_tokens": 67908871.0, "step": 29615 }, { "entropy": 5.031841135025024, "epoch": 2.845341018251681, "grad_norm": 0.9765625, "learning_rate": 0.0004197170694012456, "loss": 4.7118, "mean_token_accuracy": 0.23562415242195128, "num_tokens": 67920812.0, "step": 29620 }, { "entropy": 5.023913812637329, "epoch": 2.845821325648415, "grad_norm": 1.0546875, "learning_rate": 0.0004196908164889818, "loss": 4.6841, "mean_token_accuracy": 0.24350014626979827, "num_tokens": 67931703.0, "step": 29625 }, { "entropy": 5.110364866256714, "epoch": 2.846301633045149, "grad_norm": 1.125, "learning_rate": 0.0004196645602174269, "loss": 4.7696, "mean_token_accuracy": 0.23735318034887315, "num_tokens": 67943423.0, "step": 29630 }, { "entropy": 5.149677801132202, "epoch": 2.8467819404418826, "grad_norm": 1.015625, "learning_rate": 0.00041963830058719046, "loss": 4.7765, "mean_token_accuracy": 0.23049827367067338, "num_tokens": 67954047.0, "step": 29635 }, { "entropy": 5.070647478103638, "epoch": 2.847262247838617, "grad_norm": 1.0390625, "learning_rate": 0.0004196120375988822, "loss": 4.7384, "mean_token_accuracy": 0.24005120545625686, "num_tokens": 67964833.0, "step": 29640 }, { "entropy": 5.0713142395019535, "epoch": 2.8477425552353504, "grad_norm": 1.046875, "learning_rate": 0.0004195857712531119, "loss": 4.7856, "mean_token_accuracy": 0.23458235412836076, "num_tokens": 67977370.0, "step": 29645 }, { "entropy": 5.132041311264038, "epoch": 2.8482228626320847, "grad_norm": 0.98828125, "learning_rate": 0.0004195595015504892, "loss": 4.6999, "mean_token_accuracy": 0.24161131531000138, "num_tokens": 67988680.0, "step": 29650 }, { "entropy": 5.133564519882202, "epoch": 2.8487031700288186, "grad_norm": 1.0390625, "learning_rate": 0.00041953322849162415, "loss": 4.8106, "mean_token_accuracy": 0.23893430978059768, "num_tokens": 67999927.0, "step": 29655 }, { "entropy": 5.0700671672821045, "epoch": 2.8491834774255524, "grad_norm": 1.0078125, "learning_rate": 0.0004195069520771268, "loss": 4.8085, "mean_token_accuracy": 0.23393811881542206, "num_tokens": 68012176.0, "step": 29660 }, { "entropy": 5.156960439682007, "epoch": 2.8496637848222863, "grad_norm": 1.125, "learning_rate": 0.00041948067230760706, "loss": 4.7311, "mean_token_accuracy": 0.23428400307893754, "num_tokens": 68023561.0, "step": 29665 }, { "entropy": 5.136725044250488, "epoch": 2.85014409221902, "grad_norm": 1.0, "learning_rate": 0.00041945438918367513, "loss": 4.8433, "mean_token_accuracy": 0.23022455126047134, "num_tokens": 68035768.0, "step": 29670 }, { "entropy": 5.175477504730225, "epoch": 2.850624399615754, "grad_norm": 0.9453125, "learning_rate": 0.00041942810270594115, "loss": 4.8345, "mean_token_accuracy": 0.2284504994750023, "num_tokens": 68049414.0, "step": 29675 }, { "entropy": 5.059452867507934, "epoch": 2.851104707012488, "grad_norm": 1.0703125, "learning_rate": 0.0004194018128750157, "loss": 4.6302, "mean_token_accuracy": 0.2474249631166458, "num_tokens": 68060743.0, "step": 29680 }, { "entropy": 5.07676739692688, "epoch": 2.851585014409222, "grad_norm": 0.98828125, "learning_rate": 0.00041937551969150873, "loss": 4.6701, "mean_token_accuracy": 0.2410699486732483, "num_tokens": 68070525.0, "step": 29685 }, { "entropy": 5.086371564865113, "epoch": 2.8520653218059557, "grad_norm": 0.9765625, "learning_rate": 0.000419349223156031, "loss": 4.7683, "mean_token_accuracy": 0.23472922891378403, "num_tokens": 68082160.0, "step": 29690 }, { "entropy": 5.043384456634522, "epoch": 2.8525456292026896, "grad_norm": 1.0546875, "learning_rate": 0.0004193229232691929, "loss": 4.676, "mean_token_accuracy": 0.23813123106956482, "num_tokens": 68092473.0, "step": 29695 }, { "entropy": 4.977475833892822, "epoch": 2.8530259365994235, "grad_norm": 1.1484375, "learning_rate": 0.00041929662003160504, "loss": 4.6828, "mean_token_accuracy": 0.24753793478012084, "num_tokens": 68103820.0, "step": 29700 }, { "entropy": 4.998800277709961, "epoch": 2.8535062439961574, "grad_norm": 1.0078125, "learning_rate": 0.00041927031344387824, "loss": 4.5875, "mean_token_accuracy": 0.2567884773015976, "num_tokens": 68116364.0, "step": 29705 }, { "entropy": 4.9717323780059814, "epoch": 2.8539865513928913, "grad_norm": 1.0859375, "learning_rate": 0.00041924400350662304, "loss": 4.5668, "mean_token_accuracy": 0.2500680357217789, "num_tokens": 68127970.0, "step": 29710 }, { "entropy": 5.091880464553833, "epoch": 2.8544668587896256, "grad_norm": 1.125, "learning_rate": 0.00041921769022045045, "loss": 4.7633, "mean_token_accuracy": 0.23118812441825867, "num_tokens": 68139433.0, "step": 29715 }, { "entropy": 5.053804922103882, "epoch": 2.854947166186359, "grad_norm": 0.9453125, "learning_rate": 0.00041919137358597137, "loss": 4.7022, "mean_token_accuracy": 0.23204765170812608, "num_tokens": 68153169.0, "step": 29720 }, { "entropy": 5.1572469711303714, "epoch": 2.8554274735830933, "grad_norm": 1.0390625, "learning_rate": 0.0004191650536037967, "loss": 4.8032, "mean_token_accuracy": 0.22558769285678865, "num_tokens": 68163899.0, "step": 29725 }, { "entropy": 5.027514362335205, "epoch": 2.8559077809798272, "grad_norm": 1.03125, "learning_rate": 0.00041913873027453756, "loss": 4.6108, "mean_token_accuracy": 0.24452922493219376, "num_tokens": 68175342.0, "step": 29730 }, { "entropy": 5.049619913101196, "epoch": 2.856388088376561, "grad_norm": 1.0859375, "learning_rate": 0.00041911240359880517, "loss": 4.7712, "mean_token_accuracy": 0.23811262100934982, "num_tokens": 68186646.0, "step": 29735 }, { "entropy": 5.008252954483032, "epoch": 2.856868395773295, "grad_norm": 1.0234375, "learning_rate": 0.00041908607357721067, "loss": 4.6464, "mean_token_accuracy": 0.24562440663576127, "num_tokens": 68197464.0, "step": 29740 }, { "entropy": 4.955460834503174, "epoch": 2.857348703170029, "grad_norm": 1.0, "learning_rate": 0.00041905974021036533, "loss": 4.5972, "mean_token_accuracy": 0.2518482759594917, "num_tokens": 68208846.0, "step": 29745 }, { "entropy": 5.135580253601074, "epoch": 2.8578290105667628, "grad_norm": 1.0859375, "learning_rate": 0.00041903340349888065, "loss": 4.8025, "mean_token_accuracy": 0.2359012097120285, "num_tokens": 68220273.0, "step": 29750 }, { "entropy": 5.188638544082641, "epoch": 2.8583093179634966, "grad_norm": 1.015625, "learning_rate": 0.000419007063443368, "loss": 4.8926, "mean_token_accuracy": 0.22688197046518327, "num_tokens": 68233871.0, "step": 29755 }, { "entropy": 5.102911806106567, "epoch": 2.8587896253602305, "grad_norm": 1.015625, "learning_rate": 0.00041898072004443906, "loss": 4.737, "mean_token_accuracy": 0.2358393609523773, "num_tokens": 68246155.0, "step": 29760 }, { "entropy": 5.095201587677002, "epoch": 2.8592699327569644, "grad_norm": 1.078125, "learning_rate": 0.0004189543733027052, "loss": 4.6985, "mean_token_accuracy": 0.24999535232782363, "num_tokens": 68258201.0, "step": 29765 }, { "entropy": 5.004236316680908, "epoch": 2.8597502401536983, "grad_norm": 1.0078125, "learning_rate": 0.0004189280232187783, "loss": 4.6663, "mean_token_accuracy": 0.24319817423820494, "num_tokens": 68270106.0, "step": 29770 }, { "entropy": 5.041296529769897, "epoch": 2.860230547550432, "grad_norm": 1.0703125, "learning_rate": 0.0004189016697932701, "loss": 4.7071, "mean_token_accuracy": 0.2420891910791397, "num_tokens": 68282549.0, "step": 29775 }, { "entropy": 5.132345914840698, "epoch": 2.860710854947166, "grad_norm": 1.015625, "learning_rate": 0.0004188753130267924, "loss": 4.7506, "mean_token_accuracy": 0.2420486569404602, "num_tokens": 68294141.0, "step": 29780 }, { "entropy": 5.106274557113648, "epoch": 2.8611911623439, "grad_norm": 0.9921875, "learning_rate": 0.0004188489529199572, "loss": 4.7665, "mean_token_accuracy": 0.24300543516874312, "num_tokens": 68305520.0, "step": 29785 }, { "entropy": 5.028087186813354, "epoch": 2.8616714697406342, "grad_norm": 1.0, "learning_rate": 0.00041882258947337637, "loss": 4.7375, "mean_token_accuracy": 0.2376639112830162, "num_tokens": 68317906.0, "step": 29790 }, { "entropy": 5.079030466079712, "epoch": 2.8621517771373677, "grad_norm": 0.984375, "learning_rate": 0.00041879622268766207, "loss": 4.7601, "mean_token_accuracy": 0.23383686393499375, "num_tokens": 68328998.0, "step": 29795 }, { "entropy": 5.046128749847412, "epoch": 2.862632084534102, "grad_norm": 1.0, "learning_rate": 0.0004187698525634266, "loss": 4.7362, "mean_token_accuracy": 0.23082706332206726, "num_tokens": 68341329.0, "step": 29800 }, { "entropy": 5.101367521286011, "epoch": 2.863112391930836, "grad_norm": 1.0078125, "learning_rate": 0.00041874347910128193, "loss": 4.7714, "mean_token_accuracy": 0.23045611530542373, "num_tokens": 68353791.0, "step": 29805 }, { "entropy": 5.05668420791626, "epoch": 2.8635926993275698, "grad_norm": 0.984375, "learning_rate": 0.0004187171023018406, "loss": 4.7231, "mean_token_accuracy": 0.24457271993160248, "num_tokens": 68365493.0, "step": 29810 }, { "entropy": 5.002443075180054, "epoch": 2.8640730067243036, "grad_norm": 1.015625, "learning_rate": 0.00041869072216571486, "loss": 4.6475, "mean_token_accuracy": 0.24708616137504577, "num_tokens": 68377658.0, "step": 29815 }, { "entropy": 5.060185384750366, "epoch": 2.8645533141210375, "grad_norm": 1.0234375, "learning_rate": 0.00041866433869351715, "loss": 4.6804, "mean_token_accuracy": 0.2429857924580574, "num_tokens": 68387710.0, "step": 29820 }, { "entropy": 5.112653636932373, "epoch": 2.8650336215177714, "grad_norm": 1.1015625, "learning_rate": 0.0004186379518858602, "loss": 4.8256, "mean_token_accuracy": 0.2314462423324585, "num_tokens": 68400726.0, "step": 29825 }, { "entropy": 5.214086675643921, "epoch": 2.8655139289145053, "grad_norm": 0.984375, "learning_rate": 0.0004186115617433565, "loss": 4.9045, "mean_token_accuracy": 0.223735249042511, "num_tokens": 68413414.0, "step": 29830 }, { "entropy": 5.0527424812316895, "epoch": 2.865994236311239, "grad_norm": 1.03125, "learning_rate": 0.00041858516826661876, "loss": 4.6331, "mean_token_accuracy": 0.2486885368824005, "num_tokens": 68424834.0, "step": 29835 }, { "entropy": 4.991256952285767, "epoch": 2.866474543707973, "grad_norm": 0.96875, "learning_rate": 0.00041855877145625974, "loss": 4.6051, "mean_token_accuracy": 0.24843001514673232, "num_tokens": 68437220.0, "step": 29840 }, { "entropy": 5.040426588058471, "epoch": 2.866954851104707, "grad_norm": 0.984375, "learning_rate": 0.0004185323713128924, "loss": 4.7959, "mean_token_accuracy": 0.23011162132024765, "num_tokens": 68449571.0, "step": 29845 }, { "entropy": 5.065598964691162, "epoch": 2.867435158501441, "grad_norm": 1.0078125, "learning_rate": 0.00041850596783712956, "loss": 4.6656, "mean_token_accuracy": 0.23874905556440354, "num_tokens": 68460592.0, "step": 29850 }, { "entropy": 5.074351978302002, "epoch": 2.8679154658981747, "grad_norm": 1.1015625, "learning_rate": 0.0004184795610295843, "loss": 4.6854, "mean_token_accuracy": 0.24854440093040467, "num_tokens": 68472268.0, "step": 29855 }, { "entropy": 5.063524532318115, "epoch": 2.8683957732949086, "grad_norm": 1.0703125, "learning_rate": 0.0004184531508908697, "loss": 4.7075, "mean_token_accuracy": 0.23350406885147096, "num_tokens": 68483382.0, "step": 29860 }, { "entropy": 5.056384468078614, "epoch": 2.868876080691643, "grad_norm": 1.0078125, "learning_rate": 0.0004184267374215989, "loss": 4.7365, "mean_token_accuracy": 0.24392684549093246, "num_tokens": 68494421.0, "step": 29865 }, { "entropy": 5.107250928878784, "epoch": 2.8693563880883763, "grad_norm": 1.0703125, "learning_rate": 0.000418400320622385, "loss": 4.7586, "mean_token_accuracy": 0.23305828124284744, "num_tokens": 68505810.0, "step": 29870 }, { "entropy": 4.997715044021606, "epoch": 2.8698366954851107, "grad_norm": 1.046875, "learning_rate": 0.0004183739004938416, "loss": 4.5571, "mean_token_accuracy": 0.2586457535624504, "num_tokens": 68515632.0, "step": 29875 }, { "entropy": 5.050461959838867, "epoch": 2.870317002881844, "grad_norm": 1.0234375, "learning_rate": 0.000418347477036582, "loss": 4.7899, "mean_token_accuracy": 0.2352416917681694, "num_tokens": 68526930.0, "step": 29880 }, { "entropy": 5.027235651016236, "epoch": 2.8707973102785784, "grad_norm": 0.99609375, "learning_rate": 0.00041832105025121956, "loss": 4.6573, "mean_token_accuracy": 0.24573568105697632, "num_tokens": 68537381.0, "step": 29885 }, { "entropy": 4.983240127563477, "epoch": 2.8712776176753123, "grad_norm": 1.0078125, "learning_rate": 0.0004182946201383679, "loss": 4.5922, "mean_token_accuracy": 0.24639476090669632, "num_tokens": 68548982.0, "step": 29890 }, { "entropy": 4.999363040924072, "epoch": 2.871757925072046, "grad_norm": 1.0546875, "learning_rate": 0.0004182681866986407, "loss": 4.6704, "mean_token_accuracy": 0.24055392146110535, "num_tokens": 68560572.0, "step": 29895 }, { "entropy": 5.091953277587891, "epoch": 2.87223823246878, "grad_norm": 1.03125, "learning_rate": 0.00041824174993265165, "loss": 4.7362, "mean_token_accuracy": 0.23719154596328734, "num_tokens": 68572080.0, "step": 29900 }, { "entropy": 5.0495476722717285, "epoch": 2.872718539865514, "grad_norm": 1.0546875, "learning_rate": 0.00041821530984101444, "loss": 4.6898, "mean_token_accuracy": 0.2397722065448761, "num_tokens": 68584481.0, "step": 29905 }, { "entropy": 5.023144483566284, "epoch": 2.873198847262248, "grad_norm": 1.0078125, "learning_rate": 0.000418188866424343, "loss": 4.7048, "mean_token_accuracy": 0.2494543418288231, "num_tokens": 68595673.0, "step": 29910 }, { "entropy": 5.162940740585327, "epoch": 2.8736791546589817, "grad_norm": 1.0625, "learning_rate": 0.0004181624196832513, "loss": 4.793, "mean_token_accuracy": 0.237021242082119, "num_tokens": 68607310.0, "step": 29915 }, { "entropy": 5.037101364135742, "epoch": 2.8741594620557156, "grad_norm": 1.140625, "learning_rate": 0.00041813596961835336, "loss": 4.7375, "mean_token_accuracy": 0.238271826505661, "num_tokens": 68618579.0, "step": 29920 }, { "entropy": 5.016074514389038, "epoch": 2.8746397694524495, "grad_norm": 1.03125, "learning_rate": 0.00041810951623026313, "loss": 4.5767, "mean_token_accuracy": 0.2503830775618553, "num_tokens": 68629977.0, "step": 29925 }, { "entropy": 5.0946588039398195, "epoch": 2.8751200768491834, "grad_norm": 0.9921875, "learning_rate": 0.00041808305951959496, "loss": 4.7958, "mean_token_accuracy": 0.23696231693029404, "num_tokens": 68641740.0, "step": 29930 }, { "entropy": 5.073351049423218, "epoch": 2.8756003842459172, "grad_norm": 1.3203125, "learning_rate": 0.000418056599486963, "loss": 4.7563, "mean_token_accuracy": 0.2414123848080635, "num_tokens": 68653171.0, "step": 29935 }, { "entropy": 5.060429573059082, "epoch": 2.8760806916426516, "grad_norm": 1.1015625, "learning_rate": 0.0004180301361329816, "loss": 4.7642, "mean_token_accuracy": 0.24232802242040635, "num_tokens": 68664658.0, "step": 29940 }, { "entropy": 5.095524263381958, "epoch": 2.876560999039385, "grad_norm": 1.0234375, "learning_rate": 0.0004180036694582651, "loss": 4.7003, "mean_token_accuracy": 0.24127245396375657, "num_tokens": 68675912.0, "step": 29945 }, { "entropy": 5.06903920173645, "epoch": 2.8770413064361193, "grad_norm": 1.0703125, "learning_rate": 0.00041797719946342813, "loss": 4.6845, "mean_token_accuracy": 0.24589995294809341, "num_tokens": 68686586.0, "step": 29950 }, { "entropy": 5.057815790176392, "epoch": 2.8775216138328528, "grad_norm": 0.984375, "learning_rate": 0.00041795072614908503, "loss": 4.7498, "mean_token_accuracy": 0.23042766749858856, "num_tokens": 68698280.0, "step": 29955 }, { "entropy": 5.093736410140991, "epoch": 2.878001921229587, "grad_norm": 1.0234375, "learning_rate": 0.00041792424951585055, "loss": 4.7126, "mean_token_accuracy": 0.24679953008890151, "num_tokens": 68709424.0, "step": 29960 }, { "entropy": 5.128250551223755, "epoch": 2.878482228626321, "grad_norm": 1.0, "learning_rate": 0.00041789776956433947, "loss": 4.7641, "mean_token_accuracy": 0.24207992255687713, "num_tokens": 68720785.0, "step": 29965 }, { "entropy": 5.014354610443116, "epoch": 2.878962536023055, "grad_norm": 0.98828125, "learning_rate": 0.00041787128629516645, "loss": 4.6963, "mean_token_accuracy": 0.23785762786865233, "num_tokens": 68732157.0, "step": 29970 }, { "entropy": 5.011946821212769, "epoch": 2.8794428434197887, "grad_norm": 1.0703125, "learning_rate": 0.0004178447997089464, "loss": 4.7272, "mean_token_accuracy": 0.24545604437589646, "num_tokens": 68743604.0, "step": 29975 }, { "entropy": 5.0876930236816404, "epoch": 2.8799231508165226, "grad_norm": 1.03125, "learning_rate": 0.0004178183098062943, "loss": 4.7432, "mean_token_accuracy": 0.23434943705797195, "num_tokens": 68755056.0, "step": 29980 }, { "entropy": 5.081747007369995, "epoch": 2.8804034582132565, "grad_norm": 0.98828125, "learning_rate": 0.0004177918165878251, "loss": 4.749, "mean_token_accuracy": 0.23806031793355942, "num_tokens": 68766390.0, "step": 29985 }, { "entropy": 5.096911478042602, "epoch": 2.8808837656099904, "grad_norm": 1.0703125, "learning_rate": 0.0004177653200541539, "loss": 4.7508, "mean_token_accuracy": 0.23880307376384735, "num_tokens": 68778780.0, "step": 29990 }, { "entropy": 5.076303577423095, "epoch": 2.8813640730067243, "grad_norm": 1.0390625, "learning_rate": 0.000417738820205896, "loss": 4.6311, "mean_token_accuracy": 0.24493061006069183, "num_tokens": 68789545.0, "step": 29995 }, { "entropy": 5.038773441314698, "epoch": 2.881844380403458, "grad_norm": 0.91796875, "learning_rate": 0.0004177123170436665, "loss": 4.6587, "mean_token_accuracy": 0.24653734415769576, "num_tokens": 68802231.0, "step": 30000 }, { "epoch": 2.881844380403458, "eval_entropy": 4.925570929416617, "eval_loss": 4.830386638641357, "eval_mean_token_accuracy": 0.24216745788007177, "eval_num_tokens": 68802231.0, "eval_runtime": 26.6573, "eval_samples_per_second": 1230.996, "eval_steps_per_second": 153.879, "step": 30000 }, { "entropy": 5.199371862411499, "epoch": 2.882324687800192, "grad_norm": 1.0390625, "learning_rate": 0.0004176858105680807, "loss": 4.8046, "mean_token_accuracy": 0.2428019016981125, "num_tokens": 68812424.0, "step": 30005 }, { "entropy": 5.091816568374634, "epoch": 2.882804995196926, "grad_norm": 1.1171875, "learning_rate": 0.00041765930077975415, "loss": 4.6796, "mean_token_accuracy": 0.2414682224392891, "num_tokens": 68823770.0, "step": 30010 }, { "entropy": 5.043640804290772, "epoch": 2.88328530259366, "grad_norm": 1.015625, "learning_rate": 0.00041763278767930213, "loss": 4.6956, "mean_token_accuracy": 0.23768699169158936, "num_tokens": 68835007.0, "step": 30015 }, { "entropy": 4.995925331115723, "epoch": 2.8837656099903937, "grad_norm": 1.0625, "learning_rate": 0.0004176062712673404, "loss": 4.6096, "mean_token_accuracy": 0.2436898961663246, "num_tokens": 68847070.0, "step": 30020 }, { "entropy": 5.06178035736084, "epoch": 2.884245917387128, "grad_norm": 1.109375, "learning_rate": 0.0004175797515444845, "loss": 4.6785, "mean_token_accuracy": 0.24139561355113984, "num_tokens": 68857197.0, "step": 30025 }, { "entropy": 5.091560554504395, "epoch": 2.8847262247838614, "grad_norm": 0.98046875, "learning_rate": 0.0004175532285113501, "loss": 4.7369, "mean_token_accuracy": 0.2393686965107918, "num_tokens": 68869719.0, "step": 30030 }, { "entropy": 5.039063549041748, "epoch": 2.8852065321805958, "grad_norm": 1.0546875, "learning_rate": 0.0004175267021685531, "loss": 4.7125, "mean_token_accuracy": 0.24368225634098054, "num_tokens": 68880300.0, "step": 30035 }, { "entropy": 5.0167113780975345, "epoch": 2.8856868395773296, "grad_norm": 1.046875, "learning_rate": 0.00041750017251670926, "loss": 4.6726, "mean_token_accuracy": 0.24538416266441346, "num_tokens": 68892616.0, "step": 30040 }, { "entropy": 5.151379156112671, "epoch": 2.8861671469740635, "grad_norm": 0.99609375, "learning_rate": 0.0004174736395564345, "loss": 4.8745, "mean_token_accuracy": 0.2226516544818878, "num_tokens": 68904595.0, "step": 30045 }, { "entropy": 5.084953117370605, "epoch": 2.8866474543707974, "grad_norm": 1.0625, "learning_rate": 0.00041744710328834493, "loss": 4.7178, "mean_token_accuracy": 0.23825270533561707, "num_tokens": 68917036.0, "step": 30050 }, { "entropy": 5.0371910572052006, "epoch": 2.8871277617675313, "grad_norm": 1.015625, "learning_rate": 0.00041742056371305665, "loss": 4.6936, "mean_token_accuracy": 0.23949169963598252, "num_tokens": 68928216.0, "step": 30055 }, { "entropy": 5.035665798187256, "epoch": 2.887608069164265, "grad_norm": 1.0390625, "learning_rate": 0.00041739402083118576, "loss": 4.7231, "mean_token_accuracy": 0.23918746560811996, "num_tokens": 68939806.0, "step": 30060 }, { "entropy": 5.039865207672119, "epoch": 2.888088376560999, "grad_norm": 1.0546875, "learning_rate": 0.0004173674746433485, "loss": 4.6532, "mean_token_accuracy": 0.24816398173570633, "num_tokens": 68950402.0, "step": 30065 }, { "entropy": 5.079453849792481, "epoch": 2.888568683957733, "grad_norm": 1.1171875, "learning_rate": 0.00041734092515016127, "loss": 4.7442, "mean_token_accuracy": 0.23737122267484664, "num_tokens": 68961472.0, "step": 30070 }, { "entropy": 5.094537591934204, "epoch": 2.889048991354467, "grad_norm": 1.03125, "learning_rate": 0.00041731437235224036, "loss": 4.8154, "mean_token_accuracy": 0.23389466851949692, "num_tokens": 68972844.0, "step": 30075 }, { "entropy": 5.091789960861206, "epoch": 2.8895292987512007, "grad_norm": 0.9765625, "learning_rate": 0.0004172878162502023, "loss": 4.7876, "mean_token_accuracy": 0.23112728744745253, "num_tokens": 68985219.0, "step": 30080 }, { "entropy": 5.134024047851563, "epoch": 2.8900096061479346, "grad_norm": 1.0625, "learning_rate": 0.00041726125684466374, "loss": 4.7597, "mean_token_accuracy": 0.24076538532972336, "num_tokens": 68997243.0, "step": 30085 }, { "entropy": 5.191966152191162, "epoch": 2.8904899135446684, "grad_norm": 1.0546875, "learning_rate": 0.0004172346941362412, "loss": 4.8106, "mean_token_accuracy": 0.22986829429864883, "num_tokens": 69008325.0, "step": 30090 }, { "entropy": 5.186703205108643, "epoch": 2.8909702209414023, "grad_norm": 1.0078125, "learning_rate": 0.00041720812812555137, "loss": 4.8409, "mean_token_accuracy": 0.2290444403886795, "num_tokens": 69020203.0, "step": 30095 }, { "entropy": 5.029536485671997, "epoch": 2.8914505283381366, "grad_norm": 0.98046875, "learning_rate": 0.0004171815588132111, "loss": 4.699, "mean_token_accuracy": 0.2431316554546356, "num_tokens": 69032538.0, "step": 30100 }, { "entropy": 5.016508913040161, "epoch": 2.89193083573487, "grad_norm": 1.0078125, "learning_rate": 0.0004171549861998372, "loss": 4.6994, "mean_token_accuracy": 0.2457281082868576, "num_tokens": 69045106.0, "step": 30105 }, { "entropy": 5.064055061340332, "epoch": 2.8924111431316044, "grad_norm": 1.0546875, "learning_rate": 0.0004171284102860467, "loss": 4.727, "mean_token_accuracy": 0.23846855461597444, "num_tokens": 69056526.0, "step": 30110 }, { "entropy": 5.085626649856567, "epoch": 2.8928914505283383, "grad_norm": 1.15625, "learning_rate": 0.0004171018310724565, "loss": 4.6923, "mean_token_accuracy": 0.24671979546546935, "num_tokens": 69067497.0, "step": 30115 }, { "entropy": 5.1378021240234375, "epoch": 2.893371757925072, "grad_norm": 1.046875, "learning_rate": 0.0004170752485596838, "loss": 4.6944, "mean_token_accuracy": 0.23808026313781738, "num_tokens": 69078114.0, "step": 30120 }, { "entropy": 5.032099866867066, "epoch": 2.893852065321806, "grad_norm": 1.109375, "learning_rate": 0.00041704866274834557, "loss": 4.6707, "mean_token_accuracy": 0.23985347300767898, "num_tokens": 69089329.0, "step": 30125 }, { "entropy": 5.136480760574341, "epoch": 2.89433237271854, "grad_norm": 1.0546875, "learning_rate": 0.00041702207363905933, "loss": 4.8024, "mean_token_accuracy": 0.22398771792650224, "num_tokens": 69101487.0, "step": 30130 }, { "entropy": 5.101910257339478, "epoch": 2.894812680115274, "grad_norm": 1.015625, "learning_rate": 0.00041699548123244216, "loss": 4.6678, "mean_token_accuracy": 0.24428293704986573, "num_tokens": 69112297.0, "step": 30135 }, { "entropy": 5.135105895996094, "epoch": 2.8952929875120077, "grad_norm": 1.0546875, "learning_rate": 0.0004169688855291116, "loss": 4.7799, "mean_token_accuracy": 0.2393356144428253, "num_tokens": 69123628.0, "step": 30140 }, { "entropy": 4.937651538848877, "epoch": 2.8957732949087416, "grad_norm": 1.1015625, "learning_rate": 0.0004169422865296851, "loss": 4.598, "mean_token_accuracy": 0.24106810986995697, "num_tokens": 69135040.0, "step": 30145 }, { "entropy": 5.136594533920288, "epoch": 2.8962536023054755, "grad_norm": 1.125, "learning_rate": 0.0004169156842347802, "loss": 4.8299, "mean_token_accuracy": 0.22800944298505782, "num_tokens": 69146251.0, "step": 30150 }, { "entropy": 5.176572513580322, "epoch": 2.8967339097022093, "grad_norm": 1.0234375, "learning_rate": 0.0004168890786450144, "loss": 4.825, "mean_token_accuracy": 0.23472704142332076, "num_tokens": 69157040.0, "step": 30155 }, { "entropy": 5.140036869049072, "epoch": 2.8972142170989432, "grad_norm": 1.1328125, "learning_rate": 0.0004168624697610056, "loss": 4.7024, "mean_token_accuracy": 0.24542928189039231, "num_tokens": 69167330.0, "step": 30160 }, { "entropy": 4.981908178329467, "epoch": 2.897694524495677, "grad_norm": 1.0859375, "learning_rate": 0.00041683585758337156, "loss": 4.6426, "mean_token_accuracy": 0.24661931693553923, "num_tokens": 69177903.0, "step": 30165 }, { "entropy": 4.951627206802368, "epoch": 2.898174831892411, "grad_norm": 1.140625, "learning_rate": 0.0004168092421127299, "loss": 4.6666, "mean_token_accuracy": 0.2440599873661995, "num_tokens": 69189946.0, "step": 30170 }, { "entropy": 4.968938875198364, "epoch": 2.8986551392891453, "grad_norm": 1.015625, "learning_rate": 0.0004167826233496989, "loss": 4.6074, "mean_token_accuracy": 0.25470257848501204, "num_tokens": 69201393.0, "step": 30175 }, { "entropy": 5.121877193450928, "epoch": 2.8991354466858787, "grad_norm": 1.046875, "learning_rate": 0.0004167560012948963, "loss": 4.768, "mean_token_accuracy": 0.23285290002822875, "num_tokens": 69213894.0, "step": 30180 }, { "entropy": 5.116933870315552, "epoch": 2.899615754082613, "grad_norm": 1.046875, "learning_rate": 0.00041672937594894034, "loss": 4.7454, "mean_token_accuracy": 0.2330809399485588, "num_tokens": 69224798.0, "step": 30185 }, { "entropy": 5.108615732192993, "epoch": 2.9000960614793465, "grad_norm": 1.0703125, "learning_rate": 0.00041670274731244903, "loss": 4.7199, "mean_token_accuracy": 0.23888099640607835, "num_tokens": 69235893.0, "step": 30190 }, { "entropy": 4.994650602340698, "epoch": 2.900576368876081, "grad_norm": 1.0703125, "learning_rate": 0.0004166761153860408, "loss": 4.6267, "mean_token_accuracy": 0.24582148790359498, "num_tokens": 69246554.0, "step": 30195 }, { "entropy": 5.008595514297485, "epoch": 2.9010566762728147, "grad_norm": 0.984375, "learning_rate": 0.00041664948017033383, "loss": 4.6906, "mean_token_accuracy": 0.24325044751167296, "num_tokens": 69259232.0, "step": 30200 }, { "entropy": 5.028008651733399, "epoch": 2.9015369836695486, "grad_norm": 1.078125, "learning_rate": 0.0004166228416659465, "loss": 4.6078, "mean_token_accuracy": 0.24577678143978118, "num_tokens": 69270143.0, "step": 30205 }, { "entropy": 5.063057708740234, "epoch": 2.9020172910662825, "grad_norm": 1.0390625, "learning_rate": 0.00041659619987349734, "loss": 4.7292, "mean_token_accuracy": 0.24639766663312912, "num_tokens": 69280702.0, "step": 30210 }, { "entropy": 5.100144577026367, "epoch": 2.9024975984630164, "grad_norm": 1.03125, "learning_rate": 0.00041656955479360487, "loss": 4.7, "mean_token_accuracy": 0.2368340790271759, "num_tokens": 69292605.0, "step": 30215 }, { "entropy": 5.02829761505127, "epoch": 2.9029779058597502, "grad_norm": 1.0078125, "learning_rate": 0.0004165429064268877, "loss": 4.6562, "mean_token_accuracy": 0.24210800975561142, "num_tokens": 69303243.0, "step": 30220 }, { "entropy": 5.054769611358642, "epoch": 2.903458213256484, "grad_norm": 1.015625, "learning_rate": 0.0004165162547739646, "loss": 4.7601, "mean_token_accuracy": 0.23413576632738115, "num_tokens": 69315059.0, "step": 30225 }, { "entropy": 5.101757431030274, "epoch": 2.903938520653218, "grad_norm": 1.0546875, "learning_rate": 0.0004164895998354542, "loss": 4.7719, "mean_token_accuracy": 0.23380008190870286, "num_tokens": 69327338.0, "step": 30230 }, { "entropy": 5.104634141921997, "epoch": 2.904418828049952, "grad_norm": 0.99609375, "learning_rate": 0.0004164629416119755, "loss": 4.707, "mean_token_accuracy": 0.23432289361953734, "num_tokens": 69339955.0, "step": 30235 }, { "entropy": 5.097865915298462, "epoch": 2.9048991354466858, "grad_norm": 1.0859375, "learning_rate": 0.00041643628010414735, "loss": 4.7182, "mean_token_accuracy": 0.23935549706220627, "num_tokens": 69350847.0, "step": 30240 }, { "entropy": 5.073919820785522, "epoch": 2.9053794428434196, "grad_norm": 1.0234375, "learning_rate": 0.00041640961531258877, "loss": 4.6868, "mean_token_accuracy": 0.23589784801006317, "num_tokens": 69362066.0, "step": 30245 }, { "entropy": 5.023674488067627, "epoch": 2.905859750240154, "grad_norm": 1.0078125, "learning_rate": 0.0004163829472379187, "loss": 4.7058, "mean_token_accuracy": 0.23785745352506638, "num_tokens": 69373239.0, "step": 30250 }, { "entropy": 5.056734991073609, "epoch": 2.9063400576368874, "grad_norm": 1.09375, "learning_rate": 0.00041635627588075655, "loss": 4.6772, "mean_token_accuracy": 0.23729420304298401, "num_tokens": 69384094.0, "step": 30255 }, { "entropy": 5.007504224777222, "epoch": 2.9068203650336217, "grad_norm": 1.015625, "learning_rate": 0.0004163296012417213, "loss": 4.6821, "mean_token_accuracy": 0.24327708333730697, "num_tokens": 69394667.0, "step": 30260 }, { "entropy": 5.083268547058106, "epoch": 2.907300672430355, "grad_norm": 1.0703125, "learning_rate": 0.00041630292332143245, "loss": 4.7385, "mean_token_accuracy": 0.24209891855716706, "num_tokens": 69405146.0, "step": 30265 }, { "entropy": 5.138149356842041, "epoch": 2.9077809798270895, "grad_norm": 0.95703125, "learning_rate": 0.0004162762421205093, "loss": 4.7803, "mean_token_accuracy": 0.24056767225265502, "num_tokens": 69416147.0, "step": 30270 }, { "entropy": 5.072920036315918, "epoch": 2.9082612872238234, "grad_norm": 0.9921875, "learning_rate": 0.00041624955763957134, "loss": 4.7042, "mean_token_accuracy": 0.24318288415670394, "num_tokens": 69426065.0, "step": 30275 }, { "entropy": 5.004269218444824, "epoch": 2.9087415946205573, "grad_norm": 1.1015625, "learning_rate": 0.0004162228698792381, "loss": 4.6758, "mean_token_accuracy": 0.24937786161899567, "num_tokens": 69437266.0, "step": 30280 }, { "entropy": 5.061780214309692, "epoch": 2.909221902017291, "grad_norm": 1.0703125, "learning_rate": 0.00041619617884012904, "loss": 4.7524, "mean_token_accuracy": 0.24395810514688493, "num_tokens": 69449539.0, "step": 30285 }, { "entropy": 5.095350646972657, "epoch": 2.909702209414025, "grad_norm": 1.0625, "learning_rate": 0.0004161694845228641, "loss": 4.7695, "mean_token_accuracy": 0.23942003697156905, "num_tokens": 69460515.0, "step": 30290 }, { "entropy": 5.1438220024108885, "epoch": 2.910182516810759, "grad_norm": 1.078125, "learning_rate": 0.0004161427869280628, "loss": 4.8374, "mean_token_accuracy": 0.2357712507247925, "num_tokens": 69472997.0, "step": 30295 }, { "entropy": 5.122387933731079, "epoch": 2.910662824207493, "grad_norm": 1.0234375, "learning_rate": 0.00041611608605634517, "loss": 4.7841, "mean_token_accuracy": 0.2316685900092125, "num_tokens": 69485118.0, "step": 30300 }, { "entropy": 5.07972412109375, "epoch": 2.9111431316042267, "grad_norm": 1.203125, "learning_rate": 0.000416089381908331, "loss": 4.7233, "mean_token_accuracy": 0.24024315625429155, "num_tokens": 69495696.0, "step": 30305 }, { "entropy": 4.99944052696228, "epoch": 2.9116234390009605, "grad_norm": 1.0625, "learning_rate": 0.0004160626744846404, "loss": 4.5839, "mean_token_accuracy": 0.2539135843515396, "num_tokens": 69507832.0, "step": 30310 }, { "entropy": 5.075605297088623, "epoch": 2.9121037463976944, "grad_norm": 1.015625, "learning_rate": 0.0004160359637858933, "loss": 4.7946, "mean_token_accuracy": 0.23319020718336106, "num_tokens": 69520180.0, "step": 30315 }, { "entropy": 5.02910418510437, "epoch": 2.9125840537944283, "grad_norm": 1.125, "learning_rate": 0.00041600924981270997, "loss": 4.6737, "mean_token_accuracy": 0.24387964457273484, "num_tokens": 69531201.0, "step": 30320 }, { "entropy": 5.127144908905029, "epoch": 2.913064361191162, "grad_norm": 1.03125, "learning_rate": 0.00041598253256571057, "loss": 4.7544, "mean_token_accuracy": 0.24092497825622558, "num_tokens": 69542552.0, "step": 30325 }, { "entropy": 5.0829455852508545, "epoch": 2.913544668587896, "grad_norm": 1.046875, "learning_rate": 0.0004159558120455154, "loss": 4.6802, "mean_token_accuracy": 0.24230518639087678, "num_tokens": 69554424.0, "step": 30330 }, { "entropy": 4.9297326564788815, "epoch": 2.9140249759846304, "grad_norm": 1.0234375, "learning_rate": 0.0004159290882527448, "loss": 4.5564, "mean_token_accuracy": 0.2559991866350174, "num_tokens": 69565043.0, "step": 30335 }, { "entropy": 5.063700532913208, "epoch": 2.914505283381364, "grad_norm": 1.1484375, "learning_rate": 0.0004159023611880192, "loss": 4.6943, "mean_token_accuracy": 0.23595308661460876, "num_tokens": 69576734.0, "step": 30340 }, { "entropy": 5.1198193550109865, "epoch": 2.914985590778098, "grad_norm": 0.96484375, "learning_rate": 0.0004158756308519592, "loss": 4.7153, "mean_token_accuracy": 0.23583280593156813, "num_tokens": 69588565.0, "step": 30345 }, { "entropy": 5.076626873016357, "epoch": 2.915465898174832, "grad_norm": 1.0546875, "learning_rate": 0.00041584889724518545, "loss": 4.7295, "mean_token_accuracy": 0.24376252442598342, "num_tokens": 69600595.0, "step": 30350 }, { "entropy": 5.013409376144409, "epoch": 2.915946205571566, "grad_norm": 0.9765625, "learning_rate": 0.00041582216036831844, "loss": 4.6144, "mean_token_accuracy": 0.2499750643968582, "num_tokens": 69611091.0, "step": 30355 }, { "entropy": 5.137284612655639, "epoch": 2.9164265129683, "grad_norm": 1.0390625, "learning_rate": 0.000415795420221979, "loss": 4.7759, "mean_token_accuracy": 0.2369133248925209, "num_tokens": 69621882.0, "step": 30360 }, { "entropy": 5.088633012771607, "epoch": 2.9169068203650337, "grad_norm": 1.0234375, "learning_rate": 0.00041576867680678803, "loss": 4.7074, "mean_token_accuracy": 0.2345858931541443, "num_tokens": 69634673.0, "step": 30365 }, { "entropy": 5.076688671112061, "epoch": 2.9173871277617676, "grad_norm": 1.125, "learning_rate": 0.0004157419301233664, "loss": 4.7224, "mean_token_accuracy": 0.24228535294532777, "num_tokens": 69644950.0, "step": 30370 }, { "entropy": 5.0582098960876465, "epoch": 2.9178674351585014, "grad_norm": 1.5, "learning_rate": 0.00041571518017233505, "loss": 4.7228, "mean_token_accuracy": 0.2429313540458679, "num_tokens": 69656750.0, "step": 30375 }, { "entropy": 5.109919261932373, "epoch": 2.9183477425552353, "grad_norm": 1.0625, "learning_rate": 0.0004156884269543151, "loss": 4.6876, "mean_token_accuracy": 0.24676658511161803, "num_tokens": 69667945.0, "step": 30380 }, { "entropy": 5.120557546615601, "epoch": 2.918828049951969, "grad_norm": 1.0546875, "learning_rate": 0.0004156616704699275, "loss": 4.7351, "mean_token_accuracy": 0.24469823092222215, "num_tokens": 69679995.0, "step": 30385 }, { "entropy": 5.047893619537353, "epoch": 2.919308357348703, "grad_norm": 1.015625, "learning_rate": 0.00041563491071979375, "loss": 4.6986, "mean_token_accuracy": 0.24317895323038102, "num_tokens": 69691029.0, "step": 30390 }, { "entropy": 5.051618337631226, "epoch": 2.919788664745437, "grad_norm": 1.078125, "learning_rate": 0.00041560814770453495, "loss": 4.638, "mean_token_accuracy": 0.24778493344783784, "num_tokens": 69702497.0, "step": 30395 }, { "entropy": 5.079649162292481, "epoch": 2.920268972142171, "grad_norm": 1.140625, "learning_rate": 0.00041558138142477235, "loss": 4.7508, "mean_token_accuracy": 0.23697479963302612, "num_tokens": 69713127.0, "step": 30400 }, { "entropy": 5.047370290756225, "epoch": 2.9207492795389047, "grad_norm": 1.015625, "learning_rate": 0.00041555461188112763, "loss": 4.7005, "mean_token_accuracy": 0.24249713867902756, "num_tokens": 69725314.0, "step": 30405 }, { "entropy": 4.973897171020508, "epoch": 2.921229586935639, "grad_norm": 1.078125, "learning_rate": 0.00041552783907422217, "loss": 4.6301, "mean_token_accuracy": 0.24654280990362168, "num_tokens": 69736696.0, "step": 30410 }, { "entropy": 5.017396688461304, "epoch": 2.9217098943323725, "grad_norm": 1.0625, "learning_rate": 0.0004155010630046775, "loss": 4.681, "mean_token_accuracy": 0.24379423260688782, "num_tokens": 69748639.0, "step": 30415 }, { "entropy": 5.086289119720459, "epoch": 2.922190201729107, "grad_norm": 0.99609375, "learning_rate": 0.0004154742836731155, "loss": 4.7823, "mean_token_accuracy": 0.23747721016407014, "num_tokens": 69760124.0, "step": 30420 }, { "entropy": 5.0709563255310055, "epoch": 2.9226705091258407, "grad_norm": 0.9921875, "learning_rate": 0.0004154475010801576, "loss": 4.7064, "mean_token_accuracy": 0.2438757374882698, "num_tokens": 69772280.0, "step": 30425 }, { "entropy": 5.042429447174072, "epoch": 2.9231508165225746, "grad_norm": 1.0703125, "learning_rate": 0.00041542071522642583, "loss": 4.692, "mean_token_accuracy": 0.24266358464956284, "num_tokens": 69784308.0, "step": 30430 }, { "entropy": 5.0773824691772464, "epoch": 2.9236311239193085, "grad_norm": 1.0859375, "learning_rate": 0.0004153939261125421, "loss": 4.744, "mean_token_accuracy": 0.23616353273391724, "num_tokens": 69795761.0, "step": 30435 }, { "entropy": 5.0080304622650145, "epoch": 2.9241114313160423, "grad_norm": 1.046875, "learning_rate": 0.0004153671337391281, "loss": 4.5837, "mean_token_accuracy": 0.2585609257221222, "num_tokens": 69805581.0, "step": 30440 }, { "entropy": 5.037991571426391, "epoch": 2.9245917387127762, "grad_norm": 1.0, "learning_rate": 0.0004153403381068062, "loss": 4.7104, "mean_token_accuracy": 0.2361886367201805, "num_tokens": 69816593.0, "step": 30445 }, { "entropy": 5.021226501464843, "epoch": 2.92507204610951, "grad_norm": 1.03125, "learning_rate": 0.00041531353921619833, "loss": 4.7122, "mean_token_accuracy": 0.24069665968418122, "num_tokens": 69828424.0, "step": 30450 }, { "entropy": 5.104537153244019, "epoch": 2.925552353506244, "grad_norm": 0.94140625, "learning_rate": 0.0004152867370679267, "loss": 4.6893, "mean_token_accuracy": 0.24043979048728942, "num_tokens": 69839250.0, "step": 30455 }, { "entropy": 5.0309265613555905, "epoch": 2.926032660902978, "grad_norm": 1.0078125, "learning_rate": 0.00041525993166261366, "loss": 4.6449, "mean_token_accuracy": 0.24812211096286774, "num_tokens": 69850910.0, "step": 30460 }, { "entropy": 5.049241256713867, "epoch": 2.9265129682997117, "grad_norm": 1.0078125, "learning_rate": 0.0004152331230008814, "loss": 4.6886, "mean_token_accuracy": 0.24405804723501207, "num_tokens": 69863048.0, "step": 30465 }, { "entropy": 5.099570608139038, "epoch": 2.9269932756964456, "grad_norm": 1.0390625, "learning_rate": 0.00041520631108335254, "loss": 4.7521, "mean_token_accuracy": 0.23641604334115982, "num_tokens": 69873512.0, "step": 30470 }, { "entropy": 5.15320987701416, "epoch": 2.9274735830931795, "grad_norm": 1.0625, "learning_rate": 0.0004151794959106494, "loss": 4.8425, "mean_token_accuracy": 0.23239615708589553, "num_tokens": 69884813.0, "step": 30475 }, { "entropy": 5.053913259506226, "epoch": 2.9279538904899134, "grad_norm": 1.0546875, "learning_rate": 0.0004151526774833947, "loss": 4.7175, "mean_token_accuracy": 0.24499699771404265, "num_tokens": 69896832.0, "step": 30480 }, { "entropy": 5.05629334449768, "epoch": 2.9284341978866477, "grad_norm": 0.98046875, "learning_rate": 0.00041512585580221086, "loss": 4.6807, "mean_token_accuracy": 0.24927377551794053, "num_tokens": 69908181.0, "step": 30485 }, { "entropy": 5.101822566986084, "epoch": 2.928914505283381, "grad_norm": 1.0703125, "learning_rate": 0.0004150990308677208, "loss": 4.7579, "mean_token_accuracy": 0.23209206461906434, "num_tokens": 69918839.0, "step": 30490 }, { "entropy": 5.16028847694397, "epoch": 2.9293948126801155, "grad_norm": 1.0, "learning_rate": 0.00041507220268054737, "loss": 4.8004, "mean_token_accuracy": 0.23220235705375672, "num_tokens": 69929637.0, "step": 30495 }, { "entropy": 5.0922932624816895, "epoch": 2.929875120076849, "grad_norm": 1.1015625, "learning_rate": 0.0004150453712413131, "loss": 4.7395, "mean_token_accuracy": 0.23931036442518233, "num_tokens": 69940202.0, "step": 30500 }, { "entropy": 5.084680032730103, "epoch": 2.9303554274735832, "grad_norm": 1.1484375, "learning_rate": 0.00041501853655064134, "loss": 4.6797, "mean_token_accuracy": 0.24100466072559357, "num_tokens": 69951049.0, "step": 30505 }, { "entropy": 5.038010835647583, "epoch": 2.930835734870317, "grad_norm": 1.0390625, "learning_rate": 0.0004149916986091548, "loss": 4.7313, "mean_token_accuracy": 0.23879321068525314, "num_tokens": 69962950.0, "step": 30510 }, { "entropy": 5.068257236480713, "epoch": 2.931316042267051, "grad_norm": 1.03125, "learning_rate": 0.0004149648574174768, "loss": 4.7867, "mean_token_accuracy": 0.23279785811901094, "num_tokens": 69974961.0, "step": 30515 }, { "entropy": 5.1210166931152346, "epoch": 2.931796349663785, "grad_norm": 1.0546875, "learning_rate": 0.0004149380129762304, "loss": 4.7003, "mean_token_accuracy": 0.24840328395366668, "num_tokens": 69985184.0, "step": 30520 }, { "entropy": 4.960989856719971, "epoch": 2.9322766570605188, "grad_norm": 1.046875, "learning_rate": 0.0004149111652860389, "loss": 4.6017, "mean_token_accuracy": 0.2513887107372284, "num_tokens": 69996773.0, "step": 30525 }, { "entropy": 5.100791120529175, "epoch": 2.9327569644572526, "grad_norm": 0.9765625, "learning_rate": 0.0004148843143475255, "loss": 4.72, "mean_token_accuracy": 0.24242961257696152, "num_tokens": 70008834.0, "step": 30530 }, { "entropy": 5.047762680053711, "epoch": 2.9332372718539865, "grad_norm": 1.046875, "learning_rate": 0.0004148574601613137, "loss": 4.6659, "mean_token_accuracy": 0.24508444368839263, "num_tokens": 70019708.0, "step": 30535 }, { "entropy": 5.204137897491455, "epoch": 2.9337175792507204, "grad_norm": 1.0234375, "learning_rate": 0.0004148306027280271, "loss": 4.8914, "mean_token_accuracy": 0.22802623510360717, "num_tokens": 70029680.0, "step": 30540 }, { "entropy": 5.0172882080078125, "epoch": 2.9341978866474543, "grad_norm": 0.9921875, "learning_rate": 0.00041480374204828896, "loss": 4.6428, "mean_token_accuracy": 0.2494751915335655, "num_tokens": 70041473.0, "step": 30545 }, { "entropy": 5.061795854568482, "epoch": 2.934678194044188, "grad_norm": 1.1015625, "learning_rate": 0.00041477687812272314, "loss": 4.7419, "mean_token_accuracy": 0.24007183611392974, "num_tokens": 70051646.0, "step": 30550 }, { "entropy": 5.126728296279907, "epoch": 2.935158501440922, "grad_norm": 1.0, "learning_rate": 0.00041475001095195324, "loss": 4.7568, "mean_token_accuracy": 0.23328060656785965, "num_tokens": 70063795.0, "step": 30555 }, { "entropy": 5.114735889434814, "epoch": 2.9356388088376564, "grad_norm": 1.0859375, "learning_rate": 0.0004147231405366031, "loss": 4.692, "mean_token_accuracy": 0.24236190021038057, "num_tokens": 70074481.0, "step": 30560 }, { "entropy": 4.975852680206299, "epoch": 2.93611911623439, "grad_norm": 1.0625, "learning_rate": 0.0004146962668772965, "loss": 4.6963, "mean_token_accuracy": 0.24521246701478958, "num_tokens": 70085939.0, "step": 30565 }, { "entropy": 5.053282451629639, "epoch": 2.936599423631124, "grad_norm": 1.015625, "learning_rate": 0.00041466938997465744, "loss": 4.7489, "mean_token_accuracy": 0.23896246999502183, "num_tokens": 70097466.0, "step": 30570 }, { "entropy": 5.067724561691284, "epoch": 2.9370797310278576, "grad_norm": 1.0390625, "learning_rate": 0.00041464250982930974, "loss": 4.6551, "mean_token_accuracy": 0.24233511537313462, "num_tokens": 70109720.0, "step": 30575 }, { "entropy": 5.153060054779052, "epoch": 2.937560038424592, "grad_norm": 0.99609375, "learning_rate": 0.00041461562644187777, "loss": 4.8261, "mean_token_accuracy": 0.2364367201924324, "num_tokens": 70121101.0, "step": 30580 }, { "entropy": 5.115277433395386, "epoch": 2.938040345821326, "grad_norm": 1.03125, "learning_rate": 0.00041458873981298547, "loss": 4.7482, "mean_token_accuracy": 0.24553181678056718, "num_tokens": 70131512.0, "step": 30585 }, { "entropy": 5.103408145904541, "epoch": 2.9385206532180597, "grad_norm": 1.0546875, "learning_rate": 0.00041456184994325714, "loss": 4.7965, "mean_token_accuracy": 0.23667109608650208, "num_tokens": 70142873.0, "step": 30590 }, { "entropy": 5.090587425231933, "epoch": 2.9390009606147935, "grad_norm": 1.0390625, "learning_rate": 0.00041453495683331694, "loss": 4.7584, "mean_token_accuracy": 0.2342955946922302, "num_tokens": 70154476.0, "step": 30595 }, { "entropy": 5.174236869812011, "epoch": 2.9394812680115274, "grad_norm": 0.99609375, "learning_rate": 0.00041450806048378954, "loss": 4.7685, "mean_token_accuracy": 0.23725222200155258, "num_tokens": 70165439.0, "step": 30600 }, { "entropy": 5.062460136413574, "epoch": 2.9399615754082613, "grad_norm": 1.0390625, "learning_rate": 0.0004144811608952992, "loss": 4.7285, "mean_token_accuracy": 0.2378845065832138, "num_tokens": 70176515.0, "step": 30605 }, { "entropy": 5.014520120620728, "epoch": 2.940441882804995, "grad_norm": 1.140625, "learning_rate": 0.00041445425806847043, "loss": 4.6886, "mean_token_accuracy": 0.24345237016677856, "num_tokens": 70189064.0, "step": 30610 }, { "entropy": 5.020877265930176, "epoch": 2.940922190201729, "grad_norm": 1.0078125, "learning_rate": 0.00041442735200392783, "loss": 4.7009, "mean_token_accuracy": 0.23655428886413574, "num_tokens": 70200984.0, "step": 30615 }, { "entropy": 5.021997499465942, "epoch": 2.941402497598463, "grad_norm": 1.0625, "learning_rate": 0.0004144004427022962, "loss": 4.687, "mean_token_accuracy": 0.23773080855607986, "num_tokens": 70212071.0, "step": 30620 }, { "entropy": 5.130324935913086, "epoch": 2.941882804995197, "grad_norm": 1.0703125, "learning_rate": 0.00041437353016420025, "loss": 4.7864, "mean_token_accuracy": 0.23806737065315248, "num_tokens": 70223395.0, "step": 30625 }, { "entropy": 5.040337181091308, "epoch": 2.9423631123919307, "grad_norm": 1.1015625, "learning_rate": 0.0004143466143902648, "loss": 4.6063, "mean_token_accuracy": 0.25084982961416247, "num_tokens": 70233130.0, "step": 30630 }, { "entropy": 5.104285478591919, "epoch": 2.9428434197886646, "grad_norm": 1.0234375, "learning_rate": 0.00041431969538111463, "loss": 4.7762, "mean_token_accuracy": 0.23282581716775894, "num_tokens": 70245227.0, "step": 30635 }, { "entropy": 5.101534843444824, "epoch": 2.9433237271853985, "grad_norm": 1.0078125, "learning_rate": 0.0004142927731373749, "loss": 4.6754, "mean_token_accuracy": 0.24160946905612946, "num_tokens": 70256483.0, "step": 30640 }, { "entropy": 5.066981267929077, "epoch": 2.943804034582133, "grad_norm": 1.0, "learning_rate": 0.0004142658476596706, "loss": 4.7047, "mean_token_accuracy": 0.24397629797458648, "num_tokens": 70267484.0, "step": 30645 }, { "entropy": 5.141232967376709, "epoch": 2.9442843419788662, "grad_norm": 1.0546875, "learning_rate": 0.00041423891894862687, "loss": 4.8418, "mean_token_accuracy": 0.23121191561222076, "num_tokens": 70278137.0, "step": 30650 }, { "entropy": 5.089439582824707, "epoch": 2.9447646493756006, "grad_norm": 1.0703125, "learning_rate": 0.0004142119870048689, "loss": 4.7201, "mean_token_accuracy": 0.2364410549402237, "num_tokens": 70289255.0, "step": 30655 }, { "entropy": 5.108114242553711, "epoch": 2.9452449567723344, "grad_norm": 1.140625, "learning_rate": 0.0004141850518290219, "loss": 4.7759, "mean_token_accuracy": 0.24463301599025727, "num_tokens": 70300672.0, "step": 30660 }, { "entropy": 5.047538185119629, "epoch": 2.9457252641690683, "grad_norm": 1.0078125, "learning_rate": 0.00041415811342171134, "loss": 4.6993, "mean_token_accuracy": 0.2412917673587799, "num_tokens": 70311964.0, "step": 30665 }, { "entropy": 5.017793035507202, "epoch": 2.946205571565802, "grad_norm": 0.94140625, "learning_rate": 0.0004141311717835625, "loss": 4.6647, "mean_token_accuracy": 0.24118449687957763, "num_tokens": 70323354.0, "step": 30670 }, { "entropy": 5.121945190429687, "epoch": 2.946685878962536, "grad_norm": 1.046875, "learning_rate": 0.00041410422691520114, "loss": 4.8108, "mean_token_accuracy": 0.22432501018047332, "num_tokens": 70335432.0, "step": 30675 }, { "entropy": 5.122028827667236, "epoch": 2.94716618635927, "grad_norm": 1.1640625, "learning_rate": 0.00041407727881725265, "loss": 4.7586, "mean_token_accuracy": 0.2394048422574997, "num_tokens": 70347012.0, "step": 30680 }, { "entropy": 5.01928391456604, "epoch": 2.947646493756004, "grad_norm": 1.015625, "learning_rate": 0.0004140503274903426, "loss": 4.7025, "mean_token_accuracy": 0.2462942734360695, "num_tokens": 70358680.0, "step": 30685 }, { "entropy": 4.992564296722412, "epoch": 2.9481268011527377, "grad_norm": 1.078125, "learning_rate": 0.000414023372935097, "loss": 4.5839, "mean_token_accuracy": 0.25254774689674375, "num_tokens": 70369081.0, "step": 30690 }, { "entropy": 5.092398643493652, "epoch": 2.9486071085494716, "grad_norm": 1.0390625, "learning_rate": 0.00041399641515214137, "loss": 4.7644, "mean_token_accuracy": 0.23195332586765288, "num_tokens": 70380831.0, "step": 30695 }, { "entropy": 5.062392044067383, "epoch": 2.9490874159462055, "grad_norm": 0.99609375, "learning_rate": 0.0004139694541421018, "loss": 4.7069, "mean_token_accuracy": 0.24111398011446, "num_tokens": 70392403.0, "step": 30700 }, { "entropy": 5.103695678710937, "epoch": 2.9495677233429394, "grad_norm": 1.1328125, "learning_rate": 0.0004139424899056042, "loss": 4.7827, "mean_token_accuracy": 0.23661188781261444, "num_tokens": 70404332.0, "step": 30705 }, { "entropy": 5.041910696029663, "epoch": 2.9500480307396733, "grad_norm": 1.0078125, "learning_rate": 0.00041391552244327446, "loss": 4.6822, "mean_token_accuracy": 0.24355427026748658, "num_tokens": 70416119.0, "step": 30710 }, { "entropy": 5.151484203338623, "epoch": 2.950528338136407, "grad_norm": 1.078125, "learning_rate": 0.0004138885517557387, "loss": 4.7472, "mean_token_accuracy": 0.23242215514183046, "num_tokens": 70425874.0, "step": 30715 }, { "entropy": 5.123206567764282, "epoch": 2.9510086455331415, "grad_norm": 1.0546875, "learning_rate": 0.0004138615778436234, "loss": 4.7905, "mean_token_accuracy": 0.23205123245716094, "num_tokens": 70435755.0, "step": 30720 }, { "entropy": 4.9721211910247805, "epoch": 2.951488952929875, "grad_norm": 0.99609375, "learning_rate": 0.00041383460070755447, "loss": 4.5833, "mean_token_accuracy": 0.25036960244178774, "num_tokens": 70446764.0, "step": 30725 }, { "entropy": 5.033250236511231, "epoch": 2.9519692603266092, "grad_norm": 1.0390625, "learning_rate": 0.00041380762034815834, "loss": 4.6268, "mean_token_accuracy": 0.25020085871219633, "num_tokens": 70457635.0, "step": 30730 }, { "entropy": 5.122746801376342, "epoch": 2.952449567723343, "grad_norm": 1.140625, "learning_rate": 0.00041378063676606147, "loss": 4.7993, "mean_token_accuracy": 0.23039833158254625, "num_tokens": 70469082.0, "step": 30735 }, { "entropy": 5.082587003707886, "epoch": 2.952929875120077, "grad_norm": 0.95703125, "learning_rate": 0.00041375364996189035, "loss": 4.7299, "mean_token_accuracy": 0.24270468205213547, "num_tokens": 70481634.0, "step": 30740 }, { "entropy": 5.108648157119751, "epoch": 2.953410182516811, "grad_norm": 1.0390625, "learning_rate": 0.00041372665993627143, "loss": 4.76, "mean_token_accuracy": 0.24007008969783783, "num_tokens": 70493320.0, "step": 30745 }, { "entropy": 5.084290456771851, "epoch": 2.9538904899135447, "grad_norm": 0.99609375, "learning_rate": 0.00041369966668983144, "loss": 4.6791, "mean_token_accuracy": 0.24068784862756729, "num_tokens": 70504998.0, "step": 30750 }, { "entropy": 5.084942245483399, "epoch": 2.9543707973102786, "grad_norm": 1.0546875, "learning_rate": 0.00041367267022319706, "loss": 4.7026, "mean_token_accuracy": 0.24692281186580659, "num_tokens": 70516096.0, "step": 30755 }, { "entropy": 5.053485965728759, "epoch": 2.9548511047070125, "grad_norm": 0.97265625, "learning_rate": 0.000413645670536995, "loss": 4.7645, "mean_token_accuracy": 0.23790201544761658, "num_tokens": 70527193.0, "step": 30760 }, { "entropy": 5.153659057617188, "epoch": 2.9553314121037464, "grad_norm": 1.015625, "learning_rate": 0.0004136186676318522, "loss": 4.8127, "mean_token_accuracy": 0.22736653536558152, "num_tokens": 70538733.0, "step": 30765 }, { "entropy": 5.118077564239502, "epoch": 2.9558117195004803, "grad_norm": 1.078125, "learning_rate": 0.0004135916615083956, "loss": 4.6531, "mean_token_accuracy": 0.24210628718137742, "num_tokens": 70549381.0, "step": 30770 }, { "entropy": 5.057273435592651, "epoch": 2.956292026897214, "grad_norm": 0.96875, "learning_rate": 0.00041356465216725195, "loss": 4.7175, "mean_token_accuracy": 0.2406172752380371, "num_tokens": 70560576.0, "step": 30775 }, { "entropy": 5.035550498962403, "epoch": 2.956772334293948, "grad_norm": 1.046875, "learning_rate": 0.00041353763960904873, "loss": 4.703, "mean_token_accuracy": 0.24104924649000167, "num_tokens": 70572093.0, "step": 30780 }, { "entropy": 5.135087013244629, "epoch": 2.957252641690682, "grad_norm": 1.1015625, "learning_rate": 0.00041351062383441286, "loss": 4.6771, "mean_token_accuracy": 0.24282461851835252, "num_tokens": 70581567.0, "step": 30785 }, { "entropy": 5.044927167892456, "epoch": 2.957732949087416, "grad_norm": 1.109375, "learning_rate": 0.0004134836048439716, "loss": 4.7113, "mean_token_accuracy": 0.2433431163430214, "num_tokens": 70593729.0, "step": 30790 }, { "entropy": 5.036692953109741, "epoch": 2.95821325648415, "grad_norm": 1.0390625, "learning_rate": 0.00041345658263835215, "loss": 4.7166, "mean_token_accuracy": 0.24356502592563628, "num_tokens": 70605816.0, "step": 30795 }, { "entropy": 5.0685828685760494, "epoch": 2.9586935638808836, "grad_norm": 1.140625, "learning_rate": 0.00041342955721818207, "loss": 4.7186, "mean_token_accuracy": 0.239371594786644, "num_tokens": 70618310.0, "step": 30800 }, { "entropy": 5.079177808761597, "epoch": 2.959173871277618, "grad_norm": 0.96484375, "learning_rate": 0.00041340252858408866, "loss": 4.7261, "mean_token_accuracy": 0.23834066540002824, "num_tokens": 70629842.0, "step": 30805 }, { "entropy": 5.098159217834473, "epoch": 2.9596541786743513, "grad_norm": 1.0390625, "learning_rate": 0.00041337549673669963, "loss": 4.8281, "mean_token_accuracy": 0.2345459997653961, "num_tokens": 70642564.0, "step": 30810 }, { "entropy": 5.130438327789307, "epoch": 2.9601344860710856, "grad_norm": 1.0234375, "learning_rate": 0.0004133484616766423, "loss": 4.7584, "mean_token_accuracy": 0.2377777561545372, "num_tokens": 70653225.0, "step": 30815 }, { "entropy": 5.1525421142578125, "epoch": 2.9606147934678195, "grad_norm": 1.046875, "learning_rate": 0.00041332142340454463, "loss": 4.756, "mean_token_accuracy": 0.2319066643714905, "num_tokens": 70665428.0, "step": 30820 }, { "entropy": 5.115548801422119, "epoch": 2.9610951008645534, "grad_norm": 1.1328125, "learning_rate": 0.0004132943819210342, "loss": 4.8091, "mean_token_accuracy": 0.23419805765151977, "num_tokens": 70676441.0, "step": 30825 }, { "entropy": 4.977198314666748, "epoch": 2.9615754082612873, "grad_norm": 1.046875, "learning_rate": 0.00041326733722673876, "loss": 4.5935, "mean_token_accuracy": 0.25220203697681426, "num_tokens": 70687561.0, "step": 30830 }, { "entropy": 5.093676567077637, "epoch": 2.962055715658021, "grad_norm": 1.03125, "learning_rate": 0.00041324028932228645, "loss": 4.7668, "mean_token_accuracy": 0.23979503959417342, "num_tokens": 70699949.0, "step": 30835 }, { "entropy": 5.060846853256225, "epoch": 2.962536023054755, "grad_norm": 1.0546875, "learning_rate": 0.0004132132382083051, "loss": 4.6801, "mean_token_accuracy": 0.24920041114091873, "num_tokens": 70711176.0, "step": 30840 }, { "entropy": 5.1384134769439695, "epoch": 2.963016330451489, "grad_norm": 1.109375, "learning_rate": 0.00041318618388542274, "loss": 4.7248, "mean_token_accuracy": 0.23304660767316818, "num_tokens": 70722974.0, "step": 30845 }, { "entropy": 4.942797088623047, "epoch": 2.963496637848223, "grad_norm": 0.99609375, "learning_rate": 0.0004131591263542675, "loss": 4.5784, "mean_token_accuracy": 0.2528442844748497, "num_tokens": 70735158.0, "step": 30850 }, { "entropy": 5.0456760883331295, "epoch": 2.9639769452449567, "grad_norm": 0.96875, "learning_rate": 0.0004131320656154676, "loss": 4.7117, "mean_token_accuracy": 0.23798534274101257, "num_tokens": 70746444.0, "step": 30855 }, { "entropy": 5.114153432846069, "epoch": 2.9644572526416906, "grad_norm": 1.078125, "learning_rate": 0.0004131050016696514, "loss": 4.6967, "mean_token_accuracy": 0.24223651736974716, "num_tokens": 70757985.0, "step": 30860 }, { "entropy": 5.092724561691284, "epoch": 2.9649375600384245, "grad_norm": 0.984375, "learning_rate": 0.000413077934517447, "loss": 4.7346, "mean_token_accuracy": 0.23988196402788162, "num_tokens": 70769867.0, "step": 30865 }, { "entropy": 5.087793588638306, "epoch": 2.9654178674351583, "grad_norm": 1.1875, "learning_rate": 0.0004130508641594831, "loss": 4.7746, "mean_token_accuracy": 0.23190059661865234, "num_tokens": 70781610.0, "step": 30870 }, { "entropy": 5.124959897994995, "epoch": 2.965898174831892, "grad_norm": 1.1015625, "learning_rate": 0.00041302379059638794, "loss": 4.6942, "mean_token_accuracy": 0.2405524343252182, "num_tokens": 70793336.0, "step": 30875 }, { "entropy": 5.0523522853851315, "epoch": 2.9663784822286265, "grad_norm": 1.078125, "learning_rate": 0.00041299671382879024, "loss": 4.6792, "mean_token_accuracy": 0.24535722136497498, "num_tokens": 70802714.0, "step": 30880 }, { "entropy": 5.112238979339599, "epoch": 2.96685878962536, "grad_norm": 1.046875, "learning_rate": 0.0004129696338573187, "loss": 4.8274, "mean_token_accuracy": 0.23021909594535828, "num_tokens": 70813300.0, "step": 30885 }, { "entropy": 5.15190544128418, "epoch": 2.9673390970220943, "grad_norm": 1.0546875, "learning_rate": 0.0004129425506826019, "loss": 4.852, "mean_token_accuracy": 0.22498805224895477, "num_tokens": 70824828.0, "step": 30890 }, { "entropy": 5.073496341705322, "epoch": 2.967819404418828, "grad_norm": 1.0, "learning_rate": 0.00041291546430526863, "loss": 4.7641, "mean_token_accuracy": 0.2451646074652672, "num_tokens": 70836788.0, "step": 30895 }, { "entropy": 5.084818410873413, "epoch": 2.968299711815562, "grad_norm": 1.0390625, "learning_rate": 0.0004128883747259478, "loss": 4.6997, "mean_token_accuracy": 0.2437497243285179, "num_tokens": 70849308.0, "step": 30900 }, { "entropy": 5.208883285522461, "epoch": 2.968780019212296, "grad_norm": 1.1328125, "learning_rate": 0.0004128612819452684, "loss": 4.8601, "mean_token_accuracy": 0.22244778871536255, "num_tokens": 70859987.0, "step": 30905 }, { "entropy": 5.175189065933227, "epoch": 2.96926032660903, "grad_norm": 1.1015625, "learning_rate": 0.00041283418596385944, "loss": 4.7633, "mean_token_accuracy": 0.24494120478630066, "num_tokens": 70872096.0, "step": 30910 }, { "entropy": 5.02183575630188, "epoch": 2.9697406340057637, "grad_norm": 1.078125, "learning_rate": 0.0004128070867823499, "loss": 4.6654, "mean_token_accuracy": 0.24081777781248093, "num_tokens": 70884336.0, "step": 30915 }, { "entropy": 4.991226148605347, "epoch": 2.9702209414024976, "grad_norm": 1.1953125, "learning_rate": 0.000412779984401369, "loss": 4.656, "mean_token_accuracy": 0.25205512493848803, "num_tokens": 70894707.0, "step": 30920 }, { "entropy": 5.095764636993408, "epoch": 2.9707012487992315, "grad_norm": 1.1875, "learning_rate": 0.000412752878821546, "loss": 4.7567, "mean_token_accuracy": 0.23996711522340775, "num_tokens": 70906461.0, "step": 30925 }, { "entropy": 5.088328266143799, "epoch": 2.9711815561959654, "grad_norm": 1.0, "learning_rate": 0.00041272577004351026, "loss": 4.6148, "mean_token_accuracy": 0.24836140722036362, "num_tokens": 70917282.0, "step": 30930 }, { "entropy": 5.056382131576538, "epoch": 2.9716618635926992, "grad_norm": 1.046875, "learning_rate": 0.00041269865806789095, "loss": 4.6492, "mean_token_accuracy": 0.2463774561882019, "num_tokens": 70928253.0, "step": 30935 }, { "entropy": 5.009007358551026, "epoch": 2.972142170989433, "grad_norm": 1.171875, "learning_rate": 0.0004126715428953178, "loss": 4.6959, "mean_token_accuracy": 0.23733440935611724, "num_tokens": 70939089.0, "step": 30940 }, { "entropy": 5.088467168807983, "epoch": 2.972622478386167, "grad_norm": 1.078125, "learning_rate": 0.0004126444245264202, "loss": 4.6494, "mean_token_accuracy": 0.24308509677648543, "num_tokens": 70949533.0, "step": 30945 }, { "entropy": 5.080672836303711, "epoch": 2.973102785782901, "grad_norm": 1.015625, "learning_rate": 0.0004126173029618278, "loss": 4.6657, "mean_token_accuracy": 0.2412404879927635, "num_tokens": 70961449.0, "step": 30950 }, { "entropy": 5.045295476913452, "epoch": 2.973583093179635, "grad_norm": 1.046875, "learning_rate": 0.0004125901782021702, "loss": 4.7037, "mean_token_accuracy": 0.24060867428779603, "num_tokens": 70973008.0, "step": 30955 }, { "entropy": 5.009149217605591, "epoch": 2.9740634005763686, "grad_norm": 1.03125, "learning_rate": 0.0004125630502480773, "loss": 4.6909, "mean_token_accuracy": 0.2449464187026024, "num_tokens": 70984170.0, "step": 30960 }, { "entropy": 5.0899590969085695, "epoch": 2.974543707973103, "grad_norm": 1.1953125, "learning_rate": 0.0004125359191001788, "loss": 4.7568, "mean_token_accuracy": 0.23973051458597183, "num_tokens": 70994900.0, "step": 30965 }, { "entropy": 5.081603336334228, "epoch": 2.975024015369837, "grad_norm": 0.99609375, "learning_rate": 0.0004125087847591047, "loss": 4.694, "mean_token_accuracy": 0.24175404906272888, "num_tokens": 71007430.0, "step": 30970 }, { "entropy": 5.079427099227905, "epoch": 2.9755043227665707, "grad_norm": 1.0, "learning_rate": 0.00041248164722548493, "loss": 4.7742, "mean_token_accuracy": 0.23830578327178956, "num_tokens": 71020306.0, "step": 30975 }, { "entropy": 5.0432030200958256, "epoch": 2.9759846301633046, "grad_norm": 0.9765625, "learning_rate": 0.0004124545064999495, "loss": 4.6331, "mean_token_accuracy": 0.24684105515480043, "num_tokens": 71031114.0, "step": 30980 }, { "entropy": 4.98370327949524, "epoch": 2.9764649375600385, "grad_norm": 0.97265625, "learning_rate": 0.00041242736258312866, "loss": 4.6545, "mean_token_accuracy": 0.24585938602685928, "num_tokens": 71043289.0, "step": 30985 }, { "entropy": 5.014498090744018, "epoch": 2.9769452449567724, "grad_norm": 0.98828125, "learning_rate": 0.0004124002154756525, "loss": 4.6412, "mean_token_accuracy": 0.2520130693912506, "num_tokens": 71055163.0, "step": 30990 }, { "entropy": 5.01092038154602, "epoch": 2.9774255523535063, "grad_norm": 1.046875, "learning_rate": 0.00041237306517815124, "loss": 4.648, "mean_token_accuracy": 0.24476557970046997, "num_tokens": 71067830.0, "step": 30995 }, { "entropy": 5.04044828414917, "epoch": 2.97790585975024, "grad_norm": 1.0625, "learning_rate": 0.0004123459116912554, "loss": 4.632, "mean_token_accuracy": 0.25297962725162504, "num_tokens": 71079242.0, "step": 31000 }, { "entropy": 5.093390607833863, "epoch": 2.978386167146974, "grad_norm": 1.09375, "learning_rate": 0.00041231875501559535, "loss": 4.7382, "mean_token_accuracy": 0.23652055859565735, "num_tokens": 71091076.0, "step": 31005 }, { "entropy": 5.1234206199646, "epoch": 2.978866474543708, "grad_norm": 1.125, "learning_rate": 0.00041229159515180155, "loss": 4.6592, "mean_token_accuracy": 0.2390012726187706, "num_tokens": 71101226.0, "step": 31010 }, { "entropy": 5.055122804641724, "epoch": 2.979346781940442, "grad_norm": 1.0078125, "learning_rate": 0.0004122644321005046, "loss": 4.5985, "mean_token_accuracy": 0.24532987773418427, "num_tokens": 71112689.0, "step": 31015 }, { "entropy": 5.007474517822265, "epoch": 2.9798270893371757, "grad_norm": 1.1953125, "learning_rate": 0.00041223726586233505, "loss": 4.7317, "mean_token_accuracy": 0.23529644906520844, "num_tokens": 71124256.0, "step": 31020 }, { "entropy": 5.05922384262085, "epoch": 2.9803073967339095, "grad_norm": 1.09375, "learning_rate": 0.00041221009643792377, "loss": 4.7223, "mean_token_accuracy": 0.24540119916200637, "num_tokens": 71134636.0, "step": 31025 }, { "entropy": 5.042840385437012, "epoch": 2.980787704130644, "grad_norm": 1.015625, "learning_rate": 0.0004121829238279014, "loss": 4.6302, "mean_token_accuracy": 0.2476649507880211, "num_tokens": 71145969.0, "step": 31030 }, { "entropy": 5.046140527725219, "epoch": 2.9812680115273773, "grad_norm": 1.0, "learning_rate": 0.00041215574803289896, "loss": 4.7102, "mean_token_accuracy": 0.24303379356861116, "num_tokens": 71157491.0, "step": 31035 }, { "entropy": 5.035579776763916, "epoch": 2.9817483189241116, "grad_norm": 1.015625, "learning_rate": 0.0004121285690535473, "loss": 4.759, "mean_token_accuracy": 0.23988830000162126, "num_tokens": 71169325.0, "step": 31040 }, { "entropy": 5.052158689498901, "epoch": 2.982228626320845, "grad_norm": 1.0390625, "learning_rate": 0.00041210138689047745, "loss": 4.7013, "mean_token_accuracy": 0.24333547949790954, "num_tokens": 71181179.0, "step": 31045 }, { "entropy": 5.0599668502807615, "epoch": 2.9827089337175794, "grad_norm": 0.99609375, "learning_rate": 0.0004120742015443206, "loss": 4.7178, "mean_token_accuracy": 0.2443860277533531, "num_tokens": 71193250.0, "step": 31050 }, { "entropy": 4.984365558624267, "epoch": 2.9831892411143133, "grad_norm": 0.95703125, "learning_rate": 0.0004120470130157077, "loss": 4.6182, "mean_token_accuracy": 0.25142668187618256, "num_tokens": 71207544.0, "step": 31055 }, { "entropy": 5.091627693176269, "epoch": 2.983669548511047, "grad_norm": 1.0625, "learning_rate": 0.00041201982130527006, "loss": 4.7437, "mean_token_accuracy": 0.2435833767056465, "num_tokens": 71219989.0, "step": 31060 }, { "entropy": 5.089881372451782, "epoch": 2.984149855907781, "grad_norm": 1.0078125, "learning_rate": 0.00041199262641363914, "loss": 4.7047, "mean_token_accuracy": 0.23696542531251907, "num_tokens": 71232455.0, "step": 31065 }, { "entropy": 5.069763135910034, "epoch": 2.984630163304515, "grad_norm": 0.9609375, "learning_rate": 0.00041196542834144617, "loss": 4.6223, "mean_token_accuracy": 0.24865677803754807, "num_tokens": 71244179.0, "step": 31070 }, { "entropy": 5.01129002571106, "epoch": 2.985110470701249, "grad_norm": 1.078125, "learning_rate": 0.00041193822708932265, "loss": 4.7036, "mean_token_accuracy": 0.24350427836179733, "num_tokens": 71255468.0, "step": 31075 }, { "entropy": 5.0638265132904055, "epoch": 2.9855907780979827, "grad_norm": 1.0390625, "learning_rate": 0.0004119110226579002, "loss": 4.7421, "mean_token_accuracy": 0.24235102981328965, "num_tokens": 71266192.0, "step": 31080 }, { "entropy": 5.00823655128479, "epoch": 2.9860710854947166, "grad_norm": 1.0546875, "learning_rate": 0.00041188381504781026, "loss": 4.6432, "mean_token_accuracy": 0.24189938753843307, "num_tokens": 71277166.0, "step": 31085 }, { "entropy": 5.074084186553955, "epoch": 2.9865513928914504, "grad_norm": 1.0234375, "learning_rate": 0.0004118566042596846, "loss": 4.6538, "mean_token_accuracy": 0.2429642543196678, "num_tokens": 71288077.0, "step": 31090 }, { "entropy": 5.0013875484466555, "epoch": 2.9870317002881843, "grad_norm": 1.0, "learning_rate": 0.000411829390294155, "loss": 4.6541, "mean_token_accuracy": 0.2529631584882736, "num_tokens": 71300423.0, "step": 31095 }, { "entropy": 5.069975185394287, "epoch": 2.987512007684918, "grad_norm": 1.0078125, "learning_rate": 0.00041180217315185333, "loss": 4.6413, "mean_token_accuracy": 0.24462012946605682, "num_tokens": 71312265.0, "step": 31100 }, { "entropy": 4.979261779785157, "epoch": 2.9879923150816525, "grad_norm": 0.9609375, "learning_rate": 0.00041177495283341124, "loss": 4.6162, "mean_token_accuracy": 0.24670014828443526, "num_tokens": 71323369.0, "step": 31105 }, { "entropy": 5.056342935562133, "epoch": 2.988472622478386, "grad_norm": 1.0703125, "learning_rate": 0.000411747729339461, "loss": 4.6817, "mean_token_accuracy": 0.24661661982536315, "num_tokens": 71333977.0, "step": 31110 }, { "entropy": 4.998224973678589, "epoch": 2.9889529298751203, "grad_norm": 1.09375, "learning_rate": 0.0004117205026706345, "loss": 4.6844, "mean_token_accuracy": 0.24256878048181535, "num_tokens": 71345176.0, "step": 31115 }, { "entropy": 5.1082902431488035, "epoch": 2.9894332372718537, "grad_norm": 1.03125, "learning_rate": 0.00041169327282756396, "loss": 4.7581, "mean_token_accuracy": 0.2398600995540619, "num_tokens": 71357310.0, "step": 31120 }, { "entropy": 5.189077949523925, "epoch": 2.989913544668588, "grad_norm": 1.0078125, "learning_rate": 0.0004116660398108815, "loss": 4.8598, "mean_token_accuracy": 0.23253771811723709, "num_tokens": 71369610.0, "step": 31125 }, { "entropy": 5.117611217498779, "epoch": 2.990393852065322, "grad_norm": 1.0625, "learning_rate": 0.0004116388036212194, "loss": 4.8285, "mean_token_accuracy": 0.2345252439379692, "num_tokens": 71381973.0, "step": 31130 }, { "entropy": 5.016928100585938, "epoch": 2.990874159462056, "grad_norm": 1.0234375, "learning_rate": 0.00041161156425921004, "loss": 4.691, "mean_token_accuracy": 0.24259027391672133, "num_tokens": 71393739.0, "step": 31135 }, { "entropy": 5.103773975372315, "epoch": 2.9913544668587897, "grad_norm": 1.0234375, "learning_rate": 0.00041158432172548577, "loss": 4.7692, "mean_token_accuracy": 0.23819297105073928, "num_tokens": 71405205.0, "step": 31140 }, { "entropy": 5.047510766983033, "epoch": 2.9918347742555236, "grad_norm": 0.98828125, "learning_rate": 0.00041155707602067923, "loss": 4.631, "mean_token_accuracy": 0.24435337632894516, "num_tokens": 71416645.0, "step": 31145 }, { "entropy": 5.104709720611572, "epoch": 2.9923150816522575, "grad_norm": 0.99609375, "learning_rate": 0.0004115298271454227, "loss": 4.7448, "mean_token_accuracy": 0.23368489295244216, "num_tokens": 71428854.0, "step": 31150 }, { "entropy": 5.063992261886597, "epoch": 2.9927953890489913, "grad_norm": 1.0703125, "learning_rate": 0.0004115025751003491, "loss": 4.6758, "mean_token_accuracy": 0.24616134017705918, "num_tokens": 71439950.0, "step": 31155 }, { "entropy": 5.045139503479004, "epoch": 2.993275696445725, "grad_norm": 1.09375, "learning_rate": 0.000411475319886091, "loss": 4.6799, "mean_token_accuracy": 0.2455049678683281, "num_tokens": 71451901.0, "step": 31160 }, { "entropy": 5.191359376907348, "epoch": 2.993756003842459, "grad_norm": 1.0546875, "learning_rate": 0.00041144806150328117, "loss": 4.8261, "mean_token_accuracy": 0.24100210070610045, "num_tokens": 71463688.0, "step": 31165 }, { "entropy": 5.174281358718872, "epoch": 2.994236311239193, "grad_norm": 1.0234375, "learning_rate": 0.0004114207999525527, "loss": 4.7989, "mean_token_accuracy": 0.2344155117869377, "num_tokens": 71475392.0, "step": 31170 }, { "entropy": 5.080008792877197, "epoch": 2.994716618635927, "grad_norm": 0.95703125, "learning_rate": 0.00041139353523453814, "loss": 4.734, "mean_token_accuracy": 0.24095366150140762, "num_tokens": 71486004.0, "step": 31175 }, { "entropy": 5.104208135604859, "epoch": 2.9951969260326607, "grad_norm": 1.0390625, "learning_rate": 0.0004113662673498708, "loss": 4.8248, "mean_token_accuracy": 0.23622356951236725, "num_tokens": 71497850.0, "step": 31180 }, { "entropy": 5.087505769729614, "epoch": 2.9956772334293946, "grad_norm": 1.09375, "learning_rate": 0.00041133899629918364, "loss": 4.6633, "mean_token_accuracy": 0.2459829866886139, "num_tokens": 71507778.0, "step": 31185 }, { "entropy": 5.126943159103393, "epoch": 2.996157540826129, "grad_norm": 0.99609375, "learning_rate": 0.00041131172208310986, "loss": 4.7444, "mean_token_accuracy": 0.23904307037591935, "num_tokens": 71519506.0, "step": 31190 }, { "entropy": 5.041216564178467, "epoch": 2.9966378482228624, "grad_norm": 1.0625, "learning_rate": 0.00041128444470228253, "loss": 4.6498, "mean_token_accuracy": 0.24707060307264328, "num_tokens": 71530523.0, "step": 31195 }, { "entropy": 5.0975563526153564, "epoch": 2.9971181556195967, "grad_norm": 1.015625, "learning_rate": 0.00041125716415733524, "loss": 4.7557, "mean_token_accuracy": 0.23416275084018706, "num_tokens": 71541473.0, "step": 31200 }, { "entropy": 5.041246032714843, "epoch": 2.9975984630163306, "grad_norm": 1.0625, "learning_rate": 0.0004112298804489011, "loss": 4.6961, "mean_token_accuracy": 0.24679289758205414, "num_tokens": 71552012.0, "step": 31205 }, { "entropy": 5.099580383300781, "epoch": 2.9980787704130645, "grad_norm": 1.0546875, "learning_rate": 0.0004112025935776137, "loss": 4.7124, "mean_token_accuracy": 0.23619519770145417, "num_tokens": 71562239.0, "step": 31210 }, { "entropy": 5.034806203842163, "epoch": 2.9985590778097984, "grad_norm": 1.046875, "learning_rate": 0.00041117530354410647, "loss": 4.6254, "mean_token_accuracy": 0.2423456683754921, "num_tokens": 71573296.0, "step": 31215 }, { "entropy": 5.062445402145386, "epoch": 2.9990393852065322, "grad_norm": 1.03125, "learning_rate": 0.0004111480103490131, "loss": 4.7671, "mean_token_accuracy": 0.2363523632287979, "num_tokens": 71584659.0, "step": 31220 }, { "entropy": 5.051762437820434, "epoch": 2.999519692603266, "grad_norm": 1.1953125, "learning_rate": 0.00041112071399296724, "loss": 4.6701, "mean_token_accuracy": 0.24363380372524263, "num_tokens": 71595672.0, "step": 31225 }, { "entropy": 5.046540021896362, "epoch": 3.0, "grad_norm": 1.1328125, "learning_rate": 0.0004110934144766025, "loss": 4.6338, "mean_token_accuracy": 0.2554298684000969, "num_tokens": 71605608.0, "step": 31230 }, { "entropy": 5.051793956756592, "epoch": 3.000480307396734, "grad_norm": 1.1015625, "learning_rate": 0.00041106611180055284, "loss": 4.6857, "mean_token_accuracy": 0.23950463086366652, "num_tokens": 71617542.0, "step": 31235 }, { "entropy": 4.998350143432617, "epoch": 3.0009606147934678, "grad_norm": 1.125, "learning_rate": 0.00041103880596545206, "loss": 4.5653, "mean_token_accuracy": 0.2447400540113449, "num_tokens": 71629087.0, "step": 31240 }, { "entropy": 5.108054494857788, "epoch": 3.0014409221902016, "grad_norm": 1.0390625, "learning_rate": 0.0004110114969719342, "loss": 4.6754, "mean_token_accuracy": 0.24601958692073822, "num_tokens": 71640929.0, "step": 31245 }, { "entropy": 5.113180303573609, "epoch": 3.0019212295869355, "grad_norm": 0.99609375, "learning_rate": 0.0004109841848206333, "loss": 4.6396, "mean_token_accuracy": 0.24572105705738068, "num_tokens": 71652641.0, "step": 31250 }, { "entropy": 5.016724395751953, "epoch": 3.0024015369836694, "grad_norm": 1.109375, "learning_rate": 0.0004109568695121833, "loss": 4.5788, "mean_token_accuracy": 0.24781061559915543, "num_tokens": 71664482.0, "step": 31255 }, { "entropy": 4.972348403930664, "epoch": 3.0028818443804033, "grad_norm": 0.9609375, "learning_rate": 0.0004109295510472186, "loss": 4.5179, "mean_token_accuracy": 0.25019769221544264, "num_tokens": 71675568.0, "step": 31260 }, { "entropy": 5.152708053588867, "epoch": 3.003362151777137, "grad_norm": 1.0234375, "learning_rate": 0.00041090222942637323, "loss": 4.693, "mean_token_accuracy": 0.23853697031736373, "num_tokens": 71685731.0, "step": 31265 }, { "entropy": 5.118443155288697, "epoch": 3.0038424591738715, "grad_norm": 1.078125, "learning_rate": 0.00041087490465028175, "loss": 4.7673, "mean_token_accuracy": 0.2384372115135193, "num_tokens": 71697212.0, "step": 31270 }, { "entropy": 4.985861873626709, "epoch": 3.0043227665706054, "grad_norm": 1.0546875, "learning_rate": 0.00041084757671957844, "loss": 4.5955, "mean_token_accuracy": 0.24781534671783448, "num_tokens": 71708444.0, "step": 31275 }, { "entropy": 5.059682369232178, "epoch": 3.0048030739673393, "grad_norm": 1.078125, "learning_rate": 0.00041082024563489773, "loss": 4.6412, "mean_token_accuracy": 0.24888246655464172, "num_tokens": 71718645.0, "step": 31280 }, { "entropy": 5.0275898456573485, "epoch": 3.005283381364073, "grad_norm": 1.078125, "learning_rate": 0.0004107929113968743, "loss": 4.5796, "mean_token_accuracy": 0.24466054141521454, "num_tokens": 71728798.0, "step": 31285 }, { "entropy": 5.090812873840332, "epoch": 3.005763688760807, "grad_norm": 1.046875, "learning_rate": 0.0004107655740061427, "loss": 4.7235, "mean_token_accuracy": 0.23481216579675673, "num_tokens": 71741237.0, "step": 31290 }, { "entropy": 5.096500730514526, "epoch": 3.006243996157541, "grad_norm": 1.0078125, "learning_rate": 0.0004107382334633375, "loss": 4.6834, "mean_token_accuracy": 0.24862920194864274, "num_tokens": 71753755.0, "step": 31295 }, { "entropy": 5.12779860496521, "epoch": 3.006724303554275, "grad_norm": 1.109375, "learning_rate": 0.0004107108897690936, "loss": 4.6643, "mean_token_accuracy": 0.24606042802333833, "num_tokens": 71765447.0, "step": 31300 }, { "entropy": 5.089486980438233, "epoch": 3.0072046109510087, "grad_norm": 1.03125, "learning_rate": 0.0004106835429240458, "loss": 4.6896, "mean_token_accuracy": 0.24308264255523682, "num_tokens": 71777106.0, "step": 31305 }, { "entropy": 5.017534351348877, "epoch": 3.0076849183477425, "grad_norm": 1.015625, "learning_rate": 0.0004106561929288292, "loss": 4.5891, "mean_token_accuracy": 0.24899385422468184, "num_tokens": 71789226.0, "step": 31310 }, { "entropy": 5.0785496711730955, "epoch": 3.0081652257444764, "grad_norm": 1.1484375, "learning_rate": 0.00041062883978407844, "loss": 4.6816, "mean_token_accuracy": 0.24357341527938842, "num_tokens": 71800754.0, "step": 31315 }, { "entropy": 5.0972143650054935, "epoch": 3.0086455331412103, "grad_norm": 1.015625, "learning_rate": 0.00041060148349042876, "loss": 4.7153, "mean_token_accuracy": 0.24523738622665406, "num_tokens": 71812107.0, "step": 31320 }, { "entropy": 5.081027412414551, "epoch": 3.009125840537944, "grad_norm": 1.1015625, "learning_rate": 0.00041057412404851536, "loss": 4.6248, "mean_token_accuracy": 0.24740613400936126, "num_tokens": 71822031.0, "step": 31325 }, { "entropy": 5.061629343032837, "epoch": 3.009606147934678, "grad_norm": 1.1640625, "learning_rate": 0.0004105467614589734, "loss": 4.6452, "mean_token_accuracy": 0.2387404829263687, "num_tokens": 71833744.0, "step": 31330 }, { "entropy": 5.074277353286743, "epoch": 3.010086455331412, "grad_norm": 1.03125, "learning_rate": 0.0004105193957224381, "loss": 4.5915, "mean_token_accuracy": 0.25017276108264924, "num_tokens": 71845385.0, "step": 31335 }, { "entropy": 5.0031942367553714, "epoch": 3.010566762728146, "grad_norm": 1.484375, "learning_rate": 0.00041049202683954473, "loss": 4.4989, "mean_token_accuracy": 0.2636774554848671, "num_tokens": 71858065.0, "step": 31340 }, { "entropy": 4.990669107437133, "epoch": 3.0110470701248797, "grad_norm": 1.0625, "learning_rate": 0.00041046465481092893, "loss": 4.5891, "mean_token_accuracy": 0.24581137001514436, "num_tokens": 71869553.0, "step": 31345 }, { "entropy": 5.045453214645386, "epoch": 3.011527377521614, "grad_norm": 0.9453125, "learning_rate": 0.00041043727963722607, "loss": 4.6313, "mean_token_accuracy": 0.24703803807497024, "num_tokens": 71881196.0, "step": 31350 }, { "entropy": 5.046996307373047, "epoch": 3.012007684918348, "grad_norm": 1.0078125, "learning_rate": 0.0004104099013190718, "loss": 4.6513, "mean_token_accuracy": 0.2366416335105896, "num_tokens": 71893323.0, "step": 31355 }, { "entropy": 5.045062065124512, "epoch": 3.012487992315082, "grad_norm": 0.96484375, "learning_rate": 0.00041038251985710164, "loss": 4.5923, "mean_token_accuracy": 0.24852931946516038, "num_tokens": 71904679.0, "step": 31360 }, { "entropy": 5.050914001464844, "epoch": 3.0129682997118157, "grad_norm": 0.98828125, "learning_rate": 0.0004103551352519514, "loss": 4.6881, "mean_token_accuracy": 0.25243211090564727, "num_tokens": 71916478.0, "step": 31365 }, { "entropy": 5.066346502304077, "epoch": 3.0134486071085496, "grad_norm": 1.0703125, "learning_rate": 0.00041032774750425683, "loss": 4.6731, "mean_token_accuracy": 0.24854901880025865, "num_tokens": 71928203.0, "step": 31370 }, { "entropy": 5.051746463775634, "epoch": 3.0139289145052834, "grad_norm": 1.1640625, "learning_rate": 0.0004103003566146538, "loss": 4.5767, "mean_token_accuracy": 0.24995588511228561, "num_tokens": 71939118.0, "step": 31375 }, { "entropy": 5.079462766647339, "epoch": 3.0144092219020173, "grad_norm": 1.0625, "learning_rate": 0.0004102729625837782, "loss": 4.7, "mean_token_accuracy": 0.2432078868150711, "num_tokens": 71950964.0, "step": 31380 }, { "entropy": 4.987093687057495, "epoch": 3.014889529298751, "grad_norm": 1.1640625, "learning_rate": 0.0004102455654122662, "loss": 4.5463, "mean_token_accuracy": 0.25703095048666, "num_tokens": 71960587.0, "step": 31385 }, { "entropy": 5.0261084079742435, "epoch": 3.015369836695485, "grad_norm": 1.0234375, "learning_rate": 0.00041021816510075366, "loss": 4.601, "mean_token_accuracy": 0.24622438251972198, "num_tokens": 71972014.0, "step": 31390 }, { "entropy": 5.0990345001220705, "epoch": 3.015850144092219, "grad_norm": 1.1171875, "learning_rate": 0.00041019076164987696, "loss": 4.6506, "mean_token_accuracy": 0.24761470407247543, "num_tokens": 71982842.0, "step": 31395 }, { "entropy": 5.067615079879761, "epoch": 3.016330451488953, "grad_norm": 1.125, "learning_rate": 0.0004101633550602721, "loss": 4.5783, "mean_token_accuracy": 0.24062599539756774, "num_tokens": 71994450.0, "step": 31400 }, { "entropy": 4.987122678756714, "epoch": 3.0168107588856867, "grad_norm": 1.0078125, "learning_rate": 0.0004101359453325755, "loss": 4.533, "mean_token_accuracy": 0.24856770038604736, "num_tokens": 72005490.0, "step": 31405 }, { "entropy": 5.035937976837158, "epoch": 3.0172910662824206, "grad_norm": 1.0234375, "learning_rate": 0.00041010853246742357, "loss": 4.6569, "mean_token_accuracy": 0.24072497636079787, "num_tokens": 72016723.0, "step": 31410 }, { "entropy": 4.9769618034362795, "epoch": 3.0177713736791545, "grad_norm": 1.046875, "learning_rate": 0.0004100811164654527, "loss": 4.5524, "mean_token_accuracy": 0.2532226011157036, "num_tokens": 72027189.0, "step": 31415 }, { "entropy": 5.104083442687989, "epoch": 3.0182516810758884, "grad_norm": 0.98828125, "learning_rate": 0.0004100536973272994, "loss": 4.7135, "mean_token_accuracy": 0.24044599682092666, "num_tokens": 72039662.0, "step": 31420 }, { "entropy": 5.051034784317016, "epoch": 3.0187319884726227, "grad_norm": 0.98828125, "learning_rate": 0.0004100262750536003, "loss": 4.6436, "mean_token_accuracy": 0.24606235325336456, "num_tokens": 72050769.0, "step": 31425 }, { "entropy": 5.049038457870483, "epoch": 3.0192122958693566, "grad_norm": 1.03125, "learning_rate": 0.00040999884964499196, "loss": 4.6587, "mean_token_accuracy": 0.24487811475992202, "num_tokens": 72062657.0, "step": 31430 }, { "entropy": 5.040445280075073, "epoch": 3.0196926032660905, "grad_norm": 0.98046875, "learning_rate": 0.00040997142110211127, "loss": 4.6133, "mean_token_accuracy": 0.25074315518140794, "num_tokens": 72074805.0, "step": 31435 }, { "entropy": 5.018621397018433, "epoch": 3.0201729106628243, "grad_norm": 0.9921875, "learning_rate": 0.00040994398942559496, "loss": 4.6005, "mean_token_accuracy": 0.25499855279922484, "num_tokens": 72085842.0, "step": 31440 }, { "entropy": 5.127358055114746, "epoch": 3.020653218059558, "grad_norm": 1.09375, "learning_rate": 0.0004099165546160799, "loss": 4.7126, "mean_token_accuracy": 0.23831754177808762, "num_tokens": 72096750.0, "step": 31445 }, { "entropy": 5.018946027755737, "epoch": 3.021133525456292, "grad_norm": 1.1015625, "learning_rate": 0.00040988911667420305, "loss": 4.6262, "mean_token_accuracy": 0.24294245690107347, "num_tokens": 72108842.0, "step": 31450 }, { "entropy": 5.105809259414673, "epoch": 3.021613832853026, "grad_norm": 1.015625, "learning_rate": 0.0004098616756006015, "loss": 4.6976, "mean_token_accuracy": 0.23442134708166124, "num_tokens": 72119582.0, "step": 31455 }, { "entropy": 5.097371196746826, "epoch": 3.02209414024976, "grad_norm": 1.046875, "learning_rate": 0.0004098342313959122, "loss": 4.7045, "mean_token_accuracy": 0.24634762108325958, "num_tokens": 72129984.0, "step": 31460 }, { "entropy": 5.103570604324341, "epoch": 3.0225744476464937, "grad_norm": 0.99609375, "learning_rate": 0.0004098067840607725, "loss": 4.6752, "mean_token_accuracy": 0.23882693648338318, "num_tokens": 72142160.0, "step": 31465 }, { "entropy": 5.148630142211914, "epoch": 3.0230547550432276, "grad_norm": 1.03125, "learning_rate": 0.0004097793335958195, "loss": 4.7492, "mean_token_accuracy": 0.23690109103918075, "num_tokens": 72153691.0, "step": 31470 }, { "entropy": 5.104357385635376, "epoch": 3.0235350624399615, "grad_norm": 1.03125, "learning_rate": 0.00040975188000169074, "loss": 4.6469, "mean_token_accuracy": 0.24874790906906127, "num_tokens": 72165123.0, "step": 31475 }, { "entropy": 5.050551652908325, "epoch": 3.0240153698366954, "grad_norm": 1.1015625, "learning_rate": 0.00040972442327902325, "loss": 4.622, "mean_token_accuracy": 0.2462276890873909, "num_tokens": 72177006.0, "step": 31480 }, { "entropy": 5.109850168228149, "epoch": 3.0244956772334293, "grad_norm": 1.046875, "learning_rate": 0.0004096969634284549, "loss": 4.7331, "mean_token_accuracy": 0.23722968250513077, "num_tokens": 72189185.0, "step": 31485 }, { "entropy": 5.044874048233032, "epoch": 3.024975984630163, "grad_norm": 0.96484375, "learning_rate": 0.0004096695004506229, "loss": 4.6689, "mean_token_accuracy": 0.24475235491991043, "num_tokens": 72201530.0, "step": 31490 }, { "entropy": 5.059578609466553, "epoch": 3.025456292026897, "grad_norm": 0.96875, "learning_rate": 0.00040964203434616496, "loss": 4.6296, "mean_token_accuracy": 0.24872735887765884, "num_tokens": 72213514.0, "step": 31495 }, { "entropy": 5.025071525573731, "epoch": 3.025936599423631, "grad_norm": 1.1015625, "learning_rate": 0.0004096145651157189, "loss": 4.6059, "mean_token_accuracy": 0.2459734320640564, "num_tokens": 72224811.0, "step": 31500 }, { "entropy": 5.109280920028686, "epoch": 3.0264169068203652, "grad_norm": 1.0390625, "learning_rate": 0.0004095870927599223, "loss": 4.7321, "mean_token_accuracy": 0.24028065651655198, "num_tokens": 72236201.0, "step": 31505 }, { "entropy": 5.123455333709717, "epoch": 3.026897214217099, "grad_norm": 1.0546875, "learning_rate": 0.00040955961727941306, "loss": 4.6981, "mean_token_accuracy": 0.23767663836479186, "num_tokens": 72248963.0, "step": 31510 }, { "entropy": 5.1077268600463865, "epoch": 3.027377521613833, "grad_norm": 1.0703125, "learning_rate": 0.0004095321386748291, "loss": 4.6995, "mean_token_accuracy": 0.24405122101306914, "num_tokens": 72259489.0, "step": 31515 }, { "entropy": 4.999540519714356, "epoch": 3.027857829010567, "grad_norm": 1.0234375, "learning_rate": 0.00040950465694680825, "loss": 4.6631, "mean_token_accuracy": 0.2449243649840355, "num_tokens": 72271124.0, "step": 31520 }, { "entropy": 5.109373617172241, "epoch": 3.0283381364073008, "grad_norm": 1.046875, "learning_rate": 0.00040947717209598877, "loss": 4.6778, "mean_token_accuracy": 0.24264902174472808, "num_tokens": 72281144.0, "step": 31525 }, { "entropy": 5.048031806945801, "epoch": 3.0288184438040346, "grad_norm": 0.9765625, "learning_rate": 0.00040944968412300867, "loss": 4.6363, "mean_token_accuracy": 0.2484264850616455, "num_tokens": 72293422.0, "step": 31530 }, { "entropy": 5.077163648605347, "epoch": 3.0292987512007685, "grad_norm": 1.0703125, "learning_rate": 0.00040942219302850605, "loss": 4.6672, "mean_token_accuracy": 0.24520911127328873, "num_tokens": 72304450.0, "step": 31535 }, { "entropy": 5.0657103061676025, "epoch": 3.0297790585975024, "grad_norm": 1.046875, "learning_rate": 0.0004093946988131193, "loss": 4.593, "mean_token_accuracy": 0.25350708812475203, "num_tokens": 72315000.0, "step": 31540 }, { "entropy": 5.075184917449951, "epoch": 3.0302593659942363, "grad_norm": 0.99609375, "learning_rate": 0.0004093672014774868, "loss": 4.6842, "mean_token_accuracy": 0.2380824714899063, "num_tokens": 72327268.0, "step": 31545 }, { "entropy": 5.029738235473633, "epoch": 3.03073967339097, "grad_norm": 1.078125, "learning_rate": 0.00040933970102224675, "loss": 4.5874, "mean_token_accuracy": 0.25333288311958313, "num_tokens": 72338371.0, "step": 31550 }, { "entropy": 5.065733528137207, "epoch": 3.031219980787704, "grad_norm": 0.984375, "learning_rate": 0.00040931219744803774, "loss": 4.6277, "mean_token_accuracy": 0.2469482719898224, "num_tokens": 72349211.0, "step": 31555 }, { "entropy": 5.114510488510132, "epoch": 3.031700288184438, "grad_norm": 1.0078125, "learning_rate": 0.0004092846907554985, "loss": 4.6983, "mean_token_accuracy": 0.24264060258865355, "num_tokens": 72361634.0, "step": 31560 }, { "entropy": 5.035785341262818, "epoch": 3.032180595581172, "grad_norm": 1.046875, "learning_rate": 0.00040925718094526724, "loss": 4.6533, "mean_token_accuracy": 0.24493586719036103, "num_tokens": 72374071.0, "step": 31565 }, { "entropy": 5.134601068496704, "epoch": 3.0326609029779057, "grad_norm": 1.078125, "learning_rate": 0.00040922966801798305, "loss": 4.763, "mean_token_accuracy": 0.23960959017276764, "num_tokens": 72386272.0, "step": 31570 }, { "entropy": 5.095665788650512, "epoch": 3.0331412103746396, "grad_norm": 0.9765625, "learning_rate": 0.00040920215197428456, "loss": 4.6388, "mean_token_accuracy": 0.24907959252595901, "num_tokens": 72398832.0, "step": 31575 }, { "entropy": 5.14979248046875, "epoch": 3.0336215177713735, "grad_norm": 1.0, "learning_rate": 0.00040917463281481053, "loss": 4.7366, "mean_token_accuracy": 0.2345603808760643, "num_tokens": 72409970.0, "step": 31580 }, { "entropy": 5.038898181915283, "epoch": 3.034101825168108, "grad_norm": 0.9296875, "learning_rate": 0.0004091471105402, "loss": 4.6299, "mean_token_accuracy": 0.2518488377332687, "num_tokens": 72420640.0, "step": 31585 }, { "entropy": 5.0984334468841555, "epoch": 3.0345821325648417, "grad_norm": 1.0234375, "learning_rate": 0.0004091195851510918, "loss": 4.692, "mean_token_accuracy": 0.24351897835731506, "num_tokens": 72432111.0, "step": 31590 }, { "entropy": 5.010867547988892, "epoch": 3.0350624399615755, "grad_norm": 1.0390625, "learning_rate": 0.0004090920566481252, "loss": 4.5947, "mean_token_accuracy": 0.25169342905282976, "num_tokens": 72444427.0, "step": 31595 }, { "entropy": 5.098999500274658, "epoch": 3.0355427473583094, "grad_norm": 1.0625, "learning_rate": 0.0004090645250319392, "loss": 4.7229, "mean_token_accuracy": 0.2429332360625267, "num_tokens": 72454955.0, "step": 31600 }, { "entropy": 5.021222496032715, "epoch": 3.0360230547550433, "grad_norm": 1.03125, "learning_rate": 0.000409036990303173, "loss": 4.5377, "mean_token_accuracy": 0.25624181628227233, "num_tokens": 72464914.0, "step": 31605 }, { "entropy": 4.999936819076538, "epoch": 3.036503362151777, "grad_norm": 0.9921875, "learning_rate": 0.0004090094524624658, "loss": 4.6868, "mean_token_accuracy": 0.250473652780056, "num_tokens": 72476379.0, "step": 31610 }, { "entropy": 5.097587633132934, "epoch": 3.036983669548511, "grad_norm": 1.0390625, "learning_rate": 0.00040898191151045717, "loss": 4.6602, "mean_token_accuracy": 0.2441670000553131, "num_tokens": 72488249.0, "step": 31615 }, { "entropy": 5.078186416625977, "epoch": 3.037463976945245, "grad_norm": 1.0078125, "learning_rate": 0.0004089543674477864, "loss": 4.6243, "mean_token_accuracy": 0.24243185818195342, "num_tokens": 72499365.0, "step": 31620 }, { "entropy": 5.031374263763428, "epoch": 3.037944284341979, "grad_norm": 1.109375, "learning_rate": 0.0004089268202750929, "loss": 4.619, "mean_token_accuracy": 0.245622855424881, "num_tokens": 72511231.0, "step": 31625 }, { "entropy": 5.039086246490479, "epoch": 3.0384245917387127, "grad_norm": 0.99609375, "learning_rate": 0.00040889926999301634, "loss": 4.625, "mean_token_accuracy": 0.2482289418578148, "num_tokens": 72524357.0, "step": 31630 }, { "entropy": 5.095204496383667, "epoch": 3.0389048991354466, "grad_norm": 1.0703125, "learning_rate": 0.0004088717166021964, "loss": 4.6793, "mean_token_accuracy": 0.24549530297517777, "num_tokens": 72537170.0, "step": 31635 }, { "entropy": 5.026371765136719, "epoch": 3.0393852065321805, "grad_norm": 1.09375, "learning_rate": 0.0004088441601032727, "loss": 4.5856, "mean_token_accuracy": 0.25135585814714434, "num_tokens": 72548631.0, "step": 31640 }, { "entropy": 5.005275440216065, "epoch": 3.0398655139289144, "grad_norm": 0.96484375, "learning_rate": 0.0004088166004968849, "loss": 4.5858, "mean_token_accuracy": 0.24760494828224183, "num_tokens": 72561787.0, "step": 31645 }, { "entropy": 5.027503967285156, "epoch": 3.0403458213256482, "grad_norm": 0.98828125, "learning_rate": 0.00040878903778367317, "loss": 4.6303, "mean_token_accuracy": 0.2522509038448334, "num_tokens": 72572480.0, "step": 31650 }, { "entropy": 5.0430741786956785, "epoch": 3.040826128722382, "grad_norm": 1.015625, "learning_rate": 0.0004087614719642772, "loss": 4.6666, "mean_token_accuracy": 0.2388513207435608, "num_tokens": 72584636.0, "step": 31655 }, { "entropy": 5.107775402069092, "epoch": 3.0413064361191164, "grad_norm": 0.9609375, "learning_rate": 0.00040873390303933693, "loss": 4.7112, "mean_token_accuracy": 0.23879486471414565, "num_tokens": 72595425.0, "step": 31660 }, { "entropy": 5.145121431350708, "epoch": 3.0417867435158503, "grad_norm": 1.078125, "learning_rate": 0.00040870633100949266, "loss": 4.6938, "mean_token_accuracy": 0.24287839978933334, "num_tokens": 72607344.0, "step": 31665 }, { "entropy": 5.0275026798248295, "epoch": 3.042267050912584, "grad_norm": 0.9375, "learning_rate": 0.00040867875587538436, "loss": 4.6932, "mean_token_accuracy": 0.24585120677947997, "num_tokens": 72619704.0, "step": 31670 }, { "entropy": 5.039684009552002, "epoch": 3.042747358309318, "grad_norm": 1.0, "learning_rate": 0.0004086511776376523, "loss": 4.5973, "mean_token_accuracy": 0.254203824698925, "num_tokens": 72632143.0, "step": 31675 }, { "entropy": 5.071047306060791, "epoch": 3.043227665706052, "grad_norm": 0.96484375, "learning_rate": 0.00040862359629693684, "loss": 4.632, "mean_token_accuracy": 0.245218189060688, "num_tokens": 72644581.0, "step": 31680 }, { "entropy": 5.1263385772705075, "epoch": 3.043707973102786, "grad_norm": 1.0, "learning_rate": 0.0004085960118538781, "loss": 4.699, "mean_token_accuracy": 0.24387965947389603, "num_tokens": 72656309.0, "step": 31685 }, { "entropy": 5.067390775680542, "epoch": 3.0441882804995197, "grad_norm": 0.98046875, "learning_rate": 0.0004085684243091168, "loss": 4.6763, "mean_token_accuracy": 0.2433240920305252, "num_tokens": 72668347.0, "step": 31690 }, { "entropy": 4.992222261428833, "epoch": 3.0446685878962536, "grad_norm": 1.046875, "learning_rate": 0.0004085408336632933, "loss": 4.5899, "mean_token_accuracy": 0.2498211979866028, "num_tokens": 72679375.0, "step": 31695 }, { "entropy": 5.048721837997436, "epoch": 3.0451488952929875, "grad_norm": 0.9921875, "learning_rate": 0.00040851323991704803, "loss": 4.6505, "mean_token_accuracy": 0.2475501537322998, "num_tokens": 72691020.0, "step": 31700 }, { "entropy": 5.035050344467163, "epoch": 3.0456292026897214, "grad_norm": 0.9921875, "learning_rate": 0.0004084856430710219, "loss": 4.6038, "mean_token_accuracy": 0.2471386671066284, "num_tokens": 72703215.0, "step": 31705 }, { "entropy": 5.072145318984985, "epoch": 3.0461095100864553, "grad_norm": 0.953125, "learning_rate": 0.0004084580431258555, "loss": 4.6137, "mean_token_accuracy": 0.24602895975112915, "num_tokens": 72715107.0, "step": 31710 }, { "entropy": 5.084800767898559, "epoch": 3.046589817483189, "grad_norm": 1.0234375, "learning_rate": 0.0004084304400821896, "loss": 4.6965, "mean_token_accuracy": 0.24280115067958832, "num_tokens": 72726301.0, "step": 31715 }, { "entropy": 5.033972501754761, "epoch": 3.047070124879923, "grad_norm": 0.98046875, "learning_rate": 0.0004084028339406651, "loss": 4.6276, "mean_token_accuracy": 0.24668311178684235, "num_tokens": 72738102.0, "step": 31720 }, { "entropy": 5.022152471542358, "epoch": 3.047550432276657, "grad_norm": 1.0703125, "learning_rate": 0.00040837522470192297, "loss": 4.7009, "mean_token_accuracy": 0.2479358971118927, "num_tokens": 72748840.0, "step": 31725 }, { "entropy": 4.960737705230713, "epoch": 3.0480307396733908, "grad_norm": 1.015625, "learning_rate": 0.0004083476123666041, "loss": 4.5325, "mean_token_accuracy": 0.2612351909279823, "num_tokens": 72760480.0, "step": 31730 }, { "entropy": 5.061456727981567, "epoch": 3.048511047070125, "grad_norm": 1.0703125, "learning_rate": 0.0004083199969353496, "loss": 4.5844, "mean_token_accuracy": 0.2589627534151077, "num_tokens": 72770446.0, "step": 31735 }, { "entropy": 4.988167381286621, "epoch": 3.048991354466859, "grad_norm": 1.0234375, "learning_rate": 0.00040829237840880075, "loss": 4.6273, "mean_token_accuracy": 0.24723846316337586, "num_tokens": 72782262.0, "step": 31740 }, { "entropy": 5.02937421798706, "epoch": 3.049471661863593, "grad_norm": 1.046875, "learning_rate": 0.00040826475678759855, "loss": 4.6373, "mean_token_accuracy": 0.2460236892104149, "num_tokens": 72793857.0, "step": 31745 }, { "entropy": 5.058753204345703, "epoch": 3.0499519692603267, "grad_norm": 1.046875, "learning_rate": 0.0004082371320723845, "loss": 4.6553, "mean_token_accuracy": 0.2392961248755455, "num_tokens": 72805396.0, "step": 31750 }, { "entropy": 5.1182475090026855, "epoch": 3.0504322766570606, "grad_norm": 1.0546875, "learning_rate": 0.00040820950426379986, "loss": 4.6899, "mean_token_accuracy": 0.24830501526594162, "num_tokens": 72816946.0, "step": 31755 }, { "entropy": 5.026547241210937, "epoch": 3.0509125840537945, "grad_norm": 1.0625, "learning_rate": 0.0004081818733624861, "loss": 4.5531, "mean_token_accuracy": 0.25185046941041944, "num_tokens": 72827589.0, "step": 31760 }, { "entropy": 5.005900955200195, "epoch": 3.0513928914505284, "grad_norm": 1.0, "learning_rate": 0.0004081542393690847, "loss": 4.6135, "mean_token_accuracy": 0.24803243577480316, "num_tokens": 72838333.0, "step": 31765 }, { "entropy": 5.0348255157470705, "epoch": 3.0518731988472623, "grad_norm": 1.1953125, "learning_rate": 0.0004081266022842372, "loss": 4.5967, "mean_token_accuracy": 0.25163368284702303, "num_tokens": 72850212.0, "step": 31770 }, { "entropy": 5.064818906784057, "epoch": 3.052353506243996, "grad_norm": 1.0078125, "learning_rate": 0.00040809896210858537, "loss": 4.5652, "mean_token_accuracy": 0.2413250043988228, "num_tokens": 72861406.0, "step": 31775 }, { "entropy": 5.032543516159057, "epoch": 3.05283381364073, "grad_norm": 1.046875, "learning_rate": 0.00040807131884277085, "loss": 4.6498, "mean_token_accuracy": 0.25353990495204926, "num_tokens": 72872563.0, "step": 31780 }, { "entropy": 5.070755195617676, "epoch": 3.053314121037464, "grad_norm": 1.078125, "learning_rate": 0.0004080436724874354, "loss": 4.6947, "mean_token_accuracy": 0.23914626091718674, "num_tokens": 72883718.0, "step": 31785 }, { "entropy": 5.148820590972901, "epoch": 3.053794428434198, "grad_norm": 0.96875, "learning_rate": 0.00040801602304322095, "loss": 4.7136, "mean_token_accuracy": 0.24600790739059447, "num_tokens": 72896423.0, "step": 31790 }, { "entropy": 5.160016441345215, "epoch": 3.0542747358309317, "grad_norm": 1.1875, "learning_rate": 0.00040798837051076944, "loss": 4.7666, "mean_token_accuracy": 0.23096111565828323, "num_tokens": 72907681.0, "step": 31795 }, { "entropy": 4.996777534484863, "epoch": 3.0547550432276656, "grad_norm": 1.0234375, "learning_rate": 0.00040796071489072286, "loss": 4.5336, "mean_token_accuracy": 0.25322402119636533, "num_tokens": 72919782.0, "step": 31800 }, { "entropy": 5.021294403076172, "epoch": 3.0552353506243994, "grad_norm": 1.0078125, "learning_rate": 0.0004079330561837233, "loss": 4.6145, "mean_token_accuracy": 0.2486003264784813, "num_tokens": 72931458.0, "step": 31805 }, { "entropy": 5.0960643768310545, "epoch": 3.0557156580211333, "grad_norm": 1.0859375, "learning_rate": 0.00040790539439041287, "loss": 4.7014, "mean_token_accuracy": 0.24244564771652222, "num_tokens": 72942415.0, "step": 31810 }, { "entropy": 4.990510129928589, "epoch": 3.0561959654178676, "grad_norm": 0.99609375, "learning_rate": 0.00040787772951143386, "loss": 4.4642, "mean_token_accuracy": 0.2599411576986313, "num_tokens": 72953134.0, "step": 31815 }, { "entropy": 4.966206312179565, "epoch": 3.0566762728146015, "grad_norm": 0.9609375, "learning_rate": 0.0004078500615474285, "loss": 4.5925, "mean_token_accuracy": 0.24848204404115676, "num_tokens": 72964390.0, "step": 31820 }, { "entropy": 4.9862120151519775, "epoch": 3.0571565802113354, "grad_norm": 1.09375, "learning_rate": 0.00040782239049903926, "loss": 4.6309, "mean_token_accuracy": 0.24385963827371598, "num_tokens": 72976899.0, "step": 31825 }, { "entropy": 5.071328496932983, "epoch": 3.0576368876080693, "grad_norm": 1.03125, "learning_rate": 0.00040779471636690845, "loss": 4.711, "mean_token_accuracy": 0.24568732529878617, "num_tokens": 72989074.0, "step": 31830 }, { "entropy": 5.058893823623658, "epoch": 3.058117195004803, "grad_norm": 1.0234375, "learning_rate": 0.00040776703915167866, "loss": 4.6436, "mean_token_accuracy": 0.2440729945898056, "num_tokens": 73000434.0, "step": 31835 }, { "entropy": 5.039112997055054, "epoch": 3.058597502401537, "grad_norm": 1.078125, "learning_rate": 0.00040773935885399254, "loss": 4.6076, "mean_token_accuracy": 0.24866542369127273, "num_tokens": 73011902.0, "step": 31840 }, { "entropy": 5.0925158023834225, "epoch": 3.059077809798271, "grad_norm": 1.1171875, "learning_rate": 0.0004077116754744926, "loss": 4.6983, "mean_token_accuracy": 0.24309301227331162, "num_tokens": 73022349.0, "step": 31845 }, { "entropy": 5.008359956741333, "epoch": 3.059558117195005, "grad_norm": 1.0703125, "learning_rate": 0.00040768398901382157, "loss": 4.57, "mean_token_accuracy": 0.25645024329423904, "num_tokens": 73033958.0, "step": 31850 }, { "entropy": 5.018523263931274, "epoch": 3.0600384245917387, "grad_norm": 0.94140625, "learning_rate": 0.0004076562994726223, "loss": 4.5625, "mean_token_accuracy": 0.25753410458564757, "num_tokens": 73045807.0, "step": 31855 }, { "entropy": 5.078787231445313, "epoch": 3.0605187319884726, "grad_norm": 1.0390625, "learning_rate": 0.0004076286068515378, "loss": 4.5796, "mean_token_accuracy": 0.24990254342556, "num_tokens": 73056883.0, "step": 31860 }, { "entropy": 5.0730164527893065, "epoch": 3.0609990393852065, "grad_norm": 1.0703125, "learning_rate": 0.0004076009111512108, "loss": 4.6016, "mean_token_accuracy": 0.2521222934126854, "num_tokens": 73067924.0, "step": 31865 }, { "entropy": 5.054807090759278, "epoch": 3.0614793467819403, "grad_norm": 0.98046875, "learning_rate": 0.0004075732123722844, "loss": 4.6469, "mean_token_accuracy": 0.24260211735963821, "num_tokens": 73079965.0, "step": 31870 }, { "entropy": 5.052465772628784, "epoch": 3.061959654178674, "grad_norm": 1.046875, "learning_rate": 0.0004075455105154016, "loss": 4.672, "mean_token_accuracy": 0.24341681152582167, "num_tokens": 73091986.0, "step": 31875 }, { "entropy": 5.114961290359497, "epoch": 3.062439961575408, "grad_norm": 0.9921875, "learning_rate": 0.00040751780558120573, "loss": 4.6762, "mean_token_accuracy": 0.24254262447357178, "num_tokens": 73102445.0, "step": 31880 }, { "entropy": 5.07858681678772, "epoch": 3.062920268972142, "grad_norm": 1.0859375, "learning_rate": 0.0004074900975703398, "loss": 4.6021, "mean_token_accuracy": 0.2494074746966362, "num_tokens": 73112921.0, "step": 31885 }, { "entropy": 5.085866212844849, "epoch": 3.063400576368876, "grad_norm": 1.015625, "learning_rate": 0.0004074623864834473, "loss": 4.609, "mean_token_accuracy": 0.24578240364789963, "num_tokens": 73123653.0, "step": 31890 }, { "entropy": 5.074773836135864, "epoch": 3.06388088376561, "grad_norm": 1.03125, "learning_rate": 0.0004074346723211715, "loss": 4.6255, "mean_token_accuracy": 0.25057290941476823, "num_tokens": 73135568.0, "step": 31895 }, { "entropy": 4.977847385406494, "epoch": 3.064361191162344, "grad_norm": 1.0546875, "learning_rate": 0.00040740695508415583, "loss": 4.5762, "mean_token_accuracy": 0.2518226861953735, "num_tokens": 73145964.0, "step": 31900 }, { "entropy": 4.93131742477417, "epoch": 3.064841498559078, "grad_norm": 1.1171875, "learning_rate": 0.00040737923477304386, "loss": 4.5278, "mean_token_accuracy": 0.2546772018074989, "num_tokens": 73156941.0, "step": 31905 }, { "entropy": 5.045769786834716, "epoch": 3.065321805955812, "grad_norm": 1.0703125, "learning_rate": 0.00040735151138847917, "loss": 4.555, "mean_token_accuracy": 0.24858633130788804, "num_tokens": 73167928.0, "step": 31910 }, { "entropy": 4.972653770446778, "epoch": 3.0658021133525457, "grad_norm": 0.93359375, "learning_rate": 0.0004073237849311053, "loss": 4.5367, "mean_token_accuracy": 0.25381753146648406, "num_tokens": 73179861.0, "step": 31915 }, { "entropy": 5.035164880752563, "epoch": 3.0662824207492796, "grad_norm": 1.125, "learning_rate": 0.0004072960554015661, "loss": 4.6648, "mean_token_accuracy": 0.24390652775764465, "num_tokens": 73191152.0, "step": 31920 }, { "entropy": 5.026795482635498, "epoch": 3.0667627281460135, "grad_norm": 0.98046875, "learning_rate": 0.0004072683228005055, "loss": 4.6204, "mean_token_accuracy": 0.2502211079001427, "num_tokens": 73202952.0, "step": 31925 }, { "entropy": 4.971695852279663, "epoch": 3.0672430355427474, "grad_norm": 1.09375, "learning_rate": 0.00040724058712856697, "loss": 4.5412, "mean_token_accuracy": 0.250248646736145, "num_tokens": 73214899.0, "step": 31930 }, { "entropy": 5.007768440246582, "epoch": 3.0677233429394812, "grad_norm": 1.09375, "learning_rate": 0.0004072128483863948, "loss": 4.6875, "mean_token_accuracy": 0.24407064020633698, "num_tokens": 73226599.0, "step": 31935 }, { "entropy": 5.001120471954346, "epoch": 3.068203650336215, "grad_norm": 1.0703125, "learning_rate": 0.0004071851065746328, "loss": 4.5448, "mean_token_accuracy": 0.24898416101932525, "num_tokens": 73238588.0, "step": 31940 }, { "entropy": 5.078129243850708, "epoch": 3.068683957732949, "grad_norm": 1.046875, "learning_rate": 0.0004071573616939252, "loss": 4.6786, "mean_token_accuracy": 0.24310262948274614, "num_tokens": 73250727.0, "step": 31945 }, { "entropy": 5.019961357116699, "epoch": 3.069164265129683, "grad_norm": 1.0, "learning_rate": 0.0004071296137449161, "loss": 4.5707, "mean_token_accuracy": 0.2604907304048538, "num_tokens": 73261937.0, "step": 31950 }, { "entropy": 4.974229001998902, "epoch": 3.0696445725264168, "grad_norm": 0.97265625, "learning_rate": 0.00040710186272824967, "loss": 4.5917, "mean_token_accuracy": 0.24307173639535903, "num_tokens": 73274853.0, "step": 31955 }, { "entropy": 5.068817138671875, "epoch": 3.0701248799231506, "grad_norm": 1.1015625, "learning_rate": 0.0004070741086445703, "loss": 4.6664, "mean_token_accuracy": 0.24610565453767777, "num_tokens": 73284901.0, "step": 31960 }, { "entropy": 5.026659536361694, "epoch": 3.0706051873198845, "grad_norm": 1.0, "learning_rate": 0.00040704635149452223, "loss": 4.6434, "mean_token_accuracy": 0.24464693963527678, "num_tokens": 73295989.0, "step": 31965 }, { "entropy": 5.097943210601807, "epoch": 3.071085494716619, "grad_norm": 1.03125, "learning_rate": 0.00040701859127875, "loss": 4.7203, "mean_token_accuracy": 0.2406073048710823, "num_tokens": 73306661.0, "step": 31970 }, { "entropy": 5.129525518417358, "epoch": 3.0715658021133527, "grad_norm": 1.03125, "learning_rate": 0.00040699082799789814, "loss": 4.7095, "mean_token_accuracy": 0.23878285884857178, "num_tokens": 73318456.0, "step": 31975 }, { "entropy": 5.124619007110596, "epoch": 3.0720461095100866, "grad_norm": 0.99609375, "learning_rate": 0.00040696306165261117, "loss": 4.7345, "mean_token_accuracy": 0.24056761413812638, "num_tokens": 73330577.0, "step": 31980 }, { "entropy": 5.060488891601563, "epoch": 3.0725264169068205, "grad_norm": 1.109375, "learning_rate": 0.0004069352922435337, "loss": 4.6589, "mean_token_accuracy": 0.24906503558158874, "num_tokens": 73341682.0, "step": 31985 }, { "entropy": 5.021332120895385, "epoch": 3.0730067243035544, "grad_norm": 1.0078125, "learning_rate": 0.0004069075197713106, "loss": 4.6333, "mean_token_accuracy": 0.24834639877080916, "num_tokens": 73352924.0, "step": 31990 }, { "entropy": 5.041073036193848, "epoch": 3.0734870317002883, "grad_norm": 1.0703125, "learning_rate": 0.00040687974423658655, "loss": 4.6495, "mean_token_accuracy": 0.250208979845047, "num_tokens": 73364904.0, "step": 31995 }, { "entropy": 5.111618137359619, "epoch": 3.073967339097022, "grad_norm": 1.140625, "learning_rate": 0.00040685196564000644, "loss": 4.6803, "mean_token_accuracy": 0.24720986187458038, "num_tokens": 73376680.0, "step": 32000 }, { "entropy": 4.999582099914551, "epoch": 3.074447646493756, "grad_norm": 0.96875, "learning_rate": 0.00040682418398221517, "loss": 4.6003, "mean_token_accuracy": 0.2523179829120636, "num_tokens": 73388631.0, "step": 32005 }, { "entropy": 5.027138090133667, "epoch": 3.07492795389049, "grad_norm": 0.96875, "learning_rate": 0.00040679639926385783, "loss": 4.6205, "mean_token_accuracy": 0.25160788297653197, "num_tokens": 73400953.0, "step": 32010 }, { "entropy": 5.050076913833618, "epoch": 3.0754082612872238, "grad_norm": 1.0703125, "learning_rate": 0.0004067686114855794, "loss": 4.5331, "mean_token_accuracy": 0.25700239688158033, "num_tokens": 73411167.0, "step": 32015 }, { "entropy": 4.919393301010132, "epoch": 3.0758885686839577, "grad_norm": 1.0, "learning_rate": 0.00040674082064802507, "loss": 4.5571, "mean_token_accuracy": 0.2541386589407921, "num_tokens": 73422434.0, "step": 32020 }, { "entropy": 5.088863754272461, "epoch": 3.0763688760806915, "grad_norm": 1.015625, "learning_rate": 0.0004067130267518401, "loss": 4.6668, "mean_token_accuracy": 0.23891474008560182, "num_tokens": 73433543.0, "step": 32025 }, { "entropy": 5.165063953399658, "epoch": 3.0768491834774254, "grad_norm": 1.0703125, "learning_rate": 0.0004066852297976698, "loss": 4.8349, "mean_token_accuracy": 0.231815005838871, "num_tokens": 73444624.0, "step": 32030 }, { "entropy": 5.046006107330323, "epoch": 3.0773294908741593, "grad_norm": 1.0625, "learning_rate": 0.0004066574297861595, "loss": 4.6193, "mean_token_accuracy": 0.24984999746084213, "num_tokens": 73454446.0, "step": 32035 }, { "entropy": 5.038469409942627, "epoch": 3.077809798270893, "grad_norm": 0.9609375, "learning_rate": 0.00040662962671795454, "loss": 4.6141, "mean_token_accuracy": 0.23815218806266786, "num_tokens": 73465557.0, "step": 32040 }, { "entropy": 5.092407846450806, "epoch": 3.0782901056676275, "grad_norm": 1.0234375, "learning_rate": 0.0004066018205937006, "loss": 4.6328, "mean_token_accuracy": 0.24465030431747437, "num_tokens": 73477939.0, "step": 32045 }, { "entropy": 5.025490808486938, "epoch": 3.0787704130643614, "grad_norm": 1.0078125, "learning_rate": 0.0004065740114140431, "loss": 4.6916, "mean_token_accuracy": 0.2474071577191353, "num_tokens": 73490715.0, "step": 32050 }, { "entropy": 5.011799049377442, "epoch": 3.0792507204610953, "grad_norm": 0.95703125, "learning_rate": 0.00040654619917962774, "loss": 4.5714, "mean_token_accuracy": 0.250951412320137, "num_tokens": 73503147.0, "step": 32055 }, { "entropy": 5.1020965576171875, "epoch": 3.079731027857829, "grad_norm": 1.015625, "learning_rate": 0.0004065183838911003, "loss": 4.6753, "mean_token_accuracy": 0.2404956191778183, "num_tokens": 73514750.0, "step": 32060 }, { "entropy": 5.023135662078857, "epoch": 3.080211335254563, "grad_norm": 1.0234375, "learning_rate": 0.0004064905655491065, "loss": 4.6466, "mean_token_accuracy": 0.2515063464641571, "num_tokens": 73525845.0, "step": 32065 }, { "entropy": 5.136309814453125, "epoch": 3.080691642651297, "grad_norm": 1.0234375, "learning_rate": 0.00040646274415429224, "loss": 4.6948, "mean_token_accuracy": 0.23915023505687713, "num_tokens": 73537086.0, "step": 32070 }, { "entropy": 5.008394193649292, "epoch": 3.081171950048031, "grad_norm": 1.0234375, "learning_rate": 0.0004064349197073033, "loss": 4.5785, "mean_token_accuracy": 0.25241281688213346, "num_tokens": 73548708.0, "step": 32075 }, { "entropy": 5.098199844360352, "epoch": 3.0816522574447647, "grad_norm": 0.984375, "learning_rate": 0.0004064070922087859, "loss": 4.6855, "mean_token_accuracy": 0.24001459777355194, "num_tokens": 73559703.0, "step": 32080 }, { "entropy": 5.013723945617675, "epoch": 3.0821325648414986, "grad_norm": 1.15625, "learning_rate": 0.00040637926165938606, "loss": 4.5682, "mean_token_accuracy": 0.2518329590559006, "num_tokens": 73570858.0, "step": 32085 }, { "entropy": 5.0118945121765135, "epoch": 3.0826128722382324, "grad_norm": 0.9609375, "learning_rate": 0.00040635142805974986, "loss": 4.6485, "mean_token_accuracy": 0.2444664478302002, "num_tokens": 73583442.0, "step": 32090 }, { "entropy": 5.038678121566773, "epoch": 3.0830931796349663, "grad_norm": 1.015625, "learning_rate": 0.0004063235914105235, "loss": 4.6603, "mean_token_accuracy": 0.24657151848077774, "num_tokens": 73595336.0, "step": 32095 }, { "entropy": 5.009463834762573, "epoch": 3.0835734870317, "grad_norm": 1.0546875, "learning_rate": 0.00040629575171235327, "loss": 4.6539, "mean_token_accuracy": 0.2388184517621994, "num_tokens": 73606582.0, "step": 32100 }, { "entropy": 5.074433517456055, "epoch": 3.084053794428434, "grad_norm": 0.9453125, "learning_rate": 0.0004062679089658856, "loss": 4.6588, "mean_token_accuracy": 0.24748821556568146, "num_tokens": 73618432.0, "step": 32105 }, { "entropy": 5.029176378250122, "epoch": 3.084534101825168, "grad_norm": 0.984375, "learning_rate": 0.00040624006317176685, "loss": 4.5241, "mean_token_accuracy": 0.25378997027873995, "num_tokens": 73628880.0, "step": 32110 }, { "entropy": 4.98155779838562, "epoch": 3.085014409221902, "grad_norm": 1.0, "learning_rate": 0.00040621221433064354, "loss": 4.6398, "mean_token_accuracy": 0.24306266158819198, "num_tokens": 73639813.0, "step": 32115 }, { "entropy": 5.170345878601074, "epoch": 3.0854947166186357, "grad_norm": 1.015625, "learning_rate": 0.0004061843624431623, "loss": 4.8138, "mean_token_accuracy": 0.23637549877166747, "num_tokens": 73651276.0, "step": 32120 }, { "entropy": 5.044910049438476, "epoch": 3.08597502401537, "grad_norm": 0.97265625, "learning_rate": 0.00040615650750996956, "loss": 4.5826, "mean_token_accuracy": 0.2504544660449028, "num_tokens": 73663869.0, "step": 32125 }, { "entropy": 5.04099407196045, "epoch": 3.086455331412104, "grad_norm": 1.0625, "learning_rate": 0.00040612864953171223, "loss": 4.6749, "mean_token_accuracy": 0.23652782887220383, "num_tokens": 73676089.0, "step": 32130 }, { "entropy": 5.044053459167481, "epoch": 3.086935638808838, "grad_norm": 1.0703125, "learning_rate": 0.00040610078850903715, "loss": 4.6424, "mean_token_accuracy": 0.2413918137550354, "num_tokens": 73688718.0, "step": 32135 }, { "entropy": 5.012424325942993, "epoch": 3.0874159462055717, "grad_norm": 1.0546875, "learning_rate": 0.00040607292444259094, "loss": 4.5776, "mean_token_accuracy": 0.24864151626825332, "num_tokens": 73700363.0, "step": 32140 }, { "entropy": 4.92096848487854, "epoch": 3.0878962536023056, "grad_norm": 1.0859375, "learning_rate": 0.0004060450573330206, "loss": 4.4865, "mean_token_accuracy": 0.24904475957155228, "num_tokens": 73711873.0, "step": 32145 }, { "entropy": 5.008581447601318, "epoch": 3.0883765609990395, "grad_norm": 1.0859375, "learning_rate": 0.00040601718718097325, "loss": 4.5498, "mean_token_accuracy": 0.2478708654642105, "num_tokens": 73722466.0, "step": 32150 }, { "entropy": 5.022934818267823, "epoch": 3.0888568683957733, "grad_norm": 0.9921875, "learning_rate": 0.00040598931398709576, "loss": 4.6124, "mean_token_accuracy": 0.24530298858880997, "num_tokens": 73734712.0, "step": 32155 }, { "entropy": 5.189019346237183, "epoch": 3.089337175792507, "grad_norm": 1.0703125, "learning_rate": 0.00040596143775203534, "loss": 4.8077, "mean_token_accuracy": 0.2337539538741112, "num_tokens": 73746367.0, "step": 32160 }, { "entropy": 5.022787284851074, "epoch": 3.089817483189241, "grad_norm": 1.1015625, "learning_rate": 0.00040593355847643933, "loss": 4.5566, "mean_token_accuracy": 0.26082643419504165, "num_tokens": 73758223.0, "step": 32165 }, { "entropy": 5.062843608856201, "epoch": 3.090297790585975, "grad_norm": 1.125, "learning_rate": 0.0004059056761609548, "loss": 4.6652, "mean_token_accuracy": 0.2431433767080307, "num_tokens": 73769278.0, "step": 32170 }, { "entropy": 5.04819803237915, "epoch": 3.090778097982709, "grad_norm": 1.0625, "learning_rate": 0.0004058777908062292, "loss": 4.6069, "mean_token_accuracy": 0.24774401038885116, "num_tokens": 73778999.0, "step": 32175 }, { "entropy": 5.053538417816162, "epoch": 3.0912584053794427, "grad_norm": 1.09375, "learning_rate": 0.0004058499024129099, "loss": 4.6141, "mean_token_accuracy": 0.24732532650232314, "num_tokens": 73789556.0, "step": 32180 }, { "entropy": 5.054281997680664, "epoch": 3.0917387127761766, "grad_norm": 1.046875, "learning_rate": 0.00040582201098164443, "loss": 4.6434, "mean_token_accuracy": 0.25142176151275636, "num_tokens": 73800691.0, "step": 32185 }, { "entropy": 5.051549243927002, "epoch": 3.0922190201729105, "grad_norm": 1.0390625, "learning_rate": 0.00040579411651308034, "loss": 4.6921, "mean_token_accuracy": 0.24632177203893663, "num_tokens": 73811353.0, "step": 32190 }, { "entropy": 5.004362773895264, "epoch": 3.0926993275696444, "grad_norm": 0.91796875, "learning_rate": 0.00040576621900786523, "loss": 4.6235, "mean_token_accuracy": 0.24753634631633759, "num_tokens": 73822531.0, "step": 32195 }, { "entropy": 5.035271883010864, "epoch": 3.0931796349663783, "grad_norm": 1.1640625, "learning_rate": 0.0004057383184666468, "loss": 4.6885, "mean_token_accuracy": 0.2388680472970009, "num_tokens": 73833857.0, "step": 32200 }, { "entropy": 5.075099754333496, "epoch": 3.0936599423631126, "grad_norm": 1.0703125, "learning_rate": 0.00040571041489007286, "loss": 4.6281, "mean_token_accuracy": 0.25046379268169405, "num_tokens": 73845664.0, "step": 32205 }, { "entropy": 5.013817644119262, "epoch": 3.0941402497598465, "grad_norm": 0.98828125, "learning_rate": 0.00040568250827879127, "loss": 4.4909, "mean_token_accuracy": 0.2567852586507797, "num_tokens": 73856860.0, "step": 32210 }, { "entropy": 4.977983236312866, "epoch": 3.0946205571565804, "grad_norm": 1.09375, "learning_rate": 0.0004056545986334497, "loss": 4.6433, "mean_token_accuracy": 0.2497049480676651, "num_tokens": 73867532.0, "step": 32215 }, { "entropy": 5.0502697944641115, "epoch": 3.0951008645533142, "grad_norm": 1.0078125, "learning_rate": 0.0004056266859546965, "loss": 4.683, "mean_token_accuracy": 0.24047670662403106, "num_tokens": 73880458.0, "step": 32220 }, { "entropy": 5.107214832305909, "epoch": 3.095581171950048, "grad_norm": 0.96875, "learning_rate": 0.0004055987702431795, "loss": 4.6897, "mean_token_accuracy": 0.24284666925668716, "num_tokens": 73892842.0, "step": 32225 }, { "entropy": 5.085785531997681, "epoch": 3.096061479346782, "grad_norm": 1.046875, "learning_rate": 0.00040557085149954677, "loss": 4.6671, "mean_token_accuracy": 0.24765777289867402, "num_tokens": 73903491.0, "step": 32230 }, { "entropy": 4.990330410003662, "epoch": 3.096541786743516, "grad_norm": 1.046875, "learning_rate": 0.00040554292972444663, "loss": 4.6217, "mean_token_accuracy": 0.24172378480434417, "num_tokens": 73915255.0, "step": 32235 }, { "entropy": 4.974247837066651, "epoch": 3.0970220941402498, "grad_norm": 1.0390625, "learning_rate": 0.00040551500491852735, "loss": 4.5191, "mean_token_accuracy": 0.26239279806613924, "num_tokens": 73925571.0, "step": 32240 }, { "entropy": 5.119597434997559, "epoch": 3.0975024015369836, "grad_norm": 1.0390625, "learning_rate": 0.0004054870770824371, "loss": 4.6965, "mean_token_accuracy": 0.24282235503196717, "num_tokens": 73936957.0, "step": 32245 }, { "entropy": 5.1229418277740475, "epoch": 3.0979827089337175, "grad_norm": 1.0703125, "learning_rate": 0.00040545914621682445, "loss": 4.5914, "mean_token_accuracy": 0.25307161509990694, "num_tokens": 73948018.0, "step": 32250 }, { "entropy": 5.030294179916382, "epoch": 3.0984630163304514, "grad_norm": 1.03125, "learning_rate": 0.0004054312123223378, "loss": 4.6265, "mean_token_accuracy": 0.24262434989213943, "num_tokens": 73958713.0, "step": 32255 }, { "entropy": 5.038964891433716, "epoch": 3.0989433237271853, "grad_norm": 1.0390625, "learning_rate": 0.00040540327539962567, "loss": 4.6892, "mean_token_accuracy": 0.24435101002454757, "num_tokens": 73969299.0, "step": 32260 }, { "entropy": 5.195756816864014, "epoch": 3.099423631123919, "grad_norm": 1.0625, "learning_rate": 0.00040537533544933674, "loss": 4.7559, "mean_token_accuracy": 0.2307824045419693, "num_tokens": 73980149.0, "step": 32265 }, { "entropy": 5.074833631515503, "epoch": 3.099903938520653, "grad_norm": 1.0234375, "learning_rate": 0.0004053473924721197, "loss": 4.5805, "mean_token_accuracy": 0.24621020555496215, "num_tokens": 73991462.0, "step": 32270 }, { "entropy": 5.0965595722198485, "epoch": 3.100384245917387, "grad_norm": 1.078125, "learning_rate": 0.0004053194464686232, "loss": 4.6776, "mean_token_accuracy": 0.24148496985435486, "num_tokens": 74003442.0, "step": 32275 }, { "entropy": 4.947234678268432, "epoch": 3.1008645533141213, "grad_norm": 1.015625, "learning_rate": 0.0004052914974394961, "loss": 4.4884, "mean_token_accuracy": 0.2542691543698311, "num_tokens": 74014394.0, "step": 32280 }, { "entropy": 5.109122323989868, "epoch": 3.101344860710855, "grad_norm": 1.15625, "learning_rate": 0.00040526354538538735, "loss": 4.7513, "mean_token_accuracy": 0.24323177933692933, "num_tokens": 74025377.0, "step": 32285 }, { "entropy": 5.013289499282837, "epoch": 3.101825168107589, "grad_norm": 1.0625, "learning_rate": 0.0004052355903069459, "loss": 4.5951, "mean_token_accuracy": 0.2537130072712898, "num_tokens": 74037094.0, "step": 32290 }, { "entropy": 4.960607957839966, "epoch": 3.102305475504323, "grad_norm": 1.0546875, "learning_rate": 0.0004052076322048207, "loss": 4.4895, "mean_token_accuracy": 0.2608163744211197, "num_tokens": 74048043.0, "step": 32295 }, { "entropy": 4.994056320190429, "epoch": 3.1027857829010568, "grad_norm": 0.95703125, "learning_rate": 0.00040517967107966095, "loss": 4.6733, "mean_token_accuracy": 0.24547887295484544, "num_tokens": 74061026.0, "step": 32300 }, { "entropy": 5.084767150878906, "epoch": 3.1032660902977907, "grad_norm": 1.0078125, "learning_rate": 0.00040515170693211584, "loss": 4.7038, "mean_token_accuracy": 0.24629230946302413, "num_tokens": 74072930.0, "step": 32305 }, { "entropy": 5.06509747505188, "epoch": 3.1037463976945245, "grad_norm": 0.9375, "learning_rate": 0.0004051237397628345, "loss": 4.6509, "mean_token_accuracy": 0.24190901219844818, "num_tokens": 74083847.0, "step": 32310 }, { "entropy": 5.071443939208985, "epoch": 3.1042267050912584, "grad_norm": 0.96484375, "learning_rate": 0.0004050957695724663, "loss": 4.6169, "mean_token_accuracy": 0.2452313095331192, "num_tokens": 74096187.0, "step": 32315 }, { "entropy": 5.024712753295899, "epoch": 3.1047070124879923, "grad_norm": 1.078125, "learning_rate": 0.0004050677963616607, "loss": 4.6233, "mean_token_accuracy": 0.2480012759566307, "num_tokens": 74108894.0, "step": 32320 }, { "entropy": 5.007587575912476, "epoch": 3.105187319884726, "grad_norm": 1.0078125, "learning_rate": 0.00040503982013106706, "loss": 4.7044, "mean_token_accuracy": 0.24248451441526414, "num_tokens": 74119941.0, "step": 32325 }, { "entropy": 5.037981605529785, "epoch": 3.10566762728146, "grad_norm": 1.0234375, "learning_rate": 0.000405011840881335, "loss": 4.6051, "mean_token_accuracy": 0.24886149317026138, "num_tokens": 74131550.0, "step": 32330 }, { "entropy": 5.0070899486541744, "epoch": 3.106147934678194, "grad_norm": 1.0234375, "learning_rate": 0.0004049838586131139, "loss": 4.5378, "mean_token_accuracy": 0.2561880812048912, "num_tokens": 74142530.0, "step": 32335 }, { "entropy": 4.993922376632691, "epoch": 3.106628242074928, "grad_norm": 1.0234375, "learning_rate": 0.0004049558733270537, "loss": 4.6352, "mean_token_accuracy": 0.24153310656547547, "num_tokens": 74155117.0, "step": 32340 }, { "entropy": 5.09845290184021, "epoch": 3.1071085494716617, "grad_norm": 0.9765625, "learning_rate": 0.000404927885023804, "loss": 4.7432, "mean_token_accuracy": 0.23723605871200562, "num_tokens": 74165880.0, "step": 32345 }, { "entropy": 4.944010925292969, "epoch": 3.1075888568683956, "grad_norm": 1.0703125, "learning_rate": 0.00040489989370401456, "loss": 4.4294, "mean_token_accuracy": 0.2656581252813339, "num_tokens": 74175777.0, "step": 32350 }, { "entropy": 5.002707529067993, "epoch": 3.1080691642651295, "grad_norm": 1.0078125, "learning_rate": 0.0004048718993683353, "loss": 4.6105, "mean_token_accuracy": 0.24675467163324355, "num_tokens": 74187745.0, "step": 32355 }, { "entropy": 4.991137838363647, "epoch": 3.108549471661864, "grad_norm": 1.078125, "learning_rate": 0.00040484390201741627, "loss": 4.6143, "mean_token_accuracy": 0.24088763147592546, "num_tokens": 74198606.0, "step": 32360 }, { "entropy": 5.1798583507537845, "epoch": 3.1090297790585977, "grad_norm": 1.015625, "learning_rate": 0.0004048159016519073, "loss": 4.7346, "mean_token_accuracy": 0.23228639662265776, "num_tokens": 74210530.0, "step": 32365 }, { "entropy": 4.999586725234986, "epoch": 3.1095100864553316, "grad_norm": 1.046875, "learning_rate": 0.0004047878982724586, "loss": 4.5747, "mean_token_accuracy": 0.25134737193584444, "num_tokens": 74222155.0, "step": 32370 }, { "entropy": 5.147057437896729, "epoch": 3.1099903938520654, "grad_norm": 1.1171875, "learning_rate": 0.00040475989187972034, "loss": 4.7621, "mean_token_accuracy": 0.23894109278917314, "num_tokens": 74233003.0, "step": 32375 }, { "entropy": 5.058832311630249, "epoch": 3.1104707012487993, "grad_norm": 1.046875, "learning_rate": 0.00040473188247434265, "loss": 4.5372, "mean_token_accuracy": 0.25587199479341505, "num_tokens": 74244013.0, "step": 32380 }, { "entropy": 5.03604474067688, "epoch": 3.110951008645533, "grad_norm": 1.0078125, "learning_rate": 0.00040470387005697587, "loss": 4.6202, "mean_token_accuracy": 0.2468089148402214, "num_tokens": 74256010.0, "step": 32385 }, { "entropy": 5.095707035064697, "epoch": 3.111431316042267, "grad_norm": 1.0234375, "learning_rate": 0.0004046758546282704, "loss": 4.6826, "mean_token_accuracy": 0.23876849859952926, "num_tokens": 74268680.0, "step": 32390 }, { "entropy": 5.128348350524902, "epoch": 3.111911623439001, "grad_norm": 1.0390625, "learning_rate": 0.0004046478361888766, "loss": 4.7623, "mean_token_accuracy": 0.23709394335746764, "num_tokens": 74280807.0, "step": 32395 }, { "entropy": 5.055472660064697, "epoch": 3.112391930835735, "grad_norm": 1.0546875, "learning_rate": 0.000404619814739445, "loss": 4.6744, "mean_token_accuracy": 0.24537662863731385, "num_tokens": 74292076.0, "step": 32400 }, { "entropy": 5.100376605987549, "epoch": 3.1128722382324687, "grad_norm": 1.0078125, "learning_rate": 0.0004045917902806263, "loss": 4.665, "mean_token_accuracy": 0.24423255324363707, "num_tokens": 74303965.0, "step": 32405 }, { "entropy": 4.975015449523926, "epoch": 3.1133525456292026, "grad_norm": 1.125, "learning_rate": 0.000404563762813071, "loss": 4.5322, "mean_token_accuracy": 0.25500834733247757, "num_tokens": 74315774.0, "step": 32410 }, { "entropy": 4.988288164138794, "epoch": 3.1138328530259365, "grad_norm": 1.0546875, "learning_rate": 0.0004045357323374298, "loss": 4.5869, "mean_token_accuracy": 0.25064926892518996, "num_tokens": 74326624.0, "step": 32415 }, { "entropy": 5.026540088653564, "epoch": 3.1143131604226704, "grad_norm": 1.1171875, "learning_rate": 0.00040450769885435364, "loss": 4.6438, "mean_token_accuracy": 0.24512701481580734, "num_tokens": 74337595.0, "step": 32420 }, { "entropy": 5.070287609100342, "epoch": 3.1147934678194042, "grad_norm": 1.0234375, "learning_rate": 0.00040447966236449313, "loss": 4.6207, "mean_token_accuracy": 0.2501947954297066, "num_tokens": 74347695.0, "step": 32425 }, { "entropy": 4.9649329662323, "epoch": 3.115273775216138, "grad_norm": 0.9453125, "learning_rate": 0.00040445162286849935, "loss": 4.5405, "mean_token_accuracy": 0.2583227038383484, "num_tokens": 74358491.0, "step": 32430 }, { "entropy": 5.066698455810547, "epoch": 3.115754082612872, "grad_norm": 1.3046875, "learning_rate": 0.00040442358036702343, "loss": 4.7051, "mean_token_accuracy": 0.24266576766967773, "num_tokens": 74371038.0, "step": 32435 }, { "entropy": 5.048079109191894, "epoch": 3.1162343900096063, "grad_norm": 1.046875, "learning_rate": 0.0004043955348607161, "loss": 4.6636, "mean_token_accuracy": 0.24872395098209382, "num_tokens": 74381449.0, "step": 32440 }, { "entropy": 5.04279842376709, "epoch": 3.11671469740634, "grad_norm": 1.015625, "learning_rate": 0.0004043674863502288, "loss": 4.5907, "mean_token_accuracy": 0.2526773661375046, "num_tokens": 74392970.0, "step": 32445 }, { "entropy": 4.9978090763092045, "epoch": 3.117195004803074, "grad_norm": 0.96484375, "learning_rate": 0.00040433943483621253, "loss": 4.5426, "mean_token_accuracy": 0.2519802376627922, "num_tokens": 74404243.0, "step": 32450 }, { "entropy": 5.077501392364502, "epoch": 3.117675312199808, "grad_norm": 1.0546875, "learning_rate": 0.0004043113803193187, "loss": 4.7123, "mean_token_accuracy": 0.24618444442749024, "num_tokens": 74415546.0, "step": 32455 }, { "entropy": 5.010422039031982, "epoch": 3.118155619596542, "grad_norm": 0.98046875, "learning_rate": 0.00040428332280019864, "loss": 4.6038, "mean_token_accuracy": 0.25266663581132887, "num_tokens": 74427138.0, "step": 32460 }, { "entropy": 4.997100400924682, "epoch": 3.1186359269932757, "grad_norm": 1.0390625, "learning_rate": 0.0004042552622795036, "loss": 4.5335, "mean_token_accuracy": 0.24878908544778824, "num_tokens": 74438088.0, "step": 32465 }, { "entropy": 5.05629243850708, "epoch": 3.1191162343900096, "grad_norm": 1.0078125, "learning_rate": 0.0004042271987578852, "loss": 4.6924, "mean_token_accuracy": 0.24206115007400514, "num_tokens": 74450310.0, "step": 32470 }, { "entropy": 5.05663366317749, "epoch": 3.1195965417867435, "grad_norm": 1.0703125, "learning_rate": 0.00040419913223599505, "loss": 4.5353, "mean_token_accuracy": 0.2491496294736862, "num_tokens": 74461128.0, "step": 32475 }, { "entropy": 5.04512767791748, "epoch": 3.1200768491834774, "grad_norm": 1.09375, "learning_rate": 0.00040417106271448464, "loss": 4.66, "mean_token_accuracy": 0.24802774637937547, "num_tokens": 74472459.0, "step": 32480 }, { "entropy": 4.929509115219116, "epoch": 3.1205571565802113, "grad_norm": 1.015625, "learning_rate": 0.0004041429901940057, "loss": 4.5578, "mean_token_accuracy": 0.2539891391992569, "num_tokens": 74485257.0, "step": 32485 }, { "entropy": 4.945969963073731, "epoch": 3.121037463976945, "grad_norm": 1.0390625, "learning_rate": 0.00040411491467521, "loss": 4.5456, "mean_token_accuracy": 0.2536383971571922, "num_tokens": 74496277.0, "step": 32490 }, { "entropy": 5.058421993255616, "epoch": 3.121517771373679, "grad_norm": 1.140625, "learning_rate": 0.0004040868361587494, "loss": 4.6489, "mean_token_accuracy": 0.24318694770336152, "num_tokens": 74508083.0, "step": 32495 }, { "entropy": 5.097837829589844, "epoch": 3.121998078770413, "grad_norm": 1.0625, "learning_rate": 0.0004040587546452758, "loss": 4.8113, "mean_token_accuracy": 0.22928981035947799, "num_tokens": 74520403.0, "step": 32500 }, { "entropy": 5.079450845718384, "epoch": 3.122478386167147, "grad_norm": 1.0546875, "learning_rate": 0.00040403067013544116, "loss": 4.6485, "mean_token_accuracy": 0.243613338470459, "num_tokens": 74531657.0, "step": 32505 }, { "entropy": 5.081295108795166, "epoch": 3.1229586935638807, "grad_norm": 1.09375, "learning_rate": 0.00040400258262989744, "loss": 4.6174, "mean_token_accuracy": 0.25055552572011947, "num_tokens": 74541841.0, "step": 32510 }, { "entropy": 5.108765125274658, "epoch": 3.123439000960615, "grad_norm": 1.0234375, "learning_rate": 0.00040397449212929676, "loss": 4.6998, "mean_token_accuracy": 0.24419642835855485, "num_tokens": 74553312.0, "step": 32515 }, { "entropy": 5.089429330825806, "epoch": 3.123919308357349, "grad_norm": 1.0, "learning_rate": 0.0004039463986342914, "loss": 4.6735, "mean_token_accuracy": 0.24255333691835404, "num_tokens": 74564470.0, "step": 32520 }, { "entropy": 5.055105543136596, "epoch": 3.1243996157540828, "grad_norm": 1.0390625, "learning_rate": 0.00040391830214553365, "loss": 4.6643, "mean_token_accuracy": 0.2442426785826683, "num_tokens": 74575548.0, "step": 32525 }, { "entropy": 5.062867975234985, "epoch": 3.1248799231508166, "grad_norm": 1.015625, "learning_rate": 0.0004038902026636756, "loss": 4.6639, "mean_token_accuracy": 0.2471578299999237, "num_tokens": 74587470.0, "step": 32530 }, { "entropy": 5.053456354141235, "epoch": 3.1253602305475505, "grad_norm": 1.0859375, "learning_rate": 0.0004038621001893698, "loss": 4.5787, "mean_token_accuracy": 0.2550749212503433, "num_tokens": 74598743.0, "step": 32535 }, { "entropy": 5.0474076747894285, "epoch": 3.1258405379442844, "grad_norm": 1.046875, "learning_rate": 0.00040383399472326874, "loss": 4.6202, "mean_token_accuracy": 0.24675973057746886, "num_tokens": 74609057.0, "step": 32540 }, { "entropy": 5.058999300003052, "epoch": 3.1263208453410183, "grad_norm": 0.99609375, "learning_rate": 0.00040380588626602484, "loss": 4.7125, "mean_token_accuracy": 0.238733471930027, "num_tokens": 74620938.0, "step": 32545 }, { "entropy": 5.054383659362793, "epoch": 3.126801152737752, "grad_norm": 1.125, "learning_rate": 0.0004037777748182907, "loss": 4.606, "mean_token_accuracy": 0.24833089411258696, "num_tokens": 74631877.0, "step": 32550 }, { "entropy": 5.107970142364502, "epoch": 3.127281460134486, "grad_norm": 1.1640625, "learning_rate": 0.0004037496603807191, "loss": 4.68, "mean_token_accuracy": 0.24343400448560715, "num_tokens": 74641921.0, "step": 32555 }, { "entropy": 4.992760705947876, "epoch": 3.12776176753122, "grad_norm": 1.09375, "learning_rate": 0.0004037215429539626, "loss": 4.5844, "mean_token_accuracy": 0.2517207324504852, "num_tokens": 74652950.0, "step": 32560 }, { "entropy": 5.068590259552002, "epoch": 3.128242074927954, "grad_norm": 1.0390625, "learning_rate": 0.00040369342253867413, "loss": 4.6339, "mean_token_accuracy": 0.25402029752731325, "num_tokens": 74665512.0, "step": 32565 }, { "entropy": 5.074625062942505, "epoch": 3.1287223823246877, "grad_norm": 1.015625, "learning_rate": 0.0004036652991355066, "loss": 4.6357, "mean_token_accuracy": 0.25480311959981916, "num_tokens": 74675833.0, "step": 32570 }, { "entropy": 4.91657395362854, "epoch": 3.1292026897214216, "grad_norm": 0.99609375, "learning_rate": 0.0004036371727451128, "loss": 4.556, "mean_token_accuracy": 0.2552287966012955, "num_tokens": 74688356.0, "step": 32575 }, { "entropy": 4.994753122329712, "epoch": 3.1296829971181555, "grad_norm": 1.046875, "learning_rate": 0.00040360904336814586, "loss": 4.584, "mean_token_accuracy": 0.25450084954500196, "num_tokens": 74699617.0, "step": 32580 }, { "entropy": 4.967467546463013, "epoch": 3.1301633045148893, "grad_norm": 0.9609375, "learning_rate": 0.0004035809110052588, "loss": 4.5129, "mean_token_accuracy": 0.2529334306716919, "num_tokens": 74711319.0, "step": 32585 }, { "entropy": 4.942798805236817, "epoch": 3.1306436119116237, "grad_norm": 1.0, "learning_rate": 0.0004035527756571048, "loss": 4.5741, "mean_token_accuracy": 0.2528795599937439, "num_tokens": 74723378.0, "step": 32590 }, { "entropy": 5.046454286575317, "epoch": 3.1311239193083575, "grad_norm": 1.1015625, "learning_rate": 0.00040352463732433707, "loss": 4.6337, "mean_token_accuracy": 0.2438918486237526, "num_tokens": 74734011.0, "step": 32595 }, { "entropy": 5.0227948188781735, "epoch": 3.1316042267050914, "grad_norm": 1.0078125, "learning_rate": 0.00040349649600760894, "loss": 4.6183, "mean_token_accuracy": 0.25802345871925353, "num_tokens": 74745534.0, "step": 32600 }, { "entropy": 5.0454918384552006, "epoch": 3.1320845341018253, "grad_norm": 1.046875, "learning_rate": 0.0004034683517075737, "loss": 4.6479, "mean_token_accuracy": 0.25018918663263323, "num_tokens": 74757288.0, "step": 32605 }, { "entropy": 4.996994590759277, "epoch": 3.132564841498559, "grad_norm": 0.96875, "learning_rate": 0.00040344020442488476, "loss": 4.6325, "mean_token_accuracy": 0.2472263753414154, "num_tokens": 74770520.0, "step": 32610 }, { "entropy": 4.991967868804932, "epoch": 3.133045148895293, "grad_norm": 1.0234375, "learning_rate": 0.00040341205416019577, "loss": 4.6022, "mean_token_accuracy": 0.25307370722293854, "num_tokens": 74781967.0, "step": 32615 }, { "entropy": 5.0619368076324465, "epoch": 3.133525456292027, "grad_norm": 0.96875, "learning_rate": 0.0004033839009141601, "loss": 4.6281, "mean_token_accuracy": 0.24463185667991638, "num_tokens": 74793022.0, "step": 32620 }, { "entropy": 5.021758079528809, "epoch": 3.134005763688761, "grad_norm": 1.03125, "learning_rate": 0.00040335574468743145, "loss": 4.6094, "mean_token_accuracy": 0.25471110343933107, "num_tokens": 74805165.0, "step": 32625 }, { "entropy": 5.006101131439209, "epoch": 3.1344860710854947, "grad_norm": 1.0, "learning_rate": 0.0004033275854806636, "loss": 4.6742, "mean_token_accuracy": 0.2463693603873253, "num_tokens": 74816607.0, "step": 32630 }, { "entropy": 4.991056966781616, "epoch": 3.1349663784822286, "grad_norm": 1.0546875, "learning_rate": 0.0004032994232945103, "loss": 4.6592, "mean_token_accuracy": 0.24665045738220215, "num_tokens": 74827606.0, "step": 32635 }, { "entropy": 5.031443881988525, "epoch": 3.1354466858789625, "grad_norm": 1.0078125, "learning_rate": 0.0004032712581296253, "loss": 4.5666, "mean_token_accuracy": 0.2526061311364174, "num_tokens": 74837507.0, "step": 32640 }, { "entropy": 5.039136600494385, "epoch": 3.1359269932756964, "grad_norm": 1.0625, "learning_rate": 0.00040324308998666267, "loss": 4.6673, "mean_token_accuracy": 0.23908869177103043, "num_tokens": 74848164.0, "step": 32645 }, { "entropy": 5.0491025924682615, "epoch": 3.1364073006724302, "grad_norm": 1.015625, "learning_rate": 0.00040321491886627614, "loss": 4.6114, "mean_token_accuracy": 0.2472015827894211, "num_tokens": 74858010.0, "step": 32650 }, { "entropy": 5.024115800857544, "epoch": 3.136887608069164, "grad_norm": 1.0390625, "learning_rate": 0.00040318674476912006, "loss": 4.569, "mean_token_accuracy": 0.24728527516126633, "num_tokens": 74868654.0, "step": 32655 }, { "entropy": 5.099744749069214, "epoch": 3.137367915465898, "grad_norm": 1.0703125, "learning_rate": 0.0004031585676958483, "loss": 4.7081, "mean_token_accuracy": 0.24407673478126526, "num_tokens": 74879751.0, "step": 32660 }, { "entropy": 5.027320194244385, "epoch": 3.1378482228626323, "grad_norm": 1.0390625, "learning_rate": 0.00040313038764711517, "loss": 4.6401, "mean_token_accuracy": 0.24803238958120347, "num_tokens": 74890569.0, "step": 32665 }, { "entropy": 5.08983359336853, "epoch": 3.138328530259366, "grad_norm": 0.9765625, "learning_rate": 0.00040310220462357494, "loss": 4.6672, "mean_token_accuracy": 0.23627711683511735, "num_tokens": 74902078.0, "step": 32670 }, { "entropy": 5.068285226821899, "epoch": 3.1388088376561, "grad_norm": 1.03125, "learning_rate": 0.0004030740186258819, "loss": 4.6392, "mean_token_accuracy": 0.24788211435079574, "num_tokens": 74913228.0, "step": 32675 }, { "entropy": 5.011826419830323, "epoch": 3.139289145052834, "grad_norm": 0.99609375, "learning_rate": 0.0004030458296546905, "loss": 4.6343, "mean_token_accuracy": 0.2474340334534645, "num_tokens": 74925419.0, "step": 32680 }, { "entropy": 5.0667013168334964, "epoch": 3.139769452449568, "grad_norm": 0.9765625, "learning_rate": 0.00040301763771065504, "loss": 4.6582, "mean_token_accuracy": 0.24502648413181305, "num_tokens": 74936219.0, "step": 32685 }, { "entropy": 5.016709995269776, "epoch": 3.1402497598463017, "grad_norm": 1.0390625, "learning_rate": 0.0004029894427944302, "loss": 4.6411, "mean_token_accuracy": 0.25027802437543867, "num_tokens": 74948015.0, "step": 32690 }, { "entropy": 5.030672311782837, "epoch": 3.1407300672430356, "grad_norm": 1.0, "learning_rate": 0.00040296124490667065, "loss": 4.6175, "mean_token_accuracy": 0.2463405415415764, "num_tokens": 74959753.0, "step": 32695 }, { "entropy": 4.999003887176514, "epoch": 3.1412103746397695, "grad_norm": 1.046875, "learning_rate": 0.0004029330440480308, "loss": 4.5653, "mean_token_accuracy": 0.2514141842722893, "num_tokens": 74970696.0, "step": 32700 }, { "entropy": 5.0660217761993405, "epoch": 3.1416906820365034, "grad_norm": 1.03125, "learning_rate": 0.0004029048402191656, "loss": 4.6656, "mean_token_accuracy": 0.2468088760972023, "num_tokens": 74982070.0, "step": 32705 }, { "entropy": 5.025319671630859, "epoch": 3.1421709894332372, "grad_norm": 1.046875, "learning_rate": 0.0004028766334207299, "loss": 4.6414, "mean_token_accuracy": 0.25440729707479476, "num_tokens": 74992823.0, "step": 32710 }, { "entropy": 4.922139263153076, "epoch": 3.142651296829971, "grad_norm": 1.015625, "learning_rate": 0.0004028484236533784, "loss": 4.5103, "mean_token_accuracy": 0.25190082639455796, "num_tokens": 75004456.0, "step": 32715 }, { "entropy": 5.1129429817199705, "epoch": 3.143131604226705, "grad_norm": 1.03125, "learning_rate": 0.00040282021091776624, "loss": 4.7352, "mean_token_accuracy": 0.23567797243595123, "num_tokens": 75017039.0, "step": 32720 }, { "entropy": 5.042050075531006, "epoch": 3.143611911623439, "grad_norm": 1.15625, "learning_rate": 0.0004027919952145482, "loss": 4.5894, "mean_token_accuracy": 0.24799265563488007, "num_tokens": 75028155.0, "step": 32725 }, { "entropy": 5.044452762603759, "epoch": 3.1440922190201728, "grad_norm": 0.97265625, "learning_rate": 0.0004027637765443795, "loss": 4.6271, "mean_token_accuracy": 0.2433112919330597, "num_tokens": 75039529.0, "step": 32730 }, { "entropy": 5.03828911781311, "epoch": 3.1445725264169067, "grad_norm": 1.0, "learning_rate": 0.00040273555490791534, "loss": 4.6377, "mean_token_accuracy": 0.24551484733819962, "num_tokens": 75051205.0, "step": 32735 }, { "entropy": 5.053845071792603, "epoch": 3.1450528338136405, "grad_norm": 0.96875, "learning_rate": 0.0004027073303058109, "loss": 4.6596, "mean_token_accuracy": 0.25166534036397936, "num_tokens": 75063379.0, "step": 32740 }, { "entropy": 5.039496374130249, "epoch": 3.1455331412103744, "grad_norm": 0.96875, "learning_rate": 0.0004026791027387214, "loss": 4.6273, "mean_token_accuracy": 0.24387053847312928, "num_tokens": 75074034.0, "step": 32745 }, { "entropy": 5.083844327926636, "epoch": 3.1460134486071087, "grad_norm": 1.0078125, "learning_rate": 0.0004026508722073024, "loss": 4.6436, "mean_token_accuracy": 0.23956361413002014, "num_tokens": 75087004.0, "step": 32750 }, { "entropy": 5.0490403175354, "epoch": 3.1464937560038426, "grad_norm": 0.9921875, "learning_rate": 0.00040262263871220904, "loss": 4.6842, "mean_token_accuracy": 0.2463624134659767, "num_tokens": 75100270.0, "step": 32755 }, { "entropy": 5.005458498001099, "epoch": 3.1469740634005765, "grad_norm": 0.9609375, "learning_rate": 0.0004025944022540971, "loss": 4.5903, "mean_token_accuracy": 0.25398978739976885, "num_tokens": 75113116.0, "step": 32760 }, { "entropy": 5.084502840042115, "epoch": 3.1474543707973104, "grad_norm": 1.125, "learning_rate": 0.00040256616283362195, "loss": 4.7061, "mean_token_accuracy": 0.24278795272111892, "num_tokens": 75124225.0, "step": 32765 }, { "entropy": 5.027612638473511, "epoch": 3.1479346781940443, "grad_norm": 0.9921875, "learning_rate": 0.00040253792045143926, "loss": 4.5723, "mean_token_accuracy": 0.2548971638083458, "num_tokens": 75135859.0, "step": 32770 }, { "entropy": 5.14524474143982, "epoch": 3.148414985590778, "grad_norm": 0.9140625, "learning_rate": 0.0004025096751082048, "loss": 4.7552, "mean_token_accuracy": 0.24182595908641816, "num_tokens": 75148626.0, "step": 32775 }, { "entropy": 5.047990942001343, "epoch": 3.148895292987512, "grad_norm": 1.0546875, "learning_rate": 0.0004024814268045743, "loss": 4.6451, "mean_token_accuracy": 0.2424531862139702, "num_tokens": 75159539.0, "step": 32780 }, { "entropy": 5.037782907485962, "epoch": 3.149375600384246, "grad_norm": 1.0, "learning_rate": 0.00040245317554120363, "loss": 4.6381, "mean_token_accuracy": 0.24708918929100038, "num_tokens": 75172393.0, "step": 32785 }, { "entropy": 5.08271894454956, "epoch": 3.14985590778098, "grad_norm": 0.95703125, "learning_rate": 0.0004024249213187487, "loss": 4.6625, "mean_token_accuracy": 0.24983277618885041, "num_tokens": 75183777.0, "step": 32790 }, { "entropy": 4.977076625823974, "epoch": 3.1503362151777137, "grad_norm": 0.97265625, "learning_rate": 0.0004023966641378655, "loss": 4.5685, "mean_token_accuracy": 0.2511023834347725, "num_tokens": 75194976.0, "step": 32795 }, { "entropy": 4.969881248474121, "epoch": 3.1508165225744476, "grad_norm": 0.96484375, "learning_rate": 0.00040236840399920996, "loss": 4.582, "mean_token_accuracy": 0.24932862371206282, "num_tokens": 75207709.0, "step": 32800 }, { "entropy": 4.944665145874024, "epoch": 3.1512968299711814, "grad_norm": 1.0078125, "learning_rate": 0.00040234014090343833, "loss": 4.5121, "mean_token_accuracy": 0.2562754929065704, "num_tokens": 75217848.0, "step": 32805 }, { "entropy": 5.089080810546875, "epoch": 3.1517771373679153, "grad_norm": 1.046875, "learning_rate": 0.0004023118748512068, "loss": 4.6909, "mean_token_accuracy": 0.24239312559366227, "num_tokens": 75229670.0, "step": 32810 }, { "entropy": 4.988765907287598, "epoch": 3.152257444764649, "grad_norm": 1.0234375, "learning_rate": 0.0004022836058431715, "loss": 4.5864, "mean_token_accuracy": 0.25130273699760436, "num_tokens": 75241752.0, "step": 32815 }, { "entropy": 4.97612624168396, "epoch": 3.152737752161383, "grad_norm": 1.265625, "learning_rate": 0.00040225533387998883, "loss": 4.582, "mean_token_accuracy": 0.2506746083498001, "num_tokens": 75252818.0, "step": 32820 }, { "entropy": 5.016368865966797, "epoch": 3.1532180595581174, "grad_norm": 1.0625, "learning_rate": 0.0004022270589623152, "loss": 4.6216, "mean_token_accuracy": 0.24806735217571257, "num_tokens": 75264076.0, "step": 32825 }, { "entropy": 5.069680118560791, "epoch": 3.1536983669548513, "grad_norm": 1.0078125, "learning_rate": 0.000402198781090807, "loss": 4.646, "mean_token_accuracy": 0.24499400407075883, "num_tokens": 75275281.0, "step": 32830 }, { "entropy": 4.994699382781983, "epoch": 3.154178674351585, "grad_norm": 1.0078125, "learning_rate": 0.0004021705002661208, "loss": 4.6135, "mean_token_accuracy": 0.24449268132448196, "num_tokens": 75287443.0, "step": 32835 }, { "entropy": 4.988004922866821, "epoch": 3.154658981748319, "grad_norm": 1.0703125, "learning_rate": 0.0004021422164889133, "loss": 4.638, "mean_token_accuracy": 0.24222270548343658, "num_tokens": 75299256.0, "step": 32840 }, { "entropy": 5.096055030822754, "epoch": 3.155139289145053, "grad_norm": 0.96875, "learning_rate": 0.000402113929759841, "loss": 4.6486, "mean_token_accuracy": 0.24704242646694183, "num_tokens": 75310485.0, "step": 32845 }, { "entropy": 5.10506682395935, "epoch": 3.155619596541787, "grad_norm": 1.03125, "learning_rate": 0.00040208564007956075, "loss": 4.6364, "mean_token_accuracy": 0.2465073361992836, "num_tokens": 75322826.0, "step": 32850 }, { "entropy": 5.00804877281189, "epoch": 3.1560999039385207, "grad_norm": 0.9453125, "learning_rate": 0.0004020573474487293, "loss": 4.6055, "mean_token_accuracy": 0.25069845020771026, "num_tokens": 75334826.0, "step": 32855 }, { "entropy": 4.932596969604492, "epoch": 3.1565802113352546, "grad_norm": 0.94921875, "learning_rate": 0.00040202905186800347, "loss": 4.5534, "mean_token_accuracy": 0.25578352212905886, "num_tokens": 75345997.0, "step": 32860 }, { "entropy": 5.094801235198974, "epoch": 3.1570605187319885, "grad_norm": 1.109375, "learning_rate": 0.0004020007533380403, "loss": 4.6379, "mean_token_accuracy": 0.23823827058076857, "num_tokens": 75357018.0, "step": 32865 }, { "entropy": 5.011976289749145, "epoch": 3.1575408261287223, "grad_norm": 1.0078125, "learning_rate": 0.0004019724518594967, "loss": 4.6183, "mean_token_accuracy": 0.2500107690691948, "num_tokens": 75368190.0, "step": 32870 }, { "entropy": 5.001093578338623, "epoch": 3.158021133525456, "grad_norm": 0.95703125, "learning_rate": 0.0004019441474330298, "loss": 4.5785, "mean_token_accuracy": 0.25090626180171965, "num_tokens": 75380284.0, "step": 32875 }, { "entropy": 4.977503919601441, "epoch": 3.15850144092219, "grad_norm": 1.09375, "learning_rate": 0.00040191584005929684, "loss": 4.6015, "mean_token_accuracy": 0.2545105591416359, "num_tokens": 75392543.0, "step": 32880 }, { "entropy": 5.009714555740357, "epoch": 3.158981748318924, "grad_norm": 1.0234375, "learning_rate": 0.0004018875297389549, "loss": 4.6082, "mean_token_accuracy": 0.24547204971313477, "num_tokens": 75404883.0, "step": 32885 }, { "entropy": 5.064972591400147, "epoch": 3.159462055715658, "grad_norm": 1.0, "learning_rate": 0.00040185921647266126, "loss": 4.6607, "mean_token_accuracy": 0.24727501720190048, "num_tokens": 75416717.0, "step": 32890 }, { "entropy": 5.0171041011810305, "epoch": 3.1599423631123917, "grad_norm": 1.0546875, "learning_rate": 0.00040183090026107326, "loss": 4.5748, "mean_token_accuracy": 0.2567442923784256, "num_tokens": 75428103.0, "step": 32895 }, { "entropy": 4.942748641967773, "epoch": 3.160422670509126, "grad_norm": 1.109375, "learning_rate": 0.00040180258110484847, "loss": 4.5397, "mean_token_accuracy": 0.257702699303627, "num_tokens": 75438333.0, "step": 32900 }, { "entropy": 5.08033013343811, "epoch": 3.16090297790586, "grad_norm": 1.0234375, "learning_rate": 0.0004017742590046442, "loss": 4.6372, "mean_token_accuracy": 0.24954543262720108, "num_tokens": 75450023.0, "step": 32905 }, { "entropy": 5.0422680377960205, "epoch": 3.161383285302594, "grad_norm": 0.96484375, "learning_rate": 0.00040174593396111814, "loss": 4.6094, "mean_token_accuracy": 0.251060651242733, "num_tokens": 75460724.0, "step": 32910 }, { "entropy": 5.0254497051239015, "epoch": 3.1618635926993277, "grad_norm": 1.0078125, "learning_rate": 0.00040171760597492785, "loss": 4.6303, "mean_token_accuracy": 0.2505000278353691, "num_tokens": 75471752.0, "step": 32915 }, { "entropy": 4.954725503921509, "epoch": 3.1623439000960616, "grad_norm": 1.0390625, "learning_rate": 0.00040168927504673094, "loss": 4.5487, "mean_token_accuracy": 0.2478194385766983, "num_tokens": 75483748.0, "step": 32920 }, { "entropy": 5.0682861328125, "epoch": 3.1628242074927955, "grad_norm": 0.94921875, "learning_rate": 0.0004016609411771853, "loss": 4.7222, "mean_token_accuracy": 0.24132005572319032, "num_tokens": 75495497.0, "step": 32925 }, { "entropy": 5.077421712875366, "epoch": 3.1633045148895294, "grad_norm": 1.0078125, "learning_rate": 0.00040163260436694876, "loss": 4.6752, "mean_token_accuracy": 0.24498880207538604, "num_tokens": 75506852.0, "step": 32930 }, { "entropy": 5.026989841461182, "epoch": 3.1637848222862632, "grad_norm": 1.0, "learning_rate": 0.0004016042646166791, "loss": 4.6044, "mean_token_accuracy": 0.2555360347032547, "num_tokens": 75518633.0, "step": 32935 }, { "entropy": 5.018486785888672, "epoch": 3.164265129682997, "grad_norm": 0.96484375, "learning_rate": 0.0004015759219270344, "loss": 4.5854, "mean_token_accuracy": 0.2519551023840904, "num_tokens": 75529668.0, "step": 32940 }, { "entropy": 5.044800519943237, "epoch": 3.164745437079731, "grad_norm": 1.0234375, "learning_rate": 0.0004015475762986726, "loss": 4.6172, "mean_token_accuracy": 0.25533214062452314, "num_tokens": 75541603.0, "step": 32945 }, { "entropy": 5.093109893798828, "epoch": 3.165225744476465, "grad_norm": 1.046875, "learning_rate": 0.00040151922773225187, "loss": 4.6589, "mean_token_accuracy": 0.24944701492786409, "num_tokens": 75553489.0, "step": 32950 }, { "entropy": 5.018434381484985, "epoch": 3.1657060518731988, "grad_norm": 1.03125, "learning_rate": 0.0004014908762284303, "loss": 4.6143, "mean_token_accuracy": 0.24663615971803665, "num_tokens": 75564809.0, "step": 32955 }, { "entropy": 5.057509565353394, "epoch": 3.1661863592699326, "grad_norm": 1.046875, "learning_rate": 0.00040146252178786633, "loss": 4.6544, "mean_token_accuracy": 0.24733265042304992, "num_tokens": 75575886.0, "step": 32960 }, { "entropy": 5.050777339935303, "epoch": 3.1666666666666665, "grad_norm": 1.0859375, "learning_rate": 0.000401434164411218, "loss": 4.6195, "mean_token_accuracy": 0.24244523793458939, "num_tokens": 75586518.0, "step": 32965 }, { "entropy": 4.947002935409546, "epoch": 3.1671469740634004, "grad_norm": 0.95703125, "learning_rate": 0.00040140580409914385, "loss": 4.4814, "mean_token_accuracy": 0.2595750898122787, "num_tokens": 75598819.0, "step": 32970 }, { "entropy": 4.991409206390381, "epoch": 3.1676272814601343, "grad_norm": 1.03125, "learning_rate": 0.00040137744085230227, "loss": 4.6828, "mean_token_accuracy": 0.24561095386743545, "num_tokens": 75610319.0, "step": 32975 }, { "entropy": 5.220904731750489, "epoch": 3.168107588856868, "grad_norm": 1.0234375, "learning_rate": 0.0004013490746713518, "loss": 4.7541, "mean_token_accuracy": 0.2379745751619339, "num_tokens": 75622479.0, "step": 32980 }, { "entropy": 5.052739048004151, "epoch": 3.1685878962536025, "grad_norm": 0.984375, "learning_rate": 0.00040132070555695096, "loss": 4.6099, "mean_token_accuracy": 0.24859773218631745, "num_tokens": 75634114.0, "step": 32985 }, { "entropy": 5.0500153541564945, "epoch": 3.1690682036503364, "grad_norm": 0.98828125, "learning_rate": 0.00040129233350975847, "loss": 4.7329, "mean_token_accuracy": 0.2398463323712349, "num_tokens": 75645828.0, "step": 32990 }, { "entropy": 5.026799821853638, "epoch": 3.1695485110470702, "grad_norm": 0.9921875, "learning_rate": 0.00040126395853043293, "loss": 4.6365, "mean_token_accuracy": 0.24603895097970963, "num_tokens": 75657335.0, "step": 32995 }, { "entropy": 5.0823150157928465, "epoch": 3.170028818443804, "grad_norm": 0.98828125, "learning_rate": 0.0004012355806196332, "loss": 4.5871, "mean_token_accuracy": 0.2491716131567955, "num_tokens": 75668886.0, "step": 33000 }, { "epoch": 3.170028818443804, "eval_entropy": 4.875704409261264, "eval_loss": 4.792845249176025, "eval_mean_token_accuracy": 0.24698377140447578, "eval_num_tokens": 75668886.0, "eval_runtime": 26.5561, "eval_samples_per_second": 1235.688, "eval_steps_per_second": 154.466, "step": 33000 }, { "entropy": 5.103356218338012, "epoch": 3.170509125840538, "grad_norm": 1.0703125, "learning_rate": 0.00040120719977801823, "loss": 4.7241, "mean_token_accuracy": 0.23807538598775863, "num_tokens": 75679392.0, "step": 33005 }, { "entropy": 5.119944715499878, "epoch": 3.170989433237272, "grad_norm": 1.03125, "learning_rate": 0.00040117881600624676, "loss": 4.7102, "mean_token_accuracy": 0.24104578644037247, "num_tokens": 75692028.0, "step": 33010 }, { "entropy": 5.0259864807128904, "epoch": 3.1714697406340058, "grad_norm": 1.09375, "learning_rate": 0.00040115042930497787, "loss": 4.6282, "mean_token_accuracy": 0.24223802238702774, "num_tokens": 75702879.0, "step": 33015 }, { "entropy": 5.081137800216675, "epoch": 3.1719500480307397, "grad_norm": 1.0390625, "learning_rate": 0.00040112203967487066, "loss": 4.5954, "mean_token_accuracy": 0.24994781166315078, "num_tokens": 75713564.0, "step": 33020 }, { "entropy": 4.96788535118103, "epoch": 3.1724303554274735, "grad_norm": 0.95703125, "learning_rate": 0.00040109364711658416, "loss": 4.5737, "mean_token_accuracy": 0.2551387965679169, "num_tokens": 75724577.0, "step": 33025 }, { "entropy": 5.022297191619873, "epoch": 3.1729106628242074, "grad_norm": 1.046875, "learning_rate": 0.00040106525163077756, "loss": 4.6216, "mean_token_accuracy": 0.2573698371648788, "num_tokens": 75736349.0, "step": 33030 }, { "entropy": 5.048596477508545, "epoch": 3.1733909702209413, "grad_norm": 1.0234375, "learning_rate": 0.0004010368532181102, "loss": 4.647, "mean_token_accuracy": 0.24219596534967422, "num_tokens": 75747465.0, "step": 33035 }, { "entropy": 5.19978289604187, "epoch": 3.173871277617675, "grad_norm": 1.0078125, "learning_rate": 0.0004010084518792413, "loss": 4.7431, "mean_token_accuracy": 0.24264875501394273, "num_tokens": 75759014.0, "step": 33040 }, { "entropy": 4.992469692230225, "epoch": 3.174351585014409, "grad_norm": 1.03125, "learning_rate": 0.00040098004761483037, "loss": 4.5791, "mean_token_accuracy": 0.2490800842642784, "num_tokens": 75770354.0, "step": 33045 }, { "entropy": 5.090083980560303, "epoch": 3.174831892411143, "grad_norm": 1.0390625, "learning_rate": 0.0004009516404255368, "loss": 4.7128, "mean_token_accuracy": 0.2398114785552025, "num_tokens": 75781530.0, "step": 33050 }, { "entropy": 5.0209059715271, "epoch": 3.175312199807877, "grad_norm": 1.0078125, "learning_rate": 0.0004009232303120202, "loss": 4.6373, "mean_token_accuracy": 0.24811351597309111, "num_tokens": 75792751.0, "step": 33055 }, { "entropy": 5.025114583969116, "epoch": 3.175792507204611, "grad_norm": 0.96875, "learning_rate": 0.00040089481727494, "loss": 4.5463, "mean_token_accuracy": 0.25323901772499086, "num_tokens": 75804882.0, "step": 33060 }, { "entropy": 4.963725328445435, "epoch": 3.176272814601345, "grad_norm": 0.98828125, "learning_rate": 0.000400866401314956, "loss": 4.5662, "mean_token_accuracy": 0.2505721032619476, "num_tokens": 75816207.0, "step": 33065 }, { "entropy": 5.0686869621276855, "epoch": 3.176753121998079, "grad_norm": 1.0390625, "learning_rate": 0.00040083798243272797, "loss": 4.7548, "mean_token_accuracy": 0.24290491938591002, "num_tokens": 75827957.0, "step": 33070 }, { "entropy": 5.128195428848267, "epoch": 3.177233429394813, "grad_norm": 0.9453125, "learning_rate": 0.00040080956062891554, "loss": 4.6695, "mean_token_accuracy": 0.2454632982611656, "num_tokens": 75839227.0, "step": 33075 }, { "entropy": 5.098654842376709, "epoch": 3.1777137367915467, "grad_norm": 1.0390625, "learning_rate": 0.00040078113590417887, "loss": 4.6647, "mean_token_accuracy": 0.24950521141290666, "num_tokens": 75849831.0, "step": 33080 }, { "entropy": 4.982477331161499, "epoch": 3.1781940441882806, "grad_norm": 1.046875, "learning_rate": 0.00040075270825917753, "loss": 4.6105, "mean_token_accuracy": 0.25387084633111956, "num_tokens": 75860617.0, "step": 33085 }, { "entropy": 5.068709182739258, "epoch": 3.1786743515850144, "grad_norm": 1.0078125, "learning_rate": 0.0004007242776945718, "loss": 4.6763, "mean_token_accuracy": 0.24255860596895218, "num_tokens": 75872193.0, "step": 33090 }, { "entropy": 5.076613521575927, "epoch": 3.1791546589817483, "grad_norm": 0.96875, "learning_rate": 0.00040069584421102174, "loss": 4.6655, "mean_token_accuracy": 0.2528792917728424, "num_tokens": 75885536.0, "step": 33095 }, { "entropy": 5.132878303527832, "epoch": 3.179634966378482, "grad_norm": 1.140625, "learning_rate": 0.00040066740780918725, "loss": 4.6938, "mean_token_accuracy": 0.242730313539505, "num_tokens": 75897325.0, "step": 33100 }, { "entropy": 5.044506978988648, "epoch": 3.180115273775216, "grad_norm": 1.1015625, "learning_rate": 0.0004006389684897288, "loss": 4.6858, "mean_token_accuracy": 0.24642085283994675, "num_tokens": 75908353.0, "step": 33105 }, { "entropy": 5.045048189163208, "epoch": 3.18059558117195, "grad_norm": 1.015625, "learning_rate": 0.0004006105262533066, "loss": 4.6808, "mean_token_accuracy": 0.24400411993265153, "num_tokens": 75919862.0, "step": 33110 }, { "entropy": 5.002927684783936, "epoch": 3.181075888568684, "grad_norm": 1.0234375, "learning_rate": 0.0004005820811005809, "loss": 4.5726, "mean_token_accuracy": 0.25889708399772643, "num_tokens": 75930971.0, "step": 33115 }, { "entropy": 5.06383581161499, "epoch": 3.1815561959654177, "grad_norm": 1.03125, "learning_rate": 0.00040055363303221226, "loss": 4.5474, "mean_token_accuracy": 0.2644984617829323, "num_tokens": 75941124.0, "step": 33120 }, { "entropy": 5.06303448677063, "epoch": 3.1820365033621516, "grad_norm": 1.0703125, "learning_rate": 0.000400525182048861, "loss": 4.655, "mean_token_accuracy": 0.24702179729938506, "num_tokens": 75953040.0, "step": 33125 }, { "entropy": 5.041329669952392, "epoch": 3.1825168107588855, "grad_norm": 1.046875, "learning_rate": 0.00040049672815118775, "loss": 4.6944, "mean_token_accuracy": 0.23162348121404647, "num_tokens": 75965122.0, "step": 33130 }, { "entropy": 5.072277927398682, "epoch": 3.18299711815562, "grad_norm": 1.046875, "learning_rate": 0.00040046827133985316, "loss": 4.5845, "mean_token_accuracy": 0.24891586154699324, "num_tokens": 75975360.0, "step": 33135 }, { "entropy": 5.0704793453216555, "epoch": 3.1834774255523537, "grad_norm": 1.1015625, "learning_rate": 0.00040043981161551784, "loss": 4.5532, "mean_token_accuracy": 0.25152069330215454, "num_tokens": 75985592.0, "step": 33140 }, { "entropy": 4.945167827606201, "epoch": 3.1839577329490876, "grad_norm": 1.0, "learning_rate": 0.0004004113489788426, "loss": 4.6335, "mean_token_accuracy": 0.24698051065206528, "num_tokens": 75997429.0, "step": 33145 }, { "entropy": 5.114994382858276, "epoch": 3.1844380403458215, "grad_norm": 1.046875, "learning_rate": 0.00040038288343048823, "loss": 4.7781, "mean_token_accuracy": 0.2350222647190094, "num_tokens": 76008785.0, "step": 33150 }, { "entropy": 5.170015287399292, "epoch": 3.1849183477425553, "grad_norm": 0.9921875, "learning_rate": 0.00040035441497111564, "loss": 4.7362, "mean_token_accuracy": 0.24190903902053834, "num_tokens": 76020773.0, "step": 33155 }, { "entropy": 5.065093851089477, "epoch": 3.185398655139289, "grad_norm": 1.109375, "learning_rate": 0.00040032594360138576, "loss": 4.6355, "mean_token_accuracy": 0.25197552144527435, "num_tokens": 76033300.0, "step": 33160 }, { "entropy": 4.973330116271972, "epoch": 3.185878962536023, "grad_norm": 0.984375, "learning_rate": 0.0004002974693219595, "loss": 4.5369, "mean_token_accuracy": 0.24910655617713928, "num_tokens": 76043443.0, "step": 33165 }, { "entropy": 5.027342748641968, "epoch": 3.186359269932757, "grad_norm": 1.015625, "learning_rate": 0.00040026899213349814, "loss": 4.5947, "mean_token_accuracy": 0.25178042650222776, "num_tokens": 76055417.0, "step": 33170 }, { "entropy": 5.066886329650879, "epoch": 3.186839577329491, "grad_norm": 0.94921875, "learning_rate": 0.0004002405120366628, "loss": 4.6239, "mean_token_accuracy": 0.24546905755996704, "num_tokens": 76067259.0, "step": 33175 }, { "entropy": 5.068389558792115, "epoch": 3.1873198847262247, "grad_norm": 1.0234375, "learning_rate": 0.00040021202903211454, "loss": 4.6222, "mean_token_accuracy": 0.24282134771347047, "num_tokens": 76079827.0, "step": 33180 }, { "entropy": 5.074465370178222, "epoch": 3.1878001921229586, "grad_norm": 0.984375, "learning_rate": 0.0004001835431205149, "loss": 4.6209, "mean_token_accuracy": 0.2443382978439331, "num_tokens": 76092068.0, "step": 33185 }, { "entropy": 5.103303289413452, "epoch": 3.1882804995196925, "grad_norm": 1.1953125, "learning_rate": 0.00040015505430252506, "loss": 4.6673, "mean_token_accuracy": 0.24165288209915162, "num_tokens": 76102046.0, "step": 33190 }, { "entropy": 5.015643739700318, "epoch": 3.1887608069164264, "grad_norm": 0.9609375, "learning_rate": 0.00040012656257880645, "loss": 4.5915, "mean_token_accuracy": 0.2513815090060234, "num_tokens": 76113629.0, "step": 33195 }, { "entropy": 4.982402324676514, "epoch": 3.1892411143131603, "grad_norm": 1.1328125, "learning_rate": 0.00040009806795002076, "loss": 4.5417, "mean_token_accuracy": 0.25977119952440264, "num_tokens": 76124593.0, "step": 33200 }, { "entropy": 5.165917634963989, "epoch": 3.189721421709894, "grad_norm": 1.0390625, "learning_rate": 0.0004000695704168292, "loss": 4.8196, "mean_token_accuracy": 0.23088828921318055, "num_tokens": 76135177.0, "step": 33205 }, { "entropy": 5.04478006362915, "epoch": 3.1902017291066285, "grad_norm": 1.03125, "learning_rate": 0.0004000410699798937, "loss": 4.5932, "mean_token_accuracy": 0.24386686980724334, "num_tokens": 76148998.0, "step": 33210 }, { "entropy": 4.967963171005249, "epoch": 3.1906820365033624, "grad_norm": 0.94921875, "learning_rate": 0.00040001256663987585, "loss": 4.625, "mean_token_accuracy": 0.2557815477252007, "num_tokens": 76161224.0, "step": 33215 }, { "entropy": 4.964840984344482, "epoch": 3.1911623439000962, "grad_norm": 1.03125, "learning_rate": 0.00039998406039743736, "loss": 4.6413, "mean_token_accuracy": 0.24818596839904786, "num_tokens": 76172577.0, "step": 33220 }, { "entropy": 5.035871458053589, "epoch": 3.19164265129683, "grad_norm": 1.0703125, "learning_rate": 0.0003999555512532401, "loss": 4.5843, "mean_token_accuracy": 0.24981313645839692, "num_tokens": 76184135.0, "step": 33225 }, { "entropy": 5.109456205368042, "epoch": 3.192122958693564, "grad_norm": 1.0078125, "learning_rate": 0.0003999270392079461, "loss": 4.6615, "mean_token_accuracy": 0.24911601543426515, "num_tokens": 76195831.0, "step": 33230 }, { "entropy": 5.100518894195557, "epoch": 3.192603266090298, "grad_norm": 1.015625, "learning_rate": 0.0003998985242622171, "loss": 4.682, "mean_token_accuracy": 0.2433509945869446, "num_tokens": 76208914.0, "step": 33235 }, { "entropy": 5.048551940917969, "epoch": 3.1930835734870318, "grad_norm": 1.0234375, "learning_rate": 0.0003998700064167153, "loss": 4.6938, "mean_token_accuracy": 0.238228178024292, "num_tokens": 76220599.0, "step": 33240 }, { "entropy": 5.043987560272217, "epoch": 3.1935638808837656, "grad_norm": 1.046875, "learning_rate": 0.0003998414856721027, "loss": 4.6417, "mean_token_accuracy": 0.2440706819295883, "num_tokens": 76232897.0, "step": 33245 }, { "entropy": 5.098506736755371, "epoch": 3.1940441882804995, "grad_norm": 0.9765625, "learning_rate": 0.0003998129620290415, "loss": 4.675, "mean_token_accuracy": 0.24875639379024506, "num_tokens": 76245948.0, "step": 33250 }, { "entropy": 5.020074701309204, "epoch": 3.1945244956772334, "grad_norm": 1.1484375, "learning_rate": 0.00039978443548819393, "loss": 4.6321, "mean_token_accuracy": 0.2490899845957756, "num_tokens": 76257223.0, "step": 33255 }, { "entropy": 4.976610994338989, "epoch": 3.1950048030739673, "grad_norm": 0.98046875, "learning_rate": 0.0003997559060502224, "loss": 4.5476, "mean_token_accuracy": 0.25438212752342226, "num_tokens": 76267606.0, "step": 33260 }, { "entropy": 4.997032260894775, "epoch": 3.195485110470701, "grad_norm": 0.96875, "learning_rate": 0.0003997273737157891, "loss": 4.626, "mean_token_accuracy": 0.2469200849533081, "num_tokens": 76280430.0, "step": 33265 }, { "entropy": 5.1250245571136475, "epoch": 3.195965417867435, "grad_norm": 0.97265625, "learning_rate": 0.00039969883848555647, "loss": 4.6756, "mean_token_accuracy": 0.24692625999450685, "num_tokens": 76293089.0, "step": 33270 }, { "entropy": 5.040512371063232, "epoch": 3.196445725264169, "grad_norm": 0.98046875, "learning_rate": 0.0003996703003601872, "loss": 4.597, "mean_token_accuracy": 0.2488690882921219, "num_tokens": 76304936.0, "step": 33275 }, { "entropy": 4.960997009277344, "epoch": 3.196926032660903, "grad_norm": 0.9296875, "learning_rate": 0.00039964175934034375, "loss": 4.4535, "mean_token_accuracy": 0.2637205198407173, "num_tokens": 76316195.0, "step": 33280 }, { "entropy": 4.97886323928833, "epoch": 3.1974063400576367, "grad_norm": 0.921875, "learning_rate": 0.0003996132154266887, "loss": 4.5768, "mean_token_accuracy": 0.24694945216178893, "num_tokens": 76328128.0, "step": 33285 }, { "entropy": 4.969552946090698, "epoch": 3.1978866474543706, "grad_norm": 1.125, "learning_rate": 0.0003995846686198849, "loss": 4.5687, "mean_token_accuracy": 0.2539353668689728, "num_tokens": 76338596.0, "step": 33290 }, { "entropy": 5.039647817611694, "epoch": 3.198366954851105, "grad_norm": 0.95703125, "learning_rate": 0.0003995561189205949, "loss": 4.6371, "mean_token_accuracy": 0.24575907737016678, "num_tokens": 76350479.0, "step": 33295 }, { "entropy": 5.01677680015564, "epoch": 3.1988472622478388, "grad_norm": 1.015625, "learning_rate": 0.0003995275663294818, "loss": 4.6604, "mean_token_accuracy": 0.24575017541646957, "num_tokens": 76363433.0, "step": 33300 }, { "entropy": 4.960451126098633, "epoch": 3.1993275696445727, "grad_norm": 1.0390625, "learning_rate": 0.0003994990108472084, "loss": 4.5468, "mean_token_accuracy": 0.25467743426561357, "num_tokens": 76374000.0, "step": 33305 }, { "entropy": 4.960294628143311, "epoch": 3.1998078770413065, "grad_norm": 1.0390625, "learning_rate": 0.00039947045247443755, "loss": 4.554, "mean_token_accuracy": 0.24896431416273118, "num_tokens": 76384192.0, "step": 33310 }, { "entropy": 5.095963668823242, "epoch": 3.2002881844380404, "grad_norm": 1.0390625, "learning_rate": 0.00039944189121183247, "loss": 4.7225, "mean_token_accuracy": 0.23484220057725907, "num_tokens": 76396641.0, "step": 33315 }, { "entropy": 4.964787197113037, "epoch": 3.2007684918347743, "grad_norm": 0.99609375, "learning_rate": 0.00039941332706005617, "loss": 4.5098, "mean_token_accuracy": 0.25350615531206133, "num_tokens": 76407954.0, "step": 33320 }, { "entropy": 4.972380542755127, "epoch": 3.201248799231508, "grad_norm": 0.95703125, "learning_rate": 0.00039938476001977175, "loss": 4.6157, "mean_token_accuracy": 0.25204425007104875, "num_tokens": 76419024.0, "step": 33325 }, { "entropy": 5.087919998168945, "epoch": 3.201729106628242, "grad_norm": 0.9765625, "learning_rate": 0.00039935619009164264, "loss": 4.6424, "mean_token_accuracy": 0.23842392563819886, "num_tokens": 76429786.0, "step": 33330 }, { "entropy": 4.965538406372071, "epoch": 3.202209414024976, "grad_norm": 1.0234375, "learning_rate": 0.0003993276172763321, "loss": 4.5046, "mean_token_accuracy": 0.2566877916455269, "num_tokens": 76440439.0, "step": 33335 }, { "entropy": 5.017267942428589, "epoch": 3.20268972142171, "grad_norm": 1.1015625, "learning_rate": 0.00039929904157450343, "loss": 4.6522, "mean_token_accuracy": 0.2454281345009804, "num_tokens": 76451855.0, "step": 33340 }, { "entropy": 4.982196807861328, "epoch": 3.2031700288184437, "grad_norm": 0.99609375, "learning_rate": 0.00039927046298682007, "loss": 4.5607, "mean_token_accuracy": 0.2517274335026741, "num_tokens": 76463662.0, "step": 33345 }, { "entropy": 5.027151155471802, "epoch": 3.2036503362151776, "grad_norm": 1.0234375, "learning_rate": 0.0003992418815139456, "loss": 4.588, "mean_token_accuracy": 0.24779824465513228, "num_tokens": 76476273.0, "step": 33350 }, { "entropy": 5.155083417892456, "epoch": 3.2041306436119115, "grad_norm": 0.9375, "learning_rate": 0.00039921329715654355, "loss": 4.6733, "mean_token_accuracy": 0.23885925114154816, "num_tokens": 76488958.0, "step": 33355 }, { "entropy": 4.977566480636597, "epoch": 3.2046109510086453, "grad_norm": 1.296875, "learning_rate": 0.0003991847099152775, "loss": 4.5679, "mean_token_accuracy": 0.25699655115604403, "num_tokens": 76499606.0, "step": 33360 }, { "entropy": 5.127652788162232, "epoch": 3.2050912584053792, "grad_norm": 1.1015625, "learning_rate": 0.0003991561197908114, "loss": 4.7337, "mean_token_accuracy": 0.2374101310968399, "num_tokens": 76510953.0, "step": 33365 }, { "entropy": 5.055814170837403, "epoch": 3.2055715658021136, "grad_norm": 0.94140625, "learning_rate": 0.0003991275267838088, "loss": 4.5916, "mean_token_accuracy": 0.24716406613588332, "num_tokens": 76522420.0, "step": 33370 }, { "entropy": 5.016264963150024, "epoch": 3.2060518731988474, "grad_norm": 1.046875, "learning_rate": 0.00039909893089493353, "loss": 4.6248, "mean_token_accuracy": 0.2561017364263535, "num_tokens": 76533748.0, "step": 33375 }, { "entropy": 5.040944290161133, "epoch": 3.2065321805955813, "grad_norm": 1.0859375, "learning_rate": 0.00039907033212484966, "loss": 4.5872, "mean_token_accuracy": 0.248762047290802, "num_tokens": 76544831.0, "step": 33380 }, { "entropy": 5.0381245613098145, "epoch": 3.207012487992315, "grad_norm": 1.0390625, "learning_rate": 0.0003990417304742211, "loss": 4.6528, "mean_token_accuracy": 0.2455194041132927, "num_tokens": 76558048.0, "step": 33385 }, { "entropy": 5.03350043296814, "epoch": 3.207492795389049, "grad_norm": 1.0546875, "learning_rate": 0.0003990131259437119, "loss": 4.6788, "mean_token_accuracy": 0.2491478905081749, "num_tokens": 76568461.0, "step": 33390 }, { "entropy": 5.018414163589478, "epoch": 3.207973102785783, "grad_norm": 1.0546875, "learning_rate": 0.0003989845185339861, "loss": 4.6172, "mean_token_accuracy": 0.25062026232481005, "num_tokens": 76580191.0, "step": 33395 }, { "entropy": 5.06202917098999, "epoch": 3.208453410182517, "grad_norm": 1.078125, "learning_rate": 0.0003989559082457079, "loss": 4.5806, "mean_token_accuracy": 0.24616572111845017, "num_tokens": 76591846.0, "step": 33400 }, { "entropy": 5.032164669036865, "epoch": 3.2089337175792507, "grad_norm": 1.0546875, "learning_rate": 0.00039892729507954173, "loss": 4.5598, "mean_token_accuracy": 0.25316447019577026, "num_tokens": 76602301.0, "step": 33405 }, { "entropy": 4.994559240341187, "epoch": 3.2094140249759846, "grad_norm": 1.0546875, "learning_rate": 0.00039889867903615165, "loss": 4.6123, "mean_token_accuracy": 0.2511447861790657, "num_tokens": 76613299.0, "step": 33410 }, { "entropy": 5.055741643905639, "epoch": 3.2098943323727185, "grad_norm": 1.046875, "learning_rate": 0.00039887006011620217, "loss": 4.6565, "mean_token_accuracy": 0.24609391540288925, "num_tokens": 76625022.0, "step": 33415 }, { "entropy": 5.023919916152954, "epoch": 3.2103746397694524, "grad_norm": 1.0390625, "learning_rate": 0.00039884143832035775, "loss": 4.5953, "mean_token_accuracy": 0.2515264004468918, "num_tokens": 76636393.0, "step": 33420 }, { "entropy": 5.065608930587769, "epoch": 3.2108549471661862, "grad_norm": 1.0078125, "learning_rate": 0.0003988128136492828, "loss": 4.6301, "mean_token_accuracy": 0.24483353346586229, "num_tokens": 76648953.0, "step": 33425 }, { "entropy": 5.008993434906006, "epoch": 3.21133525456292, "grad_norm": 0.9765625, "learning_rate": 0.000398784186103642, "loss": 4.6321, "mean_token_accuracy": 0.24404721707105637, "num_tokens": 76660843.0, "step": 33430 }, { "entropy": 5.019329833984375, "epoch": 3.211815561959654, "grad_norm": 1.03125, "learning_rate": 0.00039875555568409996, "loss": 4.6255, "mean_token_accuracy": 0.25317404270172117, "num_tokens": 76671745.0, "step": 33435 }, { "entropy": 5.012636709213257, "epoch": 3.212295869356388, "grad_norm": 1.015625, "learning_rate": 0.0003987269223913214, "loss": 4.5873, "mean_token_accuracy": 0.24610354006290436, "num_tokens": 76684386.0, "step": 33440 }, { "entropy": 5.018772459030151, "epoch": 3.212776176753122, "grad_norm": 1.0546875, "learning_rate": 0.00039869828622597105, "loss": 4.5626, "mean_token_accuracy": 0.25148770660161973, "num_tokens": 76694670.0, "step": 33445 }, { "entropy": 4.9570159912109375, "epoch": 3.213256484149856, "grad_norm": 1.0078125, "learning_rate": 0.00039866964718871385, "loss": 4.6225, "mean_token_accuracy": 0.2571745112538338, "num_tokens": 76705567.0, "step": 33450 }, { "entropy": 4.97451639175415, "epoch": 3.21373679154659, "grad_norm": 1.0625, "learning_rate": 0.0003986410052802146, "loss": 4.5817, "mean_token_accuracy": 0.24855275601148605, "num_tokens": 76717211.0, "step": 33455 }, { "entropy": 5.097514343261719, "epoch": 3.214217098943324, "grad_norm": 1.0703125, "learning_rate": 0.00039861236050113845, "loss": 4.6861, "mean_token_accuracy": 0.24082629680633544, "num_tokens": 76728414.0, "step": 33460 }, { "entropy": 4.929339838027954, "epoch": 3.2146974063400577, "grad_norm": 1.0234375, "learning_rate": 0.0003985837128521503, "loss": 4.522, "mean_token_accuracy": 0.25955982208251954, "num_tokens": 76739790.0, "step": 33465 }, { "entropy": 5.041322517395019, "epoch": 3.2151777137367916, "grad_norm": 1.09375, "learning_rate": 0.0003985550623339153, "loss": 4.6336, "mean_token_accuracy": 0.2502042159438133, "num_tokens": 76751558.0, "step": 33470 }, { "entropy": 5.114486455917358, "epoch": 3.2156580211335255, "grad_norm": 1.03125, "learning_rate": 0.0003985264089470987, "loss": 4.7915, "mean_token_accuracy": 0.2358861654996872, "num_tokens": 76763004.0, "step": 33475 }, { "entropy": 5.061826992034912, "epoch": 3.2161383285302594, "grad_norm": 0.98828125, "learning_rate": 0.00039849775269236556, "loss": 4.6412, "mean_token_accuracy": 0.2504911407828331, "num_tokens": 76773625.0, "step": 33480 }, { "entropy": 5.07300214767456, "epoch": 3.2166186359269933, "grad_norm": 0.953125, "learning_rate": 0.00039846909357038135, "loss": 4.6786, "mean_token_accuracy": 0.2430872544646263, "num_tokens": 76786351.0, "step": 33485 }, { "entropy": 5.053680086135865, "epoch": 3.217098943323727, "grad_norm": 1.0078125, "learning_rate": 0.0003984404315818115, "loss": 4.5789, "mean_token_accuracy": 0.24969908744096755, "num_tokens": 76796997.0, "step": 33490 }, { "entropy": 5.041178035736084, "epoch": 3.217579250720461, "grad_norm": 1.1328125, "learning_rate": 0.00039841176672732127, "loss": 4.6591, "mean_token_accuracy": 0.24388981014490127, "num_tokens": 76809690.0, "step": 33495 }, { "entropy": 5.029389333724976, "epoch": 3.218059558117195, "grad_norm": 0.9921875, "learning_rate": 0.0003983830990075763, "loss": 4.5578, "mean_token_accuracy": 0.25537789016962054, "num_tokens": 76820387.0, "step": 33500 }, { "entropy": 5.070287752151489, "epoch": 3.218539865513929, "grad_norm": 1.0546875, "learning_rate": 0.00039835442842324216, "loss": 4.7261, "mean_token_accuracy": 0.24159679412841797, "num_tokens": 76833261.0, "step": 33505 }, { "entropy": 4.961561965942383, "epoch": 3.2190201729106627, "grad_norm": 1.0078125, "learning_rate": 0.00039832575497498454, "loss": 4.5496, "mean_token_accuracy": 0.24915057718753814, "num_tokens": 76844952.0, "step": 33510 }, { "entropy": 5.020523405075073, "epoch": 3.2195004803073966, "grad_norm": 0.98046875, "learning_rate": 0.00039829707866346895, "loss": 4.6026, "mean_token_accuracy": 0.2461605966091156, "num_tokens": 76856076.0, "step": 33515 }, { "entropy": 5.157642126083374, "epoch": 3.219980787704131, "grad_norm": 0.98046875, "learning_rate": 0.0003982683994893614, "loss": 4.8054, "mean_token_accuracy": 0.23462713956832887, "num_tokens": 76867510.0, "step": 33520 }, { "entropy": 5.093398809432983, "epoch": 3.2204610951008648, "grad_norm": 1.03125, "learning_rate": 0.00039823971745332764, "loss": 4.7145, "mean_token_accuracy": 0.2450822800397873, "num_tokens": 76879919.0, "step": 33525 }, { "entropy": 5.00069637298584, "epoch": 3.2209414024975986, "grad_norm": 1.0390625, "learning_rate": 0.0003982110325560336, "loss": 4.6205, "mean_token_accuracy": 0.24558212459087372, "num_tokens": 76891402.0, "step": 33530 }, { "entropy": 5.091366624832153, "epoch": 3.2214217098943325, "grad_norm": 1.1484375, "learning_rate": 0.0003981823447981453, "loss": 4.6441, "mean_token_accuracy": 0.24134110063314437, "num_tokens": 76902703.0, "step": 33535 }, { "entropy": 5.07065863609314, "epoch": 3.2219020172910664, "grad_norm": 1.0625, "learning_rate": 0.00039815365418032855, "loss": 4.5796, "mean_token_accuracy": 0.24963993430137635, "num_tokens": 76913661.0, "step": 33540 }, { "entropy": 4.983402442932129, "epoch": 3.2223823246878003, "grad_norm": 1.0703125, "learning_rate": 0.00039812496070324983, "loss": 4.5664, "mean_token_accuracy": 0.25056677460670473, "num_tokens": 76924175.0, "step": 33545 }, { "entropy": 5.057305765151978, "epoch": 3.222862632084534, "grad_norm": 0.9921875, "learning_rate": 0.000398096264367575, "loss": 4.6603, "mean_token_accuracy": 0.24586112797260284, "num_tokens": 76935158.0, "step": 33550 }, { "entropy": 4.988756513595581, "epoch": 3.223342939481268, "grad_norm": 1.0390625, "learning_rate": 0.0003980675651739705, "loss": 4.5192, "mean_token_accuracy": 0.2597853600978851, "num_tokens": 76944918.0, "step": 33555 }, { "entropy": 4.917987489700318, "epoch": 3.223823246878002, "grad_norm": 1.09375, "learning_rate": 0.00039803886312310253, "loss": 4.5161, "mean_token_accuracy": 0.26421377807855606, "num_tokens": 76955128.0, "step": 33560 }, { "entropy": 4.992546081542969, "epoch": 3.224303554274736, "grad_norm": 1.03125, "learning_rate": 0.00039801015821563755, "loss": 4.672, "mean_token_accuracy": 0.24415734857320787, "num_tokens": 76966227.0, "step": 33565 }, { "entropy": 5.0746008396148685, "epoch": 3.2247838616714697, "grad_norm": 1.0625, "learning_rate": 0.0003979814504522419, "loss": 4.5778, "mean_token_accuracy": 0.25063045173883436, "num_tokens": 76977059.0, "step": 33570 }, { "entropy": 4.96845383644104, "epoch": 3.2252641690682036, "grad_norm": 1.0703125, "learning_rate": 0.00039795273983358223, "loss": 4.5614, "mean_token_accuracy": 0.25698710083961485, "num_tokens": 76987447.0, "step": 33575 }, { "entropy": 5.023295307159424, "epoch": 3.2257444764649374, "grad_norm": 1.0390625, "learning_rate": 0.00039792402636032497, "loss": 4.6579, "mean_token_accuracy": 0.2497084230184555, "num_tokens": 76998960.0, "step": 33580 }, { "entropy": 5.049805212020874, "epoch": 3.2262247838616713, "grad_norm": 0.95703125, "learning_rate": 0.00039789531003313696, "loss": 4.6694, "mean_token_accuracy": 0.24890413135290146, "num_tokens": 77011255.0, "step": 33585 }, { "entropy": 5.02531247138977, "epoch": 3.226705091258405, "grad_norm": 1.09375, "learning_rate": 0.0003978665908526846, "loss": 4.6114, "mean_token_accuracy": 0.2471139207482338, "num_tokens": 77023061.0, "step": 33590 }, { "entropy": 5.026203250885009, "epoch": 3.227185398655139, "grad_norm": 1.0390625, "learning_rate": 0.0003978378688196349, "loss": 4.5959, "mean_token_accuracy": 0.2514078453183174, "num_tokens": 77033949.0, "step": 33595 }, { "entropy": 4.997019195556641, "epoch": 3.227665706051873, "grad_norm": 1.109375, "learning_rate": 0.0003978091439346546, "loss": 4.5736, "mean_token_accuracy": 0.2488286927342415, "num_tokens": 77044990.0, "step": 33600 }, { "entropy": 5.097410869598389, "epoch": 3.2281460134486073, "grad_norm": 1.0625, "learning_rate": 0.0003977804161984108, "loss": 4.6759, "mean_token_accuracy": 0.2426385059952736, "num_tokens": 77055099.0, "step": 33605 }, { "entropy": 5.047701454162597, "epoch": 3.228626320845341, "grad_norm": 1.0390625, "learning_rate": 0.0003977516856115702, "loss": 4.6047, "mean_token_accuracy": 0.25579265505075455, "num_tokens": 77065854.0, "step": 33610 }, { "entropy": 5.0332708835601805, "epoch": 3.229106628242075, "grad_norm": 0.98828125, "learning_rate": 0.00039772295217479993, "loss": 4.6451, "mean_token_accuracy": 0.25158894062042236, "num_tokens": 77077531.0, "step": 33615 }, { "entropy": 5.085025262832642, "epoch": 3.229586935638809, "grad_norm": 1.1015625, "learning_rate": 0.0003976942158887671, "loss": 4.6101, "mean_token_accuracy": 0.2541229441761971, "num_tokens": 77088659.0, "step": 33620 }, { "entropy": 5.060108709335327, "epoch": 3.230067243035543, "grad_norm": 1.03125, "learning_rate": 0.000397665476754139, "loss": 4.7039, "mean_token_accuracy": 0.23708791583776473, "num_tokens": 77099868.0, "step": 33625 }, { "entropy": 5.044315433502197, "epoch": 3.2305475504322767, "grad_norm": 1.015625, "learning_rate": 0.0003976367347715828, "loss": 4.6222, "mean_token_accuracy": 0.25418720692396163, "num_tokens": 77112415.0, "step": 33630 }, { "entropy": 5.066732931137085, "epoch": 3.2310278578290106, "grad_norm": 1.0078125, "learning_rate": 0.0003976079899417657, "loss": 4.7151, "mean_token_accuracy": 0.23702074140310286, "num_tokens": 77124978.0, "step": 33635 }, { "entropy": 5.072137689590454, "epoch": 3.2315081652257445, "grad_norm": 1.015625, "learning_rate": 0.0003975792422653552, "loss": 4.6697, "mean_token_accuracy": 0.24586405158042907, "num_tokens": 77135800.0, "step": 33640 }, { "entropy": 5.057637071609497, "epoch": 3.2319884726224783, "grad_norm": 1.03125, "learning_rate": 0.0003975504917430186, "loss": 4.6713, "mean_token_accuracy": 0.24309516847133636, "num_tokens": 77147269.0, "step": 33645 }, { "entropy": 5.039349699020386, "epoch": 3.2324687800192122, "grad_norm": 0.9609375, "learning_rate": 0.0003975217383754235, "loss": 4.6536, "mean_token_accuracy": 0.24544923901557922, "num_tokens": 77159522.0, "step": 33650 }, { "entropy": 5.0715264797210695, "epoch": 3.232949087415946, "grad_norm": 1.0859375, "learning_rate": 0.0003974929821632375, "loss": 4.6453, "mean_token_accuracy": 0.24852334558963776, "num_tokens": 77170884.0, "step": 33655 }, { "entropy": 4.9567262649536135, "epoch": 3.23342939481268, "grad_norm": 1.0546875, "learning_rate": 0.00039746422310712814, "loss": 4.4834, "mean_token_accuracy": 0.2606669679284096, "num_tokens": 77181554.0, "step": 33660 }, { "entropy": 5.01213231086731, "epoch": 3.233909702209414, "grad_norm": 1.09375, "learning_rate": 0.0003974354612077632, "loss": 4.6844, "mean_token_accuracy": 0.24595702141523362, "num_tokens": 77192988.0, "step": 33665 }, { "entropy": 5.052438592910766, "epoch": 3.2343900096061478, "grad_norm": 1.03125, "learning_rate": 0.0003974066964658104, "loss": 4.5873, "mean_token_accuracy": 0.255949005484581, "num_tokens": 77204107.0, "step": 33670 }, { "entropy": 5.1571298122406, "epoch": 3.2348703170028816, "grad_norm": 1.125, "learning_rate": 0.00039737792888193754, "loss": 4.7468, "mean_token_accuracy": 0.23783104568719865, "num_tokens": 77217008.0, "step": 33675 }, { "entropy": 5.050239706039429, "epoch": 3.235350624399616, "grad_norm": 1.046875, "learning_rate": 0.0003973491584568126, "loss": 4.6694, "mean_token_accuracy": 0.24460101872682571, "num_tokens": 77228651.0, "step": 33680 }, { "entropy": 4.946198034286499, "epoch": 3.23583093179635, "grad_norm": 1.015625, "learning_rate": 0.0003973203851911035, "loss": 4.5344, "mean_token_accuracy": 0.2570539727807045, "num_tokens": 77240251.0, "step": 33685 }, { "entropy": 5.1295225620269775, "epoch": 3.2363112391930837, "grad_norm": 0.984375, "learning_rate": 0.0003972916090854782, "loss": 4.6414, "mean_token_accuracy": 0.2535032883286476, "num_tokens": 77252160.0, "step": 33690 }, { "entropy": 5.016752290725708, "epoch": 3.2367915465898176, "grad_norm": 1.015625, "learning_rate": 0.00039726283014060497, "loss": 4.5714, "mean_token_accuracy": 0.2535124719142914, "num_tokens": 77263678.0, "step": 33695 }, { "entropy": 5.015379238128662, "epoch": 3.2372718539865515, "grad_norm": 0.97265625, "learning_rate": 0.0003972340483571519, "loss": 4.6308, "mean_token_accuracy": 0.2428615778684616, "num_tokens": 77275180.0, "step": 33700 }, { "entropy": 5.073586702346802, "epoch": 3.2377521613832854, "grad_norm": 1.0, "learning_rate": 0.00039720526373578704, "loss": 4.646, "mean_token_accuracy": 0.24470321238040924, "num_tokens": 77286450.0, "step": 33705 }, { "entropy": 5.053950643539428, "epoch": 3.2382324687800192, "grad_norm": 1.09375, "learning_rate": 0.00039717647627717894, "loss": 4.5862, "mean_token_accuracy": 0.2565572842955589, "num_tokens": 77297011.0, "step": 33710 }, { "entropy": 4.971718645095825, "epoch": 3.238712776176753, "grad_norm": 1.03125, "learning_rate": 0.0003971476859819958, "loss": 4.5017, "mean_token_accuracy": 0.26188462525606154, "num_tokens": 77307408.0, "step": 33715 }, { "entropy": 5.078515434265137, "epoch": 3.239193083573487, "grad_norm": 1.0859375, "learning_rate": 0.0003971188928509062, "loss": 4.655, "mean_token_accuracy": 0.2519090369343758, "num_tokens": 77316762.0, "step": 33720 }, { "entropy": 5.088728237152099, "epoch": 3.239673390970221, "grad_norm": 1.1484375, "learning_rate": 0.0003970900968845784, "loss": 4.647, "mean_token_accuracy": 0.2538820832967758, "num_tokens": 77327233.0, "step": 33725 }, { "entropy": 5.0935204982757565, "epoch": 3.2401536983669548, "grad_norm": 1.046875, "learning_rate": 0.00039706129808368115, "loss": 4.6849, "mean_token_accuracy": 0.24508253037929534, "num_tokens": 77338001.0, "step": 33730 }, { "entropy": 5.001562309265137, "epoch": 3.2406340057636887, "grad_norm": 0.97265625, "learning_rate": 0.0003970324964488829, "loss": 4.6347, "mean_token_accuracy": 0.24426246285438538, "num_tokens": 77350243.0, "step": 33735 }, { "entropy": 5.025910663604736, "epoch": 3.2411143131604225, "grad_norm": 0.95703125, "learning_rate": 0.00039700369198085255, "loss": 4.6393, "mean_token_accuracy": 0.2468763843178749, "num_tokens": 77360712.0, "step": 33740 }, { "entropy": 5.0112837791442875, "epoch": 3.2415946205571564, "grad_norm": 0.9609375, "learning_rate": 0.00039697488468025876, "loss": 4.574, "mean_token_accuracy": 0.24540430605411528, "num_tokens": 77371399.0, "step": 33745 }, { "entropy": 5.097122049331665, "epoch": 3.2420749279538903, "grad_norm": 1.078125, "learning_rate": 0.0003969460745477703, "loss": 4.6828, "mean_token_accuracy": 0.24141585379838942, "num_tokens": 77381652.0, "step": 33750 }, { "entropy": 5.0294352054595945, "epoch": 3.2425552353506246, "grad_norm": 0.9765625, "learning_rate": 0.00039691726158405606, "loss": 4.615, "mean_token_accuracy": 0.2474424034357071, "num_tokens": 77393335.0, "step": 33755 }, { "entropy": 5.017794370651245, "epoch": 3.2430355427473585, "grad_norm": 1.046875, "learning_rate": 0.00039688844578978516, "loss": 4.6233, "mean_token_accuracy": 0.2538589760661125, "num_tokens": 77404311.0, "step": 33760 }, { "entropy": 5.069286727905274, "epoch": 3.2435158501440924, "grad_norm": 1.03125, "learning_rate": 0.0003968596271656263, "loss": 4.6646, "mean_token_accuracy": 0.24574896097183227, "num_tokens": 77415886.0, "step": 33765 }, { "entropy": 5.06607232093811, "epoch": 3.2439961575408263, "grad_norm": 1.0390625, "learning_rate": 0.00039683080571224885, "loss": 4.6343, "mean_token_accuracy": 0.24921049177646637, "num_tokens": 77426300.0, "step": 33770 }, { "entropy": 5.1001753330230715, "epoch": 3.24447646493756, "grad_norm": 1.0078125, "learning_rate": 0.0003968019814303219, "loss": 4.6652, "mean_token_accuracy": 0.24348481893539428, "num_tokens": 77437803.0, "step": 33775 }, { "entropy": 5.098292875289917, "epoch": 3.244956772334294, "grad_norm": 0.98046875, "learning_rate": 0.0003967731543205145, "loss": 4.6655, "mean_token_accuracy": 0.250624942779541, "num_tokens": 77448850.0, "step": 33780 }, { "entropy": 5.036126708984375, "epoch": 3.245437079731028, "grad_norm": 1.125, "learning_rate": 0.00039674432438349607, "loss": 4.6083, "mean_token_accuracy": 0.2461878776550293, "num_tokens": 77458933.0, "step": 33785 }, { "entropy": 5.0860895156860355, "epoch": 3.245917387127762, "grad_norm": 1.0703125, "learning_rate": 0.000396715491619936, "loss": 4.6225, "mean_token_accuracy": 0.23742762207984924, "num_tokens": 77469916.0, "step": 33790 }, { "entropy": 5.081142425537109, "epoch": 3.2463976945244957, "grad_norm": 1.140625, "learning_rate": 0.0003966866560305036, "loss": 4.6019, "mean_token_accuracy": 0.2540366739034653, "num_tokens": 77480548.0, "step": 33795 }, { "entropy": 5.031909418106079, "epoch": 3.2468780019212296, "grad_norm": 1.03125, "learning_rate": 0.00039665781761586837, "loss": 4.6331, "mean_token_accuracy": 0.23783497661352157, "num_tokens": 77490664.0, "step": 33800 }, { "entropy": 5.0877001762390135, "epoch": 3.2473583093179634, "grad_norm": 1.0, "learning_rate": 0.0003966289763766999, "loss": 4.5966, "mean_token_accuracy": 0.25352431684732435, "num_tokens": 77502010.0, "step": 33805 }, { "entropy": 5.030259513854981, "epoch": 3.2478386167146973, "grad_norm": 1.0390625, "learning_rate": 0.0003966001323136678, "loss": 4.5991, "mean_token_accuracy": 0.25208001136779784, "num_tokens": 77513453.0, "step": 33810 }, { "entropy": 5.0315773487091064, "epoch": 3.248318924111431, "grad_norm": 0.95703125, "learning_rate": 0.0003965712854274416, "loss": 4.7023, "mean_token_accuracy": 0.2426896795630455, "num_tokens": 77526294.0, "step": 33815 }, { "entropy": 5.095651865005493, "epoch": 3.248799231508165, "grad_norm": 1.0, "learning_rate": 0.0003965424357186912, "loss": 4.6988, "mean_token_accuracy": 0.24423594325780867, "num_tokens": 77537753.0, "step": 33820 }, { "entropy": 5.003453779220581, "epoch": 3.249279538904899, "grad_norm": 0.98828125, "learning_rate": 0.0003965135831880864, "loss": 4.5521, "mean_token_accuracy": 0.2491741508245468, "num_tokens": 77548931.0, "step": 33825 }, { "entropy": 4.9637964248657225, "epoch": 3.249759846301633, "grad_norm": 1.03125, "learning_rate": 0.000396484727836297, "loss": 4.5891, "mean_token_accuracy": 0.2554382473230362, "num_tokens": 77560259.0, "step": 33830 }, { "entropy": 5.063522148132324, "epoch": 3.2502401536983667, "grad_norm": 1.0859375, "learning_rate": 0.0003964558696639928, "loss": 4.6414, "mean_token_accuracy": 0.2532750189304352, "num_tokens": 77571688.0, "step": 33835 }, { "entropy": 5.131698942184448, "epoch": 3.250720461095101, "grad_norm": 1.0390625, "learning_rate": 0.0003964270086718441, "loss": 4.6939, "mean_token_accuracy": 0.2505921542644501, "num_tokens": 77583754.0, "step": 33840 }, { "entropy": 5.0321290493011475, "epoch": 3.251200768491835, "grad_norm": 0.953125, "learning_rate": 0.00039639814486052083, "loss": 4.5707, "mean_token_accuracy": 0.2609803184866905, "num_tokens": 77596013.0, "step": 33845 }, { "entropy": 5.047715425491333, "epoch": 3.251681075888569, "grad_norm": 0.94140625, "learning_rate": 0.0003963692782306931, "loss": 4.6828, "mean_token_accuracy": 0.24805556684732438, "num_tokens": 77607349.0, "step": 33850 }, { "entropy": 5.053295421600342, "epoch": 3.2521613832853027, "grad_norm": 1.015625, "learning_rate": 0.0003963404087830311, "loss": 4.6281, "mean_token_accuracy": 0.2486888661980629, "num_tokens": 77618325.0, "step": 33855 }, { "entropy": 5.141686725616455, "epoch": 3.2526416906820366, "grad_norm": 1.0546875, "learning_rate": 0.0003963115365182051, "loss": 4.7248, "mean_token_accuracy": 0.2364576131105423, "num_tokens": 77629794.0, "step": 33860 }, { "entropy": 4.975665616989136, "epoch": 3.2531219980787704, "grad_norm": 1.078125, "learning_rate": 0.00039628266143688554, "loss": 4.5912, "mean_token_accuracy": 0.25148352831602094, "num_tokens": 77641800.0, "step": 33865 }, { "entropy": 4.997483825683593, "epoch": 3.2536023054755043, "grad_norm": 0.99609375, "learning_rate": 0.0003962537835397426, "loss": 4.5895, "mean_token_accuracy": 0.24789944887161255, "num_tokens": 77653533.0, "step": 33870 }, { "entropy": 5.035173130035401, "epoch": 3.254082612872238, "grad_norm": 1.0234375, "learning_rate": 0.00039622490282744684, "loss": 4.6184, "mean_token_accuracy": 0.2500454217195511, "num_tokens": 77663508.0, "step": 33875 }, { "entropy": 5.0597367763519285, "epoch": 3.254562920268972, "grad_norm": 1.0, "learning_rate": 0.0003961960193006689, "loss": 4.6473, "mean_token_accuracy": 0.246576663851738, "num_tokens": 77674956.0, "step": 33880 }, { "entropy": 5.108331680297852, "epoch": 3.255043227665706, "grad_norm": 1.0625, "learning_rate": 0.0003961671329600792, "loss": 4.6458, "mean_token_accuracy": 0.2456754356622696, "num_tokens": 77686000.0, "step": 33885 }, { "entropy": 4.950703525543213, "epoch": 3.25552353506244, "grad_norm": 0.9921875, "learning_rate": 0.0003961382438063485, "loss": 4.5561, "mean_token_accuracy": 0.24644300490617752, "num_tokens": 77698979.0, "step": 33890 }, { "entropy": 4.954560852050781, "epoch": 3.2560038424591737, "grad_norm": 1.0078125, "learning_rate": 0.0003961093518401475, "loss": 4.5033, "mean_token_accuracy": 0.25473318099975584, "num_tokens": 77710397.0, "step": 33895 }, { "entropy": 5.02457480430603, "epoch": 3.2564841498559076, "grad_norm": 1.03125, "learning_rate": 0.00039608045706214696, "loss": 4.6169, "mean_token_accuracy": 0.24143614768981933, "num_tokens": 77721665.0, "step": 33900 }, { "entropy": 5.0436742305755615, "epoch": 3.2569644572526415, "grad_norm": 1.109375, "learning_rate": 0.0003960515594730177, "loss": 4.6195, "mean_token_accuracy": 0.24937530755996704, "num_tokens": 77733328.0, "step": 33905 }, { "entropy": 5.059139728546143, "epoch": 3.2574447646493754, "grad_norm": 1.0703125, "learning_rate": 0.0003960226590734307, "loss": 4.6327, "mean_token_accuracy": 0.24265473634004592, "num_tokens": 77745812.0, "step": 33910 }, { "entropy": 4.941529989242554, "epoch": 3.2579250720461097, "grad_norm": 1.03125, "learning_rate": 0.000395993755864057, "loss": 4.4927, "mean_token_accuracy": 0.2593093618750572, "num_tokens": 77757473.0, "step": 33915 }, { "entropy": 5.053327369689941, "epoch": 3.2584053794428436, "grad_norm": 1.0078125, "learning_rate": 0.0003959648498455674, "loss": 4.6265, "mean_token_accuracy": 0.24523908346891404, "num_tokens": 77769924.0, "step": 33920 }, { "entropy": 5.04257230758667, "epoch": 3.2588856868395775, "grad_norm": 0.9609375, "learning_rate": 0.00039593594101863333, "loss": 4.6375, "mean_token_accuracy": 0.2502197057008743, "num_tokens": 77782048.0, "step": 33925 }, { "entropy": 4.931873321533203, "epoch": 3.2593659942363113, "grad_norm": 1.0234375, "learning_rate": 0.00039590702938392577, "loss": 4.4975, "mean_token_accuracy": 0.26133894473314284, "num_tokens": 77792371.0, "step": 33930 }, { "entropy": 5.0368239402771, "epoch": 3.2598463016330452, "grad_norm": 1.09375, "learning_rate": 0.00039587811494211594, "loss": 4.6242, "mean_token_accuracy": 0.2455820083618164, "num_tokens": 77804089.0, "step": 33935 }, { "entropy": 4.983446359634399, "epoch": 3.260326609029779, "grad_norm": 0.9296875, "learning_rate": 0.00039584919769387536, "loss": 4.5917, "mean_token_accuracy": 0.24377025067806243, "num_tokens": 77815962.0, "step": 33940 }, { "entropy": 5.08806529045105, "epoch": 3.260806916426513, "grad_norm": 0.98046875, "learning_rate": 0.0003958202776398751, "loss": 4.7071, "mean_token_accuracy": 0.23923599421977998, "num_tokens": 77827396.0, "step": 33945 }, { "entropy": 5.032593154907227, "epoch": 3.261287223823247, "grad_norm": 1.0703125, "learning_rate": 0.0003957913547807868, "loss": 4.6387, "mean_token_accuracy": 0.25274568498134614, "num_tokens": 77838916.0, "step": 33950 }, { "entropy": 5.155986642837524, "epoch": 3.2617675312199808, "grad_norm": 1.0234375, "learning_rate": 0.000395762429117282, "loss": 4.7851, "mean_token_accuracy": 0.24382611513137817, "num_tokens": 77851393.0, "step": 33955 }, { "entropy": 5.096858119964599, "epoch": 3.2622478386167146, "grad_norm": 1.0, "learning_rate": 0.0003957335006500321, "loss": 4.644, "mean_token_accuracy": 0.24779659509658813, "num_tokens": 77863124.0, "step": 33960 }, { "entropy": 4.9976495742797855, "epoch": 3.2627281460134485, "grad_norm": 0.94921875, "learning_rate": 0.00039570456937970883, "loss": 4.5921, "mean_token_accuracy": 0.2588670402765274, "num_tokens": 77875568.0, "step": 33965 }, { "entropy": 5.091350555419922, "epoch": 3.2632084534101824, "grad_norm": 1.0703125, "learning_rate": 0.0003956756353069839, "loss": 4.6674, "mean_token_accuracy": 0.24813520759344102, "num_tokens": 77885747.0, "step": 33970 }, { "entropy": 4.9637257099151615, "epoch": 3.2636887608069163, "grad_norm": 1.046875, "learning_rate": 0.0003956466984325291, "loss": 4.5257, "mean_token_accuracy": 0.2556587189435959, "num_tokens": 77897074.0, "step": 33975 }, { "entropy": 5.046815204620361, "epoch": 3.26416906820365, "grad_norm": 1.0234375, "learning_rate": 0.00039561775875701616, "loss": 4.6491, "mean_token_accuracy": 0.24351091235876082, "num_tokens": 77907718.0, "step": 33980 }, { "entropy": 5.042139005661011, "epoch": 3.264649375600384, "grad_norm": 0.9609375, "learning_rate": 0.000395588816281117, "loss": 4.6144, "mean_token_accuracy": 0.24225043952465058, "num_tokens": 77919768.0, "step": 33985 }, { "entropy": 5.126606607437134, "epoch": 3.2651296829971184, "grad_norm": 1.0234375, "learning_rate": 0.0003955598710055036, "loss": 4.6581, "mean_token_accuracy": 0.2500751346349716, "num_tokens": 77930927.0, "step": 33990 }, { "entropy": 5.079805898666382, "epoch": 3.2656099903938522, "grad_norm": 1.109375, "learning_rate": 0.000395530922930848, "loss": 4.6082, "mean_token_accuracy": 0.24244888722896576, "num_tokens": 77941907.0, "step": 33995 }, { "entropy": 4.993637466430664, "epoch": 3.266090297790586, "grad_norm": 1.140625, "learning_rate": 0.0003955019720578223, "loss": 4.559, "mean_token_accuracy": 0.2548254653811455, "num_tokens": 77952800.0, "step": 34000 }, { "entropy": 5.073926544189453, "epoch": 3.26657060518732, "grad_norm": 1.0390625, "learning_rate": 0.0003954730183870987, "loss": 4.6828, "mean_token_accuracy": 0.24283509850502014, "num_tokens": 77963874.0, "step": 34005 }, { "entropy": 4.987660312652588, "epoch": 3.267050912584054, "grad_norm": 0.984375, "learning_rate": 0.0003954440619193491, "loss": 4.4803, "mean_token_accuracy": 0.2583694770932198, "num_tokens": 77975282.0, "step": 34010 }, { "entropy": 4.964640045166016, "epoch": 3.2675312199807878, "grad_norm": 0.99609375, "learning_rate": 0.00039541510265524626, "loss": 4.5422, "mean_token_accuracy": 0.25089375078678133, "num_tokens": 77987511.0, "step": 34015 }, { "entropy": 5.047157144546508, "epoch": 3.2680115273775217, "grad_norm": 1.078125, "learning_rate": 0.0003953861405954622, "loss": 4.6001, "mean_token_accuracy": 0.24573374539613724, "num_tokens": 77997805.0, "step": 34020 }, { "entropy": 5.05724778175354, "epoch": 3.2684918347742555, "grad_norm": 1.0703125, "learning_rate": 0.0003953571757406694, "loss": 4.6789, "mean_token_accuracy": 0.24250999987125396, "num_tokens": 78010312.0, "step": 34025 }, { "entropy": 5.050488424301148, "epoch": 3.2689721421709894, "grad_norm": 1.0859375, "learning_rate": 0.00039532820809154044, "loss": 4.5598, "mean_token_accuracy": 0.2540115833282471, "num_tokens": 78019934.0, "step": 34030 }, { "entropy": 5.0048281192779545, "epoch": 3.2694524495677233, "grad_norm": 1.0390625, "learning_rate": 0.00039529923764874774, "loss": 4.5606, "mean_token_accuracy": 0.2609712705016136, "num_tokens": 78031263.0, "step": 34035 }, { "entropy": 4.980857086181641, "epoch": 3.269932756964457, "grad_norm": 1.03125, "learning_rate": 0.000395270264412964, "loss": 4.614, "mean_token_accuracy": 0.25289792418479917, "num_tokens": 78043829.0, "step": 34040 }, { "entropy": 5.137817287445069, "epoch": 3.270413064361191, "grad_norm": 1.0234375, "learning_rate": 0.00039524128838486184, "loss": 4.7425, "mean_token_accuracy": 0.23789486587047576, "num_tokens": 78055292.0, "step": 34045 }, { "entropy": 5.0570290088653564, "epoch": 3.270893371757925, "grad_norm": 1.046875, "learning_rate": 0.000395212309565114, "loss": 4.6443, "mean_token_accuracy": 0.24769087135791779, "num_tokens": 78066969.0, "step": 34050 }, { "entropy": 5.138012361526489, "epoch": 3.271373679154659, "grad_norm": 1.046875, "learning_rate": 0.00039518332795439326, "loss": 4.7234, "mean_token_accuracy": 0.24245439916849137, "num_tokens": 78077994.0, "step": 34055 }, { "entropy": 5.004557514190674, "epoch": 3.2718539865513927, "grad_norm": 1.0546875, "learning_rate": 0.00039515434355337254, "loss": 4.5828, "mean_token_accuracy": 0.24898525774478913, "num_tokens": 78089730.0, "step": 34060 }, { "entropy": 5.042746686935425, "epoch": 3.272334293948127, "grad_norm": 1.0, "learning_rate": 0.0003951253563627248, "loss": 4.6474, "mean_token_accuracy": 0.24422961920499803, "num_tokens": 78100125.0, "step": 34065 }, { "entropy": 5.118434238433838, "epoch": 3.2728146013448605, "grad_norm": 0.97265625, "learning_rate": 0.0003950963663831229, "loss": 4.6964, "mean_token_accuracy": 0.24061804115772248, "num_tokens": 78112170.0, "step": 34070 }, { "entropy": 5.119843292236328, "epoch": 3.273294908741595, "grad_norm": 1.0234375, "learning_rate": 0.00039506737361524007, "loss": 4.6803, "mean_token_accuracy": 0.2418634071946144, "num_tokens": 78123796.0, "step": 34075 }, { "entropy": 5.041185426712036, "epoch": 3.2737752161383287, "grad_norm": 0.98046875, "learning_rate": 0.00039503837805974926, "loss": 4.5614, "mean_token_accuracy": 0.2536353379487991, "num_tokens": 78134170.0, "step": 34080 }, { "entropy": 4.980127477645874, "epoch": 3.2742555235350626, "grad_norm": 0.9765625, "learning_rate": 0.00039500937971732376, "loss": 4.6221, "mean_token_accuracy": 0.2498932659626007, "num_tokens": 78145140.0, "step": 34085 }, { "entropy": 5.1225217342376705, "epoch": 3.2747358309317964, "grad_norm": 1.0390625, "learning_rate": 0.0003949803785886369, "loss": 4.7428, "mean_token_accuracy": 0.24274052083492278, "num_tokens": 78157627.0, "step": 34090 }, { "entropy": 5.093050003051758, "epoch": 3.2752161383285303, "grad_norm": 1.0234375, "learning_rate": 0.00039495137467436184, "loss": 4.6975, "mean_token_accuracy": 0.24486079663038254, "num_tokens": 78170343.0, "step": 34095 }, { "entropy": 5.0705037117004395, "epoch": 3.275696445725264, "grad_norm": 0.9765625, "learning_rate": 0.00039492236797517206, "loss": 4.6008, "mean_token_accuracy": 0.2509596854448318, "num_tokens": 78181393.0, "step": 34100 }, { "entropy": 5.033651685714721, "epoch": 3.276176753121998, "grad_norm": 1.015625, "learning_rate": 0.000394893358491741, "loss": 4.6341, "mean_token_accuracy": 0.24210257232189178, "num_tokens": 78193048.0, "step": 34105 }, { "entropy": 5.0034088611602785, "epoch": 3.276657060518732, "grad_norm": 1.046875, "learning_rate": 0.00039486434622474216, "loss": 4.5586, "mean_token_accuracy": 0.2518891394138336, "num_tokens": 78203738.0, "step": 34110 }, { "entropy": 5.088667583465576, "epoch": 3.277137367915466, "grad_norm": 1.0390625, "learning_rate": 0.00039483533117484916, "loss": 4.6619, "mean_token_accuracy": 0.23626221120357513, "num_tokens": 78215171.0, "step": 34115 }, { "entropy": 5.018340349197388, "epoch": 3.2776176753121997, "grad_norm": 1.078125, "learning_rate": 0.0003948063133427356, "loss": 4.5536, "mean_token_accuracy": 0.24994443356990814, "num_tokens": 78225901.0, "step": 34120 }, { "entropy": 5.001010227203369, "epoch": 3.2780979827089336, "grad_norm": 1.0234375, "learning_rate": 0.0003947772927290751, "loss": 4.5854, "mean_token_accuracy": 0.24815791249275207, "num_tokens": 78237327.0, "step": 34125 }, { "entropy": 5.011168193817139, "epoch": 3.2785782901056675, "grad_norm": 1.03125, "learning_rate": 0.0003947482693345416, "loss": 4.6219, "mean_token_accuracy": 0.2456973373889923, "num_tokens": 78248030.0, "step": 34130 }, { "entropy": 4.986574697494507, "epoch": 3.2790585975024014, "grad_norm": 1.0078125, "learning_rate": 0.00039471924315980894, "loss": 4.5148, "mean_token_accuracy": 0.25550462305545807, "num_tokens": 78259303.0, "step": 34135 }, { "entropy": 5.041993141174316, "epoch": 3.2795389048991357, "grad_norm": 1.0, "learning_rate": 0.00039469021420555087, "loss": 4.62, "mean_token_accuracy": 0.24644032418727874, "num_tokens": 78271284.0, "step": 34140 }, { "entropy": 5.01559419631958, "epoch": 3.280019212295869, "grad_norm": 1.03125, "learning_rate": 0.00039466118247244143, "loss": 4.5618, "mean_token_accuracy": 0.2596289172768593, "num_tokens": 78282718.0, "step": 34145 }, { "entropy": 4.9947576999664305, "epoch": 3.2804995196926034, "grad_norm": 1.1015625, "learning_rate": 0.0003946321479611547, "loss": 4.6023, "mean_token_accuracy": 0.2562656417489052, "num_tokens": 78294292.0, "step": 34150 }, { "entropy": 4.983988332748413, "epoch": 3.2809798270893373, "grad_norm": 1.0390625, "learning_rate": 0.00039460311067236465, "loss": 4.6255, "mean_token_accuracy": 0.24257488548755646, "num_tokens": 78306834.0, "step": 34155 }, { "entropy": 5.084269332885742, "epoch": 3.281460134486071, "grad_norm": 0.94140625, "learning_rate": 0.00039457407060674557, "loss": 4.6643, "mean_token_accuracy": 0.24376922398805617, "num_tokens": 78319866.0, "step": 34160 }, { "entropy": 5.0587766647338865, "epoch": 3.281940441882805, "grad_norm": 1.03125, "learning_rate": 0.00039454502776497163, "loss": 4.6218, "mean_token_accuracy": 0.24582590013742447, "num_tokens": 78330933.0, "step": 34165 }, { "entropy": 5.02895336151123, "epoch": 3.282420749279539, "grad_norm": 1.046875, "learning_rate": 0.00039451598214771706, "loss": 4.5921, "mean_token_accuracy": 0.25370020866394044, "num_tokens": 78342323.0, "step": 34170 }, { "entropy": 5.072585821151733, "epoch": 3.282901056676273, "grad_norm": 0.99609375, "learning_rate": 0.0003944869337556563, "loss": 4.6043, "mean_token_accuracy": 0.2609324440360069, "num_tokens": 78353090.0, "step": 34175 }, { "entropy": 5.058878993988037, "epoch": 3.2833813640730067, "grad_norm": 1.0078125, "learning_rate": 0.0003944578825894639, "loss": 4.6583, "mean_token_accuracy": 0.2444702386856079, "num_tokens": 78362679.0, "step": 34180 }, { "entropy": 5.04763503074646, "epoch": 3.2838616714697406, "grad_norm": 1.015625, "learning_rate": 0.00039442882864981397, "loss": 4.6504, "mean_token_accuracy": 0.24100523293018342, "num_tokens": 78373592.0, "step": 34185 }, { "entropy": 4.972846364974975, "epoch": 3.2843419788664745, "grad_norm": 0.93359375, "learning_rate": 0.00039439977193738134, "loss": 4.5258, "mean_token_accuracy": 0.2608575657010078, "num_tokens": 78384617.0, "step": 34190 }, { "entropy": 5.000351285934448, "epoch": 3.2848222862632084, "grad_norm": 1.0078125, "learning_rate": 0.00039437071245284055, "loss": 4.6439, "mean_token_accuracy": 0.24550879001617432, "num_tokens": 78396253.0, "step": 34195 }, { "entropy": 5.083364057540893, "epoch": 3.2853025936599423, "grad_norm": 0.9765625, "learning_rate": 0.0003943416501968663, "loss": 4.7062, "mean_token_accuracy": 0.2408829912543297, "num_tokens": 78407852.0, "step": 34200 }, { "entropy": 5.002320957183838, "epoch": 3.285782901056676, "grad_norm": 1.0390625, "learning_rate": 0.00039431258517013323, "loss": 4.5475, "mean_token_accuracy": 0.2584647685289383, "num_tokens": 78419434.0, "step": 34205 }, { "entropy": 5.072924947738647, "epoch": 3.28626320845341, "grad_norm": 1.0078125, "learning_rate": 0.0003942835173733163, "loss": 4.6388, "mean_token_accuracy": 0.24000005573034286, "num_tokens": 78431820.0, "step": 34210 }, { "entropy": 5.077604722976685, "epoch": 3.286743515850144, "grad_norm": 1.0625, "learning_rate": 0.0003942544468070903, "loss": 4.6388, "mean_token_accuracy": 0.24990027397871017, "num_tokens": 78443062.0, "step": 34215 }, { "entropy": 5.091687536239624, "epoch": 3.287223823246878, "grad_norm": 0.98046875, "learning_rate": 0.0003942253734721301, "loss": 4.6612, "mean_token_accuracy": 0.24888237565755844, "num_tokens": 78454091.0, "step": 34220 }, { "entropy": 5.073592567443848, "epoch": 3.287704130643612, "grad_norm": 0.9765625, "learning_rate": 0.00039419629736911076, "loss": 4.7042, "mean_token_accuracy": 0.24581227451562881, "num_tokens": 78466108.0, "step": 34225 }, { "entropy": 4.997543239593506, "epoch": 3.288184438040346, "grad_norm": 1.0078125, "learning_rate": 0.00039416721849870736, "loss": 4.5288, "mean_token_accuracy": 0.250569124519825, "num_tokens": 78476690.0, "step": 34230 }, { "entropy": 5.145573568344116, "epoch": 3.28866474543708, "grad_norm": 1.0703125, "learning_rate": 0.0003941381368615951, "loss": 4.7435, "mean_token_accuracy": 0.23636368811130523, "num_tokens": 78487157.0, "step": 34235 }, { "entropy": 5.09866304397583, "epoch": 3.2891450528338138, "grad_norm": 1.0859375, "learning_rate": 0.0003941090524584489, "loss": 4.6557, "mean_token_accuracy": 0.24694444388151168, "num_tokens": 78499034.0, "step": 34240 }, { "entropy": 5.00386004447937, "epoch": 3.2896253602305476, "grad_norm": 1.0234375, "learning_rate": 0.0003940799652899442, "loss": 4.5568, "mean_token_accuracy": 0.25482958406209943, "num_tokens": 78509720.0, "step": 34245 }, { "entropy": 5.045232772827148, "epoch": 3.2901056676272815, "grad_norm": 1.015625, "learning_rate": 0.0003940508753567564, "loss": 4.6288, "mean_token_accuracy": 0.24974631518125534, "num_tokens": 78522126.0, "step": 34250 }, { "entropy": 5.113291025161743, "epoch": 3.2905859750240154, "grad_norm": 0.98828125, "learning_rate": 0.00039402178265956074, "loss": 4.7041, "mean_token_accuracy": 0.2427150070667267, "num_tokens": 78533348.0, "step": 34255 }, { "entropy": 5.0764930725097654, "epoch": 3.2910662824207493, "grad_norm": 0.97265625, "learning_rate": 0.0003939926871990328, "loss": 4.6161, "mean_token_accuracy": 0.2491741433739662, "num_tokens": 78545889.0, "step": 34260 }, { "entropy": 5.077715158462524, "epoch": 3.291546589817483, "grad_norm": 1.0, "learning_rate": 0.0003939635889758478, "loss": 4.656, "mean_token_accuracy": 0.24635598659515381, "num_tokens": 78557429.0, "step": 34265 }, { "entropy": 4.983099269866943, "epoch": 3.292026897214217, "grad_norm": 1.0, "learning_rate": 0.00039393448799068164, "loss": 4.5366, "mean_token_accuracy": 0.2637444019317627, "num_tokens": 78567824.0, "step": 34270 }, { "entropy": 5.100458812713623, "epoch": 3.292507204610951, "grad_norm": 1.0546875, "learning_rate": 0.0003939053842442098, "loss": 4.6634, "mean_token_accuracy": 0.2483208805322647, "num_tokens": 78578229.0, "step": 34275 }, { "entropy": 5.04512939453125, "epoch": 3.292987512007685, "grad_norm": 0.9921875, "learning_rate": 0.00039387627773710803, "loss": 4.6441, "mean_token_accuracy": 0.2516984835267067, "num_tokens": 78591619.0, "step": 34280 }, { "entropy": 4.999605941772461, "epoch": 3.2934678194044187, "grad_norm": 1.0234375, "learning_rate": 0.00039384716847005216, "loss": 4.5984, "mean_token_accuracy": 0.251834337413311, "num_tokens": 78602576.0, "step": 34285 }, { "entropy": 4.958413600921631, "epoch": 3.2939481268011526, "grad_norm": 1.0, "learning_rate": 0.00039381805644371774, "loss": 4.6043, "mean_token_accuracy": 0.2595936581492424, "num_tokens": 78613532.0, "step": 34290 }, { "entropy": 5.088353824615479, "epoch": 3.2944284341978864, "grad_norm": 0.96875, "learning_rate": 0.000393788941658781, "loss": 4.6476, "mean_token_accuracy": 0.24660000056028367, "num_tokens": 78626120.0, "step": 34295 }, { "entropy": 5.0401242733001705, "epoch": 3.2949087415946208, "grad_norm": 1.0078125, "learning_rate": 0.00039375982411591774, "loss": 4.6076, "mean_token_accuracy": 0.24968953877687455, "num_tokens": 78636175.0, "step": 34300 }, { "entropy": 5.1204423904418945, "epoch": 3.2953890489913547, "grad_norm": 1.0546875, "learning_rate": 0.00039373070381580404, "loss": 4.6746, "mean_token_accuracy": 0.24043382555246354, "num_tokens": 78647429.0, "step": 34305 }, { "entropy": 5.073682069778442, "epoch": 3.2958693563880885, "grad_norm": 1.0078125, "learning_rate": 0.0003937015807591159, "loss": 4.6841, "mean_token_accuracy": 0.2399148389697075, "num_tokens": 78659792.0, "step": 34310 }, { "entropy": 5.013111019134522, "epoch": 3.2963496637848224, "grad_norm": 1.015625, "learning_rate": 0.00039367245494652963, "loss": 4.5591, "mean_token_accuracy": 0.2582303687930107, "num_tokens": 78670614.0, "step": 34315 }, { "entropy": 5.007979106903076, "epoch": 3.2968299711815563, "grad_norm": 1.09375, "learning_rate": 0.00039364332637872125, "loss": 4.5678, "mean_token_accuracy": 0.2522773787379265, "num_tokens": 78681475.0, "step": 34320 }, { "entropy": 5.03101658821106, "epoch": 3.29731027857829, "grad_norm": 0.99609375, "learning_rate": 0.00039361419505636714, "loss": 4.6014, "mean_token_accuracy": 0.2510263308882713, "num_tokens": 78692251.0, "step": 34325 }, { "entropy": 5.033343839645386, "epoch": 3.297790585975024, "grad_norm": 0.9609375, "learning_rate": 0.00039358506098014363, "loss": 4.623, "mean_token_accuracy": 0.24910807013511657, "num_tokens": 78704584.0, "step": 34330 }, { "entropy": 5.027398014068604, "epoch": 3.298270893371758, "grad_norm": 0.96875, "learning_rate": 0.00039355592415072716, "loss": 4.5556, "mean_token_accuracy": 0.24752137809991837, "num_tokens": 78715467.0, "step": 34335 }, { "entropy": 5.020139980316162, "epoch": 3.298751200768492, "grad_norm": 0.98046875, "learning_rate": 0.00039352678456879415, "loss": 4.5599, "mean_token_accuracy": 0.2560836523771286, "num_tokens": 78725835.0, "step": 34340 }, { "entropy": 4.993885707855225, "epoch": 3.2992315081652257, "grad_norm": 1.0078125, "learning_rate": 0.0003934976422350212, "loss": 4.5578, "mean_token_accuracy": 0.2558558017015457, "num_tokens": 78737143.0, "step": 34345 }, { "entropy": 5.193957424163818, "epoch": 3.2997118155619596, "grad_norm": 1.1328125, "learning_rate": 0.0003934684971500848, "loss": 4.7854, "mean_token_accuracy": 0.24060207307338716, "num_tokens": 78748641.0, "step": 34350 }, { "entropy": 5.041642332077027, "epoch": 3.3001921229586935, "grad_norm": 1.0625, "learning_rate": 0.00039343934931466165, "loss": 4.605, "mean_token_accuracy": 0.24833667576313018, "num_tokens": 78760200.0, "step": 34355 }, { "entropy": 4.999837636947632, "epoch": 3.3006724303554273, "grad_norm": 1.0078125, "learning_rate": 0.00039341019872942855, "loss": 4.589, "mean_token_accuracy": 0.25284974128007887, "num_tokens": 78771347.0, "step": 34360 }, { "entropy": 5.003264427185059, "epoch": 3.3011527377521612, "grad_norm": 1.015625, "learning_rate": 0.00039338104539506227, "loss": 4.5099, "mean_token_accuracy": 0.2578217089176178, "num_tokens": 78783514.0, "step": 34365 }, { "entropy": 5.04760332107544, "epoch": 3.301633045148895, "grad_norm": 1.078125, "learning_rate": 0.0003933518893122396, "loss": 4.6619, "mean_token_accuracy": 0.24556959122419358, "num_tokens": 78794621.0, "step": 34370 }, { "entropy": 5.0478212356567385, "epoch": 3.3021133525456294, "grad_norm": 0.9921875, "learning_rate": 0.0003933227304816375, "loss": 4.6196, "mean_token_accuracy": 0.2519634172320366, "num_tokens": 78806293.0, "step": 34375 }, { "entropy": 5.144480228424072, "epoch": 3.302593659942363, "grad_norm": 0.9921875, "learning_rate": 0.000393293568903933, "loss": 4.6596, "mean_token_accuracy": 0.24188109934329988, "num_tokens": 78816930.0, "step": 34380 }, { "entropy": 5.061930131912232, "epoch": 3.303073967339097, "grad_norm": 1.078125, "learning_rate": 0.0003932644045798029, "loss": 4.7013, "mean_token_accuracy": 0.23837460577487946, "num_tokens": 78828321.0, "step": 34385 }, { "entropy": 5.046088790893554, "epoch": 3.303554274735831, "grad_norm": 0.97265625, "learning_rate": 0.0003932352375099247, "loss": 4.6959, "mean_token_accuracy": 0.24086285680532454, "num_tokens": 78840967.0, "step": 34390 }, { "entropy": 5.038535118103027, "epoch": 3.304034582132565, "grad_norm": 1.03125, "learning_rate": 0.0003932060676949753, "loss": 4.6158, "mean_token_accuracy": 0.24674846231937408, "num_tokens": 78853037.0, "step": 34395 }, { "entropy": 5.139471483230591, "epoch": 3.304514889529299, "grad_norm": 1.140625, "learning_rate": 0.0003931768951356319, "loss": 4.6476, "mean_token_accuracy": 0.24751783162355423, "num_tokens": 78863441.0, "step": 34400 }, { "entropy": 4.912761688232422, "epoch": 3.3049951969260327, "grad_norm": 1.0078125, "learning_rate": 0.0003931477198325721, "loss": 4.5402, "mean_token_accuracy": 0.2573926866054535, "num_tokens": 78875770.0, "step": 34405 }, { "entropy": 5.050485706329345, "epoch": 3.3054755043227666, "grad_norm": 1.0625, "learning_rate": 0.000393118541786473, "loss": 4.6137, "mean_token_accuracy": 0.25367127656936644, "num_tokens": 78886239.0, "step": 34410 }, { "entropy": 5.072754001617431, "epoch": 3.3059558117195005, "grad_norm": 0.9765625, "learning_rate": 0.00039308936099801203, "loss": 4.6464, "mean_token_accuracy": 0.24211475551128386, "num_tokens": 78897497.0, "step": 34415 }, { "entropy": 5.091218328475952, "epoch": 3.3064361191162344, "grad_norm": 1.125, "learning_rate": 0.0003930601774678669, "loss": 4.6657, "mean_token_accuracy": 0.244375142455101, "num_tokens": 78908643.0, "step": 34420 }, { "entropy": 5.081040143966675, "epoch": 3.3069164265129682, "grad_norm": 1.0546875, "learning_rate": 0.00039303099119671487, "loss": 4.7168, "mean_token_accuracy": 0.24277334958314895, "num_tokens": 78921240.0, "step": 34425 }, { "entropy": 5.032566070556641, "epoch": 3.307396733909702, "grad_norm": 1.015625, "learning_rate": 0.00039300180218523374, "loss": 4.6431, "mean_token_accuracy": 0.255554161965847, "num_tokens": 78933803.0, "step": 34430 }, { "entropy": 5.148969936370849, "epoch": 3.307877041306436, "grad_norm": 1.109375, "learning_rate": 0.0003929726104341013, "loss": 4.7358, "mean_token_accuracy": 0.2412612333893776, "num_tokens": 78944876.0, "step": 34435 }, { "entropy": 5.021073818206787, "epoch": 3.30835734870317, "grad_norm": 1.046875, "learning_rate": 0.00039294341594399494, "loss": 4.5694, "mean_token_accuracy": 0.25430659353733065, "num_tokens": 78957044.0, "step": 34440 }, { "entropy": 4.975513029098511, "epoch": 3.3088376560999038, "grad_norm": 1.0390625, "learning_rate": 0.00039291421871559274, "loss": 4.5802, "mean_token_accuracy": 0.25888924300670624, "num_tokens": 78968654.0, "step": 34445 }, { "entropy": 5.046443319320678, "epoch": 3.309317963496638, "grad_norm": 1.0390625, "learning_rate": 0.00039288501874957263, "loss": 4.6547, "mean_token_accuracy": 0.2466696321964264, "num_tokens": 78980201.0, "step": 34450 }, { "entropy": 5.0870613098144535, "epoch": 3.3097982708933715, "grad_norm": 1.0625, "learning_rate": 0.0003928558160466123, "loss": 4.663, "mean_token_accuracy": 0.2413409322500229, "num_tokens": 78991172.0, "step": 34455 }, { "entropy": 5.024161911010742, "epoch": 3.310278578290106, "grad_norm": 0.96484375, "learning_rate": 0.0003928266106073899, "loss": 4.6257, "mean_token_accuracy": 0.2498500108718872, "num_tokens": 79001944.0, "step": 34460 }, { "entropy": 5.031950569152832, "epoch": 3.3107588856868397, "grad_norm": 0.93359375, "learning_rate": 0.00039279740243258353, "loss": 4.6391, "mean_token_accuracy": 0.2514120519161224, "num_tokens": 79014715.0, "step": 34465 }, { "entropy": 5.045935392379761, "epoch": 3.3112391930835736, "grad_norm": 0.98046875, "learning_rate": 0.0003927681915228712, "loss": 4.5737, "mean_token_accuracy": 0.2540336072444916, "num_tokens": 79025499.0, "step": 34470 }, { "entropy": 5.014480066299439, "epoch": 3.3117195004803075, "grad_norm": 0.93359375, "learning_rate": 0.0003927389778789312, "loss": 4.6461, "mean_token_accuracy": 0.2476935252547264, "num_tokens": 79037400.0, "step": 34475 }, { "entropy": 5.067327260971069, "epoch": 3.3121998078770414, "grad_norm": 1.078125, "learning_rate": 0.0003927097615014418, "loss": 4.6303, "mean_token_accuracy": 0.24671047925949097, "num_tokens": 79048450.0, "step": 34480 }, { "entropy": 5.0652423858642575, "epoch": 3.3126801152737753, "grad_norm": 1.0078125, "learning_rate": 0.0003926805423910811, "loss": 4.6727, "mean_token_accuracy": 0.2537225067615509, "num_tokens": 79061579.0, "step": 34485 }, { "entropy": 5.02296199798584, "epoch": 3.313160422670509, "grad_norm": 1.0, "learning_rate": 0.0003926513205485278, "loss": 4.5953, "mean_token_accuracy": 0.24416229724884034, "num_tokens": 79073106.0, "step": 34490 }, { "entropy": 5.067980146408081, "epoch": 3.313640730067243, "grad_norm": 1.03125, "learning_rate": 0.0003926220959744602, "loss": 4.6871, "mean_token_accuracy": 0.24308695495128632, "num_tokens": 79085528.0, "step": 34495 }, { "entropy": 5.017328786849975, "epoch": 3.314121037463977, "grad_norm": 1.0625, "learning_rate": 0.0003925928686695567, "loss": 4.5148, "mean_token_accuracy": 0.2620679005980492, "num_tokens": 79096429.0, "step": 34500 }, { "entropy": 5.082864093780517, "epoch": 3.314601344860711, "grad_norm": 1.09375, "learning_rate": 0.000392563638634496, "loss": 4.6977, "mean_token_accuracy": 0.2402432456612587, "num_tokens": 79107688.0, "step": 34505 }, { "entropy": 5.049029350280762, "epoch": 3.3150816522574447, "grad_norm": 1.015625, "learning_rate": 0.0003925344058699567, "loss": 4.5991, "mean_token_accuracy": 0.2516641363501549, "num_tokens": 79119549.0, "step": 34510 }, { "entropy": 4.996371936798096, "epoch": 3.3155619596541785, "grad_norm": 0.95703125, "learning_rate": 0.0003925051703766175, "loss": 4.5842, "mean_token_accuracy": 0.25428757518529893, "num_tokens": 79131193.0, "step": 34515 }, { "entropy": 5.008741044998169, "epoch": 3.3160422670509124, "grad_norm": 1.015625, "learning_rate": 0.0003924759321551572, "loss": 4.6671, "mean_token_accuracy": 0.238971708714962, "num_tokens": 79142117.0, "step": 34520 }, { "entropy": 5.074768209457398, "epoch": 3.3165225744476463, "grad_norm": 1.0078125, "learning_rate": 0.0003924466912062546, "loss": 4.6178, "mean_token_accuracy": 0.24548172652721406, "num_tokens": 79154734.0, "step": 34525 }, { "entropy": 5.067018842697143, "epoch": 3.31700288184438, "grad_norm": 1.0546875, "learning_rate": 0.0003924174475305885, "loss": 4.6499, "mean_token_accuracy": 0.24706732481718063, "num_tokens": 79166077.0, "step": 34530 }, { "entropy": 5.053887891769409, "epoch": 3.3174831892411145, "grad_norm": 1.0390625, "learning_rate": 0.00039238820112883795, "loss": 4.6358, "mean_token_accuracy": 0.2421278402209282, "num_tokens": 79177544.0, "step": 34535 }, { "entropy": 5.022403430938721, "epoch": 3.3179634966378484, "grad_norm": 1.0078125, "learning_rate": 0.0003923589520016819, "loss": 4.6493, "mean_token_accuracy": 0.2522857293486595, "num_tokens": 79189606.0, "step": 34540 }, { "entropy": 4.980406093597412, "epoch": 3.3184438040345823, "grad_norm": 1.015625, "learning_rate": 0.00039232970014979965, "loss": 4.5205, "mean_token_accuracy": 0.257637657225132, "num_tokens": 79201385.0, "step": 34545 }, { "entropy": 5.0712072372436525, "epoch": 3.318924111431316, "grad_norm": 1.0859375, "learning_rate": 0.0003923004455738699, "loss": 4.688, "mean_token_accuracy": 0.23973239362239837, "num_tokens": 79213481.0, "step": 34550 }, { "entropy": 5.036832189559936, "epoch": 3.31940441882805, "grad_norm": 0.9375, "learning_rate": 0.00039227118827457234, "loss": 4.5735, "mean_token_accuracy": 0.2491331622004509, "num_tokens": 79226369.0, "step": 34555 }, { "entropy": 5.086540699005127, "epoch": 3.319884726224784, "grad_norm": 1.0703125, "learning_rate": 0.00039224192825258584, "loss": 4.683, "mean_token_accuracy": 0.24358577728271485, "num_tokens": 79238236.0, "step": 34560 }, { "entropy": 5.090976619720459, "epoch": 3.320365033621518, "grad_norm": 1.0546875, "learning_rate": 0.00039221266550859004, "loss": 4.6357, "mean_token_accuracy": 0.24827074408531188, "num_tokens": 79248536.0, "step": 34565 }, { "entropy": 5.061130809783935, "epoch": 3.3208453410182517, "grad_norm": 1.0390625, "learning_rate": 0.00039218340004326414, "loss": 4.6051, "mean_token_accuracy": 0.2513726234436035, "num_tokens": 79259273.0, "step": 34570 }, { "entropy": 4.9110987186431885, "epoch": 3.3213256484149856, "grad_norm": 1.015625, "learning_rate": 0.0003921541318572876, "loss": 4.5244, "mean_token_accuracy": 0.255599670112133, "num_tokens": 79271986.0, "step": 34575 }, { "entropy": 5.030234861373901, "epoch": 3.3218059558117194, "grad_norm": 0.9609375, "learning_rate": 0.00039212486095134005, "loss": 4.6012, "mean_token_accuracy": 0.25354058742523194, "num_tokens": 79282602.0, "step": 34580 }, { "entropy": 5.048261070251465, "epoch": 3.3222862632084533, "grad_norm": 0.94140625, "learning_rate": 0.0003920955873261011, "loss": 4.6313, "mean_token_accuracy": 0.2526991873979568, "num_tokens": 79295034.0, "step": 34585 }, { "entropy": 4.9824480533599855, "epoch": 3.322766570605187, "grad_norm": 0.92578125, "learning_rate": 0.0003920663109822502, "loss": 4.6067, "mean_token_accuracy": 0.24954349547624588, "num_tokens": 79307434.0, "step": 34590 }, { "entropy": 5.1025842189788815, "epoch": 3.323246878001921, "grad_norm": 0.96875, "learning_rate": 0.00039203703192046717, "loss": 4.6699, "mean_token_accuracy": 0.24924284517765044, "num_tokens": 79318185.0, "step": 34595 }, { "entropy": 5.00308198928833, "epoch": 3.323727185398655, "grad_norm": 0.94921875, "learning_rate": 0.0003920077501414318, "loss": 4.5686, "mean_token_accuracy": 0.25722116231918335, "num_tokens": 79329800.0, "step": 34600 }, { "entropy": 4.989739322662354, "epoch": 3.324207492795389, "grad_norm": 1.0234375, "learning_rate": 0.00039197846564582395, "loss": 4.6425, "mean_token_accuracy": 0.24360090494155884, "num_tokens": 79342537.0, "step": 34605 }, { "entropy": 5.022253942489624, "epoch": 3.324687800192123, "grad_norm": 0.98828125, "learning_rate": 0.00039194917843432347, "loss": 4.6199, "mean_token_accuracy": 0.25004632622003553, "num_tokens": 79353727.0, "step": 34610 }, { "entropy": 5.032826948165893, "epoch": 3.325168107588857, "grad_norm": 1.1015625, "learning_rate": 0.0003919198885076103, "loss": 4.5463, "mean_token_accuracy": 0.25729864537715913, "num_tokens": 79364337.0, "step": 34615 }, { "entropy": 5.034902667999267, "epoch": 3.325648414985591, "grad_norm": 1.015625, "learning_rate": 0.00039189059586636465, "loss": 4.604, "mean_token_accuracy": 0.25298081040382386, "num_tokens": 79375890.0, "step": 34620 }, { "entropy": 5.0548238277435305, "epoch": 3.326128722382325, "grad_norm": 1.0234375, "learning_rate": 0.0003918613005112663, "loss": 4.6147, "mean_token_accuracy": 0.2492382198572159, "num_tokens": 79386686.0, "step": 34625 }, { "entropy": 5.050034379959106, "epoch": 3.3266090297790587, "grad_norm": 1.0390625, "learning_rate": 0.0003918320024429956, "loss": 4.6355, "mean_token_accuracy": 0.24784999787807466, "num_tokens": 79397852.0, "step": 34630 }, { "entropy": 5.052454900741577, "epoch": 3.3270893371757926, "grad_norm": 0.97265625, "learning_rate": 0.0003918027016622328, "loss": 4.6198, "mean_token_accuracy": 0.2552115857601166, "num_tokens": 79409547.0, "step": 34635 }, { "entropy": 4.92240686416626, "epoch": 3.3275696445725265, "grad_norm": 1.0078125, "learning_rate": 0.0003917733981696579, "loss": 4.5988, "mean_token_accuracy": 0.2538344353437424, "num_tokens": 79421906.0, "step": 34640 }, { "entropy": 5.0243449211120605, "epoch": 3.3280499519692603, "grad_norm": 0.99609375, "learning_rate": 0.0003917440919659516, "loss": 4.594, "mean_token_accuracy": 0.25182389467954636, "num_tokens": 79433767.0, "step": 34645 }, { "entropy": 5.105506849288941, "epoch": 3.3285302593659942, "grad_norm": 1.03125, "learning_rate": 0.0003917147830517941, "loss": 4.643, "mean_token_accuracy": 0.2416081815958023, "num_tokens": 79445488.0, "step": 34650 }, { "entropy": 5.068530893325805, "epoch": 3.329010566762728, "grad_norm": 0.93359375, "learning_rate": 0.0003916854714278659, "loss": 4.6105, "mean_token_accuracy": 0.25305124223232267, "num_tokens": 79457908.0, "step": 34655 }, { "entropy": 5.064067792892456, "epoch": 3.329490874159462, "grad_norm": 1.0390625, "learning_rate": 0.0003916561570948476, "loss": 4.6647, "mean_token_accuracy": 0.2456543266773224, "num_tokens": 79469876.0, "step": 34660 }, { "entropy": 5.001161003112793, "epoch": 3.329971181556196, "grad_norm": 1.0390625, "learning_rate": 0.0003916268400534197, "loss": 4.541, "mean_token_accuracy": 0.25127813071012495, "num_tokens": 79480532.0, "step": 34665 }, { "entropy": 5.0196448802948, "epoch": 3.3304514889529298, "grad_norm": 1.046875, "learning_rate": 0.0003915975203042628, "loss": 4.6284, "mean_token_accuracy": 0.25387539267539977, "num_tokens": 79491298.0, "step": 34670 }, { "entropy": 5.026435327529907, "epoch": 3.3309317963496636, "grad_norm": 1.0078125, "learning_rate": 0.00039156819784805783, "loss": 4.6023, "mean_token_accuracy": 0.24340915977954863, "num_tokens": 79502794.0, "step": 34675 }, { "entropy": 5.02727255821228, "epoch": 3.3314121037463975, "grad_norm": 1.0078125, "learning_rate": 0.0003915388726854854, "loss": 4.6149, "mean_token_accuracy": 0.24926782846450807, "num_tokens": 79513104.0, "step": 34680 }, { "entropy": 5.1041487693786625, "epoch": 3.331892411143132, "grad_norm": 1.0, "learning_rate": 0.00039150954481722634, "loss": 4.7032, "mean_token_accuracy": 0.23929551988840103, "num_tokens": 79524660.0, "step": 34685 }, { "entropy": 5.1001204490661625, "epoch": 3.3323727185398653, "grad_norm": 1.1484375, "learning_rate": 0.0003914802142439617, "loss": 4.6225, "mean_token_accuracy": 0.24540394246578218, "num_tokens": 79538027.0, "step": 34690 }, { "entropy": 5.030107164382935, "epoch": 3.3328530259365996, "grad_norm": 1.0859375, "learning_rate": 0.0003914508809663723, "loss": 4.5675, "mean_token_accuracy": 0.2557969391345978, "num_tokens": 79548894.0, "step": 34695 }, { "entropy": 5.004552030563355, "epoch": 3.3333333333333335, "grad_norm": 1.0703125, "learning_rate": 0.00039142154498513913, "loss": 4.6753, "mean_token_accuracy": 0.24684297144412995, "num_tokens": 79560520.0, "step": 34700 }, { "entropy": 5.088085651397705, "epoch": 3.3338136407300674, "grad_norm": 1.0625, "learning_rate": 0.00039139220630094357, "loss": 4.6439, "mean_token_accuracy": 0.25575283318758013, "num_tokens": 79571440.0, "step": 34705 }, { "entropy": 5.060253047943116, "epoch": 3.3342939481268012, "grad_norm": 0.9921875, "learning_rate": 0.00039136286491446657, "loss": 4.6559, "mean_token_accuracy": 0.2452133461833, "num_tokens": 79582046.0, "step": 34710 }, { "entropy": 5.013051509857178, "epoch": 3.334774255523535, "grad_norm": 1.0625, "learning_rate": 0.00039133352082638923, "loss": 4.606, "mean_token_accuracy": 0.24551442116498948, "num_tokens": 79593704.0, "step": 34715 }, { "entropy": 5.074276065826416, "epoch": 3.335254562920269, "grad_norm": 0.96484375, "learning_rate": 0.00039130417403739315, "loss": 4.6681, "mean_token_accuracy": 0.24499332159757614, "num_tokens": 79605783.0, "step": 34720 }, { "entropy": 5.042751979827881, "epoch": 3.335734870317003, "grad_norm": 0.984375, "learning_rate": 0.0003912748245481594, "loss": 4.5323, "mean_token_accuracy": 0.25625754743814466, "num_tokens": 79617140.0, "step": 34725 }, { "entropy": 5.0301860809326175, "epoch": 3.3362151777137368, "grad_norm": 1.0390625, "learning_rate": 0.00039124547235936947, "loss": 4.6171, "mean_token_accuracy": 0.245346499979496, "num_tokens": 79627763.0, "step": 34730 }, { "entropy": 5.1219123840332035, "epoch": 3.3366954851104706, "grad_norm": 1.015625, "learning_rate": 0.00039121611747170495, "loss": 4.7187, "mean_token_accuracy": 0.24241504669189454, "num_tokens": 79640056.0, "step": 34735 }, { "entropy": 5.054620790481567, "epoch": 3.3371757925072045, "grad_norm": 0.9921875, "learning_rate": 0.00039118675988584724, "loss": 4.6452, "mean_token_accuracy": 0.24911766946315766, "num_tokens": 79652795.0, "step": 34740 }, { "entropy": 5.165208911895752, "epoch": 3.3376560999039384, "grad_norm": 0.9765625, "learning_rate": 0.0003911573996024779, "loss": 4.7728, "mean_token_accuracy": 0.23868272453546524, "num_tokens": 79663734.0, "step": 34745 }, { "entropy": 5.031789350509643, "epoch": 3.3381364073006723, "grad_norm": 1.1171875, "learning_rate": 0.0003911280366222787, "loss": 4.5572, "mean_token_accuracy": 0.25184957683086395, "num_tokens": 79675092.0, "step": 34750 }, { "entropy": 5.066293573379516, "epoch": 3.338616714697406, "grad_norm": 1.0703125, "learning_rate": 0.00039109867094593134, "loss": 4.7148, "mean_token_accuracy": 0.23990821242332458, "num_tokens": 79687126.0, "step": 34755 }, { "entropy": 4.976014232635498, "epoch": 3.3390970220941405, "grad_norm": 1.0, "learning_rate": 0.0003910693025741175, "loss": 4.5393, "mean_token_accuracy": 0.2583530515432358, "num_tokens": 79698511.0, "step": 34760 }, { "entropy": 5.0544932842254635, "epoch": 3.339577329490874, "grad_norm": 1.0625, "learning_rate": 0.00039103993150751916, "loss": 4.6336, "mean_token_accuracy": 0.2523597240447998, "num_tokens": 79709407.0, "step": 34765 }, { "entropy": 5.0417557716369625, "epoch": 3.3400576368876083, "grad_norm": 1.0703125, "learning_rate": 0.00039101055774681825, "loss": 4.6133, "mean_token_accuracy": 0.25056389570236204, "num_tokens": 79720562.0, "step": 34770 }, { "entropy": 5.073037481307983, "epoch": 3.340537944284342, "grad_norm": 0.96875, "learning_rate": 0.0003909811812926966, "loss": 4.7071, "mean_token_accuracy": 0.23592607975006102, "num_tokens": 79731687.0, "step": 34775 }, { "entropy": 5.069588565826416, "epoch": 3.341018251681076, "grad_norm": 1.046875, "learning_rate": 0.0003909518021458363, "loss": 4.6447, "mean_token_accuracy": 0.24849862605333328, "num_tokens": 79742954.0, "step": 34780 }, { "entropy": 5.050691366195679, "epoch": 3.34149855907781, "grad_norm": 1.0390625, "learning_rate": 0.0003909224203069195, "loss": 4.6767, "mean_token_accuracy": 0.24575020968914033, "num_tokens": 79754233.0, "step": 34785 }, { "entropy": 5.048553848266602, "epoch": 3.341978866474544, "grad_norm": 1.015625, "learning_rate": 0.0003908930357766283, "loss": 4.5237, "mean_token_accuracy": 0.25873469561338425, "num_tokens": 79765348.0, "step": 34790 }, { "entropy": 5.020631742477417, "epoch": 3.3424591738712777, "grad_norm": 1.0546875, "learning_rate": 0.0003908636485556449, "loss": 4.5937, "mean_token_accuracy": 0.2509862929582596, "num_tokens": 79776993.0, "step": 34795 }, { "entropy": 5.103504943847656, "epoch": 3.3429394812680115, "grad_norm": 1.046875, "learning_rate": 0.00039083425864465165, "loss": 4.7444, "mean_token_accuracy": 0.2453736409544945, "num_tokens": 79787321.0, "step": 34800 }, { "entropy": 5.048814916610718, "epoch": 3.3434197886647454, "grad_norm": 1.0625, "learning_rate": 0.0003908048660443309, "loss": 4.5941, "mean_token_accuracy": 0.24696359783411026, "num_tokens": 79798130.0, "step": 34805 }, { "entropy": 5.0221131324768065, "epoch": 3.3439000960614793, "grad_norm": 0.98828125, "learning_rate": 0.0003907754707553651, "loss": 4.5643, "mean_token_accuracy": 0.25141998529434206, "num_tokens": 79809616.0, "step": 34810 }, { "entropy": 4.96633505821228, "epoch": 3.344380403458213, "grad_norm": 0.984375, "learning_rate": 0.0003907460727784365, "loss": 4.5849, "mean_token_accuracy": 0.25542705655097964, "num_tokens": 79820355.0, "step": 34815 }, { "entropy": 5.065265417098999, "epoch": 3.344860710854947, "grad_norm": 1.0390625, "learning_rate": 0.00039071667211422787, "loss": 4.6215, "mean_token_accuracy": 0.2475579112768173, "num_tokens": 79831733.0, "step": 34820 }, { "entropy": 5.066289615631104, "epoch": 3.345341018251681, "grad_norm": 1.046875, "learning_rate": 0.0003906872687634217, "loss": 4.6637, "mean_token_accuracy": 0.24272034019231797, "num_tokens": 79842961.0, "step": 34825 }, { "entropy": 5.043630075454712, "epoch": 3.345821325648415, "grad_norm": 1.0078125, "learning_rate": 0.00039065786272670066, "loss": 4.6594, "mean_token_accuracy": 0.25113607496023177, "num_tokens": 79855142.0, "step": 34830 }, { "entropy": 4.972777795791626, "epoch": 3.3463016330451487, "grad_norm": 1.1171875, "learning_rate": 0.0003906284540047475, "loss": 4.5554, "mean_token_accuracy": 0.2545921131968498, "num_tokens": 79865973.0, "step": 34835 }, { "entropy": 5.092799186706543, "epoch": 3.3467819404418826, "grad_norm": 1.0625, "learning_rate": 0.00039059904259824507, "loss": 4.6387, "mean_token_accuracy": 0.2502507969737053, "num_tokens": 79877424.0, "step": 34840 }, { "entropy": 5.109822702407837, "epoch": 3.347262247838617, "grad_norm": 0.95703125, "learning_rate": 0.000390569628507876, "loss": 4.669, "mean_token_accuracy": 0.24460556954145432, "num_tokens": 79889956.0, "step": 34845 }, { "entropy": 5.022493124008179, "epoch": 3.347742555235351, "grad_norm": 0.984375, "learning_rate": 0.00039054021173432336, "loss": 4.6251, "mean_token_accuracy": 0.24823231250047684, "num_tokens": 79902454.0, "step": 34850 }, { "entropy": 5.035952091217041, "epoch": 3.3482228626320847, "grad_norm": 0.98828125, "learning_rate": 0.0003905107922782701, "loss": 4.6641, "mean_token_accuracy": 0.24565812349319457, "num_tokens": 79914425.0, "step": 34855 }, { "entropy": 4.987614679336548, "epoch": 3.3487031700288186, "grad_norm": 0.98046875, "learning_rate": 0.0003904813701403993, "loss": 4.5628, "mean_token_accuracy": 0.25591775923967364, "num_tokens": 79926225.0, "step": 34860 }, { "entropy": 4.974949789047241, "epoch": 3.3491834774255524, "grad_norm": 0.96484375, "learning_rate": 0.00039045194532139396, "loss": 4.5591, "mean_token_accuracy": 0.25687998682260516, "num_tokens": 79937804.0, "step": 34865 }, { "entropy": 5.070349645614624, "epoch": 3.3496637848222863, "grad_norm": 1.1171875, "learning_rate": 0.0003904225178219372, "loss": 4.6242, "mean_token_accuracy": 0.25052126199007035, "num_tokens": 79949588.0, "step": 34870 }, { "entropy": 5.033775806427002, "epoch": 3.35014409221902, "grad_norm": 1.0234375, "learning_rate": 0.00039039308764271237, "loss": 4.5852, "mean_token_accuracy": 0.24895876049995422, "num_tokens": 79960243.0, "step": 34875 }, { "entropy": 4.979428672790528, "epoch": 3.350624399615754, "grad_norm": 1.0625, "learning_rate": 0.0003903636547844026, "loss": 4.6057, "mean_token_accuracy": 0.2489009216427803, "num_tokens": 79971780.0, "step": 34880 }, { "entropy": 5.041569805145263, "epoch": 3.351104707012488, "grad_norm": 0.99609375, "learning_rate": 0.00039033421924769145, "loss": 4.6335, "mean_token_accuracy": 0.2484254464507103, "num_tokens": 79982105.0, "step": 34885 }, { "entropy": 5.013979148864746, "epoch": 3.351585014409222, "grad_norm": 1.015625, "learning_rate": 0.00039030478103326216, "loss": 4.5643, "mean_token_accuracy": 0.2592616483569145, "num_tokens": 79993456.0, "step": 34890 }, { "entropy": 5.073239994049072, "epoch": 3.3520653218059557, "grad_norm": 0.99609375, "learning_rate": 0.00039027534014179823, "loss": 4.6747, "mean_token_accuracy": 0.2410885751247406, "num_tokens": 80006335.0, "step": 34895 }, { "entropy": 5.026223230361938, "epoch": 3.3525456292026896, "grad_norm": 0.953125, "learning_rate": 0.0003902458965739832, "loss": 4.6048, "mean_token_accuracy": 0.25290912985801695, "num_tokens": 80017574.0, "step": 34900 }, { "entropy": 5.0304210662841795, "epoch": 3.3530259365994235, "grad_norm": 1.015625, "learning_rate": 0.0003902164503305006, "loss": 4.629, "mean_token_accuracy": 0.2353252351284027, "num_tokens": 80029470.0, "step": 34905 }, { "entropy": 5.000137281417847, "epoch": 3.3535062439961574, "grad_norm": 0.9453125, "learning_rate": 0.0003901870014120343, "loss": 4.5857, "mean_token_accuracy": 0.2529801607131958, "num_tokens": 80040599.0, "step": 34910 }, { "entropy": 5.060665082931519, "epoch": 3.3539865513928913, "grad_norm": 1.0234375, "learning_rate": 0.0003901575498192678, "loss": 4.631, "mean_token_accuracy": 0.24895701557397842, "num_tokens": 80052597.0, "step": 34915 }, { "entropy": 5.065235280990601, "epoch": 3.3544668587896256, "grad_norm": 1.125, "learning_rate": 0.0003901280955528849, "loss": 4.6645, "mean_token_accuracy": 0.2418915808200836, "num_tokens": 80063208.0, "step": 34920 }, { "entropy": 4.99900975227356, "epoch": 3.3549471661863595, "grad_norm": 1.078125, "learning_rate": 0.0003900986386135695, "loss": 4.6406, "mean_token_accuracy": 0.2570782914757729, "num_tokens": 80074046.0, "step": 34925 }, { "entropy": 5.06900725364685, "epoch": 3.3554274735830933, "grad_norm": 0.96484375, "learning_rate": 0.00039006917900200543, "loss": 4.6908, "mean_token_accuracy": 0.24064703434705734, "num_tokens": 80086054.0, "step": 34930 }, { "entropy": 5.1344099044799805, "epoch": 3.3559077809798272, "grad_norm": 1.0234375, "learning_rate": 0.00039003971671887675, "loss": 4.5825, "mean_token_accuracy": 0.25378605872392657, "num_tokens": 80096030.0, "step": 34935 }, { "entropy": 4.9018505096435545, "epoch": 3.356388088376561, "grad_norm": 0.97265625, "learning_rate": 0.0003900102517648674, "loss": 4.4582, "mean_token_accuracy": 0.26462458819150925, "num_tokens": 80106412.0, "step": 34940 }, { "entropy": 5.00390625, "epoch": 3.356868395773295, "grad_norm": 1.1171875, "learning_rate": 0.0003899807841406617, "loss": 4.6295, "mean_token_accuracy": 0.2515963226556778, "num_tokens": 80116580.0, "step": 34945 }, { "entropy": 5.083790874481201, "epoch": 3.357348703170029, "grad_norm": 1.0234375, "learning_rate": 0.0003899513138469434, "loss": 4.6466, "mean_token_accuracy": 0.2497406020760536, "num_tokens": 80129055.0, "step": 34950 }, { "entropy": 5.04338812828064, "epoch": 3.3578290105667628, "grad_norm": 1.015625, "learning_rate": 0.000389921840884397, "loss": 4.7086, "mean_token_accuracy": 0.24321456551551818, "num_tokens": 80140626.0, "step": 34955 }, { "entropy": 5.090425682067871, "epoch": 3.3583093179634966, "grad_norm": 0.99609375, "learning_rate": 0.00038989236525370676, "loss": 4.6687, "mean_token_accuracy": 0.24711875915527343, "num_tokens": 80151896.0, "step": 34960 }, { "entropy": 5.068356513977051, "epoch": 3.3587896253602305, "grad_norm": 1.140625, "learning_rate": 0.0003898628869555569, "loss": 4.584, "mean_token_accuracy": 0.25327493250370026, "num_tokens": 80162801.0, "step": 34965 }, { "entropy": 5.058989238739014, "epoch": 3.3592699327569644, "grad_norm": 1.0234375, "learning_rate": 0.00038983340599063187, "loss": 4.6073, "mean_token_accuracy": 0.24858204871416092, "num_tokens": 80174998.0, "step": 34970 }, { "entropy": 5.136708736419678, "epoch": 3.3597502401536983, "grad_norm": 0.9375, "learning_rate": 0.0003898039223596162, "loss": 4.7701, "mean_token_accuracy": 0.23823589086532593, "num_tokens": 80188092.0, "step": 34975 }, { "entropy": 5.088174486160279, "epoch": 3.360230547550432, "grad_norm": 1.09375, "learning_rate": 0.0003897744360631943, "loss": 4.6215, "mean_token_accuracy": 0.25468567907810213, "num_tokens": 80199749.0, "step": 34980 }, { "entropy": 4.988624811172485, "epoch": 3.360710854947166, "grad_norm": 1.0625, "learning_rate": 0.00038974494710205084, "loss": 4.5906, "mean_token_accuracy": 0.2534733057022095, "num_tokens": 80211714.0, "step": 34985 }, { "entropy": 5.010880756378174, "epoch": 3.3611911623439, "grad_norm": 1.078125, "learning_rate": 0.00038971545547687036, "loss": 4.6184, "mean_token_accuracy": 0.25120625346899034, "num_tokens": 80222258.0, "step": 34990 }, { "entropy": 4.944441032409668, "epoch": 3.3616714697406342, "grad_norm": 1.078125, "learning_rate": 0.00038968596118833766, "loss": 4.5875, "mean_token_accuracy": 0.25603134781122205, "num_tokens": 80234532.0, "step": 34995 }, { "entropy": 5.029980945587158, "epoch": 3.3621517771373677, "grad_norm": 1.0390625, "learning_rate": 0.00038965646423713744, "loss": 4.5566, "mean_token_accuracy": 0.2543851360678673, "num_tokens": 80245826.0, "step": 35000 }, { "entropy": 4.983625268936157, "epoch": 3.362632084534102, "grad_norm": 1.0234375, "learning_rate": 0.00038962696462395473, "loss": 4.5365, "mean_token_accuracy": 0.24928556382656097, "num_tokens": 80256715.0, "step": 35005 }, { "entropy": 5.06455135345459, "epoch": 3.363112391930836, "grad_norm": 1.0, "learning_rate": 0.0003895974623494742, "loss": 4.6382, "mean_token_accuracy": 0.24848238825798036, "num_tokens": 80267713.0, "step": 35010 }, { "entropy": 5.060695934295654, "epoch": 3.3635926993275698, "grad_norm": 1.0546875, "learning_rate": 0.00038956795741438085, "loss": 4.6585, "mean_token_accuracy": 0.2485118180513382, "num_tokens": 80278935.0, "step": 35015 }, { "entropy": 4.979933404922486, "epoch": 3.3640730067243036, "grad_norm": 0.953125, "learning_rate": 0.00038953844981935975, "loss": 4.5085, "mean_token_accuracy": 0.2593720957636833, "num_tokens": 80289735.0, "step": 35020 }, { "entropy": 5.018657970428467, "epoch": 3.3645533141210375, "grad_norm": 0.9765625, "learning_rate": 0.00038950893956509597, "loss": 4.684, "mean_token_accuracy": 0.25095806568861007, "num_tokens": 80301617.0, "step": 35025 }, { "entropy": 4.975236129760742, "epoch": 3.3650336215177714, "grad_norm": 1.1015625, "learning_rate": 0.0003894794266522746, "loss": 4.4971, "mean_token_accuracy": 0.26460852175951005, "num_tokens": 80312996.0, "step": 35030 }, { "entropy": 5.0634908199310305, "epoch": 3.3655139289145053, "grad_norm": 1.078125, "learning_rate": 0.00038944991108158094, "loss": 4.5849, "mean_token_accuracy": 0.24807824194431305, "num_tokens": 80324435.0, "step": 35035 }, { "entropy": 5.008642959594726, "epoch": 3.365994236311239, "grad_norm": 0.96484375, "learning_rate": 0.0003894203928537001, "loss": 4.6229, "mean_token_accuracy": 0.25143705904483793, "num_tokens": 80335922.0, "step": 35040 }, { "entropy": 5.046055936813355, "epoch": 3.366474543707973, "grad_norm": 0.9765625, "learning_rate": 0.00038939087196931754, "loss": 4.6381, "mean_token_accuracy": 0.25367101579904555, "num_tokens": 80348405.0, "step": 35045 }, { "entropy": 5.050200891494751, "epoch": 3.366954851104707, "grad_norm": 1.1015625, "learning_rate": 0.00038936134842911863, "loss": 4.6002, "mean_token_accuracy": 0.25341845452785494, "num_tokens": 80361387.0, "step": 35050 }, { "entropy": 4.981326484680176, "epoch": 3.367435158501441, "grad_norm": 1.03125, "learning_rate": 0.0003893318222337888, "loss": 4.642, "mean_token_accuracy": 0.25004069954156877, "num_tokens": 80372500.0, "step": 35055 }, { "entropy": 5.095010089874267, "epoch": 3.3679154658981747, "grad_norm": 0.99609375, "learning_rate": 0.00038930229338401354, "loss": 4.7115, "mean_token_accuracy": 0.24472733289003373, "num_tokens": 80384311.0, "step": 35060 }, { "entropy": 5.059339284896851, "epoch": 3.3683957732949086, "grad_norm": 1.03125, "learning_rate": 0.0003892727618804783, "loss": 4.5883, "mean_token_accuracy": 0.25022012293338775, "num_tokens": 80396259.0, "step": 35065 }, { "entropy": 5.0155357837677, "epoch": 3.3688760806916425, "grad_norm": 1.09375, "learning_rate": 0.000389243227723869, "loss": 4.6135, "mean_token_accuracy": 0.24623016715049745, "num_tokens": 80407420.0, "step": 35070 }, { "entropy": 4.988042116165161, "epoch": 3.3693563880883763, "grad_norm": 1.15625, "learning_rate": 0.0003892136909148711, "loss": 4.5612, "mean_token_accuracy": 0.2516023561358452, "num_tokens": 80417130.0, "step": 35075 }, { "entropy": 5.065314388275146, "epoch": 3.3698366954851107, "grad_norm": 0.9375, "learning_rate": 0.0003891841514541706, "loss": 4.6198, "mean_token_accuracy": 0.25408089309930804, "num_tokens": 80429405.0, "step": 35080 }, { "entropy": 5.032301759719848, "epoch": 3.3703170028818445, "grad_norm": 1.1328125, "learning_rate": 0.000389154609342453, "loss": 4.6282, "mean_token_accuracy": 0.24592494666576387, "num_tokens": 80439895.0, "step": 35085 }, { "entropy": 4.953702640533447, "epoch": 3.3707973102785784, "grad_norm": 0.99609375, "learning_rate": 0.0003891250645804044, "loss": 4.5914, "mean_token_accuracy": 0.25820731818675996, "num_tokens": 80451884.0, "step": 35090 }, { "entropy": 5.060756540298462, "epoch": 3.3712776176753123, "grad_norm": 1.0390625, "learning_rate": 0.00038909551716871074, "loss": 4.6214, "mean_token_accuracy": 0.2534745901823044, "num_tokens": 80462629.0, "step": 35095 }, { "entropy": 5.088488388061523, "epoch": 3.371757925072046, "grad_norm": 1.078125, "learning_rate": 0.0003890659671080579, "loss": 4.6689, "mean_token_accuracy": 0.24508027881383895, "num_tokens": 80472066.0, "step": 35100 }, { "entropy": 5.0153107166290285, "epoch": 3.37223823246878, "grad_norm": 0.92578125, "learning_rate": 0.000389036414399132, "loss": 4.5686, "mean_token_accuracy": 0.25422897189855576, "num_tokens": 80483687.0, "step": 35105 }, { "entropy": 5.004940223693848, "epoch": 3.372718539865514, "grad_norm": 1.09375, "learning_rate": 0.0003890068590426191, "loss": 4.6007, "mean_token_accuracy": 0.2536112517118454, "num_tokens": 80493323.0, "step": 35110 }, { "entropy": 5.023319911956787, "epoch": 3.373198847262248, "grad_norm": 1.0546875, "learning_rate": 0.0003889773010392056, "loss": 4.6233, "mean_token_accuracy": 0.24997997283935547, "num_tokens": 80504454.0, "step": 35115 }, { "entropy": 4.969642305374146, "epoch": 3.3736791546589817, "grad_norm": 1.1640625, "learning_rate": 0.00038894774038957756, "loss": 4.5349, "mean_token_accuracy": 0.25590767413377763, "num_tokens": 80516201.0, "step": 35120 }, { "entropy": 4.994466161727905, "epoch": 3.3741594620557156, "grad_norm": 1.0546875, "learning_rate": 0.00038891817709442135, "loss": 4.6321, "mean_token_accuracy": 0.2489602282643318, "num_tokens": 80528319.0, "step": 35125 }, { "entropy": 5.012839078903198, "epoch": 3.3746397694524495, "grad_norm": 1.0, "learning_rate": 0.00038888861115442334, "loss": 4.5588, "mean_token_accuracy": 0.2572706416249275, "num_tokens": 80539684.0, "step": 35130 }, { "entropy": 5.011568927764893, "epoch": 3.3751200768491834, "grad_norm": 0.94921875, "learning_rate": 0.0003888590425702699, "loss": 4.5827, "mean_token_accuracy": 0.25684234499931335, "num_tokens": 80550971.0, "step": 35135 }, { "entropy": 4.997000217437744, "epoch": 3.3756003842459172, "grad_norm": 1.0234375, "learning_rate": 0.0003888294713426477, "loss": 4.5924, "mean_token_accuracy": 0.2533003658056259, "num_tokens": 80562931.0, "step": 35140 }, { "entropy": 4.9594700813293455, "epoch": 3.376080691642651, "grad_norm": 1.0234375, "learning_rate": 0.00038879989747224317, "loss": 4.5819, "mean_token_accuracy": 0.2596623405814171, "num_tokens": 80573899.0, "step": 35145 }, { "entropy": 5.024944305419922, "epoch": 3.376560999039385, "grad_norm": 0.98046875, "learning_rate": 0.0003887703209597428, "loss": 4.5934, "mean_token_accuracy": 0.25081279426813125, "num_tokens": 80585453.0, "step": 35150 }, { "entropy": 5.05692572593689, "epoch": 3.3770413064361193, "grad_norm": 0.96484375, "learning_rate": 0.0003887407418058335, "loss": 4.6539, "mean_token_accuracy": 0.2525654971599579, "num_tokens": 80597911.0, "step": 35155 }, { "entropy": 5.046922397613526, "epoch": 3.377521613832853, "grad_norm": 1.09375, "learning_rate": 0.00038871116001120196, "loss": 4.6075, "mean_token_accuracy": 0.25285103768110273, "num_tokens": 80609350.0, "step": 35160 }, { "entropy": 5.025788450241089, "epoch": 3.378001921229587, "grad_norm": 0.9296875, "learning_rate": 0.0003886815755765348, "loss": 4.6589, "mean_token_accuracy": 0.246637824177742, "num_tokens": 80620710.0, "step": 35165 }, { "entropy": 5.088965368270874, "epoch": 3.378482228626321, "grad_norm": 0.95703125, "learning_rate": 0.0003886519885025191, "loss": 4.737, "mean_token_accuracy": 0.23663281500339509, "num_tokens": 80633031.0, "step": 35170 }, { "entropy": 4.987149667739868, "epoch": 3.378962536023055, "grad_norm": 0.97265625, "learning_rate": 0.00038862239878984173, "loss": 4.5205, "mean_token_accuracy": 0.2592849716544151, "num_tokens": 80644047.0, "step": 35175 }, { "entropy": 5.032893371582031, "epoch": 3.3794428434197887, "grad_norm": 1.0390625, "learning_rate": 0.0003885928064391897, "loss": 4.5971, "mean_token_accuracy": 0.25226512998342515, "num_tokens": 80655769.0, "step": 35180 }, { "entropy": 5.070116376876831, "epoch": 3.3799231508165226, "grad_norm": 1.015625, "learning_rate": 0.0003885632114512499, "loss": 4.6411, "mean_token_accuracy": 0.2474030002951622, "num_tokens": 80667881.0, "step": 35185 }, { "entropy": 5.0757129192352295, "epoch": 3.3804034582132565, "grad_norm": 1.0, "learning_rate": 0.00038853361382670956, "loss": 4.6875, "mean_token_accuracy": 0.24633048176765443, "num_tokens": 80679021.0, "step": 35190 }, { "entropy": 5.026468753814697, "epoch": 3.3808837656099904, "grad_norm": 1.0234375, "learning_rate": 0.00038850401356625583, "loss": 4.641, "mean_token_accuracy": 0.2502972841262817, "num_tokens": 80689366.0, "step": 35195 }, { "entropy": 4.9525751113891605, "epoch": 3.3813640730067243, "grad_norm": 1.015625, "learning_rate": 0.0003884744106705759, "loss": 4.5438, "mean_token_accuracy": 0.25194079875946046, "num_tokens": 80700755.0, "step": 35200 }, { "entropy": 5.034112548828125, "epoch": 3.381844380403458, "grad_norm": 1.015625, "learning_rate": 0.00038844480514035727, "loss": 4.6111, "mean_token_accuracy": 0.24931746870279312, "num_tokens": 80712186.0, "step": 35205 }, { "entropy": 4.97149806022644, "epoch": 3.382324687800192, "grad_norm": 1.078125, "learning_rate": 0.000388415196976287, "loss": 4.5836, "mean_token_accuracy": 0.2611155539751053, "num_tokens": 80723865.0, "step": 35210 }, { "entropy": 5.002107858657837, "epoch": 3.382804995196926, "grad_norm": 1.0625, "learning_rate": 0.0003883855861790526, "loss": 4.6256, "mean_token_accuracy": 0.2487585127353668, "num_tokens": 80735817.0, "step": 35215 }, { "entropy": 5.030579614639282, "epoch": 3.38328530259366, "grad_norm": 1.015625, "learning_rate": 0.0003883559727493417, "loss": 4.6974, "mean_token_accuracy": 0.24719827026128768, "num_tokens": 80747782.0, "step": 35220 }, { "entropy": 5.0343286991119385, "epoch": 3.3837656099903937, "grad_norm": 1.046875, "learning_rate": 0.0003883263566878416, "loss": 4.6278, "mean_token_accuracy": 0.24029747098684312, "num_tokens": 80759119.0, "step": 35225 }, { "entropy": 5.019806241989135, "epoch": 3.384245917387128, "grad_norm": 1.03125, "learning_rate": 0.00038829673799524006, "loss": 4.5461, "mean_token_accuracy": 0.2609804138541222, "num_tokens": 80769792.0, "step": 35230 }, { "entropy": 4.979036855697632, "epoch": 3.3847262247838614, "grad_norm": 0.9296875, "learning_rate": 0.00038826711667222464, "loss": 4.5588, "mean_token_accuracy": 0.2536883130669594, "num_tokens": 80781685.0, "step": 35235 }, { "entropy": 4.916115140914917, "epoch": 3.3852065321805958, "grad_norm": 1.0390625, "learning_rate": 0.00038823749271948315, "loss": 4.5189, "mean_token_accuracy": 0.25899354815483094, "num_tokens": 80792499.0, "step": 35240 }, { "entropy": 5.050924444198609, "epoch": 3.3856868395773296, "grad_norm": 1.109375, "learning_rate": 0.00038820786613770334, "loss": 4.6005, "mean_token_accuracy": 0.25700204372406005, "num_tokens": 80803020.0, "step": 35245 }, { "entropy": 5.054552841186523, "epoch": 3.3861671469740635, "grad_norm": 1.0859375, "learning_rate": 0.00038817823692757303, "loss": 4.6538, "mean_token_accuracy": 0.2523463472723961, "num_tokens": 80814543.0, "step": 35250 }, { "entropy": 4.916332340240478, "epoch": 3.3866474543707974, "grad_norm": 1.0234375, "learning_rate": 0.00038814860508978004, "loss": 4.5524, "mean_token_accuracy": 0.25886755585670473, "num_tokens": 80826686.0, "step": 35255 }, { "entropy": 5.032272148132324, "epoch": 3.3871277617675313, "grad_norm": 1.0390625, "learning_rate": 0.0003881189706250125, "loss": 4.6673, "mean_token_accuracy": 0.24142957776784896, "num_tokens": 80838372.0, "step": 35260 }, { "entropy": 5.123521566390991, "epoch": 3.387608069164265, "grad_norm": 1.0625, "learning_rate": 0.00038808933353395836, "loss": 4.6314, "mean_token_accuracy": 0.24835428595542908, "num_tokens": 80849334.0, "step": 35265 }, { "entropy": 4.956131029129028, "epoch": 3.388088376560999, "grad_norm": 1.109375, "learning_rate": 0.00038805969381730564, "loss": 4.4944, "mean_token_accuracy": 0.25334072560071946, "num_tokens": 80860682.0, "step": 35270 }, { "entropy": 5.04627799987793, "epoch": 3.388568683957733, "grad_norm": 1.09375, "learning_rate": 0.00038803005147574265, "loss": 4.6569, "mean_token_accuracy": 0.24770759046077728, "num_tokens": 80872026.0, "step": 35275 }, { "entropy": 5.063813591003418, "epoch": 3.389048991354467, "grad_norm": 1.0078125, "learning_rate": 0.0003880004065099575, "loss": 4.6519, "mean_token_accuracy": 0.24535784721374512, "num_tokens": 80884146.0, "step": 35280 }, { "entropy": 5.036695432662964, "epoch": 3.3895292987512007, "grad_norm": 0.96484375, "learning_rate": 0.0003879707589206383, "loss": 4.6238, "mean_token_accuracy": 0.25011399686336516, "num_tokens": 80895975.0, "step": 35285 }, { "entropy": 5.1202473640441895, "epoch": 3.3900096061479346, "grad_norm": 1.03125, "learning_rate": 0.0003879411087084736, "loss": 4.7081, "mean_token_accuracy": 0.241127410531044, "num_tokens": 80908144.0, "step": 35290 }, { "entropy": 5.017182159423828, "epoch": 3.3904899135446684, "grad_norm": 1.0078125, "learning_rate": 0.00038791145587415186, "loss": 4.6031, "mean_token_accuracy": 0.2476770669221878, "num_tokens": 80918616.0, "step": 35295 }, { "entropy": 5.067205572128296, "epoch": 3.3909702209414023, "grad_norm": 0.94921875, "learning_rate": 0.00038788180041836117, "loss": 4.7103, "mean_token_accuracy": 0.2398787707090378, "num_tokens": 80929690.0, "step": 35300 }, { "entropy": 5.053402137756348, "epoch": 3.3914505283381366, "grad_norm": 1.015625, "learning_rate": 0.00038785214234179037, "loss": 4.582, "mean_token_accuracy": 0.248058520257473, "num_tokens": 80940817.0, "step": 35305 }, { "entropy": 5.060543441772461, "epoch": 3.39193083573487, "grad_norm": 1.0703125, "learning_rate": 0.00038782248164512804, "loss": 4.5829, "mean_token_accuracy": 0.2567798539996147, "num_tokens": 80950951.0, "step": 35310 }, { "entropy": 5.011791658401489, "epoch": 3.3924111431316044, "grad_norm": 1.0078125, "learning_rate": 0.00038779281832906253, "loss": 4.6077, "mean_token_accuracy": 0.24900132268667222, "num_tokens": 80963511.0, "step": 35315 }, { "entropy": 5.099618673324585, "epoch": 3.3928914505283383, "grad_norm": 0.94921875, "learning_rate": 0.00038776315239428275, "loss": 4.7113, "mean_token_accuracy": 0.24593275040388107, "num_tokens": 80975905.0, "step": 35320 }, { "entropy": 5.089013481140137, "epoch": 3.393371757925072, "grad_norm": 1.0390625, "learning_rate": 0.00038773348384147743, "loss": 4.6394, "mean_token_accuracy": 0.2502724289894104, "num_tokens": 80986850.0, "step": 35325 }, { "entropy": 4.917418622970581, "epoch": 3.393852065321806, "grad_norm": 1.1015625, "learning_rate": 0.0003877038126713354, "loss": 4.5039, "mean_token_accuracy": 0.263748537003994, "num_tokens": 80998190.0, "step": 35330 }, { "entropy": 4.979682922363281, "epoch": 3.39433237271854, "grad_norm": 0.98828125, "learning_rate": 0.00038767413888454537, "loss": 4.6027, "mean_token_accuracy": 0.24252035170793534, "num_tokens": 81011086.0, "step": 35335 }, { "entropy": 5.177395629882812, "epoch": 3.394812680115274, "grad_norm": 0.98046875, "learning_rate": 0.00038764446248179665, "loss": 4.6873, "mean_token_accuracy": 0.2508353665471077, "num_tokens": 81022074.0, "step": 35340 }, { "entropy": 5.008141040802002, "epoch": 3.3952929875120077, "grad_norm": 1.0, "learning_rate": 0.0003876147834637778, "loss": 4.5974, "mean_token_accuracy": 0.24676503390073776, "num_tokens": 81033019.0, "step": 35345 }, { "entropy": 5.081465482711792, "epoch": 3.3957732949087416, "grad_norm": 1.0703125, "learning_rate": 0.00038758510183117806, "loss": 4.6982, "mean_token_accuracy": 0.24363914877176285, "num_tokens": 81044955.0, "step": 35350 }, { "entropy": 5.02382526397705, "epoch": 3.3962536023054755, "grad_norm": 1.0859375, "learning_rate": 0.0003875554175846866, "loss": 4.5134, "mean_token_accuracy": 0.26208958923816683, "num_tokens": 81055980.0, "step": 35355 }, { "entropy": 4.9981273174285885, "epoch": 3.3967339097022093, "grad_norm": 1.0, "learning_rate": 0.00038752573072499267, "loss": 4.5599, "mean_token_accuracy": 0.2592511162161827, "num_tokens": 81067960.0, "step": 35360 }, { "entropy": 5.036674976348877, "epoch": 3.3972142170989432, "grad_norm": 1.0390625, "learning_rate": 0.00038749604125278524, "loss": 4.6555, "mean_token_accuracy": 0.2486070767045021, "num_tokens": 81078084.0, "step": 35365 }, { "entropy": 5.049911880493164, "epoch": 3.397694524495677, "grad_norm": 1.1796875, "learning_rate": 0.0003874663491687539, "loss": 4.5872, "mean_token_accuracy": 0.2517434969544411, "num_tokens": 81089376.0, "step": 35370 }, { "entropy": 5.107871150970459, "epoch": 3.398174831892411, "grad_norm": 0.94921875, "learning_rate": 0.00038743665447358785, "loss": 4.6663, "mean_token_accuracy": 0.24245515316724778, "num_tokens": 81100906.0, "step": 35375 }, { "entropy": 5.0644361019134525, "epoch": 3.398655139289145, "grad_norm": 0.98046875, "learning_rate": 0.0003874069571679766, "loss": 4.6551, "mean_token_accuracy": 0.24924662858247756, "num_tokens": 81112839.0, "step": 35380 }, { "entropy": 4.942523193359375, "epoch": 3.3991354466858787, "grad_norm": 1.015625, "learning_rate": 0.00038737725725260946, "loss": 4.5748, "mean_token_accuracy": 0.2515710085630417, "num_tokens": 81126015.0, "step": 35385 }, { "entropy": 5.0529743194580075, "epoch": 3.399615754082613, "grad_norm": 0.93359375, "learning_rate": 0.00038734755472817617, "loss": 4.6528, "mean_token_accuracy": 0.24668528586626054, "num_tokens": 81139526.0, "step": 35390 }, { "entropy": 5.015586805343628, "epoch": 3.400096061479347, "grad_norm": 1.03125, "learning_rate": 0.00038731784959536626, "loss": 4.5318, "mean_token_accuracy": 0.25951134115457536, "num_tokens": 81152340.0, "step": 35395 }, { "entropy": 5.102068090438843, "epoch": 3.400576368876081, "grad_norm": 1.0234375, "learning_rate": 0.00038728814185486944, "loss": 4.651, "mean_token_accuracy": 0.2418101504445076, "num_tokens": 81162724.0, "step": 35400 }, { "entropy": 5.047775077819824, "epoch": 3.4010566762728147, "grad_norm": 1.1484375, "learning_rate": 0.0003872584315073753, "loss": 4.612, "mean_token_accuracy": 0.24746428579092025, "num_tokens": 81172933.0, "step": 35405 }, { "entropy": 4.999932432174683, "epoch": 3.4015369836695486, "grad_norm": 1.0546875, "learning_rate": 0.0003872287185535738, "loss": 4.627, "mean_token_accuracy": 0.24676198065280913, "num_tokens": 81183774.0, "step": 35410 }, { "entropy": 4.921209859848022, "epoch": 3.4020172910662825, "grad_norm": 0.9765625, "learning_rate": 0.00038719900299415475, "loss": 4.4743, "mean_token_accuracy": 0.2676608473062515, "num_tokens": 81194896.0, "step": 35415 }, { "entropy": 5.025229454040527, "epoch": 3.4024975984630164, "grad_norm": 1.0625, "learning_rate": 0.0003871692848298079, "loss": 4.5811, "mean_token_accuracy": 0.2540947362780571, "num_tokens": 81204889.0, "step": 35420 }, { "entropy": 5.062064266204834, "epoch": 3.4029779058597502, "grad_norm": 1.1171875, "learning_rate": 0.00038713956406122334, "loss": 4.6621, "mean_token_accuracy": 0.2442566990852356, "num_tokens": 81217177.0, "step": 35425 }, { "entropy": 5.0922904968261715, "epoch": 3.403458213256484, "grad_norm": 1.0234375, "learning_rate": 0.00038710984068909116, "loss": 4.6769, "mean_token_accuracy": 0.24345260560512544, "num_tokens": 81228767.0, "step": 35430 }, { "entropy": 4.998703479766846, "epoch": 3.403938520653218, "grad_norm": 1.0, "learning_rate": 0.0003870801147141014, "loss": 4.5507, "mean_token_accuracy": 0.24674364775419236, "num_tokens": 81239599.0, "step": 35435 }, { "entropy": 5.003114843368531, "epoch": 3.404418828049952, "grad_norm": 0.99609375, "learning_rate": 0.0003870503861369441, "loss": 4.6116, "mean_token_accuracy": 0.2476789563894272, "num_tokens": 81251051.0, "step": 35440 }, { "entropy": 5.058178091049195, "epoch": 3.4048991354466858, "grad_norm": 0.9296875, "learning_rate": 0.00038702065495830956, "loss": 4.6773, "mean_token_accuracy": 0.24947068840265274, "num_tokens": 81262484.0, "step": 35445 }, { "entropy": 5.028571701049804, "epoch": 3.4053794428434196, "grad_norm": 1.0390625, "learning_rate": 0.0003869909211788881, "loss": 4.5757, "mean_token_accuracy": 0.26031079739332197, "num_tokens": 81273068.0, "step": 35450 }, { "entropy": 5.029877090454102, "epoch": 3.4058597502401535, "grad_norm": 1.09375, "learning_rate": 0.00038696118479936994, "loss": 4.6287, "mean_token_accuracy": 0.25055190920829773, "num_tokens": 81285239.0, "step": 35455 }, { "entropy": 5.05253529548645, "epoch": 3.4063400576368874, "grad_norm": 1.125, "learning_rate": 0.00038693144582044553, "loss": 4.6357, "mean_token_accuracy": 0.2467782527208328, "num_tokens": 81295998.0, "step": 35460 }, { "entropy": 4.9913591861724855, "epoch": 3.4068203650336217, "grad_norm": 1.0625, "learning_rate": 0.00038690170424280534, "loss": 4.5207, "mean_token_accuracy": 0.2551711842417717, "num_tokens": 81307569.0, "step": 35465 }, { "entropy": 4.980480146408081, "epoch": 3.4073006724303556, "grad_norm": 1.1171875, "learning_rate": 0.0003868719600671399, "loss": 4.571, "mean_token_accuracy": 0.24703271239995955, "num_tokens": 81319023.0, "step": 35470 }, { "entropy": 4.9213526248931885, "epoch": 3.4077809798270895, "grad_norm": 0.97265625, "learning_rate": 0.00038684221329413965, "loss": 4.5374, "mean_token_accuracy": 0.26038162857294084, "num_tokens": 81330087.0, "step": 35475 }, { "entropy": 4.984519338607788, "epoch": 3.4082612872238234, "grad_norm": 1.0546875, "learning_rate": 0.0003868124639244954, "loss": 4.4978, "mean_token_accuracy": 0.26115120351314547, "num_tokens": 81341533.0, "step": 35480 }, { "entropy": 5.065533256530761, "epoch": 3.4087415946205573, "grad_norm": 1.09375, "learning_rate": 0.00038678271195889766, "loss": 4.6757, "mean_token_accuracy": 0.2376832216978073, "num_tokens": 81353505.0, "step": 35485 }, { "entropy": 5.03993034362793, "epoch": 3.409221902017291, "grad_norm": 1.0078125, "learning_rate": 0.00038675295739803734, "loss": 4.6213, "mean_token_accuracy": 0.24922273308038712, "num_tokens": 81365246.0, "step": 35490 }, { "entropy": 5.156140899658203, "epoch": 3.409702209414025, "grad_norm": 0.94921875, "learning_rate": 0.00038672320024260516, "loss": 4.7174, "mean_token_accuracy": 0.2432163506746292, "num_tokens": 81376403.0, "step": 35495 }, { "entropy": 5.041756677627563, "epoch": 3.410182516810759, "grad_norm": 1.046875, "learning_rate": 0.00038669344049329204, "loss": 4.6031, "mean_token_accuracy": 0.2519982814788818, "num_tokens": 81386865.0, "step": 35500 }, { "entropy": 4.944682168960571, "epoch": 3.410662824207493, "grad_norm": 0.9375, "learning_rate": 0.00038666367815078887, "loss": 4.5717, "mean_token_accuracy": 0.25449229329824447, "num_tokens": 81398191.0, "step": 35505 }, { "entropy": 5.026098012924194, "epoch": 3.4111431316042267, "grad_norm": 0.9609375, "learning_rate": 0.0003866339132157867, "loss": 4.6578, "mean_token_accuracy": 0.24305206537246704, "num_tokens": 81410694.0, "step": 35510 }, { "entropy": 5.051671552658081, "epoch": 3.4116234390009605, "grad_norm": 0.94140625, "learning_rate": 0.0003866041456889764, "loss": 4.5805, "mean_token_accuracy": 0.249052694439888, "num_tokens": 81422195.0, "step": 35515 }, { "entropy": 4.9964416980743405, "epoch": 3.4121037463976944, "grad_norm": 0.9453125, "learning_rate": 0.00038657437557104946, "loss": 4.6021, "mean_token_accuracy": 0.25322929918766024, "num_tokens": 81432979.0, "step": 35520 }, { "entropy": 4.972515535354614, "epoch": 3.4125840537944283, "grad_norm": 0.99609375, "learning_rate": 0.0003865446028626967, "loss": 4.5523, "mean_token_accuracy": 0.25452432930469515, "num_tokens": 81444328.0, "step": 35525 }, { "entropy": 5.15280499458313, "epoch": 3.413064361191162, "grad_norm": 1.0234375, "learning_rate": 0.00038651482756460947, "loss": 4.8092, "mean_token_accuracy": 0.24050280153751374, "num_tokens": 81455530.0, "step": 35530 }, { "entropy": 5.016593647003174, "epoch": 3.413544668587896, "grad_norm": 1.015625, "learning_rate": 0.00038648504967747914, "loss": 4.5999, "mean_token_accuracy": 0.25508580207824705, "num_tokens": 81466462.0, "step": 35535 }, { "entropy": 5.0983155250549315, "epoch": 3.4140249759846304, "grad_norm": 1.0, "learning_rate": 0.00038645526920199697, "loss": 4.7034, "mean_token_accuracy": 0.24511379301548003, "num_tokens": 81479459.0, "step": 35540 }, { "entropy": 5.1104835033416744, "epoch": 3.414505283381364, "grad_norm": 0.94921875, "learning_rate": 0.0003864254861388544, "loss": 4.7303, "mean_token_accuracy": 0.24671141505241395, "num_tokens": 81491026.0, "step": 35545 }, { "entropy": 5.081034135818482, "epoch": 3.414985590778098, "grad_norm": 0.9765625, "learning_rate": 0.00038639570048874295, "loss": 4.6166, "mean_token_accuracy": 0.2519968613982201, "num_tokens": 81504469.0, "step": 35550 }, { "entropy": 4.983581924438477, "epoch": 3.415465898174832, "grad_norm": 1.0078125, "learning_rate": 0.00038636591225235407, "loss": 4.5361, "mean_token_accuracy": 0.256315740942955, "num_tokens": 81515873.0, "step": 35555 }, { "entropy": 4.966502714157104, "epoch": 3.415946205571566, "grad_norm": 0.9296875, "learning_rate": 0.0003863361214303794, "loss": 4.5935, "mean_token_accuracy": 0.25549074858427046, "num_tokens": 81528169.0, "step": 35560 }, { "entropy": 5.063381147384644, "epoch": 3.4164265129683, "grad_norm": 1.0234375, "learning_rate": 0.0003863063280235106, "loss": 4.5861, "mean_token_accuracy": 0.24859169274568557, "num_tokens": 81538831.0, "step": 35565 }, { "entropy": 5.129141902923584, "epoch": 3.4169068203650337, "grad_norm": 0.97265625, "learning_rate": 0.00038627653203243933, "loss": 4.7333, "mean_token_accuracy": 0.24121089577674865, "num_tokens": 81549809.0, "step": 35570 }, { "entropy": 4.9427472114562985, "epoch": 3.4173871277617676, "grad_norm": 1.0546875, "learning_rate": 0.0003862467334578574, "loss": 4.5318, "mean_token_accuracy": 0.25782768428325653, "num_tokens": 81560537.0, "step": 35575 }, { "entropy": 5.009121417999268, "epoch": 3.4178674351585014, "grad_norm": 1.0703125, "learning_rate": 0.00038621693230045677, "loss": 4.5855, "mean_token_accuracy": 0.24942596554756163, "num_tokens": 81571983.0, "step": 35580 }, { "entropy": 4.954544734954834, "epoch": 3.4183477425552353, "grad_norm": 1.03125, "learning_rate": 0.0003861871285609291, "loss": 4.5645, "mean_token_accuracy": 0.2511437177658081, "num_tokens": 81584783.0, "step": 35585 }, { "entropy": 4.972129917144775, "epoch": 3.418828049951969, "grad_norm": 0.9375, "learning_rate": 0.0003861573222399665, "loss": 4.5154, "mean_token_accuracy": 0.2564325526356697, "num_tokens": 81596619.0, "step": 35590 }, { "entropy": 5.14181661605835, "epoch": 3.419308357348703, "grad_norm": 1.0, "learning_rate": 0.000386127513338261, "loss": 4.7134, "mean_token_accuracy": 0.24326184689998626, "num_tokens": 81607981.0, "step": 35595 }, { "entropy": 5.087239122390747, "epoch": 3.419788664745437, "grad_norm": 0.99609375, "learning_rate": 0.0003860977018565046, "loss": 4.6904, "mean_token_accuracy": 0.2416967809200287, "num_tokens": 81620444.0, "step": 35600 }, { "entropy": 5.025595569610596, "epoch": 3.420268972142171, "grad_norm": 0.95703125, "learning_rate": 0.0003860678877953894, "loss": 4.5763, "mean_token_accuracy": 0.25276096612215043, "num_tokens": 81631668.0, "step": 35605 }, { "entropy": 5.03992805480957, "epoch": 3.4207492795389047, "grad_norm": 0.9765625, "learning_rate": 0.0003860380711556077, "loss": 4.7201, "mean_token_accuracy": 0.24342207759618759, "num_tokens": 81643230.0, "step": 35610 }, { "entropy": 5.098343706130981, "epoch": 3.421229586935639, "grad_norm": 1.0546875, "learning_rate": 0.00038600825193785173, "loss": 4.6451, "mean_token_accuracy": 0.24605976045131683, "num_tokens": 81654217.0, "step": 35615 }, { "entropy": 5.0649824142456055, "epoch": 3.4217098943323725, "grad_norm": 1.0078125, "learning_rate": 0.0003859784301428137, "loss": 4.6915, "mean_token_accuracy": 0.23920787870883942, "num_tokens": 81666025.0, "step": 35620 }, { "entropy": 5.0398296356201175, "epoch": 3.422190201729107, "grad_norm": 1.0546875, "learning_rate": 0.0003859486057711861, "loss": 4.6418, "mean_token_accuracy": 0.2450200706720352, "num_tokens": 81677417.0, "step": 35625 }, { "entropy": 5.043654251098633, "epoch": 3.4226705091258407, "grad_norm": 1.03125, "learning_rate": 0.0003859187788236613, "loss": 4.6808, "mean_token_accuracy": 0.25091739892959597, "num_tokens": 81687898.0, "step": 35630 }, { "entropy": 4.949628829956055, "epoch": 3.4231508165225746, "grad_norm": 1.0546875, "learning_rate": 0.00038588894930093184, "loss": 4.5476, "mean_token_accuracy": 0.2605251118540764, "num_tokens": 81699532.0, "step": 35635 }, { "entropy": 5.077291822433471, "epoch": 3.4236311239193085, "grad_norm": 1.1640625, "learning_rate": 0.00038585911720369023, "loss": 4.6599, "mean_token_accuracy": 0.24108761101961135, "num_tokens": 81709164.0, "step": 35640 }, { "entropy": 5.066379022598267, "epoch": 3.4241114313160423, "grad_norm": 1.09375, "learning_rate": 0.0003858292825326291, "loss": 4.6108, "mean_token_accuracy": 0.25411655455827714, "num_tokens": 81721354.0, "step": 35645 }, { "entropy": 4.982634782791138, "epoch": 3.4245917387127762, "grad_norm": 1.0234375, "learning_rate": 0.0003857994452884412, "loss": 4.529, "mean_token_accuracy": 0.257805435359478, "num_tokens": 81731793.0, "step": 35650 }, { "entropy": 5.05521674156189, "epoch": 3.42507204610951, "grad_norm": 0.97265625, "learning_rate": 0.0003857696054718191, "loss": 4.6328, "mean_token_accuracy": 0.256782965362072, "num_tokens": 81743254.0, "step": 35655 }, { "entropy": 5.1028913974761965, "epoch": 3.425552353506244, "grad_norm": 1.0859375, "learning_rate": 0.0003857397630834557, "loss": 4.697, "mean_token_accuracy": 0.24332701712846755, "num_tokens": 81755080.0, "step": 35660 }, { "entropy": 5.051576948165893, "epoch": 3.426032660902978, "grad_norm": 0.92578125, "learning_rate": 0.00038570991812404384, "loss": 4.6407, "mean_token_accuracy": 0.24917400032281875, "num_tokens": 81767367.0, "step": 35665 }, { "entropy": 5.072865724563599, "epoch": 3.4265129682997117, "grad_norm": 1.0546875, "learning_rate": 0.0003856800705942764, "loss": 4.7, "mean_token_accuracy": 0.23929235339164734, "num_tokens": 81778513.0, "step": 35670 }, { "entropy": 5.057154512405395, "epoch": 3.4269932756964456, "grad_norm": 1.0234375, "learning_rate": 0.00038565022049484636, "loss": 4.5665, "mean_token_accuracy": 0.2571455791592598, "num_tokens": 81789845.0, "step": 35675 }, { "entropy": 5.016182804107666, "epoch": 3.4274735830931795, "grad_norm": 0.98046875, "learning_rate": 0.00038562036782644675, "loss": 4.6429, "mean_token_accuracy": 0.24943196326494216, "num_tokens": 81801287.0, "step": 35680 }, { "entropy": 5.031500720977784, "epoch": 3.4279538904899134, "grad_norm": 1.078125, "learning_rate": 0.0003855905125897708, "loss": 4.5899, "mean_token_accuracy": 0.2518752470612526, "num_tokens": 81811462.0, "step": 35685 }, { "entropy": 4.98153395652771, "epoch": 3.4284341978866473, "grad_norm": 1.0078125, "learning_rate": 0.00038556065478551147, "loss": 4.5334, "mean_token_accuracy": 0.2516901955008507, "num_tokens": 81822590.0, "step": 35690 }, { "entropy": 5.091107988357544, "epoch": 3.428914505283381, "grad_norm": 1.1328125, "learning_rate": 0.000385530794414362, "loss": 4.6171, "mean_token_accuracy": 0.259315188229084, "num_tokens": 81834755.0, "step": 35695 }, { "entropy": 4.974374151229858, "epoch": 3.4293948126801155, "grad_norm": 0.94921875, "learning_rate": 0.0003855009314770157, "loss": 4.5024, "mean_token_accuracy": 0.2566398024559021, "num_tokens": 81846259.0, "step": 35700 }, { "entropy": 4.9552396774292, "epoch": 3.4298751200768494, "grad_norm": 0.99609375, "learning_rate": 0.00038547106597416593, "loss": 4.5194, "mean_token_accuracy": 0.2630326122045517, "num_tokens": 81856262.0, "step": 35705 }, { "entropy": 5.03646993637085, "epoch": 3.4303554274735832, "grad_norm": 0.97265625, "learning_rate": 0.000385441197906506, "loss": 4.6465, "mean_token_accuracy": 0.24763473719358445, "num_tokens": 81867201.0, "step": 35710 }, { "entropy": 5.095039701461792, "epoch": 3.430835734870317, "grad_norm": 1.078125, "learning_rate": 0.00038541132727472945, "loss": 4.8005, "mean_token_accuracy": 0.2443981871008873, "num_tokens": 81878654.0, "step": 35715 }, { "entropy": 5.058059453964233, "epoch": 3.431316042267051, "grad_norm": 0.9765625, "learning_rate": 0.00038538145407952964, "loss": 4.5626, "mean_token_accuracy": 0.2526533126831055, "num_tokens": 81889672.0, "step": 35720 }, { "entropy": 5.020740413665772, "epoch": 3.431796349663785, "grad_norm": 0.96875, "learning_rate": 0.0003853515783216003, "loss": 4.5785, "mean_token_accuracy": 0.2503949970006943, "num_tokens": 81901072.0, "step": 35725 }, { "entropy": 5.039410972595215, "epoch": 3.4322766570605188, "grad_norm": 1.03125, "learning_rate": 0.000385321700001635, "loss": 4.5605, "mean_token_accuracy": 0.2550284430384636, "num_tokens": 81913146.0, "step": 35730 }, { "entropy": 5.088719511032105, "epoch": 3.4327569644572526, "grad_norm": 1.0078125, "learning_rate": 0.0003852918191203274, "loss": 4.6659, "mean_token_accuracy": 0.24266094714403152, "num_tokens": 81924648.0, "step": 35735 }, { "entropy": 5.051954650878907, "epoch": 3.4332372718539865, "grad_norm": 1.1328125, "learning_rate": 0.0003852619356783712, "loss": 4.6513, "mean_token_accuracy": 0.24199773818254472, "num_tokens": 81935744.0, "step": 35740 }, { "entropy": 5.077026414871216, "epoch": 3.4337175792507204, "grad_norm": 1.0390625, "learning_rate": 0.0003852320496764603, "loss": 4.599, "mean_token_accuracy": 0.25022688806056975, "num_tokens": 81946931.0, "step": 35745 }, { "entropy": 4.994713354110718, "epoch": 3.4341978866474543, "grad_norm": 0.96875, "learning_rate": 0.00038520216111528855, "loss": 4.57, "mean_token_accuracy": 0.2605443805456161, "num_tokens": 81958009.0, "step": 35750 }, { "entropy": 5.0040308952331545, "epoch": 3.434678194044188, "grad_norm": 1.0234375, "learning_rate": 0.0003851722699955499, "loss": 4.6106, "mean_token_accuracy": 0.2508874759078026, "num_tokens": 81970648.0, "step": 35755 }, { "entropy": 5.03471007347107, "epoch": 3.435158501440922, "grad_norm": 1.015625, "learning_rate": 0.0003851423763179382, "loss": 4.5707, "mean_token_accuracy": 0.2587558254599571, "num_tokens": 81982293.0, "step": 35760 }, { "entropy": 4.992668104171753, "epoch": 3.435638808837656, "grad_norm": 1.0625, "learning_rate": 0.00038511248008314756, "loss": 4.5886, "mean_token_accuracy": 0.2558387294411659, "num_tokens": 81993856.0, "step": 35765 }, { "entropy": 5.088662481307983, "epoch": 3.43611911623439, "grad_norm": 1.046875, "learning_rate": 0.0003850825812918722, "loss": 4.6627, "mean_token_accuracy": 0.24435512721538544, "num_tokens": 82005359.0, "step": 35770 }, { "entropy": 5.056514310836792, "epoch": 3.436599423631124, "grad_norm": 1.0234375, "learning_rate": 0.0003850526799448061, "loss": 4.6276, "mean_token_accuracy": 0.2558924823999405, "num_tokens": 82017789.0, "step": 35775 }, { "entropy": 5.071243810653686, "epoch": 3.437079731027858, "grad_norm": 1.0390625, "learning_rate": 0.0003850227760426436, "loss": 4.6057, "mean_token_accuracy": 0.2534796819090843, "num_tokens": 82028147.0, "step": 35780 }, { "entropy": 5.021691226959229, "epoch": 3.437560038424592, "grad_norm": 1.0703125, "learning_rate": 0.00038499286958607894, "loss": 4.5866, "mean_token_accuracy": 0.2504493460059166, "num_tokens": 82039485.0, "step": 35785 }, { "entropy": 4.984779596328735, "epoch": 3.438040345821326, "grad_norm": 1.046875, "learning_rate": 0.0003849629605758065, "loss": 4.5157, "mean_token_accuracy": 0.26161112636327744, "num_tokens": 82050004.0, "step": 35790 }, { "entropy": 5.006572675704956, "epoch": 3.4385206532180597, "grad_norm": 1.0703125, "learning_rate": 0.0003849330490125205, "loss": 4.5316, "mean_token_accuracy": 0.25540418922901154, "num_tokens": 82060621.0, "step": 35795 }, { "entropy": 5.030001497268676, "epoch": 3.4390009606147935, "grad_norm": 0.96484375, "learning_rate": 0.00038490313489691566, "loss": 4.6048, "mean_token_accuracy": 0.25173886865377426, "num_tokens": 82070955.0, "step": 35800 }, { "entropy": 5.1286180973052975, "epoch": 3.4394812680115274, "grad_norm": 0.98046875, "learning_rate": 0.0003848732182296863, "loss": 4.8051, "mean_token_accuracy": 0.23692064583301545, "num_tokens": 82082623.0, "step": 35805 }, { "entropy": 5.04611382484436, "epoch": 3.4399615754082613, "grad_norm": 1.0546875, "learning_rate": 0.00038484329901152713, "loss": 4.5586, "mean_token_accuracy": 0.25244507640600206, "num_tokens": 82092707.0, "step": 35810 }, { "entropy": 5.013063287734985, "epoch": 3.440441882804995, "grad_norm": 0.984375, "learning_rate": 0.00038481337724313264, "loss": 4.5912, "mean_token_accuracy": 0.2531083166599274, "num_tokens": 82105114.0, "step": 35815 }, { "entropy": 4.991770172119141, "epoch": 3.440922190201729, "grad_norm": 0.9296875, "learning_rate": 0.0003847834529251977, "loss": 4.5489, "mean_token_accuracy": 0.26085243225097654, "num_tokens": 82117638.0, "step": 35820 }, { "entropy": 5.081189155578613, "epoch": 3.441402497598463, "grad_norm": 1.0234375, "learning_rate": 0.00038475352605841693, "loss": 4.6518, "mean_token_accuracy": 0.24742389023303984, "num_tokens": 82129100.0, "step": 35825 }, { "entropy": 5.064033174514771, "epoch": 3.441882804995197, "grad_norm": 1.0703125, "learning_rate": 0.0003847235966434852, "loss": 4.6592, "mean_token_accuracy": 0.2504685491323471, "num_tokens": 82140776.0, "step": 35830 }, { "entropy": 5.111886405944825, "epoch": 3.4423631123919307, "grad_norm": 1.0546875, "learning_rate": 0.0003846936646810974, "loss": 4.6665, "mean_token_accuracy": 0.24439497143030167, "num_tokens": 82151569.0, "step": 35835 }, { "entropy": 5.114431619644165, "epoch": 3.4428434197886646, "grad_norm": 1.0078125, "learning_rate": 0.00038466373017194834, "loss": 4.6824, "mean_token_accuracy": 0.2462215691804886, "num_tokens": 82162696.0, "step": 35840 }, { "entropy": 4.95943398475647, "epoch": 3.4433237271853985, "grad_norm": 1.0, "learning_rate": 0.0003846337931167332, "loss": 4.4898, "mean_token_accuracy": 0.25996591746807096, "num_tokens": 82173580.0, "step": 35845 }, { "entropy": 4.962374210357666, "epoch": 3.443804034582133, "grad_norm": 0.99609375, "learning_rate": 0.00038460385351614683, "loss": 4.6017, "mean_token_accuracy": 0.2535645171999931, "num_tokens": 82185839.0, "step": 35850 }, { "entropy": 5.020360898971558, "epoch": 3.4442843419788662, "grad_norm": 1.03125, "learning_rate": 0.00038457391137088455, "loss": 4.6071, "mean_token_accuracy": 0.24996693730354308, "num_tokens": 82196682.0, "step": 35855 }, { "entropy": 5.0393565654754635, "epoch": 3.4447646493756006, "grad_norm": 1.015625, "learning_rate": 0.00038454396668164136, "loss": 4.5911, "mean_token_accuracy": 0.24866246283054352, "num_tokens": 82207349.0, "step": 35860 }, { "entropy": 5.040912437438965, "epoch": 3.4452449567723344, "grad_norm": 1.0234375, "learning_rate": 0.0003845140194491126, "loss": 4.5791, "mean_token_accuracy": 0.25327396392822266, "num_tokens": 82218547.0, "step": 35865 }, { "entropy": 5.020535707473755, "epoch": 3.4457252641690683, "grad_norm": 0.98046875, "learning_rate": 0.00038448406967399334, "loss": 4.5906, "mean_token_accuracy": 0.2593914374709129, "num_tokens": 82230246.0, "step": 35870 }, { "entropy": 4.9510817527771, "epoch": 3.446205571565802, "grad_norm": 1.03125, "learning_rate": 0.00038445411735697917, "loss": 4.5575, "mean_token_accuracy": 0.2552237197756767, "num_tokens": 82240924.0, "step": 35875 }, { "entropy": 5.022664403915405, "epoch": 3.446685878962536, "grad_norm": 0.96484375, "learning_rate": 0.0003844241624987655, "loss": 4.6331, "mean_token_accuracy": 0.24676787704229355, "num_tokens": 82252951.0, "step": 35880 }, { "entropy": 5.098922300338745, "epoch": 3.44716618635927, "grad_norm": 1.0390625, "learning_rate": 0.0003843942051000476, "loss": 4.6785, "mean_token_accuracy": 0.24114069640636443, "num_tokens": 82264377.0, "step": 35885 }, { "entropy": 5.125530767440796, "epoch": 3.447646493756004, "grad_norm": 0.9609375, "learning_rate": 0.000384364245161521, "loss": 4.7298, "mean_token_accuracy": 0.23500104248523712, "num_tokens": 82276736.0, "step": 35890 }, { "entropy": 5.0649346828460695, "epoch": 3.4481268011527377, "grad_norm": 1.0390625, "learning_rate": 0.0003843342826838815, "loss": 4.6294, "mean_token_accuracy": 0.24611299782991408, "num_tokens": 82288574.0, "step": 35895 }, { "entropy": 5.029484605789184, "epoch": 3.4486071085494716, "grad_norm": 0.93359375, "learning_rate": 0.00038430431766782463, "loss": 4.5331, "mean_token_accuracy": 0.265340293943882, "num_tokens": 82299570.0, "step": 35900 }, { "entropy": 5.0758363723754885, "epoch": 3.4490874159462055, "grad_norm": 0.99609375, "learning_rate": 0.000384274350114046, "loss": 4.6827, "mean_token_accuracy": 0.23846648633480072, "num_tokens": 82310449.0, "step": 35905 }, { "entropy": 5.053023433685302, "epoch": 3.4495677233429394, "grad_norm": 0.9765625, "learning_rate": 0.00038424438002324145, "loss": 4.643, "mean_token_accuracy": 0.24154511839151382, "num_tokens": 82321517.0, "step": 35910 }, { "entropy": 4.999557304382324, "epoch": 3.4500480307396733, "grad_norm": 0.9609375, "learning_rate": 0.00038421440739610683, "loss": 4.6294, "mean_token_accuracy": 0.24681228399276733, "num_tokens": 82334555.0, "step": 35915 }, { "entropy": 4.973876285552978, "epoch": 3.450528338136407, "grad_norm": 0.86328125, "learning_rate": 0.00038418443223333797, "loss": 4.478, "mean_token_accuracy": 0.26210538744926454, "num_tokens": 82347166.0, "step": 35920 }, { "entropy": 4.95529408454895, "epoch": 3.451008645533141, "grad_norm": 0.97265625, "learning_rate": 0.0003841544545356308, "loss": 4.5209, "mean_token_accuracy": 0.2557129502296448, "num_tokens": 82358220.0, "step": 35925 }, { "entropy": 5.0186504364013675, "epoch": 3.451488952929875, "grad_norm": 0.9765625, "learning_rate": 0.00038412447430368125, "loss": 4.633, "mean_token_accuracy": 0.24600803405046462, "num_tokens": 82369154.0, "step": 35930 }, { "entropy": 5.007594299316406, "epoch": 3.4519692603266092, "grad_norm": 1.0, "learning_rate": 0.00038409449153818556, "loss": 4.5539, "mean_token_accuracy": 0.2577602624893188, "num_tokens": 82380311.0, "step": 35935 }, { "entropy": 5.017320871353149, "epoch": 3.452449567723343, "grad_norm": 1.0703125, "learning_rate": 0.00038406450623983964, "loss": 4.5599, "mean_token_accuracy": 0.2534759595990181, "num_tokens": 82390895.0, "step": 35940 }, { "entropy": 5.009060716629028, "epoch": 3.452929875120077, "grad_norm": 1.03125, "learning_rate": 0.00038403451840933966, "loss": 4.5409, "mean_token_accuracy": 0.257034033536911, "num_tokens": 82400880.0, "step": 35945 }, { "entropy": 4.976863765716553, "epoch": 3.453410182516811, "grad_norm": 0.96875, "learning_rate": 0.00038400452804738204, "loss": 4.6159, "mean_token_accuracy": 0.2538122460246086, "num_tokens": 82413566.0, "step": 35950 }, { "entropy": 4.963717746734619, "epoch": 3.4538904899135447, "grad_norm": 1.109375, "learning_rate": 0.00038397453515466297, "loss": 4.5475, "mean_token_accuracy": 0.2589743047952652, "num_tokens": 82424434.0, "step": 35955 }, { "entropy": 5.089448308944702, "epoch": 3.4543707973102786, "grad_norm": 1.046875, "learning_rate": 0.0003839445397318787, "loss": 4.6639, "mean_token_accuracy": 0.24951853454113007, "num_tokens": 82436310.0, "step": 35960 }, { "entropy": 4.98779330253601, "epoch": 3.4548511047070125, "grad_norm": 1.03125, "learning_rate": 0.0003839145417797258, "loss": 4.5347, "mean_token_accuracy": 0.2546408846974373, "num_tokens": 82446484.0, "step": 35965 }, { "entropy": 5.090916872024536, "epoch": 3.4553314121037464, "grad_norm": 1.03125, "learning_rate": 0.0003838845412989006, "loss": 4.6964, "mean_token_accuracy": 0.23920599222183228, "num_tokens": 82458169.0, "step": 35970 }, { "entropy": 4.982842636108399, "epoch": 3.4558117195004803, "grad_norm": 0.95703125, "learning_rate": 0.0003838545382900997, "loss": 4.5668, "mean_token_accuracy": 0.2542534157633781, "num_tokens": 82469407.0, "step": 35975 }, { "entropy": 4.986524534225464, "epoch": 3.456292026897214, "grad_norm": 1.0546875, "learning_rate": 0.0003838245327540196, "loss": 4.5392, "mean_token_accuracy": 0.26070113480091095, "num_tokens": 82480757.0, "step": 35980 }, { "entropy": 5.008253049850464, "epoch": 3.456772334293948, "grad_norm": 1.0625, "learning_rate": 0.00038379452469135706, "loss": 4.6082, "mean_token_accuracy": 0.25176827758550646, "num_tokens": 82491073.0, "step": 35985 }, { "entropy": 5.055674934387207, "epoch": 3.457252641690682, "grad_norm": 0.96875, "learning_rate": 0.00038376451410280864, "loss": 4.6247, "mean_token_accuracy": 0.25961445420980456, "num_tokens": 82502243.0, "step": 35990 }, { "entropy": 5.063381910324097, "epoch": 3.457732949087416, "grad_norm": 1.046875, "learning_rate": 0.00038373450098907124, "loss": 4.6124, "mean_token_accuracy": 0.2445521369576454, "num_tokens": 82514831.0, "step": 35995 }, { "entropy": 5.02369704246521, "epoch": 3.4582132564841497, "grad_norm": 1.125, "learning_rate": 0.00038370448535084156, "loss": 4.6043, "mean_token_accuracy": 0.2521915763616562, "num_tokens": 82525907.0, "step": 36000 }, { "epoch": 3.4582132564841497, "eval_entropy": 4.835890457549948, "eval_loss": 4.756885051727295, "eval_mean_token_accuracy": 0.25125276272992053, "eval_num_tokens": 82525907.0, "eval_runtime": 26.6323, "eval_samples_per_second": 1232.153, "eval_steps_per_second": 154.024, "step": 36000 } ], "logging_steps": 5, "max_steps": 104090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2600661776768e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }