{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5763688760806917, "eval_steps": 3000, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 4.81198468208313, "epoch": 0.0004803073967339097, "grad_norm": 15.3125, "learning_rate": 2e-06, "loss": 14.3995, "mean_token_accuracy": 0.0, "num_tokens": 10855.0, "step": 5 }, { "entropy": 4.828950214385986, "epoch": 0.0009606147934678194, "grad_norm": 16.0, "learning_rate": 4.5e-06, "loss": 14.4568, "mean_token_accuracy": 6.361323175951838e-05, "num_tokens": 24110.0, "step": 10 }, { "entropy": 4.885565328598022, "epoch": 0.001440922190201729, "grad_norm": 18.375, "learning_rate": 7e-06, "loss": 14.1468, "mean_token_accuracy": 0.0, "num_tokens": 35984.0, "step": 15 }, { "entropy": 5.113980484008789, "epoch": 0.0019212295869356388, "grad_norm": 25.5, "learning_rate": 9.5e-06, "loss": 13.5274, "mean_token_accuracy": 0.0, "num_tokens": 48152.0, "step": 20 }, { "entropy": 7.0846137523651125, "epoch": 0.0024015369836695487, "grad_norm": 18.875, "learning_rate": 1.2e-05, "loss": 11.983, "mean_token_accuracy": 5.9031875571236016e-05, "num_tokens": 59810.0, "step": 25 }, { "entropy": 10.311653995513916, "epoch": 0.002881844380403458, "grad_norm": 3.25, "learning_rate": 1.4500000000000002e-05, "loss": 10.8966, "mean_token_accuracy": 0.0035814862465485932, "num_tokens": 70852.0, "step": 30 }, { "entropy": 10.698549842834472, "epoch": 0.0033621517771373678, "grad_norm": 3.453125, "learning_rate": 1.7000000000000003e-05, "loss": 10.681, "mean_token_accuracy": 0.012990868836641311, "num_tokens": 83378.0, "step": 35 }, { "entropy": 10.70135440826416, "epoch": 0.0038424591738712775, "grad_norm": 2.890625, "learning_rate": 1.95e-05, "loss": 10.3702, "mean_token_accuracy": 0.015855902433395387, "num_tokens": 95505.0, "step": 40 }, { "entropy": 10.669420051574708, "epoch": 0.004322766570605188, "grad_norm": 2.609375, "learning_rate": 2.2e-05, "loss": 10.0399, "mean_token_accuracy": 0.019150405284017326, "num_tokens": 106812.0, "step": 45 }, { "entropy": 10.626140022277832, "epoch": 0.004803073967339097, "grad_norm": 2.171875, "learning_rate": 2.4500000000000003e-05, "loss": 9.8531, "mean_token_accuracy": 0.030371082201600074, "num_tokens": 118572.0, "step": 50 }, { "entropy": 10.630718421936034, "epoch": 0.005283381364073006, "grad_norm": 2.140625, "learning_rate": 2.7e-05, "loss": 9.7085, "mean_token_accuracy": 0.02918087989091873, "num_tokens": 130051.0, "step": 55 }, { "entropy": 10.632691478729248, "epoch": 0.005763688760806916, "grad_norm": 2.109375, "learning_rate": 2.95e-05, "loss": 9.6316, "mean_token_accuracy": 0.033551334962248804, "num_tokens": 141920.0, "step": 60 }, { "entropy": 10.621756076812744, "epoch": 0.006243996157540826, "grad_norm": 1.953125, "learning_rate": 3.2e-05, "loss": 9.4968, "mean_token_accuracy": 0.03377603869885206, "num_tokens": 152706.0, "step": 65 }, { "entropy": 10.59926996231079, "epoch": 0.0067243035542747355, "grad_norm": 2.0, "learning_rate": 3.4500000000000005e-05, "loss": 9.4671, "mean_token_accuracy": 0.030284658074378967, "num_tokens": 165253.0, "step": 70 }, { "entropy": 10.586241340637207, "epoch": 0.007204610951008645, "grad_norm": 1.9921875, "learning_rate": 3.7e-05, "loss": 9.3528, "mean_token_accuracy": 0.03066213186830282, "num_tokens": 176708.0, "step": 75 }, { "entropy": 10.572576808929444, "epoch": 0.007684918347742555, "grad_norm": 1.9921875, "learning_rate": 3.95e-05, "loss": 9.3119, "mean_token_accuracy": 0.02979854876175523, "num_tokens": 188240.0, "step": 80 }, { "entropy": 10.554954528808594, "epoch": 0.008165225744476465, "grad_norm": 1.96875, "learning_rate": 4.2000000000000004e-05, "loss": 9.1145, "mean_token_accuracy": 0.03125303704291582, "num_tokens": 198355.0, "step": 85 }, { "entropy": 10.53057928085327, "epoch": 0.008645533141210375, "grad_norm": 1.8515625, "learning_rate": 4.45e-05, "loss": 9.0646, "mean_token_accuracy": 0.02982727512717247, "num_tokens": 209497.0, "step": 90 }, { "entropy": 10.494773197174073, "epoch": 0.009125840537944284, "grad_norm": 1.9609375, "learning_rate": 4.7000000000000004e-05, "loss": 8.9936, "mean_token_accuracy": 0.02780488096177578, "num_tokens": 220859.0, "step": 95 }, { "entropy": 10.448780918121338, "epoch": 0.009606147934678195, "grad_norm": 1.78125, "learning_rate": 4.9500000000000004e-05, "loss": 8.9232, "mean_token_accuracy": 0.030998879671096803, "num_tokens": 231550.0, "step": 100 }, { "entropy": 10.376792049407959, "epoch": 0.010086455331412104, "grad_norm": 1.65625, "learning_rate": 5.2e-05, "loss": 8.7452, "mean_token_accuracy": 0.030790003202855586, "num_tokens": 244210.0, "step": 105 }, { "entropy": 10.282748031616212, "epoch": 0.010566762728146013, "grad_norm": 1.6953125, "learning_rate": 5.45e-05, "loss": 8.6175, "mean_token_accuracy": 0.040817446634173395, "num_tokens": 255745.0, "step": 110 }, { "entropy": 10.166150856018067, "epoch": 0.011047070124879923, "grad_norm": 1.4609375, "learning_rate": 5.7e-05, "loss": 8.5074, "mean_token_accuracy": 0.0365377115085721, "num_tokens": 266180.0, "step": 115 }, { "entropy": 10.028709888458252, "epoch": 0.011527377521613832, "grad_norm": 1.4140625, "learning_rate": 5.9499999999999996e-05, "loss": 8.3681, "mean_token_accuracy": 0.03765994198620319, "num_tokens": 277736.0, "step": 120 }, { "entropy": 9.827960968017578, "epoch": 0.012007684918347743, "grad_norm": 1.2734375, "learning_rate": 6.2e-05, "loss": 8.2429, "mean_token_accuracy": 0.035723325610160825, "num_tokens": 289069.0, "step": 125 }, { "entropy": 9.59237585067749, "epoch": 0.012487992315081652, "grad_norm": 1.1796875, "learning_rate": 6.450000000000001e-05, "loss": 8.0891, "mean_token_accuracy": 0.04738196656107903, "num_tokens": 300240.0, "step": 130 }, { "entropy": 9.368733978271484, "epoch": 0.012968299711815562, "grad_norm": 1.09375, "learning_rate": 6.7e-05, "loss": 8.0332, "mean_token_accuracy": 0.04018798861652613, "num_tokens": 311698.0, "step": 135 }, { "entropy": 9.110132884979247, "epoch": 0.013448607108549471, "grad_norm": 0.95703125, "learning_rate": 6.950000000000001e-05, "loss": 7.9056, "mean_token_accuracy": 0.0432288508862257, "num_tokens": 322844.0, "step": 140 }, { "entropy": 8.820003223419189, "epoch": 0.013928914505283382, "grad_norm": 0.98046875, "learning_rate": 7.2e-05, "loss": 7.8235, "mean_token_accuracy": 0.045638217404484746, "num_tokens": 335092.0, "step": 145 }, { "entropy": 8.585826587677001, "epoch": 0.01440922190201729, "grad_norm": 0.8359375, "learning_rate": 7.45e-05, "loss": 7.7332, "mean_token_accuracy": 0.04667803719639778, "num_tokens": 347033.0, "step": 150 }, { "entropy": 8.385289859771728, "epoch": 0.014889529298751201, "grad_norm": 0.9921875, "learning_rate": 7.7e-05, "loss": 7.6524, "mean_token_accuracy": 0.05755673125386238, "num_tokens": 358696.0, "step": 155 }, { "entropy": 8.231111812591553, "epoch": 0.01536983669548511, "grad_norm": 0.875, "learning_rate": 7.950000000000001e-05, "loss": 7.6369, "mean_token_accuracy": 0.05747554413974285, "num_tokens": 369390.0, "step": 160 }, { "entropy": 8.13049030303955, "epoch": 0.01585014409221902, "grad_norm": 0.921875, "learning_rate": 8.2e-05, "loss": 7.573, "mean_token_accuracy": 0.058345531672239305, "num_tokens": 380540.0, "step": 165 }, { "entropy": 8.037137985229492, "epoch": 0.01633045148895293, "grad_norm": 1.4375, "learning_rate": 8.450000000000001e-05, "loss": 7.5672, "mean_token_accuracy": 0.05862935781478882, "num_tokens": 391243.0, "step": 170 }, { "entropy": 7.971378183364868, "epoch": 0.01681075888568684, "grad_norm": 1.1328125, "learning_rate": 8.7e-05, "loss": 7.5403, "mean_token_accuracy": 0.06493047513067722, "num_tokens": 403336.0, "step": 175 }, { "entropy": 7.996695470809937, "epoch": 0.01729106628242075, "grad_norm": 1.7890625, "learning_rate": 8.95e-05, "loss": 7.4714, "mean_token_accuracy": 0.06883232817053794, "num_tokens": 413886.0, "step": 180 }, { "entropy": 7.944087362289428, "epoch": 0.01777137367915466, "grad_norm": 1.28125, "learning_rate": 9.2e-05, "loss": 7.5072, "mean_token_accuracy": 0.07003857865929604, "num_tokens": 425277.0, "step": 185 }, { "entropy": 7.903090763092041, "epoch": 0.01825168107588857, "grad_norm": 1.1484375, "learning_rate": 9.45e-05, "loss": 7.5901, "mean_token_accuracy": 0.07094852812588215, "num_tokens": 436868.0, "step": 190 }, { "entropy": 7.9524956226348875, "epoch": 0.018731988472622477, "grad_norm": 1.3671875, "learning_rate": 9.7e-05, "loss": 7.3956, "mean_token_accuracy": 0.0713607795536518, "num_tokens": 448349.0, "step": 195 }, { "entropy": 7.893163013458252, "epoch": 0.01921229586935639, "grad_norm": 1.078125, "learning_rate": 9.95e-05, "loss": 7.398, "mean_token_accuracy": 0.07450502514839172, "num_tokens": 459447.0, "step": 200 }, { "entropy": 7.827638578414917, "epoch": 0.0196926032660903, "grad_norm": 1.09375, "learning_rate": 0.000102, "loss": 7.3545, "mean_token_accuracy": 0.07836289256811142, "num_tokens": 470734.0, "step": 205 }, { "entropy": 7.920483875274658, "epoch": 0.020172910662824207, "grad_norm": 1.2890625, "learning_rate": 0.00010449999999999999, "loss": 7.3929, "mean_token_accuracy": 0.07436848841607571, "num_tokens": 482015.0, "step": 210 }, { "entropy": 7.829608154296875, "epoch": 0.020653218059558116, "grad_norm": 1.09375, "learning_rate": 0.000107, "loss": 7.3388, "mean_token_accuracy": 0.0812894694507122, "num_tokens": 493339.0, "step": 215 }, { "entropy": 7.832039451599121, "epoch": 0.021133525456292025, "grad_norm": 1.09375, "learning_rate": 0.0001095, "loss": 7.2806, "mean_token_accuracy": 0.08215347118675709, "num_tokens": 504924.0, "step": 220 }, { "entropy": 7.841120386123658, "epoch": 0.021613832853025938, "grad_norm": 1.3828125, "learning_rate": 0.000112, "loss": 7.2586, "mean_token_accuracy": 0.07783420942723751, "num_tokens": 516603.0, "step": 225 }, { "entropy": 7.667848110198975, "epoch": 0.022094140249759846, "grad_norm": 1.234375, "learning_rate": 0.0001145, "loss": 7.1767, "mean_token_accuracy": 0.0903685748577118, "num_tokens": 528347.0, "step": 230 }, { "entropy": 7.665532779693604, "epoch": 0.022574447646493755, "grad_norm": 1.453125, "learning_rate": 0.00011700000000000001, "loss": 7.2657, "mean_token_accuracy": 0.08881851136684418, "num_tokens": 539328.0, "step": 235 }, { "entropy": 7.787159252166748, "epoch": 0.023054755043227664, "grad_norm": 1.375, "learning_rate": 0.00011949999999999999, "loss": 7.2264, "mean_token_accuracy": 0.09179538786411286, "num_tokens": 549297.0, "step": 240 }, { "entropy": 7.68054313659668, "epoch": 0.023535062439961577, "grad_norm": 1.40625, "learning_rate": 0.000122, "loss": 7.1925, "mean_token_accuracy": 0.0870781309902668, "num_tokens": 560306.0, "step": 245 }, { "entropy": 7.722461795806884, "epoch": 0.024015369836695485, "grad_norm": 3.09375, "learning_rate": 0.0001245, "loss": 7.2601, "mean_token_accuracy": 0.08716249391436577, "num_tokens": 571972.0, "step": 250 }, { "entropy": 7.669500827789307, "epoch": 0.024495677233429394, "grad_norm": 1.125, "learning_rate": 0.000127, "loss": 7.1479, "mean_token_accuracy": 0.09271593019366264, "num_tokens": 582962.0, "step": 255 }, { "entropy": 7.6647216796875, "epoch": 0.024975984630163303, "grad_norm": 0.9296875, "learning_rate": 0.0001295, "loss": 7.1214, "mean_token_accuracy": 0.09072922170162201, "num_tokens": 597193.0, "step": 260 }, { "entropy": 7.66283483505249, "epoch": 0.025456292026897216, "grad_norm": 1.21875, "learning_rate": 0.000132, "loss": 7.1819, "mean_token_accuracy": 0.09304547160863877, "num_tokens": 608982.0, "step": 265 }, { "entropy": 7.661752843856812, "epoch": 0.025936599423631124, "grad_norm": 1.25, "learning_rate": 0.00013450000000000002, "loss": 7.2188, "mean_token_accuracy": 0.08966975659132004, "num_tokens": 619953.0, "step": 270 }, { "entropy": 7.643835210800171, "epoch": 0.026416906820365033, "grad_norm": 1.25, "learning_rate": 0.00013700000000000002, "loss": 7.1751, "mean_token_accuracy": 0.09371341913938522, "num_tokens": 631039.0, "step": 275 }, { "entropy": 7.632717418670654, "epoch": 0.026897214217098942, "grad_norm": 1.1328125, "learning_rate": 0.0001395, "loss": 7.1656, "mean_token_accuracy": 0.09481634944677353, "num_tokens": 642656.0, "step": 280 }, { "entropy": 7.468483591079712, "epoch": 0.027377521613832854, "grad_norm": 1.46875, "learning_rate": 0.00014199999999999998, "loss": 7.0285, "mean_token_accuracy": 0.10727941244840622, "num_tokens": 653748.0, "step": 285 }, { "entropy": 7.516920471191407, "epoch": 0.027857829010566763, "grad_norm": 1.171875, "learning_rate": 0.0001445, "loss": 7.0029, "mean_token_accuracy": 0.09661566317081452, "num_tokens": 665618.0, "step": 290 }, { "entropy": 7.486124277114868, "epoch": 0.028338136407300672, "grad_norm": 1.0625, "learning_rate": 0.000147, "loss": 7.0287, "mean_token_accuracy": 0.09913064762949944, "num_tokens": 677329.0, "step": 295 }, { "entropy": 7.49315767288208, "epoch": 0.02881844380403458, "grad_norm": 1.7109375, "learning_rate": 0.0001495, "loss": 6.9864, "mean_token_accuracy": 0.1033214770257473, "num_tokens": 688278.0, "step": 300 }, { "entropy": 7.431641435623169, "epoch": 0.029298751200768493, "grad_norm": 1.96875, "learning_rate": 0.000152, "loss": 7.046, "mean_token_accuracy": 0.10180941373109817, "num_tokens": 700739.0, "step": 305 }, { "entropy": 7.378959465026855, "epoch": 0.029779058597502402, "grad_norm": 2.0625, "learning_rate": 0.00015450000000000001, "loss": 6.9858, "mean_token_accuracy": 0.104751455783844, "num_tokens": 712527.0, "step": 310 }, { "entropy": 7.4179362773895265, "epoch": 0.03025936599423631, "grad_norm": 1.390625, "learning_rate": 0.000157, "loss": 7.0113, "mean_token_accuracy": 0.09946026802062988, "num_tokens": 724514.0, "step": 315 }, { "entropy": 7.464642429351807, "epoch": 0.03073967339097022, "grad_norm": 1.3125, "learning_rate": 0.0001595, "loss": 6.958, "mean_token_accuracy": 0.10636739879846573, "num_tokens": 735679.0, "step": 320 }, { "entropy": 7.379268789291382, "epoch": 0.03121998078770413, "grad_norm": 1.234375, "learning_rate": 0.000162, "loss": 6.9502, "mean_token_accuracy": 0.10707954466342925, "num_tokens": 747896.0, "step": 325 }, { "entropy": 7.4328147888183596, "epoch": 0.03170028818443804, "grad_norm": 1.1953125, "learning_rate": 0.00016450000000000001, "loss": 7.0008, "mean_token_accuracy": 0.10451544597744941, "num_tokens": 759081.0, "step": 330 }, { "entropy": 7.373377466201783, "epoch": 0.03218059558117195, "grad_norm": 1.6640625, "learning_rate": 0.00016700000000000002, "loss": 6.9349, "mean_token_accuracy": 0.10051383301615716, "num_tokens": 770459.0, "step": 335 }, { "entropy": 7.3182484149932865, "epoch": 0.03266090297790586, "grad_norm": 2.25, "learning_rate": 0.00016950000000000003, "loss": 6.9097, "mean_token_accuracy": 0.10436427593231201, "num_tokens": 783960.0, "step": 340 }, { "entropy": 7.2723020076751705, "epoch": 0.03314121037463977, "grad_norm": 1.34375, "learning_rate": 0.00017199999999999998, "loss": 6.9998, "mean_token_accuracy": 0.1017355315387249, "num_tokens": 795425.0, "step": 345 }, { "entropy": 7.288401937484741, "epoch": 0.03362151777137368, "grad_norm": 1.5625, "learning_rate": 0.00017449999999999999, "loss": 6.9466, "mean_token_accuracy": 0.1032905712723732, "num_tokens": 807536.0, "step": 350 }, { "entropy": 7.429675006866455, "epoch": 0.034101825168107586, "grad_norm": 1.25, "learning_rate": 0.000177, "loss": 6.9955, "mean_token_accuracy": 0.09869879111647606, "num_tokens": 818801.0, "step": 355 }, { "entropy": 7.303883075714111, "epoch": 0.0345821325648415, "grad_norm": 1.2578125, "learning_rate": 0.0001795, "loss": 6.8664, "mean_token_accuracy": 0.1042160525918007, "num_tokens": 831497.0, "step": 360 }, { "entropy": 7.275684547424317, "epoch": 0.03506243996157541, "grad_norm": 1.1328125, "learning_rate": 0.000182, "loss": 6.8349, "mean_token_accuracy": 0.10631057769060134, "num_tokens": 842491.0, "step": 365 }, { "entropy": 7.303065443038941, "epoch": 0.03554274735830932, "grad_norm": 1.328125, "learning_rate": 0.0001845, "loss": 6.9059, "mean_token_accuracy": 0.09917943850159645, "num_tokens": 854560.0, "step": 370 }, { "entropy": 7.275861215591431, "epoch": 0.03602305475504323, "grad_norm": 1.3515625, "learning_rate": 0.000187, "loss": 6.8151, "mean_token_accuracy": 0.11120132729411125, "num_tokens": 866688.0, "step": 375 }, { "entropy": 7.233143997192383, "epoch": 0.03650336215177714, "grad_norm": 1.65625, "learning_rate": 0.0001895, "loss": 6.9205, "mean_token_accuracy": 0.09971508085727691, "num_tokens": 879484.0, "step": 380 }, { "entropy": 7.290747499465942, "epoch": 0.036983669548511046, "grad_norm": 1.2109375, "learning_rate": 0.000192, "loss": 6.9039, "mean_token_accuracy": 0.10731675177812576, "num_tokens": 890807.0, "step": 385 }, { "entropy": 7.2609399318695065, "epoch": 0.037463976945244955, "grad_norm": 1.828125, "learning_rate": 0.0001945, "loss": 6.854, "mean_token_accuracy": 0.10835549905896187, "num_tokens": 901759.0, "step": 390 }, { "entropy": 7.174216985702515, "epoch": 0.037944284341978864, "grad_norm": 1.28125, "learning_rate": 0.00019700000000000002, "loss": 6.7707, "mean_token_accuracy": 0.1162538155913353, "num_tokens": 912212.0, "step": 395 }, { "entropy": 7.264402294158936, "epoch": 0.03842459173871278, "grad_norm": 1.171875, "learning_rate": 0.00019950000000000002, "loss": 6.8764, "mean_token_accuracy": 0.10775518119335174, "num_tokens": 923947.0, "step": 400 }, { "entropy": 7.194364166259765, "epoch": 0.03890489913544669, "grad_norm": 1.5703125, "learning_rate": 0.000202, "loss": 6.8149, "mean_token_accuracy": 0.1155998706817627, "num_tokens": 935732.0, "step": 405 }, { "entropy": 7.094007158279419, "epoch": 0.0393852065321806, "grad_norm": 1.5390625, "learning_rate": 0.00020449999999999998, "loss": 6.7534, "mean_token_accuracy": 0.11219719424843788, "num_tokens": 948261.0, "step": 410 }, { "entropy": 7.198687505722046, "epoch": 0.039865513928914506, "grad_norm": 1.5390625, "learning_rate": 0.000207, "loss": 6.8682, "mean_token_accuracy": 0.11036199703812599, "num_tokens": 959574.0, "step": 415 }, { "entropy": 7.14764518737793, "epoch": 0.040345821325648415, "grad_norm": 1.2109375, "learning_rate": 0.0002095, "loss": 6.9302, "mean_token_accuracy": 0.10567210242152214, "num_tokens": 970329.0, "step": 420 }, { "entropy": 7.284962558746338, "epoch": 0.040826128722382324, "grad_norm": 1.5078125, "learning_rate": 0.000212, "loss": 6.7852, "mean_token_accuracy": 0.11808342635631561, "num_tokens": 982037.0, "step": 425 }, { "entropy": 6.99963059425354, "epoch": 0.04130643611911623, "grad_norm": 1.15625, "learning_rate": 0.0002145, "loss": 6.7507, "mean_token_accuracy": 0.1121592566370964, "num_tokens": 994612.0, "step": 430 }, { "entropy": 7.1772722721099855, "epoch": 0.04178674351585014, "grad_norm": 1.203125, "learning_rate": 0.00021700000000000002, "loss": 6.8563, "mean_token_accuracy": 0.11890432462096215, "num_tokens": 1005960.0, "step": 435 }, { "entropy": 7.119032526016236, "epoch": 0.04226705091258405, "grad_norm": 1.234375, "learning_rate": 0.0002195, "loss": 6.726, "mean_token_accuracy": 0.11254842653870582, "num_tokens": 1017618.0, "step": 440 }, { "entropy": 7.120699787139893, "epoch": 0.042747358309317966, "grad_norm": 1.5234375, "learning_rate": 0.000222, "loss": 6.7617, "mean_token_accuracy": 0.11123086810112, "num_tokens": 1029307.0, "step": 445 }, { "entropy": 7.10453462600708, "epoch": 0.043227665706051875, "grad_norm": 1.21875, "learning_rate": 0.0002245, "loss": 6.7794, "mean_token_accuracy": 0.11213452070951462, "num_tokens": 1042027.0, "step": 450 }, { "entropy": 7.109935092926025, "epoch": 0.043707973102785784, "grad_norm": 1.1171875, "learning_rate": 0.00022700000000000002, "loss": 6.7726, "mean_token_accuracy": 0.11005142331123352, "num_tokens": 1053125.0, "step": 455 }, { "entropy": 7.093224906921387, "epoch": 0.04418828049951969, "grad_norm": 1.578125, "learning_rate": 0.00022950000000000002, "loss": 6.7646, "mean_token_accuracy": 0.11863623559474945, "num_tokens": 1064908.0, "step": 460 }, { "entropy": 7.0393500328063965, "epoch": 0.0446685878962536, "grad_norm": 1.1796875, "learning_rate": 0.00023200000000000003, "loss": 6.6415, "mean_token_accuracy": 0.12022090703248978, "num_tokens": 1076328.0, "step": 465 }, { "entropy": 7.159615230560303, "epoch": 0.04514889529298751, "grad_norm": 1.3203125, "learning_rate": 0.00023449999999999998, "loss": 6.8668, "mean_token_accuracy": 0.10638144612312317, "num_tokens": 1088469.0, "step": 470 }, { "entropy": 6.9358738422393795, "epoch": 0.04562920268972142, "grad_norm": 1.375, "learning_rate": 0.000237, "loss": 6.6608, "mean_token_accuracy": 0.11796007007360458, "num_tokens": 1099408.0, "step": 475 }, { "entropy": 6.921041584014892, "epoch": 0.04610951008645533, "grad_norm": 1.1484375, "learning_rate": 0.0002395, "loss": 6.596, "mean_token_accuracy": 0.12084084451198578, "num_tokens": 1111101.0, "step": 480 }, { "entropy": 6.980242967605591, "epoch": 0.046589817483189244, "grad_norm": 1.375, "learning_rate": 0.000242, "loss": 6.6189, "mean_token_accuracy": 0.11961494460701942, "num_tokens": 1122877.0, "step": 485 }, { "entropy": 6.998215103149414, "epoch": 0.04707012487992315, "grad_norm": 1.2890625, "learning_rate": 0.0002445, "loss": 6.7183, "mean_token_accuracy": 0.1069619596004486, "num_tokens": 1133956.0, "step": 490 }, { "entropy": 6.955817556381225, "epoch": 0.04755043227665706, "grad_norm": 1.265625, "learning_rate": 0.000247, "loss": 6.6106, "mean_token_accuracy": 0.12115221694111825, "num_tokens": 1146101.0, "step": 495 }, { "entropy": 6.991823005676269, "epoch": 0.04803073967339097, "grad_norm": 1.4453125, "learning_rate": 0.0002495, "loss": 6.704, "mean_token_accuracy": 0.1240153320133686, "num_tokens": 1157432.0, "step": 500 }, { "entropy": 6.995119285583496, "epoch": 0.04851104707012488, "grad_norm": 1.1875, "learning_rate": 0.000252, "loss": 6.6931, "mean_token_accuracy": 0.12121785953640937, "num_tokens": 1167601.0, "step": 505 }, { "entropy": 6.925166416168213, "epoch": 0.04899135446685879, "grad_norm": 1.2265625, "learning_rate": 0.0002545, "loss": 6.5948, "mean_token_accuracy": 0.11933866590261459, "num_tokens": 1178818.0, "step": 510 }, { "entropy": 7.102405261993408, "epoch": 0.0494716618635927, "grad_norm": 1.234375, "learning_rate": 0.000257, "loss": 6.8296, "mean_token_accuracy": 0.11879347264766693, "num_tokens": 1189977.0, "step": 515 }, { "entropy": 6.896050024032593, "epoch": 0.049951969260326606, "grad_norm": 1.09375, "learning_rate": 0.0002595, "loss": 6.6543, "mean_token_accuracy": 0.12233106046915054, "num_tokens": 1201039.0, "step": 520 }, { "entropy": 7.007365083694458, "epoch": 0.05043227665706052, "grad_norm": 1.1953125, "learning_rate": 0.000262, "loss": 6.6791, "mean_token_accuracy": 0.12215208187699318, "num_tokens": 1212573.0, "step": 525 }, { "entropy": 7.002063369750976, "epoch": 0.05091258405379443, "grad_norm": 1.1171875, "learning_rate": 0.00026450000000000003, "loss": 6.6208, "mean_token_accuracy": 0.1271028608083725, "num_tokens": 1223382.0, "step": 530 }, { "entropy": 6.9438478469848635, "epoch": 0.05139289145052834, "grad_norm": 1.0078125, "learning_rate": 0.00026700000000000004, "loss": 6.6969, "mean_token_accuracy": 0.12958464100956918, "num_tokens": 1236501.0, "step": 535 }, { "entropy": 6.931712675094604, "epoch": 0.05187319884726225, "grad_norm": 1.203125, "learning_rate": 0.00026950000000000005, "loss": 6.687, "mean_token_accuracy": 0.12256318107247352, "num_tokens": 1246798.0, "step": 540 }, { "entropy": 6.9002622127532955, "epoch": 0.05235350624399616, "grad_norm": 1.40625, "learning_rate": 0.00027200000000000005, "loss": 6.6164, "mean_token_accuracy": 0.12228193208575248, "num_tokens": 1258182.0, "step": 545 }, { "entropy": 6.873838090896607, "epoch": 0.052833813640730067, "grad_norm": 1.5625, "learning_rate": 0.0002745, "loss": 6.5781, "mean_token_accuracy": 0.11714496314525605, "num_tokens": 1270273.0, "step": 550 }, { "entropy": 6.869143629074097, "epoch": 0.053314121037463975, "grad_norm": 1.4296875, "learning_rate": 0.000277, "loss": 6.6336, "mean_token_accuracy": 0.11991709843277931, "num_tokens": 1281136.0, "step": 555 }, { "entropy": 6.914445209503174, "epoch": 0.053794428434197884, "grad_norm": 1.109375, "learning_rate": 0.0002795, "loss": 6.6257, "mean_token_accuracy": 0.12010404467582703, "num_tokens": 1294488.0, "step": 560 }, { "entropy": 6.732436418533325, "epoch": 0.05427473583093179, "grad_norm": 1.296875, "learning_rate": 0.00028199999999999997, "loss": 6.5262, "mean_token_accuracy": 0.12693093419075013, "num_tokens": 1304113.0, "step": 565 }, { "entropy": 6.927071809768677, "epoch": 0.05475504322766571, "grad_norm": 1.2890625, "learning_rate": 0.0002845, "loss": 6.5843, "mean_token_accuracy": 0.12877818644046785, "num_tokens": 1315417.0, "step": 570 }, { "entropy": 6.783261919021607, "epoch": 0.05523535062439962, "grad_norm": 1.34375, "learning_rate": 0.000287, "loss": 6.5521, "mean_token_accuracy": 0.1234595388174057, "num_tokens": 1328084.0, "step": 575 }, { "entropy": 6.8645414352417, "epoch": 0.05571565802113353, "grad_norm": 1.1328125, "learning_rate": 0.0002895, "loss": 6.6982, "mean_token_accuracy": 0.1229254849255085, "num_tokens": 1338696.0, "step": 580 }, { "entropy": 6.887264966964722, "epoch": 0.056195965417867436, "grad_norm": 1.1328125, "learning_rate": 0.000292, "loss": 6.6333, "mean_token_accuracy": 0.12206205278635025, "num_tokens": 1350240.0, "step": 585 }, { "entropy": 6.901881551742553, "epoch": 0.056676272814601344, "grad_norm": 1.390625, "learning_rate": 0.0002945, "loss": 6.5792, "mean_token_accuracy": 0.12374859303236008, "num_tokens": 1361720.0, "step": 590 }, { "entropy": 6.646714115142823, "epoch": 0.05715658021133525, "grad_norm": 1.453125, "learning_rate": 0.000297, "loss": 6.5831, "mean_token_accuracy": 0.12852583453059196, "num_tokens": 1373286.0, "step": 595 }, { "entropy": 6.89121675491333, "epoch": 0.05763688760806916, "grad_norm": 1.5390625, "learning_rate": 0.0002995, "loss": 6.5332, "mean_token_accuracy": 0.12378557696938515, "num_tokens": 1384274.0, "step": 600 }, { "entropy": 6.707057476043701, "epoch": 0.05811719500480307, "grad_norm": 1.2734375, "learning_rate": 0.000302, "loss": 6.5674, "mean_token_accuracy": 0.1248041570186615, "num_tokens": 1395355.0, "step": 605 }, { "entropy": 6.787681436538696, "epoch": 0.05859750240153699, "grad_norm": 1.59375, "learning_rate": 0.0003045, "loss": 6.5071, "mean_token_accuracy": 0.1337241604924202, "num_tokens": 1406664.0, "step": 610 }, { "entropy": 6.907395648956299, "epoch": 0.059077809798270896, "grad_norm": 1.1953125, "learning_rate": 0.000307, "loss": 6.6562, "mean_token_accuracy": 0.12113718539476395, "num_tokens": 1418450.0, "step": 615 }, { "entropy": 6.8045419216156, "epoch": 0.059558117195004805, "grad_norm": 1.1640625, "learning_rate": 0.0003095, "loss": 6.5466, "mean_token_accuracy": 0.12454390972852707, "num_tokens": 1430048.0, "step": 620 }, { "entropy": 6.808126592636109, "epoch": 0.060038424591738714, "grad_norm": 1.5703125, "learning_rate": 0.000312, "loss": 6.5911, "mean_token_accuracy": 0.12378140687942504, "num_tokens": 1441820.0, "step": 625 }, { "entropy": 6.753187370300293, "epoch": 0.06051873198847262, "grad_norm": 1.2109375, "learning_rate": 0.0003145, "loss": 6.445, "mean_token_accuracy": 0.13010460510849953, "num_tokens": 1453209.0, "step": 630 }, { "entropy": 6.6527941703796385, "epoch": 0.06099903938520653, "grad_norm": 1.3515625, "learning_rate": 0.000317, "loss": 6.4598, "mean_token_accuracy": 0.12725651860237122, "num_tokens": 1465423.0, "step": 635 }, { "entropy": 6.711978006362915, "epoch": 0.06147934678194044, "grad_norm": 1.234375, "learning_rate": 0.0003195, "loss": 6.4541, "mean_token_accuracy": 0.13069155365228652, "num_tokens": 1476575.0, "step": 640 }, { "entropy": 6.659121417999268, "epoch": 0.06195965417867435, "grad_norm": 1.734375, "learning_rate": 0.000322, "loss": 6.4109, "mean_token_accuracy": 0.12579366862773894, "num_tokens": 1486932.0, "step": 645 }, { "entropy": 6.691300868988037, "epoch": 0.06243996157540826, "grad_norm": 1.140625, "learning_rate": 0.00032450000000000003, "loss": 6.4399, "mean_token_accuracy": 0.12854820042848586, "num_tokens": 1498494.0, "step": 650 }, { "entropy": 6.7037928104400635, "epoch": 0.06292026897214217, "grad_norm": 1.2109375, "learning_rate": 0.00032700000000000003, "loss": 6.4936, "mean_token_accuracy": 0.12374913021922111, "num_tokens": 1509937.0, "step": 655 }, { "entropy": 6.782931184768676, "epoch": 0.06340057636887608, "grad_norm": 1.3125, "learning_rate": 0.00032950000000000004, "loss": 6.5147, "mean_token_accuracy": 0.13380258977413179, "num_tokens": 1519823.0, "step": 660 }, { "entropy": 6.726450872421265, "epoch": 0.06388088376560999, "grad_norm": 1.2890625, "learning_rate": 0.00033200000000000005, "loss": 6.5528, "mean_token_accuracy": 0.12575417309999465, "num_tokens": 1529943.0, "step": 665 }, { "entropy": 6.611954069137573, "epoch": 0.0643611911623439, "grad_norm": 1.2578125, "learning_rate": 0.00033450000000000005, "loss": 6.3767, "mean_token_accuracy": 0.13369367122650147, "num_tokens": 1540618.0, "step": 670 }, { "entropy": 6.685780334472656, "epoch": 0.06484149855907781, "grad_norm": 1.3515625, "learning_rate": 0.000337, "loss": 6.5048, "mean_token_accuracy": 0.1227756217122078, "num_tokens": 1553208.0, "step": 675 }, { "entropy": 6.6764894962310795, "epoch": 0.06532180595581172, "grad_norm": 1.3359375, "learning_rate": 0.0003395, "loss": 6.4589, "mean_token_accuracy": 0.1339925467967987, "num_tokens": 1563975.0, "step": 680 }, { "entropy": 6.717716455459595, "epoch": 0.06580211335254563, "grad_norm": 1.28125, "learning_rate": 0.000342, "loss": 6.5252, "mean_token_accuracy": 0.12458744868636132, "num_tokens": 1575998.0, "step": 685 }, { "entropy": 6.6251349449157715, "epoch": 0.06628242074927954, "grad_norm": 1.125, "learning_rate": 0.00034449999999999997, "loss": 6.3994, "mean_token_accuracy": 0.13568611592054367, "num_tokens": 1586041.0, "step": 690 }, { "entropy": 6.637330770492554, "epoch": 0.06676272814601344, "grad_norm": 1.4375, "learning_rate": 0.000347, "loss": 6.4796, "mean_token_accuracy": 0.12872253656387328, "num_tokens": 1597531.0, "step": 695 }, { "entropy": 6.617096710205078, "epoch": 0.06724303554274735, "grad_norm": 1.3828125, "learning_rate": 0.0003495, "loss": 6.4549, "mean_token_accuracy": 0.12859696000814438, "num_tokens": 1609255.0, "step": 700 }, { "entropy": 6.640483236312866, "epoch": 0.06772334293948126, "grad_norm": 1.265625, "learning_rate": 0.000352, "loss": 6.439, "mean_token_accuracy": 0.13394341096282006, "num_tokens": 1621098.0, "step": 705 }, { "entropy": 6.601499080657959, "epoch": 0.06820365033621517, "grad_norm": 1.2578125, "learning_rate": 0.0003545, "loss": 6.3504, "mean_token_accuracy": 0.14078185856342315, "num_tokens": 1631941.0, "step": 710 }, { "entropy": 6.551211166381836, "epoch": 0.0686839577329491, "grad_norm": 1.1484375, "learning_rate": 0.000357, "loss": 6.3471, "mean_token_accuracy": 0.13648251742124556, "num_tokens": 1643117.0, "step": 715 }, { "entropy": 6.5161905765533445, "epoch": 0.069164265129683, "grad_norm": 1.40625, "learning_rate": 0.0003595, "loss": 6.3952, "mean_token_accuracy": 0.13429828062653543, "num_tokens": 1653595.0, "step": 720 }, { "entropy": 6.614610481262207, "epoch": 0.06964457252641691, "grad_norm": 1.2734375, "learning_rate": 0.000362, "loss": 6.4168, "mean_token_accuracy": 0.13274685442447662, "num_tokens": 1664495.0, "step": 725 }, { "entropy": 6.5094832420349125, "epoch": 0.07012487992315082, "grad_norm": 1.1328125, "learning_rate": 0.0003645, "loss": 6.4047, "mean_token_accuracy": 0.136563728004694, "num_tokens": 1674923.0, "step": 730 }, { "entropy": 6.602942371368409, "epoch": 0.07060518731988473, "grad_norm": 1.234375, "learning_rate": 0.000367, "loss": 6.3045, "mean_token_accuracy": 0.13681301474571228, "num_tokens": 1685904.0, "step": 735 }, { "entropy": 6.596617603302002, "epoch": 0.07108549471661864, "grad_norm": 1.046875, "learning_rate": 0.0003695, "loss": 6.5324, "mean_token_accuracy": 0.12432878389954567, "num_tokens": 1699133.0, "step": 740 }, { "entropy": 6.504991292953491, "epoch": 0.07156580211335255, "grad_norm": 1.1796875, "learning_rate": 0.000372, "loss": 6.342, "mean_token_accuracy": 0.13271907046437265, "num_tokens": 1711559.0, "step": 745 }, { "entropy": 6.592547464370727, "epoch": 0.07204610951008646, "grad_norm": 1.15625, "learning_rate": 0.0003745, "loss": 6.2575, "mean_token_accuracy": 0.14460937380790712, "num_tokens": 1722526.0, "step": 750 }, { "entropy": 6.4313709259033205, "epoch": 0.07252641690682037, "grad_norm": 1.1015625, "learning_rate": 0.000377, "loss": 6.3265, "mean_token_accuracy": 0.1398925192654133, "num_tokens": 1734261.0, "step": 755 }, { "entropy": 6.5256377220153805, "epoch": 0.07300672430355427, "grad_norm": 1.203125, "learning_rate": 0.0003795, "loss": 6.3105, "mean_token_accuracy": 0.14366703033447265, "num_tokens": 1745151.0, "step": 760 }, { "entropy": 6.631883907318115, "epoch": 0.07348703170028818, "grad_norm": 1.3203125, "learning_rate": 0.000382, "loss": 6.4547, "mean_token_accuracy": 0.1341322012245655, "num_tokens": 1755463.0, "step": 765 }, { "entropy": 6.584089756011963, "epoch": 0.07396733909702209, "grad_norm": 1.3203125, "learning_rate": 0.0003845, "loss": 6.4178, "mean_token_accuracy": 0.1315837398171425, "num_tokens": 1767717.0, "step": 770 }, { "entropy": 6.3859930515289305, "epoch": 0.074447646493756, "grad_norm": 1.296875, "learning_rate": 0.00038700000000000003, "loss": 6.2619, "mean_token_accuracy": 0.14160886630415917, "num_tokens": 1779115.0, "step": 775 }, { "entropy": 6.3998737812042235, "epoch": 0.07492795389048991, "grad_norm": 1.3359375, "learning_rate": 0.00038950000000000003, "loss": 6.213, "mean_token_accuracy": 0.1398429863154888, "num_tokens": 1789644.0, "step": 780 }, { "entropy": 6.540688323974609, "epoch": 0.07540826128722382, "grad_norm": 1.140625, "learning_rate": 0.00039200000000000004, "loss": 6.4251, "mean_token_accuracy": 0.13578777611255646, "num_tokens": 1800606.0, "step": 785 }, { "entropy": 6.513448238372803, "epoch": 0.07588856868395773, "grad_norm": 1.1484375, "learning_rate": 0.00039450000000000005, "loss": 6.4264, "mean_token_accuracy": 0.12942690253257752, "num_tokens": 1812168.0, "step": 790 }, { "entropy": 6.5457319736480715, "epoch": 0.07636887608069164, "grad_norm": 1.2109375, "learning_rate": 0.00039700000000000005, "loss": 6.3796, "mean_token_accuracy": 0.1303087830543518, "num_tokens": 1823830.0, "step": 795 }, { "entropy": 6.495282316207886, "epoch": 0.07684918347742556, "grad_norm": 1.15625, "learning_rate": 0.0003995, "loss": 6.3456, "mean_token_accuracy": 0.13957973942160606, "num_tokens": 1835611.0, "step": 800 }, { "entropy": 6.467644214630127, "epoch": 0.07732949087415947, "grad_norm": 1.15625, "learning_rate": 0.000402, "loss": 6.4127, "mean_token_accuracy": 0.1334280975162983, "num_tokens": 1847036.0, "step": 805 }, { "entropy": 6.464094591140747, "epoch": 0.07780979827089338, "grad_norm": 1.296875, "learning_rate": 0.0004045, "loss": 6.3528, "mean_token_accuracy": 0.13223012760281563, "num_tokens": 1857476.0, "step": 810 }, { "entropy": 6.50727949142456, "epoch": 0.07829010566762729, "grad_norm": 1.1328125, "learning_rate": 0.00040699999999999997, "loss": 6.3773, "mean_token_accuracy": 0.1352442115545273, "num_tokens": 1869073.0, "step": 815 }, { "entropy": 6.384515810012817, "epoch": 0.0787704130643612, "grad_norm": 1.078125, "learning_rate": 0.0004095, "loss": 6.2486, "mean_token_accuracy": 0.14026699736714363, "num_tokens": 1880439.0, "step": 820 }, { "entropy": 6.561717510223389, "epoch": 0.0792507204610951, "grad_norm": 1.328125, "learning_rate": 0.000412, "loss": 6.4116, "mean_token_accuracy": 0.134783523529768, "num_tokens": 1891600.0, "step": 825 }, { "entropy": 6.414502573013306, "epoch": 0.07973102785782901, "grad_norm": 1.328125, "learning_rate": 0.0004145, "loss": 6.3783, "mean_token_accuracy": 0.13531816452741624, "num_tokens": 1903126.0, "step": 830 }, { "entropy": 6.5730548858642575, "epoch": 0.08021133525456292, "grad_norm": 1.296875, "learning_rate": 0.000417, "loss": 6.3467, "mean_token_accuracy": 0.14032403156161308, "num_tokens": 1913913.0, "step": 835 }, { "entropy": 6.344644355773926, "epoch": 0.08069164265129683, "grad_norm": 1.28125, "learning_rate": 0.0004195, "loss": 6.2684, "mean_token_accuracy": 0.1382530964910984, "num_tokens": 1924961.0, "step": 840 }, { "entropy": 6.523792457580567, "epoch": 0.08117195004803074, "grad_norm": 1.2421875, "learning_rate": 0.000422, "loss": 6.3612, "mean_token_accuracy": 0.12942377403378486, "num_tokens": 1936773.0, "step": 845 }, { "entropy": 6.355926513671875, "epoch": 0.08165225744476465, "grad_norm": 1.34375, "learning_rate": 0.0004245, "loss": 6.2783, "mean_token_accuracy": 0.13875910267233849, "num_tokens": 1948190.0, "step": 850 }, { "entropy": 6.331581449508667, "epoch": 0.08213256484149856, "grad_norm": 1.09375, "learning_rate": 0.000427, "loss": 6.2694, "mean_token_accuracy": 0.14160780385136604, "num_tokens": 1960038.0, "step": 855 }, { "entropy": 6.557125091552734, "epoch": 0.08261287223823247, "grad_norm": 1.2890625, "learning_rate": 0.0004295, "loss": 6.3489, "mean_token_accuracy": 0.14002878665924073, "num_tokens": 1970535.0, "step": 860 }, { "entropy": 6.411432456970215, "epoch": 0.08309317963496637, "grad_norm": 1.34375, "learning_rate": 0.000432, "loss": 6.3226, "mean_token_accuracy": 0.13546231836080552, "num_tokens": 1981386.0, "step": 865 }, { "entropy": 6.337710332870484, "epoch": 0.08357348703170028, "grad_norm": 1.1171875, "learning_rate": 0.0004345, "loss": 6.2428, "mean_token_accuracy": 0.1426716774702072, "num_tokens": 1993196.0, "step": 870 }, { "entropy": 6.432919025421143, "epoch": 0.08405379442843419, "grad_norm": 1.0859375, "learning_rate": 0.000437, "loss": 6.2741, "mean_token_accuracy": 0.14658503904938697, "num_tokens": 2004756.0, "step": 875 }, { "entropy": 6.315603113174438, "epoch": 0.0845341018251681, "grad_norm": 1.25, "learning_rate": 0.0004395, "loss": 6.2347, "mean_token_accuracy": 0.14145326390862464, "num_tokens": 2016020.0, "step": 880 }, { "entropy": 6.380750274658203, "epoch": 0.08501440922190202, "grad_norm": 1.2265625, "learning_rate": 0.000442, "loss": 6.2819, "mean_token_accuracy": 0.14082487300038338, "num_tokens": 2027747.0, "step": 885 }, { "entropy": 6.4264098644256595, "epoch": 0.08549471661863593, "grad_norm": 1.3984375, "learning_rate": 0.0004445, "loss": 6.2553, "mean_token_accuracy": 0.13818828240036965, "num_tokens": 2038841.0, "step": 890 }, { "entropy": 6.385887289047242, "epoch": 0.08597502401536984, "grad_norm": 1.046875, "learning_rate": 0.000447, "loss": 6.3043, "mean_token_accuracy": 0.13402576446533204, "num_tokens": 2049905.0, "step": 895 }, { "entropy": 6.424469089508056, "epoch": 0.08645533141210375, "grad_norm": 1.234375, "learning_rate": 0.00044950000000000003, "loss": 6.3803, "mean_token_accuracy": 0.13485484719276428, "num_tokens": 2062492.0, "step": 900 }, { "entropy": 6.387258577346802, "epoch": 0.08693563880883766, "grad_norm": 1.21875, "learning_rate": 0.00045200000000000004, "loss": 6.31, "mean_token_accuracy": 0.1353304862976074, "num_tokens": 2073840.0, "step": 905 }, { "entropy": 6.3580629348754885, "epoch": 0.08741594620557157, "grad_norm": 1.328125, "learning_rate": 0.00045450000000000004, "loss": 6.221, "mean_token_accuracy": 0.14060378223657607, "num_tokens": 2085720.0, "step": 910 }, { "entropy": 6.353258228302002, "epoch": 0.08789625360230548, "grad_norm": 1.0703125, "learning_rate": 0.00045700000000000005, "loss": 6.3039, "mean_token_accuracy": 0.1413162462413311, "num_tokens": 2096649.0, "step": 915 }, { "entropy": 6.436611890792847, "epoch": 0.08837656099903939, "grad_norm": 0.99609375, "learning_rate": 0.00045950000000000006, "loss": 6.3061, "mean_token_accuracy": 0.14285610914230346, "num_tokens": 2109030.0, "step": 920 }, { "entropy": 6.35608320236206, "epoch": 0.0888568683957733, "grad_norm": 1.15625, "learning_rate": 0.000462, "loss": 6.2113, "mean_token_accuracy": 0.14488047659397124, "num_tokens": 2121384.0, "step": 925 }, { "entropy": 6.269479846954345, "epoch": 0.0893371757925072, "grad_norm": 1.1796875, "learning_rate": 0.0004645, "loss": 6.1635, "mean_token_accuracy": 0.147640460729599, "num_tokens": 2131377.0, "step": 930 }, { "entropy": 6.344134902954101, "epoch": 0.08981748318924111, "grad_norm": 1.1484375, "learning_rate": 0.000467, "loss": 6.3531, "mean_token_accuracy": 0.1383367098867893, "num_tokens": 2142364.0, "step": 935 }, { "entropy": 6.356987571716308, "epoch": 0.09029779058597502, "grad_norm": 1.171875, "learning_rate": 0.0004695, "loss": 6.2296, "mean_token_accuracy": 0.14149210676550866, "num_tokens": 2153040.0, "step": 940 }, { "entropy": 6.35843825340271, "epoch": 0.09077809798270893, "grad_norm": 1.015625, "learning_rate": 0.000472, "loss": 6.2728, "mean_token_accuracy": 0.14314480721950532, "num_tokens": 2165571.0, "step": 945 }, { "entropy": 6.3020600318908695, "epoch": 0.09125840537944284, "grad_norm": 1.0546875, "learning_rate": 0.0004745, "loss": 6.2423, "mean_token_accuracy": 0.14072795882821082, "num_tokens": 2177241.0, "step": 950 }, { "entropy": 6.329180097579956, "epoch": 0.09173871277617675, "grad_norm": 1.2109375, "learning_rate": 0.000477, "loss": 6.2801, "mean_token_accuracy": 0.1361616224050522, "num_tokens": 2187475.0, "step": 955 }, { "entropy": 6.315436792373657, "epoch": 0.09221902017291066, "grad_norm": 1.0390625, "learning_rate": 0.0004795, "loss": 6.3087, "mean_token_accuracy": 0.14151085540652275, "num_tokens": 2198185.0, "step": 960 }, { "entropy": 6.303459358215332, "epoch": 0.09269932756964457, "grad_norm": 1.1640625, "learning_rate": 0.000482, "loss": 6.2346, "mean_token_accuracy": 0.14740882739424704, "num_tokens": 2210404.0, "step": 965 }, { "entropy": 6.370419549942016, "epoch": 0.09317963496637849, "grad_norm": 1.2109375, "learning_rate": 0.0004845, "loss": 6.2262, "mean_token_accuracy": 0.144054813683033, "num_tokens": 2222188.0, "step": 970 }, { "entropy": 6.290718269348145, "epoch": 0.0936599423631124, "grad_norm": 1.109375, "learning_rate": 0.000487, "loss": 6.2775, "mean_token_accuracy": 0.1421047918498516, "num_tokens": 2233418.0, "step": 975 }, { "entropy": 6.352431869506836, "epoch": 0.0941402497598463, "grad_norm": 1.125, "learning_rate": 0.0004895, "loss": 6.2415, "mean_token_accuracy": 0.14807373881340027, "num_tokens": 2245053.0, "step": 980 }, { "entropy": 6.250268840789795, "epoch": 0.09462055715658022, "grad_norm": 1.1328125, "learning_rate": 0.000492, "loss": 6.2715, "mean_token_accuracy": 0.14363499581813813, "num_tokens": 2256375.0, "step": 985 }, { "entropy": 6.225133609771729, "epoch": 0.09510086455331412, "grad_norm": 1.2265625, "learning_rate": 0.0004945, "loss": 6.1142, "mean_token_accuracy": 0.1477846160531044, "num_tokens": 2267074.0, "step": 990 }, { "entropy": 6.191523456573487, "epoch": 0.09558117195004803, "grad_norm": 1.15625, "learning_rate": 0.000497, "loss": 6.1547, "mean_token_accuracy": 0.14838184416294098, "num_tokens": 2277168.0, "step": 995 }, { "entropy": 6.25091781616211, "epoch": 0.09606147934678194, "grad_norm": 1.046875, "learning_rate": 0.0004995, "loss": 6.1381, "mean_token_accuracy": 0.14807945489883423, "num_tokens": 2288178.0, "step": 1000 }, { "entropy": 6.215264129638672, "epoch": 0.09654178674351585, "grad_norm": 1.1484375, "learning_rate": 0.0004999999983283737, "loss": 6.1686, "mean_token_accuracy": 0.1440332628786564, "num_tokens": 2299765.0, "step": 1005 }, { "entropy": 6.3124645233154295, "epoch": 0.09702209414024976, "grad_norm": 1.15625, "learning_rate": 0.0004999999915373924, "loss": 6.2644, "mean_token_accuracy": 0.13689299449324607, "num_tokens": 2312047.0, "step": 1010 }, { "entropy": 6.30297064781189, "epoch": 0.09750240153698367, "grad_norm": 1.1484375, "learning_rate": 0.0004999999795225793, "loss": 6.2563, "mean_token_accuracy": 0.1363622300326824, "num_tokens": 2324118.0, "step": 1015 }, { "entropy": 6.299112796783447, "epoch": 0.09798270893371758, "grad_norm": 1.203125, "learning_rate": 0.0004999999622839347, "loss": 6.2494, "mean_token_accuracy": 0.14326749965548516, "num_tokens": 2335171.0, "step": 1020 }, { "entropy": 6.283253812789917, "epoch": 0.09846301633045149, "grad_norm": 1.078125, "learning_rate": 0.0004999999398214593, "loss": 6.1501, "mean_token_accuracy": 0.14212532341480255, "num_tokens": 2346338.0, "step": 1025 }, { "entropy": 6.212884902954102, "epoch": 0.0989433237271854, "grad_norm": 1.1875, "learning_rate": 0.0004999999121351532, "loss": 6.1934, "mean_token_accuracy": 0.14963782876729964, "num_tokens": 2357185.0, "step": 1030 }, { "entropy": 6.190281915664673, "epoch": 0.0994236311239193, "grad_norm": 1.109375, "learning_rate": 0.0004999998792250173, "loss": 6.1183, "mean_token_accuracy": 0.15685753300786018, "num_tokens": 2368494.0, "step": 1035 }, { "entropy": 6.289627552032471, "epoch": 0.09990393852065321, "grad_norm": 1.15625, "learning_rate": 0.0004999998410910524, "loss": 6.3364, "mean_token_accuracy": 0.13329742476344109, "num_tokens": 2380800.0, "step": 1040 }, { "entropy": 6.3118733882904055, "epoch": 0.10038424591738712, "grad_norm": 1.0859375, "learning_rate": 0.0004999997977332592, "loss": 6.2551, "mean_token_accuracy": 0.13934137374162675, "num_tokens": 2391753.0, "step": 1045 }, { "entropy": 6.178606843948364, "epoch": 0.10086455331412104, "grad_norm": 1.0390625, "learning_rate": 0.0004999997491516389, "loss": 6.1391, "mean_token_accuracy": 0.1400229126214981, "num_tokens": 2403324.0, "step": 1050 }, { "entropy": 6.235824918746948, "epoch": 0.10134486071085495, "grad_norm": 1.15625, "learning_rate": 0.0004999996953461925, "loss": 6.2482, "mean_token_accuracy": 0.13423383459448815, "num_tokens": 2414873.0, "step": 1055 }, { "entropy": 6.138184642791748, "epoch": 0.10182516810758886, "grad_norm": 1.0, "learning_rate": 0.0004999996363169212, "loss": 6.0208, "mean_token_accuracy": 0.15671658217906953, "num_tokens": 2425308.0, "step": 1060 }, { "entropy": 6.144180011749268, "epoch": 0.10230547550432277, "grad_norm": 1.109375, "learning_rate": 0.0004999995720638266, "loss": 6.0654, "mean_token_accuracy": 0.1525282308459282, "num_tokens": 2436835.0, "step": 1065 }, { "entropy": 6.183439445495606, "epoch": 0.10278578290105668, "grad_norm": 1.140625, "learning_rate": 0.00049999950258691, "loss": 6.1921, "mean_token_accuracy": 0.1451313279569149, "num_tokens": 2446798.0, "step": 1070 }, { "entropy": 6.123720979690551, "epoch": 0.10326609029779059, "grad_norm": 1.15625, "learning_rate": 0.0004999994278861731, "loss": 6.0747, "mean_token_accuracy": 0.15084402859210969, "num_tokens": 2457308.0, "step": 1075 }, { "entropy": 6.215669107437134, "epoch": 0.1037463976945245, "grad_norm": 1.0390625, "learning_rate": 0.0004999993479616175, "loss": 6.1309, "mean_token_accuracy": 0.13830516785383223, "num_tokens": 2468917.0, "step": 1080 }, { "entropy": 6.227848720550537, "epoch": 0.1042267050912584, "grad_norm": 1.09375, "learning_rate": 0.0004999992628132451, "loss": 6.1529, "mean_token_accuracy": 0.14558819606900214, "num_tokens": 2481363.0, "step": 1085 }, { "entropy": 6.175233983993531, "epoch": 0.10470701248799232, "grad_norm": 1.046875, "learning_rate": 0.0004999991724410582, "loss": 6.1551, "mean_token_accuracy": 0.14347582682967186, "num_tokens": 2493082.0, "step": 1090 }, { "entropy": 6.150361251831055, "epoch": 0.10518731988472622, "grad_norm": 1.0703125, "learning_rate": 0.0004999990768450583, "loss": 6.106, "mean_token_accuracy": 0.1499667778611183, "num_tokens": 2503849.0, "step": 1095 }, { "entropy": 6.225272464752197, "epoch": 0.10566762728146013, "grad_norm": 1.140625, "learning_rate": 0.0004999989760252482, "loss": 6.1511, "mean_token_accuracy": 0.14817013815045357, "num_tokens": 2514528.0, "step": 1100 }, { "entropy": 6.097928714752197, "epoch": 0.10614793467819404, "grad_norm": 1.2578125, "learning_rate": 0.0004999988699816299, "loss": 6.1427, "mean_token_accuracy": 0.14771459847688675, "num_tokens": 2524971.0, "step": 1105 }, { "entropy": 6.153327941894531, "epoch": 0.10662824207492795, "grad_norm": 1.03125, "learning_rate": 0.0004999987587142058, "loss": 6.057, "mean_token_accuracy": 0.14452041387557985, "num_tokens": 2535674.0, "step": 1110 }, { "entropy": 6.2696786403656, "epoch": 0.10710854947166186, "grad_norm": 1.09375, "learning_rate": 0.0004999986422229789, "loss": 6.2903, "mean_token_accuracy": 0.13996392711997033, "num_tokens": 2547108.0, "step": 1115 }, { "entropy": 6.155757236480713, "epoch": 0.10758885686839577, "grad_norm": 1.015625, "learning_rate": 0.0004999985205079514, "loss": 6.1047, "mean_token_accuracy": 0.1451355442404747, "num_tokens": 2559474.0, "step": 1120 }, { "entropy": 6.012842035293579, "epoch": 0.10806916426512968, "grad_norm": 1.03125, "learning_rate": 0.0004999983935691265, "loss": 5.9441, "mean_token_accuracy": 0.16244944632053376, "num_tokens": 2571264.0, "step": 1125 }, { "entropy": 6.159362649917602, "epoch": 0.10854947166186359, "grad_norm": 1.03125, "learning_rate": 0.000499998261406507, "loss": 6.1208, "mean_token_accuracy": 0.1507526934146881, "num_tokens": 2583731.0, "step": 1130 }, { "entropy": 6.268857860565186, "epoch": 0.10902977905859751, "grad_norm": 1.25, "learning_rate": 0.0004999981240200958, "loss": 6.1607, "mean_token_accuracy": 0.14638862013816833, "num_tokens": 2595497.0, "step": 1135 }, { "entropy": 6.053813219070435, "epoch": 0.10951008645533142, "grad_norm": 0.9921875, "learning_rate": 0.0004999979814098966, "loss": 6.1148, "mean_token_accuracy": 0.1516471363604069, "num_tokens": 2607358.0, "step": 1140 }, { "entropy": 6.1449603080749515, "epoch": 0.10999039385206533, "grad_norm": 1.109375, "learning_rate": 0.0004999978335759121, "loss": 6.0354, "mean_token_accuracy": 0.15392047837376593, "num_tokens": 2618936.0, "step": 1145 }, { "entropy": 6.154958772659302, "epoch": 0.11047070124879924, "grad_norm": 1.0703125, "learning_rate": 0.0004999976805181461, "loss": 6.1981, "mean_token_accuracy": 0.14167412593960763, "num_tokens": 2631840.0, "step": 1150 }, { "entropy": 6.140295743942261, "epoch": 0.11095100864553314, "grad_norm": 1.109375, "learning_rate": 0.000499997522236602, "loss": 6.1443, "mean_token_accuracy": 0.15361175835132598, "num_tokens": 2642412.0, "step": 1155 }, { "entropy": 6.160842370986939, "epoch": 0.11143131604226705, "grad_norm": 0.97265625, "learning_rate": 0.0004999973587312837, "loss": 6.1067, "mean_token_accuracy": 0.14919153451919556, "num_tokens": 2653890.0, "step": 1160 }, { "entropy": 6.146590614318848, "epoch": 0.11191162343900096, "grad_norm": 1.1015625, "learning_rate": 0.0004999971900021947, "loss": 6.163, "mean_token_accuracy": 0.15273661985993386, "num_tokens": 2664888.0, "step": 1165 }, { "entropy": 6.159024953842163, "epoch": 0.11239193083573487, "grad_norm": 1.0078125, "learning_rate": 0.0004999970160493391, "loss": 6.0579, "mean_token_accuracy": 0.14569913148880004, "num_tokens": 2675550.0, "step": 1170 }, { "entropy": 6.02392611503601, "epoch": 0.11287223823246878, "grad_norm": 1.015625, "learning_rate": 0.0004999968368727209, "loss": 6.0724, "mean_token_accuracy": 0.15466973930597305, "num_tokens": 2688022.0, "step": 1175 }, { "entropy": 6.1862691879272464, "epoch": 0.11335254562920269, "grad_norm": 0.94921875, "learning_rate": 0.0004999966524723442, "loss": 6.0632, "mean_token_accuracy": 0.14964798092842102, "num_tokens": 2698737.0, "step": 1180 }, { "entropy": 6.077165365219116, "epoch": 0.1138328530259366, "grad_norm": 0.98046875, "learning_rate": 0.0004999964628482135, "loss": 6.0344, "mean_token_accuracy": 0.15742302685976028, "num_tokens": 2709844.0, "step": 1185 }, { "entropy": 6.127112817764282, "epoch": 0.1143131604226705, "grad_norm": 1.0, "learning_rate": 0.0004999962680003328, "loss": 6.1035, "mean_token_accuracy": 0.1519095703959465, "num_tokens": 2720273.0, "step": 1190 }, { "entropy": 6.1255943775177, "epoch": 0.11479346781940442, "grad_norm": 1.078125, "learning_rate": 0.000499996067928707, "loss": 6.1124, "mean_token_accuracy": 0.14679019302129745, "num_tokens": 2731354.0, "step": 1195 }, { "entropy": 6.127178192138672, "epoch": 0.11527377521613832, "grad_norm": 1.0703125, "learning_rate": 0.0004999958626333406, "loss": 6.1052, "mean_token_accuracy": 0.1527300015091896, "num_tokens": 2742966.0, "step": 1200 }, { "entropy": 6.03611798286438, "epoch": 0.11575408261287223, "grad_norm": 1.0703125, "learning_rate": 0.0004999956521142383, "loss": 6.009, "mean_token_accuracy": 0.1586822062730789, "num_tokens": 2755010.0, "step": 1205 }, { "entropy": 6.0991308212280275, "epoch": 0.11623439000960614, "grad_norm": 1.03125, "learning_rate": 0.0004999954363714051, "loss": 6.0361, "mean_token_accuracy": 0.14981242269277573, "num_tokens": 2766176.0, "step": 1210 }, { "entropy": 6.185801792144775, "epoch": 0.11671469740634005, "grad_norm": 1.015625, "learning_rate": 0.0004999952154048459, "loss": 6.1829, "mean_token_accuracy": 0.15044604614377022, "num_tokens": 2777861.0, "step": 1215 }, { "entropy": 6.021704149246216, "epoch": 0.11719500480307397, "grad_norm": 1.0234375, "learning_rate": 0.000499994989214566, "loss": 5.9954, "mean_token_accuracy": 0.1536705419421196, "num_tokens": 2788725.0, "step": 1220 }, { "entropy": 6.0181561470031735, "epoch": 0.11767531219980788, "grad_norm": 0.98046875, "learning_rate": 0.0004999947578005705, "loss": 6.0312, "mean_token_accuracy": 0.15193646997213364, "num_tokens": 2801613.0, "step": 1225 }, { "entropy": 6.218272686004639, "epoch": 0.11815561959654179, "grad_norm": 0.98828125, "learning_rate": 0.0004999945211628648, "loss": 6.0986, "mean_token_accuracy": 0.1493365317583084, "num_tokens": 2812474.0, "step": 1230 }, { "entropy": 5.971197032928467, "epoch": 0.1186359269932757, "grad_norm": 1.03125, "learning_rate": 0.0004999942793014544, "loss": 6.0103, "mean_token_accuracy": 0.15563429594039918, "num_tokens": 2823178.0, "step": 1235 }, { "entropy": 6.045905733108521, "epoch": 0.11911623439000961, "grad_norm": 0.9375, "learning_rate": 0.000499994032216345, "loss": 6.0211, "mean_token_accuracy": 0.15064174830913543, "num_tokens": 2836486.0, "step": 1240 }, { "entropy": 6.107371759414673, "epoch": 0.11959654178674352, "grad_norm": 1.0703125, "learning_rate": 0.0004999937799075422, "loss": 6.0746, "mean_token_accuracy": 0.1570821538567543, "num_tokens": 2847902.0, "step": 1245 }, { "entropy": 5.903108596801758, "epoch": 0.12007684918347743, "grad_norm": 0.99609375, "learning_rate": 0.000499993522375052, "loss": 5.9739, "mean_token_accuracy": 0.15461545437574387, "num_tokens": 2859991.0, "step": 1250 }, { "entropy": 6.248143100738526, "epoch": 0.12055715658021134, "grad_norm": 1.0546875, "learning_rate": 0.0004999932596188802, "loss": 6.1545, "mean_token_accuracy": 0.14593613222241403, "num_tokens": 2870269.0, "step": 1255 }, { "entropy": 6.034249687194825, "epoch": 0.12103746397694524, "grad_norm": 1.1171875, "learning_rate": 0.0004999929916390331, "loss": 6.0279, "mean_token_accuracy": 0.14597706943750383, "num_tokens": 2882191.0, "step": 1260 }, { "entropy": 5.966269588470459, "epoch": 0.12151777137367915, "grad_norm": 0.99609375, "learning_rate": 0.0004999927184355169, "loss": 6.0372, "mean_token_accuracy": 0.14836430177092552, "num_tokens": 2892775.0, "step": 1265 }, { "entropy": 6.147925519943238, "epoch": 0.12199807877041306, "grad_norm": 1.0234375, "learning_rate": 0.0004999924400083377, "loss": 6.0247, "mean_token_accuracy": 0.15831544399261474, "num_tokens": 2904750.0, "step": 1270 }, { "entropy": 6.081568050384521, "epoch": 0.12247838616714697, "grad_norm": 1.0078125, "learning_rate": 0.0004999921563575022, "loss": 6.0988, "mean_token_accuracy": 0.14920950308442116, "num_tokens": 2916150.0, "step": 1275 }, { "entropy": 6.07696213722229, "epoch": 0.12295869356388088, "grad_norm": 1.09375, "learning_rate": 0.0004999918674830169, "loss": 6.0644, "mean_token_accuracy": 0.1496642827987671, "num_tokens": 2928452.0, "step": 1280 }, { "entropy": 6.035782670974731, "epoch": 0.12343900096061479, "grad_norm": 1.0703125, "learning_rate": 0.0004999915733848886, "loss": 6.0442, "mean_token_accuracy": 0.1454036220908165, "num_tokens": 2940577.0, "step": 1285 }, { "entropy": 6.022758436203003, "epoch": 0.1239193083573487, "grad_norm": 1.0390625, "learning_rate": 0.000499991274063124, "loss": 6.0283, "mean_token_accuracy": 0.15150520876049994, "num_tokens": 2952302.0, "step": 1290 }, { "entropy": 6.0645428657531735, "epoch": 0.12439961575408261, "grad_norm": 1.1328125, "learning_rate": 0.0004999909695177301, "loss": 6.0669, "mean_token_accuracy": 0.15440516471862792, "num_tokens": 2964611.0, "step": 1295 }, { "entropy": 6.0961566925048825, "epoch": 0.12487992315081652, "grad_norm": 1.0078125, "learning_rate": 0.000499990659748714, "loss": 6.05, "mean_token_accuracy": 0.15006925463676452, "num_tokens": 2975668.0, "step": 1300 }, { "entropy": 6.146146440505982, "epoch": 0.12536023054755044, "grad_norm": 1.015625, "learning_rate": 0.0004999903447560828, "loss": 6.1198, "mean_token_accuracy": 0.14781473577022552, "num_tokens": 2987303.0, "step": 1305 }, { "entropy": 6.117984342575073, "epoch": 0.12584053794428435, "grad_norm": 0.9453125, "learning_rate": 0.0004999900245398439, "loss": 6.0166, "mean_token_accuracy": 0.16036698669195176, "num_tokens": 3000400.0, "step": 1310 }, { "entropy": 6.010946893692017, "epoch": 0.12632084534101826, "grad_norm": 1.0625, "learning_rate": 0.0004999896991000047, "loss": 5.9477, "mean_token_accuracy": 0.1495976448059082, "num_tokens": 3012336.0, "step": 1315 }, { "entropy": 6.054377698898316, "epoch": 0.12680115273775217, "grad_norm": 1.046875, "learning_rate": 0.0004999893684365729, "loss": 6.0047, "mean_token_accuracy": 0.15137309059500695, "num_tokens": 3023004.0, "step": 1320 }, { "entropy": 6.044629859924316, "epoch": 0.12728146013448607, "grad_norm": 0.984375, "learning_rate": 0.0004999890325495559, "loss": 6.0922, "mean_token_accuracy": 0.147823116928339, "num_tokens": 3035147.0, "step": 1325 }, { "entropy": 6.072157478332519, "epoch": 0.12776176753121998, "grad_norm": 1.0078125, "learning_rate": 0.0004999886914389617, "loss": 5.9177, "mean_token_accuracy": 0.1551705077290535, "num_tokens": 3045611.0, "step": 1330 }, { "entropy": 5.916638660430908, "epoch": 0.1282420749279539, "grad_norm": 0.92578125, "learning_rate": 0.0004999883451047981, "loss": 5.9296, "mean_token_accuracy": 0.1561925306916237, "num_tokens": 3056420.0, "step": 1335 }, { "entropy": 5.977782440185547, "epoch": 0.1287223823246878, "grad_norm": 0.9765625, "learning_rate": 0.0004999879935470733, "loss": 5.9227, "mean_token_accuracy": 0.15750788599252702, "num_tokens": 3068770.0, "step": 1340 }, { "entropy": 6.05616979598999, "epoch": 0.1292026897214217, "grad_norm": 1.015625, "learning_rate": 0.0004999876367657954, "loss": 6.0521, "mean_token_accuracy": 0.14580482840538025, "num_tokens": 3080806.0, "step": 1345 }, { "entropy": 6.143747854232788, "epoch": 0.12968299711815562, "grad_norm": 1.015625, "learning_rate": 0.0004999872747609725, "loss": 6.0742, "mean_token_accuracy": 0.1484417587518692, "num_tokens": 3091769.0, "step": 1350 }, { "entropy": 5.9879156112670895, "epoch": 0.13016330451488953, "grad_norm": 1.171875, "learning_rate": 0.0004999869075326132, "loss": 5.9938, "mean_token_accuracy": 0.15191702395677567, "num_tokens": 3103121.0, "step": 1355 }, { "entropy": 6.010816240310669, "epoch": 0.13064361191162344, "grad_norm": 0.890625, "learning_rate": 0.000499986535080726, "loss": 5.9724, "mean_token_accuracy": 0.16233935654163362, "num_tokens": 3115606.0, "step": 1360 }, { "entropy": 6.026129817962646, "epoch": 0.13112391930835735, "grad_norm": 0.94921875, "learning_rate": 0.0004999861574053196, "loss": 5.8723, "mean_token_accuracy": 0.16096271872520446, "num_tokens": 3127961.0, "step": 1365 }, { "entropy": 5.87260947227478, "epoch": 0.13160422670509125, "grad_norm": 1.015625, "learning_rate": 0.0004999857745064027, "loss": 5.8905, "mean_token_accuracy": 0.15895691215991975, "num_tokens": 3138316.0, "step": 1370 }, { "entropy": 5.953699588775635, "epoch": 0.13208453410182516, "grad_norm": 0.9296875, "learning_rate": 0.000499985386383984, "loss": 5.8671, "mean_token_accuracy": 0.15866711735725403, "num_tokens": 3150818.0, "step": 1375 }, { "entropy": 6.006815195083618, "epoch": 0.13256484149855907, "grad_norm": 1.1015625, "learning_rate": 0.0004999849930380729, "loss": 6.0195, "mean_token_accuracy": 0.1508159779012203, "num_tokens": 3162066.0, "step": 1380 }, { "entropy": 5.941660642623901, "epoch": 0.13304514889529298, "grad_norm": 1.09375, "learning_rate": 0.0004999845944686781, "loss": 5.9924, "mean_token_accuracy": 0.1508888617157936, "num_tokens": 3172209.0, "step": 1385 }, { "entropy": 5.954594707489013, "epoch": 0.1335254562920269, "grad_norm": 0.98046875, "learning_rate": 0.0004999841906758093, "loss": 5.8218, "mean_token_accuracy": 0.1675858825445175, "num_tokens": 3183248.0, "step": 1390 }, { "entropy": 5.94215030670166, "epoch": 0.1340057636887608, "grad_norm": 1.0625, "learning_rate": 0.0004999837816594757, "loss": 5.9139, "mean_token_accuracy": 0.15847276002168656, "num_tokens": 3194748.0, "step": 1395 }, { "entropy": 5.930553770065307, "epoch": 0.1344860710854947, "grad_norm": 1.0625, "learning_rate": 0.0004999833674196865, "loss": 5.8849, "mean_token_accuracy": 0.16950529664754868, "num_tokens": 3205669.0, "step": 1400 }, { "entropy": 5.932918214797974, "epoch": 0.13496637848222862, "grad_norm": 1.0234375, "learning_rate": 0.0004999829479564518, "loss": 5.9807, "mean_token_accuracy": 0.14995542094111441, "num_tokens": 3216035.0, "step": 1405 }, { "entropy": 6.064324188232422, "epoch": 0.13544668587896252, "grad_norm": 1.109375, "learning_rate": 0.000499982523269781, "loss": 5.9647, "mean_token_accuracy": 0.15931690335273743, "num_tokens": 3227192.0, "step": 1410 }, { "entropy": 5.975619888305664, "epoch": 0.13592699327569643, "grad_norm": 0.9609375, "learning_rate": 0.0004999820933596842, "loss": 5.9871, "mean_token_accuracy": 0.15620121210813523, "num_tokens": 3240237.0, "step": 1415 }, { "entropy": 5.962911701202392, "epoch": 0.13640730067243034, "grad_norm": 1.0234375, "learning_rate": 0.000499981658226171, "loss": 5.8734, "mean_token_accuracy": 0.16469697579741477, "num_tokens": 3251963.0, "step": 1420 }, { "entropy": 5.908741474151611, "epoch": 0.13688760806916425, "grad_norm": 1.0078125, "learning_rate": 0.000499981217869252, "loss": 5.9953, "mean_token_accuracy": 0.15814436972141266, "num_tokens": 3263101.0, "step": 1425 }, { "entropy": 5.985613679885864, "epoch": 0.1373679154658982, "grad_norm": 1.1015625, "learning_rate": 0.000499980772288937, "loss": 5.8679, "mean_token_accuracy": 0.16649020761251448, "num_tokens": 3275100.0, "step": 1430 }, { "entropy": 5.945235109329223, "epoch": 0.1378482228626321, "grad_norm": 0.9140625, "learning_rate": 0.0004999803214852367, "loss": 5.9638, "mean_token_accuracy": 0.15565589517354966, "num_tokens": 3287025.0, "step": 1435 }, { "entropy": 6.04934253692627, "epoch": 0.138328530259366, "grad_norm": 0.91796875, "learning_rate": 0.0004999798654581613, "loss": 5.9662, "mean_token_accuracy": 0.15883919447660447, "num_tokens": 3299867.0, "step": 1440 }, { "entropy": 5.918570852279663, "epoch": 0.13880883765609991, "grad_norm": 1.046875, "learning_rate": 0.0004999794042077214, "loss": 5.9038, "mean_token_accuracy": 0.16191874593496322, "num_tokens": 3311183.0, "step": 1445 }, { "entropy": 5.952925539016723, "epoch": 0.13928914505283382, "grad_norm": 1.1484375, "learning_rate": 0.0004999789377339279, "loss": 5.9687, "mean_token_accuracy": 0.15641413480043412, "num_tokens": 3322247.0, "step": 1450 }, { "entropy": 5.962415742874145, "epoch": 0.13976945244956773, "grad_norm": 1.03125, "learning_rate": 0.0004999784660367915, "loss": 5.8826, "mean_token_accuracy": 0.1588966131210327, "num_tokens": 3333369.0, "step": 1455 }, { "entropy": 5.904612874984741, "epoch": 0.14024975984630164, "grad_norm": 1.0546875, "learning_rate": 0.0004999779891163231, "loss": 5.9113, "mean_token_accuracy": 0.16011089235544204, "num_tokens": 3345876.0, "step": 1460 }, { "entropy": 5.91278772354126, "epoch": 0.14073006724303555, "grad_norm": 1.0234375, "learning_rate": 0.0004999775069725339, "loss": 5.8124, "mean_token_accuracy": 0.1629629462957382, "num_tokens": 3357323.0, "step": 1465 }, { "entropy": 5.912459039688111, "epoch": 0.14121037463976946, "grad_norm": 1.109375, "learning_rate": 0.000499977019605435, "loss": 5.897, "mean_token_accuracy": 0.15947655588388443, "num_tokens": 3367689.0, "step": 1470 }, { "entropy": 5.844752836227417, "epoch": 0.14169068203650337, "grad_norm": 0.9921875, "learning_rate": 0.0004999765270150378, "loss": 5.8568, "mean_token_accuracy": 0.15955205261707306, "num_tokens": 3379472.0, "step": 1475 }, { "entropy": 5.996302938461303, "epoch": 0.14217098943323728, "grad_norm": 1.015625, "learning_rate": 0.0004999760292013536, "loss": 5.8922, "mean_token_accuracy": 0.15859662368893623, "num_tokens": 3390929.0, "step": 1480 }, { "entropy": 5.99014687538147, "epoch": 0.14265129682997119, "grad_norm": 1.0625, "learning_rate": 0.0004999755261643941, "loss": 5.8976, "mean_token_accuracy": 0.16287715286016463, "num_tokens": 3401242.0, "step": 1485 }, { "entropy": 5.869934892654419, "epoch": 0.1431316042267051, "grad_norm": 1.0859375, "learning_rate": 0.0004999750179041709, "loss": 5.8878, "mean_token_accuracy": 0.16124220937490463, "num_tokens": 3411169.0, "step": 1490 }, { "entropy": 5.874157810211182, "epoch": 0.143611911623439, "grad_norm": 1.0859375, "learning_rate": 0.0004999745044206959, "loss": 5.7279, "mean_token_accuracy": 0.16647156924009324, "num_tokens": 3423265.0, "step": 1495 }, { "entropy": 5.832660913467407, "epoch": 0.1440922190201729, "grad_norm": 0.96484375, "learning_rate": 0.0004999739857139809, "loss": 5.8347, "mean_token_accuracy": 0.16908216327428818, "num_tokens": 3434793.0, "step": 1500 }, { "entropy": 5.757522106170654, "epoch": 0.14457252641690682, "grad_norm": 0.98828125, "learning_rate": 0.000499973461784038, "loss": 5.7679, "mean_token_accuracy": 0.17928926199674605, "num_tokens": 3445732.0, "step": 1505 }, { "entropy": 5.942258501052857, "epoch": 0.14505283381364073, "grad_norm": 0.98046875, "learning_rate": 0.0004999729326308792, "loss": 5.9516, "mean_token_accuracy": 0.15832037180662156, "num_tokens": 3457090.0, "step": 1510 }, { "entropy": 5.99946174621582, "epoch": 0.14553314121037464, "grad_norm": 1.1484375, "learning_rate": 0.000499972398254517, "loss": 5.9388, "mean_token_accuracy": 0.15340567082166673, "num_tokens": 3468087.0, "step": 1515 }, { "entropy": 5.941799163818359, "epoch": 0.14601344860710855, "grad_norm": 1.0625, "learning_rate": 0.000499971858654964, "loss": 5.8778, "mean_token_accuracy": 0.1609287366271019, "num_tokens": 3478820.0, "step": 1520 }, { "entropy": 5.859274196624756, "epoch": 0.14649375600384246, "grad_norm": 0.97265625, "learning_rate": 0.0004999713138322321, "loss": 5.9021, "mean_token_accuracy": 0.15754427909851074, "num_tokens": 3489878.0, "step": 1525 }, { "entropy": 5.942076396942139, "epoch": 0.14697406340057637, "grad_norm": 1.03125, "learning_rate": 0.0004999707637863346, "loss": 5.8905, "mean_token_accuracy": 0.1585473045706749, "num_tokens": 3500944.0, "step": 1530 }, { "entropy": 5.8406360149383545, "epoch": 0.14745437079731027, "grad_norm": 1.078125, "learning_rate": 0.0004999702085172838, "loss": 5.8719, "mean_token_accuracy": 0.16607238352298737, "num_tokens": 3511383.0, "step": 1535 }, { "entropy": 5.969763612747192, "epoch": 0.14793467819404418, "grad_norm": 0.9609375, "learning_rate": 0.0004999696480250929, "loss": 5.963, "mean_token_accuracy": 0.15430965945124625, "num_tokens": 3523300.0, "step": 1540 }, { "entropy": 5.970634698867798, "epoch": 0.1484149855907781, "grad_norm": 1.1953125, "learning_rate": 0.0004999690823097747, "loss": 5.8799, "mean_token_accuracy": 0.1521039791405201, "num_tokens": 3534371.0, "step": 1545 }, { "entropy": 5.841155576705932, "epoch": 0.148895292987512, "grad_norm": 1.1171875, "learning_rate": 0.0004999685113713426, "loss": 5.8552, "mean_token_accuracy": 0.16120514869689942, "num_tokens": 3544847.0, "step": 1550 }, { "entropy": 5.92685284614563, "epoch": 0.1493756003842459, "grad_norm": 0.9765625, "learning_rate": 0.0004999679352098096, "loss": 5.8223, "mean_token_accuracy": 0.16645588725805283, "num_tokens": 3555859.0, "step": 1555 }, { "entropy": 5.8343531608581545, "epoch": 0.14985590778097982, "grad_norm": 0.9375, "learning_rate": 0.0004999673538251891, "loss": 5.8389, "mean_token_accuracy": 0.15894080251455306, "num_tokens": 3568283.0, "step": 1560 }, { "entropy": 5.834793663024902, "epoch": 0.15033621517771373, "grad_norm": 0.9609375, "learning_rate": 0.0004999667672174947, "loss": 5.917, "mean_token_accuracy": 0.1583700641989708, "num_tokens": 3581442.0, "step": 1565 }, { "entropy": 6.0175745487213135, "epoch": 0.15081652257444764, "grad_norm": 1.015625, "learning_rate": 0.00049996617538674, "loss": 5.9571, "mean_token_accuracy": 0.15496992468833923, "num_tokens": 3594055.0, "step": 1570 }, { "entropy": 5.962413930892945, "epoch": 0.15129682997118155, "grad_norm": 1.0, "learning_rate": 0.0004999655783329386, "loss": 5.9187, "mean_token_accuracy": 0.15283605754375457, "num_tokens": 3605952.0, "step": 1575 }, { "entropy": 5.910793209075928, "epoch": 0.15177713736791545, "grad_norm": 0.98828125, "learning_rate": 0.0004999649760561046, "loss": 5.9577, "mean_token_accuracy": 0.158383572101593, "num_tokens": 3618544.0, "step": 1580 }, { "entropy": 5.908201408386231, "epoch": 0.15225744476464936, "grad_norm": 1.03125, "learning_rate": 0.0004999643685562519, "loss": 5.8929, "mean_token_accuracy": 0.16440413743257523, "num_tokens": 3630445.0, "step": 1585 }, { "entropy": 5.935053777694702, "epoch": 0.15273775216138327, "grad_norm": 1.0234375, "learning_rate": 0.0004999637558333945, "loss": 5.8797, "mean_token_accuracy": 0.16155748218297958, "num_tokens": 3642516.0, "step": 1590 }, { "entropy": 5.843541431427002, "epoch": 0.15321805955811718, "grad_norm": 1.046875, "learning_rate": 0.0004999631378875467, "loss": 5.8175, "mean_token_accuracy": 0.16581382006406784, "num_tokens": 3654425.0, "step": 1595 }, { "entropy": 5.805763053894043, "epoch": 0.15369836695485112, "grad_norm": 0.9765625, "learning_rate": 0.0004999625147187228, "loss": 5.8228, "mean_token_accuracy": 0.16464165300130845, "num_tokens": 3666521.0, "step": 1600 }, { "entropy": 6.019205856323242, "epoch": 0.15417867435158503, "grad_norm": 0.94140625, "learning_rate": 0.0004999618863269373, "loss": 5.8806, "mean_token_accuracy": 0.15575164407491685, "num_tokens": 3679121.0, "step": 1605 }, { "entropy": 5.91282377243042, "epoch": 0.15465898174831894, "grad_norm": 1.015625, "learning_rate": 0.0004999612527122049, "loss": 5.8941, "mean_token_accuracy": 0.15461272597312928, "num_tokens": 3691095.0, "step": 1610 }, { "entropy": 5.826972103118896, "epoch": 0.15513928914505284, "grad_norm": 0.87109375, "learning_rate": 0.0004999606138745402, "loss": 5.8562, "mean_token_accuracy": 0.16407538801431656, "num_tokens": 3703426.0, "step": 1615 }, { "entropy": 5.967412042617798, "epoch": 0.15561959654178675, "grad_norm": 1.0, "learning_rate": 0.0004999599698139581, "loss": 5.9309, "mean_token_accuracy": 0.1637990355491638, "num_tokens": 3715429.0, "step": 1620 }, { "entropy": 5.932253503799439, "epoch": 0.15609990393852066, "grad_norm": 1.03125, "learning_rate": 0.0004999593205304734, "loss": 5.909, "mean_token_accuracy": 0.15584128946065903, "num_tokens": 3726327.0, "step": 1625 }, { "entropy": 5.9037374496459964, "epoch": 0.15658021133525457, "grad_norm": 1.0, "learning_rate": 0.0004999586660241012, "loss": 5.8582, "mean_token_accuracy": 0.1553866222500801, "num_tokens": 3736818.0, "step": 1630 }, { "entropy": 5.929326868057251, "epoch": 0.15706051873198848, "grad_norm": 0.9921875, "learning_rate": 0.0004999580062948569, "loss": 5.8583, "mean_token_accuracy": 0.16254822611808778, "num_tokens": 3747776.0, "step": 1635 }, { "entropy": 5.7625970363616945, "epoch": 0.1575408261287224, "grad_norm": 1.0, "learning_rate": 0.0004999573413427556, "loss": 5.7301, "mean_token_accuracy": 0.164338056743145, "num_tokens": 3758990.0, "step": 1640 }, { "entropy": 5.8398857593536375, "epoch": 0.1580211335254563, "grad_norm": 0.9609375, "learning_rate": 0.0004999566711678128, "loss": 5.7961, "mean_token_accuracy": 0.1605479434132576, "num_tokens": 3769686.0, "step": 1645 }, { "entropy": 5.867894649505615, "epoch": 0.1585014409221902, "grad_norm": 0.92578125, "learning_rate": 0.0004999559957700442, "loss": 5.8554, "mean_token_accuracy": 0.16354380249977113, "num_tokens": 3781815.0, "step": 1650 }, { "entropy": 5.88207426071167, "epoch": 0.15898174831892412, "grad_norm": 0.99609375, "learning_rate": 0.0004999553151494653, "loss": 5.9139, "mean_token_accuracy": 0.15942219495773316, "num_tokens": 3793392.0, "step": 1655 }, { "entropy": 5.860579538345337, "epoch": 0.15946205571565802, "grad_norm": 1.015625, "learning_rate": 0.0004999546293060919, "loss": 5.8298, "mean_token_accuracy": 0.16041782200336457, "num_tokens": 3804974.0, "step": 1660 }, { "entropy": 5.799793004989624, "epoch": 0.15994236311239193, "grad_norm": 0.953125, "learning_rate": 0.00049995393823994, "loss": 5.7028, "mean_token_accuracy": 0.17192372530698777, "num_tokens": 3817166.0, "step": 1665 }, { "entropy": 5.849306297302246, "epoch": 0.16042267050912584, "grad_norm": 1.03125, "learning_rate": 0.0004999532419510255, "loss": 5.8307, "mean_token_accuracy": 0.1580624461174011, "num_tokens": 3828151.0, "step": 1670 }, { "entropy": 5.847281789779663, "epoch": 0.16090297790585975, "grad_norm": 0.97265625, "learning_rate": 0.000499952540439365, "loss": 5.8283, "mean_token_accuracy": 0.16032543033361435, "num_tokens": 3839439.0, "step": 1675 }, { "entropy": 5.906755828857422, "epoch": 0.16138328530259366, "grad_norm": 0.95703125, "learning_rate": 0.0004999518337049743, "loss": 5.8813, "mean_token_accuracy": 0.15963228195905685, "num_tokens": 3851694.0, "step": 1680 }, { "entropy": 5.831542205810547, "epoch": 0.16186359269932757, "grad_norm": 0.91015625, "learning_rate": 0.00049995112174787, "loss": 5.8589, "mean_token_accuracy": 0.15917099863290787, "num_tokens": 3863593.0, "step": 1685 }, { "entropy": 5.811672306060791, "epoch": 0.16234390009606148, "grad_norm": 0.95703125, "learning_rate": 0.0004999504045680687, "loss": 5.7935, "mean_token_accuracy": 0.1701650395989418, "num_tokens": 3874588.0, "step": 1690 }, { "entropy": 5.894420862197876, "epoch": 0.1628242074927954, "grad_norm": 1.046875, "learning_rate": 0.0004999496821655869, "loss": 5.8753, "mean_token_accuracy": 0.16022350043058395, "num_tokens": 3884662.0, "step": 1695 }, { "entropy": 5.956241655349731, "epoch": 0.1633045148895293, "grad_norm": 0.890625, "learning_rate": 0.0004999489545404414, "loss": 5.9739, "mean_token_accuracy": 0.15092033073306083, "num_tokens": 3896569.0, "step": 1700 }, { "entropy": 5.943658018112183, "epoch": 0.1637848222862632, "grad_norm": 0.8984375, "learning_rate": 0.0004999482216926493, "loss": 5.8162, "mean_token_accuracy": 0.1632000833749771, "num_tokens": 3907691.0, "step": 1705 }, { "entropy": 5.843317651748658, "epoch": 0.1642651296829971, "grad_norm": 1.09375, "learning_rate": 0.0004999474836222273, "loss": 5.83, "mean_token_accuracy": 0.1665841408073902, "num_tokens": 3918794.0, "step": 1710 }, { "entropy": 5.834485340118408, "epoch": 0.16474543707973102, "grad_norm": 0.94140625, "learning_rate": 0.0004999467403291928, "loss": 5.8301, "mean_token_accuracy": 0.1692491739988327, "num_tokens": 3929773.0, "step": 1715 }, { "entropy": 5.874946594238281, "epoch": 0.16522574447646493, "grad_norm": 1.0625, "learning_rate": 0.0004999459918135628, "loss": 5.8498, "mean_token_accuracy": 0.16062923073768615, "num_tokens": 3940264.0, "step": 1720 }, { "entropy": 5.791439247131348, "epoch": 0.16570605187319884, "grad_norm": 1.0078125, "learning_rate": 0.000499945238075355, "loss": 5.7456, "mean_token_accuracy": 0.1693306788802147, "num_tokens": 3951500.0, "step": 1725 }, { "entropy": 5.851829910278321, "epoch": 0.16618635926993275, "grad_norm": 1.046875, "learning_rate": 0.0004999444791145865, "loss": 5.8145, "mean_token_accuracy": 0.16588351577520372, "num_tokens": 3963580.0, "step": 1730 }, { "entropy": 5.804158353805542, "epoch": 0.16666666666666666, "grad_norm": 0.9375, "learning_rate": 0.0004999437149312754, "loss": 5.7585, "mean_token_accuracy": 0.17176578491926192, "num_tokens": 3975994.0, "step": 1735 }, { "entropy": 5.836318635940552, "epoch": 0.16714697406340057, "grad_norm": 1.015625, "learning_rate": 0.000499942945525439, "loss": 5.7658, "mean_token_accuracy": 0.15896687656641006, "num_tokens": 3987897.0, "step": 1740 }, { "entropy": 5.888211059570312, "epoch": 0.16762728146013448, "grad_norm": 1.03125, "learning_rate": 0.0004999421708970954, "loss": 5.93, "mean_token_accuracy": 0.15537445321679116, "num_tokens": 3999829.0, "step": 1745 }, { "entropy": 5.7658594131469725, "epoch": 0.16810758885686838, "grad_norm": 0.9765625, "learning_rate": 0.0004999413910462625, "loss": 5.7591, "mean_token_accuracy": 0.16620118021965027, "num_tokens": 4010882.0, "step": 1750 }, { "entropy": 5.861884737014771, "epoch": 0.1685878962536023, "grad_norm": 0.9453125, "learning_rate": 0.0004999406059729586, "loss": 5.7469, "mean_token_accuracy": 0.17034892737865448, "num_tokens": 4021423.0, "step": 1755 }, { "entropy": 5.888075494766236, "epoch": 0.1690682036503362, "grad_norm": 0.921875, "learning_rate": 0.0004999398156772016, "loss": 5.8931, "mean_token_accuracy": 0.15374189764261245, "num_tokens": 4033590.0, "step": 1760 }, { "entropy": 5.721970653533935, "epoch": 0.16954851104707014, "grad_norm": 1.078125, "learning_rate": 0.00049993902015901, "loss": 5.7562, "mean_token_accuracy": 0.16655992865562438, "num_tokens": 4043978.0, "step": 1765 }, { "entropy": 5.931190156936646, "epoch": 0.17002881844380405, "grad_norm": 1.0703125, "learning_rate": 0.0004999382194184023, "loss": 5.8756, "mean_token_accuracy": 0.16273052543401717, "num_tokens": 4054513.0, "step": 1770 }, { "entropy": 5.857993745803833, "epoch": 0.17050912584053796, "grad_norm": 0.9375, "learning_rate": 0.0004999374134553972, "loss": 5.8367, "mean_token_accuracy": 0.16276317089796066, "num_tokens": 4066019.0, "step": 1775 }, { "entropy": 5.841061735153199, "epoch": 0.17098943323727187, "grad_norm": 0.93359375, "learning_rate": 0.0004999366022700131, "loss": 5.7935, "mean_token_accuracy": 0.1673088401556015, "num_tokens": 4077688.0, "step": 1780 }, { "entropy": 5.860415935516357, "epoch": 0.17146974063400577, "grad_norm": 0.9765625, "learning_rate": 0.0004999357858622691, "loss": 5.8573, "mean_token_accuracy": 0.1664716601371765, "num_tokens": 4089803.0, "step": 1785 }, { "entropy": 5.8289069652557375, "epoch": 0.17195004803073968, "grad_norm": 0.91796875, "learning_rate": 0.0004999349642321842, "loss": 5.8073, "mean_token_accuracy": 0.16912547051906585, "num_tokens": 4101969.0, "step": 1790 }, { "entropy": 5.799117517471314, "epoch": 0.1724303554274736, "grad_norm": 0.99609375, "learning_rate": 0.0004999341373797772, "loss": 5.7955, "mean_token_accuracy": 0.15957102179527283, "num_tokens": 4113567.0, "step": 1795 }, { "entropy": 5.814974451065064, "epoch": 0.1729106628242075, "grad_norm": 0.9921875, "learning_rate": 0.0004999333053050675, "loss": 5.7575, "mean_token_accuracy": 0.1691056177020073, "num_tokens": 4125191.0, "step": 1800 }, { "entropy": 5.827954626083374, "epoch": 0.1733909702209414, "grad_norm": 1.140625, "learning_rate": 0.0004999324680080744, "loss": 5.8004, "mean_token_accuracy": 0.16687883883714677, "num_tokens": 4135050.0, "step": 1805 }, { "entropy": 5.842863750457764, "epoch": 0.17387127761767532, "grad_norm": 0.93359375, "learning_rate": 0.0004999316254888172, "loss": 5.8736, "mean_token_accuracy": 0.1648238182067871, "num_tokens": 4146874.0, "step": 1810 }, { "entropy": 5.857775688171387, "epoch": 0.17435158501440923, "grad_norm": 0.93359375, "learning_rate": 0.0004999307777473157, "loss": 5.7974, "mean_token_accuracy": 0.16151650995016098, "num_tokens": 4158118.0, "step": 1815 }, { "entropy": 5.818978691101075, "epoch": 0.17483189241114314, "grad_norm": 1.171875, "learning_rate": 0.0004999299247835893, "loss": 5.7561, "mean_token_accuracy": 0.17479462176561356, "num_tokens": 4169035.0, "step": 1820 }, { "entropy": 5.738432455062866, "epoch": 0.17531219980787704, "grad_norm": 1.03125, "learning_rate": 0.000499929066597658, "loss": 5.745, "mean_token_accuracy": 0.17148349434137344, "num_tokens": 4180314.0, "step": 1825 }, { "entropy": 5.883955717086792, "epoch": 0.17579250720461095, "grad_norm": 1.046875, "learning_rate": 0.0004999282031895418, "loss": 5.8239, "mean_token_accuracy": 0.16614590883255004, "num_tokens": 4192238.0, "step": 1830 }, { "entropy": 5.769097232818604, "epoch": 0.17627281460134486, "grad_norm": 1.0078125, "learning_rate": 0.0004999273345592604, "loss": 5.756, "mean_token_accuracy": 0.16652164459228516, "num_tokens": 4203346.0, "step": 1835 }, { "entropy": 5.811061954498291, "epoch": 0.17675312199807877, "grad_norm": 0.96875, "learning_rate": 0.0004999264607068343, "loss": 5.8159, "mean_token_accuracy": 0.17016567289829254, "num_tokens": 4213763.0, "step": 1840 }, { "entropy": 5.781940555572509, "epoch": 0.17723342939481268, "grad_norm": 0.90234375, "learning_rate": 0.0004999255816322837, "loss": 5.7699, "mean_token_accuracy": 0.16876950412988662, "num_tokens": 4225553.0, "step": 1845 }, { "entropy": 5.857665061950684, "epoch": 0.1777137367915466, "grad_norm": 0.99609375, "learning_rate": 0.000499924697335629, "loss": 5.702, "mean_token_accuracy": 0.17350574135780333, "num_tokens": 4236058.0, "step": 1850 }, { "entropy": 5.640166330337524, "epoch": 0.1781940441882805, "grad_norm": 0.92578125, "learning_rate": 0.0004999238078168906, "loss": 5.7763, "mean_token_accuracy": 0.17054813206195832, "num_tokens": 4248299.0, "step": 1855 }, { "entropy": 5.8273721694946286, "epoch": 0.1786743515850144, "grad_norm": 0.94921875, "learning_rate": 0.0004999229130760894, "loss": 5.7052, "mean_token_accuracy": 0.17111807465553283, "num_tokens": 4259704.0, "step": 1860 }, { "entropy": 5.691127586364746, "epoch": 0.17915465898174832, "grad_norm": 1.0, "learning_rate": 0.000499922013113246, "loss": 5.587, "mean_token_accuracy": 0.18398697525262833, "num_tokens": 4270480.0, "step": 1865 }, { "entropy": 5.780127954483032, "epoch": 0.17963496637848222, "grad_norm": 1.0234375, "learning_rate": 0.0004999211079283814, "loss": 5.8538, "mean_token_accuracy": 0.16719998568296432, "num_tokens": 4282104.0, "step": 1870 }, { "entropy": 5.849603605270386, "epoch": 0.18011527377521613, "grad_norm": 0.93359375, "learning_rate": 0.0004999201975215164, "loss": 5.8172, "mean_token_accuracy": 0.16666848957538605, "num_tokens": 4294251.0, "step": 1875 }, { "entropy": 5.757232236862182, "epoch": 0.18059558117195004, "grad_norm": 0.95703125, "learning_rate": 0.0004999192818926725, "loss": 5.7017, "mean_token_accuracy": 0.16847867369651795, "num_tokens": 4305569.0, "step": 1880 }, { "entropy": 5.859993028640747, "epoch": 0.18107588856868395, "grad_norm": 1.09375, "learning_rate": 0.0004999183610418706, "loss": 5.8283, "mean_token_accuracy": 0.16413767859339715, "num_tokens": 4317845.0, "step": 1885 }, { "entropy": 5.76594557762146, "epoch": 0.18155619596541786, "grad_norm": 0.90625, "learning_rate": 0.0004999174349691322, "loss": 5.6959, "mean_token_accuracy": 0.17179392874240876, "num_tokens": 4329987.0, "step": 1890 }, { "entropy": 5.697657203674316, "epoch": 0.18203650336215177, "grad_norm": 0.88671875, "learning_rate": 0.0004999165036744788, "loss": 5.7257, "mean_token_accuracy": 0.16847490072250365, "num_tokens": 4341628.0, "step": 1895 }, { "entropy": 5.861244201660156, "epoch": 0.18251681075888568, "grad_norm": 1.046875, "learning_rate": 0.0004999155671579322, "loss": 5.7851, "mean_token_accuracy": 0.1615397110581398, "num_tokens": 4352379.0, "step": 1900 }, { "entropy": 5.6849024295806885, "epoch": 0.1829971181556196, "grad_norm": 1.0234375, "learning_rate": 0.000499914625419514, "loss": 5.7181, "mean_token_accuracy": 0.171738800406456, "num_tokens": 4364800.0, "step": 1905 }, { "entropy": 5.776795959472656, "epoch": 0.1834774255523535, "grad_norm": 1.0859375, "learning_rate": 0.0004999136784592459, "loss": 5.7315, "mean_token_accuracy": 0.16872817426919937, "num_tokens": 4376048.0, "step": 1910 }, { "entropy": 5.730347061157227, "epoch": 0.1839577329490874, "grad_norm": 0.921875, "learning_rate": 0.0004999127262771502, "loss": 5.7297, "mean_token_accuracy": 0.16825871765613556, "num_tokens": 4388072.0, "step": 1915 }, { "entropy": 5.872533082962036, "epoch": 0.1844380403458213, "grad_norm": 1.046875, "learning_rate": 0.0004999117688732487, "loss": 5.8226, "mean_token_accuracy": 0.16391085535287858, "num_tokens": 4399843.0, "step": 1920 }, { "entropy": 5.713910245895386, "epoch": 0.18491834774255522, "grad_norm": 1.015625, "learning_rate": 0.0004999108062475638, "loss": 5.6757, "mean_token_accuracy": 0.17384760677814484, "num_tokens": 4411373.0, "step": 1925 }, { "entropy": 5.716005563735962, "epoch": 0.18539865513928913, "grad_norm": 1.03125, "learning_rate": 0.000499909838400118, "loss": 5.6614, "mean_token_accuracy": 0.173922398686409, "num_tokens": 4421857.0, "step": 1930 }, { "entropy": 5.820113229751587, "epoch": 0.18587896253602307, "grad_norm": 1.0078125, "learning_rate": 0.0004999088653309334, "loss": 5.7618, "mean_token_accuracy": 0.1711716189980507, "num_tokens": 4432728.0, "step": 1935 }, { "entropy": 5.708466053009033, "epoch": 0.18635926993275698, "grad_norm": 0.9375, "learning_rate": 0.0004999078870400329, "loss": 5.693, "mean_token_accuracy": 0.17283684760332108, "num_tokens": 4444683.0, "step": 1940 }, { "entropy": 5.8614743709564205, "epoch": 0.18683957732949089, "grad_norm": 0.953125, "learning_rate": 0.0004999069035274391, "loss": 5.8215, "mean_token_accuracy": 0.16018551886081694, "num_tokens": 4456961.0, "step": 1945 }, { "entropy": 5.694478511810303, "epoch": 0.1873198847262248, "grad_norm": 0.9140625, "learning_rate": 0.0004999059147931747, "loss": 5.665, "mean_token_accuracy": 0.1762719616293907, "num_tokens": 4468424.0, "step": 1950 }, { "entropy": 5.791493558883667, "epoch": 0.1878001921229587, "grad_norm": 0.94921875, "learning_rate": 0.0004999049208372629, "loss": 5.8694, "mean_token_accuracy": 0.15364666059613227, "num_tokens": 4479813.0, "step": 1955 }, { "entropy": 5.952554082870483, "epoch": 0.1882804995196926, "grad_norm": 1.03125, "learning_rate": 0.0004999039216597267, "loss": 5.862, "mean_token_accuracy": 0.16733278185129166, "num_tokens": 4491172.0, "step": 1960 }, { "entropy": 5.706536293029785, "epoch": 0.18876080691642652, "grad_norm": 0.92578125, "learning_rate": 0.0004999029172605892, "loss": 5.7439, "mean_token_accuracy": 0.1704375624656677, "num_tokens": 4503063.0, "step": 1965 }, { "entropy": 5.889812326431274, "epoch": 0.18924111431316043, "grad_norm": 0.91796875, "learning_rate": 0.0004999019076398738, "loss": 5.8177, "mean_token_accuracy": 0.15313875377178193, "num_tokens": 4514188.0, "step": 1970 }, { "entropy": 5.822384834289551, "epoch": 0.18972142170989434, "grad_norm": 0.95703125, "learning_rate": 0.000499900892797604, "loss": 5.7258, "mean_token_accuracy": 0.17310872822999954, "num_tokens": 4525293.0, "step": 1975 }, { "entropy": 5.80044903755188, "epoch": 0.19020172910662825, "grad_norm": 1.046875, "learning_rate": 0.0004998998727338031, "loss": 5.8139, "mean_token_accuracy": 0.1692732721567154, "num_tokens": 4536589.0, "step": 1980 }, { "entropy": 5.689789342880249, "epoch": 0.19068203650336216, "grad_norm": 0.98828125, "learning_rate": 0.0004998988474484952, "loss": 5.5648, "mean_token_accuracy": 0.19031796902418135, "num_tokens": 4547594.0, "step": 1985 }, { "entropy": 5.717133808135986, "epoch": 0.19116234390009607, "grad_norm": 0.90625, "learning_rate": 0.0004998978169417038, "loss": 5.78, "mean_token_accuracy": 0.1743384450674057, "num_tokens": 4559850.0, "step": 1990 }, { "entropy": 5.791743421554566, "epoch": 0.19164265129682997, "grad_norm": 1.0546875, "learning_rate": 0.0004998967812134529, "loss": 5.7138, "mean_token_accuracy": 0.17110339552164078, "num_tokens": 4570727.0, "step": 1995 }, { "entropy": 5.610540056228638, "epoch": 0.19212295869356388, "grad_norm": 0.99609375, "learning_rate": 0.0004998957402637664, "loss": 5.6542, "mean_token_accuracy": 0.17157155871391297, "num_tokens": 4582248.0, "step": 2000 }, { "entropy": 5.801579093933105, "epoch": 0.1926032660902978, "grad_norm": 1.1484375, "learning_rate": 0.0004998946940926687, "loss": 5.6973, "mean_token_accuracy": 0.17121600955724717, "num_tokens": 4592604.0, "step": 2005 }, { "entropy": 5.661766576766968, "epoch": 0.1930835734870317, "grad_norm": 1.015625, "learning_rate": 0.000499893642700184, "loss": 5.7182, "mean_token_accuracy": 0.17020188719034196, "num_tokens": 4604398.0, "step": 2010 }, { "entropy": 5.790825366973877, "epoch": 0.1935638808837656, "grad_norm": 0.921875, "learning_rate": 0.0004998925860863368, "loss": 5.7931, "mean_token_accuracy": 0.1685462474822998, "num_tokens": 4616434.0, "step": 2015 }, { "entropy": 5.820285224914551, "epoch": 0.19404418828049952, "grad_norm": 0.9296875, "learning_rate": 0.0004998915242511516, "loss": 5.7541, "mean_token_accuracy": 0.17625110745429992, "num_tokens": 4627577.0, "step": 2020 }, { "entropy": 5.7781401634216305, "epoch": 0.19452449567723343, "grad_norm": 1.0390625, "learning_rate": 0.0004998904571946528, "loss": 5.817, "mean_token_accuracy": 0.16743545606732368, "num_tokens": 4639698.0, "step": 2025 }, { "entropy": 5.838766145706177, "epoch": 0.19500480307396734, "grad_norm": 0.99609375, "learning_rate": 0.0004998893849168655, "loss": 5.8269, "mean_token_accuracy": 0.16433341503143312, "num_tokens": 4650643.0, "step": 2030 }, { "entropy": 5.762656116485596, "epoch": 0.19548511047070125, "grad_norm": 0.93359375, "learning_rate": 0.0004998883074178144, "loss": 5.7427, "mean_token_accuracy": 0.16878412663936615, "num_tokens": 4662897.0, "step": 2035 }, { "entropy": 5.818380117416382, "epoch": 0.19596541786743515, "grad_norm": 0.98828125, "learning_rate": 0.0004998872246975247, "loss": 5.8217, "mean_token_accuracy": 0.1706990644335747, "num_tokens": 4673701.0, "step": 2040 }, { "entropy": 5.910197305679321, "epoch": 0.19644572526416906, "grad_norm": 0.97265625, "learning_rate": 0.0004998861367560213, "loss": 5.7826, "mean_token_accuracy": 0.16689348816871644, "num_tokens": 4685873.0, "step": 2045 }, { "entropy": 5.714930677413941, "epoch": 0.19692603266090297, "grad_norm": 0.97265625, "learning_rate": 0.0004998850435933296, "loss": 5.6724, "mean_token_accuracy": 0.17364383190870286, "num_tokens": 4697179.0, "step": 2050 }, { "entropy": 5.752671766281128, "epoch": 0.19740634005763688, "grad_norm": 1.0390625, "learning_rate": 0.0004998839452094749, "loss": 5.7084, "mean_token_accuracy": 0.17288116365671158, "num_tokens": 4707752.0, "step": 2055 }, { "entropy": 5.625265073776245, "epoch": 0.1978866474543708, "grad_norm": 1.03125, "learning_rate": 0.0004998828416044829, "loss": 5.58, "mean_token_accuracy": 0.17766032367944717, "num_tokens": 4718413.0, "step": 2060 }, { "entropy": 5.750666522979737, "epoch": 0.1983669548511047, "grad_norm": 1.03125, "learning_rate": 0.000499881732778379, "loss": 5.7696, "mean_token_accuracy": 0.16185117661952972, "num_tokens": 4730033.0, "step": 2065 }, { "entropy": 5.668474435806274, "epoch": 0.1988472622478386, "grad_norm": 0.91015625, "learning_rate": 0.000499880618731189, "loss": 5.6346, "mean_token_accuracy": 0.17201206237077712, "num_tokens": 4742084.0, "step": 2070 }, { "entropy": 5.801948118209839, "epoch": 0.19932756964457252, "grad_norm": 0.98046875, "learning_rate": 0.0004998794994629388, "loss": 5.8485, "mean_token_accuracy": 0.16415513008832933, "num_tokens": 4753885.0, "step": 2075 }, { "entropy": 5.755141353607177, "epoch": 0.19980787704130643, "grad_norm": 1.0, "learning_rate": 0.0004998783749736545, "loss": 5.6852, "mean_token_accuracy": 0.17273288518190383, "num_tokens": 4765686.0, "step": 2080 }, { "entropy": 5.7318039894104, "epoch": 0.20028818443804033, "grad_norm": 0.96875, "learning_rate": 0.0004998772452633619, "loss": 5.7343, "mean_token_accuracy": 0.1667577311396599, "num_tokens": 4777157.0, "step": 2085 }, { "entropy": 5.734004545211792, "epoch": 0.20076849183477424, "grad_norm": 1.0078125, "learning_rate": 0.0004998761103320876, "loss": 5.6803, "mean_token_accuracy": 0.17569620162248611, "num_tokens": 4788583.0, "step": 2090 }, { "entropy": 5.81385350227356, "epoch": 0.20124879923150815, "grad_norm": 0.94140625, "learning_rate": 0.0004998749701798577, "loss": 5.795, "mean_token_accuracy": 0.164644692838192, "num_tokens": 4800749.0, "step": 2095 }, { "entropy": 5.652225208282471, "epoch": 0.2017291066282421, "grad_norm": 0.96875, "learning_rate": 0.0004998738248066986, "loss": 5.7001, "mean_token_accuracy": 0.17118856757879258, "num_tokens": 4812488.0, "step": 2100 }, { "entropy": 5.816308832168579, "epoch": 0.202209414024976, "grad_norm": 1.0859375, "learning_rate": 0.0004998726742126372, "loss": 5.6902, "mean_token_accuracy": 0.17228334546089172, "num_tokens": 4823495.0, "step": 2105 }, { "entropy": 5.622010517120361, "epoch": 0.2026897214217099, "grad_norm": 1.046875, "learning_rate": 0.0004998715183976999, "loss": 5.726, "mean_token_accuracy": 0.16997579634189605, "num_tokens": 4834450.0, "step": 2110 }, { "entropy": 5.763468551635742, "epoch": 0.20317002881844382, "grad_norm": 0.91796875, "learning_rate": 0.0004998703573619137, "loss": 5.6443, "mean_token_accuracy": 0.18120874017477034, "num_tokens": 4846826.0, "step": 2115 }, { "entropy": 5.804740762710571, "epoch": 0.20365033621517772, "grad_norm": 0.9296875, "learning_rate": 0.0004998691911053056, "loss": 5.8366, "mean_token_accuracy": 0.15913107842206956, "num_tokens": 4859668.0, "step": 2120 }, { "entropy": 5.727064418792724, "epoch": 0.20413064361191163, "grad_norm": 1.0546875, "learning_rate": 0.0004998680196279026, "loss": 5.7049, "mean_token_accuracy": 0.17213667631149293, "num_tokens": 4871727.0, "step": 2125 }, { "entropy": 5.794467830657959, "epoch": 0.20461095100864554, "grad_norm": 1.015625, "learning_rate": 0.0004998668429297319, "loss": 5.7674, "mean_token_accuracy": 0.17240212336182595, "num_tokens": 4882191.0, "step": 2130 }, { "entropy": 5.760322952270508, "epoch": 0.20509125840537945, "grad_norm": 1.078125, "learning_rate": 0.0004998656610108208, "loss": 5.6971, "mean_token_accuracy": 0.1685373991727829, "num_tokens": 4892416.0, "step": 2135 }, { "entropy": 5.694274854660034, "epoch": 0.20557156580211336, "grad_norm": 1.03125, "learning_rate": 0.0004998644738711969, "loss": 5.6674, "mean_token_accuracy": 0.1685459852218628, "num_tokens": 4903572.0, "step": 2140 }, { "entropy": 5.810105037689209, "epoch": 0.20605187319884727, "grad_norm": 0.875, "learning_rate": 0.0004998632815108874, "loss": 5.763, "mean_token_accuracy": 0.16395961344242097, "num_tokens": 4915417.0, "step": 2145 }, { "entropy": 5.73304591178894, "epoch": 0.20653218059558118, "grad_norm": 1.0234375, "learning_rate": 0.0004998620839299203, "loss": 5.6495, "mean_token_accuracy": 0.17259960770606994, "num_tokens": 4926943.0, "step": 2150 }, { "entropy": 5.6710865020751955, "epoch": 0.2070124879923151, "grad_norm": 1.0234375, "learning_rate": 0.0004998608811283233, "loss": 5.6095, "mean_token_accuracy": 0.17803010493516921, "num_tokens": 4937724.0, "step": 2155 }, { "entropy": 5.7808784484863285, "epoch": 0.207492795389049, "grad_norm": 0.984375, "learning_rate": 0.0004998596731061244, "loss": 5.7756, "mean_token_accuracy": 0.16368448734283447, "num_tokens": 4949970.0, "step": 2160 }, { "entropy": 5.784394645690918, "epoch": 0.2079731027857829, "grad_norm": 0.9765625, "learning_rate": 0.0004998584598633516, "loss": 5.774, "mean_token_accuracy": 0.16977567672729493, "num_tokens": 4961389.0, "step": 2165 }, { "entropy": 5.7822630405426025, "epoch": 0.2084534101825168, "grad_norm": 1.015625, "learning_rate": 0.0004998572414000329, "loss": 5.82, "mean_token_accuracy": 0.16696709543466567, "num_tokens": 4973888.0, "step": 2170 }, { "entropy": 5.75656681060791, "epoch": 0.20893371757925072, "grad_norm": 1.03125, "learning_rate": 0.0004998560177161969, "loss": 5.7667, "mean_token_accuracy": 0.1604086473584175, "num_tokens": 4985423.0, "step": 2175 }, { "entropy": 5.70469822883606, "epoch": 0.20941402497598463, "grad_norm": 0.93359375, "learning_rate": 0.0004998547888118718, "loss": 5.726, "mean_token_accuracy": 0.16619897931814193, "num_tokens": 4997711.0, "step": 2180 }, { "entropy": 5.7725687503814695, "epoch": 0.20989433237271854, "grad_norm": 0.97265625, "learning_rate": 0.0004998535546870862, "loss": 5.7454, "mean_token_accuracy": 0.1679087519645691, "num_tokens": 5009633.0, "step": 2185 }, { "entropy": 5.739374876022339, "epoch": 0.21037463976945245, "grad_norm": 1.0234375, "learning_rate": 0.0004998523153418687, "loss": 5.6759, "mean_token_accuracy": 0.17375072985887527, "num_tokens": 5021523.0, "step": 2190 }, { "entropy": 5.785361337661743, "epoch": 0.21085494716618636, "grad_norm": 1.0078125, "learning_rate": 0.0004998510707762481, "loss": 5.7695, "mean_token_accuracy": 0.1699072614312172, "num_tokens": 5033513.0, "step": 2195 }, { "entropy": 5.7873194217681885, "epoch": 0.21133525456292027, "grad_norm": 0.90625, "learning_rate": 0.0004998498209902533, "loss": 5.7758, "mean_token_accuracy": 0.16922611892223358, "num_tokens": 5047055.0, "step": 2200 }, { "entropy": 5.707646226882934, "epoch": 0.21181556195965417, "grad_norm": 1.0703125, "learning_rate": 0.0004998485659839134, "loss": 5.6497, "mean_token_accuracy": 0.17682456970214844, "num_tokens": 5057613.0, "step": 2205 }, { "entropy": 5.753945970535279, "epoch": 0.21229586935638808, "grad_norm": 1.015625, "learning_rate": 0.0004998473057572575, "loss": 5.7615, "mean_token_accuracy": 0.16833806186914443, "num_tokens": 5068886.0, "step": 2210 }, { "entropy": 5.742906093597412, "epoch": 0.212776176753122, "grad_norm": 1.0546875, "learning_rate": 0.0004998460403103146, "loss": 5.7494, "mean_token_accuracy": 0.16465574279427528, "num_tokens": 5079978.0, "step": 2215 }, { "entropy": 5.736083173751831, "epoch": 0.2132564841498559, "grad_norm": 1.078125, "learning_rate": 0.0004998447696431146, "loss": 5.7159, "mean_token_accuracy": 0.17075446248054504, "num_tokens": 5091021.0, "step": 2220 }, { "entropy": 5.6740076541900635, "epoch": 0.2137367915465898, "grad_norm": 1.046875, "learning_rate": 0.0004998434937556865, "loss": 5.5988, "mean_token_accuracy": 0.181574647128582, "num_tokens": 5101483.0, "step": 2225 }, { "entropy": 5.708674907684326, "epoch": 0.21421709894332372, "grad_norm": 0.98828125, "learning_rate": 0.0004998422126480602, "loss": 5.7447, "mean_token_accuracy": 0.16306292563676833, "num_tokens": 5113116.0, "step": 2230 }, { "entropy": 5.82704176902771, "epoch": 0.21469740634005763, "grad_norm": 1.0703125, "learning_rate": 0.0004998409263202653, "loss": 5.6819, "mean_token_accuracy": 0.1686948984861374, "num_tokens": 5124824.0, "step": 2235 }, { "entropy": 5.589908075332642, "epoch": 0.21517771373679154, "grad_norm": 1.0, "learning_rate": 0.0004998396347723318, "loss": 5.6335, "mean_token_accuracy": 0.16587817817926406, "num_tokens": 5137567.0, "step": 2240 }, { "entropy": 5.72907018661499, "epoch": 0.21565802113352545, "grad_norm": 0.94921875, "learning_rate": 0.0004998383380042895, "loss": 5.6846, "mean_token_accuracy": 0.16729460805654525, "num_tokens": 5149016.0, "step": 2245 }, { "entropy": 5.6214783668518065, "epoch": 0.21613832853025935, "grad_norm": 1.0390625, "learning_rate": 0.0004998370360161688, "loss": 5.5788, "mean_token_accuracy": 0.17212725132703782, "num_tokens": 5160356.0, "step": 2250 }, { "entropy": 5.79612250328064, "epoch": 0.21661863592699326, "grad_norm": 0.97265625, "learning_rate": 0.0004998357288079996, "loss": 5.7818, "mean_token_accuracy": 0.16184753328561782, "num_tokens": 5172100.0, "step": 2255 }, { "entropy": 5.740008592605591, "epoch": 0.21709894332372717, "grad_norm": 1.046875, "learning_rate": 0.0004998344163798125, "loss": 5.7405, "mean_token_accuracy": 0.16320510655641557, "num_tokens": 5183984.0, "step": 2260 }, { "entropy": 5.707123565673828, "epoch": 0.21757925072046108, "grad_norm": 0.93359375, "learning_rate": 0.0004998330987316379, "loss": 5.7153, "mean_token_accuracy": 0.167342671751976, "num_tokens": 5195853.0, "step": 2265 }, { "entropy": 5.6320737361907955, "epoch": 0.21805955811719502, "grad_norm": 0.99609375, "learning_rate": 0.0004998317758635062, "loss": 5.5593, "mean_token_accuracy": 0.17451774328947067, "num_tokens": 5206995.0, "step": 2270 }, { "entropy": 5.515458297729492, "epoch": 0.21853986551392893, "grad_norm": 0.99609375, "learning_rate": 0.0004998304477754484, "loss": 5.5989, "mean_token_accuracy": 0.17679600268602372, "num_tokens": 5219291.0, "step": 2275 }, { "entropy": 5.740645408630371, "epoch": 0.21902017291066284, "grad_norm": 1.046875, "learning_rate": 0.0004998291144674952, "loss": 5.6885, "mean_token_accuracy": 0.17223394364118577, "num_tokens": 5230856.0, "step": 2280 }, { "entropy": 5.601490020751953, "epoch": 0.21950048030739674, "grad_norm": 0.98046875, "learning_rate": 0.0004998277759396776, "loss": 5.5333, "mean_token_accuracy": 0.1814967930316925, "num_tokens": 5242871.0, "step": 2285 }, { "entropy": 5.656805944442749, "epoch": 0.21998078770413065, "grad_norm": 1.0078125, "learning_rate": 0.0004998264321920265, "loss": 5.64, "mean_token_accuracy": 0.17801354676485062, "num_tokens": 5253835.0, "step": 2290 }, { "entropy": 5.676252794265747, "epoch": 0.22046109510086456, "grad_norm": 0.890625, "learning_rate": 0.0004998250832245734, "loss": 5.6181, "mean_token_accuracy": 0.17702293545007705, "num_tokens": 5266195.0, "step": 2295 }, { "entropy": 5.641697740554809, "epoch": 0.22094140249759847, "grad_norm": 1.0390625, "learning_rate": 0.0004998237290373494, "loss": 5.6002, "mean_token_accuracy": 0.1801271617412567, "num_tokens": 5277499.0, "step": 2300 }, { "entropy": 5.739913368225098, "epoch": 0.22142170989433238, "grad_norm": 0.96875, "learning_rate": 0.000499822369630386, "loss": 5.7231, "mean_token_accuracy": 0.1597047820687294, "num_tokens": 5288622.0, "step": 2305 }, { "entropy": 5.738846015930176, "epoch": 0.2219020172910663, "grad_norm": 1.1484375, "learning_rate": 0.0004998210050037148, "loss": 5.7816, "mean_token_accuracy": 0.16195343434810638, "num_tokens": 5299664.0, "step": 2310 }, { "entropy": 5.717037725448608, "epoch": 0.2223823246878002, "grad_norm": 0.99609375, "learning_rate": 0.0004998196351573674, "loss": 5.6552, "mean_token_accuracy": 0.17402878403663635, "num_tokens": 5311627.0, "step": 2315 }, { "entropy": 5.5637411117553714, "epoch": 0.2228626320845341, "grad_norm": 0.98046875, "learning_rate": 0.0004998182600913757, "loss": 5.5627, "mean_token_accuracy": 0.17947529554367064, "num_tokens": 5323000.0, "step": 2320 }, { "entropy": 5.704880237579346, "epoch": 0.22334293948126802, "grad_norm": 0.98828125, "learning_rate": 0.0004998168798057715, "loss": 5.5992, "mean_token_accuracy": 0.18110302537679673, "num_tokens": 5333811.0, "step": 2325 }, { "entropy": 5.615099573135376, "epoch": 0.22382324687800192, "grad_norm": 1.015625, "learning_rate": 0.000499815494300587, "loss": 5.5991, "mean_token_accuracy": 0.17574110478162766, "num_tokens": 5344762.0, "step": 2330 }, { "entropy": 5.721481513977051, "epoch": 0.22430355427473583, "grad_norm": 1.0625, "learning_rate": 0.0004998141035758542, "loss": 5.6195, "mean_token_accuracy": 0.17343118488788606, "num_tokens": 5356112.0, "step": 2335 }, { "entropy": 5.655849504470825, "epoch": 0.22478386167146974, "grad_norm": 1.171875, "learning_rate": 0.0004998127076316054, "loss": 5.7311, "mean_token_accuracy": 0.17190437763929367, "num_tokens": 5367339.0, "step": 2340 }, { "entropy": 5.674526071548462, "epoch": 0.22526416906820365, "grad_norm": 0.99609375, "learning_rate": 0.0004998113064678734, "loss": 5.6665, "mean_token_accuracy": 0.17564141601324082, "num_tokens": 5378627.0, "step": 2345 }, { "entropy": 5.726110649108887, "epoch": 0.22574447646493756, "grad_norm": 1.0234375, "learning_rate": 0.0004998099000846901, "loss": 5.7012, "mean_token_accuracy": 0.1681268870830536, "num_tokens": 5390209.0, "step": 2350 }, { "entropy": 5.734390020370483, "epoch": 0.22622478386167147, "grad_norm": 1.1640625, "learning_rate": 0.0004998084884820887, "loss": 5.6833, "mean_token_accuracy": 0.17136491537094117, "num_tokens": 5401578.0, "step": 2355 }, { "entropy": 5.615032052993774, "epoch": 0.22670509125840538, "grad_norm": 1.0, "learning_rate": 0.0004998070716601016, "loss": 5.5881, "mean_token_accuracy": 0.17977205514907837, "num_tokens": 5413831.0, "step": 2360 }, { "entropy": 5.722073316574097, "epoch": 0.2271853986551393, "grad_norm": 1.0390625, "learning_rate": 0.0004998056496187618, "loss": 5.6496, "mean_token_accuracy": 0.1711253985762596, "num_tokens": 5425430.0, "step": 2365 }, { "entropy": 5.49839334487915, "epoch": 0.2276657060518732, "grad_norm": 1.0, "learning_rate": 0.0004998042223581025, "loss": 5.4985, "mean_token_accuracy": 0.1870403528213501, "num_tokens": 5435353.0, "step": 2370 }, { "entropy": 5.7514622688293455, "epoch": 0.2281460134486071, "grad_norm": 0.97265625, "learning_rate": 0.0004998027898781565, "loss": 5.6991, "mean_token_accuracy": 0.17083023190498353, "num_tokens": 5446925.0, "step": 2375 }, { "entropy": 5.589994049072265, "epoch": 0.228626320845341, "grad_norm": 1.046875, "learning_rate": 0.0004998013521789574, "loss": 5.5899, "mean_token_accuracy": 0.1772562175989151, "num_tokens": 5456613.0, "step": 2380 }, { "entropy": 5.697564649581909, "epoch": 0.22910662824207492, "grad_norm": 1.1015625, "learning_rate": 0.0004997999092605384, "loss": 5.6209, "mean_token_accuracy": 0.17314212173223495, "num_tokens": 5467790.0, "step": 2385 }, { "entropy": 5.672542333602905, "epoch": 0.22958693563880883, "grad_norm": 1.0, "learning_rate": 0.000499798461122933, "loss": 5.6065, "mean_token_accuracy": 0.17598363608121873, "num_tokens": 5479166.0, "step": 2390 }, { "entropy": 5.594286203384399, "epoch": 0.23006724303554274, "grad_norm": 0.99609375, "learning_rate": 0.0004997970077661748, "loss": 5.5932, "mean_token_accuracy": 0.18340873271226882, "num_tokens": 5490186.0, "step": 2395 }, { "entropy": 5.690382814407348, "epoch": 0.23054755043227665, "grad_norm": 1.03125, "learning_rate": 0.0004997955491902977, "loss": 5.5575, "mean_token_accuracy": 0.1718940794467926, "num_tokens": 5500416.0, "step": 2400 }, { "entropy": 5.582558584213257, "epoch": 0.23102785782901056, "grad_norm": 1.0390625, "learning_rate": 0.0004997940853953354, "loss": 5.6489, "mean_token_accuracy": 0.17370383739471434, "num_tokens": 5512189.0, "step": 2405 }, { "entropy": 5.628128719329834, "epoch": 0.23150816522574447, "grad_norm": 0.96484375, "learning_rate": 0.000499792616381322, "loss": 5.5142, "mean_token_accuracy": 0.1828036591410637, "num_tokens": 5523631.0, "step": 2410 }, { "entropy": 5.609222555160523, "epoch": 0.23198847262247838, "grad_norm": 0.96875, "learning_rate": 0.0004997911421482914, "loss": 5.5763, "mean_token_accuracy": 0.1823565348982811, "num_tokens": 5535637.0, "step": 2415 }, { "entropy": 5.639013814926147, "epoch": 0.23246878001921228, "grad_norm": 1.0078125, "learning_rate": 0.000499789662696278, "loss": 5.5869, "mean_token_accuracy": 0.18035637438297272, "num_tokens": 5546470.0, "step": 2420 }, { "entropy": 5.694498586654663, "epoch": 0.2329490874159462, "grad_norm": 0.95703125, "learning_rate": 0.0004997881780253162, "loss": 5.7456, "mean_token_accuracy": 0.1703657627105713, "num_tokens": 5558633.0, "step": 2425 }, { "entropy": 5.6558629989624025, "epoch": 0.2334293948126801, "grad_norm": 0.875, "learning_rate": 0.0004997866881354403, "loss": 5.6547, "mean_token_accuracy": 0.17033104449510575, "num_tokens": 5570427.0, "step": 2430 }, { "entropy": 5.6951744556427, "epoch": 0.23390970220941404, "grad_norm": 0.9765625, "learning_rate": 0.000499785193026685, "loss": 5.6383, "mean_token_accuracy": 0.17484120875597, "num_tokens": 5580991.0, "step": 2435 }, { "entropy": 5.701549911499024, "epoch": 0.23439000960614795, "grad_norm": 1.03125, "learning_rate": 0.0004997836926990851, "loss": 5.6816, "mean_token_accuracy": 0.17114701271057128, "num_tokens": 5592777.0, "step": 2440 }, { "entropy": 5.602617788314819, "epoch": 0.23487031700288186, "grad_norm": 1.015625, "learning_rate": 0.0004997821871526752, "loss": 5.5874, "mean_token_accuracy": 0.17974285781383514, "num_tokens": 5603326.0, "step": 2445 }, { "entropy": 5.631419324874878, "epoch": 0.23535062439961577, "grad_norm": 1.1171875, "learning_rate": 0.0004997806763874905, "loss": 5.5697, "mean_token_accuracy": 0.1791187435388565, "num_tokens": 5614504.0, "step": 2450 }, { "entropy": 5.617094326019287, "epoch": 0.23583093179634967, "grad_norm": 0.98046875, "learning_rate": 0.0004997791604035659, "loss": 5.6264, "mean_token_accuracy": 0.17776354700326918, "num_tokens": 5625150.0, "step": 2455 }, { "entropy": 5.6507199764251705, "epoch": 0.23631123919308358, "grad_norm": 0.9921875, "learning_rate": 0.0004997776392009366, "loss": 5.6458, "mean_token_accuracy": 0.169050732254982, "num_tokens": 5636815.0, "step": 2460 }, { "entropy": 5.706958866119384, "epoch": 0.2367915465898175, "grad_norm": 0.9453125, "learning_rate": 0.0004997761127796381, "loss": 5.6366, "mean_token_accuracy": 0.17092559188604356, "num_tokens": 5648272.0, "step": 2465 }, { "entropy": 5.628375577926636, "epoch": 0.2372718539865514, "grad_norm": 1.0078125, "learning_rate": 0.0004997745811397056, "loss": 5.5463, "mean_token_accuracy": 0.17801680713891982, "num_tokens": 5659227.0, "step": 2470 }, { "entropy": 5.6414820671081545, "epoch": 0.2377521613832853, "grad_norm": 1.046875, "learning_rate": 0.0004997730442811748, "loss": 5.6796, "mean_token_accuracy": 0.17399391829967498, "num_tokens": 5670411.0, "step": 2475 }, { "entropy": 5.5770539283752445, "epoch": 0.23823246878001922, "grad_norm": 1.0859375, "learning_rate": 0.0004997715022040814, "loss": 5.5182, "mean_token_accuracy": 0.1782184734940529, "num_tokens": 5681570.0, "step": 2480 }, { "entropy": 5.523485231399536, "epoch": 0.23871277617675313, "grad_norm": 0.984375, "learning_rate": 0.000499769954908461, "loss": 5.5022, "mean_token_accuracy": 0.1887900114059448, "num_tokens": 5693021.0, "step": 2485 }, { "entropy": 5.659896421432495, "epoch": 0.23919308357348704, "grad_norm": 0.9609375, "learning_rate": 0.0004997684023943498, "loss": 5.5883, "mean_token_accuracy": 0.17428779155015944, "num_tokens": 5704043.0, "step": 2490 }, { "entropy": 5.5805792808532715, "epoch": 0.23967339097022095, "grad_norm": 0.99609375, "learning_rate": 0.0004997668446617837, "loss": 5.6675, "mean_token_accuracy": 0.16685750484466552, "num_tokens": 5715735.0, "step": 2495 }, { "entropy": 5.760880804061889, "epoch": 0.24015369836695485, "grad_norm": 1.0625, "learning_rate": 0.0004997652817107989, "loss": 5.6294, "mean_token_accuracy": 0.17232899218797684, "num_tokens": 5725778.0, "step": 2500 }, { "entropy": 5.601306343078614, "epoch": 0.24063400576368876, "grad_norm": 1.0390625, "learning_rate": 0.0004997637135414315, "loss": 5.6628, "mean_token_accuracy": 0.17220552116632462, "num_tokens": 5737224.0, "step": 2505 }, { "entropy": 5.779234981536865, "epoch": 0.24111431316042267, "grad_norm": 0.9609375, "learning_rate": 0.0004997621401537183, "loss": 5.6855, "mean_token_accuracy": 0.17120948135852815, "num_tokens": 5749226.0, "step": 2510 }, { "entropy": 5.6741156578063965, "epoch": 0.24159462055715658, "grad_norm": 1.1015625, "learning_rate": 0.0004997605615476955, "loss": 5.6578, "mean_token_accuracy": 0.17114464193582535, "num_tokens": 5760282.0, "step": 2515 }, { "entropy": 5.539696168899536, "epoch": 0.2420749279538905, "grad_norm": 0.94921875, "learning_rate": 0.0004997589777234, "loss": 5.5633, "mean_token_accuracy": 0.181555312871933, "num_tokens": 5771756.0, "step": 2520 }, { "entropy": 5.650804233551026, "epoch": 0.2425552353506244, "grad_norm": 1.078125, "learning_rate": 0.0004997573886808684, "loss": 5.5835, "mean_token_accuracy": 0.16679947078227997, "num_tokens": 5783237.0, "step": 2525 }, { "entropy": 5.646309852600098, "epoch": 0.2430355427473583, "grad_norm": 1.0078125, "learning_rate": 0.0004997557944201375, "loss": 5.6814, "mean_token_accuracy": 0.17147036045789718, "num_tokens": 5794825.0, "step": 2530 }, { "entropy": 5.675209999084473, "epoch": 0.24351585014409222, "grad_norm": 1.0390625, "learning_rate": 0.0004997541949412445, "loss": 5.5712, "mean_token_accuracy": 0.18625136017799376, "num_tokens": 5805578.0, "step": 2535 }, { "entropy": 5.649836206436158, "epoch": 0.24399615754082613, "grad_norm": 0.984375, "learning_rate": 0.0004997525902442266, "loss": 5.6738, "mean_token_accuracy": 0.16476511359214782, "num_tokens": 5818201.0, "step": 2540 }, { "entropy": 5.602812147140503, "epoch": 0.24447646493756003, "grad_norm": 0.9296875, "learning_rate": 0.0004997509803291207, "loss": 5.5959, "mean_token_accuracy": 0.17587143927812576, "num_tokens": 5830319.0, "step": 2545 }, { "entropy": 5.5824614524841305, "epoch": 0.24495677233429394, "grad_norm": 1.0234375, "learning_rate": 0.0004997493651959647, "loss": 5.5428, "mean_token_accuracy": 0.17996817231178283, "num_tokens": 5840638.0, "step": 2550 }, { "entropy": 5.66239709854126, "epoch": 0.24543707973102785, "grad_norm": 0.90625, "learning_rate": 0.0004997477448447955, "loss": 5.5773, "mean_token_accuracy": 0.17367178648710252, "num_tokens": 5852472.0, "step": 2555 }, { "entropy": 5.678495073318482, "epoch": 0.24591738712776176, "grad_norm": 0.9921875, "learning_rate": 0.0004997461192756512, "loss": 5.6133, "mean_token_accuracy": 0.170744089782238, "num_tokens": 5863455.0, "step": 2560 }, { "entropy": 5.512450170516968, "epoch": 0.24639769452449567, "grad_norm": 1.1171875, "learning_rate": 0.0004997444884885694, "loss": 5.5251, "mean_token_accuracy": 0.17817995101213455, "num_tokens": 5873141.0, "step": 2565 }, { "entropy": 5.603986024856567, "epoch": 0.24687800192122958, "grad_norm": 1.0859375, "learning_rate": 0.0004997428524835879, "loss": 5.6316, "mean_token_accuracy": 0.17475323528051376, "num_tokens": 5884363.0, "step": 2570 }, { "entropy": 5.740997219085694, "epoch": 0.2473583093179635, "grad_norm": 0.96484375, "learning_rate": 0.0004997412112607446, "loss": 5.6721, "mean_token_accuracy": 0.17148932665586472, "num_tokens": 5895856.0, "step": 2575 }, { "entropy": 5.542859792709351, "epoch": 0.2478386167146974, "grad_norm": 1.046875, "learning_rate": 0.0004997395648200778, "loss": 5.4922, "mean_token_accuracy": 0.17950474172830583, "num_tokens": 5906657.0, "step": 2580 }, { "entropy": 5.600370979309082, "epoch": 0.2483189241114313, "grad_norm": 0.8984375, "learning_rate": 0.0004997379131616257, "loss": 5.6226, "mean_token_accuracy": 0.1700095072388649, "num_tokens": 5919496.0, "step": 2585 }, { "entropy": 5.690901279449463, "epoch": 0.24879923150816521, "grad_norm": 0.9375, "learning_rate": 0.0004997362562854266, "loss": 5.6843, "mean_token_accuracy": 0.16776154488325118, "num_tokens": 5932593.0, "step": 2590 }, { "entropy": 5.619813919067383, "epoch": 0.24927953890489912, "grad_norm": 1.015625, "learning_rate": 0.0004997345941915187, "loss": 5.6128, "mean_token_accuracy": 0.17226099967956543, "num_tokens": 5944080.0, "step": 2595 }, { "entropy": 5.602241802215576, "epoch": 0.24975984630163303, "grad_norm": 1.0078125, "learning_rate": 0.0004997329268799412, "loss": 5.5752, "mean_token_accuracy": 0.18460023701190947, "num_tokens": 5955703.0, "step": 2600 }, { "entropy": 5.62792739868164, "epoch": 0.25024015369836694, "grad_norm": 0.984375, "learning_rate": 0.0004997312543507322, "loss": 5.6565, "mean_token_accuracy": 0.1714890867471695, "num_tokens": 5966979.0, "step": 2605 }, { "entropy": 5.672908306121826, "epoch": 0.2507204610951009, "grad_norm": 1.03125, "learning_rate": 0.0004997295766039309, "loss": 5.545, "mean_token_accuracy": 0.17637500017881394, "num_tokens": 5978808.0, "step": 2610 }, { "entropy": 5.6401097774505615, "epoch": 0.25120076849183476, "grad_norm": 0.953125, "learning_rate": 0.0004997278936395761, "loss": 5.7288, "mean_token_accuracy": 0.16584430038928985, "num_tokens": 5992145.0, "step": 2615 }, { "entropy": 5.665263652801514, "epoch": 0.2516810758885687, "grad_norm": 0.96875, "learning_rate": 0.0004997262054577071, "loss": 5.5694, "mean_token_accuracy": 0.17564088106155396, "num_tokens": 6003723.0, "step": 2620 }, { "entropy": 5.6567973613739015, "epoch": 0.2521613832853026, "grad_norm": 1.0703125, "learning_rate": 0.0004997245120583627, "loss": 5.6351, "mean_token_accuracy": 0.1769047811627388, "num_tokens": 6014064.0, "step": 2625 }, { "entropy": 5.53907151222229, "epoch": 0.2526416906820365, "grad_norm": 0.93359375, "learning_rate": 0.0004997228134415825, "loss": 5.5168, "mean_token_accuracy": 0.1834915667772293, "num_tokens": 6025455.0, "step": 2630 }, { "entropy": 5.6452476501464846, "epoch": 0.2531219980787704, "grad_norm": 1.0390625, "learning_rate": 0.0004997211096074059, "loss": 5.6231, "mean_token_accuracy": 0.16973316073417663, "num_tokens": 6037347.0, "step": 2635 }, { "entropy": 5.600665187835693, "epoch": 0.25360230547550433, "grad_norm": 0.9921875, "learning_rate": 0.0004997194005558722, "loss": 5.5304, "mean_token_accuracy": 0.18019532412290573, "num_tokens": 6049236.0, "step": 2640 }, { "entropy": 5.534391641616821, "epoch": 0.2540826128722382, "grad_norm": 0.96484375, "learning_rate": 0.0004997176862870216, "loss": 5.5339, "mean_token_accuracy": 0.1798613414168358, "num_tokens": 6060982.0, "step": 2645 }, { "entropy": 5.637931680679321, "epoch": 0.25456292026897215, "grad_norm": 1.0546875, "learning_rate": 0.0004997159668008933, "loss": 5.5514, "mean_token_accuracy": 0.17985030263662338, "num_tokens": 6070925.0, "step": 2650 }, { "entropy": 5.526381587982177, "epoch": 0.25504322766570603, "grad_norm": 1.0078125, "learning_rate": 0.0004997142420975277, "loss": 5.514, "mean_token_accuracy": 0.18175738006830217, "num_tokens": 6081279.0, "step": 2655 }, { "entropy": 5.5633796691894535, "epoch": 0.25552353506243997, "grad_norm": 0.91796875, "learning_rate": 0.0004997125121769647, "loss": 5.6108, "mean_token_accuracy": 0.17793446481227876, "num_tokens": 6091797.0, "step": 2660 }, { "entropy": 5.687921333312988, "epoch": 0.25600384245917385, "grad_norm": 0.9296875, "learning_rate": 0.0004997107770392444, "loss": 5.6134, "mean_token_accuracy": 0.1804993599653244, "num_tokens": 6103435.0, "step": 2665 }, { "entropy": 5.648722791671753, "epoch": 0.2564841498559078, "grad_norm": 0.9375, "learning_rate": 0.000499709036684407, "loss": 5.6751, "mean_token_accuracy": 0.17587384432554246, "num_tokens": 6114531.0, "step": 2670 }, { "entropy": 5.569314622879029, "epoch": 0.25696445725264166, "grad_norm": 1.0078125, "learning_rate": 0.0004997072911124932, "loss": 5.5173, "mean_token_accuracy": 0.17945850938558577, "num_tokens": 6126110.0, "step": 2675 }, { "entropy": 5.670061159133911, "epoch": 0.2574447646493756, "grad_norm": 1.015625, "learning_rate": 0.0004997055403235432, "loss": 5.6187, "mean_token_accuracy": 0.1766670301556587, "num_tokens": 6137114.0, "step": 2680 }, { "entropy": 5.62683253288269, "epoch": 0.2579250720461095, "grad_norm": 0.984375, "learning_rate": 0.0004997037843175978, "loss": 5.5718, "mean_token_accuracy": 0.17658228576183319, "num_tokens": 6148696.0, "step": 2685 }, { "entropy": 5.59165620803833, "epoch": 0.2584053794428434, "grad_norm": 0.9609375, "learning_rate": 0.0004997020230946978, "loss": 5.568, "mean_token_accuracy": 0.1790614068508148, "num_tokens": 6160235.0, "step": 2690 }, { "entropy": 5.629477691650391, "epoch": 0.25888568683957736, "grad_norm": 0.96875, "learning_rate": 0.0004997002566548841, "loss": 5.5586, "mean_token_accuracy": 0.17292713820934297, "num_tokens": 6172031.0, "step": 2695 }, { "entropy": 5.48054838180542, "epoch": 0.25936599423631124, "grad_norm": 0.96875, "learning_rate": 0.0004996984849981976, "loss": 5.4233, "mean_token_accuracy": 0.1893267199397087, "num_tokens": 6183547.0, "step": 2700 }, { "entropy": 5.619540548324585, "epoch": 0.2598463016330452, "grad_norm": 0.97265625, "learning_rate": 0.0004996967081246794, "loss": 5.632, "mean_token_accuracy": 0.1678134724497795, "num_tokens": 6194768.0, "step": 2705 }, { "entropy": 5.6499683380126955, "epoch": 0.26032660902977905, "grad_norm": 0.984375, "learning_rate": 0.0004996949260343711, "loss": 5.6314, "mean_token_accuracy": 0.1706198126077652, "num_tokens": 6206099.0, "step": 2710 }, { "entropy": 5.624089670181275, "epoch": 0.260806916426513, "grad_norm": 1.046875, "learning_rate": 0.0004996931387273137, "loss": 5.6262, "mean_token_accuracy": 0.17660144418478013, "num_tokens": 6217530.0, "step": 2715 }, { "entropy": 5.713815212249756, "epoch": 0.2612872238232469, "grad_norm": 0.94140625, "learning_rate": 0.0004996913462035487, "loss": 5.6448, "mean_token_accuracy": 0.1767139658331871, "num_tokens": 6228564.0, "step": 2720 }, { "entropy": 5.539792156219482, "epoch": 0.2617675312199808, "grad_norm": 0.97265625, "learning_rate": 0.000499689548463118, "loss": 5.5174, "mean_token_accuracy": 0.17854675203561782, "num_tokens": 6239945.0, "step": 2725 }, { "entropy": 5.59919810295105, "epoch": 0.2622478386167147, "grad_norm": 0.9921875, "learning_rate": 0.0004996877455060631, "loss": 5.6312, "mean_token_accuracy": 0.17017472237348558, "num_tokens": 6251829.0, "step": 2730 }, { "entropy": 5.7330786228179935, "epoch": 0.2627281460134486, "grad_norm": 0.9765625, "learning_rate": 0.0004996859373324259, "loss": 5.7264, "mean_token_accuracy": 0.16224824339151384, "num_tokens": 6264823.0, "step": 2735 }, { "entropy": 5.5701476573944095, "epoch": 0.2632084534101825, "grad_norm": 1.015625, "learning_rate": 0.0004996841239422485, "loss": 5.4065, "mean_token_accuracy": 0.18482713848352433, "num_tokens": 6276247.0, "step": 2740 }, { "entropy": 5.470470857620239, "epoch": 0.26368876080691644, "grad_norm": 0.98828125, "learning_rate": 0.0004996823053355729, "loss": 5.5321, "mean_token_accuracy": 0.18076382875442504, "num_tokens": 6287593.0, "step": 2745 }, { "entropy": 5.685536909103393, "epoch": 0.2641690682036503, "grad_norm": 0.9921875, "learning_rate": 0.0004996804815124413, "loss": 5.6897, "mean_token_accuracy": 0.16898608654737474, "num_tokens": 6299918.0, "step": 2750 }, { "entropy": 5.568260049819946, "epoch": 0.26464937560038426, "grad_norm": 1.078125, "learning_rate": 0.0004996786524728962, "loss": 5.5287, "mean_token_accuracy": 0.18196363002061844, "num_tokens": 6311147.0, "step": 2755 }, { "entropy": 5.45229320526123, "epoch": 0.26512968299711814, "grad_norm": 0.96875, "learning_rate": 0.0004996768182169797, "loss": 5.4564, "mean_token_accuracy": 0.18652137070894242, "num_tokens": 6323239.0, "step": 2760 }, { "entropy": 5.692247343063355, "epoch": 0.2656099903938521, "grad_norm": 1.0234375, "learning_rate": 0.0004996749787447349, "loss": 5.5567, "mean_token_accuracy": 0.17187336832284927, "num_tokens": 6334625.0, "step": 2765 }, { "entropy": 5.545494651794433, "epoch": 0.26609029779058596, "grad_norm": 1.046875, "learning_rate": 0.000499673134056204, "loss": 5.5938, "mean_token_accuracy": 0.17517421692609786, "num_tokens": 6346068.0, "step": 2770 }, { "entropy": 5.584152412414551, "epoch": 0.2665706051873199, "grad_norm": 1.125, "learning_rate": 0.0004996712841514303, "loss": 5.5716, "mean_token_accuracy": 0.17334717959165574, "num_tokens": 6357097.0, "step": 2775 }, { "entropy": 5.656313180923462, "epoch": 0.2670509125840538, "grad_norm": 1.0859375, "learning_rate": 0.0004996694290304563, "loss": 5.6313, "mean_token_accuracy": 0.16709280461072923, "num_tokens": 6367481.0, "step": 2780 }, { "entropy": 5.52793607711792, "epoch": 0.2675312199807877, "grad_norm": 1.0, "learning_rate": 0.0004996675686933255, "loss": 5.5381, "mean_token_accuracy": 0.18144787847995758, "num_tokens": 6378873.0, "step": 2785 }, { "entropy": 5.664049291610718, "epoch": 0.2680115273775216, "grad_norm": 0.953125, "learning_rate": 0.0004996657031400807, "loss": 5.5768, "mean_token_accuracy": 0.18006865531206132, "num_tokens": 6390651.0, "step": 2790 }, { "entropy": 5.478256464004517, "epoch": 0.26849183477425553, "grad_norm": 1.078125, "learning_rate": 0.0004996638323707655, "loss": 5.446, "mean_token_accuracy": 0.1820421040058136, "num_tokens": 6401631.0, "step": 2795 }, { "entropy": 5.48651123046875, "epoch": 0.2689721421709894, "grad_norm": 0.97265625, "learning_rate": 0.0004996619563854232, "loss": 5.5308, "mean_token_accuracy": 0.1832943469285965, "num_tokens": 6413875.0, "step": 2800 }, { "entropy": 5.689049482345581, "epoch": 0.26945244956772335, "grad_norm": 1.03125, "learning_rate": 0.0004996600751840974, "loss": 5.5579, "mean_token_accuracy": 0.1733505442738533, "num_tokens": 6425764.0, "step": 2805 }, { "entropy": 5.478516244888306, "epoch": 0.26993275696445723, "grad_norm": 0.984375, "learning_rate": 0.0004996581887668317, "loss": 5.494, "mean_token_accuracy": 0.18221275955438615, "num_tokens": 6437911.0, "step": 2810 }, { "entropy": 5.534301519393921, "epoch": 0.27041306436119117, "grad_norm": 1.0859375, "learning_rate": 0.00049965629713367, "loss": 5.4961, "mean_token_accuracy": 0.18141991049051284, "num_tokens": 6449942.0, "step": 2815 }, { "entropy": 5.604593276977539, "epoch": 0.27089337175792505, "grad_norm": 0.953125, "learning_rate": 0.0004996544002846561, "loss": 5.6208, "mean_token_accuracy": 0.17682201713323592, "num_tokens": 6461729.0, "step": 2820 }, { "entropy": 5.614752101898193, "epoch": 0.271373679154659, "grad_norm": 1.0078125, "learning_rate": 0.0004996524982198343, "loss": 5.5988, "mean_token_accuracy": 0.17795798033475876, "num_tokens": 6472046.0, "step": 2825 }, { "entropy": 5.600375080108643, "epoch": 0.27185398655139287, "grad_norm": 1.0234375, "learning_rate": 0.0004996505909392485, "loss": 5.5667, "mean_token_accuracy": 0.17373612523078918, "num_tokens": 6483308.0, "step": 2830 }, { "entropy": 5.429362010955811, "epoch": 0.2723342939481268, "grad_norm": 0.98828125, "learning_rate": 0.0004996486784429429, "loss": 5.4311, "mean_token_accuracy": 0.18428465574979783, "num_tokens": 6495093.0, "step": 2835 }, { "entropy": 5.5981306552886965, "epoch": 0.2728146013448607, "grad_norm": 1.1015625, "learning_rate": 0.0004996467607309622, "loss": 5.5307, "mean_token_accuracy": 0.17854470163583755, "num_tokens": 6505933.0, "step": 2840 }, { "entropy": 5.626583003997803, "epoch": 0.2732949087415946, "grad_norm": 1.03125, "learning_rate": 0.0004996448378033507, "loss": 5.5893, "mean_token_accuracy": 0.17490534335374833, "num_tokens": 6517280.0, "step": 2845 }, { "entropy": 5.60156021118164, "epoch": 0.2737752161383285, "grad_norm": 1.03125, "learning_rate": 0.0004996429096601532, "loss": 5.6315, "mean_token_accuracy": 0.17191672027111055, "num_tokens": 6528980.0, "step": 2850 }, { "entropy": 5.601687097549439, "epoch": 0.27425552353506244, "grad_norm": 1.046875, "learning_rate": 0.0004996409763014144, "loss": 5.6235, "mean_token_accuracy": 0.17743158787488938, "num_tokens": 6540670.0, "step": 2855 }, { "entropy": 5.593181991577149, "epoch": 0.2747358309317964, "grad_norm": 1.0078125, "learning_rate": 0.0004996390377271791, "loss": 5.5855, "mean_token_accuracy": 0.18115401417016982, "num_tokens": 6551302.0, "step": 2860 }, { "entropy": 5.5507872104644775, "epoch": 0.27521613832853026, "grad_norm": 1.09375, "learning_rate": 0.0004996370939374924, "loss": 5.5433, "mean_token_accuracy": 0.1738438919186592, "num_tokens": 6563177.0, "step": 2865 }, { "entropy": 5.72943229675293, "epoch": 0.2756964457252642, "grad_norm": 1.1953125, "learning_rate": 0.0004996351449323994, "loss": 5.6521, "mean_token_accuracy": 0.17468605786561966, "num_tokens": 6573323.0, "step": 2870 }, { "entropy": 5.5880653858184814, "epoch": 0.2761767531219981, "grad_norm": 1.03125, "learning_rate": 0.0004996331907119455, "loss": 5.591, "mean_token_accuracy": 0.16756793707609177, "num_tokens": 6585382.0, "step": 2875 }, { "entropy": 5.474012231826782, "epoch": 0.276657060518732, "grad_norm": 1.0078125, "learning_rate": 0.0004996312312761758, "loss": 5.467, "mean_token_accuracy": 0.1900227263569832, "num_tokens": 6596629.0, "step": 2880 }, { "entropy": 5.6394744396209715, "epoch": 0.2771373679154659, "grad_norm": 1.0234375, "learning_rate": 0.000499629266625136, "loss": 5.5734, "mean_token_accuracy": 0.17828488498926162, "num_tokens": 6608408.0, "step": 2885 }, { "entropy": 5.638094282150268, "epoch": 0.27761767531219983, "grad_norm": 1.1015625, "learning_rate": 0.0004996272967588715, "loss": 5.5989, "mean_token_accuracy": 0.1704651966691017, "num_tokens": 6619375.0, "step": 2890 }, { "entropy": 5.618940448760986, "epoch": 0.2780979827089337, "grad_norm": 1.09375, "learning_rate": 0.0004996253216774283, "loss": 5.6398, "mean_token_accuracy": 0.17304042726755142, "num_tokens": 6631317.0, "step": 2895 }, { "entropy": 5.576578378677368, "epoch": 0.27857829010566765, "grad_norm": 1.1015625, "learning_rate": 0.0004996233413808521, "loss": 5.4904, "mean_token_accuracy": 0.18116467744112014, "num_tokens": 6642009.0, "step": 2900 }, { "entropy": 5.609902429580688, "epoch": 0.27905859750240153, "grad_norm": 1.09375, "learning_rate": 0.0004996213558691889, "loss": 5.6478, "mean_token_accuracy": 0.1682332620024681, "num_tokens": 6654713.0, "step": 2905 }, { "entropy": 5.651772451400757, "epoch": 0.27953890489913547, "grad_norm": 0.95703125, "learning_rate": 0.0004996193651424848, "loss": 5.6064, "mean_token_accuracy": 0.17700932323932647, "num_tokens": 6667157.0, "step": 2910 }, { "entropy": 5.575735330581665, "epoch": 0.28001921229586935, "grad_norm": 0.94140625, "learning_rate": 0.000499617369200786, "loss": 5.5599, "mean_token_accuracy": 0.18871267586946489, "num_tokens": 6679573.0, "step": 2915 }, { "entropy": 5.593114852905273, "epoch": 0.2804995196926033, "grad_norm": 0.859375, "learning_rate": 0.0004996153680441389, "loss": 5.624, "mean_token_accuracy": 0.17413021624088287, "num_tokens": 6691768.0, "step": 2920 }, { "entropy": 5.653490257263184, "epoch": 0.28097982708933716, "grad_norm": 1.015625, "learning_rate": 0.00049961336167259, "loss": 5.5864, "mean_token_accuracy": 0.17438612282276153, "num_tokens": 6701964.0, "step": 2925 }, { "entropy": 5.618965578079224, "epoch": 0.2814601344860711, "grad_norm": 1.0859375, "learning_rate": 0.0004996113500861857, "loss": 5.5759, "mean_token_accuracy": 0.1726679503917694, "num_tokens": 6713506.0, "step": 2930 }, { "entropy": 5.581022930145264, "epoch": 0.281940441882805, "grad_norm": 1.0859375, "learning_rate": 0.0004996093332849729, "loss": 5.593, "mean_token_accuracy": 0.1725487932562828, "num_tokens": 6724616.0, "step": 2935 }, { "entropy": 5.562248182296753, "epoch": 0.2824207492795389, "grad_norm": 1.0234375, "learning_rate": 0.0004996073112689983, "loss": 5.5803, "mean_token_accuracy": 0.17757243812084197, "num_tokens": 6735054.0, "step": 2940 }, { "entropy": 5.616918420791626, "epoch": 0.2829010566762728, "grad_norm": 0.9609375, "learning_rate": 0.0004996052840383088, "loss": 5.6325, "mean_token_accuracy": 0.17381539791822434, "num_tokens": 6746756.0, "step": 2945 }, { "entropy": 5.603857469558716, "epoch": 0.28338136407300674, "grad_norm": 0.89453125, "learning_rate": 0.0004996032515929516, "loss": 5.4992, "mean_token_accuracy": 0.1776091992855072, "num_tokens": 6759566.0, "step": 2950 }, { "entropy": 5.573670148849487, "epoch": 0.2838616714697406, "grad_norm": 1.0, "learning_rate": 0.0004996012139329738, "loss": 5.5225, "mean_token_accuracy": 0.17899418324232103, "num_tokens": 6771375.0, "step": 2955 }, { "entropy": 5.619125080108643, "epoch": 0.28434197886647455, "grad_norm": 1.1015625, "learning_rate": 0.0004995991710584228, "loss": 5.6311, "mean_token_accuracy": 0.16734524071216583, "num_tokens": 6783252.0, "step": 2960 }, { "entropy": 5.58878116607666, "epoch": 0.28482228626320844, "grad_norm": 0.953125, "learning_rate": 0.0004995971229693459, "loss": 5.5941, "mean_token_accuracy": 0.17340553402900696, "num_tokens": 6795525.0, "step": 2965 }, { "entropy": 5.610876131057739, "epoch": 0.28530259365994237, "grad_norm": 0.9296875, "learning_rate": 0.0004995950696657909, "loss": 5.5353, "mean_token_accuracy": 0.17990380227565766, "num_tokens": 6807212.0, "step": 2970 }, { "entropy": 5.52398419380188, "epoch": 0.28578290105667625, "grad_norm": 1.015625, "learning_rate": 0.0004995930111478051, "loss": 5.4712, "mean_token_accuracy": 0.1771505206823349, "num_tokens": 6819367.0, "step": 2975 }, { "entropy": 5.5713125705719, "epoch": 0.2862632084534102, "grad_norm": 1.046875, "learning_rate": 0.0004995909474154365, "loss": 5.5531, "mean_token_accuracy": 0.17791730761528016, "num_tokens": 6830405.0, "step": 2980 }, { "entropy": 5.524326038360596, "epoch": 0.28674351585014407, "grad_norm": 0.9765625, "learning_rate": 0.0004995888784687331, "loss": 5.5413, "mean_token_accuracy": 0.18089909702539445, "num_tokens": 6841479.0, "step": 2985 }, { "entropy": 5.545838022232056, "epoch": 0.287223823246878, "grad_norm": 1.015625, "learning_rate": 0.0004995868043077428, "loss": 5.5784, "mean_token_accuracy": 0.1739095240831375, "num_tokens": 6851585.0, "step": 2990 }, { "entropy": 5.605233526229858, "epoch": 0.2877041306436119, "grad_norm": 1.0390625, "learning_rate": 0.0004995847249325137, "loss": 5.5488, "mean_token_accuracy": 0.1776391088962555, "num_tokens": 6863176.0, "step": 2995 }, { "entropy": 5.596064901351928, "epoch": 0.2881844380403458, "grad_norm": 1.0703125, "learning_rate": 0.0004995826403430942, "loss": 5.595, "mean_token_accuracy": 0.17474860548973084, "num_tokens": 6874021.0, "step": 3000 }, { "epoch": 0.2881844380403458, "eval_entropy": 5.440896103871502, "eval_loss": 5.576871395111084, "eval_mean_token_accuracy": 0.18414354559419172, "eval_num_tokens": 6874021.0, "eval_runtime": 26.9459, "eval_samples_per_second": 1217.809, "eval_steps_per_second": 152.231, "step": 3000 }, { "entropy": 5.6302040100097654, "epoch": 0.2886647454370797, "grad_norm": 1.03125, "learning_rate": 0.0004995805505395328, "loss": 5.5584, "mean_token_accuracy": 0.17477040886878967, "num_tokens": 6884999.0, "step": 3005 }, { "entropy": 5.559301853179932, "epoch": 0.28914505283381364, "grad_norm": 1.0703125, "learning_rate": 0.0004995784555218778, "loss": 5.548, "mean_token_accuracy": 0.17850742042064666, "num_tokens": 6897021.0, "step": 3010 }, { "entropy": 5.518660974502564, "epoch": 0.2896253602305475, "grad_norm": 1.0703125, "learning_rate": 0.0004995763552901779, "loss": 5.5449, "mean_token_accuracy": 0.17909058481454848, "num_tokens": 6908320.0, "step": 3015 }, { "entropy": 5.68627028465271, "epoch": 0.29010566762728146, "grad_norm": 1.015625, "learning_rate": 0.0004995742498444818, "loss": 5.5342, "mean_token_accuracy": 0.18174685835838317, "num_tokens": 6919957.0, "step": 3020 }, { "entropy": 5.529996299743653, "epoch": 0.2905859750240154, "grad_norm": 0.9609375, "learning_rate": 0.0004995721391848387, "loss": 5.4942, "mean_token_accuracy": 0.17575003057718278, "num_tokens": 6930531.0, "step": 3025 }, { "entropy": 5.623160696029663, "epoch": 0.2910662824207493, "grad_norm": 1.0234375, "learning_rate": 0.0004995700233112972, "loss": 5.6325, "mean_token_accuracy": 0.17704310566186904, "num_tokens": 6942556.0, "step": 3030 }, { "entropy": 5.583187103271484, "epoch": 0.2915465898174832, "grad_norm": 0.9609375, "learning_rate": 0.0004995679022239066, "loss": 5.5762, "mean_token_accuracy": 0.17900587618350983, "num_tokens": 6954410.0, "step": 3035 }, { "entropy": 5.579293632507325, "epoch": 0.2920268972142171, "grad_norm": 1.0859375, "learning_rate": 0.0004995657759227162, "loss": 5.5857, "mean_token_accuracy": 0.17669540643692017, "num_tokens": 6964970.0, "step": 3040 }, { "entropy": 5.554018545150757, "epoch": 0.29250720461095103, "grad_norm": 0.9765625, "learning_rate": 0.0004995636444077751, "loss": 5.4673, "mean_token_accuracy": 0.1851392537355423, "num_tokens": 6976016.0, "step": 3045 }, { "entropy": 5.490430164337158, "epoch": 0.2929875120076849, "grad_norm": 1.046875, "learning_rate": 0.0004995615076791333, "loss": 5.4999, "mean_token_accuracy": 0.1816742718219757, "num_tokens": 6987199.0, "step": 3050 }, { "entropy": 5.5644313335418705, "epoch": 0.29346781940441885, "grad_norm": 1.0078125, "learning_rate": 0.0004995593657368399, "loss": 5.5218, "mean_token_accuracy": 0.18650518208742142, "num_tokens": 6999174.0, "step": 3055 }, { "entropy": 5.557963037490845, "epoch": 0.29394812680115273, "grad_norm": 0.97265625, "learning_rate": 0.000499557218580945, "loss": 5.5884, "mean_token_accuracy": 0.17525261044502258, "num_tokens": 7012148.0, "step": 3060 }, { "entropy": 5.486077213287354, "epoch": 0.29442843419788667, "grad_norm": 1.0234375, "learning_rate": 0.0004995550662114981, "loss": 5.4609, "mean_token_accuracy": 0.18215615749359132, "num_tokens": 7023238.0, "step": 3065 }, { "entropy": 5.561151647567749, "epoch": 0.29490874159462055, "grad_norm": 1.0234375, "learning_rate": 0.0004995529086285495, "loss": 5.5521, "mean_token_accuracy": 0.17758539766073228, "num_tokens": 7034944.0, "step": 3070 }, { "entropy": 5.563313627243042, "epoch": 0.2953890489913545, "grad_norm": 1.015625, "learning_rate": 0.000499550745832149, "loss": 5.4154, "mean_token_accuracy": 0.18512072116136552, "num_tokens": 7046880.0, "step": 3075 }, { "entropy": 5.486554431915283, "epoch": 0.29586935638808837, "grad_norm": 1.0390625, "learning_rate": 0.0004995485778223471, "loss": 5.4866, "mean_token_accuracy": 0.1800946146249771, "num_tokens": 7057678.0, "step": 3080 }, { "entropy": 5.4739940643310545, "epoch": 0.2963496637848223, "grad_norm": 1.0703125, "learning_rate": 0.0004995464045991939, "loss": 5.4688, "mean_token_accuracy": 0.18641662895679473, "num_tokens": 7068336.0, "step": 3085 }, { "entropy": 5.588371753692627, "epoch": 0.2968299711815562, "grad_norm": 0.98828125, "learning_rate": 0.00049954422616274, "loss": 5.5343, "mean_token_accuracy": 0.17594826519489287, "num_tokens": 7080341.0, "step": 3090 }, { "entropy": 5.6965454578399655, "epoch": 0.2973102785782901, "grad_norm": 1.1171875, "learning_rate": 0.0004995420425130359, "loss": 5.6866, "mean_token_accuracy": 0.17018966376781464, "num_tokens": 7090618.0, "step": 3095 }, { "entropy": 5.499913692474365, "epoch": 0.297790585975024, "grad_norm": 1.078125, "learning_rate": 0.0004995398536501324, "loss": 5.4331, "mean_token_accuracy": 0.18785624653100969, "num_tokens": 7101843.0, "step": 3100 }, { "entropy": 5.4791899681091305, "epoch": 0.29827089337175794, "grad_norm": 1.1640625, "learning_rate": 0.0004995376595740801, "loss": 5.5056, "mean_token_accuracy": 0.18063082695007324, "num_tokens": 7112014.0, "step": 3105 }, { "entropy": 5.632973289489746, "epoch": 0.2987512007684918, "grad_norm": 1.0078125, "learning_rate": 0.0004995354602849302, "loss": 5.5822, "mean_token_accuracy": 0.17074308097362517, "num_tokens": 7123860.0, "step": 3110 }, { "entropy": 5.571376514434815, "epoch": 0.29923150816522576, "grad_norm": 1.046875, "learning_rate": 0.0004995332557827337, "loss": 5.5564, "mean_token_accuracy": 0.17600722908973693, "num_tokens": 7135901.0, "step": 3115 }, { "entropy": 5.5778998851776125, "epoch": 0.29971181556195964, "grad_norm": 1.1171875, "learning_rate": 0.0004995310460675416, "loss": 5.5339, "mean_token_accuracy": 0.1845734417438507, "num_tokens": 7148743.0, "step": 3120 }, { "entropy": 5.589261770248413, "epoch": 0.3001921229586936, "grad_norm": 1.0546875, "learning_rate": 0.0004995288311394053, "loss": 5.5804, "mean_token_accuracy": 0.18021756410598755, "num_tokens": 7160731.0, "step": 3125 }, { "entropy": 5.574976587295533, "epoch": 0.30067243035542746, "grad_norm": 0.9765625, "learning_rate": 0.0004995266109983764, "loss": 5.5617, "mean_token_accuracy": 0.17890461087226867, "num_tokens": 7172861.0, "step": 3130 }, { "entropy": 5.5695881843566895, "epoch": 0.3011527377521614, "grad_norm": 1.015625, "learning_rate": 0.0004995243856445062, "loss": 5.5087, "mean_token_accuracy": 0.17425711154937745, "num_tokens": 7183954.0, "step": 3135 }, { "entropy": 5.523225164413452, "epoch": 0.3016330451488953, "grad_norm": 1.03125, "learning_rate": 0.0004995221550778466, "loss": 5.4793, "mean_token_accuracy": 0.1828732267022133, "num_tokens": 7195466.0, "step": 3140 }, { "entropy": 5.535993862152099, "epoch": 0.3021133525456292, "grad_norm": 1.046875, "learning_rate": 0.0004995199192984491, "loss": 5.4733, "mean_token_accuracy": 0.18358256071805953, "num_tokens": 7207173.0, "step": 3145 }, { "entropy": 5.601380920410156, "epoch": 0.3025936599423631, "grad_norm": 1.03125, "learning_rate": 0.0004995176783063657, "loss": 5.6094, "mean_token_accuracy": 0.17880836874246597, "num_tokens": 7220095.0, "step": 3150 }, { "entropy": 5.5713316917419435, "epoch": 0.30307396733909703, "grad_norm": 1.1015625, "learning_rate": 0.0004995154321016487, "loss": 5.5217, "mean_token_accuracy": 0.18463317751884462, "num_tokens": 7230664.0, "step": 3155 }, { "entropy": 5.5087896347045895, "epoch": 0.3035542747358309, "grad_norm": 1.0078125, "learning_rate": 0.0004995131806843499, "loss": 5.4837, "mean_token_accuracy": 0.18419086784124375, "num_tokens": 7241278.0, "step": 3160 }, { "entropy": 5.4533278465271, "epoch": 0.30403458213256485, "grad_norm": 1.0703125, "learning_rate": 0.0004995109240545218, "loss": 5.6281, "mean_token_accuracy": 0.1725993424654007, "num_tokens": 7252999.0, "step": 3165 }, { "entropy": 5.589286613464355, "epoch": 0.3045148895292987, "grad_norm": 1.0703125, "learning_rate": 0.0004995086622122167, "loss": 5.4738, "mean_token_accuracy": 0.17775996774435043, "num_tokens": 7263949.0, "step": 3170 }, { "entropy": 5.558937978744507, "epoch": 0.30499519692603266, "grad_norm": 1.0, "learning_rate": 0.0004995063951574871, "loss": 5.5219, "mean_token_accuracy": 0.18208030313253404, "num_tokens": 7275467.0, "step": 3175 }, { "entropy": 5.563764429092407, "epoch": 0.30547550432276654, "grad_norm": 1.03125, "learning_rate": 0.0004995041228903856, "loss": 5.4858, "mean_token_accuracy": 0.18617523461580276, "num_tokens": 7285534.0, "step": 3180 }, { "entropy": 5.614857864379883, "epoch": 0.3059558117195005, "grad_norm": 1.0234375, "learning_rate": 0.000499501845410965, "loss": 5.5985, "mean_token_accuracy": 0.18059034049510955, "num_tokens": 7297252.0, "step": 3185 }, { "entropy": 5.526304435729981, "epoch": 0.30643611911623436, "grad_norm": 0.93359375, "learning_rate": 0.0004994995627192781, "loss": 5.4686, "mean_token_accuracy": 0.18378556221723558, "num_tokens": 7308492.0, "step": 3190 }, { "entropy": 5.5130932331085205, "epoch": 0.3069164265129683, "grad_norm": 0.98828125, "learning_rate": 0.0004994972748153781, "loss": 5.5122, "mean_token_accuracy": 0.18087892532348632, "num_tokens": 7319703.0, "step": 3195 }, { "entropy": 5.598230838775635, "epoch": 0.30739673390970224, "grad_norm": 1.046875, "learning_rate": 0.000499494981699318, "loss": 5.4766, "mean_token_accuracy": 0.18629593551158904, "num_tokens": 7331022.0, "step": 3200 }, { "entropy": 5.5110736846923825, "epoch": 0.3078770413064361, "grad_norm": 1.0234375, "learning_rate": 0.000499492683371151, "loss": 5.5125, "mean_token_accuracy": 0.18337176293134688, "num_tokens": 7342977.0, "step": 3205 }, { "entropy": 5.602800512313843, "epoch": 0.30835734870317005, "grad_norm": 1.0234375, "learning_rate": 0.0004994903798309306, "loss": 5.5087, "mean_token_accuracy": 0.17746395766735076, "num_tokens": 7353227.0, "step": 3210 }, { "entropy": 5.563166570663452, "epoch": 0.30883765609990393, "grad_norm": 1.0703125, "learning_rate": 0.0004994880710787102, "loss": 5.5743, "mean_token_accuracy": 0.1642255187034607, "num_tokens": 7364165.0, "step": 3215 }, { "entropy": 5.544680643081665, "epoch": 0.30931796349663787, "grad_norm": 0.984375, "learning_rate": 0.0004994857571145432, "loss": 5.5023, "mean_token_accuracy": 0.18458254784345626, "num_tokens": 7374800.0, "step": 3220 }, { "entropy": 5.425434350967407, "epoch": 0.30979827089337175, "grad_norm": 1.0546875, "learning_rate": 0.0004994834379384837, "loss": 5.4565, "mean_token_accuracy": 0.18336665779352188, "num_tokens": 7386360.0, "step": 3225 }, { "entropy": 5.552868223190307, "epoch": 0.3102785782901057, "grad_norm": 1.0, "learning_rate": 0.0004994811135505851, "loss": 5.4698, "mean_token_accuracy": 0.18341365456581116, "num_tokens": 7397066.0, "step": 3230 }, { "entropy": 5.558938503265381, "epoch": 0.31075888568683957, "grad_norm": 1.0703125, "learning_rate": 0.0004994787839509018, "loss": 5.564, "mean_token_accuracy": 0.1713826075196266, "num_tokens": 7408349.0, "step": 3235 }, { "entropy": 5.5813216209411625, "epoch": 0.3112391930835735, "grad_norm": 1.0390625, "learning_rate": 0.0004994764491394876, "loss": 5.5886, "mean_token_accuracy": 0.17263369262218475, "num_tokens": 7420343.0, "step": 3240 }, { "entropy": 5.624362230300903, "epoch": 0.3117195004803074, "grad_norm": 0.921875, "learning_rate": 0.0004994741091163969, "loss": 5.4904, "mean_token_accuracy": 0.18449428975582122, "num_tokens": 7431683.0, "step": 3245 }, { "entropy": 5.41058030128479, "epoch": 0.3121998078770413, "grad_norm": 1.0234375, "learning_rate": 0.000499471763881684, "loss": 5.4083, "mean_token_accuracy": 0.18659997135400772, "num_tokens": 7443327.0, "step": 3250 }, { "entropy": 5.545905923843383, "epoch": 0.3126801152737752, "grad_norm": 1.03125, "learning_rate": 0.0004994694134354031, "loss": 5.517, "mean_token_accuracy": 0.18232496678829194, "num_tokens": 7454002.0, "step": 3255 }, { "entropy": 5.49485216140747, "epoch": 0.31316042267050914, "grad_norm": 1.078125, "learning_rate": 0.000499467057777609, "loss": 5.5092, "mean_token_accuracy": 0.18318750262260436, "num_tokens": 7464074.0, "step": 3260 }, { "entropy": 5.470322179794311, "epoch": 0.313640730067243, "grad_norm": 1.0859375, "learning_rate": 0.0004994646969083565, "loss": 5.434, "mean_token_accuracy": 0.1871152251958847, "num_tokens": 7475543.0, "step": 3265 }, { "entropy": 5.583432674407959, "epoch": 0.31412103746397696, "grad_norm": 1.0859375, "learning_rate": 0.0004994623308277002, "loss": 5.4947, "mean_token_accuracy": 0.18215811550617217, "num_tokens": 7486818.0, "step": 3270 }, { "entropy": 5.5460193157196045, "epoch": 0.31460134486071084, "grad_norm": 1.0078125, "learning_rate": 0.000499459959535695, "loss": 5.5431, "mean_token_accuracy": 0.17775923311710357, "num_tokens": 7499046.0, "step": 3275 }, { "entropy": 5.530418539047242, "epoch": 0.3150816522574448, "grad_norm": 1.109375, "learning_rate": 0.0004994575830323962, "loss": 5.4758, "mean_token_accuracy": 0.1772423878312111, "num_tokens": 7509853.0, "step": 3280 }, { "entropy": 5.422787761688232, "epoch": 0.31556195965417866, "grad_norm": 1.046875, "learning_rate": 0.0004994552013178586, "loss": 5.3345, "mean_token_accuracy": 0.1908559814095497, "num_tokens": 7521091.0, "step": 3285 }, { "entropy": 5.470391035079956, "epoch": 0.3160422670509126, "grad_norm": 1.078125, "learning_rate": 0.000499452814392138, "loss": 5.4638, "mean_token_accuracy": 0.19296756088733674, "num_tokens": 7531317.0, "step": 3290 }, { "entropy": 5.550863265991211, "epoch": 0.3165225744476465, "grad_norm": 1.0234375, "learning_rate": 0.0004994504222552894, "loss": 5.6115, "mean_token_accuracy": 0.17447966411709787, "num_tokens": 7542822.0, "step": 3295 }, { "entropy": 5.679572725296021, "epoch": 0.3170028818443804, "grad_norm": 1.0703125, "learning_rate": 0.0004994480249073684, "loss": 5.5371, "mean_token_accuracy": 0.17899394482374192, "num_tokens": 7552434.0, "step": 3300 }, { "entropy": 5.455837345123291, "epoch": 0.3174831892411143, "grad_norm": 0.98046875, "learning_rate": 0.0004994456223484308, "loss": 5.412, "mean_token_accuracy": 0.1847301483154297, "num_tokens": 7563895.0, "step": 3305 }, { "entropy": 5.356154918670654, "epoch": 0.31796349663784823, "grad_norm": 1.0, "learning_rate": 0.0004994432145785323, "loss": 5.4431, "mean_token_accuracy": 0.1852705791592598, "num_tokens": 7575391.0, "step": 3310 }, { "entropy": 5.603661298751831, "epoch": 0.3184438040345821, "grad_norm": 1.078125, "learning_rate": 0.0004994408015977288, "loss": 5.5895, "mean_token_accuracy": 0.18396379053592682, "num_tokens": 7587119.0, "step": 3315 }, { "entropy": 5.5791820049285885, "epoch": 0.31892411143131605, "grad_norm": 1.109375, "learning_rate": 0.0004994383834060764, "loss": 5.5529, "mean_token_accuracy": 0.17733592242002488, "num_tokens": 7598615.0, "step": 3320 }, { "entropy": 5.522308588027954, "epoch": 0.31940441882804993, "grad_norm": 0.9921875, "learning_rate": 0.0004994359600036311, "loss": 5.5022, "mean_token_accuracy": 0.18452920615673066, "num_tokens": 7610159.0, "step": 3325 }, { "entropy": 5.598204278945923, "epoch": 0.31988472622478387, "grad_norm": 1.171875, "learning_rate": 0.0004994335313904493, "loss": 5.4916, "mean_token_accuracy": 0.18418505936861038, "num_tokens": 7620922.0, "step": 3330 }, { "entropy": 5.45703272819519, "epoch": 0.32036503362151775, "grad_norm": 0.95703125, "learning_rate": 0.0004994310975665873, "loss": 5.4117, "mean_token_accuracy": 0.18754592537879944, "num_tokens": 7632343.0, "step": 3335 }, { "entropy": 5.619206094741822, "epoch": 0.3208453410182517, "grad_norm": 0.96875, "learning_rate": 0.0004994286585321017, "loss": 5.6097, "mean_token_accuracy": 0.1694990485906601, "num_tokens": 7644748.0, "step": 3340 }, { "entropy": 5.595988607406616, "epoch": 0.32132564841498557, "grad_norm": 1.1171875, "learning_rate": 0.000499426214287049, "loss": 5.5649, "mean_token_accuracy": 0.18684215247631072, "num_tokens": 7655449.0, "step": 3345 }, { "entropy": 5.522005844116211, "epoch": 0.3218059558117195, "grad_norm": 1.0546875, "learning_rate": 0.0004994237648314862, "loss": 5.5274, "mean_token_accuracy": 0.18205100297927856, "num_tokens": 7665623.0, "step": 3350 }, { "entropy": 5.492083740234375, "epoch": 0.3222862632084534, "grad_norm": 1.0, "learning_rate": 0.0004994213101654697, "loss": 5.4173, "mean_token_accuracy": 0.18764639347791673, "num_tokens": 7676860.0, "step": 3355 }, { "entropy": 5.5761909008026125, "epoch": 0.3227665706051873, "grad_norm": 1.015625, "learning_rate": 0.000499418850289057, "loss": 5.603, "mean_token_accuracy": 0.1757027193903923, "num_tokens": 7687778.0, "step": 3360 }, { "entropy": 5.565295886993408, "epoch": 0.32324687800192126, "grad_norm": 1.078125, "learning_rate": 0.0004994163852023048, "loss": 5.4981, "mean_token_accuracy": 0.18085954636335372, "num_tokens": 7699154.0, "step": 3365 }, { "entropy": 5.525069093704223, "epoch": 0.32372718539865514, "grad_norm": 1.0703125, "learning_rate": 0.0004994139149052706, "loss": 5.5175, "mean_token_accuracy": 0.18480815589427949, "num_tokens": 7711010.0, "step": 3370 }, { "entropy": 5.576666164398193, "epoch": 0.3242074927953891, "grad_norm": 1.0390625, "learning_rate": 0.0004994114393980117, "loss": 5.538, "mean_token_accuracy": 0.17918068915605545, "num_tokens": 7721969.0, "step": 3375 }, { "entropy": 5.561730909347534, "epoch": 0.32468780019212296, "grad_norm": 1.0078125, "learning_rate": 0.0004994089586805856, "loss": 5.4863, "mean_token_accuracy": 0.1827893927693367, "num_tokens": 7733762.0, "step": 3380 }, { "entropy": 5.549566268920898, "epoch": 0.3251681075888569, "grad_norm": 1.0859375, "learning_rate": 0.0004994064727530496, "loss": 5.4963, "mean_token_accuracy": 0.17758472561836242, "num_tokens": 7744614.0, "step": 3385 }, { "entropy": 5.498316717147827, "epoch": 0.3256484149855908, "grad_norm": 1.0, "learning_rate": 0.0004994039816154618, "loss": 5.4339, "mean_token_accuracy": 0.18473347425460815, "num_tokens": 7755799.0, "step": 3390 }, { "entropy": 5.455300903320312, "epoch": 0.3261287223823247, "grad_norm": 1.0078125, "learning_rate": 0.00049940148526788, "loss": 5.4848, "mean_token_accuracy": 0.18304541558027268, "num_tokens": 7768140.0, "step": 3395 }, { "entropy": 5.568225574493408, "epoch": 0.3266090297790586, "grad_norm": 1.125, "learning_rate": 0.0004993989837103618, "loss": 5.4898, "mean_token_accuracy": 0.1791609227657318, "num_tokens": 7778494.0, "step": 3400 }, { "entropy": 5.607134199142456, "epoch": 0.3270893371757925, "grad_norm": 1.0859375, "learning_rate": 0.0004993964769429657, "loss": 5.5675, "mean_token_accuracy": 0.18318891525268555, "num_tokens": 7789234.0, "step": 3405 }, { "entropy": 5.541140413284301, "epoch": 0.3275696445725264, "grad_norm": 0.9453125, "learning_rate": 0.0004993939649657498, "loss": 5.548, "mean_token_accuracy": 0.18319968730211258, "num_tokens": 7800602.0, "step": 3410 }, { "entropy": 5.469655227661133, "epoch": 0.32804995196926034, "grad_norm": 1.0078125, "learning_rate": 0.0004993914477787721, "loss": 5.3674, "mean_token_accuracy": 0.1912238970398903, "num_tokens": 7812803.0, "step": 3415 }, { "entropy": 5.625386571884155, "epoch": 0.3285302593659942, "grad_norm": 1.0703125, "learning_rate": 0.0004993889253820915, "loss": 5.6669, "mean_token_accuracy": 0.16849727183580399, "num_tokens": 7825432.0, "step": 3420 }, { "entropy": 5.567583656311035, "epoch": 0.32901056676272816, "grad_norm": 1.046875, "learning_rate": 0.0004993863977757663, "loss": 5.4819, "mean_token_accuracy": 0.18198901265859604, "num_tokens": 7837258.0, "step": 3425 }, { "entropy": 5.42762131690979, "epoch": 0.32949087415946204, "grad_norm": 1.046875, "learning_rate": 0.0004993838649598552, "loss": 5.3739, "mean_token_accuracy": 0.1897459015250206, "num_tokens": 7847573.0, "step": 3430 }, { "entropy": 5.551398038864136, "epoch": 0.329971181556196, "grad_norm": 1.0703125, "learning_rate": 0.0004993813269344171, "loss": 5.4969, "mean_token_accuracy": 0.17690201252698898, "num_tokens": 7857957.0, "step": 3435 }, { "entropy": 5.5013957023620605, "epoch": 0.33045148895292986, "grad_norm": 1.046875, "learning_rate": 0.0004993787836995108, "loss": 5.4174, "mean_token_accuracy": 0.1926833838224411, "num_tokens": 7867996.0, "step": 3440 }, { "entropy": 5.446499681472778, "epoch": 0.3309317963496638, "grad_norm": 1.0859375, "learning_rate": 0.0004993762352551954, "loss": 5.4766, "mean_token_accuracy": 0.1805843397974968, "num_tokens": 7879245.0, "step": 3445 }, { "entropy": 5.61943678855896, "epoch": 0.3314121037463977, "grad_norm": 1.09375, "learning_rate": 0.0004993736816015301, "loss": 5.5669, "mean_token_accuracy": 0.17582879960536957, "num_tokens": 7891186.0, "step": 3450 }, { "entropy": 5.609936046600342, "epoch": 0.3318924111431316, "grad_norm": 0.9609375, "learning_rate": 0.0004993711227385742, "loss": 5.5802, "mean_token_accuracy": 0.1823540985584259, "num_tokens": 7902231.0, "step": 3455 }, { "entropy": 5.523345851898194, "epoch": 0.3323727185398655, "grad_norm": 1.125, "learning_rate": 0.0004993685586663871, "loss": 5.5412, "mean_token_accuracy": 0.18139662891626357, "num_tokens": 7913364.0, "step": 3460 }, { "entropy": 5.735165405273437, "epoch": 0.33285302593659943, "grad_norm": 1.046875, "learning_rate": 0.0004993659893850281, "loss": 5.7308, "mean_token_accuracy": 0.16727230101823806, "num_tokens": 7925217.0, "step": 3465 }, { "entropy": 5.506084823608399, "epoch": 0.3333333333333333, "grad_norm": 0.92578125, "learning_rate": 0.0004993634148945573, "loss": 5.4639, "mean_token_accuracy": 0.17894653379917144, "num_tokens": 7937636.0, "step": 3470 }, { "entropy": 5.5272363185882565, "epoch": 0.33381364073006725, "grad_norm": 1.03125, "learning_rate": 0.0004993608351950341, "loss": 5.4896, "mean_token_accuracy": 0.17503666803240775, "num_tokens": 7948958.0, "step": 3475 }, { "entropy": 5.620566320419312, "epoch": 0.33429394812680113, "grad_norm": 1.0625, "learning_rate": 0.0004993582502865185, "loss": 5.5323, "mean_token_accuracy": 0.18402974754571916, "num_tokens": 7960013.0, "step": 3480 }, { "entropy": 5.462809419631958, "epoch": 0.33477425552353507, "grad_norm": 1.046875, "learning_rate": 0.0004993556601690706, "loss": 5.5416, "mean_token_accuracy": 0.17792800366878508, "num_tokens": 7971041.0, "step": 3485 }, { "entropy": 5.618744802474976, "epoch": 0.33525456292026895, "grad_norm": 1.03125, "learning_rate": 0.0004993530648427505, "loss": 5.576, "mean_token_accuracy": 0.1723045140504837, "num_tokens": 7982752.0, "step": 3490 }, { "entropy": 5.599891996383667, "epoch": 0.3357348703170029, "grad_norm": 1.0390625, "learning_rate": 0.0004993504643076184, "loss": 5.4278, "mean_token_accuracy": 0.18250093311071397, "num_tokens": 7993681.0, "step": 3495 }, { "entropy": 5.470984411239624, "epoch": 0.33621517771373677, "grad_norm": 1.0703125, "learning_rate": 0.0004993478585637347, "loss": 5.4781, "mean_token_accuracy": 0.18258391320705414, "num_tokens": 8004727.0, "step": 3500 }, { "entropy": 5.505999660491943, "epoch": 0.3366954851104707, "grad_norm": 1.0234375, "learning_rate": 0.0004993452476111599, "loss": 5.4797, "mean_token_accuracy": 0.18967788219451903, "num_tokens": 8015423.0, "step": 3505 }, { "entropy": 5.512713193893433, "epoch": 0.3371757925072046, "grad_norm": 0.9140625, "learning_rate": 0.0004993426314499546, "loss": 5.4536, "mean_token_accuracy": 0.18748492896556854, "num_tokens": 8027911.0, "step": 3510 }, { "entropy": 5.572777605056762, "epoch": 0.3376560999039385, "grad_norm": 0.96484375, "learning_rate": 0.0004993400100801796, "loss": 5.4747, "mean_token_accuracy": 0.1818804770708084, "num_tokens": 8038831.0, "step": 3515 }, { "entropy": 5.392134952545166, "epoch": 0.3381364073006724, "grad_norm": 1.0703125, "learning_rate": 0.0004993373835018956, "loss": 5.3718, "mean_token_accuracy": 0.18957587629556655, "num_tokens": 8049906.0, "step": 3520 }, { "entropy": 5.393214273452759, "epoch": 0.33861671469740634, "grad_norm": 1.0078125, "learning_rate": 0.0004993347517151638, "loss": 5.469, "mean_token_accuracy": 0.18386447727680205, "num_tokens": 8061158.0, "step": 3525 }, { "entropy": 5.6083544254302975, "epoch": 0.3390970220941403, "grad_norm": 1.0859375, "learning_rate": 0.0004993321147200452, "loss": 5.4326, "mean_token_accuracy": 0.181746444106102, "num_tokens": 8071958.0, "step": 3530 }, { "entropy": 5.465584182739258, "epoch": 0.33957732949087416, "grad_norm": 0.99609375, "learning_rate": 0.000499329472516601, "loss": 5.4294, "mean_token_accuracy": 0.17608542144298553, "num_tokens": 8084068.0, "step": 3535 }, { "entropy": 5.410733461380005, "epoch": 0.3400576368876081, "grad_norm": 1.015625, "learning_rate": 0.0004993268251048925, "loss": 5.3472, "mean_token_accuracy": 0.19578494429588317, "num_tokens": 8096132.0, "step": 3540 }, { "entropy": 5.503920364379883, "epoch": 0.340537944284342, "grad_norm": 1.0, "learning_rate": 0.0004993241724849814, "loss": 5.5102, "mean_token_accuracy": 0.18362511545419694, "num_tokens": 8107327.0, "step": 3545 }, { "entropy": 5.497963953018188, "epoch": 0.3410182516810759, "grad_norm": 1.015625, "learning_rate": 0.000499321514656929, "loss": 5.4779, "mean_token_accuracy": 0.18374822586774825, "num_tokens": 8118584.0, "step": 3550 }, { "entropy": 5.550964641571045, "epoch": 0.3414985590778098, "grad_norm": 1.03125, "learning_rate": 0.0004993188516207972, "loss": 5.5337, "mean_token_accuracy": 0.1793607845902443, "num_tokens": 8130081.0, "step": 3555 }, { "entropy": 5.507245492935181, "epoch": 0.34197886647454373, "grad_norm": 1.0546875, "learning_rate": 0.0004993161833766478, "loss": 5.4932, "mean_token_accuracy": 0.1838148668408394, "num_tokens": 8141463.0, "step": 3560 }, { "entropy": 5.541257572174072, "epoch": 0.3424591738712776, "grad_norm": 1.15625, "learning_rate": 0.0004993135099245426, "loss": 5.5042, "mean_token_accuracy": 0.17985130697488785, "num_tokens": 8153863.0, "step": 3565 }, { "entropy": 5.428792333602905, "epoch": 0.34293948126801155, "grad_norm": 1.140625, "learning_rate": 0.0004993108312645438, "loss": 5.463, "mean_token_accuracy": 0.18102106750011443, "num_tokens": 8165695.0, "step": 3570 }, { "entropy": 5.5374926090240475, "epoch": 0.34341978866474543, "grad_norm": 1.0546875, "learning_rate": 0.0004993081473967135, "loss": 5.5119, "mean_token_accuracy": 0.18098655641078948, "num_tokens": 8176456.0, "step": 3575 }, { "entropy": 5.58543210029602, "epoch": 0.34390009606147937, "grad_norm": 1.0390625, "learning_rate": 0.0004993054583211143, "loss": 5.5092, "mean_token_accuracy": 0.1822955548763275, "num_tokens": 8189050.0, "step": 3580 }, { "entropy": 5.43015308380127, "epoch": 0.34438040345821325, "grad_norm": 0.94921875, "learning_rate": 0.0004993027640378081, "loss": 5.4081, "mean_token_accuracy": 0.185765840113163, "num_tokens": 8200011.0, "step": 3585 }, { "entropy": 5.474026918411255, "epoch": 0.3448607108549472, "grad_norm": 1.0625, "learning_rate": 0.000499300064546858, "loss": 5.4183, "mean_token_accuracy": 0.1868817389011383, "num_tokens": 8211770.0, "step": 3590 }, { "entropy": 5.55191330909729, "epoch": 0.34534101825168106, "grad_norm": 1.140625, "learning_rate": 0.0004992973598483264, "loss": 5.4638, "mean_token_accuracy": 0.18688549250364303, "num_tokens": 8223582.0, "step": 3595 }, { "entropy": 5.575275611877442, "epoch": 0.345821325648415, "grad_norm": 1.109375, "learning_rate": 0.000499294649942276, "loss": 5.5846, "mean_token_accuracy": 0.1825041502714157, "num_tokens": 8234336.0, "step": 3600 }, { "entropy": 5.547464847564697, "epoch": 0.3463016330451489, "grad_norm": 1.0859375, "learning_rate": 0.0004992919348287699, "loss": 5.4941, "mean_token_accuracy": 0.18366153985261918, "num_tokens": 8244605.0, "step": 3605 }, { "entropy": 5.5259942531585695, "epoch": 0.3467819404418828, "grad_norm": 0.98046875, "learning_rate": 0.0004992892145078711, "loss": 5.5254, "mean_token_accuracy": 0.17931086868047713, "num_tokens": 8255876.0, "step": 3610 }, { "entropy": 5.4697678565979, "epoch": 0.3472622478386167, "grad_norm": 1.0703125, "learning_rate": 0.0004992864889796427, "loss": 5.4174, "mean_token_accuracy": 0.18721913993358613, "num_tokens": 8266602.0, "step": 3615 }, { "entropy": 5.546818780899048, "epoch": 0.34774255523535064, "grad_norm": 0.9609375, "learning_rate": 0.0004992837582441481, "loss": 5.4216, "mean_token_accuracy": 0.18347607105970382, "num_tokens": 8279804.0, "step": 3620 }, { "entropy": 5.569514989852905, "epoch": 0.3482228626320845, "grad_norm": 1.03125, "learning_rate": 0.0004992810223014506, "loss": 5.5242, "mean_token_accuracy": 0.1833881989121437, "num_tokens": 8291020.0, "step": 3625 }, { "entropy": 5.5203827857971195, "epoch": 0.34870317002881845, "grad_norm": 1.0390625, "learning_rate": 0.0004992782811516137, "loss": 5.4727, "mean_token_accuracy": 0.18729409873485564, "num_tokens": 8302192.0, "step": 3630 }, { "entropy": 5.496627855300903, "epoch": 0.34918347742555234, "grad_norm": 0.96484375, "learning_rate": 0.0004992755347947011, "loss": 5.4324, "mean_token_accuracy": 0.18265776634216307, "num_tokens": 8313649.0, "step": 3635 }, { "entropy": 5.44870662689209, "epoch": 0.34966378482228627, "grad_norm": 1.0234375, "learning_rate": 0.0004992727832307766, "loss": 5.4304, "mean_token_accuracy": 0.18587879687547684, "num_tokens": 8324694.0, "step": 3640 }, { "entropy": 5.604543972015381, "epoch": 0.35014409221902015, "grad_norm": 1.1015625, "learning_rate": 0.0004992700264599039, "loss": 5.594, "mean_token_accuracy": 0.1727964922785759, "num_tokens": 8336517.0, "step": 3645 }, { "entropy": 5.540855789184571, "epoch": 0.3506243996157541, "grad_norm": 0.98046875, "learning_rate": 0.0004992672644821473, "loss": 5.5425, "mean_token_accuracy": 0.1779757022857666, "num_tokens": 8349001.0, "step": 3650 }, { "entropy": 5.5626523971557615, "epoch": 0.35110470701248797, "grad_norm": 1.046875, "learning_rate": 0.0004992644972975707, "loss": 5.4537, "mean_token_accuracy": 0.1864044651389122, "num_tokens": 8361230.0, "step": 3655 }, { "entropy": 5.394788694381714, "epoch": 0.3515850144092219, "grad_norm": 1.015625, "learning_rate": 0.0004992617249062383, "loss": 5.3924, "mean_token_accuracy": 0.19216873198747636, "num_tokens": 8372159.0, "step": 3660 }, { "entropy": 5.543751049041748, "epoch": 0.3520653218059558, "grad_norm": 1.0546875, "learning_rate": 0.0004992589473082147, "loss": 5.5214, "mean_token_accuracy": 0.18608528524637222, "num_tokens": 8383228.0, "step": 3665 }, { "entropy": 5.509809923171997, "epoch": 0.3525456292026897, "grad_norm": 1.0625, "learning_rate": 0.0004992561645035641, "loss": 5.4561, "mean_token_accuracy": 0.18168068826198577, "num_tokens": 8394582.0, "step": 3670 }, { "entropy": 5.514116191864014, "epoch": 0.3530259365994236, "grad_norm": 0.9921875, "learning_rate": 0.0004992533764923515, "loss": 5.4481, "mean_token_accuracy": 0.18126334249973297, "num_tokens": 8406784.0, "step": 3675 }, { "entropy": 5.483726072311401, "epoch": 0.35350624399615754, "grad_norm": 1.015625, "learning_rate": 0.0004992505832746412, "loss": 5.4286, "mean_token_accuracy": 0.19101243019104003, "num_tokens": 8418405.0, "step": 3680 }, { "entropy": 5.5265562534332275, "epoch": 0.3539865513928914, "grad_norm": 1.0078125, "learning_rate": 0.0004992477848504983, "loss": 5.392, "mean_token_accuracy": 0.18716304898262023, "num_tokens": 8430432.0, "step": 3685 }, { "entropy": 5.479315328598022, "epoch": 0.35446685878962536, "grad_norm": 0.9375, "learning_rate": 0.0004992449812199877, "loss": 5.5635, "mean_token_accuracy": 0.17799893915653228, "num_tokens": 8442423.0, "step": 3690 }, { "entropy": 5.518668079376221, "epoch": 0.3549471661863593, "grad_norm": 0.99609375, "learning_rate": 0.0004992421723831745, "loss": 5.546, "mean_token_accuracy": 0.1842621758580208, "num_tokens": 8454951.0, "step": 3695 }, { "entropy": 5.520323848724365, "epoch": 0.3554274735830932, "grad_norm": 0.97265625, "learning_rate": 0.0004992393583401239, "loss": 5.4033, "mean_token_accuracy": 0.18851898312568666, "num_tokens": 8467758.0, "step": 3700 }, { "entropy": 5.475191354751587, "epoch": 0.3559077809798271, "grad_norm": 1.046875, "learning_rate": 0.0004992365390909014, "loss": 5.4854, "mean_token_accuracy": 0.17992179691791535, "num_tokens": 8479728.0, "step": 3705 }, { "entropy": 5.535838651657104, "epoch": 0.356388088376561, "grad_norm": 1.0078125, "learning_rate": 0.0004992337146355721, "loss": 5.552, "mean_token_accuracy": 0.17727553099393845, "num_tokens": 8492099.0, "step": 3710 }, { "entropy": 5.610863542556762, "epoch": 0.35686839577329493, "grad_norm": 1.015625, "learning_rate": 0.0004992308849742019, "loss": 5.4819, "mean_token_accuracy": 0.17355056405067443, "num_tokens": 8504657.0, "step": 3715 }, { "entropy": 5.48232364654541, "epoch": 0.3573487031700288, "grad_norm": 1.0390625, "learning_rate": 0.0004992280501068563, "loss": 5.4509, "mean_token_accuracy": 0.18914830237627028, "num_tokens": 8514728.0, "step": 3720 }, { "entropy": 5.528886175155639, "epoch": 0.35782901056676275, "grad_norm": 1.09375, "learning_rate": 0.0004992252100336012, "loss": 5.581, "mean_token_accuracy": 0.1833130970597267, "num_tokens": 8525588.0, "step": 3725 }, { "entropy": 5.540911626815796, "epoch": 0.35830931796349663, "grad_norm": 1.125, "learning_rate": 0.0004992223647545027, "loss": 5.527, "mean_token_accuracy": 0.18297800421714783, "num_tokens": 8537468.0, "step": 3730 }, { "entropy": 5.5527503490448, "epoch": 0.35878962536023057, "grad_norm": 0.99609375, "learning_rate": 0.0004992195142696266, "loss": 5.438, "mean_token_accuracy": 0.18914629518985748, "num_tokens": 8548598.0, "step": 3735 }, { "entropy": 5.33068585395813, "epoch": 0.35926993275696445, "grad_norm": 1.0078125, "learning_rate": 0.0004992166585790391, "loss": 5.3396, "mean_token_accuracy": 0.19562919437885284, "num_tokens": 8560301.0, "step": 3740 }, { "entropy": 5.483434391021729, "epoch": 0.3597502401536984, "grad_norm": 1.1171875, "learning_rate": 0.0004992137976828067, "loss": 5.4516, "mean_token_accuracy": 0.18603197634220123, "num_tokens": 8571186.0, "step": 3745 }, { "entropy": 5.484015607833863, "epoch": 0.36023054755043227, "grad_norm": 1.2734375, "learning_rate": 0.0004992109315809955, "loss": 5.4383, "mean_token_accuracy": 0.18905191421508788, "num_tokens": 8580725.0, "step": 3750 }, { "entropy": 5.519361686706543, "epoch": 0.3607108549471662, "grad_norm": 0.93359375, "learning_rate": 0.0004992080602736725, "loss": 5.5532, "mean_token_accuracy": 0.1773756206035614, "num_tokens": 8594598.0, "step": 3755 }, { "entropy": 5.643574905395508, "epoch": 0.3611911623439001, "grad_norm": 0.98828125, "learning_rate": 0.0004992051837609039, "loss": 5.5404, "mean_token_accuracy": 0.17730522602796556, "num_tokens": 8606733.0, "step": 3760 }, { "entropy": 5.508514451980591, "epoch": 0.361671469740634, "grad_norm": 1.015625, "learning_rate": 0.0004992023020427568, "loss": 5.4788, "mean_token_accuracy": 0.18672696501016617, "num_tokens": 8618863.0, "step": 3765 }, { "entropy": 5.3892511367797855, "epoch": 0.3621517771373679, "grad_norm": 1.0390625, "learning_rate": 0.0004991994151192979, "loss": 5.3304, "mean_token_accuracy": 0.18849435597658157, "num_tokens": 8629270.0, "step": 3770 }, { "entropy": 5.4767759323120115, "epoch": 0.36263208453410184, "grad_norm": 1.0546875, "learning_rate": 0.0004991965229905943, "loss": 5.5364, "mean_token_accuracy": 0.18494855612516403, "num_tokens": 8641363.0, "step": 3775 }, { "entropy": 5.6278270244598385, "epoch": 0.3631123919308357, "grad_norm": 1.0625, "learning_rate": 0.0004991936256567133, "loss": 5.4992, "mean_token_accuracy": 0.18451761305332184, "num_tokens": 8653233.0, "step": 3780 }, { "entropy": 5.4851010799407955, "epoch": 0.36359269932756966, "grad_norm": 0.91015625, "learning_rate": 0.000499190723117722, "loss": 5.487, "mean_token_accuracy": 0.17836329340934753, "num_tokens": 8665192.0, "step": 3785 }, { "entropy": 5.579302835464477, "epoch": 0.36407300672430354, "grad_norm": 1.0859375, "learning_rate": 0.0004991878153736877, "loss": 5.5583, "mean_token_accuracy": 0.17446503937244415, "num_tokens": 8677669.0, "step": 3790 }, { "entropy": 5.419927787780762, "epoch": 0.3645533141210375, "grad_norm": 1.0859375, "learning_rate": 0.0004991849024246781, "loss": 5.3676, "mean_token_accuracy": 0.18973670154809952, "num_tokens": 8688002.0, "step": 3795 }, { "entropy": 5.438193988800049, "epoch": 0.36503362151777136, "grad_norm": 1.03125, "learning_rate": 0.0004991819842707608, "loss": 5.4133, "mean_token_accuracy": 0.18962489068508148, "num_tokens": 8698396.0, "step": 3800 }, { "entropy": 5.543167686462402, "epoch": 0.3655139289145053, "grad_norm": 1.0546875, "learning_rate": 0.0004991790609120035, "loss": 5.4297, "mean_token_accuracy": 0.18700562715530394, "num_tokens": 8711135.0, "step": 3805 }, { "entropy": 5.469641494750976, "epoch": 0.3659942363112392, "grad_norm": 1.0390625, "learning_rate": 0.000499176132348474, "loss": 5.4735, "mean_token_accuracy": 0.1897922232747078, "num_tokens": 8723707.0, "step": 3810 }, { "entropy": 5.582857084274292, "epoch": 0.3664745437079731, "grad_norm": 1.1328125, "learning_rate": 0.0004991731985802405, "loss": 5.4338, "mean_token_accuracy": 0.18693850934505463, "num_tokens": 8734193.0, "step": 3815 }, { "entropy": 5.444149160385132, "epoch": 0.366954851104707, "grad_norm": 1.1171875, "learning_rate": 0.0004991702596073708, "loss": 5.4841, "mean_token_accuracy": 0.18134361505508423, "num_tokens": 8745619.0, "step": 3820 }, { "entropy": 5.426347923278809, "epoch": 0.36743515850144093, "grad_norm": 1.015625, "learning_rate": 0.0004991673154299335, "loss": 5.4231, "mean_token_accuracy": 0.18122087568044662, "num_tokens": 8757331.0, "step": 3825 }, { "entropy": 5.515204238891601, "epoch": 0.3679154658981748, "grad_norm": 0.98046875, "learning_rate": 0.0004991643660479967, "loss": 5.428, "mean_token_accuracy": 0.1868494287133217, "num_tokens": 8768840.0, "step": 3830 }, { "entropy": 5.460073804855346, "epoch": 0.36839577329490875, "grad_norm": 1.0, "learning_rate": 0.0004991614114616289, "loss": 5.3818, "mean_token_accuracy": 0.18779707103967666, "num_tokens": 8781214.0, "step": 3835 }, { "entropy": 5.510246324539184, "epoch": 0.3688760806916426, "grad_norm": 0.98046875, "learning_rate": 0.0004991584516708988, "loss": 5.4477, "mean_token_accuracy": 0.18548956960439683, "num_tokens": 8791645.0, "step": 3840 }, { "entropy": 5.5942995071411135, "epoch": 0.36935638808837656, "grad_norm": 1.078125, "learning_rate": 0.0004991554866758751, "loss": 5.6333, "mean_token_accuracy": 0.1739022307097912, "num_tokens": 8803286.0, "step": 3845 }, { "entropy": 5.493673467636109, "epoch": 0.36983669548511044, "grad_norm": 1.1015625, "learning_rate": 0.0004991525164766265, "loss": 5.4163, "mean_token_accuracy": 0.1872221603989601, "num_tokens": 8814207.0, "step": 3850 }, { "entropy": 5.503255462646484, "epoch": 0.3703170028818444, "grad_norm": 0.9609375, "learning_rate": 0.0004991495410732222, "loss": 5.4683, "mean_token_accuracy": 0.17725101560354234, "num_tokens": 8825540.0, "step": 3855 }, { "entropy": 5.5069482803344725, "epoch": 0.37079731027857826, "grad_norm": 1.015625, "learning_rate": 0.0004991465604657311, "loss": 5.5937, "mean_token_accuracy": 0.17322031259536744, "num_tokens": 8838182.0, "step": 3860 }, { "entropy": 5.526088094711303, "epoch": 0.3712776176753122, "grad_norm": 1.015625, "learning_rate": 0.0004991435746542224, "loss": 5.4654, "mean_token_accuracy": 0.18988653868436814, "num_tokens": 8850211.0, "step": 3865 }, { "entropy": 5.439452648162842, "epoch": 0.37175792507204614, "grad_norm": 0.97265625, "learning_rate": 0.0004991405836387655, "loss": 5.5032, "mean_token_accuracy": 0.18108827471733094, "num_tokens": 8862804.0, "step": 3870 }, { "entropy": 5.529762125015258, "epoch": 0.37223823246878, "grad_norm": 1.078125, "learning_rate": 0.0004991375874194298, "loss": 5.4602, "mean_token_accuracy": 0.17960784435272217, "num_tokens": 8874112.0, "step": 3875 }, { "entropy": 5.469674205780029, "epoch": 0.37271853986551395, "grad_norm": 1.03125, "learning_rate": 0.000499134585996285, "loss": 5.477, "mean_token_accuracy": 0.18614101260900498, "num_tokens": 8885114.0, "step": 3880 }, { "entropy": 5.554774141311645, "epoch": 0.37319884726224783, "grad_norm": 1.046875, "learning_rate": 0.0004991315793694004, "loss": 5.3691, "mean_token_accuracy": 0.18807282894849778, "num_tokens": 8895555.0, "step": 3885 }, { "entropy": 5.405085754394531, "epoch": 0.37367915465898177, "grad_norm": 1.0234375, "learning_rate": 0.0004991285675388463, "loss": 5.3765, "mean_token_accuracy": 0.19634046405553818, "num_tokens": 8906073.0, "step": 3890 }, { "entropy": 5.501630163192749, "epoch": 0.37415946205571565, "grad_norm": 1.125, "learning_rate": 0.0004991255505046922, "loss": 5.5188, "mean_token_accuracy": 0.1789945885539055, "num_tokens": 8916587.0, "step": 3895 }, { "entropy": 5.550557231903076, "epoch": 0.3746397694524496, "grad_norm": 1.0546875, "learning_rate": 0.0004991225282670083, "loss": 5.4113, "mean_token_accuracy": 0.1861289381980896, "num_tokens": 8927923.0, "step": 3900 }, { "entropy": 5.382868242263794, "epoch": 0.37512007684918347, "grad_norm": 1.078125, "learning_rate": 0.000499119500825865, "loss": 5.4579, "mean_token_accuracy": 0.18377629071474075, "num_tokens": 8939939.0, "step": 3905 }, { "entropy": 5.397466945648193, "epoch": 0.3756003842459174, "grad_norm": 1.09375, "learning_rate": 0.0004991164681813323, "loss": 5.4378, "mean_token_accuracy": 0.19209783971309663, "num_tokens": 8951748.0, "step": 3910 }, { "entropy": 5.485667037963867, "epoch": 0.3760806916426513, "grad_norm": 1.015625, "learning_rate": 0.0004991134303334807, "loss": 5.3588, "mean_token_accuracy": 0.19007459729909898, "num_tokens": 8962922.0, "step": 3915 }, { "entropy": 5.372178030014038, "epoch": 0.3765609990393852, "grad_norm": 1.03125, "learning_rate": 0.0004991103872823807, "loss": 5.3442, "mean_token_accuracy": 0.19452154785394668, "num_tokens": 8974013.0, "step": 3920 }, { "entropy": 5.436591958999633, "epoch": 0.3770413064361191, "grad_norm": 1.015625, "learning_rate": 0.000499107339028103, "loss": 5.4262, "mean_token_accuracy": 0.18169266134500503, "num_tokens": 8986032.0, "step": 3925 }, { "entropy": 5.542058515548706, "epoch": 0.37752161383285304, "grad_norm": 1.0546875, "learning_rate": 0.0004991042855707184, "loss": 5.4187, "mean_token_accuracy": 0.1796349912881851, "num_tokens": 8996889.0, "step": 3930 }, { "entropy": 5.436617517471314, "epoch": 0.3780019212295869, "grad_norm": 1.0546875, "learning_rate": 0.0004991012269102977, "loss": 5.3992, "mean_token_accuracy": 0.18429471999406816, "num_tokens": 9007594.0, "step": 3935 }, { "entropy": 5.426474618911743, "epoch": 0.37848222862632086, "grad_norm": 1.0859375, "learning_rate": 0.0004990981630469119, "loss": 5.402, "mean_token_accuracy": 0.18193352967500687, "num_tokens": 9018097.0, "step": 3940 }, { "entropy": 5.5093968391418455, "epoch": 0.37896253602305474, "grad_norm": 1.0234375, "learning_rate": 0.0004990950939806323, "loss": 5.5113, "mean_token_accuracy": 0.18117111474275588, "num_tokens": 9029554.0, "step": 3945 }, { "entropy": 5.489337825775147, "epoch": 0.3794428434197887, "grad_norm": 0.99609375, "learning_rate": 0.00049909201971153, "loss": 5.3772, "mean_token_accuracy": 0.1829820305109024, "num_tokens": 9042518.0, "step": 3950 }, { "entropy": 5.421378660202026, "epoch": 0.37992315081652256, "grad_norm": 1.1015625, "learning_rate": 0.0004990889402396763, "loss": 5.4316, "mean_token_accuracy": 0.18639881759881974, "num_tokens": 9054524.0, "step": 3955 }, { "entropy": 5.510490798950196, "epoch": 0.3804034582132565, "grad_norm": 0.9921875, "learning_rate": 0.0004990858555651431, "loss": 5.4016, "mean_token_accuracy": 0.18468015938997268, "num_tokens": 9065375.0, "step": 3960 }, { "entropy": 5.44808177947998, "epoch": 0.3808837656099904, "grad_norm": 1.109375, "learning_rate": 0.0004990827656880015, "loss": 5.3509, "mean_token_accuracy": 0.1859322890639305, "num_tokens": 9076338.0, "step": 3965 }, { "entropy": 5.432799911499023, "epoch": 0.3813640730067243, "grad_norm": 0.96875, "learning_rate": 0.0004990796706083235, "loss": 5.4011, "mean_token_accuracy": 0.18659975230693818, "num_tokens": 9088407.0, "step": 3970 }, { "entropy": 5.426470470428467, "epoch": 0.3818443804034582, "grad_norm": 1.03125, "learning_rate": 0.0004990765703261809, "loss": 5.3649, "mean_token_accuracy": 0.18807975053787232, "num_tokens": 9099833.0, "step": 3975 }, { "entropy": 5.350304222106933, "epoch": 0.38232468780019213, "grad_norm": 0.98046875, "learning_rate": 0.0004990734648416458, "loss": 5.3388, "mean_token_accuracy": 0.189335997402668, "num_tokens": 9111126.0, "step": 3980 }, { "entropy": 5.505539417266846, "epoch": 0.382804995196926, "grad_norm": 1.09375, "learning_rate": 0.0004990703541547901, "loss": 5.4548, "mean_token_accuracy": 0.1886373370885849, "num_tokens": 9121979.0, "step": 3985 }, { "entropy": 5.520917081832886, "epoch": 0.38328530259365995, "grad_norm": 1.109375, "learning_rate": 0.0004990672382656863, "loss": 5.4535, "mean_token_accuracy": 0.18644375950098038, "num_tokens": 9132929.0, "step": 3990 }, { "entropy": 5.485851383209228, "epoch": 0.38376560999039383, "grad_norm": 1.0, "learning_rate": 0.0004990641171744064, "loss": 5.4111, "mean_token_accuracy": 0.1882080391049385, "num_tokens": 9143903.0, "step": 3995 }, { "entropy": 5.495297384262085, "epoch": 0.38424591738712777, "grad_norm": 1.1484375, "learning_rate": 0.0004990609908810231, "loss": 5.5045, "mean_token_accuracy": 0.18192221075296403, "num_tokens": 9154416.0, "step": 4000 }, { "entropy": 5.513756942749024, "epoch": 0.38472622478386165, "grad_norm": 1.078125, "learning_rate": 0.0004990578593856089, "loss": 5.4805, "mean_token_accuracy": 0.18242392241954802, "num_tokens": 9165613.0, "step": 4005 }, { "entropy": 5.4664655208587645, "epoch": 0.3852065321805956, "grad_norm": 0.99609375, "learning_rate": 0.0004990547226882366, "loss": 5.433, "mean_token_accuracy": 0.18787842243909836, "num_tokens": 9177884.0, "step": 4010 }, { "entropy": 5.5449103832244875, "epoch": 0.38568683957732947, "grad_norm": 1.015625, "learning_rate": 0.0004990515807889788, "loss": 5.5669, "mean_token_accuracy": 0.17467134743928908, "num_tokens": 9190041.0, "step": 4015 }, { "entropy": 5.556881046295166, "epoch": 0.3861671469740634, "grad_norm": 1.0546875, "learning_rate": 0.0004990484336879087, "loss": 5.4402, "mean_token_accuracy": 0.18740091025829314, "num_tokens": 9202390.0, "step": 4020 }, { "entropy": 5.409300327301025, "epoch": 0.3866474543707973, "grad_norm": 1.09375, "learning_rate": 0.0004990452813850992, "loss": 5.4373, "mean_token_accuracy": 0.18635576069355012, "num_tokens": 9213437.0, "step": 4025 }, { "entropy": 5.554971408843994, "epoch": 0.3871277617675312, "grad_norm": 0.9765625, "learning_rate": 0.0004990421238806236, "loss": 5.517, "mean_token_accuracy": 0.17564513981342317, "num_tokens": 9226310.0, "step": 4030 }, { "entropy": 5.530429458618164, "epoch": 0.38760806916426516, "grad_norm": 1.0703125, "learning_rate": 0.0004990389611745551, "loss": 5.4495, "mean_token_accuracy": 0.1819504901766777, "num_tokens": 9236271.0, "step": 4035 }, { "entropy": 5.516104078292846, "epoch": 0.38808837656099904, "grad_norm": 1.1171875, "learning_rate": 0.0004990357932669672, "loss": 5.5245, "mean_token_accuracy": 0.18500009030103684, "num_tokens": 9247755.0, "step": 4040 }, { "entropy": 5.464123487472534, "epoch": 0.388568683957733, "grad_norm": 1.0703125, "learning_rate": 0.0004990326201579335, "loss": 5.361, "mean_token_accuracy": 0.19129124879837037, "num_tokens": 9259821.0, "step": 4045 }, { "entropy": 5.4668073654174805, "epoch": 0.38904899135446686, "grad_norm": 1.171875, "learning_rate": 0.0004990294418475274, "loss": 5.4631, "mean_token_accuracy": 0.18641964942216874, "num_tokens": 9270663.0, "step": 4050 }, { "entropy": 5.465627670288086, "epoch": 0.3895292987512008, "grad_norm": 1.234375, "learning_rate": 0.0004990262583358231, "loss": 5.4879, "mean_token_accuracy": 0.17998379915952684, "num_tokens": 9282588.0, "step": 4055 }, { "entropy": 5.510502290725708, "epoch": 0.3900096061479347, "grad_norm": 1.1015625, "learning_rate": 0.0004990230696228943, "loss": 5.4397, "mean_token_accuracy": 0.17829088270664215, "num_tokens": 9293368.0, "step": 4060 }, { "entropy": 5.477728748321534, "epoch": 0.3904899135446686, "grad_norm": 1.0234375, "learning_rate": 0.0004990198757088149, "loss": 5.5128, "mean_token_accuracy": 0.1811017781496048, "num_tokens": 9305962.0, "step": 4065 }, { "entropy": 5.508330774307251, "epoch": 0.3909702209414025, "grad_norm": 1.0625, "learning_rate": 0.0004990166765936593, "loss": 5.393, "mean_token_accuracy": 0.19244694262742995, "num_tokens": 9317955.0, "step": 4070 }, { "entropy": 5.450256824493408, "epoch": 0.3914505283381364, "grad_norm": 1.0078125, "learning_rate": 0.0004990134722775016, "loss": 5.3934, "mean_token_accuracy": 0.19047792106866837, "num_tokens": 9329491.0, "step": 4075 }, { "entropy": 5.451663637161255, "epoch": 0.3919308357348703, "grad_norm": 1.03125, "learning_rate": 0.0004990102627604162, "loss": 5.5273, "mean_token_accuracy": 0.19028781056404115, "num_tokens": 9341612.0, "step": 4080 }, { "entropy": 5.524235773086548, "epoch": 0.39241114313160425, "grad_norm": 1.046875, "learning_rate": 0.0004990070480424778, "loss": 5.458, "mean_token_accuracy": 0.18043633103370665, "num_tokens": 9352302.0, "step": 4085 }, { "entropy": 5.440912199020386, "epoch": 0.3928914505283381, "grad_norm": 0.97265625, "learning_rate": 0.0004990038281237608, "loss": 5.3919, "mean_token_accuracy": 0.1852226436138153, "num_tokens": 9363303.0, "step": 4090 }, { "entropy": 5.433840227127075, "epoch": 0.39337175792507206, "grad_norm": 1.0546875, "learning_rate": 0.0004990006030043401, "loss": 5.3732, "mean_token_accuracy": 0.1849522888660431, "num_tokens": 9375878.0, "step": 4095 }, { "entropy": 5.470492124557495, "epoch": 0.39385206532180594, "grad_norm": 1.0625, "learning_rate": 0.0004989973726842906, "loss": 5.4145, "mean_token_accuracy": 0.18103147149086, "num_tokens": 9388342.0, "step": 4100 }, { "entropy": 5.44459342956543, "epoch": 0.3943323727185399, "grad_norm": 1.1875, "learning_rate": 0.0004989941371636872, "loss": 5.3549, "mean_token_accuracy": 0.1901955187320709, "num_tokens": 9399047.0, "step": 4105 }, { "entropy": 5.449139881134033, "epoch": 0.39481268011527376, "grad_norm": 1.0703125, "learning_rate": 0.0004989908964426051, "loss": 5.4342, "mean_token_accuracy": 0.18933464139699935, "num_tokens": 9410172.0, "step": 4110 }, { "entropy": 5.547493505477905, "epoch": 0.3952929875120077, "grad_norm": 1.0703125, "learning_rate": 0.0004989876505211194, "loss": 5.5794, "mean_token_accuracy": 0.17717085629701615, "num_tokens": 9422287.0, "step": 4115 }, { "entropy": 5.5754584789276125, "epoch": 0.3957732949087416, "grad_norm": 1.046875, "learning_rate": 0.0004989843993993056, "loss": 5.44, "mean_token_accuracy": 0.18759053498506545, "num_tokens": 9433709.0, "step": 4120 }, { "entropy": 5.341240167617798, "epoch": 0.3962536023054755, "grad_norm": 1.125, "learning_rate": 0.0004989811430772392, "loss": 5.3199, "mean_token_accuracy": 0.189169280230999, "num_tokens": 9445138.0, "step": 4125 }, { "entropy": 5.4137170791625975, "epoch": 0.3967339097022094, "grad_norm": 1.1484375, "learning_rate": 0.0004989778815549957, "loss": 5.4579, "mean_token_accuracy": 0.1827932521700859, "num_tokens": 9455263.0, "step": 4130 }, { "entropy": 5.533003664016723, "epoch": 0.39721421709894333, "grad_norm": 1.15625, "learning_rate": 0.0004989746148326508, "loss": 5.4184, "mean_token_accuracy": 0.18644048422574996, "num_tokens": 9465491.0, "step": 4135 }, { "entropy": 5.372505331039429, "epoch": 0.3976945244956772, "grad_norm": 0.984375, "learning_rate": 0.0004989713429102805, "loss": 5.3821, "mean_token_accuracy": 0.1837732046842575, "num_tokens": 9477601.0, "step": 4140 }, { "entropy": 5.426533985137939, "epoch": 0.39817483189241115, "grad_norm": 1.0, "learning_rate": 0.0004989680657879607, "loss": 5.4426, "mean_token_accuracy": 0.18387902528047562, "num_tokens": 9489385.0, "step": 4145 }, { "entropy": 5.473710680007935, "epoch": 0.39865513928914503, "grad_norm": 1.03125, "learning_rate": 0.0004989647834657675, "loss": 5.3249, "mean_token_accuracy": 0.19230013936758042, "num_tokens": 9501131.0, "step": 4150 }, { "entropy": 5.420683908462524, "epoch": 0.39913544668587897, "grad_norm": 0.96875, "learning_rate": 0.000498961495943777, "loss": 5.4614, "mean_token_accuracy": 0.18854968398809432, "num_tokens": 9513094.0, "step": 4155 }, { "entropy": 5.577786207199097, "epoch": 0.39961575408261285, "grad_norm": 1.1015625, "learning_rate": 0.0004989582032220656, "loss": 5.5832, "mean_token_accuracy": 0.17526223361492158, "num_tokens": 9524538.0, "step": 4160 }, { "entropy": 5.522935295104981, "epoch": 0.4000960614793468, "grad_norm": 1.171875, "learning_rate": 0.0004989549053007096, "loss": 5.3961, "mean_token_accuracy": 0.19305580705404282, "num_tokens": 9535284.0, "step": 4165 }, { "entropy": 5.462124681472778, "epoch": 0.40057636887608067, "grad_norm": 1.1328125, "learning_rate": 0.0004989516021797858, "loss": 5.471, "mean_token_accuracy": 0.18390081077814102, "num_tokens": 9546472.0, "step": 4170 }, { "entropy": 5.499347305297851, "epoch": 0.4010566762728146, "grad_norm": 1.015625, "learning_rate": 0.000498948293859371, "loss": 5.4605, "mean_token_accuracy": 0.18212546557188034, "num_tokens": 9558358.0, "step": 4175 }, { "entropy": 5.496229076385498, "epoch": 0.4015369836695485, "grad_norm": 0.9765625, "learning_rate": 0.0004989449803395415, "loss": 5.4959, "mean_token_accuracy": 0.18471186012029647, "num_tokens": 9570653.0, "step": 4180 }, { "entropy": 5.556100845336914, "epoch": 0.4020172910662824, "grad_norm": 1.015625, "learning_rate": 0.0004989416616203747, "loss": 5.4386, "mean_token_accuracy": 0.18714374899864197, "num_tokens": 9582150.0, "step": 4185 }, { "entropy": 5.4823558807373045, "epoch": 0.4024975984630163, "grad_norm": 1.0546875, "learning_rate": 0.0004989383377019476, "loss": 5.38, "mean_token_accuracy": 0.19184014648199083, "num_tokens": 9592462.0, "step": 4190 }, { "entropy": 5.375227689743042, "epoch": 0.40297790585975024, "grad_norm": 1.0390625, "learning_rate": 0.0004989350085843371, "loss": 5.374, "mean_token_accuracy": 0.18951477408409118, "num_tokens": 9604027.0, "step": 4195 }, { "entropy": 5.387249088287353, "epoch": 0.4034582132564842, "grad_norm": 0.9609375, "learning_rate": 0.0004989316742676207, "loss": 5.3733, "mean_token_accuracy": 0.19109322130680084, "num_tokens": 9616325.0, "step": 4200 }, { "entropy": 5.396379852294922, "epoch": 0.40393852065321806, "grad_norm": 1.046875, "learning_rate": 0.0004989283347518757, "loss": 5.3338, "mean_token_accuracy": 0.18609212040901185, "num_tokens": 9628133.0, "step": 4205 }, { "entropy": 5.579652786254883, "epoch": 0.404418828049952, "grad_norm": 1.0625, "learning_rate": 0.0004989249900371797, "loss": 5.5629, "mean_token_accuracy": 0.17861852645874024, "num_tokens": 9639686.0, "step": 4210 }, { "entropy": 5.429533529281616, "epoch": 0.4048991354466859, "grad_norm": 1.09375, "learning_rate": 0.0004989216401236103, "loss": 5.4184, "mean_token_accuracy": 0.18496839255094527, "num_tokens": 9650222.0, "step": 4215 }, { "entropy": 5.367856836318969, "epoch": 0.4053794428434198, "grad_norm": 1.1171875, "learning_rate": 0.0004989182850112455, "loss": 5.3417, "mean_token_accuracy": 0.1997272178530693, "num_tokens": 9661792.0, "step": 4220 }, { "entropy": 5.516646957397461, "epoch": 0.4058597502401537, "grad_norm": 1.0546875, "learning_rate": 0.0004989149247001629, "loss": 5.4497, "mean_token_accuracy": 0.18383817970752717, "num_tokens": 9673000.0, "step": 4225 }, { "entropy": 5.532714462280273, "epoch": 0.40634005763688763, "grad_norm": 1.0234375, "learning_rate": 0.0004989115591904407, "loss": 5.3975, "mean_token_accuracy": 0.1901587262749672, "num_tokens": 9685253.0, "step": 4230 }, { "entropy": 5.391170501708984, "epoch": 0.4068203650336215, "grad_norm": 1.0546875, "learning_rate": 0.0004989081884821569, "loss": 5.4004, "mean_token_accuracy": 0.18320820480585098, "num_tokens": 9697245.0, "step": 4235 }, { "entropy": 5.450364589691162, "epoch": 0.40730067243035545, "grad_norm": 1.0390625, "learning_rate": 0.0004989048125753899, "loss": 5.4156, "mean_token_accuracy": 0.18504445552825927, "num_tokens": 9710095.0, "step": 4240 }, { "entropy": 5.407678937911987, "epoch": 0.40778097982708933, "grad_norm": 1.0625, "learning_rate": 0.000498901431470218, "loss": 5.2919, "mean_token_accuracy": 0.19396644979715347, "num_tokens": 9721488.0, "step": 4245 }, { "entropy": 5.2491998195648195, "epoch": 0.40826128722382327, "grad_norm": 0.98046875, "learning_rate": 0.0004988980451667198, "loss": 5.255, "mean_token_accuracy": 0.19170391261577607, "num_tokens": 9733280.0, "step": 4250 }, { "entropy": 5.455927753448487, "epoch": 0.40874159462055715, "grad_norm": 0.97265625, "learning_rate": 0.0004988946536649737, "loss": 5.3863, "mean_token_accuracy": 0.18661659061908722, "num_tokens": 9744514.0, "step": 4255 }, { "entropy": 5.413423871994018, "epoch": 0.4092219020172911, "grad_norm": 1.1015625, "learning_rate": 0.0004988912569650585, "loss": 5.3752, "mean_token_accuracy": 0.19112140834331512, "num_tokens": 9754931.0, "step": 4260 }, { "entropy": 5.389836359024048, "epoch": 0.40970220941402496, "grad_norm": 1.046875, "learning_rate": 0.0004988878550670533, "loss": 5.3725, "mean_token_accuracy": 0.19297343790531157, "num_tokens": 9765635.0, "step": 4265 }, { "entropy": 5.508016872406006, "epoch": 0.4101825168107589, "grad_norm": 1.0546875, "learning_rate": 0.0004988844479710369, "loss": 5.4792, "mean_token_accuracy": 0.18072771430015563, "num_tokens": 9777512.0, "step": 4270 }, { "entropy": 5.541130542755127, "epoch": 0.4106628242074928, "grad_norm": 1.015625, "learning_rate": 0.0004988810356770884, "loss": 5.4764, "mean_token_accuracy": 0.1744610548019409, "num_tokens": 9790128.0, "step": 4275 }, { "entropy": 5.451146841049194, "epoch": 0.4111431316042267, "grad_norm": 0.98046875, "learning_rate": 0.000498877618185287, "loss": 5.4112, "mean_token_accuracy": 0.19078320413827896, "num_tokens": 9802549.0, "step": 4280 }, { "entropy": 5.365971374511719, "epoch": 0.4116234390009606, "grad_norm": 1.0859375, "learning_rate": 0.0004988741954957121, "loss": 5.3574, "mean_token_accuracy": 0.18884203881025313, "num_tokens": 9813736.0, "step": 4285 }, { "entropy": 5.380771827697754, "epoch": 0.41210374639769454, "grad_norm": 1.1171875, "learning_rate": 0.0004988707676084432, "loss": 5.3584, "mean_token_accuracy": 0.19705824106931685, "num_tokens": 9823785.0, "step": 4290 }, { "entropy": 5.432324981689453, "epoch": 0.4125840537944284, "grad_norm": 1.0625, "learning_rate": 0.0004988673345235597, "loss": 5.3197, "mean_token_accuracy": 0.1934140741825104, "num_tokens": 9834910.0, "step": 4295 }, { "entropy": 5.437625408172607, "epoch": 0.41306436119116235, "grad_norm": 1.0, "learning_rate": 0.0004988638962411416, "loss": 5.363, "mean_token_accuracy": 0.18818716257810592, "num_tokens": 9845593.0, "step": 4300 }, { "entropy": 5.392855072021485, "epoch": 0.41354466858789624, "grad_norm": 0.9765625, "learning_rate": 0.0004988604527612685, "loss": 5.2697, "mean_token_accuracy": 0.2009762555360794, "num_tokens": 9856763.0, "step": 4305 }, { "entropy": 5.503190565109253, "epoch": 0.4140249759846302, "grad_norm": 1.0625, "learning_rate": 0.0004988570040840205, "loss": 5.4945, "mean_token_accuracy": 0.18051616251468658, "num_tokens": 9869528.0, "step": 4310 }, { "entropy": 5.407845735549927, "epoch": 0.41450528338136405, "grad_norm": 1.09375, "learning_rate": 0.0004988535502094774, "loss": 5.3958, "mean_token_accuracy": 0.18804680705070495, "num_tokens": 9881170.0, "step": 4315 }, { "entropy": 5.461514711380005, "epoch": 0.414985590778098, "grad_norm": 1.0859375, "learning_rate": 0.0004988500911377198, "loss": 5.4803, "mean_token_accuracy": 0.18439086973667146, "num_tokens": 9893119.0, "step": 4320 }, { "entropy": 5.368999385833741, "epoch": 0.41546589817483187, "grad_norm": 1.015625, "learning_rate": 0.0004988466268688276, "loss": 5.3154, "mean_token_accuracy": 0.19932861626148224, "num_tokens": 9905339.0, "step": 4325 }, { "entropy": 5.482837677001953, "epoch": 0.4159462055715658, "grad_norm": 0.9765625, "learning_rate": 0.0004988431574028814, "loss": 5.4002, "mean_token_accuracy": 0.19202394932508468, "num_tokens": 9917500.0, "step": 4330 }, { "entropy": 5.466025495529175, "epoch": 0.4164265129682997, "grad_norm": 1.1875, "learning_rate": 0.0004988396827399618, "loss": 5.4808, "mean_token_accuracy": 0.18326758295297624, "num_tokens": 9929667.0, "step": 4335 }, { "entropy": 5.48503007888794, "epoch": 0.4169068203650336, "grad_norm": 1.03125, "learning_rate": 0.0004988362028801495, "loss": 5.4048, "mean_token_accuracy": 0.18796583414077758, "num_tokens": 9941102.0, "step": 4340 }, { "entropy": 5.412125444412231, "epoch": 0.4173871277617675, "grad_norm": 1.078125, "learning_rate": 0.0004988327178235253, "loss": 5.3058, "mean_token_accuracy": 0.1973835989832878, "num_tokens": 9951986.0, "step": 4345 }, { "entropy": 5.383547782897949, "epoch": 0.41786743515850144, "grad_norm": 0.95703125, "learning_rate": 0.0004988292275701699, "loss": 5.3119, "mean_token_accuracy": 0.19086995273828505, "num_tokens": 9964486.0, "step": 4350 }, { "entropy": 5.406881952285767, "epoch": 0.4183477425552353, "grad_norm": 1.046875, "learning_rate": 0.0004988257321201646, "loss": 5.4094, "mean_token_accuracy": 0.1860354095697403, "num_tokens": 9975909.0, "step": 4355 }, { "entropy": 5.473488092422485, "epoch": 0.41882804995196926, "grad_norm": 1.078125, "learning_rate": 0.0004988222314735902, "loss": 5.4171, "mean_token_accuracy": 0.18617332428693772, "num_tokens": 9986951.0, "step": 4360 }, { "entropy": 5.517805814743042, "epoch": 0.41930835734870314, "grad_norm": 1.1484375, "learning_rate": 0.0004988187256305284, "loss": 5.5057, "mean_token_accuracy": 0.1791812226176262, "num_tokens": 9999234.0, "step": 4365 }, { "entropy": 5.405948638916016, "epoch": 0.4197886647454371, "grad_norm": 1.046875, "learning_rate": 0.0004988152145910603, "loss": 5.3792, "mean_token_accuracy": 0.1959477871656418, "num_tokens": 10010178.0, "step": 4370 }, { "entropy": 5.391415548324585, "epoch": 0.420268972142171, "grad_norm": 1.0859375, "learning_rate": 0.0004988116983552675, "loss": 5.3218, "mean_token_accuracy": 0.18838354647159578, "num_tokens": 10021183.0, "step": 4375 }, { "entropy": 5.590651321411133, "epoch": 0.4207492795389049, "grad_norm": 1.0390625, "learning_rate": 0.0004988081769232317, "loss": 5.6204, "mean_token_accuracy": 0.17428677082061766, "num_tokens": 10033686.0, "step": 4380 }, { "entropy": 5.384156322479248, "epoch": 0.42122958693563883, "grad_norm": 1.0859375, "learning_rate": 0.0004988046502950346, "loss": 5.3079, "mean_token_accuracy": 0.187077134847641, "num_tokens": 10045923.0, "step": 4385 }, { "entropy": 5.270208120346069, "epoch": 0.4217098943323727, "grad_norm": 0.99609375, "learning_rate": 0.000498801118470758, "loss": 5.2402, "mean_token_accuracy": 0.19899773895740508, "num_tokens": 10057196.0, "step": 4390 }, { "entropy": 5.409784030914307, "epoch": 0.42219020172910665, "grad_norm": 1.109375, "learning_rate": 0.000498797581450484, "loss": 5.4295, "mean_token_accuracy": 0.18354050666093827, "num_tokens": 10069655.0, "step": 4395 }, { "entropy": 5.448616600036621, "epoch": 0.42267050912584053, "grad_norm": 1.234375, "learning_rate": 0.0004987940392342948, "loss": 5.3095, "mean_token_accuracy": 0.19377071112394334, "num_tokens": 10080876.0, "step": 4400 }, { "entropy": 5.421027898788452, "epoch": 0.42315081652257447, "grad_norm": 0.9921875, "learning_rate": 0.0004987904918222726, "loss": 5.415, "mean_token_accuracy": 0.18513490557670592, "num_tokens": 10091986.0, "step": 4405 }, { "entropy": 5.5097509860992435, "epoch": 0.42363112391930835, "grad_norm": 1.109375, "learning_rate": 0.0004987869392144996, "loss": 5.499, "mean_token_accuracy": 0.18492884635925294, "num_tokens": 10104027.0, "step": 4410 }, { "entropy": 5.425499534606933, "epoch": 0.4241114313160423, "grad_norm": 1.09375, "learning_rate": 0.0004987833814110584, "loss": 5.3567, "mean_token_accuracy": 0.1865203857421875, "num_tokens": 10114665.0, "step": 4415 }, { "entropy": 5.385516119003296, "epoch": 0.42459173871277617, "grad_norm": 1.015625, "learning_rate": 0.0004987798184120316, "loss": 5.3742, "mean_token_accuracy": 0.19014959633350373, "num_tokens": 10126032.0, "step": 4420 }, { "entropy": 5.512171411514283, "epoch": 0.4250720461095101, "grad_norm": 1.125, "learning_rate": 0.0004987762502175018, "loss": 5.4288, "mean_token_accuracy": 0.1829407036304474, "num_tokens": 10137256.0, "step": 4425 }, { "entropy": 5.3579336643219, "epoch": 0.425552353506244, "grad_norm": 1.09375, "learning_rate": 0.000498772676827552, "loss": 5.3117, "mean_token_accuracy": 0.1916539713740349, "num_tokens": 10149445.0, "step": 4430 }, { "entropy": 5.474416351318359, "epoch": 0.4260326609029779, "grad_norm": 1.03125, "learning_rate": 0.0004987690982422652, "loss": 5.4495, "mean_token_accuracy": 0.18037094324827194, "num_tokens": 10161607.0, "step": 4435 }, { "entropy": 5.448618030548095, "epoch": 0.4265129682997118, "grad_norm": 1.0703125, "learning_rate": 0.0004987655144617243, "loss": 5.4681, "mean_token_accuracy": 0.18403236269950868, "num_tokens": 10173184.0, "step": 4440 }, { "entropy": 5.4251587867736815, "epoch": 0.42699327569644574, "grad_norm": 1.0234375, "learning_rate": 0.0004987619254860126, "loss": 5.328, "mean_token_accuracy": 0.19698531180620193, "num_tokens": 10184617.0, "step": 4445 }, { "entropy": 5.4672339916229244, "epoch": 0.4274735830931796, "grad_norm": 1.0703125, "learning_rate": 0.0004987583313152134, "loss": 5.3568, "mean_token_accuracy": 0.18906597346067427, "num_tokens": 10195608.0, "step": 4450 }, { "entropy": 5.386989736557007, "epoch": 0.42795389048991356, "grad_norm": 1.0859375, "learning_rate": 0.0004987547319494104, "loss": 5.4529, "mean_token_accuracy": 0.18423379063606263, "num_tokens": 10206763.0, "step": 4455 }, { "entropy": 5.486404466629028, "epoch": 0.42843419788664744, "grad_norm": 0.99609375, "learning_rate": 0.0004987511273886867, "loss": 5.3933, "mean_token_accuracy": 0.1908423647284508, "num_tokens": 10218714.0, "step": 4460 }, { "entropy": 5.427644729614258, "epoch": 0.4289145052833814, "grad_norm": 1.15625, "learning_rate": 0.0004987475176331263, "loss": 5.415, "mean_token_accuracy": 0.18401106595993041, "num_tokens": 10229902.0, "step": 4465 }, { "entropy": 5.423227453231812, "epoch": 0.42939481268011526, "grad_norm": 1.0390625, "learning_rate": 0.0004987439026828129, "loss": 5.288, "mean_token_accuracy": 0.19139131158590317, "num_tokens": 10241578.0, "step": 4470 }, { "entropy": 5.324700498580933, "epoch": 0.4298751200768492, "grad_norm": 1.1796875, "learning_rate": 0.0004987402825378305, "loss": 5.2595, "mean_token_accuracy": 0.19443607479333877, "num_tokens": 10252109.0, "step": 4475 }, { "entropy": 5.429213285446167, "epoch": 0.4303554274735831, "grad_norm": 0.99609375, "learning_rate": 0.0004987366571982631, "loss": 5.4252, "mean_token_accuracy": 0.18883214443922042, "num_tokens": 10263357.0, "step": 4480 }, { "entropy": 5.487810945510864, "epoch": 0.430835734870317, "grad_norm": 1.078125, "learning_rate": 0.0004987330266641948, "loss": 5.4308, "mean_token_accuracy": 0.18471152931451798, "num_tokens": 10275536.0, "step": 4485 }, { "entropy": 5.453687620162964, "epoch": 0.4313160422670509, "grad_norm": 1.046875, "learning_rate": 0.0004987293909357101, "loss": 5.415, "mean_token_accuracy": 0.19442622363567352, "num_tokens": 10286901.0, "step": 4490 }, { "entropy": 5.365311050415039, "epoch": 0.43179634966378483, "grad_norm": 0.9921875, "learning_rate": 0.0004987257500128933, "loss": 5.3172, "mean_token_accuracy": 0.18610639423131942, "num_tokens": 10298961.0, "step": 4495 }, { "entropy": 5.462113523483277, "epoch": 0.4322766570605187, "grad_norm": 1.046875, "learning_rate": 0.0004987221038958288, "loss": 5.4543, "mean_token_accuracy": 0.18748044222593307, "num_tokens": 10310911.0, "step": 4500 }, { "entropy": 5.510283613204956, "epoch": 0.43275696445725265, "grad_norm": 1.0234375, "learning_rate": 0.0004987184525846015, "loss": 5.4389, "mean_token_accuracy": 0.1841048017144203, "num_tokens": 10322267.0, "step": 4505 }, { "entropy": 5.411655378341675, "epoch": 0.4332372718539865, "grad_norm": 1.125, "learning_rate": 0.0004987147960792958, "loss": 5.459, "mean_token_accuracy": 0.18804670721292496, "num_tokens": 10335111.0, "step": 4510 }, { "entropy": 5.520284938812256, "epoch": 0.43371757925072046, "grad_norm": 0.97265625, "learning_rate": 0.0004987111343799971, "loss": 5.3974, "mean_token_accuracy": 0.1907435804605484, "num_tokens": 10345672.0, "step": 4515 }, { "entropy": 5.501500225067138, "epoch": 0.43419788664745435, "grad_norm": 1.0859375, "learning_rate": 0.00049870746748679, "loss": 5.3725, "mean_token_accuracy": 0.1861974611878395, "num_tokens": 10357369.0, "step": 4520 }, { "entropy": 5.38987283706665, "epoch": 0.4346781940441883, "grad_norm": 1.09375, "learning_rate": 0.0004987037953997598, "loss": 5.3935, "mean_token_accuracy": 0.18683493435382842, "num_tokens": 10368842.0, "step": 4525 }, { "entropy": 5.43892183303833, "epoch": 0.43515850144092216, "grad_norm": 1.015625, "learning_rate": 0.0004987001181189918, "loss": 5.3539, "mean_token_accuracy": 0.18663013726472855, "num_tokens": 10380096.0, "step": 4530 }, { "entropy": 5.306481552124024, "epoch": 0.4356388088376561, "grad_norm": 1.046875, "learning_rate": 0.0004986964356445713, "loss": 5.3772, "mean_token_accuracy": 0.19005681425333024, "num_tokens": 10391996.0, "step": 4535 }, { "entropy": 5.48760027885437, "epoch": 0.43611911623439004, "grad_norm": 1.0546875, "learning_rate": 0.0004986927479765837, "loss": 5.3288, "mean_token_accuracy": 0.18343985229730606, "num_tokens": 10403607.0, "step": 4540 }, { "entropy": 5.396467876434326, "epoch": 0.4365994236311239, "grad_norm": 1.140625, "learning_rate": 0.0004986890551151148, "loss": 5.3604, "mean_token_accuracy": 0.184589384496212, "num_tokens": 10413580.0, "step": 4545 }, { "entropy": 5.349568462371826, "epoch": 0.43707973102785785, "grad_norm": 0.99609375, "learning_rate": 0.0004986853570602503, "loss": 5.3881, "mean_token_accuracy": 0.18719975054264068, "num_tokens": 10426456.0, "step": 4550 }, { "entropy": 5.520879220962525, "epoch": 0.43756003842459174, "grad_norm": 0.98828125, "learning_rate": 0.0004986816538120758, "loss": 5.4101, "mean_token_accuracy": 0.18188669979572297, "num_tokens": 10438869.0, "step": 4555 }, { "entropy": 5.397240781784058, "epoch": 0.43804034582132567, "grad_norm": 1.09375, "learning_rate": 0.0004986779453706778, "loss": 5.4142, "mean_token_accuracy": 0.1816550999879837, "num_tokens": 10450672.0, "step": 4560 }, { "entropy": 5.4152685642242435, "epoch": 0.43852065321805955, "grad_norm": 1.125, "learning_rate": 0.0004986742317361419, "loss": 5.3271, "mean_token_accuracy": 0.19575155526399612, "num_tokens": 10461890.0, "step": 4565 }, { "entropy": 5.498744964599609, "epoch": 0.4390009606147935, "grad_norm": 1.125, "learning_rate": 0.0004986705129085546, "loss": 5.4613, "mean_token_accuracy": 0.17549378722906112, "num_tokens": 10473866.0, "step": 4570 }, { "entropy": 5.460689496994019, "epoch": 0.43948126801152737, "grad_norm": 1.0859375, "learning_rate": 0.0004986667888880021, "loss": 5.381, "mean_token_accuracy": 0.18632390201091767, "num_tokens": 10484889.0, "step": 4575 }, { "entropy": 5.412662744522095, "epoch": 0.4399615754082613, "grad_norm": 1.1015625, "learning_rate": 0.0004986630596745709, "loss": 5.4207, "mean_token_accuracy": 0.1880632683634758, "num_tokens": 10496108.0, "step": 4580 }, { "entropy": 5.389367771148682, "epoch": 0.4404418828049952, "grad_norm": 1.203125, "learning_rate": 0.0004986593252683477, "loss": 5.363, "mean_token_accuracy": 0.18732869774103164, "num_tokens": 10505472.0, "step": 4585 }, { "entropy": 5.307269144058227, "epoch": 0.4409221902017291, "grad_norm": 1.0625, "learning_rate": 0.0004986555856694191, "loss": 5.2773, "mean_token_accuracy": 0.19333918690681456, "num_tokens": 10516954.0, "step": 4590 }, { "entropy": 5.524228239059449, "epoch": 0.441402497598463, "grad_norm": 1.0390625, "learning_rate": 0.0004986518408778718, "loss": 5.3859, "mean_token_accuracy": 0.18945636600255966, "num_tokens": 10528166.0, "step": 4595 }, { "entropy": 5.38381519317627, "epoch": 0.44188280499519694, "grad_norm": 1.140625, "learning_rate": 0.0004986480908937929, "loss": 5.3113, "mean_token_accuracy": 0.18772315680980683, "num_tokens": 10538112.0, "step": 4600 }, { "entropy": 5.444307518005371, "epoch": 0.4423631123919308, "grad_norm": 1.0546875, "learning_rate": 0.0004986443357172695, "loss": 5.4568, "mean_token_accuracy": 0.18497458845376968, "num_tokens": 10549888.0, "step": 4605 }, { "entropy": 5.58274884223938, "epoch": 0.44284341978866476, "grad_norm": 0.984375, "learning_rate": 0.0004986405753483887, "loss": 5.5294, "mean_token_accuracy": 0.17502811402082444, "num_tokens": 10561710.0, "step": 4610 }, { "entropy": 5.410598850250244, "epoch": 0.44332372718539864, "grad_norm": 1.0, "learning_rate": 0.0004986368097872377, "loss": 5.379, "mean_token_accuracy": 0.18401092439889907, "num_tokens": 10574564.0, "step": 4615 }, { "entropy": 5.41968560218811, "epoch": 0.4438040345821326, "grad_norm": 1.0703125, "learning_rate": 0.0004986330390339042, "loss": 5.3586, "mean_token_accuracy": 0.18878330439329147, "num_tokens": 10586639.0, "step": 4620 }, { "entropy": 5.373893547058105, "epoch": 0.44428434197886646, "grad_norm": 1.0390625, "learning_rate": 0.0004986292630884755, "loss": 5.3645, "mean_token_accuracy": 0.18980913162231444, "num_tokens": 10598730.0, "step": 4625 }, { "entropy": 5.395772886276245, "epoch": 0.4447646493756004, "grad_norm": 1.0703125, "learning_rate": 0.0004986254819510393, "loss": 5.2863, "mean_token_accuracy": 0.2030077889561653, "num_tokens": 10610352.0, "step": 4630 }, { "entropy": 5.410120058059692, "epoch": 0.4452449567723343, "grad_norm": 1.0234375, "learning_rate": 0.0004986216956216835, "loss": 5.3544, "mean_token_accuracy": 0.18991922438144684, "num_tokens": 10621951.0, "step": 4635 }, { "entropy": 5.380520057678223, "epoch": 0.4457252641690682, "grad_norm": 1.125, "learning_rate": 0.000498617904100496, "loss": 5.3114, "mean_token_accuracy": 0.1913859009742737, "num_tokens": 10633207.0, "step": 4640 }, { "entropy": 5.473378133773804, "epoch": 0.4462055715658021, "grad_norm": 0.9921875, "learning_rate": 0.0004986141073875646, "loss": 5.4035, "mean_token_accuracy": 0.18385644257068634, "num_tokens": 10645853.0, "step": 4645 }, { "entropy": 5.330105209350586, "epoch": 0.44668587896253603, "grad_norm": 1.0078125, "learning_rate": 0.0004986103054829779, "loss": 5.3305, "mean_token_accuracy": 0.18985379487276077, "num_tokens": 10656892.0, "step": 4650 }, { "entropy": 5.424197340011597, "epoch": 0.4471661863592699, "grad_norm": 1.09375, "learning_rate": 0.0004986064983868237, "loss": 5.3095, "mean_token_accuracy": 0.18436852544546128, "num_tokens": 10670110.0, "step": 4655 }, { "entropy": 5.429648303985596, "epoch": 0.44764649375600385, "grad_norm": 1.09375, "learning_rate": 0.0004986026860991906, "loss": 5.4385, "mean_token_accuracy": 0.185771344602108, "num_tokens": 10681255.0, "step": 4660 }, { "entropy": 5.471052789688111, "epoch": 0.44812680115273773, "grad_norm": 1.1875, "learning_rate": 0.0004985988686201672, "loss": 5.5041, "mean_token_accuracy": 0.1844386264681816, "num_tokens": 10692631.0, "step": 4665 }, { "entropy": 5.442734622955323, "epoch": 0.44860710854947167, "grad_norm": 1.0234375, "learning_rate": 0.0004985950459498419, "loss": 5.3372, "mean_token_accuracy": 0.19462240785360335, "num_tokens": 10704880.0, "step": 4670 }, { "entropy": 5.390188550949096, "epoch": 0.44908741594620555, "grad_norm": 1.0703125, "learning_rate": 0.0004985912180883037, "loss": 5.3095, "mean_token_accuracy": 0.19716786891222, "num_tokens": 10715561.0, "step": 4675 }, { "entropy": 5.376702499389649, "epoch": 0.4495677233429395, "grad_norm": 1.125, "learning_rate": 0.0004985873850356411, "loss": 5.3369, "mean_token_accuracy": 0.19014816135168075, "num_tokens": 10727232.0, "step": 4680 }, { "entropy": 5.387975978851318, "epoch": 0.45004803073967337, "grad_norm": 1.0546875, "learning_rate": 0.0004985835467919436, "loss": 5.3461, "mean_token_accuracy": 0.19422013461589813, "num_tokens": 10739404.0, "step": 4685 }, { "entropy": 5.369897413253784, "epoch": 0.4505283381364073, "grad_norm": 1.0390625, "learning_rate": 0.0004985797033572999, "loss": 5.3767, "mean_token_accuracy": 0.18446222841739654, "num_tokens": 10751948.0, "step": 4690 }, { "entropy": 5.362226104736328, "epoch": 0.4510086455331412, "grad_norm": 1.0, "learning_rate": 0.0004985758547317994, "loss": 5.3363, "mean_token_accuracy": 0.18433189690113067, "num_tokens": 10764611.0, "step": 4695 }, { "entropy": 5.447867727279663, "epoch": 0.4514889529298751, "grad_norm": 1.0859375, "learning_rate": 0.0004985720009155315, "loss": 5.3727, "mean_token_accuracy": 0.1841047078371048, "num_tokens": 10775954.0, "step": 4700 }, { "entropy": 5.409327983856201, "epoch": 0.45196926032660906, "grad_norm": 1.15625, "learning_rate": 0.0004985681419085856, "loss": 5.3909, "mean_token_accuracy": 0.18282371312379836, "num_tokens": 10788723.0, "step": 4705 }, { "entropy": 5.421317195892334, "epoch": 0.45244956772334294, "grad_norm": 1.078125, "learning_rate": 0.0004985642777110513, "loss": 5.3841, "mean_token_accuracy": 0.1885462448000908, "num_tokens": 10799879.0, "step": 4710 }, { "entropy": 5.3301918506622314, "epoch": 0.4529298751200769, "grad_norm": 1.0234375, "learning_rate": 0.0004985604083230183, "loss": 5.3231, "mean_token_accuracy": 0.18998679518699646, "num_tokens": 10811838.0, "step": 4715 }, { "entropy": 5.428510332107544, "epoch": 0.45341018251681076, "grad_norm": 1.1640625, "learning_rate": 0.0004985565337445765, "loss": 5.3434, "mean_token_accuracy": 0.19171882420778275, "num_tokens": 10822910.0, "step": 4720 }, { "entropy": 5.471314573287964, "epoch": 0.4538904899135447, "grad_norm": 1.0234375, "learning_rate": 0.0004985526539758158, "loss": 5.3992, "mean_token_accuracy": 0.18527638167142868, "num_tokens": 10835344.0, "step": 4725 }, { "entropy": 5.375976181030273, "epoch": 0.4543707973102786, "grad_norm": 1.015625, "learning_rate": 0.0004985487690168263, "loss": 5.4034, "mean_token_accuracy": 0.19202104806900025, "num_tokens": 10846043.0, "step": 4730 }, { "entropy": 5.380132484436035, "epoch": 0.4548511047070125, "grad_norm": 1.1015625, "learning_rate": 0.000498544878867698, "loss": 5.298, "mean_token_accuracy": 0.19829845130443574, "num_tokens": 10857783.0, "step": 4735 }, { "entropy": 5.434480476379394, "epoch": 0.4553314121037464, "grad_norm": 0.99609375, "learning_rate": 0.0004985409835285215, "loss": 5.373, "mean_token_accuracy": 0.19089124351739883, "num_tokens": 10870527.0, "step": 4740 }, { "entropy": 5.414768075942993, "epoch": 0.45581171950048033, "grad_norm": 1.046875, "learning_rate": 0.0004985370829993873, "loss": 5.3646, "mean_token_accuracy": 0.19075230062007903, "num_tokens": 10882285.0, "step": 4745 }, { "entropy": 5.423041200637817, "epoch": 0.4562920268972142, "grad_norm": 1.0390625, "learning_rate": 0.0004985331772803857, "loss": 5.3874, "mean_token_accuracy": 0.19265468865633012, "num_tokens": 10895319.0, "step": 4750 }, { "entropy": 5.484057378768921, "epoch": 0.45677233429394815, "grad_norm": 1.0234375, "learning_rate": 0.0004985292663716074, "loss": 5.382, "mean_token_accuracy": 0.19183963984251023, "num_tokens": 10906253.0, "step": 4755 }, { "entropy": 5.229197072982788, "epoch": 0.457252641690682, "grad_norm": 0.96875, "learning_rate": 0.0004985253502731435, "loss": 5.2575, "mean_token_accuracy": 0.19930023998022078, "num_tokens": 10918197.0, "step": 4760 }, { "entropy": 5.455323648452759, "epoch": 0.45773294908741596, "grad_norm": 1.015625, "learning_rate": 0.0004985214289850845, "loss": 5.4579, "mean_token_accuracy": 0.17997599244117737, "num_tokens": 10930771.0, "step": 4765 }, { "entropy": 5.443937206268311, "epoch": 0.45821325648414984, "grad_norm": 0.98828125, "learning_rate": 0.0004985175025075217, "loss": 5.3491, "mean_token_accuracy": 0.18804308474063874, "num_tokens": 10942759.0, "step": 4770 }, { "entropy": 5.591840028762817, "epoch": 0.4586935638808838, "grad_norm": 1.03125, "learning_rate": 0.0004985135708405462, "loss": 5.5609, "mean_token_accuracy": 0.17564835995435715, "num_tokens": 10953557.0, "step": 4775 }, { "entropy": 5.411443281173706, "epoch": 0.45917387127761766, "grad_norm": 1.0546875, "learning_rate": 0.0004985096339842493, "loss": 5.3321, "mean_token_accuracy": 0.19676847159862518, "num_tokens": 10963142.0, "step": 4780 }, { "entropy": 5.309838056564331, "epoch": 0.4596541786743516, "grad_norm": 1.046875, "learning_rate": 0.0004985056919387224, "loss": 5.2856, "mean_token_accuracy": 0.19894758760929107, "num_tokens": 10974321.0, "step": 4785 }, { "entropy": 5.502527189254761, "epoch": 0.4601344860710855, "grad_norm": 1.1328125, "learning_rate": 0.0004985017447040569, "loss": 5.4874, "mean_token_accuracy": 0.18695860356092453, "num_tokens": 10985524.0, "step": 4790 }, { "entropy": 5.457700490951538, "epoch": 0.4606147934678194, "grad_norm": 1.0625, "learning_rate": 0.0004984977922803447, "loss": 5.3727, "mean_token_accuracy": 0.1937094435095787, "num_tokens": 10997606.0, "step": 4795 }, { "entropy": 5.4323536396026615, "epoch": 0.4610951008645533, "grad_norm": 1.140625, "learning_rate": 0.0004984938346676772, "loss": 5.3833, "mean_token_accuracy": 0.18257274031639098, "num_tokens": 11010692.0, "step": 4800 }, { "entropy": 5.40803747177124, "epoch": 0.46157540826128723, "grad_norm": 0.98828125, "learning_rate": 0.0004984898718661468, "loss": 5.3099, "mean_token_accuracy": 0.19199058413505554, "num_tokens": 11022517.0, "step": 4805 }, { "entropy": 5.350576591491699, "epoch": 0.4620557156580211, "grad_norm": 1.1796875, "learning_rate": 0.0004984859038758451, "loss": 5.3253, "mean_token_accuracy": 0.19188573807477952, "num_tokens": 11033141.0, "step": 4810 }, { "entropy": 5.32304048538208, "epoch": 0.46253602305475505, "grad_norm": 1.0625, "learning_rate": 0.0004984819306968642, "loss": 5.3173, "mean_token_accuracy": 0.19185021072626113, "num_tokens": 11044619.0, "step": 4815 }, { "entropy": 5.495067167282104, "epoch": 0.46301633045148893, "grad_norm": 0.98828125, "learning_rate": 0.0004984779523292966, "loss": 5.3646, "mean_token_accuracy": 0.18967657685279846, "num_tokens": 11055934.0, "step": 4820 }, { "entropy": 5.383758926391602, "epoch": 0.46349663784822287, "grad_norm": 1.015625, "learning_rate": 0.0004984739687732345, "loss": 5.2493, "mean_token_accuracy": 0.19513811767101288, "num_tokens": 11066203.0, "step": 4825 }, { "entropy": 5.187354946136475, "epoch": 0.46397694524495675, "grad_norm": 0.9921875, "learning_rate": 0.0004984699800287705, "loss": 5.1973, "mean_token_accuracy": 0.19977913796901703, "num_tokens": 11079664.0, "step": 4830 }, { "entropy": 5.341605234146118, "epoch": 0.4644572526416907, "grad_norm": 1.0, "learning_rate": 0.000498465986095997, "loss": 5.2652, "mean_token_accuracy": 0.19821466654539108, "num_tokens": 11091186.0, "step": 4835 }, { "entropy": 5.42094578742981, "epoch": 0.46493756003842457, "grad_norm": 1.1328125, "learning_rate": 0.0004984619869750069, "loss": 5.383, "mean_token_accuracy": 0.18526540249586104, "num_tokens": 11102710.0, "step": 4840 }, { "entropy": 5.292195415496826, "epoch": 0.4654178674351585, "grad_norm": 1.0625, "learning_rate": 0.000498457982665893, "loss": 5.2795, "mean_token_accuracy": 0.19302588403224946, "num_tokens": 11114746.0, "step": 4845 }, { "entropy": 5.397561931610108, "epoch": 0.4658981748318924, "grad_norm": 1.0546875, "learning_rate": 0.0004984539731687483, "loss": 5.3462, "mean_token_accuracy": 0.18983854949474335, "num_tokens": 11126572.0, "step": 4850 }, { "entropy": 5.380267095565796, "epoch": 0.4663784822286263, "grad_norm": 1.1328125, "learning_rate": 0.0004984499584836659, "loss": 5.2431, "mean_token_accuracy": 0.19321491122245787, "num_tokens": 11137830.0, "step": 4855 }, { "entropy": 5.32379674911499, "epoch": 0.4668587896253602, "grad_norm": 1.09375, "learning_rate": 0.000498445938610739, "loss": 5.281, "mean_token_accuracy": 0.19294328689575196, "num_tokens": 11148860.0, "step": 4860 }, { "entropy": 5.419743824005127, "epoch": 0.46733909702209414, "grad_norm": 1.0859375, "learning_rate": 0.0004984419135500608, "loss": 5.4081, "mean_token_accuracy": 0.17859717160463334, "num_tokens": 11161311.0, "step": 4865 }, { "entropy": 5.430191612243652, "epoch": 0.4678194044188281, "grad_norm": 1.046875, "learning_rate": 0.0004984378833017249, "loss": 5.2942, "mean_token_accuracy": 0.19046030193567276, "num_tokens": 11173124.0, "step": 4870 }, { "entropy": 5.344765472412109, "epoch": 0.46829971181556196, "grad_norm": 1.140625, "learning_rate": 0.0004984338478658248, "loss": 5.3783, "mean_token_accuracy": 0.19164984971284865, "num_tokens": 11184879.0, "step": 4875 }, { "entropy": 5.45609302520752, "epoch": 0.4687800192122959, "grad_norm": 1.1015625, "learning_rate": 0.0004984298072424542, "loss": 5.378, "mean_token_accuracy": 0.1874854624271393, "num_tokens": 11196243.0, "step": 4880 }, { "entropy": 5.339529609680175, "epoch": 0.4692603266090298, "grad_norm": 1.1015625, "learning_rate": 0.000498425761431707, "loss": 5.2513, "mean_token_accuracy": 0.20040780752897264, "num_tokens": 11207485.0, "step": 4885 }, { "entropy": 5.312271356582642, "epoch": 0.4697406340057637, "grad_norm": 1.046875, "learning_rate": 0.000498421710433677, "loss": 5.279, "mean_token_accuracy": 0.19036460667848587, "num_tokens": 11219891.0, "step": 4890 }, { "entropy": 5.4914182186126705, "epoch": 0.4702209414024976, "grad_norm": 1.0234375, "learning_rate": 0.0004984176542484584, "loss": 5.388, "mean_token_accuracy": 0.18597144782543182, "num_tokens": 11231329.0, "step": 4895 }, { "entropy": 5.378525733947754, "epoch": 0.47070124879923153, "grad_norm": 1.0625, "learning_rate": 0.0004984135928761452, "loss": 5.266, "mean_token_accuracy": 0.1995886370539665, "num_tokens": 11241367.0, "step": 4900 }, { "entropy": 5.358568334579468, "epoch": 0.4711815561959654, "grad_norm": 1.0234375, "learning_rate": 0.0004984095263168317, "loss": 5.3589, "mean_token_accuracy": 0.18466073721647264, "num_tokens": 11254532.0, "step": 4905 }, { "entropy": 5.4979103088378904, "epoch": 0.47166186359269935, "grad_norm": 1.0859375, "learning_rate": 0.0004984054545706124, "loss": 5.4398, "mean_token_accuracy": 0.18243181705474854, "num_tokens": 11265223.0, "step": 4910 }, { "entropy": 5.3696846008300785, "epoch": 0.47214217098943323, "grad_norm": 1.0390625, "learning_rate": 0.000498401377637582, "loss": 5.3635, "mean_token_accuracy": 0.18885526210069656, "num_tokens": 11278228.0, "step": 4915 }, { "entropy": 5.484466791152954, "epoch": 0.47262247838616717, "grad_norm": 1.03125, "learning_rate": 0.000498397295517835, "loss": 5.4846, "mean_token_accuracy": 0.1801117405295372, "num_tokens": 11289654.0, "step": 4920 }, { "entropy": 5.394139242172241, "epoch": 0.47310278578290105, "grad_norm": 1.03125, "learning_rate": 0.0004983932082114659, "loss": 5.2357, "mean_token_accuracy": 0.19755308330059052, "num_tokens": 11301911.0, "step": 4925 }, { "entropy": 5.4873377799987795, "epoch": 0.473583093179635, "grad_norm": 1.1328125, "learning_rate": 0.0004983891157185699, "loss": 5.4364, "mean_token_accuracy": 0.18308536261320113, "num_tokens": 11312945.0, "step": 4930 }, { "entropy": 5.549541664123535, "epoch": 0.47406340057636887, "grad_norm": 1.046875, "learning_rate": 0.0004983850180392421, "loss": 5.4774, "mean_token_accuracy": 0.18022425770759581, "num_tokens": 11324126.0, "step": 4935 }, { "entropy": 5.402717351913452, "epoch": 0.4745437079731028, "grad_norm": 1.0546875, "learning_rate": 0.0004983809151735775, "loss": 5.4133, "mean_token_accuracy": 0.18017226606607437, "num_tokens": 11336395.0, "step": 4940 }, { "entropy": 5.403596019744873, "epoch": 0.4750240153698367, "grad_norm": 1.0078125, "learning_rate": 0.0004983768071216713, "loss": 5.3135, "mean_token_accuracy": 0.1902969852089882, "num_tokens": 11347387.0, "step": 4945 }, { "entropy": 5.353836917877198, "epoch": 0.4755043227665706, "grad_norm": 1.25, "learning_rate": 0.0004983726938836189, "loss": 5.308, "mean_token_accuracy": 0.19681546241044998, "num_tokens": 11358467.0, "step": 4950 }, { "entropy": 5.486645841598511, "epoch": 0.4759846301633045, "grad_norm": 1.0703125, "learning_rate": 0.0004983685754595159, "loss": 5.4724, "mean_token_accuracy": 0.18010423183441163, "num_tokens": 11370322.0, "step": 4955 }, { "entropy": 5.333859491348266, "epoch": 0.47646493756003844, "grad_norm": 1.0703125, "learning_rate": 0.0004983644518494578, "loss": 5.2697, "mean_token_accuracy": 0.20096147507429124, "num_tokens": 11381719.0, "step": 4960 }, { "entropy": 5.328320550918579, "epoch": 0.4769452449567723, "grad_norm": 1.0859375, "learning_rate": 0.0004983603230535403, "loss": 5.2895, "mean_token_accuracy": 0.1948627695441246, "num_tokens": 11393561.0, "step": 4965 }, { "entropy": 5.460376167297364, "epoch": 0.47742555235350626, "grad_norm": 0.96875, "learning_rate": 0.0004983561890718594, "loss": 5.3849, "mean_token_accuracy": 0.18933912962675095, "num_tokens": 11405411.0, "step": 4970 }, { "entropy": 5.5110303401947025, "epoch": 0.47790585975024014, "grad_norm": 1.1796875, "learning_rate": 0.000498352049904511, "loss": 5.4771, "mean_token_accuracy": 0.17981591820716858, "num_tokens": 11417419.0, "step": 4975 }, { "entropy": 5.429950714111328, "epoch": 0.4783861671469741, "grad_norm": 1.1328125, "learning_rate": 0.0004983479055515914, "loss": 5.2844, "mean_token_accuracy": 0.18997065275907515, "num_tokens": 11428145.0, "step": 4980 }, { "entropy": 5.290281534194946, "epoch": 0.47886647454370795, "grad_norm": 1.0, "learning_rate": 0.0004983437560131964, "loss": 5.2422, "mean_token_accuracy": 0.1993091583251953, "num_tokens": 11439224.0, "step": 4985 }, { "entropy": 5.409195756912231, "epoch": 0.4793467819404419, "grad_norm": 1.0859375, "learning_rate": 0.0004983396012894228, "loss": 5.3477, "mean_token_accuracy": 0.18979695290327073, "num_tokens": 11451731.0, "step": 4990 }, { "entropy": 5.435146522521973, "epoch": 0.47982708933717577, "grad_norm": 1.1171875, "learning_rate": 0.0004983354413803666, "loss": 5.3375, "mean_token_accuracy": 0.1958609476685524, "num_tokens": 11463058.0, "step": 4995 }, { "entropy": 5.473912382125855, "epoch": 0.4803073967339097, "grad_norm": 1.1640625, "learning_rate": 0.0004983312762861248, "loss": 5.4305, "mean_token_accuracy": 0.18449530750513077, "num_tokens": 11472618.0, "step": 5000 }, { "entropy": 5.364778709411621, "epoch": 0.4807877041306436, "grad_norm": 1.0625, "learning_rate": 0.0004983271060067939, "loss": 5.3246, "mean_token_accuracy": 0.18677808940410615, "num_tokens": 11483114.0, "step": 5005 }, { "entropy": 5.3417730808258055, "epoch": 0.4812680115273775, "grad_norm": 1.15625, "learning_rate": 0.0004983229305424707, "loss": 5.2799, "mean_token_accuracy": 0.19405496269464492, "num_tokens": 11494281.0, "step": 5010 }, { "entropy": 5.351672601699829, "epoch": 0.4817483189241114, "grad_norm": 1.0078125, "learning_rate": 0.0004983187498932522, "loss": 5.3503, "mean_token_accuracy": 0.18800514042377472, "num_tokens": 11505962.0, "step": 5015 }, { "entropy": 5.4874766826629635, "epoch": 0.48222862632084534, "grad_norm": 1.0078125, "learning_rate": 0.0004983145640592354, "loss": 5.4492, "mean_token_accuracy": 0.18352760821580888, "num_tokens": 11517558.0, "step": 5020 }, { "entropy": 5.448751974105835, "epoch": 0.4827089337175792, "grad_norm": 1.1015625, "learning_rate": 0.0004983103730405176, "loss": 5.4179, "mean_token_accuracy": 0.18682138621807098, "num_tokens": 11529184.0, "step": 5025 }, { "entropy": 5.338459253311157, "epoch": 0.48318924111431316, "grad_norm": 1.09375, "learning_rate": 0.000498306176837196, "loss": 5.3335, "mean_token_accuracy": 0.18406548202037812, "num_tokens": 11540727.0, "step": 5030 }, { "entropy": 5.360374689102173, "epoch": 0.48366954851104704, "grad_norm": 1.0390625, "learning_rate": 0.0004983019754493681, "loss": 5.261, "mean_token_accuracy": 0.1907915487885475, "num_tokens": 11551510.0, "step": 5035 }, { "entropy": 5.47594895362854, "epoch": 0.484149855907781, "grad_norm": 1.078125, "learning_rate": 0.0004982977688771314, "loss": 5.4187, "mean_token_accuracy": 0.18854755759239197, "num_tokens": 11563203.0, "step": 5040 }, { "entropy": 5.308377647399903, "epoch": 0.4846301633045149, "grad_norm": 1.0234375, "learning_rate": 0.0004982935571205835, "loss": 5.2718, "mean_token_accuracy": 0.19544857442379, "num_tokens": 11576013.0, "step": 5045 }, { "entropy": 5.291185140609741, "epoch": 0.4851104707012488, "grad_norm": 1.1328125, "learning_rate": 0.0004982893401798223, "loss": 5.2498, "mean_token_accuracy": 0.20830876976251603, "num_tokens": 11587535.0, "step": 5050 }, { "entropy": 5.403550291061402, "epoch": 0.48559077809798273, "grad_norm": 1.0234375, "learning_rate": 0.0004982851180549456, "loss": 5.2771, "mean_token_accuracy": 0.19294197112321854, "num_tokens": 11598487.0, "step": 5055 }, { "entropy": 5.25755033493042, "epoch": 0.4860710854947166, "grad_norm": 1.046875, "learning_rate": 0.0004982808907460515, "loss": 5.1559, "mean_token_accuracy": 0.20932556241750716, "num_tokens": 11609457.0, "step": 5060 }, { "entropy": 5.265308237075805, "epoch": 0.48655139289145055, "grad_norm": 1.015625, "learning_rate": 0.0004982766582532382, "loss": 5.2257, "mean_token_accuracy": 0.19795275181531907, "num_tokens": 11620251.0, "step": 5065 }, { "entropy": 5.307956266403198, "epoch": 0.48703170028818443, "grad_norm": 1.1640625, "learning_rate": 0.0004982724205766038, "loss": 5.2262, "mean_token_accuracy": 0.19880327582359314, "num_tokens": 11630956.0, "step": 5070 }, { "entropy": 5.348564767837525, "epoch": 0.48751200768491837, "grad_norm": 0.9609375, "learning_rate": 0.0004982681777162468, "loss": 5.2773, "mean_token_accuracy": 0.1949208691716194, "num_tokens": 11642560.0, "step": 5075 }, { "entropy": 5.300316572189331, "epoch": 0.48799231508165225, "grad_norm": 1.109375, "learning_rate": 0.0004982639296722657, "loss": 5.2365, "mean_token_accuracy": 0.19546635299921036, "num_tokens": 11654050.0, "step": 5080 }, { "entropy": 5.333183813095093, "epoch": 0.4884726224783862, "grad_norm": 1.109375, "learning_rate": 0.0004982596764447591, "loss": 5.4035, "mean_token_accuracy": 0.19310665130615234, "num_tokens": 11664947.0, "step": 5085 }, { "entropy": 5.469000768661499, "epoch": 0.48895292987512007, "grad_norm": 1.0390625, "learning_rate": 0.0004982554180338258, "loss": 5.3106, "mean_token_accuracy": 0.19500951319932938, "num_tokens": 11676927.0, "step": 5090 }, { "entropy": 5.502379417419434, "epoch": 0.489433237271854, "grad_norm": 1.1484375, "learning_rate": 0.0004982511544395646, "loss": 5.4242, "mean_token_accuracy": 0.18115128874778746, "num_tokens": 11688573.0, "step": 5095 }, { "entropy": 5.288805294036865, "epoch": 0.4899135446685879, "grad_norm": 1.171875, "learning_rate": 0.0004982468856620745, "loss": 5.3128, "mean_token_accuracy": 0.18783441036939622, "num_tokens": 11698704.0, "step": 5100 }, { "entropy": 5.3273578643798825, "epoch": 0.4903938520653218, "grad_norm": 1.0859375, "learning_rate": 0.0004982426117014545, "loss": 5.2533, "mean_token_accuracy": 0.19392533451318741, "num_tokens": 11709466.0, "step": 5105 }, { "entropy": 5.3791663646698, "epoch": 0.4908741594620557, "grad_norm": 1.0703125, "learning_rate": 0.0004982383325578041, "loss": 5.3413, "mean_token_accuracy": 0.1898537114262581, "num_tokens": 11721120.0, "step": 5110 }, { "entropy": 5.4256843566894535, "epoch": 0.49135446685878964, "grad_norm": 1.0546875, "learning_rate": 0.0004982340482312226, "loss": 5.3358, "mean_token_accuracy": 0.18456312417984008, "num_tokens": 11732120.0, "step": 5115 }, { "entropy": 5.288364553451538, "epoch": 0.4918347742555235, "grad_norm": 1.046875, "learning_rate": 0.0004982297587218092, "loss": 5.2294, "mean_token_accuracy": 0.1978309139609337, "num_tokens": 11743501.0, "step": 5120 }, { "entropy": 5.363348197937012, "epoch": 0.49231508165225746, "grad_norm": 1.109375, "learning_rate": 0.0004982254640296637, "loss": 5.3152, "mean_token_accuracy": 0.1956743210554123, "num_tokens": 11755051.0, "step": 5125 }, { "entropy": 5.436681079864502, "epoch": 0.49279538904899134, "grad_norm": 1.0546875, "learning_rate": 0.0004982211641548857, "loss": 5.4609, "mean_token_accuracy": 0.1842927649617195, "num_tokens": 11767663.0, "step": 5130 }, { "entropy": 5.419048309326172, "epoch": 0.4932756964457253, "grad_norm": 1.09375, "learning_rate": 0.0004982168590975752, "loss": 5.3034, "mean_token_accuracy": 0.19774986803531647, "num_tokens": 11778828.0, "step": 5135 }, { "entropy": 5.459513902664185, "epoch": 0.49375600384245916, "grad_norm": 1.046875, "learning_rate": 0.0004982125488578321, "loss": 5.4794, "mean_token_accuracy": 0.18496931344270706, "num_tokens": 11790654.0, "step": 5140 }, { "entropy": 5.433895540237427, "epoch": 0.4942363112391931, "grad_norm": 1.140625, "learning_rate": 0.0004982082334357563, "loss": 5.2837, "mean_token_accuracy": 0.1902835488319397, "num_tokens": 11801489.0, "step": 5145 }, { "entropy": 5.311564207077026, "epoch": 0.494716618635927, "grad_norm": 1.1328125, "learning_rate": 0.0004982039128314481, "loss": 5.2873, "mean_token_accuracy": 0.19224448949098588, "num_tokens": 11813818.0, "step": 5150 }, { "entropy": 5.333755207061768, "epoch": 0.4951969260326609, "grad_norm": 1.0625, "learning_rate": 0.0004981995870450079, "loss": 5.2929, "mean_token_accuracy": 0.191859370470047, "num_tokens": 11824814.0, "step": 5155 }, { "entropy": 5.45896692276001, "epoch": 0.4956772334293948, "grad_norm": 1.140625, "learning_rate": 0.0004981952560765361, "loss": 5.3373, "mean_token_accuracy": 0.18679553270339966, "num_tokens": 11836252.0, "step": 5160 }, { "entropy": 5.314207363128662, "epoch": 0.49615754082612873, "grad_norm": 1.09375, "learning_rate": 0.0004981909199261331, "loss": 5.2629, "mean_token_accuracy": 0.19086166322231293, "num_tokens": 11847715.0, "step": 5165 }, { "entropy": 5.273135042190551, "epoch": 0.4966378482228626, "grad_norm": 1.0625, "learning_rate": 0.0004981865785938998, "loss": 5.2629, "mean_token_accuracy": 0.19300127327442168, "num_tokens": 11860309.0, "step": 5170 }, { "entropy": 5.348716497421265, "epoch": 0.49711815561959655, "grad_norm": 1.015625, "learning_rate": 0.0004981822320799367, "loss": 5.2577, "mean_token_accuracy": 0.1956932559609413, "num_tokens": 11872569.0, "step": 5175 }, { "entropy": 5.3287012577056885, "epoch": 0.49759846301633043, "grad_norm": 1.0546875, "learning_rate": 0.0004981778803843449, "loss": 5.2523, "mean_token_accuracy": 0.19481286704540252, "num_tokens": 11884778.0, "step": 5180 }, { "entropy": 5.390296173095703, "epoch": 0.49807877041306436, "grad_norm": 1.0859375, "learning_rate": 0.0004981735235072256, "loss": 5.3358, "mean_token_accuracy": 0.1911753833293915, "num_tokens": 11897324.0, "step": 5185 }, { "entropy": 5.467144203186035, "epoch": 0.49855907780979825, "grad_norm": 1.03125, "learning_rate": 0.0004981691614486796, "loss": 5.366, "mean_token_accuracy": 0.18982964605093003, "num_tokens": 11909145.0, "step": 5190 }, { "entropy": 5.322554683685302, "epoch": 0.4990393852065322, "grad_norm": 1.0625, "learning_rate": 0.0004981647942088084, "loss": 5.2697, "mean_token_accuracy": 0.20009808093309403, "num_tokens": 11921021.0, "step": 5195 }, { "entropy": 5.487699699401856, "epoch": 0.49951969260326606, "grad_norm": 1.046875, "learning_rate": 0.0004981604217877135, "loss": 5.4279, "mean_token_accuracy": 0.1888749822974205, "num_tokens": 11932565.0, "step": 5200 }, { "entropy": 5.318529844284058, "epoch": 0.5, "grad_norm": 1.046875, "learning_rate": 0.000498156044185496, "loss": 5.3392, "mean_token_accuracy": 0.19370948225259782, "num_tokens": 11943225.0, "step": 5205 }, { "entropy": 5.364103078842163, "epoch": 0.5004803073967339, "grad_norm": 1.140625, "learning_rate": 0.0004981516614022579, "loss": 5.3219, "mean_token_accuracy": 0.1932568922638893, "num_tokens": 11954821.0, "step": 5210 }, { "entropy": 5.446450281143188, "epoch": 0.5009606147934679, "grad_norm": 1.1328125, "learning_rate": 0.0004981472734381008, "loss": 5.2738, "mean_token_accuracy": 0.1951069414615631, "num_tokens": 11966090.0, "step": 5215 }, { "entropy": 5.353061962127685, "epoch": 0.5014409221902018, "grad_norm": 1.1015625, "learning_rate": 0.0004981428802931267, "loss": 5.3074, "mean_token_accuracy": 0.1921882688999176, "num_tokens": 11977410.0, "step": 5220 }, { "entropy": 5.339950656890869, "epoch": 0.5019212295869356, "grad_norm": 1.15625, "learning_rate": 0.0004981384819674375, "loss": 5.2841, "mean_token_accuracy": 0.19126271605491638, "num_tokens": 11989119.0, "step": 5225 }, { "entropy": 5.432912015914917, "epoch": 0.5024015369836695, "grad_norm": 1.0390625, "learning_rate": 0.0004981340784611354, "loss": 5.3942, "mean_token_accuracy": 0.19018032401800156, "num_tokens": 12000165.0, "step": 5230 }, { "entropy": 5.395741987228393, "epoch": 0.5028818443804035, "grad_norm": 1.0546875, "learning_rate": 0.0004981296697743224, "loss": 5.3475, "mean_token_accuracy": 0.18768104463815688, "num_tokens": 12012118.0, "step": 5235 }, { "entropy": 5.430673694610595, "epoch": 0.5033621517771374, "grad_norm": 1.1015625, "learning_rate": 0.0004981252559071012, "loss": 5.4181, "mean_token_accuracy": 0.1866712138056755, "num_tokens": 12023432.0, "step": 5240 }, { "entropy": 5.427559089660645, "epoch": 0.5038424591738713, "grad_norm": 1.1953125, "learning_rate": 0.0004981208368595739, "loss": 5.2939, "mean_token_accuracy": 0.1980261042714119, "num_tokens": 12034323.0, "step": 5245 }, { "entropy": 5.264776802062988, "epoch": 0.5043227665706052, "grad_norm": 1.09375, "learning_rate": 0.0004981164126318435, "loss": 5.3022, "mean_token_accuracy": 0.19116167575120926, "num_tokens": 12045532.0, "step": 5250 }, { "entropy": 5.449652862548828, "epoch": 0.5048030739673391, "grad_norm": 1.015625, "learning_rate": 0.0004981119832240124, "loss": 5.3111, "mean_token_accuracy": 0.19520313441753387, "num_tokens": 12057346.0, "step": 5255 }, { "entropy": 5.301677227020264, "epoch": 0.505283381364073, "grad_norm": 1.046875, "learning_rate": 0.0004981075486361837, "loss": 5.2825, "mean_token_accuracy": 0.19872631430625914, "num_tokens": 12068670.0, "step": 5260 }, { "entropy": 5.390146923065186, "epoch": 0.5057636887608069, "grad_norm": 1.0703125, "learning_rate": 0.0004981031088684601, "loss": 5.4028, "mean_token_accuracy": 0.18470921665430068, "num_tokens": 12079664.0, "step": 5265 }, { "entropy": 5.474726438522339, "epoch": 0.5062439961575408, "grad_norm": 1.0859375, "learning_rate": 0.0004980986639209448, "loss": 5.3285, "mean_token_accuracy": 0.1994831383228302, "num_tokens": 12089984.0, "step": 5270 }, { "entropy": 5.29730339050293, "epoch": 0.5067243035542748, "grad_norm": 1.1484375, "learning_rate": 0.000498094213793741, "loss": 5.2835, "mean_token_accuracy": 0.1948940023779869, "num_tokens": 12101182.0, "step": 5275 }, { "entropy": 5.408280658721924, "epoch": 0.5072046109510087, "grad_norm": 1.125, "learning_rate": 0.000498089758486952, "loss": 5.353, "mean_token_accuracy": 0.18289182782173158, "num_tokens": 12112002.0, "step": 5280 }, { "entropy": 5.495666790008545, "epoch": 0.5076849183477425, "grad_norm": 1.0, "learning_rate": 0.0004980852980006812, "loss": 5.4392, "mean_token_accuracy": 0.1805154114961624, "num_tokens": 12124194.0, "step": 5285 }, { "entropy": 5.392632579803466, "epoch": 0.5081652257444764, "grad_norm": 1.1640625, "learning_rate": 0.0004980808323350323, "loss": 5.359, "mean_token_accuracy": 0.1960368499159813, "num_tokens": 12133966.0, "step": 5290 }, { "entropy": 5.391989612579346, "epoch": 0.5086455331412104, "grad_norm": 1.1484375, "learning_rate": 0.0004980763614901089, "loss": 5.2967, "mean_token_accuracy": 0.19686038345098494, "num_tokens": 12145643.0, "step": 5295 }, { "entropy": 5.379247760772705, "epoch": 0.5091258405379443, "grad_norm": 1.1015625, "learning_rate": 0.0004980718854660146, "loss": 5.3464, "mean_token_accuracy": 0.18789971768856048, "num_tokens": 12156804.0, "step": 5300 }, { "entropy": 5.400803756713867, "epoch": 0.5096061479346782, "grad_norm": 1.0234375, "learning_rate": 0.0004980674042628537, "loss": 5.2967, "mean_token_accuracy": 0.19052283465862274, "num_tokens": 12168700.0, "step": 5305 }, { "entropy": 5.401619243621826, "epoch": 0.5100864553314121, "grad_norm": 1.0078125, "learning_rate": 0.00049806291788073, "loss": 5.3123, "mean_token_accuracy": 0.18629832863807677, "num_tokens": 12181050.0, "step": 5310 }, { "entropy": 5.469602966308594, "epoch": 0.510566762728146, "grad_norm": 1.0859375, "learning_rate": 0.0004980584263197477, "loss": 5.3949, "mean_token_accuracy": 0.1858072027564049, "num_tokens": 12192001.0, "step": 5315 }, { "entropy": 5.508568143844604, "epoch": 0.5110470701248799, "grad_norm": 1.1875, "learning_rate": 0.0004980539295800111, "loss": 5.509, "mean_token_accuracy": 0.18043418526649474, "num_tokens": 12202436.0, "step": 5320 }, { "entropy": 5.362590551376343, "epoch": 0.5115273775216138, "grad_norm": 1.0546875, "learning_rate": 0.0004980494276616246, "loss": 5.3016, "mean_token_accuracy": 0.18966611623764038, "num_tokens": 12214454.0, "step": 5325 }, { "entropy": 5.349428033828735, "epoch": 0.5120076849183477, "grad_norm": 1.0390625, "learning_rate": 0.0004980449205646926, "loss": 5.3122, "mean_token_accuracy": 0.19553214311599731, "num_tokens": 12225924.0, "step": 5330 }, { "entropy": 5.415020084381103, "epoch": 0.5124879923150817, "grad_norm": 1.1328125, "learning_rate": 0.00049804040828932, "loss": 5.3326, "mean_token_accuracy": 0.19512139409780502, "num_tokens": 12236456.0, "step": 5335 }, { "entropy": 5.421989011764526, "epoch": 0.5129682997118156, "grad_norm": 1.03125, "learning_rate": 0.0004980358908356113, "loss": 5.3535, "mean_token_accuracy": 0.18762658089399337, "num_tokens": 12247719.0, "step": 5340 }, { "entropy": 5.350346803665161, "epoch": 0.5134486071085494, "grad_norm": 1.0703125, "learning_rate": 0.0004980313682036717, "loss": 5.381, "mean_token_accuracy": 0.1927213490009308, "num_tokens": 12259141.0, "step": 5345 }, { "entropy": 5.49134635925293, "epoch": 0.5139289145052833, "grad_norm": 1.2109375, "learning_rate": 0.0004980268403936058, "loss": 5.4456, "mean_token_accuracy": 0.18453603684902192, "num_tokens": 12269748.0, "step": 5350 }, { "entropy": 5.434391784667969, "epoch": 0.5144092219020173, "grad_norm": 1.0546875, "learning_rate": 0.0004980223074055189, "loss": 5.379, "mean_token_accuracy": 0.1960138276219368, "num_tokens": 12281456.0, "step": 5355 }, { "entropy": 5.409012746810913, "epoch": 0.5148895292987512, "grad_norm": 1.125, "learning_rate": 0.0004980177692395164, "loss": 5.3518, "mean_token_accuracy": 0.18338604271411896, "num_tokens": 12293763.0, "step": 5360 }, { "entropy": 5.351993417739868, "epoch": 0.5153698366954851, "grad_norm": 0.98046875, "learning_rate": 0.0004980132258957035, "loss": 5.2808, "mean_token_accuracy": 0.1969463735818863, "num_tokens": 12305398.0, "step": 5365 }, { "entropy": 5.274507617950439, "epoch": 0.515850144092219, "grad_norm": 1.1796875, "learning_rate": 0.0004980086773741856, "loss": 5.2796, "mean_token_accuracy": 0.19121709913015367, "num_tokens": 12316582.0, "step": 5370 }, { "entropy": 5.483122396469116, "epoch": 0.516330451488953, "grad_norm": 1.1875, "learning_rate": 0.0004980041236750685, "loss": 5.3846, "mean_token_accuracy": 0.18809578120708464, "num_tokens": 12328463.0, "step": 5375 }, { "entropy": 5.445298194885254, "epoch": 0.5168107588856868, "grad_norm": 1.0390625, "learning_rate": 0.0004979995647984577, "loss": 5.3698, "mean_token_accuracy": 0.19524169117212295, "num_tokens": 12341040.0, "step": 5380 }, { "entropy": 5.2983297348022464, "epoch": 0.5172910662824207, "grad_norm": 0.9765625, "learning_rate": 0.0004979950007444593, "loss": 5.261, "mean_token_accuracy": 0.1934810236096382, "num_tokens": 12353024.0, "step": 5385 }, { "entropy": 5.358570623397827, "epoch": 0.5177713736791547, "grad_norm": 1.0625, "learning_rate": 0.0004979904315131792, "loss": 5.2844, "mean_token_accuracy": 0.19403222799301148, "num_tokens": 12366100.0, "step": 5390 }, { "entropy": 5.293501186370849, "epoch": 0.5182516810758886, "grad_norm": 1.0546875, "learning_rate": 0.0004979858571047233, "loss": 5.2707, "mean_token_accuracy": 0.19768950045108796, "num_tokens": 12377829.0, "step": 5395 }, { "entropy": 5.466844320297241, "epoch": 0.5187319884726225, "grad_norm": 1.15625, "learning_rate": 0.0004979812775191979, "loss": 5.4031, "mean_token_accuracy": 0.18979473859071733, "num_tokens": 12390830.0, "step": 5400 }, { "entropy": 5.328051805496216, "epoch": 0.5192122958693564, "grad_norm": 1.0546875, "learning_rate": 0.0004979766927567094, "loss": 5.2545, "mean_token_accuracy": 0.19470396041870117, "num_tokens": 12401642.0, "step": 5405 }, { "entropy": 5.3456236839294435, "epoch": 0.5196926032660903, "grad_norm": 1.109375, "learning_rate": 0.0004979721028173643, "loss": 5.3476, "mean_token_accuracy": 0.1877232700586319, "num_tokens": 12411653.0, "step": 5410 }, { "entropy": 5.386164760589599, "epoch": 0.5201729106628242, "grad_norm": 1.0859375, "learning_rate": 0.000497967507701269, "loss": 5.2486, "mean_token_accuracy": 0.20038487911224365, "num_tokens": 12422891.0, "step": 5415 }, { "entropy": 5.397801113128662, "epoch": 0.5206532180595581, "grad_norm": 1.0625, "learning_rate": 0.0004979629074085303, "loss": 5.3408, "mean_token_accuracy": 0.19329493790864943, "num_tokens": 12434190.0, "step": 5420 }, { "entropy": 5.424389457702636, "epoch": 0.521133525456292, "grad_norm": 1.0234375, "learning_rate": 0.0004979583019392548, "loss": 5.3974, "mean_token_accuracy": 0.18989453911781312, "num_tokens": 12445796.0, "step": 5425 }, { "entropy": 5.483598613739014, "epoch": 0.521613832853026, "grad_norm": 1.140625, "learning_rate": 0.0004979536912935497, "loss": 5.4639, "mean_token_accuracy": 0.18501935750246049, "num_tokens": 12456212.0, "step": 5430 }, { "entropy": 5.330318355560303, "epoch": 0.5220941402497599, "grad_norm": 1.1171875, "learning_rate": 0.000497949075471522, "loss": 5.1899, "mean_token_accuracy": 0.19820088148117065, "num_tokens": 12467871.0, "step": 5435 }, { "entropy": 5.372925519943237, "epoch": 0.5225744476464937, "grad_norm": 1.0625, "learning_rate": 0.0004979444544732786, "loss": 5.2819, "mean_token_accuracy": 0.1852207139134407, "num_tokens": 12478626.0, "step": 5440 }, { "entropy": 5.313206958770752, "epoch": 0.5230547550432276, "grad_norm": 1.015625, "learning_rate": 0.000497939828298927, "loss": 5.3741, "mean_token_accuracy": 0.19033849388360977, "num_tokens": 12491487.0, "step": 5445 }, { "entropy": 5.462804317474365, "epoch": 0.5235350624399616, "grad_norm": 1.1484375, "learning_rate": 0.0004979351969485747, "loss": 5.3383, "mean_token_accuracy": 0.18805173933506011, "num_tokens": 12503240.0, "step": 5450 }, { "entropy": 5.4243183612823485, "epoch": 0.5240153698366955, "grad_norm": 1.0859375, "learning_rate": 0.0004979305604223291, "loss": 5.2774, "mean_token_accuracy": 0.1903422147035599, "num_tokens": 12513860.0, "step": 5455 }, { "entropy": 5.313809871673584, "epoch": 0.5244956772334294, "grad_norm": 1.1171875, "learning_rate": 0.0004979259187202978, "loss": 5.352, "mean_token_accuracy": 0.1945337176322937, "num_tokens": 12525884.0, "step": 5460 }, { "entropy": 5.442373895645142, "epoch": 0.5249759846301633, "grad_norm": 1.1171875, "learning_rate": 0.0004979212718425887, "loss": 5.2672, "mean_token_accuracy": 0.1932208612561226, "num_tokens": 12536709.0, "step": 5465 }, { "entropy": 5.334468412399292, "epoch": 0.5254562920268973, "grad_norm": 1.15625, "learning_rate": 0.0004979166197893096, "loss": 5.2663, "mean_token_accuracy": 0.19677013605833055, "num_tokens": 12549727.0, "step": 5470 }, { "entropy": 5.339883422851562, "epoch": 0.5259365994236311, "grad_norm": 0.98828125, "learning_rate": 0.0004979119625605683, "loss": 5.3345, "mean_token_accuracy": 0.18942939788103103, "num_tokens": 12562053.0, "step": 5475 }, { "entropy": 5.287409067153931, "epoch": 0.526416906820365, "grad_norm": 1.1171875, "learning_rate": 0.0004979073001564734, "loss": 5.2257, "mean_token_accuracy": 0.20170782059431075, "num_tokens": 12574096.0, "step": 5480 }, { "entropy": 5.40628571510315, "epoch": 0.5268972142170989, "grad_norm": 1.046875, "learning_rate": 0.0004979026325771328, "loss": 5.4013, "mean_token_accuracy": 0.18865474164485932, "num_tokens": 12585416.0, "step": 5485 }, { "entropy": 5.369120025634766, "epoch": 0.5273775216138329, "grad_norm": 1.0078125, "learning_rate": 0.0004978979598226549, "loss": 5.2525, "mean_token_accuracy": 0.1964880034327507, "num_tokens": 12596861.0, "step": 5490 }, { "entropy": 5.307511520385742, "epoch": 0.5278578290105668, "grad_norm": 1.109375, "learning_rate": 0.0004978932818931483, "loss": 5.2672, "mean_token_accuracy": 0.19722044318914414, "num_tokens": 12607761.0, "step": 5495 }, { "entropy": 5.4275431632995605, "epoch": 0.5283381364073007, "grad_norm": 1.1171875, "learning_rate": 0.0004978885987887216, "loss": 5.3898, "mean_token_accuracy": 0.19588741660118103, "num_tokens": 12619889.0, "step": 5500 }, { "entropy": 5.4371997833251955, "epoch": 0.5288184438040345, "grad_norm": 1.1171875, "learning_rate": 0.0004978839105094833, "loss": 5.3606, "mean_token_accuracy": 0.19224700778722764, "num_tokens": 12630604.0, "step": 5505 }, { "entropy": 5.222589921951294, "epoch": 0.5292987512007685, "grad_norm": 1.078125, "learning_rate": 0.0004978792170555426, "loss": 5.2618, "mean_token_accuracy": 0.19633477181196213, "num_tokens": 12641172.0, "step": 5510 }, { "entropy": 5.292724561691284, "epoch": 0.5297790585975024, "grad_norm": 1.046875, "learning_rate": 0.0004978745184270083, "loss": 5.1601, "mean_token_accuracy": 0.20660953521728515, "num_tokens": 12651731.0, "step": 5515 }, { "entropy": 5.392834901809692, "epoch": 0.5302593659942363, "grad_norm": 1.0859375, "learning_rate": 0.0004978698146239893, "loss": 5.2978, "mean_token_accuracy": 0.1936490774154663, "num_tokens": 12663050.0, "step": 5520 }, { "entropy": 5.409347009658814, "epoch": 0.5307396733909702, "grad_norm": 1.0703125, "learning_rate": 0.0004978651056465952, "loss": 5.3862, "mean_token_accuracy": 0.18999682515859603, "num_tokens": 12674732.0, "step": 5525 }, { "entropy": 5.332290983200073, "epoch": 0.5312199807877042, "grad_norm": 1.140625, "learning_rate": 0.000497860391494935, "loss": 5.2171, "mean_token_accuracy": 0.19382983297109604, "num_tokens": 12685981.0, "step": 5530 }, { "entropy": 5.412051010131836, "epoch": 0.531700288184438, "grad_norm": 1.1328125, "learning_rate": 0.0004978556721691183, "loss": 5.3525, "mean_token_accuracy": 0.19065555483102797, "num_tokens": 12697139.0, "step": 5535 }, { "entropy": 5.317591810226441, "epoch": 0.5321805955811719, "grad_norm": 1.1015625, "learning_rate": 0.0004978509476692547, "loss": 5.2966, "mean_token_accuracy": 0.18611351698637008, "num_tokens": 12708268.0, "step": 5540 }, { "entropy": 5.375318956375122, "epoch": 0.5326609029779059, "grad_norm": 1.1015625, "learning_rate": 0.0004978462179954538, "loss": 5.2958, "mean_token_accuracy": 0.18993753045797349, "num_tokens": 12720715.0, "step": 5545 }, { "entropy": 5.3367125511169435, "epoch": 0.5331412103746398, "grad_norm": 1.1171875, "learning_rate": 0.0004978414831478253, "loss": 5.269, "mean_token_accuracy": 0.19713337272405623, "num_tokens": 12732409.0, "step": 5550 }, { "entropy": 5.323969554901123, "epoch": 0.5336215177713737, "grad_norm": 1.1171875, "learning_rate": 0.0004978367431264794, "loss": 5.397, "mean_token_accuracy": 0.18209069669246675, "num_tokens": 12745174.0, "step": 5555 }, { "entropy": 5.410878992080688, "epoch": 0.5341018251681076, "grad_norm": 1.078125, "learning_rate": 0.0004978319979315261, "loss": 5.3328, "mean_token_accuracy": 0.19573558866977692, "num_tokens": 12756116.0, "step": 5560 }, { "entropy": 5.376229763031006, "epoch": 0.5345821325648416, "grad_norm": 1.21875, "learning_rate": 0.0004978272475630752, "loss": 5.2851, "mean_token_accuracy": 0.1916971653699875, "num_tokens": 12768183.0, "step": 5565 }, { "entropy": 5.264455699920655, "epoch": 0.5350624399615754, "grad_norm": 1.203125, "learning_rate": 0.0004978224920212374, "loss": 5.2931, "mean_token_accuracy": 0.1934914067387581, "num_tokens": 12778537.0, "step": 5570 }, { "entropy": 5.313297891616822, "epoch": 0.5355427473583093, "grad_norm": 1.109375, "learning_rate": 0.0004978177313061232, "loss": 5.3228, "mean_token_accuracy": 0.19088124930858613, "num_tokens": 12789691.0, "step": 5575 }, { "entropy": 5.473337554931641, "epoch": 0.5360230547550432, "grad_norm": 1.078125, "learning_rate": 0.0004978129654178426, "loss": 5.3433, "mean_token_accuracy": 0.18791570216417314, "num_tokens": 12801438.0, "step": 5580 }, { "entropy": 5.4069455623626705, "epoch": 0.5365033621517772, "grad_norm": 1.0546875, "learning_rate": 0.0004978081943565067, "loss": 5.3061, "mean_token_accuracy": 0.18656288981437683, "num_tokens": 12812425.0, "step": 5585 }, { "entropy": 5.307536172866821, "epoch": 0.5369836695485111, "grad_norm": 1.0390625, "learning_rate": 0.0004978034181222261, "loss": 5.2769, "mean_token_accuracy": 0.18625542372465134, "num_tokens": 12824735.0, "step": 5590 }, { "entropy": 5.430880117416382, "epoch": 0.537463976945245, "grad_norm": 1.1796875, "learning_rate": 0.0004977986367151119, "loss": 5.3688, "mean_token_accuracy": 0.1952778786420822, "num_tokens": 12835454.0, "step": 5595 }, { "entropy": 5.434065580368042, "epoch": 0.5379442843419788, "grad_norm": 1.046875, "learning_rate": 0.0004977938501352747, "loss": 5.4122, "mean_token_accuracy": 0.18514797538518907, "num_tokens": 12847086.0, "step": 5600 }, { "entropy": 5.385431623458862, "epoch": 0.5384245917387128, "grad_norm": 1.078125, "learning_rate": 0.0004977890583828259, "loss": 5.3549, "mean_token_accuracy": 0.1888865575194359, "num_tokens": 12857713.0, "step": 5605 }, { "entropy": 5.36136646270752, "epoch": 0.5389048991354467, "grad_norm": 1.125, "learning_rate": 0.0004977842614578768, "loss": 5.3356, "mean_token_accuracy": 0.18914903849363326, "num_tokens": 12869967.0, "step": 5610 }, { "entropy": 5.433460998535156, "epoch": 0.5393852065321806, "grad_norm": 1.0859375, "learning_rate": 0.0004977794593605386, "loss": 5.3684, "mean_token_accuracy": 0.18960850983858107, "num_tokens": 12881230.0, "step": 5615 }, { "entropy": 5.352547121047974, "epoch": 0.5398655139289145, "grad_norm": 1.109375, "learning_rate": 0.000497774652090923, "loss": 5.3222, "mean_token_accuracy": 0.18944347649812698, "num_tokens": 12892376.0, "step": 5620 }, { "entropy": 5.436691570281982, "epoch": 0.5403458213256485, "grad_norm": 1.09375, "learning_rate": 0.0004977698396491414, "loss": 5.3307, "mean_token_accuracy": 0.19240753799676896, "num_tokens": 12903709.0, "step": 5625 }, { "entropy": 5.2928542137146, "epoch": 0.5408261287223823, "grad_norm": 1.03125, "learning_rate": 0.0004977650220353055, "loss": 5.1629, "mean_token_accuracy": 0.19530351608991622, "num_tokens": 12914958.0, "step": 5630 }, { "entropy": 5.280749416351318, "epoch": 0.5413064361191162, "grad_norm": 1.1171875, "learning_rate": 0.0004977601992495274, "loss": 5.2875, "mean_token_accuracy": 0.1923414632678032, "num_tokens": 12927418.0, "step": 5635 }, { "entropy": 5.413435602188111, "epoch": 0.5417867435158501, "grad_norm": 0.98046875, "learning_rate": 0.0004977553712919189, "loss": 5.3325, "mean_token_accuracy": 0.1892315372824669, "num_tokens": 12939874.0, "step": 5640 }, { "entropy": 5.463119792938232, "epoch": 0.5422670509125841, "grad_norm": 1.1484375, "learning_rate": 0.0004977505381625921, "loss": 5.3542, "mean_token_accuracy": 0.18793897628784179, "num_tokens": 12951113.0, "step": 5645 }, { "entropy": 5.333239316940308, "epoch": 0.542747358309318, "grad_norm": 1.0, "learning_rate": 0.0004977456998616593, "loss": 5.247, "mean_token_accuracy": 0.19487171471118928, "num_tokens": 12961940.0, "step": 5650 }, { "entropy": 5.247047281265258, "epoch": 0.5432276657060519, "grad_norm": 1.1015625, "learning_rate": 0.0004977408563892327, "loss": 5.2389, "mean_token_accuracy": 0.19528348445892335, "num_tokens": 12973938.0, "step": 5655 }, { "entropy": 5.355054330825806, "epoch": 0.5437079731027857, "grad_norm": 1.1015625, "learning_rate": 0.0004977360077454249, "loss": 5.2669, "mean_token_accuracy": 0.19261687248945236, "num_tokens": 12985400.0, "step": 5660 }, { "entropy": 5.381504774093628, "epoch": 0.5441882804995197, "grad_norm": 1.0390625, "learning_rate": 0.0004977311539303483, "loss": 5.2984, "mean_token_accuracy": 0.202898870408535, "num_tokens": 12996402.0, "step": 5665 }, { "entropy": 5.339759063720703, "epoch": 0.5446685878962536, "grad_norm": 1.2421875, "learning_rate": 0.0004977262949441158, "loss": 5.1882, "mean_token_accuracy": 0.20247950553894042, "num_tokens": 13006991.0, "step": 5670 }, { "entropy": 5.329454803466797, "epoch": 0.5451488952929875, "grad_norm": 1.171875, "learning_rate": 0.0004977214307868399, "loss": 5.2909, "mean_token_accuracy": 0.19646303355693817, "num_tokens": 13016969.0, "step": 5675 }, { "entropy": 5.333616399765015, "epoch": 0.5456292026897214, "grad_norm": 1.265625, "learning_rate": 0.000497716561458634, "loss": 5.2395, "mean_token_accuracy": 0.1989587128162384, "num_tokens": 13027759.0, "step": 5680 }, { "entropy": 5.4932708740234375, "epoch": 0.5461095100864554, "grad_norm": 1.0625, "learning_rate": 0.0004977116869596107, "loss": 5.4415, "mean_token_accuracy": 0.1860479310154915, "num_tokens": 13039881.0, "step": 5685 }, { "entropy": 5.399776601791382, "epoch": 0.5465898174831892, "grad_norm": 1.0625, "learning_rate": 0.0004977068072898834, "loss": 5.3041, "mean_token_accuracy": 0.18947898745536804, "num_tokens": 13051443.0, "step": 5690 }, { "entropy": 5.3822290897369385, "epoch": 0.5470701248799231, "grad_norm": 1.0859375, "learning_rate": 0.0004977019224495652, "loss": 5.3697, "mean_token_accuracy": 0.18962922990322112, "num_tokens": 13063474.0, "step": 5695 }, { "entropy": 5.307476902008057, "epoch": 0.547550432276657, "grad_norm": 1.1796875, "learning_rate": 0.0004976970324387698, "loss": 5.234, "mean_token_accuracy": 0.20077043473720552, "num_tokens": 13074365.0, "step": 5700 }, { "entropy": 5.339881372451782, "epoch": 0.548030739673391, "grad_norm": 1.0625, "learning_rate": 0.0004976921372576104, "loss": 5.3033, "mean_token_accuracy": 0.19367703795433044, "num_tokens": 13087354.0, "step": 5705 }, { "entropy": 5.32935528755188, "epoch": 0.5485110470701249, "grad_norm": 1.03125, "learning_rate": 0.0004976872369062011, "loss": 5.2787, "mean_token_accuracy": 0.19071510583162307, "num_tokens": 13099306.0, "step": 5710 }, { "entropy": 5.4302033424377445, "epoch": 0.5489913544668588, "grad_norm": 1.109375, "learning_rate": 0.0004976823313846552, "loss": 5.4164, "mean_token_accuracy": 0.19036435931921006, "num_tokens": 13111259.0, "step": 5715 }, { "entropy": 5.4693896770477295, "epoch": 0.5494716618635928, "grad_norm": 1.0625, "learning_rate": 0.0004976774206930869, "loss": 5.3256, "mean_token_accuracy": 0.18587163984775543, "num_tokens": 13123589.0, "step": 5720 }, { "entropy": 5.253912925720215, "epoch": 0.5499519692603266, "grad_norm": 1.109375, "learning_rate": 0.0004976725048316101, "loss": 5.322, "mean_token_accuracy": 0.19089159667491912, "num_tokens": 13136485.0, "step": 5725 }, { "entropy": 5.40102801322937, "epoch": 0.5504322766570605, "grad_norm": 0.98828125, "learning_rate": 0.0004976675838003388, "loss": 5.2997, "mean_token_accuracy": 0.19145811647176741, "num_tokens": 13148067.0, "step": 5730 }, { "entropy": 5.367999935150147, "epoch": 0.5509125840537944, "grad_norm": 1.171875, "learning_rate": 0.0004976626575993877, "loss": 5.2818, "mean_token_accuracy": 0.18961854726076127, "num_tokens": 13159813.0, "step": 5735 }, { "entropy": 5.410087442398071, "epoch": 0.5513928914505284, "grad_norm": 1.234375, "learning_rate": 0.0004976577262288705, "loss": 5.356, "mean_token_accuracy": 0.18928916603326798, "num_tokens": 13170828.0, "step": 5740 }, { "entropy": 5.265670728683472, "epoch": 0.5518731988472623, "grad_norm": 1.125, "learning_rate": 0.0004976527896889023, "loss": 5.181, "mean_token_accuracy": 0.20403801798820495, "num_tokens": 13181883.0, "step": 5745 }, { "entropy": 5.295314884185791, "epoch": 0.5523535062439962, "grad_norm": 1.21875, "learning_rate": 0.0004976478479795974, "loss": 5.2557, "mean_token_accuracy": 0.1949864685535431, "num_tokens": 13193530.0, "step": 5750 }, { "entropy": 5.484155082702637, "epoch": 0.55283381364073, "grad_norm": 1.1015625, "learning_rate": 0.0004976429011010706, "loss": 5.4823, "mean_token_accuracy": 0.17912757843732835, "num_tokens": 13205822.0, "step": 5755 }, { "entropy": 5.3539347648620605, "epoch": 0.553314121037464, "grad_norm": 1.0546875, "learning_rate": 0.0004976379490534366, "loss": 5.2081, "mean_token_accuracy": 0.19992550164461137, "num_tokens": 13216698.0, "step": 5760 }, { "entropy": 5.291062736511231, "epoch": 0.5537944284341979, "grad_norm": 1.0625, "learning_rate": 0.0004976329918368107, "loss": 5.2968, "mean_token_accuracy": 0.19075367897748946, "num_tokens": 13228389.0, "step": 5765 }, { "entropy": 5.433424997329712, "epoch": 0.5542747358309318, "grad_norm": 1.109375, "learning_rate": 0.0004976280294513079, "loss": 5.3505, "mean_token_accuracy": 0.18287664502859116, "num_tokens": 13239628.0, "step": 5770 }, { "entropy": 5.404953861236573, "epoch": 0.5547550432276657, "grad_norm": 1.078125, "learning_rate": 0.0004976230618970431, "loss": 5.352, "mean_token_accuracy": 0.19548004865646362, "num_tokens": 13251149.0, "step": 5775 }, { "entropy": 5.455016326904297, "epoch": 0.5552353506243997, "grad_norm": 1.0625, "learning_rate": 0.000497618089174132, "loss": 5.413, "mean_token_accuracy": 0.18660195618867875, "num_tokens": 13264846.0, "step": 5780 }, { "entropy": 5.248121690750122, "epoch": 0.5557156580211335, "grad_norm": 1.03125, "learning_rate": 0.0004976131112826898, "loss": 5.1913, "mean_token_accuracy": 0.2054605171084404, "num_tokens": 13275409.0, "step": 5785 }, { "entropy": 5.259016036987305, "epoch": 0.5561959654178674, "grad_norm": 1.03125, "learning_rate": 0.0004976081282228323, "loss": 5.1657, "mean_token_accuracy": 0.20358884781599046, "num_tokens": 13287173.0, "step": 5790 }, { "entropy": 5.411679124832153, "epoch": 0.5566762728146013, "grad_norm": 1.03125, "learning_rate": 0.000497603139994675, "loss": 5.2377, "mean_token_accuracy": 0.19680293649435043, "num_tokens": 13298225.0, "step": 5795 }, { "entropy": 5.2930761814117435, "epoch": 0.5571565802113353, "grad_norm": 1.0703125, "learning_rate": 0.0004975981465983338, "loss": 5.2468, "mean_token_accuracy": 0.19053254425525665, "num_tokens": 13309685.0, "step": 5800 }, { "entropy": 5.304633331298828, "epoch": 0.5576368876080692, "grad_norm": 1.1171875, "learning_rate": 0.0004975931480339246, "loss": 5.2554, "mean_token_accuracy": 0.19651708900928497, "num_tokens": 13320837.0, "step": 5805 }, { "entropy": 5.383905267715454, "epoch": 0.5581171950048031, "grad_norm": 1.0390625, "learning_rate": 0.0004975881443015635, "loss": 5.3718, "mean_token_accuracy": 0.19027461260557174, "num_tokens": 13333512.0, "step": 5810 }, { "entropy": 5.465289068222046, "epoch": 0.5585975024015369, "grad_norm": 1.0390625, "learning_rate": 0.0004975831354013667, "loss": 5.3829, "mean_token_accuracy": 0.19368760734796525, "num_tokens": 13345189.0, "step": 5815 }, { "entropy": 5.329316329956055, "epoch": 0.5590778097982709, "grad_norm": 1.078125, "learning_rate": 0.0004975781213334503, "loss": 5.2472, "mean_token_accuracy": 0.20152513086795806, "num_tokens": 13356123.0, "step": 5820 }, { "entropy": 5.329442405700684, "epoch": 0.5595581171950048, "grad_norm": 1.1875, "learning_rate": 0.0004975731020979309, "loss": 5.2949, "mean_token_accuracy": 0.19351785629987717, "num_tokens": 13366902.0, "step": 5825 }, { "entropy": 5.4559613227844235, "epoch": 0.5600384245917387, "grad_norm": 1.15625, "learning_rate": 0.0004975680776949249, "loss": 5.3542, "mean_token_accuracy": 0.18989898711442948, "num_tokens": 13377567.0, "step": 5830 }, { "entropy": 5.390386629104614, "epoch": 0.5605187319884726, "grad_norm": 1.0625, "learning_rate": 0.0004975630481245492, "loss": 5.2869, "mean_token_accuracy": 0.2009364992380142, "num_tokens": 13387297.0, "step": 5835 }, { "entropy": 5.348505544662475, "epoch": 0.5609990393852066, "grad_norm": 1.1796875, "learning_rate": 0.0004975580133869202, "loss": 5.3381, "mean_token_accuracy": 0.1932346299290657, "num_tokens": 13397723.0, "step": 5840 }, { "entropy": 5.408625984191895, "epoch": 0.5614793467819404, "grad_norm": 1.0625, "learning_rate": 0.0004975529734821552, "loss": 5.3863, "mean_token_accuracy": 0.18635910749435425, "num_tokens": 13409875.0, "step": 5845 }, { "entropy": 5.352054500579834, "epoch": 0.5619596541786743, "grad_norm": 1.1015625, "learning_rate": 0.0004975479284103708, "loss": 5.2921, "mean_token_accuracy": 0.1954024314880371, "num_tokens": 13421338.0, "step": 5850 }, { "entropy": 5.418287992477417, "epoch": 0.5624399615754082, "grad_norm": 1.171875, "learning_rate": 0.0004975428781716845, "loss": 5.3258, "mean_token_accuracy": 0.19152757823467254, "num_tokens": 13431373.0, "step": 5855 }, { "entropy": 5.360725784301758, "epoch": 0.5629202689721422, "grad_norm": 0.99609375, "learning_rate": 0.0004975378227662134, "loss": 5.3208, "mean_token_accuracy": 0.19721843004226686, "num_tokens": 13443158.0, "step": 5860 }, { "entropy": 5.44525113105774, "epoch": 0.5634005763688761, "grad_norm": 1.0859375, "learning_rate": 0.0004975327621940746, "loss": 5.3795, "mean_token_accuracy": 0.18757863938808442, "num_tokens": 13454559.0, "step": 5865 }, { "entropy": 5.453475904464722, "epoch": 0.56388088376561, "grad_norm": 1.0703125, "learning_rate": 0.0004975276964553861, "loss": 5.4604, "mean_token_accuracy": 0.1895272508263588, "num_tokens": 13466934.0, "step": 5870 }, { "entropy": 5.349884796142578, "epoch": 0.5643611911623438, "grad_norm": 1.3125, "learning_rate": 0.0004975226255502651, "loss": 5.2124, "mean_token_accuracy": 0.20376883447170258, "num_tokens": 13477770.0, "step": 5875 }, { "entropy": 5.428862237930298, "epoch": 0.5648414985590778, "grad_norm": 1.125, "learning_rate": 0.0004975175494788297, "loss": 5.4214, "mean_token_accuracy": 0.1833633303642273, "num_tokens": 13490093.0, "step": 5880 }, { "entropy": 5.4273130893707275, "epoch": 0.5653218059558117, "grad_norm": 1.390625, "learning_rate": 0.0004975124682411974, "loss": 5.2743, "mean_token_accuracy": 0.19006698280572892, "num_tokens": 13500663.0, "step": 5885 }, { "entropy": 5.404650068283081, "epoch": 0.5658021133525456, "grad_norm": 1.0546875, "learning_rate": 0.0004975073818374863, "loss": 5.3747, "mean_token_accuracy": 0.19194794446229935, "num_tokens": 13512369.0, "step": 5890 }, { "entropy": 5.352162408828735, "epoch": 0.5662824207492796, "grad_norm": 1.1953125, "learning_rate": 0.0004975022902678145, "loss": 5.2518, "mean_token_accuracy": 0.18981288820505143, "num_tokens": 13523181.0, "step": 5895 }, { "entropy": 5.307896852493286, "epoch": 0.5667627281460135, "grad_norm": 1.09375, "learning_rate": 0.0004974971935323003, "loss": 5.2062, "mean_token_accuracy": 0.19488532990217208, "num_tokens": 13534113.0, "step": 5900 }, { "entropy": 5.3025891304016115, "epoch": 0.5672430355427474, "grad_norm": 1.078125, "learning_rate": 0.0004974920916310619, "loss": 5.2425, "mean_token_accuracy": 0.19460777193307877, "num_tokens": 13545037.0, "step": 5905 }, { "entropy": 5.368872261047363, "epoch": 0.5677233429394812, "grad_norm": 1.109375, "learning_rate": 0.0004974869845642178, "loss": 5.2926, "mean_token_accuracy": 0.19421349167823793, "num_tokens": 13555541.0, "step": 5910 }, { "entropy": 5.389457654953003, "epoch": 0.5682036503362152, "grad_norm": 1.1875, "learning_rate": 0.0004974818723318866, "loss": 5.2973, "mean_token_accuracy": 0.19764145314693451, "num_tokens": 13566951.0, "step": 5915 }, { "entropy": 5.347638368606567, "epoch": 0.5686839577329491, "grad_norm": 1.078125, "learning_rate": 0.0004974767549341868, "loss": 5.3505, "mean_token_accuracy": 0.18888978958129882, "num_tokens": 13578492.0, "step": 5920 }, { "entropy": 5.425949621200561, "epoch": 0.569164265129683, "grad_norm": 1.1640625, "learning_rate": 0.0004974716323712376, "loss": 5.2433, "mean_token_accuracy": 0.20290264040231704, "num_tokens": 13589183.0, "step": 5925 }, { "entropy": 5.37887659072876, "epoch": 0.5696445725264169, "grad_norm": 1.140625, "learning_rate": 0.0004974665046431576, "loss": 5.3868, "mean_token_accuracy": 0.19258931577205657, "num_tokens": 13600588.0, "step": 5930 }, { "entropy": 5.309185123443603, "epoch": 0.5701248799231509, "grad_norm": 1.1015625, "learning_rate": 0.0004974613717500659, "loss": 5.2605, "mean_token_accuracy": 0.20295644104480742, "num_tokens": 13612107.0, "step": 5935 }, { "entropy": 5.485657453536987, "epoch": 0.5706051873198847, "grad_norm": 1.1640625, "learning_rate": 0.0004974562336920818, "loss": 5.4246, "mean_token_accuracy": 0.18908909112215042, "num_tokens": 13623973.0, "step": 5940 }, { "entropy": 5.3633698463439945, "epoch": 0.5710854947166186, "grad_norm": 1.03125, "learning_rate": 0.0004974510904693245, "loss": 5.2372, "mean_token_accuracy": 0.19648284167051316, "num_tokens": 13634994.0, "step": 5945 }, { "entropy": 5.412157249450684, "epoch": 0.5715658021133525, "grad_norm": 1.1171875, "learning_rate": 0.0004974459420819134, "loss": 5.3895, "mean_token_accuracy": 0.19440043568611146, "num_tokens": 13646361.0, "step": 5950 }, { "entropy": 5.36341814994812, "epoch": 0.5720461095100865, "grad_norm": 1.125, "learning_rate": 0.000497440788529968, "loss": 5.2834, "mean_token_accuracy": 0.19329349249601363, "num_tokens": 13656975.0, "step": 5955 }, { "entropy": 5.428890562057495, "epoch": 0.5725264169068204, "grad_norm": 1.0859375, "learning_rate": 0.0004974356298136081, "loss": 5.3207, "mean_token_accuracy": 0.18961571753025055, "num_tokens": 13668434.0, "step": 5960 }, { "entropy": 5.403112125396729, "epoch": 0.5730067243035543, "grad_norm": 1.1328125, "learning_rate": 0.0004974304659329533, "loss": 5.301, "mean_token_accuracy": 0.1921529397368431, "num_tokens": 13679266.0, "step": 5965 }, { "entropy": 5.291449975967407, "epoch": 0.5734870317002881, "grad_norm": 1.140625, "learning_rate": 0.0004974252968881236, "loss": 5.3247, "mean_token_accuracy": 0.18704658299684523, "num_tokens": 13690921.0, "step": 5970 }, { "entropy": 5.385117483139038, "epoch": 0.5739673390970221, "grad_norm": 1.078125, "learning_rate": 0.000497420122679239, "loss": 5.2579, "mean_token_accuracy": 0.19390686601400375, "num_tokens": 13702329.0, "step": 5975 }, { "entropy": 5.317170143127441, "epoch": 0.574447646493756, "grad_norm": 1.109375, "learning_rate": 0.0004974149433064196, "loss": 5.2295, "mean_token_accuracy": 0.20150385797023773, "num_tokens": 13713356.0, "step": 5980 }, { "entropy": 5.237676763534546, "epoch": 0.5749279538904899, "grad_norm": 1.0625, "learning_rate": 0.0004974097587697856, "loss": 5.2294, "mean_token_accuracy": 0.19473931789398194, "num_tokens": 13724718.0, "step": 5985 }, { "entropy": 5.28824028968811, "epoch": 0.5754082612872238, "grad_norm": 1.046875, "learning_rate": 0.0004974045690694575, "loss": 5.2596, "mean_token_accuracy": 0.196784345805645, "num_tokens": 13736113.0, "step": 5990 }, { "entropy": 5.417406034469605, "epoch": 0.5758885686839578, "grad_norm": 0.99609375, "learning_rate": 0.0004973993742055557, "loss": 5.272, "mean_token_accuracy": 0.19672393202781677, "num_tokens": 13748322.0, "step": 5995 }, { "entropy": 5.3009929180145265, "epoch": 0.5763688760806917, "grad_norm": 1.1328125, "learning_rate": 0.0004973941741782007, "loss": 5.2743, "mean_token_accuracy": 0.18973211497068404, "num_tokens": 13759433.0, "step": 6000 }, { "epoch": 0.5763688760806917, "eval_entropy": 5.216975544093005, "eval_loss": 5.320178508758545, "eval_mean_token_accuracy": 0.1993778554485636, "eval_num_tokens": 13759433.0, "eval_runtime": 27.3927, "eval_samples_per_second": 1197.949, "eval_steps_per_second": 149.748, "step": 6000 } ], "logging_steps": 5, "max_steps": 104090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1108419647488e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }