{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23244326931458292, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742584228515625, "epoch": 0.0002905540866432286, "grad_norm": 4.90625, "learning_rate": 2e-06, "loss": 10.7837, "mean_token_accuracy": 0.0, "num_tokens": 10156.0, "step": 5 }, { "entropy": 10.742587471008301, "epoch": 0.0005811081732864572, "grad_norm": 4.8125, "learning_rate": 4.5e-06, "loss": 10.7753, "mean_token_accuracy": 9.267840650863945e-05, "num_tokens": 20933.0, "step": 10 }, { "entropy": 10.74257869720459, "epoch": 0.0008716622599296859, "grad_norm": 4.375, "learning_rate": 7e-06, "loss": 10.7508, "mean_token_accuracy": 0.0, "num_tokens": 31298.0, "step": 15 }, { "entropy": 10.742635726928711, "epoch": 0.0011622163465729145, "grad_norm": 4.875, "learning_rate": 9.5e-06, "loss": 10.697, "mean_token_accuracy": 0.0, "num_tokens": 40913.0, "step": 20 }, { "entropy": 10.742652702331544, "epoch": 0.0014527704332161432, "grad_norm": 4.28125, "learning_rate": 1.2e-05, "loss": 10.5798, "mean_token_accuracy": 0.0007269373920280487, "num_tokens": 49901.0, "step": 25 }, { "entropy": 10.742454719543456, "epoch": 0.0017433245198593718, "grad_norm": 4.0625, "learning_rate": 1.4500000000000002e-05, "loss": 10.4688, "mean_token_accuracy": 0.01560134175233543, "num_tokens": 59328.0, "step": 30 }, { "entropy": 10.741775226593017, "epoch": 0.0020338786065026006, "grad_norm": 3.25, "learning_rate": 1.7000000000000003e-05, "loss": 10.3287, "mean_token_accuracy": 0.037073963694274424, "num_tokens": 68405.0, "step": 35 }, { "entropy": 10.740037631988525, "epoch": 0.002324432693145829, "grad_norm": 2.578125, "learning_rate": 1.95e-05, "loss": 10.2203, "mean_token_accuracy": 0.037133642472326756, "num_tokens": 77591.0, "step": 40 }, { "entropy": 10.73731575012207, "epoch": 0.0026149867797890577, "grad_norm": 2.359375, "learning_rate": 2.2e-05, "loss": 10.1202, "mean_token_accuracy": 0.03901108838617802, "num_tokens": 88186.0, "step": 45 }, { "entropy": 10.734606838226318, "epoch": 0.0029055408664322865, "grad_norm": 2.09375, "learning_rate": 2.4500000000000003e-05, "loss": 10.0211, "mean_token_accuracy": 0.04241710864007473, "num_tokens": 97594.0, "step": 50 }, { "entropy": 10.73211612701416, "epoch": 0.003196094953075515, "grad_norm": 1.9921875, "learning_rate": 2.7e-05, "loss": 9.9871, "mean_token_accuracy": 0.03826836366206408, "num_tokens": 107386.0, "step": 55 }, { "entropy": 10.73102445602417, "epoch": 0.0034866490397187436, "grad_norm": 1.9140625, "learning_rate": 2.95e-05, "loss": 9.9132, "mean_token_accuracy": 0.03943221494555473, "num_tokens": 116742.0, "step": 60 }, { "entropy": 10.729645252227783, "epoch": 0.0037772031263619723, "grad_norm": 1.859375, "learning_rate": 3.2e-05, "loss": 9.8519, "mean_token_accuracy": 0.03962419871240854, "num_tokens": 126520.0, "step": 65 }, { "entropy": 10.727834033966065, "epoch": 0.004067757213005201, "grad_norm": 1.7734375, "learning_rate": 3.4500000000000005e-05, "loss": 9.7907, "mean_token_accuracy": 0.03989919070154428, "num_tokens": 136382.0, "step": 70 }, { "entropy": 10.724947452545166, "epoch": 0.0043583112996484295, "grad_norm": 1.7421875, "learning_rate": 3.7e-05, "loss": 9.7212, "mean_token_accuracy": 0.03671109899878502, "num_tokens": 146435.0, "step": 75 }, { "entropy": 10.72182493209839, "epoch": 0.004648865386291658, "grad_norm": 1.8359375, "learning_rate": 3.95e-05, "loss": 9.6591, "mean_token_accuracy": 0.037667426839470865, "num_tokens": 156174.0, "step": 80 }, { "entropy": 10.71723222732544, "epoch": 0.004939419472934887, "grad_norm": 1.765625, "learning_rate": 4.2000000000000004e-05, "loss": 9.5783, "mean_token_accuracy": 0.04142397493124008, "num_tokens": 165118.0, "step": 85 }, { "entropy": 10.708585739135742, "epoch": 0.005229973559578115, "grad_norm": 1.875, "learning_rate": 4.45e-05, "loss": 9.5252, "mean_token_accuracy": 0.04036426953971386, "num_tokens": 174401.0, "step": 90 }, { "entropy": 10.697160243988037, "epoch": 0.005520527646221344, "grad_norm": 1.765625, "learning_rate": 4.7000000000000004e-05, "loss": 9.443, "mean_token_accuracy": 0.04118307866156101, "num_tokens": 183533.0, "step": 95 }, { "entropy": 10.683875274658202, "epoch": 0.005811081732864573, "grad_norm": 1.7734375, "learning_rate": 4.9500000000000004e-05, "loss": 9.3596, "mean_token_accuracy": 0.045602331310510634, "num_tokens": 193296.0, "step": 100 }, { "entropy": 10.665638542175293, "epoch": 0.006101635819507801, "grad_norm": 1.75, "learning_rate": 5.2e-05, "loss": 9.2242, "mean_token_accuracy": 0.055989645794034, "num_tokens": 202741.0, "step": 105 }, { "entropy": 10.637047958374023, "epoch": 0.00639218990615103, "grad_norm": 1.703125, "learning_rate": 5.45e-05, "loss": 9.1359, "mean_token_accuracy": 0.05134495124220848, "num_tokens": 212441.0, "step": 110 }, { "entropy": 10.61000461578369, "epoch": 0.006682743992794259, "grad_norm": 1.6875, "learning_rate": 5.7e-05, "loss": 8.9868, "mean_token_accuracy": 0.04918566383421421, "num_tokens": 220671.0, "step": 115 }, { "entropy": 10.56931962966919, "epoch": 0.006973298079437487, "grad_norm": 1.7578125, "learning_rate": 5.9499999999999996e-05, "loss": 8.9878, "mean_token_accuracy": 0.04560479037463665, "num_tokens": 231390.0, "step": 120 }, { "entropy": 10.515452098846435, "epoch": 0.007263852166080716, "grad_norm": 1.5859375, "learning_rate": 6.2e-05, "loss": 8.8241, "mean_token_accuracy": 0.05023673102259636, "num_tokens": 241137.0, "step": 125 }, { "entropy": 10.430156230926514, "epoch": 0.007554406252723945, "grad_norm": 1.578125, "learning_rate": 6.450000000000001e-05, "loss": 8.6778, "mean_token_accuracy": 0.05138532817363739, "num_tokens": 250627.0, "step": 130 }, { "entropy": 10.353140926361084, "epoch": 0.007844960339367173, "grad_norm": 1.5859375, "learning_rate": 6.7e-05, "loss": 8.5255, "mean_token_accuracy": 0.05529710613191128, "num_tokens": 259564.0, "step": 135 }, { "entropy": 10.262280082702636, "epoch": 0.008135514426010402, "grad_norm": 1.453125, "learning_rate": 6.950000000000001e-05, "loss": 8.4168, "mean_token_accuracy": 0.05102897398173809, "num_tokens": 268997.0, "step": 140 }, { "entropy": 10.17268762588501, "epoch": 0.00842606851265363, "grad_norm": 1.40625, "learning_rate": 7.2e-05, "loss": 8.402, "mean_token_accuracy": 0.04707291163504124, "num_tokens": 278989.0, "step": 145 }, { "entropy": 10.068906784057617, "epoch": 0.008716622599296859, "grad_norm": 1.3828125, "learning_rate": 7.45e-05, "loss": 8.2195, "mean_token_accuracy": 0.04823922924697399, "num_tokens": 288770.0, "step": 150 }, { "entropy": 9.884156227111816, "epoch": 0.009007176685940088, "grad_norm": 1.234375, "learning_rate": 7.7e-05, "loss": 8.1604, "mean_token_accuracy": 0.05296766012907028, "num_tokens": 298368.0, "step": 155 }, { "entropy": 9.749515438079834, "epoch": 0.009297730772583316, "grad_norm": 1.125, "learning_rate": 7.950000000000001e-05, "loss": 7.9887, "mean_token_accuracy": 0.054083061218261716, "num_tokens": 307437.0, "step": 160 }, { "entropy": 9.539670753479005, "epoch": 0.009588284859226545, "grad_norm": 1.3828125, "learning_rate": 8.2e-05, "loss": 7.931, "mean_token_accuracy": 0.05368399284780025, "num_tokens": 317842.0, "step": 165 }, { "entropy": 9.367785167694091, "epoch": 0.009878838945869774, "grad_norm": 0.984375, "learning_rate": 8.450000000000001e-05, "loss": 7.7746, "mean_token_accuracy": 0.056211471930146216, "num_tokens": 327455.0, "step": 170 }, { "entropy": 9.106531143188477, "epoch": 0.010169393032513002, "grad_norm": 0.96484375, "learning_rate": 8.7e-05, "loss": 7.7023, "mean_token_accuracy": 0.059121083468198776, "num_tokens": 338593.0, "step": 175 }, { "entropy": 8.891216564178468, "epoch": 0.01045994711915623, "grad_norm": 1.0, "learning_rate": 8.95e-05, "loss": 7.6717, "mean_token_accuracy": 0.060001150518655774, "num_tokens": 348278.0, "step": 180 }, { "entropy": 8.690237522125244, "epoch": 0.01075050120579946, "grad_norm": 0.91796875, "learning_rate": 9.2e-05, "loss": 7.5848, "mean_token_accuracy": 0.060652027279138564, "num_tokens": 358293.0, "step": 185 }, { "entropy": 8.500970458984375, "epoch": 0.011041055292442687, "grad_norm": 0.70703125, "learning_rate": 9.45e-05, "loss": 7.6462, "mean_token_accuracy": 0.06345079019665718, "num_tokens": 368177.0, "step": 190 }, { "entropy": 8.432841682434082, "epoch": 0.011331609379085917, "grad_norm": 0.87890625, "learning_rate": 9.7e-05, "loss": 7.5041, "mean_token_accuracy": 0.06438801400363445, "num_tokens": 377258.0, "step": 195 }, { "entropy": 8.328762531280518, "epoch": 0.011622163465729146, "grad_norm": 0.79296875, "learning_rate": 9.95e-05, "loss": 7.515, "mean_token_accuracy": 0.06462946832180023, "num_tokens": 385931.0, "step": 200 }, { "entropy": 8.228355598449706, "epoch": 0.011912717552372373, "grad_norm": 0.984375, "learning_rate": 0.000102, "loss": 7.4262, "mean_token_accuracy": 0.06731356121599674, "num_tokens": 394370.0, "step": 205 }, { "entropy": 8.163572025299072, "epoch": 0.012203271639015602, "grad_norm": 0.765625, "learning_rate": 0.00010449999999999999, "loss": 7.5127, "mean_token_accuracy": 0.06187250129878521, "num_tokens": 405167.0, "step": 210 }, { "entropy": 8.144425964355468, "epoch": 0.012493825725658832, "grad_norm": 0.90234375, "learning_rate": 0.000107, "loss": 7.4823, "mean_token_accuracy": 0.06424942426383495, "num_tokens": 414954.0, "step": 215 }, { "entropy": 8.074434852600097, "epoch": 0.01278437981230206, "grad_norm": 1.0078125, "learning_rate": 0.0001095, "loss": 7.4379, "mean_token_accuracy": 0.07021872885525227, "num_tokens": 423806.0, "step": 220 }, { "entropy": 8.100719451904297, "epoch": 0.013074933898945288, "grad_norm": 1.1015625, "learning_rate": 0.000112, "loss": 7.4049, "mean_token_accuracy": 0.07006631046533585, "num_tokens": 433416.0, "step": 225 }, { "entropy": 8.068440341949463, "epoch": 0.013365487985588518, "grad_norm": 1.015625, "learning_rate": 0.0001145, "loss": 7.4086, "mean_token_accuracy": 0.0656484205275774, "num_tokens": 443237.0, "step": 230 }, { "entropy": 8.008077144622803, "epoch": 0.013656042072231747, "grad_norm": 1.0078125, "learning_rate": 0.00011700000000000001, "loss": 7.3811, "mean_token_accuracy": 0.07138268202543259, "num_tokens": 452334.0, "step": 235 }, { "entropy": 7.95733003616333, "epoch": 0.013946596158874974, "grad_norm": 1.0390625, "learning_rate": 0.00011949999999999999, "loss": 7.451, "mean_token_accuracy": 0.07069577798247337, "num_tokens": 462604.0, "step": 240 }, { "entropy": 7.985943031311035, "epoch": 0.014237150245518203, "grad_norm": 1.109375, "learning_rate": 0.000122, "loss": 7.3342, "mean_token_accuracy": 0.07418472990393639, "num_tokens": 472105.0, "step": 245 }, { "entropy": 7.985353708267212, "epoch": 0.014527704332161433, "grad_norm": 0.89453125, "learning_rate": 0.0001245, "loss": 7.3573, "mean_token_accuracy": 0.07192124761641025, "num_tokens": 481873.0, "step": 250 }, { "entropy": 7.852858924865723, "epoch": 0.01481825841880466, "grad_norm": 0.9140625, "learning_rate": 0.000127, "loss": 7.3134, "mean_token_accuracy": 0.07094009146094322, "num_tokens": 490776.0, "step": 255 }, { "entropy": 7.97090711593628, "epoch": 0.01510881250544789, "grad_norm": 0.97265625, "learning_rate": 0.0001295, "loss": 7.3459, "mean_token_accuracy": 0.06945950090885163, "num_tokens": 500237.0, "step": 260 }, { "entropy": 7.988322401046753, "epoch": 0.015399366592091119, "grad_norm": 1.0078125, "learning_rate": 0.000132, "loss": 7.3569, "mean_token_accuracy": 0.0719369538128376, "num_tokens": 509449.0, "step": 265 }, { "entropy": 7.863973140716553, "epoch": 0.015689920678734346, "grad_norm": 0.9296875, "learning_rate": 0.00013450000000000002, "loss": 7.3463, "mean_token_accuracy": 0.07629362866282463, "num_tokens": 519335.0, "step": 270 }, { "entropy": 7.850080347061157, "epoch": 0.015980474765377575, "grad_norm": 0.828125, "learning_rate": 0.00013700000000000002, "loss": 7.269, "mean_token_accuracy": 0.07348301075398922, "num_tokens": 529108.0, "step": 275 }, { "entropy": 7.803001642227173, "epoch": 0.016271028852020804, "grad_norm": 0.9609375, "learning_rate": 0.0001395, "loss": 7.3421, "mean_token_accuracy": 0.07442944496870041, "num_tokens": 539409.0, "step": 280 }, { "entropy": 7.8401947021484375, "epoch": 0.016561582938664034, "grad_norm": 0.98828125, "learning_rate": 0.00014199999999999998, "loss": 7.269, "mean_token_accuracy": 0.07476447969675064, "num_tokens": 549790.0, "step": 285 }, { "entropy": 7.773062610626221, "epoch": 0.01685213702530726, "grad_norm": 0.90234375, "learning_rate": 0.0001445, "loss": 7.2597, "mean_token_accuracy": 0.07743276208639145, "num_tokens": 559343.0, "step": 290 }, { "entropy": 7.833370351791382, "epoch": 0.01714269111195049, "grad_norm": 1.1171875, "learning_rate": 0.000147, "loss": 7.306, "mean_token_accuracy": 0.07507650516927242, "num_tokens": 568806.0, "step": 295 }, { "entropy": 7.692620134353637, "epoch": 0.017433245198593718, "grad_norm": 1.0703125, "learning_rate": 0.0001495, "loss": 7.1532, "mean_token_accuracy": 0.07671754881739616, "num_tokens": 578988.0, "step": 300 }, { "entropy": 7.840510559082031, "epoch": 0.017723799285236947, "grad_norm": 1.015625, "learning_rate": 0.000152, "loss": 7.258, "mean_token_accuracy": 0.0767325557768345, "num_tokens": 588588.0, "step": 305 }, { "entropy": 7.740892934799194, "epoch": 0.018014353371880176, "grad_norm": 0.9453125, "learning_rate": 0.00015450000000000001, "loss": 7.2385, "mean_token_accuracy": 0.07767370343208313, "num_tokens": 597957.0, "step": 310 }, { "entropy": 7.761815309524536, "epoch": 0.018304907458523405, "grad_norm": 0.8671875, "learning_rate": 0.000157, "loss": 7.2168, "mean_token_accuracy": 0.07732245922088624, "num_tokens": 607446.0, "step": 315 }, { "entropy": 7.723113679885865, "epoch": 0.01859546154516663, "grad_norm": 0.8359375, "learning_rate": 0.0001595, "loss": 7.1559, "mean_token_accuracy": 0.07753840312361718, "num_tokens": 617064.0, "step": 320 }, { "entropy": 7.695508337020874, "epoch": 0.01888601563180986, "grad_norm": 1.03125, "learning_rate": 0.000162, "loss": 7.2008, "mean_token_accuracy": 0.08057244047522545, "num_tokens": 625927.0, "step": 325 }, { "entropy": 7.717827177047729, "epoch": 0.01917656971845309, "grad_norm": 0.97265625, "learning_rate": 0.00016450000000000001, "loss": 7.1152, "mean_token_accuracy": 0.07994545996189117, "num_tokens": 635341.0, "step": 330 }, { "entropy": 7.675025224685669, "epoch": 0.01946712380509632, "grad_norm": 2.171875, "learning_rate": 0.00016700000000000002, "loss": 7.1106, "mean_token_accuracy": 0.08988085016608238, "num_tokens": 645095.0, "step": 335 }, { "entropy": 7.714554166793823, "epoch": 0.019757677891739548, "grad_norm": 1.0234375, "learning_rate": 0.00016950000000000003, "loss": 7.1558, "mean_token_accuracy": 0.0730321068316698, "num_tokens": 654754.0, "step": 340 }, { "entropy": 7.60111026763916, "epoch": 0.020048231978382777, "grad_norm": 0.8671875, "learning_rate": 0.00017199999999999998, "loss": 7.1266, "mean_token_accuracy": 0.07690966166555882, "num_tokens": 664589.0, "step": 345 }, { "entropy": 7.6628223896026615, "epoch": 0.020338786065026003, "grad_norm": 1.0703125, "learning_rate": 0.00017449999999999999, "loss": 7.1425, "mean_token_accuracy": 0.07918459475040436, "num_tokens": 673870.0, "step": 350 }, { "entropy": 7.577814197540283, "epoch": 0.020629340151669232, "grad_norm": 1.03125, "learning_rate": 0.000177, "loss": 7.1137, "mean_token_accuracy": 0.07997918874025345, "num_tokens": 684309.0, "step": 355 }, { "entropy": 7.6769345760345455, "epoch": 0.02091989423831246, "grad_norm": 1.359375, "learning_rate": 0.0001795, "loss": 7.1629, "mean_token_accuracy": 0.07469077445566655, "num_tokens": 693702.0, "step": 360 }, { "entropy": 7.534895896911621, "epoch": 0.02121044832495569, "grad_norm": 0.94140625, "learning_rate": 0.000182, "loss": 7.0599, "mean_token_accuracy": 0.07970957532525062, "num_tokens": 702951.0, "step": 365 }, { "entropy": 7.588031339645386, "epoch": 0.02150100241159892, "grad_norm": 1.25, "learning_rate": 0.0001845, "loss": 7.0677, "mean_token_accuracy": 0.08218754455447197, "num_tokens": 712481.0, "step": 370 }, { "entropy": 7.600922870635986, "epoch": 0.02179155649824215, "grad_norm": 1.046875, "learning_rate": 0.000187, "loss": 7.0683, "mean_token_accuracy": 0.08380770459771156, "num_tokens": 721579.0, "step": 375 }, { "entropy": 7.572713327407837, "epoch": 0.022082110584885375, "grad_norm": 1.03125, "learning_rate": 0.0001895, "loss": 7.0774, "mean_token_accuracy": 0.07982454895973205, "num_tokens": 731404.0, "step": 380 }, { "entropy": 7.548839807510376, "epoch": 0.022372664671528604, "grad_norm": 0.93359375, "learning_rate": 0.000192, "loss": 7.0556, "mean_token_accuracy": 0.07496214136481286, "num_tokens": 740751.0, "step": 385 }, { "entropy": 7.523876476287842, "epoch": 0.022663218758171833, "grad_norm": 0.9765625, "learning_rate": 0.0001945, "loss": 7.0247, "mean_token_accuracy": 0.08082472011446953, "num_tokens": 751171.0, "step": 390 }, { "entropy": 7.552808237075806, "epoch": 0.022953772844815062, "grad_norm": 1.078125, "learning_rate": 0.00019700000000000002, "loss": 7.0823, "mean_token_accuracy": 0.07615064568817616, "num_tokens": 760874.0, "step": 395 }, { "entropy": 7.583486127853393, "epoch": 0.02324432693145829, "grad_norm": 1.2734375, "learning_rate": 0.00019950000000000002, "loss": 7.0585, "mean_token_accuracy": 0.08326990716159344, "num_tokens": 769652.0, "step": 400 }, { "entropy": 7.488273334503174, "epoch": 0.02353488101810152, "grad_norm": 0.98828125, "learning_rate": 0.000202, "loss": 7.0421, "mean_token_accuracy": 0.07620194889605045, "num_tokens": 779591.0, "step": 405 }, { "entropy": 7.564187002182007, "epoch": 0.023825435104744747, "grad_norm": 0.94921875, "learning_rate": 0.00020449999999999998, "loss": 7.156, "mean_token_accuracy": 0.08098742663860321, "num_tokens": 789582.0, "step": 410 }, { "entropy": 7.506245565414429, "epoch": 0.024115989191387976, "grad_norm": 1.1015625, "learning_rate": 0.000207, "loss": 7.0546, "mean_token_accuracy": 0.07765479311347008, "num_tokens": 799146.0, "step": 415 }, { "entropy": 7.4926127910614015, "epoch": 0.024406543278031205, "grad_norm": 1.0390625, "learning_rate": 0.0002095, "loss": 7.0246, "mean_token_accuracy": 0.0782523088157177, "num_tokens": 808934.0, "step": 420 }, { "entropy": 7.532363748550415, "epoch": 0.024697097364674434, "grad_norm": 0.87109375, "learning_rate": 0.000212, "loss": 7.0821, "mean_token_accuracy": 0.07597277015447616, "num_tokens": 819280.0, "step": 425 }, { "entropy": 7.457432746887207, "epoch": 0.024987651451317663, "grad_norm": 0.97265625, "learning_rate": 0.0002145, "loss": 6.9892, "mean_token_accuracy": 0.0840725652873516, "num_tokens": 828818.0, "step": 430 }, { "entropy": 7.463752698898316, "epoch": 0.025278205537960893, "grad_norm": 1.0703125, "learning_rate": 0.00021700000000000002, "loss": 6.9816, "mean_token_accuracy": 0.08661384396255016, "num_tokens": 839175.0, "step": 435 }, { "entropy": 7.5449175357818605, "epoch": 0.02556875962460412, "grad_norm": 1.125, "learning_rate": 0.0002195, "loss": 7.0777, "mean_token_accuracy": 0.07947314418852329, "num_tokens": 849965.0, "step": 440 }, { "entropy": 7.392349624633789, "epoch": 0.025859313711247348, "grad_norm": 1.0859375, "learning_rate": 0.000222, "loss": 6.9968, "mean_token_accuracy": 0.08229465186595916, "num_tokens": 859229.0, "step": 445 }, { "entropy": 7.4397971630096436, "epoch": 0.026149867797890577, "grad_norm": 1.28125, "learning_rate": 0.0002245, "loss": 6.9708, "mean_token_accuracy": 0.0816520519554615, "num_tokens": 869199.0, "step": 450 }, { "entropy": 7.399962043762207, "epoch": 0.026440421884533806, "grad_norm": 0.93359375, "learning_rate": 0.00022700000000000002, "loss": 6.9666, "mean_token_accuracy": 0.09285714998841285, "num_tokens": 879470.0, "step": 455 }, { "entropy": 7.4366514682769775, "epoch": 0.026730975971177035, "grad_norm": 0.921875, "learning_rate": 0.00022950000000000002, "loss": 6.9274, "mean_token_accuracy": 0.0792453158646822, "num_tokens": 888397.0, "step": 460 }, { "entropy": 7.370485734939575, "epoch": 0.027021530057820264, "grad_norm": 0.953125, "learning_rate": 0.00023200000000000003, "loss": 6.8202, "mean_token_accuracy": 0.0855403620749712, "num_tokens": 898321.0, "step": 465 }, { "entropy": 7.4845947265625, "epoch": 0.027312084144463494, "grad_norm": 1.0546875, "learning_rate": 0.00023449999999999998, "loss": 7.0856, "mean_token_accuracy": 0.0808610200881958, "num_tokens": 907947.0, "step": 470 }, { "entropy": 7.327203702926636, "epoch": 0.02760263823110672, "grad_norm": 1.1015625, "learning_rate": 0.000237, "loss": 6.9103, "mean_token_accuracy": 0.0954340323805809, "num_tokens": 916842.0, "step": 475 }, { "entropy": 7.380954456329346, "epoch": 0.02789319231774995, "grad_norm": 1.046875, "learning_rate": 0.0002395, "loss": 7.0098, "mean_token_accuracy": 0.08165798112750053, "num_tokens": 926431.0, "step": 480 }, { "entropy": 7.412681722640992, "epoch": 0.028183746404393178, "grad_norm": 0.98828125, "learning_rate": 0.000242, "loss": 6.9162, "mean_token_accuracy": 0.08133741281926632, "num_tokens": 935819.0, "step": 485 }, { "entropy": 7.44426212310791, "epoch": 0.028474300491036407, "grad_norm": 1.15625, "learning_rate": 0.0002445, "loss": 6.9418, "mean_token_accuracy": 0.08402741849422454, "num_tokens": 944198.0, "step": 490 }, { "entropy": 7.264917373657227, "epoch": 0.028764854577679636, "grad_norm": 0.88671875, "learning_rate": 0.000247, "loss": 6.9628, "mean_token_accuracy": 0.08352083042263984, "num_tokens": 954972.0, "step": 495 }, { "entropy": 7.385922384262085, "epoch": 0.029055408664322865, "grad_norm": 1.0703125, "learning_rate": 0.0002495, "loss": 6.9018, "mean_token_accuracy": 0.08520250022411346, "num_tokens": 964532.0, "step": 500 }, { "entropy": 7.475071048736572, "epoch": 0.02934596275096609, "grad_norm": 1.1171875, "learning_rate": 0.000252, "loss": 6.9955, "mean_token_accuracy": 0.07958225682377815, "num_tokens": 974547.0, "step": 505 }, { "entropy": 7.299204540252686, "epoch": 0.02963651683760932, "grad_norm": 0.98046875, "learning_rate": 0.0002545, "loss": 6.935, "mean_token_accuracy": 0.08022963926196099, "num_tokens": 984245.0, "step": 510 }, { "entropy": 7.318370199203491, "epoch": 0.02992707092425255, "grad_norm": 0.94140625, "learning_rate": 0.000257, "loss": 6.7766, "mean_token_accuracy": 0.08500204458832741, "num_tokens": 994400.0, "step": 515 }, { "entropy": 7.352757215499878, "epoch": 0.03021762501089578, "grad_norm": 1.2109375, "learning_rate": 0.0002595, "loss": 7.0024, "mean_token_accuracy": 0.07765024341642857, "num_tokens": 1005775.0, "step": 520 }, { "entropy": 7.312537145614624, "epoch": 0.030508179097539008, "grad_norm": 1.0859375, "learning_rate": 0.000262, "loss": 6.9055, "mean_token_accuracy": 0.08693855553865433, "num_tokens": 1015386.0, "step": 525 }, { "entropy": 7.383286190032959, "epoch": 0.030798733184182237, "grad_norm": 1.0859375, "learning_rate": 0.00026450000000000003, "loss": 6.8994, "mean_token_accuracy": 0.09188547134399414, "num_tokens": 1024963.0, "step": 530 }, { "entropy": 7.249363946914673, "epoch": 0.031089287270825463, "grad_norm": 0.9921875, "learning_rate": 0.00026700000000000004, "loss": 6.8996, "mean_token_accuracy": 0.08531768508255481, "num_tokens": 1034667.0, "step": 535 }, { "entropy": 7.265355777740479, "epoch": 0.03137984135746869, "grad_norm": 0.9921875, "learning_rate": 0.00026950000000000005, "loss": 6.8796, "mean_token_accuracy": 0.08795020580291749, "num_tokens": 1044171.0, "step": 540 }, { "entropy": 7.295146417617798, "epoch": 0.031670395444111925, "grad_norm": 1.1171875, "learning_rate": 0.00027200000000000005, "loss": 6.8538, "mean_token_accuracy": 0.08691519349813462, "num_tokens": 1053585.0, "step": 545 }, { "entropy": 7.237406063079834, "epoch": 0.03196094953075515, "grad_norm": 1.15625, "learning_rate": 0.0002745, "loss": 6.7515, "mean_token_accuracy": 0.09050033241510391, "num_tokens": 1063310.0, "step": 550 }, { "entropy": 7.263738679885864, "epoch": 0.032251503617398376, "grad_norm": 0.953125, "learning_rate": 0.000277, "loss": 6.8651, "mean_token_accuracy": 0.08824861124157905, "num_tokens": 1073529.0, "step": 555 }, { "entropy": 7.175330972671508, "epoch": 0.03254205770404161, "grad_norm": 1.109375, "learning_rate": 0.0002795, "loss": 6.8319, "mean_token_accuracy": 0.08951647505164147, "num_tokens": 1083432.0, "step": 560 }, { "entropy": 7.184946346282959, "epoch": 0.032832611790684835, "grad_norm": 0.953125, "learning_rate": 0.00028199999999999997, "loss": 6.8004, "mean_token_accuracy": 0.09656240493059158, "num_tokens": 1092453.0, "step": 565 }, { "entropy": 7.274725437164307, "epoch": 0.03312316587732807, "grad_norm": 1.03125, "learning_rate": 0.0002845, "loss": 6.8865, "mean_token_accuracy": 0.08661114051938057, "num_tokens": 1102402.0, "step": 570 }, { "entropy": 7.303795433044433, "epoch": 0.03341371996397129, "grad_norm": 1.1171875, "learning_rate": 0.000287, "loss": 6.8928, "mean_token_accuracy": 0.09610759019851685, "num_tokens": 1111907.0, "step": 575 }, { "entropy": 7.228280067443848, "epoch": 0.03370427405061452, "grad_norm": 1.125, "learning_rate": 0.0002895, "loss": 6.7846, "mean_token_accuracy": 0.09133462607860565, "num_tokens": 1120712.0, "step": 580 }, { "entropy": 7.0720751762390135, "epoch": 0.03399482813725775, "grad_norm": 1.0703125, "learning_rate": 0.000292, "loss": 6.6691, "mean_token_accuracy": 0.0894063800573349, "num_tokens": 1131165.0, "step": 585 }, { "entropy": 7.229758644104004, "epoch": 0.03428538222390098, "grad_norm": 1.0625, "learning_rate": 0.0002945, "loss": 6.8337, "mean_token_accuracy": 0.08700250834226608, "num_tokens": 1140527.0, "step": 590 }, { "entropy": 7.137591791152954, "epoch": 0.03457593631054421, "grad_norm": 1.140625, "learning_rate": 0.000297, "loss": 6.792, "mean_token_accuracy": 0.08842456936836243, "num_tokens": 1149977.0, "step": 595 }, { "entropy": 7.240325021743774, "epoch": 0.034866490397187436, "grad_norm": 1.1328125, "learning_rate": 0.0002995, "loss": 6.8153, "mean_token_accuracy": 0.08972005397081376, "num_tokens": 1159918.0, "step": 600 }, { "entropy": 7.116828918457031, "epoch": 0.03515704448383067, "grad_norm": 0.96484375, "learning_rate": 0.000302, "loss": 6.7965, "mean_token_accuracy": 0.08587550893425941, "num_tokens": 1169218.0, "step": 605 }, { "entropy": 7.1641600131988525, "epoch": 0.035447598570473894, "grad_norm": 1.1953125, "learning_rate": 0.0003045, "loss": 6.8058, "mean_token_accuracy": 0.09056585654616356, "num_tokens": 1179429.0, "step": 610 }, { "entropy": 7.0538177490234375, "epoch": 0.03573815265711712, "grad_norm": 0.953125, "learning_rate": 0.000307, "loss": 6.7051, "mean_token_accuracy": 0.0951805867254734, "num_tokens": 1189379.0, "step": 615 }, { "entropy": 7.165834856033325, "epoch": 0.03602870674376035, "grad_norm": 1.1328125, "learning_rate": 0.0003095, "loss": 6.6834, "mean_token_accuracy": 0.09452618882060052, "num_tokens": 1198643.0, "step": 620 }, { "entropy": 7.1435986995697025, "epoch": 0.03631926083040358, "grad_norm": 1.203125, "learning_rate": 0.000312, "loss": 6.8985, "mean_token_accuracy": 0.08901753202080727, "num_tokens": 1207933.0, "step": 625 }, { "entropy": 7.125590705871582, "epoch": 0.03660981491704681, "grad_norm": 1.1640625, "learning_rate": 0.0003145, "loss": 6.7771, "mean_token_accuracy": 0.09473630785942078, "num_tokens": 1217000.0, "step": 630 }, { "entropy": 7.342123746871948, "epoch": 0.03690036900369004, "grad_norm": 1.1796875, "learning_rate": 0.000317, "loss": 6.8715, "mean_token_accuracy": 0.08738602064549923, "num_tokens": 1227054.0, "step": 635 }, { "entropy": 7.0751423835754395, "epoch": 0.03719092309033326, "grad_norm": 1.0625, "learning_rate": 0.0003195, "loss": 6.8639, "mean_token_accuracy": 0.08903967961668968, "num_tokens": 1237126.0, "step": 640 }, { "entropy": 7.132748985290528, "epoch": 0.037481477176976495, "grad_norm": 1.140625, "learning_rate": 0.000322, "loss": 6.7309, "mean_token_accuracy": 0.09907565861940384, "num_tokens": 1247404.0, "step": 645 }, { "entropy": 7.105540752410889, "epoch": 0.03777203126361972, "grad_norm": 0.97265625, "learning_rate": 0.00032450000000000003, "loss": 6.6672, "mean_token_accuracy": 0.08641588017344475, "num_tokens": 1257130.0, "step": 650 }, { "entropy": 7.073269605636597, "epoch": 0.038062585350262954, "grad_norm": 1.0234375, "learning_rate": 0.00032700000000000003, "loss": 6.7423, "mean_token_accuracy": 0.09811322540044784, "num_tokens": 1266931.0, "step": 655 }, { "entropy": 7.157707405090332, "epoch": 0.03835313943690618, "grad_norm": 0.8828125, "learning_rate": 0.00032950000000000004, "loss": 6.7753, "mean_token_accuracy": 0.08842945359647274, "num_tokens": 1277770.0, "step": 660 }, { "entropy": 7.074891519546509, "epoch": 0.03864369352354941, "grad_norm": 1.1953125, "learning_rate": 0.00033200000000000005, "loss": 6.6966, "mean_token_accuracy": 0.09733218997716904, "num_tokens": 1287188.0, "step": 665 }, { "entropy": 7.035866546630859, "epoch": 0.03893424761019264, "grad_norm": 1.046875, "learning_rate": 0.00033450000000000005, "loss": 6.7408, "mean_token_accuracy": 0.09134816229343415, "num_tokens": 1297038.0, "step": 670 }, { "entropy": 7.091120624542237, "epoch": 0.03922480169683586, "grad_norm": 0.984375, "learning_rate": 0.000337, "loss": 6.6964, "mean_token_accuracy": 0.09473009631037713, "num_tokens": 1306860.0, "step": 675 }, { "entropy": 7.030598735809326, "epoch": 0.039515355783479096, "grad_norm": 0.94140625, "learning_rate": 0.0003395, "loss": 6.6668, "mean_token_accuracy": 0.09435953348875045, "num_tokens": 1316585.0, "step": 680 }, { "entropy": 7.1326805591583256, "epoch": 0.03980590987012232, "grad_norm": 1.0546875, "learning_rate": 0.000342, "loss": 6.7282, "mean_token_accuracy": 0.09551571607589722, "num_tokens": 1325601.0, "step": 685 }, { "entropy": 7.101321458816528, "epoch": 0.040096463956765555, "grad_norm": 1.0625, "learning_rate": 0.00034449999999999997, "loss": 6.7604, "mean_token_accuracy": 0.09247554913163185, "num_tokens": 1336305.0, "step": 690 }, { "entropy": 7.1049731254577635, "epoch": 0.04038701804340878, "grad_norm": 1.140625, "learning_rate": 0.000347, "loss": 6.6507, "mean_token_accuracy": 0.09341847449541092, "num_tokens": 1344820.0, "step": 695 }, { "entropy": 6.997063255310058, "epoch": 0.040677572130052006, "grad_norm": 1.125, "learning_rate": 0.0003495, "loss": 6.6331, "mean_token_accuracy": 0.09355669766664505, "num_tokens": 1353950.0, "step": 700 }, { "entropy": 7.01454758644104, "epoch": 0.04096812621669524, "grad_norm": 1.0078125, "learning_rate": 0.000352, "loss": 6.7545, "mean_token_accuracy": 0.09254956245422363, "num_tokens": 1364881.0, "step": 705 }, { "entropy": 7.0095212936401365, "epoch": 0.041258680303338464, "grad_norm": 1.078125, "learning_rate": 0.0003545, "loss": 6.7061, "mean_token_accuracy": 0.09260506108403206, "num_tokens": 1374018.0, "step": 710 }, { "entropy": 7.11537013053894, "epoch": 0.0415492343899817, "grad_norm": 1.0625, "learning_rate": 0.000357, "loss": 6.6946, "mean_token_accuracy": 0.08821133449673653, "num_tokens": 1384319.0, "step": 715 }, { "entropy": 6.958690166473389, "epoch": 0.04183978847662492, "grad_norm": 1.0546875, "learning_rate": 0.0003595, "loss": 6.5713, "mean_token_accuracy": 0.09440450817346573, "num_tokens": 1393753.0, "step": 720 }, { "entropy": 6.922836446762085, "epoch": 0.042130342563268156, "grad_norm": 1.046875, "learning_rate": 0.000362, "loss": 6.6616, "mean_token_accuracy": 0.09427325800061226, "num_tokens": 1403599.0, "step": 725 }, { "entropy": 7.020907402038574, "epoch": 0.04242089664991138, "grad_norm": 1.1484375, "learning_rate": 0.0003645, "loss": 6.6611, "mean_token_accuracy": 0.10043973848223686, "num_tokens": 1412508.0, "step": 730 }, { "entropy": 7.071925306320191, "epoch": 0.04271145073655461, "grad_norm": 1.125, "learning_rate": 0.000367, "loss": 6.8015, "mean_token_accuracy": 0.0910523734986782, "num_tokens": 1422776.0, "step": 735 }, { "entropy": 6.998428392410278, "epoch": 0.04300200482319784, "grad_norm": 1.140625, "learning_rate": 0.0003695, "loss": 6.6414, "mean_token_accuracy": 0.09633751660585403, "num_tokens": 1432901.0, "step": 740 }, { "entropy": 7.035877513885498, "epoch": 0.043292558909841065, "grad_norm": 1.0625, "learning_rate": 0.000372, "loss": 6.677, "mean_token_accuracy": 0.09542910531163215, "num_tokens": 1442916.0, "step": 745 }, { "entropy": 6.878139925003052, "epoch": 0.0435831129964843, "grad_norm": 1.0078125, "learning_rate": 0.0003745, "loss": 6.5395, "mean_token_accuracy": 0.09616116657853127, "num_tokens": 1453037.0, "step": 750 }, { "entropy": 6.96289029121399, "epoch": 0.043873667083127524, "grad_norm": 1.0703125, "learning_rate": 0.000377, "loss": 6.6196, "mean_token_accuracy": 0.10786209627985954, "num_tokens": 1461963.0, "step": 755 }, { "entropy": 7.00122447013855, "epoch": 0.04416422116977075, "grad_norm": 1.15625, "learning_rate": 0.0003795, "loss": 6.7012, "mean_token_accuracy": 0.09169812574982643, "num_tokens": 1471521.0, "step": 760 }, { "entropy": 6.930304098129272, "epoch": 0.04445477525641398, "grad_norm": 1.21875, "learning_rate": 0.000382, "loss": 6.5366, "mean_token_accuracy": 0.0987947553396225, "num_tokens": 1481438.0, "step": 765 }, { "entropy": 6.89730920791626, "epoch": 0.04474532934305721, "grad_norm": 1.1171875, "learning_rate": 0.0003845, "loss": 6.5654, "mean_token_accuracy": 0.09912522435188294, "num_tokens": 1490522.0, "step": 770 }, { "entropy": 6.994078540802002, "epoch": 0.04503588342970044, "grad_norm": 0.96875, "learning_rate": 0.00038700000000000003, "loss": 6.7343, "mean_token_accuracy": 0.09250347167253495, "num_tokens": 1501034.0, "step": 775 }, { "entropy": 6.894172525405883, "epoch": 0.045326437516343666, "grad_norm": 1.1796875, "learning_rate": 0.00038950000000000003, "loss": 6.5391, "mean_token_accuracy": 0.10528326034545898, "num_tokens": 1510390.0, "step": 780 }, { "entropy": 6.992980337142944, "epoch": 0.0456169916029869, "grad_norm": 1.21875, "learning_rate": 0.00039200000000000004, "loss": 6.6468, "mean_token_accuracy": 0.09232402816414834, "num_tokens": 1520048.0, "step": 785 }, { "entropy": 6.977211618423462, "epoch": 0.045907545689630125, "grad_norm": 1.2578125, "learning_rate": 0.00039450000000000005, "loss": 6.5275, "mean_token_accuracy": 0.10221462920308114, "num_tokens": 1529113.0, "step": 790 }, { "entropy": 6.760094785690308, "epoch": 0.04619809977627335, "grad_norm": 1.09375, "learning_rate": 0.00039700000000000005, "loss": 6.6057, "mean_token_accuracy": 0.09887640923261642, "num_tokens": 1538573.0, "step": 795 }, { "entropy": 6.975562715530396, "epoch": 0.04648865386291658, "grad_norm": 1.1640625, "learning_rate": 0.0003995, "loss": 6.6064, "mean_token_accuracy": 0.10373581051826478, "num_tokens": 1547471.0, "step": 800 }, { "entropy": 6.8805656909942625, "epoch": 0.04677920794955981, "grad_norm": 1.015625, "learning_rate": 0.000402, "loss": 6.5641, "mean_token_accuracy": 0.10285315811634063, "num_tokens": 1557259.0, "step": 805 }, { "entropy": 7.063277673721314, "epoch": 0.04706976203620304, "grad_norm": 0.98046875, "learning_rate": 0.0004045, "loss": 6.7921, "mean_token_accuracy": 0.09200607016682624, "num_tokens": 1567383.0, "step": 810 }, { "entropy": 6.87684121131897, "epoch": 0.04736031612284627, "grad_norm": 1.078125, "learning_rate": 0.00040699999999999997, "loss": 6.4826, "mean_token_accuracy": 0.11064840331673623, "num_tokens": 1577106.0, "step": 815 }, { "entropy": 6.807673025131225, "epoch": 0.04765087020948949, "grad_norm": 1.0703125, "learning_rate": 0.0004095, "loss": 6.5393, "mean_token_accuracy": 0.10080247670412064, "num_tokens": 1586100.0, "step": 820 }, { "entropy": 6.877712535858154, "epoch": 0.047941424296132726, "grad_norm": 1.0859375, "learning_rate": 0.000412, "loss": 6.6279, "mean_token_accuracy": 0.09564873427152634, "num_tokens": 1596950.0, "step": 825 }, { "entropy": 6.891899585723877, "epoch": 0.04823197838277595, "grad_norm": 1.0859375, "learning_rate": 0.0004145, "loss": 6.5837, "mean_token_accuracy": 0.09832958057522774, "num_tokens": 1606001.0, "step": 830 }, { "entropy": 6.978082180023193, "epoch": 0.048522532469419184, "grad_norm": 1.0390625, "learning_rate": 0.000417, "loss": 6.6825, "mean_token_accuracy": 0.0975476372987032, "num_tokens": 1616498.0, "step": 835 }, { "entropy": 6.831979036331177, "epoch": 0.04881308655606241, "grad_norm": 1.2109375, "learning_rate": 0.0004195, "loss": 6.5199, "mean_token_accuracy": 0.10347988307476044, "num_tokens": 1625195.0, "step": 840 }, { "entropy": 6.784482002258301, "epoch": 0.04910364064270564, "grad_norm": 1.0, "learning_rate": 0.000422, "loss": 6.4476, "mean_token_accuracy": 0.10162880271673203, "num_tokens": 1635176.0, "step": 845 }, { "entropy": 6.806185960769653, "epoch": 0.04939419472934887, "grad_norm": 1.09375, "learning_rate": 0.0004245, "loss": 6.553, "mean_token_accuracy": 0.1015662670135498, "num_tokens": 1645183.0, "step": 850 }, { "entropy": 6.801709985733032, "epoch": 0.049684748815992094, "grad_norm": 1.046875, "learning_rate": 0.000427, "loss": 6.5479, "mean_token_accuracy": 0.10148834735155106, "num_tokens": 1654226.0, "step": 855 }, { "entropy": 6.834500074386597, "epoch": 0.04997530290263533, "grad_norm": 1.0078125, "learning_rate": 0.0004295, "loss": 6.5426, "mean_token_accuracy": 0.10362305790185929, "num_tokens": 1664572.0, "step": 860 }, { "entropy": 6.950858306884766, "epoch": 0.05026585698927855, "grad_norm": 1.0234375, "learning_rate": 0.000432, "loss": 6.6472, "mean_token_accuracy": 0.09981537386775016, "num_tokens": 1674070.0, "step": 865 }, { "entropy": 6.791647720336914, "epoch": 0.050556411075921785, "grad_norm": 1.1171875, "learning_rate": 0.0004345, "loss": 6.4773, "mean_token_accuracy": 0.09943379536271095, "num_tokens": 1683473.0, "step": 870 }, { "entropy": 6.777591514587402, "epoch": 0.05084696516256501, "grad_norm": 1.1328125, "learning_rate": 0.000437, "loss": 6.4869, "mean_token_accuracy": 0.10118941962718964, "num_tokens": 1693171.0, "step": 875 }, { "entropy": 6.898639726638794, "epoch": 0.05113751924920824, "grad_norm": 0.953125, "learning_rate": 0.0004395, "loss": 6.606, "mean_token_accuracy": 0.09705074802041054, "num_tokens": 1703023.0, "step": 880 }, { "entropy": 6.73418025970459, "epoch": 0.05142807333585147, "grad_norm": 1.0703125, "learning_rate": 0.000442, "loss": 6.4984, "mean_token_accuracy": 0.1019330695271492, "num_tokens": 1712698.0, "step": 885 }, { "entropy": 6.906363248825073, "epoch": 0.051718627422494695, "grad_norm": 1.1171875, "learning_rate": 0.0004445, "loss": 6.6098, "mean_token_accuracy": 0.09838435426354408, "num_tokens": 1721502.0, "step": 890 }, { "entropy": 6.7474723815917965, "epoch": 0.05200918150913793, "grad_norm": 1.0546875, "learning_rate": 0.000447, "loss": 6.4942, "mean_token_accuracy": 0.10594057068228721, "num_tokens": 1730551.0, "step": 895 }, { "entropy": 6.808920383453369, "epoch": 0.052299735595781154, "grad_norm": 1.015625, "learning_rate": 0.00044950000000000003, "loss": 6.5645, "mean_token_accuracy": 0.10622440055012702, "num_tokens": 1739368.0, "step": 900 }, { "entropy": 6.827513933181763, "epoch": 0.052590289682424386, "grad_norm": 1.140625, "learning_rate": 0.00045200000000000004, "loss": 6.4176, "mean_token_accuracy": 0.11146403327584267, "num_tokens": 1748528.0, "step": 905 }, { "entropy": 6.713736248016358, "epoch": 0.05288084376906761, "grad_norm": 0.9765625, "learning_rate": 0.00045450000000000004, "loss": 6.5739, "mean_token_accuracy": 0.09899114519357681, "num_tokens": 1759569.0, "step": 910 }, { "entropy": 6.80773286819458, "epoch": 0.05317139785571084, "grad_norm": 1.15625, "learning_rate": 0.00045700000000000005, "loss": 6.5099, "mean_token_accuracy": 0.10788461863994599, "num_tokens": 1769366.0, "step": 915 }, { "entropy": 6.76817569732666, "epoch": 0.05346195194235407, "grad_norm": 1.046875, "learning_rate": 0.00045950000000000006, "loss": 6.6024, "mean_token_accuracy": 0.09936894476413727, "num_tokens": 1780155.0, "step": 920 }, { "entropy": 6.755830335617065, "epoch": 0.053752506028997296, "grad_norm": 1.1484375, "learning_rate": 0.000462, "loss": 6.4233, "mean_token_accuracy": 0.10512633025646209, "num_tokens": 1789436.0, "step": 925 }, { "entropy": 6.823408889770508, "epoch": 0.05404306011564053, "grad_norm": 1.1640625, "learning_rate": 0.0004645, "loss": 6.5652, "mean_token_accuracy": 0.0998048096895218, "num_tokens": 1798836.0, "step": 930 }, { "entropy": 6.751146364212036, "epoch": 0.054333614202283755, "grad_norm": 1.03125, "learning_rate": 0.000467, "loss": 6.444, "mean_token_accuracy": 0.10532717406749725, "num_tokens": 1808666.0, "step": 935 }, { "entropy": 6.8108867645263675, "epoch": 0.05462416828892699, "grad_norm": 1.1171875, "learning_rate": 0.0004695, "loss": 6.5972, "mean_token_accuracy": 0.09496863186359406, "num_tokens": 1820001.0, "step": 940 }, { "entropy": 6.751294231414795, "epoch": 0.05491472237557021, "grad_norm": 1.03125, "learning_rate": 0.000472, "loss": 6.4693, "mean_token_accuracy": 0.10566612035036087, "num_tokens": 1830284.0, "step": 945 }, { "entropy": 6.820448493957519, "epoch": 0.05520527646221344, "grad_norm": 1.1171875, "learning_rate": 0.0004745, "loss": 6.4794, "mean_token_accuracy": 0.10577797368168831, "num_tokens": 1839930.0, "step": 950 }, { "entropy": 6.629036235809326, "epoch": 0.05549583054885667, "grad_norm": 0.98046875, "learning_rate": 0.000477, "loss": 6.5675, "mean_token_accuracy": 0.10090194195508957, "num_tokens": 1850697.0, "step": 955 }, { "entropy": 6.817226839065552, "epoch": 0.0557863846354999, "grad_norm": 1.0234375, "learning_rate": 0.0004795, "loss": 6.497, "mean_token_accuracy": 0.10740380734205246, "num_tokens": 1860196.0, "step": 960 }, { "entropy": 6.774875259399414, "epoch": 0.05607693872214313, "grad_norm": 1.0546875, "learning_rate": 0.000482, "loss": 6.47, "mean_token_accuracy": 0.1075842596590519, "num_tokens": 1869000.0, "step": 965 }, { "entropy": 6.722468996047974, "epoch": 0.056367492808786356, "grad_norm": 1.1875, "learning_rate": 0.0004845, "loss": 6.469, "mean_token_accuracy": 0.10600791200995445, "num_tokens": 1878687.0, "step": 970 }, { "entropy": 6.728367662429809, "epoch": 0.05665804689542958, "grad_norm": 1.1640625, "learning_rate": 0.000487, "loss": 6.3467, "mean_token_accuracy": 0.10569515079259872, "num_tokens": 1886914.0, "step": 975 }, { "entropy": 6.671978425979614, "epoch": 0.056948600982072814, "grad_norm": 0.97265625, "learning_rate": 0.0004895, "loss": 6.5321, "mean_token_accuracy": 0.10422437414526939, "num_tokens": 1897392.0, "step": 980 }, { "entropy": 6.805356025695801, "epoch": 0.05723915506871604, "grad_norm": 1.109375, "learning_rate": 0.000492, "loss": 6.488, "mean_token_accuracy": 0.10600305423140526, "num_tokens": 1906215.0, "step": 985 }, { "entropy": 6.8313037872314455, "epoch": 0.05752970915535927, "grad_norm": 1.0859375, "learning_rate": 0.0004945, "loss": 6.5017, "mean_token_accuracy": 0.10730748698115349, "num_tokens": 1915376.0, "step": 990 }, { "entropy": 6.659111022949219, "epoch": 0.0578202632420025, "grad_norm": 1.03125, "learning_rate": 0.000497, "loss": 6.465, "mean_token_accuracy": 0.10440039038658142, "num_tokens": 1925558.0, "step": 995 }, { "entropy": 6.676358318328857, "epoch": 0.05811081732864573, "grad_norm": 1.0625, "learning_rate": 0.0004995, "loss": 6.4301, "mean_token_accuracy": 0.10430914014577866, "num_tokens": 1935176.0, "step": 1000 }, { "entropy": 6.770152616500854, "epoch": 0.05840137141528896, "grad_norm": 0.9921875, "learning_rate": 0.000499998026082006, "loss": 6.4924, "mean_token_accuracy": 0.10445862039923667, "num_tokens": 1945135.0, "step": 1005 }, { "entropy": 6.597527885437012, "epoch": 0.05869192550193218, "grad_norm": 1.1875, "learning_rate": 0.0004999900070995136, "loss": 6.4838, "mean_token_accuracy": 0.10765932872891426, "num_tokens": 1955585.0, "step": 1010 }, { "entropy": 6.867468976974488, "epoch": 0.058982479588575415, "grad_norm": 1.125, "learning_rate": 0.0004999758199023239, "loss": 6.4687, "mean_token_accuracy": 0.10314074084162712, "num_tokens": 1964750.0, "step": 1015 }, { "entropy": 6.624800300598144, "epoch": 0.05927303367521864, "grad_norm": 1.0, "learning_rate": 0.0004999554648793858, "loss": 6.5436, "mean_token_accuracy": 0.10335941463708878, "num_tokens": 1974697.0, "step": 1020 }, { "entropy": 6.7362236976623535, "epoch": 0.05956358776186187, "grad_norm": 1.09375, "learning_rate": 0.0004999289425887425, "loss": 6.4934, "mean_token_accuracy": 0.10554013177752494, "num_tokens": 1983384.0, "step": 1025 }, { "entropy": 6.754078722000122, "epoch": 0.0598541418485051, "grad_norm": 0.98828125, "learning_rate": 0.0004998962537575161, "loss": 6.5229, "mean_token_accuracy": 0.11017107889056206, "num_tokens": 1993790.0, "step": 1030 }, { "entropy": 6.697407197952271, "epoch": 0.060144695935148325, "grad_norm": 1.046875, "learning_rate": 0.0004998573992818874, "loss": 6.4027, "mean_token_accuracy": 0.10623413920402527, "num_tokens": 2003296.0, "step": 1035 }, { "entropy": 6.585323095321655, "epoch": 0.06043525002179156, "grad_norm": 1.0625, "learning_rate": 0.0004998123802270715, "loss": 6.3345, "mean_token_accuracy": 0.11027837991714477, "num_tokens": 2012481.0, "step": 1040 }, { "entropy": 6.705205965042114, "epoch": 0.06072580410843478, "grad_norm": 1.1796875, "learning_rate": 0.0004997611978272886, "loss": 6.4994, "mean_token_accuracy": 0.10490612536668778, "num_tokens": 2022382.0, "step": 1045 }, { "entropy": 6.638956928253174, "epoch": 0.061016358195078016, "grad_norm": 1.0546875, "learning_rate": 0.0004997038534857298, "loss": 6.4097, "mean_token_accuracy": 0.11042128577828407, "num_tokens": 2032290.0, "step": 1050 }, { "entropy": 6.6624797821044925, "epoch": 0.06130691228172124, "grad_norm": 0.984375, "learning_rate": 0.0004996403487745194, "loss": 6.3594, "mean_token_accuracy": 0.10972521901130676, "num_tokens": 2041094.0, "step": 1055 }, { "entropy": 6.609392881393433, "epoch": 0.061597466368364474, "grad_norm": 1.109375, "learning_rate": 0.000499570685434671, "loss": 6.5125, "mean_token_accuracy": 0.10544388592243195, "num_tokens": 2051169.0, "step": 1060 }, { "entropy": 6.6946526050567625, "epoch": 0.0618880204550077, "grad_norm": 1.03125, "learning_rate": 0.0004994948653760405, "loss": 6.3966, "mean_token_accuracy": 0.1103939101099968, "num_tokens": 2061310.0, "step": 1065 }, { "entropy": 6.619559907913208, "epoch": 0.062178574541650926, "grad_norm": 1.0390625, "learning_rate": 0.0004994128906772729, "loss": 6.3829, "mean_token_accuracy": 0.10736953839659691, "num_tokens": 2071537.0, "step": 1070 }, { "entropy": 6.6101906299591064, "epoch": 0.06246912862829416, "grad_norm": 0.9296875, "learning_rate": 0.000499324763585746, "loss": 6.4507, "mean_token_accuracy": 0.10780780464410782, "num_tokens": 2082540.0, "step": 1075 }, { "entropy": 6.621304225921631, "epoch": 0.06275968271493738, "grad_norm": 1.1328125, "learning_rate": 0.0004992304865175085, "loss": 6.4413, "mean_token_accuracy": 0.11023736447095871, "num_tokens": 2091313.0, "step": 1080 }, { "entropy": 6.691177225112915, "epoch": 0.06305023680158062, "grad_norm": 1.0234375, "learning_rate": 0.0004991300620572138, "loss": 6.4862, "mean_token_accuracy": 0.10716225057840348, "num_tokens": 2100826.0, "step": 1085 }, { "entropy": 6.671515083312988, "epoch": 0.06334079088822385, "grad_norm": 1.0625, "learning_rate": 0.0004990234929580494, "loss": 6.4177, "mean_token_accuracy": 0.10876795202493668, "num_tokens": 2109798.0, "step": 1090 }, { "entropy": 6.640522909164429, "epoch": 0.06363134497486707, "grad_norm": 0.9765625, "learning_rate": 0.0004989107821416609, "loss": 6.3138, "mean_token_accuracy": 0.11188038140535354, "num_tokens": 2119641.0, "step": 1095 }, { "entropy": 6.565330696105957, "epoch": 0.0639218990615103, "grad_norm": 1.140625, "learning_rate": 0.0004987919326980723, "loss": 6.3525, "mean_token_accuracy": 0.11164129376411439, "num_tokens": 2128724.0, "step": 1100 }, { "entropy": 6.521946573257447, "epoch": 0.06421245314815353, "grad_norm": 1.109375, "learning_rate": 0.0004986669478856011, "loss": 6.2737, "mean_token_accuracy": 0.11544388085603714, "num_tokens": 2137251.0, "step": 1105 }, { "entropy": 6.6156073093414305, "epoch": 0.06450300723479675, "grad_norm": 1.0, "learning_rate": 0.0004985358311307688, "loss": 6.3821, "mean_token_accuracy": 0.118138437718153, "num_tokens": 2146978.0, "step": 1110 }, { "entropy": 6.669202089309692, "epoch": 0.06479356132143999, "grad_norm": 0.98046875, "learning_rate": 0.0004983985860282081, "loss": 6.4636, "mean_token_accuracy": 0.10260412320494652, "num_tokens": 2157153.0, "step": 1115 }, { "entropy": 6.475356149673462, "epoch": 0.06508411540808322, "grad_norm": 0.9609375, "learning_rate": 0.0004982552163405623, "loss": 6.3599, "mean_token_accuracy": 0.11348235085606576, "num_tokens": 2166946.0, "step": 1120 }, { "entropy": 6.657857656478882, "epoch": 0.06537466949472644, "grad_norm": 1.0703125, "learning_rate": 0.0004981057259983839, "loss": 6.3772, "mean_token_accuracy": 0.11038358807563782, "num_tokens": 2177249.0, "step": 1125 }, { "entropy": 6.466132879257202, "epoch": 0.06566522358136967, "grad_norm": 0.99609375, "learning_rate": 0.0004979501191000262, "loss": 6.3098, "mean_token_accuracy": 0.11056527942419052, "num_tokens": 2187240.0, "step": 1130 }, { "entropy": 6.6453643321990965, "epoch": 0.0659557776680129, "grad_norm": 1.0625, "learning_rate": 0.0004977883999115311, "loss": 6.3145, "mean_token_accuracy": 0.11672020331025124, "num_tokens": 2196199.0, "step": 1135 }, { "entropy": 6.595391893386841, "epoch": 0.06624633175465613, "grad_norm": 1.0703125, "learning_rate": 0.0004976205728665113, "loss": 6.2689, "mean_token_accuracy": 0.11631305515766144, "num_tokens": 2205726.0, "step": 1140 }, { "entropy": 6.587292861938477, "epoch": 0.06653688584129935, "grad_norm": 0.9765625, "learning_rate": 0.0004974466425660307, "loss": 6.4457, "mean_token_accuracy": 0.10664665251970291, "num_tokens": 2216552.0, "step": 1145 }, { "entropy": 6.597306776046753, "epoch": 0.06682743992794259, "grad_norm": 0.953125, "learning_rate": 0.0004972666137784759, "loss": 6.3034, "mean_token_accuracy": 0.11342373788356781, "num_tokens": 2225935.0, "step": 1150 }, { "entropy": 6.644480466842651, "epoch": 0.06711799401458582, "grad_norm": 0.953125, "learning_rate": 0.0004970804914394271, "loss": 6.4604, "mean_token_accuracy": 0.11499964445829391, "num_tokens": 2235907.0, "step": 1155 }, { "entropy": 6.599408388137817, "epoch": 0.06740854810122904, "grad_norm": 1.1328125, "learning_rate": 0.0004968882806515225, "loss": 6.3881, "mean_token_accuracy": 0.10959212481975555, "num_tokens": 2244473.0, "step": 1160 }, { "entropy": 6.641416931152344, "epoch": 0.06769910218787227, "grad_norm": 1.1875, "learning_rate": 0.0004966899866843177, "loss": 6.4123, "mean_token_accuracy": 0.1027280792593956, "num_tokens": 2253834.0, "step": 1165 }, { "entropy": 6.5416028022766115, "epoch": 0.0679896562745155, "grad_norm": 1.015625, "learning_rate": 0.000496485614974142, "loss": 6.3413, "mean_token_accuracy": 0.11207354813814163, "num_tokens": 2263243.0, "step": 1170 }, { "entropy": 6.6198502540588375, "epoch": 0.06828021036115874, "grad_norm": 1.0859375, "learning_rate": 0.0004962751711239492, "loss": 6.3035, "mean_token_accuracy": 0.11463942378759384, "num_tokens": 2273008.0, "step": 1175 }, { "entropy": 6.430229234695434, "epoch": 0.06857076444780195, "grad_norm": 1.0078125, "learning_rate": 0.0004960586609031636, "loss": 6.3457, "mean_token_accuracy": 0.1155870608985424, "num_tokens": 2282522.0, "step": 1180 }, { "entropy": 6.601986408233643, "epoch": 0.06886131853444519, "grad_norm": 1.0625, "learning_rate": 0.0004958360902475224, "loss": 6.2529, "mean_token_accuracy": 0.12027783617377281, "num_tokens": 2292114.0, "step": 1185 }, { "entropy": 6.400939083099365, "epoch": 0.06915187262108842, "grad_norm": 0.94921875, "learning_rate": 0.0004956074652589125, "loss": 6.1978, "mean_token_accuracy": 0.12538810446858406, "num_tokens": 2301592.0, "step": 1190 }, { "entropy": 6.51713194847107, "epoch": 0.06944242670773164, "grad_norm": 0.9921875, "learning_rate": 0.0004953727922052035, "loss": 6.3201, "mean_token_accuracy": 0.11454231590032578, "num_tokens": 2310940.0, "step": 1195 }, { "entropy": 6.463452672958374, "epoch": 0.06973298079437487, "grad_norm": 1.0703125, "learning_rate": 0.0004951320775200756, "loss": 6.3959, "mean_token_accuracy": 0.1151392012834549, "num_tokens": 2320535.0, "step": 1200 }, { "entropy": 6.596390962600708, "epoch": 0.0700235348810181, "grad_norm": 0.96875, "learning_rate": 0.0004948853278028436, "loss": 6.2563, "mean_token_accuracy": 0.12523823976516724, "num_tokens": 2330431.0, "step": 1205 }, { "entropy": 6.3869446277618405, "epoch": 0.07031408896766134, "grad_norm": 1.0546875, "learning_rate": 0.0004946325498182755, "loss": 6.2036, "mean_token_accuracy": 0.12079060897231102, "num_tokens": 2339323.0, "step": 1210 }, { "entropy": 6.510322713851929, "epoch": 0.07060464305430456, "grad_norm": 1.0390625, "learning_rate": 0.0004943737504964076, "loss": 6.2992, "mean_token_accuracy": 0.11487918049097061, "num_tokens": 2349750.0, "step": 1215 }, { "entropy": 6.503530073165893, "epoch": 0.07089519714094779, "grad_norm": 1.1171875, "learning_rate": 0.000494108936932354, "loss": 6.2558, "mean_token_accuracy": 0.1210679478943348, "num_tokens": 2359147.0, "step": 1220 }, { "entropy": 6.520279359817505, "epoch": 0.07118575122759102, "grad_norm": 0.953125, "learning_rate": 0.0004938381163861124, "loss": 6.2786, "mean_token_accuracy": 0.11829182729125023, "num_tokens": 2368762.0, "step": 1225 }, { "entropy": 6.391372203826904, "epoch": 0.07147630531423424, "grad_norm": 0.9765625, "learning_rate": 0.0004935612962823645, "loss": 6.1568, "mean_token_accuracy": 0.12013374790549278, "num_tokens": 2378060.0, "step": 1230 }, { "entropy": 6.465664291381836, "epoch": 0.07176685940087747, "grad_norm": 1.0625, "learning_rate": 0.0004932784842102739, "loss": 6.2575, "mean_token_accuracy": 0.12200002744793892, "num_tokens": 2386997.0, "step": 1235 }, { "entropy": 6.6493157863616945, "epoch": 0.0720574134875207, "grad_norm": 1.2578125, "learning_rate": 0.0004929896879232758, "loss": 6.4026, "mean_token_accuracy": 0.11086667999625206, "num_tokens": 2396980.0, "step": 1240 }, { "entropy": 6.435001850128174, "epoch": 0.07234796757416392, "grad_norm": 1.0703125, "learning_rate": 0.0004926949153388668, "loss": 6.2556, "mean_token_accuracy": 0.1203616626560688, "num_tokens": 2406450.0, "step": 1245 }, { "entropy": 6.519892168045044, "epoch": 0.07263852166080716, "grad_norm": 1.03125, "learning_rate": 0.0004923941745383859, "loss": 6.2632, "mean_token_accuracy": 0.11274134442210197, "num_tokens": 2415985.0, "step": 1250 }, { "entropy": 6.457003879547119, "epoch": 0.07292907574745039, "grad_norm": 0.94921875, "learning_rate": 0.000492087473766794, "loss": 6.2928, "mean_token_accuracy": 0.11486212983727455, "num_tokens": 2425676.0, "step": 1255 }, { "entropy": 6.508018493652344, "epoch": 0.07321962983409362, "grad_norm": 1.0, "learning_rate": 0.000491774821432448, "loss": 6.2922, "mean_token_accuracy": 0.10985862240195274, "num_tokens": 2435918.0, "step": 1260 }, { "entropy": 6.5097509860992435, "epoch": 0.07351018392073684, "grad_norm": 1.0703125, "learning_rate": 0.0004914562261068693, "loss": 6.3562, "mean_token_accuracy": 0.11788229197263718, "num_tokens": 2445267.0, "step": 1265 }, { "entropy": 6.599736261367798, "epoch": 0.07380073800738007, "grad_norm": 1.140625, "learning_rate": 0.0004911316965245098, "loss": 6.3224, "mean_token_accuracy": 0.11191006749868393, "num_tokens": 2455885.0, "step": 1270 }, { "entropy": 6.489064168930054, "epoch": 0.0740912920940233, "grad_norm": 1.0234375, "learning_rate": 0.000490801241582512, "loss": 6.3483, "mean_token_accuracy": 0.11579938605427742, "num_tokens": 2465604.0, "step": 1275 }, { "entropy": 6.5532605171203615, "epoch": 0.07438184618066652, "grad_norm": 1.1015625, "learning_rate": 0.000490464870340465, "loss": 6.4458, "mean_token_accuracy": 0.10784725919365883, "num_tokens": 2475168.0, "step": 1280 }, { "entropy": 6.473039054870606, "epoch": 0.07467240026730976, "grad_norm": 1.1796875, "learning_rate": 0.0004901225920201563, "loss": 6.2243, "mean_token_accuracy": 0.12185250818729401, "num_tokens": 2484185.0, "step": 1285 }, { "entropy": 6.583461809158325, "epoch": 0.07496295435395299, "grad_norm": 1.1171875, "learning_rate": 0.000489774416005319, "loss": 6.3387, "mean_token_accuracy": 0.11904568299651146, "num_tokens": 2492992.0, "step": 1290 }, { "entropy": 6.418948078155518, "epoch": 0.07525350844059622, "grad_norm": 1.03125, "learning_rate": 0.0004894203518413742, "loss": 6.2065, "mean_token_accuracy": 0.119369375705719, "num_tokens": 2502541.0, "step": 1295 }, { "entropy": 6.468045377731324, "epoch": 0.07554406252723944, "grad_norm": 1.0546875, "learning_rate": 0.0004890604092351701, "loss": 6.2364, "mean_token_accuracy": 0.11862708181142807, "num_tokens": 2511947.0, "step": 1300 }, { "entropy": 6.385909509658814, "epoch": 0.07583461661388267, "grad_norm": 1.0703125, "learning_rate": 0.000488694598054715, "loss": 6.2525, "mean_token_accuracy": 0.12124920263886452, "num_tokens": 2521727.0, "step": 1305 }, { "entropy": 6.531244993209839, "epoch": 0.07612517070052591, "grad_norm": 1.0625, "learning_rate": 0.0004883229283289071, "loss": 6.2694, "mean_token_accuracy": 0.1218131199479103, "num_tokens": 2530680.0, "step": 1310 }, { "entropy": 6.422513055801391, "epoch": 0.07641572478716913, "grad_norm": 1.0703125, "learning_rate": 0.00048794541024725993, "loss": 6.1542, "mean_token_accuracy": 0.12266649156808854, "num_tokens": 2539414.0, "step": 1315 }, { "entropy": 6.491461181640625, "epoch": 0.07670627887381236, "grad_norm": 1.0390625, "learning_rate": 0.0004875620541596221, "loss": 6.3072, "mean_token_accuracy": 0.1141884945333004, "num_tokens": 2549609.0, "step": 1320 }, { "entropy": 6.4648158073425295, "epoch": 0.07699683296045559, "grad_norm": 1.0625, "learning_rate": 0.00048717287057589454, "loss": 6.2773, "mean_token_accuracy": 0.11799687221646309, "num_tokens": 2560081.0, "step": 1325 }, { "entropy": 6.400183534622192, "epoch": 0.07728738704709882, "grad_norm": 1.09375, "learning_rate": 0.0004867778701657417, "loss": 6.2328, "mean_token_accuracy": 0.11631238982081413, "num_tokens": 2569995.0, "step": 1330 }, { "entropy": 6.37140007019043, "epoch": 0.07757794113374204, "grad_norm": 1.046875, "learning_rate": 0.00048637706375829955, "loss": 6.1738, "mean_token_accuracy": 0.1213558554649353, "num_tokens": 2579502.0, "step": 1335 }, { "entropy": 6.476347970962524, "epoch": 0.07786849522038528, "grad_norm": 0.9921875, "learning_rate": 0.000485970462341878, "loss": 6.2553, "mean_token_accuracy": 0.12006450816988945, "num_tokens": 2589515.0, "step": 1340 }, { "entropy": 6.434140920639038, "epoch": 0.07815904930702851, "grad_norm": 1.0859375, "learning_rate": 0.00048555807706366044, "loss": 6.1897, "mean_token_accuracy": 0.12782623916864394, "num_tokens": 2598822.0, "step": 1345 }, { "entropy": 6.443134021759033, "epoch": 0.07844960339367173, "grad_norm": 0.93359375, "learning_rate": 0.00048513991922939756, "loss": 6.315, "mean_token_accuracy": 0.11421679928898812, "num_tokens": 2609169.0, "step": 1350 }, { "entropy": 6.484804105758667, "epoch": 0.07874015748031496, "grad_norm": 0.98046875, "learning_rate": 0.00048471600030309744, "loss": 6.2716, "mean_token_accuracy": 0.11644304916262627, "num_tokens": 2618683.0, "step": 1355 }, { "entropy": 6.466926431655883, "epoch": 0.07903071156695819, "grad_norm": 1.140625, "learning_rate": 0.00048428633190671186, "loss": 6.2371, "mean_token_accuracy": 0.12091248780488968, "num_tokens": 2627976.0, "step": 1360 }, { "entropy": 6.505730533599854, "epoch": 0.07932126565360141, "grad_norm": 1.0703125, "learning_rate": 0.0004838509258198167, "loss": 6.294, "mean_token_accuracy": 0.11860666498541832, "num_tokens": 2637235.0, "step": 1365 }, { "entropy": 6.393795537948608, "epoch": 0.07961181974024464, "grad_norm": 0.984375, "learning_rate": 0.00048340979397929, "loss": 6.2951, "mean_token_accuracy": 0.11754858568310737, "num_tokens": 2646698.0, "step": 1370 }, { "entropy": 6.505375099182129, "epoch": 0.07990237382688788, "grad_norm": 1.125, "learning_rate": 0.00048296294847898386, "loss": 6.2788, "mean_token_accuracy": 0.12090856656432152, "num_tokens": 2656357.0, "step": 1375 }, { "entropy": 6.434703159332275, "epoch": 0.08019292791353111, "grad_norm": 1.0859375, "learning_rate": 0.0004825104015693934, "loss": 6.1776, "mean_token_accuracy": 0.11764631941914558, "num_tokens": 2665561.0, "step": 1380 }, { "entropy": 6.437805318832398, "epoch": 0.08048348200017433, "grad_norm": 1.0859375, "learning_rate": 0.0004820521656573208, "loss": 6.1909, "mean_token_accuracy": 0.12296778410673141, "num_tokens": 2674600.0, "step": 1385 }, { "entropy": 6.368801641464233, "epoch": 0.08077403608681756, "grad_norm": 1.0234375, "learning_rate": 0.00048158825330553505, "loss": 6.1838, "mean_token_accuracy": 0.12880179584026336, "num_tokens": 2684944.0, "step": 1390 }, { "entropy": 6.461294555664063, "epoch": 0.0810645901734608, "grad_norm": 1.0078125, "learning_rate": 0.00048111867723242763, "loss": 6.1342, "mean_token_accuracy": 0.12006727010011672, "num_tokens": 2694467.0, "step": 1395 }, { "entropy": 6.442787504196167, "epoch": 0.08135514426010401, "grad_norm": 1.0546875, "learning_rate": 0.0004806434503116637, "loss": 6.2769, "mean_token_accuracy": 0.11950750723481178, "num_tokens": 2704499.0, "step": 1400 }, { "entropy": 6.378614997863769, "epoch": 0.08164569834674724, "grad_norm": 1.0, "learning_rate": 0.0004801625855718296, "loss": 6.1896, "mean_token_accuracy": 0.11940810978412628, "num_tokens": 2715424.0, "step": 1405 }, { "entropy": 6.41011266708374, "epoch": 0.08193625243339048, "grad_norm": 1.09375, "learning_rate": 0.00047967609619607477, "loss": 6.1788, "mean_token_accuracy": 0.12036006227135658, "num_tokens": 2724805.0, "step": 1410 }, { "entropy": 6.3130451202392575, "epoch": 0.08222680652003371, "grad_norm": 1.0234375, "learning_rate": 0.0004791839955217513, "loss": 6.1481, "mean_token_accuracy": 0.12863539010286332, "num_tokens": 2734216.0, "step": 1415 }, { "entropy": 6.424062490463257, "epoch": 0.08251736060667693, "grad_norm": 1.0234375, "learning_rate": 0.00047868629704004786, "loss": 6.2572, "mean_token_accuracy": 0.11476619765162469, "num_tokens": 2744146.0, "step": 1420 }, { "entropy": 6.422879314422607, "epoch": 0.08280791469332016, "grad_norm": 1.046875, "learning_rate": 0.00047818301439561965, "loss": 6.2419, "mean_token_accuracy": 0.12102322354912758, "num_tokens": 2754000.0, "step": 1425 }, { "entropy": 6.637474250793457, "epoch": 0.0830984687799634, "grad_norm": 1.046875, "learning_rate": 0.00047767416138621454, "loss": 6.288, "mean_token_accuracy": 0.11775907129049301, "num_tokens": 2763185.0, "step": 1430 }, { "entropy": 6.372423696517944, "epoch": 0.08338902286660661, "grad_norm": 1.078125, "learning_rate": 0.000477159751962295, "loss": 6.2381, "mean_token_accuracy": 0.11884959116578102, "num_tokens": 2773324.0, "step": 1435 }, { "entropy": 6.485676908493042, "epoch": 0.08367957695324985, "grad_norm": 1.0546875, "learning_rate": 0.00047663980022665507, "loss": 6.2207, "mean_token_accuracy": 0.11649533435702324, "num_tokens": 2783184.0, "step": 1440 }, { "entropy": 6.396980142593383, "epoch": 0.08397013103989308, "grad_norm": 0.9296875, "learning_rate": 0.00047611432043403437, "loss": 6.2223, "mean_token_accuracy": 0.11544240266084671, "num_tokens": 2793278.0, "step": 1445 }, { "entropy": 6.366146802902222, "epoch": 0.08426068512653631, "grad_norm": 1.0625, "learning_rate": 0.0004755833269907267, "loss": 6.1262, "mean_token_accuracy": 0.12203074395656585, "num_tokens": 2802164.0, "step": 1450 }, { "entropy": 6.457718706130981, "epoch": 0.08455123921317953, "grad_norm": 1.0078125, "learning_rate": 0.0004750468344541857, "loss": 6.1891, "mean_token_accuracy": 0.11854342371225357, "num_tokens": 2811537.0, "step": 1455 }, { "entropy": 6.381798458099365, "epoch": 0.08484179329982276, "grad_norm": 1.0546875, "learning_rate": 0.00047450485753262525, "loss": 6.2965, "mean_token_accuracy": 0.11684540212154389, "num_tokens": 2821861.0, "step": 1460 }, { "entropy": 6.412109518051148, "epoch": 0.085132347386466, "grad_norm": 0.98046875, "learning_rate": 0.00047395741108461633, "loss": 6.1718, "mean_token_accuracy": 0.12374548763036727, "num_tokens": 2831916.0, "step": 1465 }, { "entropy": 6.33392972946167, "epoch": 0.08542290147310921, "grad_norm": 1.0546875, "learning_rate": 0.00047340451011867985, "loss": 6.1604, "mean_token_accuracy": 0.12683377638459206, "num_tokens": 2840979.0, "step": 1470 }, { "entropy": 6.418259906768799, "epoch": 0.08571345555975245, "grad_norm": 1.1015625, "learning_rate": 0.00047284616979287515, "loss": 6.1782, "mean_token_accuracy": 0.11932171955704689, "num_tokens": 2851332.0, "step": 1475 }, { "entropy": 6.265405559539795, "epoch": 0.08600400964639568, "grad_norm": 1.03125, "learning_rate": 0.00047228240541438433, "loss": 6.073, "mean_token_accuracy": 0.12999156266450881, "num_tokens": 2860134.0, "step": 1480 }, { "entropy": 6.458755302429199, "epoch": 0.08629456373303891, "grad_norm": 1.1171875, "learning_rate": 0.00047171323243909257, "loss": 6.2126, "mean_token_accuracy": 0.11848914325237274, "num_tokens": 2869218.0, "step": 1485 }, { "entropy": 6.345139837265014, "epoch": 0.08658511781968213, "grad_norm": 0.98828125, "learning_rate": 0.00047113866647116457, "loss": 6.1426, "mean_token_accuracy": 0.12274593263864517, "num_tokens": 2878529.0, "step": 1490 }, { "entropy": 6.426075124740601, "epoch": 0.08687567190632536, "grad_norm": 1.0625, "learning_rate": 0.0004705587232626164, "loss": 6.1579, "mean_token_accuracy": 0.11727055683732032, "num_tokens": 2888149.0, "step": 1495 }, { "entropy": 6.3561450958251955, "epoch": 0.0871662259929686, "grad_norm": 1.03125, "learning_rate": 0.00046997341871288424, "loss": 6.1347, "mean_token_accuracy": 0.12332948073744773, "num_tokens": 2897790.0, "step": 1500 }, { "entropy": 6.316312408447265, "epoch": 0.08745678007961181, "grad_norm": 0.87890625, "learning_rate": 0.0004693827688683879, "loss": 6.2274, "mean_token_accuracy": 0.12053183913230896, "num_tokens": 2908168.0, "step": 1505 }, { "entropy": 6.390694427490234, "epoch": 0.08774733416625505, "grad_norm": 1.0, "learning_rate": 0.0004687867899220914, "loss": 6.116, "mean_token_accuracy": 0.12294506877660752, "num_tokens": 2918734.0, "step": 1510 }, { "entropy": 6.296877431869507, "epoch": 0.08803788825289828, "grad_norm": 1.0078125, "learning_rate": 0.00046818549821305846, "loss": 6.0839, "mean_token_accuracy": 0.1293163001537323, "num_tokens": 2927599.0, "step": 1515 }, { "entropy": 6.2974005222320555, "epoch": 0.0883284423395415, "grad_norm": 1.0234375, "learning_rate": 0.00046757891022600494, "loss": 6.1189, "mean_token_accuracy": 0.12587246671319008, "num_tokens": 2936707.0, "step": 1520 }, { "entropy": 6.4475304126739506, "epoch": 0.08861899642618473, "grad_norm": 1.0078125, "learning_rate": 0.0004669670425908471, "loss": 6.187, "mean_token_accuracy": 0.12100831568241119, "num_tokens": 2945607.0, "step": 1525 }, { "entropy": 6.347147464752197, "epoch": 0.08890955051282796, "grad_norm": 1.09375, "learning_rate": 0.0004663499120822451, "loss": 6.0989, "mean_token_accuracy": 0.12438113316893577, "num_tokens": 2954836.0, "step": 1530 }, { "entropy": 6.321421432495117, "epoch": 0.0892001045994712, "grad_norm": 1.0234375, "learning_rate": 0.0004657275356191437, "loss": 6.1061, "mean_token_accuracy": 0.12466374784708023, "num_tokens": 2964338.0, "step": 1535 }, { "entropy": 6.353213739395142, "epoch": 0.08949065868611442, "grad_norm": 1.0546875, "learning_rate": 0.00046509993026430804, "loss": 6.1634, "mean_token_accuracy": 0.12038285210728646, "num_tokens": 2973943.0, "step": 1540 }, { "entropy": 6.339908075332642, "epoch": 0.08978121277275765, "grad_norm": 1.0546875, "learning_rate": 0.0004644671132238558, "loss": 6.0839, "mean_token_accuracy": 0.12774784490466118, "num_tokens": 2983315.0, "step": 1545 }, { "entropy": 6.300918197631836, "epoch": 0.09007176685940088, "grad_norm": 1.1640625, "learning_rate": 0.00046382910184678585, "loss": 6.0278, "mean_token_accuracy": 0.12856598794460297, "num_tokens": 2992039.0, "step": 1550 }, { "entropy": 6.178817892074585, "epoch": 0.0903623209460441, "grad_norm": 0.9765625, "learning_rate": 0.0004631859136245025, "loss": 6.0594, "mean_token_accuracy": 0.12656542137265206, "num_tokens": 3001428.0, "step": 1555 }, { "entropy": 6.426393222808838, "epoch": 0.09065287503268733, "grad_norm": 0.96875, "learning_rate": 0.0004625375661903357, "loss": 6.1823, "mean_token_accuracy": 0.12130758315324783, "num_tokens": 3012060.0, "step": 1560 }, { "entropy": 6.289572381973267, "epoch": 0.09094342911933057, "grad_norm": 1.03125, "learning_rate": 0.00046188407731905787, "loss": 6.133, "mean_token_accuracy": 0.11978519856929778, "num_tokens": 3021371.0, "step": 1565 }, { "entropy": 6.355306005477905, "epoch": 0.0912339832059738, "grad_norm": 1.0390625, "learning_rate": 0.00046122546492639643, "loss": 6.1783, "mean_token_accuracy": 0.12279156744480133, "num_tokens": 3030934.0, "step": 1570 }, { "entropy": 6.265284681320191, "epoch": 0.09152453729261702, "grad_norm": 1.0546875, "learning_rate": 0.000460561747068543, "loss": 6.1286, "mean_token_accuracy": 0.12520743757486344, "num_tokens": 3041182.0, "step": 1575 }, { "entropy": 6.356278705596924, "epoch": 0.09181509137926025, "grad_norm": 1.0859375, "learning_rate": 0.0004598929419416578, "loss": 6.0982, "mean_token_accuracy": 0.12530012279748917, "num_tokens": 3050086.0, "step": 1580 }, { "entropy": 6.332121706008911, "epoch": 0.09210564546590348, "grad_norm": 1.078125, "learning_rate": 0.00045921906788137123, "loss": 6.2171, "mean_token_accuracy": 0.12314857169985771, "num_tokens": 3061403.0, "step": 1585 }, { "entropy": 6.35912766456604, "epoch": 0.0923961995525467, "grad_norm": 1.078125, "learning_rate": 0.00045854014336228115, "loss": 6.1708, "mean_token_accuracy": 0.12304715439677238, "num_tokens": 3070942.0, "step": 1590 }, { "entropy": 6.291305208206177, "epoch": 0.09268675363918993, "grad_norm": 1.0859375, "learning_rate": 0.00045785618699744615, "loss": 6.0504, "mean_token_accuracy": 0.12217177525162697, "num_tokens": 3079526.0, "step": 1595 }, { "entropy": 6.269596576690674, "epoch": 0.09297730772583317, "grad_norm": 1.0546875, "learning_rate": 0.00045716721753787543, "loss": 6.0384, "mean_token_accuracy": 0.12933970913290976, "num_tokens": 3090977.0, "step": 1600 }, { "entropy": 6.290025997161865, "epoch": 0.0932678618124764, "grad_norm": 1.0859375, "learning_rate": 0.0004564732538720148, "loss": 6.1565, "mean_token_accuracy": 0.1253731794655323, "num_tokens": 3100830.0, "step": 1605 }, { "entropy": 6.346334457397461, "epoch": 0.09355841589911962, "grad_norm": 0.921875, "learning_rate": 0.00045577431502522877, "loss": 6.1792, "mean_token_accuracy": 0.12612521946430205, "num_tokens": 3110285.0, "step": 1610 }, { "entropy": 6.386321640014648, "epoch": 0.09384896998576285, "grad_norm": 0.9921875, "learning_rate": 0.0004550704201592787, "loss": 6.0621, "mean_token_accuracy": 0.12808025181293486, "num_tokens": 3119690.0, "step": 1615 }, { "entropy": 6.274943828582764, "epoch": 0.09413952407240608, "grad_norm": 1.0, "learning_rate": 0.0004543615885717981, "loss": 6.1145, "mean_token_accuracy": 0.12201056703925132, "num_tokens": 3129656.0, "step": 1620 }, { "entropy": 6.2934671401977536, "epoch": 0.0944300781590493, "grad_norm": 1.0078125, "learning_rate": 0.00045364783969576296, "loss": 6.0519, "mean_token_accuracy": 0.12800228744745254, "num_tokens": 3140083.0, "step": 1625 }, { "entropy": 6.261609125137329, "epoch": 0.09472063224569253, "grad_norm": 1.078125, "learning_rate": 0.0004529291930989592, "loss": 6.0483, "mean_token_accuracy": 0.13036949634552003, "num_tokens": 3149747.0, "step": 1630 }, { "entropy": 6.2602826118469235, "epoch": 0.09501118633233577, "grad_norm": 0.9296875, "learning_rate": 0.0004522056684834464, "loss": 6.019, "mean_token_accuracy": 0.12770563066005708, "num_tokens": 3160367.0, "step": 1635 }, { "entropy": 6.249707126617432, "epoch": 0.09530174041897899, "grad_norm": 1.0625, "learning_rate": 0.0004514772856850173, "loss": 6.0068, "mean_token_accuracy": 0.12763984724879265, "num_tokens": 3169375.0, "step": 1640 }, { "entropy": 6.236002111434937, "epoch": 0.09559229450562222, "grad_norm": 1.0625, "learning_rate": 0.0004507440646726542, "loss": 6.0794, "mean_token_accuracy": 0.13096466660499573, "num_tokens": 3178907.0, "step": 1645 }, { "entropy": 6.366798305511475, "epoch": 0.09588284859226545, "grad_norm": 1.0, "learning_rate": 0.0004500060255479818, "loss": 6.0808, "mean_token_accuracy": 0.12382574900984764, "num_tokens": 3189336.0, "step": 1650 }, { "entropy": 6.227943420410156, "epoch": 0.09617340267890868, "grad_norm": 1.1015625, "learning_rate": 0.0004492631885447151, "loss": 6.1707, "mean_token_accuracy": 0.12618450224399566, "num_tokens": 3198787.0, "step": 1655 }, { "entropy": 6.291205787658692, "epoch": 0.0964639567655519, "grad_norm": 1.0625, "learning_rate": 0.00044851557402810616, "loss": 6.0351, "mean_token_accuracy": 0.1262456052005291, "num_tokens": 3208161.0, "step": 1660 }, { "entropy": 6.287449550628662, "epoch": 0.09675451085219514, "grad_norm": 1.015625, "learning_rate": 0.00044776320249438444, "loss": 6.095, "mean_token_accuracy": 0.1295604422688484, "num_tokens": 3217589.0, "step": 1665 }, { "entropy": 6.171545219421387, "epoch": 0.09704506493883837, "grad_norm": 0.9296875, "learning_rate": 0.00044700609457019565, "loss": 6.0335, "mean_token_accuracy": 0.12443587705492973, "num_tokens": 3227159.0, "step": 1670 }, { "entropy": 6.260297155380249, "epoch": 0.09733561902548159, "grad_norm": 0.984375, "learning_rate": 0.0004462442710120359, "loss": 6.0323, "mean_token_accuracy": 0.13212064653635025, "num_tokens": 3236765.0, "step": 1675 }, { "entropy": 6.34990553855896, "epoch": 0.09762617311212482, "grad_norm": 0.9140625, "learning_rate": 0.000445477752705683, "loss": 6.1196, "mean_token_accuracy": 0.12271321415901185, "num_tokens": 3247136.0, "step": 1680 }, { "entropy": 6.242118835449219, "epoch": 0.09791672719876805, "grad_norm": 1.1171875, "learning_rate": 0.00044470656066562336, "loss": 6.1049, "mean_token_accuracy": 0.12386861220002174, "num_tokens": 3256880.0, "step": 1685 }, { "entropy": 6.314453554153443, "epoch": 0.09820728128541129, "grad_norm": 1.015625, "learning_rate": 0.0004439307160344765, "loss": 6.138, "mean_token_accuracy": 0.12304992526769638, "num_tokens": 3267104.0, "step": 1690 }, { "entropy": 6.257071495056152, "epoch": 0.0984978353720545, "grad_norm": 1.0703125, "learning_rate": 0.00044315024008241473, "loss": 6.0882, "mean_token_accuracy": 0.12182446792721749, "num_tokens": 3276165.0, "step": 1695 }, { "entropy": 6.447886228561401, "epoch": 0.09878838945869774, "grad_norm": 1.015625, "learning_rate": 0.0004423651542065806, "loss": 6.2112, "mean_token_accuracy": 0.12100318372249604, "num_tokens": 3285600.0, "step": 1700 }, { "entropy": 6.263007783889771, "epoch": 0.09907894354534097, "grad_norm": 1.03125, "learning_rate": 0.00044157547993050006, "loss": 6.1135, "mean_token_accuracy": 0.12620161846280098, "num_tokens": 3295654.0, "step": 1705 }, { "entropy": 6.253702402114868, "epoch": 0.09936949763198419, "grad_norm": 1.0703125, "learning_rate": 0.00044078123890349227, "loss": 6.0644, "mean_token_accuracy": 0.134315574914217, "num_tokens": 3304743.0, "step": 1710 }, { "entropy": 6.289787006378174, "epoch": 0.09966005171862742, "grad_norm": 1.0390625, "learning_rate": 0.00043998245290007606, "loss": 6.0324, "mean_token_accuracy": 0.12361097186803818, "num_tokens": 3313951.0, "step": 1715 }, { "entropy": 6.332495594024659, "epoch": 0.09995060580527065, "grad_norm": 0.98828125, "learning_rate": 0.00043917914381937323, "loss": 6.0995, "mean_token_accuracy": 0.1251884751021862, "num_tokens": 3324508.0, "step": 1720 }, { "entropy": 6.194294214248657, "epoch": 0.10024115989191389, "grad_norm": 0.94140625, "learning_rate": 0.00043837133368450815, "loss": 6.054, "mean_token_accuracy": 0.12405704930424691, "num_tokens": 3335373.0, "step": 1725 }, { "entropy": 6.333467721939087, "epoch": 0.1005317139785571, "grad_norm": 1.0234375, "learning_rate": 0.0004375590446420037, "loss": 6.0678, "mean_token_accuracy": 0.12813965305685998, "num_tokens": 3345242.0, "step": 1730 }, { "entropy": 6.268236112594605, "epoch": 0.10082226806520034, "grad_norm": 1.125, "learning_rate": 0.0004367422989611743, "loss": 6.0504, "mean_token_accuracy": 0.13582077920436858, "num_tokens": 3354783.0, "step": 1735 }, { "entropy": 6.266420888900757, "epoch": 0.10111282215184357, "grad_norm": 0.9921875, "learning_rate": 0.0004359211190335153, "loss": 6.0742, "mean_token_accuracy": 0.13280235901474952, "num_tokens": 3363705.0, "step": 1740 }, { "entropy": 6.345703363418579, "epoch": 0.10140337623848679, "grad_norm": 1.0625, "learning_rate": 0.00043509552737208923, "loss": 6.1009, "mean_token_accuracy": 0.12972408011555672, "num_tokens": 3372331.0, "step": 1745 }, { "entropy": 6.221278953552246, "epoch": 0.10169393032513002, "grad_norm": 0.984375, "learning_rate": 0.00043426554661090853, "loss": 6.0122, "mean_token_accuracy": 0.13363172858953476, "num_tokens": 3380986.0, "step": 1750 }, { "entropy": 6.314662122726441, "epoch": 0.10198448441177325, "grad_norm": 1.046875, "learning_rate": 0.00043343119950431516, "loss": 6.0681, "mean_token_accuracy": 0.12935666590929032, "num_tokens": 3390852.0, "step": 1755 }, { "entropy": 6.2016339778900145, "epoch": 0.10227503849841647, "grad_norm": 1.03125, "learning_rate": 0.00043259250892635644, "loss": 6.0835, "mean_token_accuracy": 0.1321997858583927, "num_tokens": 3399916.0, "step": 1760 }, { "entropy": 6.3428630352020265, "epoch": 0.1025655925850597, "grad_norm": 0.92578125, "learning_rate": 0.0004317494978701582, "loss": 6.0995, "mean_token_accuracy": 0.13536889478564262, "num_tokens": 3409913.0, "step": 1765 }, { "entropy": 6.2461179256439205, "epoch": 0.10285614667170294, "grad_norm": 0.953125, "learning_rate": 0.0004309021894472943, "loss": 6.1217, "mean_token_accuracy": 0.12532801926136017, "num_tokens": 3420817.0, "step": 1770 }, { "entropy": 6.281768560409546, "epoch": 0.10314670075834617, "grad_norm": 0.9453125, "learning_rate": 0.0004300506068871534, "loss": 6.0642, "mean_token_accuracy": 0.13035471364855766, "num_tokens": 3430873.0, "step": 1775 }, { "entropy": 6.25596866607666, "epoch": 0.10343725484498939, "grad_norm": 1.0859375, "learning_rate": 0.00042919477353630135, "loss": 5.9967, "mean_token_accuracy": 0.13541611135005951, "num_tokens": 3440078.0, "step": 1780 }, { "entropy": 6.170544290542603, "epoch": 0.10372780893163262, "grad_norm": 0.9609375, "learning_rate": 0.000428334712857842, "loss": 5.9563, "mean_token_accuracy": 0.1369057409465313, "num_tokens": 3449029.0, "step": 1785 }, { "entropy": 6.206426477432251, "epoch": 0.10401836301827586, "grad_norm": 0.91015625, "learning_rate": 0.00042747044843077304, "loss": 6.0783, "mean_token_accuracy": 0.13255516290664673, "num_tokens": 3458880.0, "step": 1790 }, { "entropy": 6.415725946426392, "epoch": 0.10430891710491907, "grad_norm": 1.0234375, "learning_rate": 0.00042660200394934047, "loss": 6.1575, "mean_token_accuracy": 0.1243210181593895, "num_tokens": 3468132.0, "step": 1795 }, { "entropy": 6.199592542648316, "epoch": 0.10459947119156231, "grad_norm": 1.1015625, "learning_rate": 0.00042572940322238844, "loss": 6.0499, "mean_token_accuracy": 0.1273614466190338, "num_tokens": 3477429.0, "step": 1800 }, { "entropy": 6.205861282348633, "epoch": 0.10489002527820554, "grad_norm": 1.0859375, "learning_rate": 0.00042485267017270664, "loss": 6.0663, "mean_token_accuracy": 0.12217539176344872, "num_tokens": 3487526.0, "step": 1805 }, { "entropy": 6.246215867996216, "epoch": 0.10518057936484877, "grad_norm": 1.109375, "learning_rate": 0.0004239718288363745, "loss": 6.0049, "mean_token_accuracy": 0.14162934869527816, "num_tokens": 3496280.0, "step": 1810 }, { "entropy": 6.257954835891724, "epoch": 0.10547113345149199, "grad_norm": 1.109375, "learning_rate": 0.0004230869033621023, "loss": 6.0072, "mean_token_accuracy": 0.13223105296492577, "num_tokens": 3505871.0, "step": 1815 }, { "entropy": 6.254146718978882, "epoch": 0.10576168753813522, "grad_norm": 0.91015625, "learning_rate": 0.0004221979180105688, "loss": 5.9791, "mean_token_accuracy": 0.13909292891621589, "num_tokens": 3515846.0, "step": 1820 }, { "entropy": 6.229612159729004, "epoch": 0.10605224162477846, "grad_norm": 1.078125, "learning_rate": 0.00042130489715375645, "loss": 6.0716, "mean_token_accuracy": 0.12691670581698417, "num_tokens": 3525358.0, "step": 1825 }, { "entropy": 6.216009950637817, "epoch": 0.10634279571142168, "grad_norm": 1.046875, "learning_rate": 0.00042040786527428335, "loss": 5.985, "mean_token_accuracy": 0.13652188181877137, "num_tokens": 3534459.0, "step": 1830 }, { "entropy": 6.155793523788452, "epoch": 0.10663334979806491, "grad_norm": 1.0390625, "learning_rate": 0.0004195068469647315, "loss": 5.9546, "mean_token_accuracy": 0.1332765720784664, "num_tokens": 3545268.0, "step": 1835 }, { "entropy": 6.335662126541138, "epoch": 0.10692390388470814, "grad_norm": 1.078125, "learning_rate": 0.00041860186692697297, "loss": 6.0853, "mean_token_accuracy": 0.13031049072742462, "num_tokens": 3554281.0, "step": 1840 }, { "entropy": 6.235852289199829, "epoch": 0.10721445797135137, "grad_norm": 1.09375, "learning_rate": 0.00041769294997149264, "loss": 6.0309, "mean_token_accuracy": 0.13206790015101433, "num_tokens": 3563505.0, "step": 1845 }, { "entropy": 6.190238428115845, "epoch": 0.10750501205799459, "grad_norm": 1.0703125, "learning_rate": 0.0004167801210167081, "loss": 5.9761, "mean_token_accuracy": 0.14046704694628714, "num_tokens": 3573288.0, "step": 1850 }, { "entropy": 6.100773715972901, "epoch": 0.10779556614463782, "grad_norm": 1.0859375, "learning_rate": 0.0004158634050882861, "loss": 5.898, "mean_token_accuracy": 0.14386857226490973, "num_tokens": 3582156.0, "step": 1855 }, { "entropy": 6.168911600112915, "epoch": 0.10808612023128106, "grad_norm": 0.93359375, "learning_rate": 0.0004149428273184569, "loss": 6.0786, "mean_token_accuracy": 0.13001196533441545, "num_tokens": 3592708.0, "step": 1860 }, { "entropy": 6.220205640792846, "epoch": 0.10837667431792428, "grad_norm": 0.99609375, "learning_rate": 0.0004140184129453253, "loss": 5.9618, "mean_token_accuracy": 0.13206626623868942, "num_tokens": 3602983.0, "step": 1865 }, { "entropy": 6.231088352203369, "epoch": 0.10866722840456751, "grad_norm": 1.0234375, "learning_rate": 0.000413090187312178, "loss": 6.031, "mean_token_accuracy": 0.13375057205557822, "num_tokens": 3612779.0, "step": 1870 }, { "entropy": 6.18984317779541, "epoch": 0.10895778249121074, "grad_norm": 1.0, "learning_rate": 0.0004121581758667898, "loss": 6.0085, "mean_token_accuracy": 0.1313602216541767, "num_tokens": 3622850.0, "step": 1875 }, { "entropy": 6.1243922233581545, "epoch": 0.10924833657785397, "grad_norm": 0.9609375, "learning_rate": 0.00041122240416072533, "loss": 6.0192, "mean_token_accuracy": 0.1339510276913643, "num_tokens": 3632673.0, "step": 1880 }, { "entropy": 6.293065547943115, "epoch": 0.1095388906644972, "grad_norm": 1.1328125, "learning_rate": 0.0004102828978486385, "loss": 6.0195, "mean_token_accuracy": 0.1283419005572796, "num_tokens": 3642571.0, "step": 1885 }, { "entropy": 6.16258282661438, "epoch": 0.10982944475114043, "grad_norm": 1.03125, "learning_rate": 0.0004093396826875695, "loss": 6.0009, "mean_token_accuracy": 0.13664330318570136, "num_tokens": 3651864.0, "step": 1890 }, { "entropy": 6.2043415069580075, "epoch": 0.11011999883778366, "grad_norm": 0.99609375, "learning_rate": 0.00040839278453623837, "loss": 5.9716, "mean_token_accuracy": 0.1314692884683609, "num_tokens": 3662410.0, "step": 1895 }, { "entropy": 6.146759462356568, "epoch": 0.11041055292442688, "grad_norm": 1.0234375, "learning_rate": 0.0004074422293543363, "loss": 5.9287, "mean_token_accuracy": 0.13767404705286027, "num_tokens": 3672340.0, "step": 1900 }, { "entropy": 6.102005672454834, "epoch": 0.11070110701107011, "grad_norm": 0.92578125, "learning_rate": 0.0004064880432018137, "loss": 6.0753, "mean_token_accuracy": 0.1314219541847706, "num_tokens": 3682745.0, "step": 1905 }, { "entropy": 6.236651849746704, "epoch": 0.11099166109771334, "grad_norm": 1.0, "learning_rate": 0.00040553025223816615, "loss": 5.9814, "mean_token_accuracy": 0.13747138530015945, "num_tokens": 3692075.0, "step": 1910 }, { "entropy": 6.1932165145874025, "epoch": 0.11128221518435656, "grad_norm": 0.9921875, "learning_rate": 0.00040456888272171653, "loss": 5.9772, "mean_token_accuracy": 0.13977260813117026, "num_tokens": 3701639.0, "step": 1915 }, { "entropy": 6.29685697555542, "epoch": 0.1115727692709998, "grad_norm": 1.09375, "learning_rate": 0.00040360396100889577, "loss": 6.0266, "mean_token_accuracy": 0.13467289954423906, "num_tokens": 3711103.0, "step": 1920 }, { "entropy": 6.2153466701507565, "epoch": 0.11186332335764303, "grad_norm": 0.98046875, "learning_rate": 0.0004026355135535202, "loss": 6.0563, "mean_token_accuracy": 0.1295161299407482, "num_tokens": 3720229.0, "step": 1925 }, { "entropy": 6.287035751342773, "epoch": 0.11215387744428626, "grad_norm": 0.92578125, "learning_rate": 0.000401663566906066, "loss": 6.0289, "mean_token_accuracy": 0.1336451180279255, "num_tokens": 3730616.0, "step": 1930 }, { "entropy": 6.164524412155151, "epoch": 0.11244443153092948, "grad_norm": 1.0234375, "learning_rate": 0.00040068814771294134, "loss": 5.8945, "mean_token_accuracy": 0.13720172494649888, "num_tokens": 3739829.0, "step": 1935 }, { "entropy": 6.15112156867981, "epoch": 0.11273498561757271, "grad_norm": 1.046875, "learning_rate": 0.0003997092827157562, "loss": 6.0658, "mean_token_accuracy": 0.12816951870918275, "num_tokens": 3749830.0, "step": 1940 }, { "entropy": 6.2455854415893555, "epoch": 0.11302553970421594, "grad_norm": 1.0390625, "learning_rate": 0.000398726998750589, "loss": 5.9084, "mean_token_accuracy": 0.14013779759407044, "num_tokens": 3759583.0, "step": 1945 }, { "entropy": 6.15108060836792, "epoch": 0.11331609379085916, "grad_norm": 1.046875, "learning_rate": 0.00039774132274725076, "loss": 5.9655, "mean_token_accuracy": 0.13013281747698785, "num_tokens": 3769219.0, "step": 1950 }, { "entropy": 6.253466367721558, "epoch": 0.1136066478775024, "grad_norm": 0.94921875, "learning_rate": 0.00039675228172854707, "loss": 5.9664, "mean_token_accuracy": 0.1329497739672661, "num_tokens": 3778913.0, "step": 1955 }, { "entropy": 6.288890600204468, "epoch": 0.11389720196414563, "grad_norm": 1.0, "learning_rate": 0.0003957599028095371, "loss": 6.053, "mean_token_accuracy": 0.133541439473629, "num_tokens": 3788544.0, "step": 1960 }, { "entropy": 6.138053035736084, "epoch": 0.11418775605078886, "grad_norm": 1.109375, "learning_rate": 0.00039476421319679017, "loss": 5.8634, "mean_token_accuracy": 0.1413568802177906, "num_tokens": 3797921.0, "step": 1965 }, { "entropy": 6.105274248123169, "epoch": 0.11447831013743208, "grad_norm": 1.0078125, "learning_rate": 0.00039376524018764, "loss": 5.9334, "mean_token_accuracy": 0.13669840842485428, "num_tokens": 3807442.0, "step": 1970 }, { "entropy": 6.157075214385986, "epoch": 0.11476886422407531, "grad_norm": 1.0234375, "learning_rate": 0.00039276301116943616, "loss": 5.9183, "mean_token_accuracy": 0.1388249270617962, "num_tokens": 3817875.0, "step": 1975 }, { "entropy": 6.148885679244995, "epoch": 0.11505941831071854, "grad_norm": 1.0859375, "learning_rate": 0.0003917575536187936, "loss": 6.0358, "mean_token_accuracy": 0.1268165521323681, "num_tokens": 3826925.0, "step": 1980 }, { "entropy": 6.279354047775269, "epoch": 0.11534997239736176, "grad_norm": 1.0, "learning_rate": 0.00039074889510083894, "loss": 6.0141, "mean_token_accuracy": 0.1367575228214264, "num_tokens": 3836047.0, "step": 1985 }, { "entropy": 6.161008596420288, "epoch": 0.115640526484005, "grad_norm": 1.078125, "learning_rate": 0.00038973706326845495, "loss": 5.969, "mean_token_accuracy": 0.1333487443625927, "num_tokens": 3845874.0, "step": 1990 }, { "entropy": 6.21445050239563, "epoch": 0.11593108057064823, "grad_norm": 1.03125, "learning_rate": 0.0003887220858615225, "loss": 5.9627, "mean_token_accuracy": 0.13459742665290833, "num_tokens": 3855967.0, "step": 1995 }, { "entropy": 6.1095618724823, "epoch": 0.11622163465729146, "grad_norm": 0.98828125, "learning_rate": 0.0003877039907061597, "loss": 5.9908, "mean_token_accuracy": 0.13831030651926995, "num_tokens": 3866467.0, "step": 2000 }, { "entropy": 6.203917360305786, "epoch": 0.11651218874393468, "grad_norm": 1.09375, "learning_rate": 0.0003866828057139598, "loss": 5.9744, "mean_token_accuracy": 0.13815115690231322, "num_tokens": 3875916.0, "step": 2005 }, { "entropy": 6.18111400604248, "epoch": 0.11680274283057791, "grad_norm": 1.0546875, "learning_rate": 0.00038565855888122503, "loss": 5.9594, "mean_token_accuracy": 0.1330759234726429, "num_tokens": 3885987.0, "step": 2010 }, { "entropy": 6.2428779125213625, "epoch": 0.11709329691722115, "grad_norm": 1.109375, "learning_rate": 0.00038463127828819975, "loss": 6.0059, "mean_token_accuracy": 0.13700252026319504, "num_tokens": 3895809.0, "step": 2015 }, { "entropy": 6.195304489135742, "epoch": 0.11738385100386436, "grad_norm": 1.0078125, "learning_rate": 0.00038360099209830043, "loss": 6.0109, "mean_token_accuracy": 0.132937653362751, "num_tokens": 3905491.0, "step": 2020 }, { "entropy": 6.115702676773071, "epoch": 0.1176744050905076, "grad_norm": 0.921875, "learning_rate": 0.0003825677285573433, "loss": 5.8753, "mean_token_accuracy": 0.14159394055604935, "num_tokens": 3915073.0, "step": 2025 }, { "entropy": 6.103581476211548, "epoch": 0.11796495917715083, "grad_norm": 1.0625, "learning_rate": 0.00038153151599277027, "loss": 5.9516, "mean_token_accuracy": 0.1373932972550392, "num_tokens": 3924786.0, "step": 2030 }, { "entropy": 6.296129131317139, "epoch": 0.11825551326379405, "grad_norm": 0.97265625, "learning_rate": 0.0003804923828128723, "loss": 6.1096, "mean_token_accuracy": 0.12966496869921684, "num_tokens": 3934745.0, "step": 2035 }, { "entropy": 6.2405133724212645, "epoch": 0.11854606735043728, "grad_norm": 1.0234375, "learning_rate": 0.0003794503575060104, "loss": 5.928, "mean_token_accuracy": 0.1365241065621376, "num_tokens": 3945328.0, "step": 2040 }, { "entropy": 6.151012706756592, "epoch": 0.11883662143708051, "grad_norm": 0.93359375, "learning_rate": 0.00037840546863983484, "loss": 6.0549, "mean_token_accuracy": 0.12878239378333092, "num_tokens": 3955894.0, "step": 2045 }, { "entropy": 6.219704437255859, "epoch": 0.11912717552372375, "grad_norm": 1.078125, "learning_rate": 0.0003773577448605015, "loss": 5.9845, "mean_token_accuracy": 0.13799333572387695, "num_tokens": 3964895.0, "step": 2050 }, { "entropy": 6.154746055603027, "epoch": 0.11941772961036697, "grad_norm": 1.078125, "learning_rate": 0.0003763072148918872, "loss": 6.0396, "mean_token_accuracy": 0.12946364358067514, "num_tokens": 3974681.0, "step": 2055 }, { "entropy": 6.248775148391724, "epoch": 0.1197082836970102, "grad_norm": 1.046875, "learning_rate": 0.0003752539075348017, "loss": 6.0252, "mean_token_accuracy": 0.13851658627390862, "num_tokens": 3984402.0, "step": 2060 }, { "entropy": 6.127353715896606, "epoch": 0.11999883778365343, "grad_norm": 1.0703125, "learning_rate": 0.00037419785166619817, "loss": 6.0268, "mean_token_accuracy": 0.1265586420893669, "num_tokens": 3995279.0, "step": 2065 }, { "entropy": 6.17973141670227, "epoch": 0.12028939187029665, "grad_norm": 1.015625, "learning_rate": 0.0003731390762383818, "loss": 5.8617, "mean_token_accuracy": 0.15186458677053452, "num_tokens": 4003525.0, "step": 2070 }, { "entropy": 6.115002965927124, "epoch": 0.12057994595693988, "grad_norm": 1.0546875, "learning_rate": 0.0003720776102782158, "loss": 5.8387, "mean_token_accuracy": 0.13773723766207696, "num_tokens": 4012373.0, "step": 2075 }, { "entropy": 6.103297424316406, "epoch": 0.12087050004358312, "grad_norm": 0.9609375, "learning_rate": 0.00037101348288632555, "loss": 5.9031, "mean_token_accuracy": 0.13564639389514924, "num_tokens": 4021972.0, "step": 2080 }, { "entropy": 6.180972766876221, "epoch": 0.12116105413022635, "grad_norm": 0.921875, "learning_rate": 0.0003699467232363012, "loss": 5.9966, "mean_token_accuracy": 0.134269118309021, "num_tokens": 4032384.0, "step": 2085 }, { "entropy": 6.158924150466919, "epoch": 0.12145160821686957, "grad_norm": 1.0078125, "learning_rate": 0.0003688773605738973, "loss": 5.8791, "mean_token_accuracy": 0.13695783466100692, "num_tokens": 4041974.0, "step": 2090 }, { "entropy": 6.068432807922363, "epoch": 0.1217421623035128, "grad_norm": 1.0625, "learning_rate": 0.00036780542421623134, "loss": 5.9396, "mean_token_accuracy": 0.13688302487134935, "num_tokens": 4051694.0, "step": 2095 }, { "entropy": 6.212389612197876, "epoch": 0.12203271639015603, "grad_norm": 1.078125, "learning_rate": 0.0003667309435509802, "loss": 6.002, "mean_token_accuracy": 0.13414775878190993, "num_tokens": 4062828.0, "step": 2100 }, { "entropy": 6.241210889816284, "epoch": 0.12232327047679925, "grad_norm": 1.1015625, "learning_rate": 0.0003656539480355741, "loss": 5.9742, "mean_token_accuracy": 0.13490709364414216, "num_tokens": 4072012.0, "step": 2105 }, { "entropy": 6.0934614658355715, "epoch": 0.12261382456344248, "grad_norm": 0.99609375, "learning_rate": 0.0003645744671963891, "loss": 5.9166, "mean_token_accuracy": 0.14179718866944313, "num_tokens": 4081587.0, "step": 2110 }, { "entropy": 6.224443626403809, "epoch": 0.12290437865008572, "grad_norm": 1.1171875, "learning_rate": 0.0003634925306279376, "loss": 5.9655, "mean_token_accuracy": 0.13849997371435166, "num_tokens": 4091374.0, "step": 2115 }, { "entropy": 6.211995363235474, "epoch": 0.12319493273672895, "grad_norm": 1.15625, "learning_rate": 0.0003624081679920574, "loss": 5.9553, "mean_token_accuracy": 0.1392621487379074, "num_tokens": 4100532.0, "step": 2120 }, { "entropy": 6.090556001663208, "epoch": 0.12348548682337217, "grad_norm": 1.046875, "learning_rate": 0.0003613214090170977, "loss": 5.9123, "mean_token_accuracy": 0.13530985191464423, "num_tokens": 4110194.0, "step": 2125 }, { "entropy": 6.171910429000855, "epoch": 0.1237760409100154, "grad_norm": 0.98828125, "learning_rate": 0.0003602322834971048, "loss": 5.9322, "mean_token_accuracy": 0.136457958817482, "num_tokens": 4119816.0, "step": 2130 }, { "entropy": 6.220505046844482, "epoch": 0.12406659499665863, "grad_norm": 1.0078125, "learning_rate": 0.0003591408212910051, "loss": 6.0141, "mean_token_accuracy": 0.13071410208940507, "num_tokens": 4130072.0, "step": 2135 }, { "entropy": 6.171974468231201, "epoch": 0.12435714908330185, "grad_norm": 1.15625, "learning_rate": 0.0003580470523217863, "loss": 5.9574, "mean_token_accuracy": 0.13431628346443175, "num_tokens": 4139101.0, "step": 2140 }, { "entropy": 6.139108896255493, "epoch": 0.12464770316994508, "grad_norm": 0.921875, "learning_rate": 0.0003569510065756771, "loss": 5.8817, "mean_token_accuracy": 0.13240241631865501, "num_tokens": 4149500.0, "step": 2145 }, { "entropy": 6.066398859024048, "epoch": 0.12493825725658832, "grad_norm": 1.09375, "learning_rate": 0.0003558527141013254, "loss": 5.8244, "mean_token_accuracy": 0.14197371304035186, "num_tokens": 4158642.0, "step": 2150 }, { "entropy": 6.045716571807861, "epoch": 0.12522881134323155, "grad_norm": 1.0390625, "learning_rate": 0.0003547522050089742, "loss": 5.8963, "mean_token_accuracy": 0.13848227709531785, "num_tokens": 4167911.0, "step": 2155 }, { "entropy": 6.142486429214477, "epoch": 0.12551936542987477, "grad_norm": 1.0625, "learning_rate": 0.00035364950946963606, "loss": 5.8062, "mean_token_accuracy": 0.14425584971904754, "num_tokens": 4177589.0, "step": 2160 }, { "entropy": 6.1435057640075685, "epoch": 0.125809919516518, "grad_norm": 1.0, "learning_rate": 0.0003525446577142663, "loss": 5.9855, "mean_token_accuracy": 0.13806044012308122, "num_tokens": 4187332.0, "step": 2165 }, { "entropy": 6.165029859542846, "epoch": 0.12610047360316123, "grad_norm": 1.109375, "learning_rate": 0.00035143768003293395, "loss": 5.9359, "mean_token_accuracy": 0.1438089445233345, "num_tokens": 4196686.0, "step": 2170 }, { "entropy": 6.102525806427002, "epoch": 0.12639102768980445, "grad_norm": 1.046875, "learning_rate": 0.0003503286067739913, "loss": 5.8595, "mean_token_accuracy": 0.14088326916098595, "num_tokens": 4205908.0, "step": 2175 }, { "entropy": 6.056342458724975, "epoch": 0.1266815817764477, "grad_norm": 0.99609375, "learning_rate": 0.00034921746834324193, "loss": 5.8166, "mean_token_accuracy": 0.14242705181241036, "num_tokens": 4215992.0, "step": 2180 }, { "entropy": 6.085980653762817, "epoch": 0.12697213586309092, "grad_norm": 0.96875, "learning_rate": 0.0003481042952031072, "loss": 5.8801, "mean_token_accuracy": 0.14103155285120011, "num_tokens": 4227094.0, "step": 2185 }, { "entropy": 6.166931819915772, "epoch": 0.12726268994973414, "grad_norm": 0.9296875, "learning_rate": 0.0003469891178717911, "loss": 5.9574, "mean_token_accuracy": 0.14575981721282005, "num_tokens": 4236432.0, "step": 2190 }, { "entropy": 6.066125965118408, "epoch": 0.12755324403637738, "grad_norm": 1.1015625, "learning_rate": 0.0003458719669224436, "loss": 5.7683, "mean_token_accuracy": 0.14632787331938743, "num_tokens": 4245305.0, "step": 2195 }, { "entropy": 6.0759042263031, "epoch": 0.1278437981230206, "grad_norm": 1.0546875, "learning_rate": 0.0003447528729823221, "loss": 5.9274, "mean_token_accuracy": 0.13991366624832152, "num_tokens": 4255445.0, "step": 2200 }, { "entropy": 6.161522722244262, "epoch": 0.12813435220966382, "grad_norm": 1.0625, "learning_rate": 0.0003436318667319525, "loss": 5.9095, "mean_token_accuracy": 0.14544984251260756, "num_tokens": 4265114.0, "step": 2205 }, { "entropy": 6.1537879467010494, "epoch": 0.12842490629630707, "grad_norm": 1.0078125, "learning_rate": 0.00034250897890428716, "loss": 5.9025, "mean_token_accuracy": 0.13503851667046546, "num_tokens": 4274787.0, "step": 2210 }, { "entropy": 6.1523271083831785, "epoch": 0.1287154603829503, "grad_norm": 1.0703125, "learning_rate": 0.0003413842402838633, "loss": 5.9789, "mean_token_accuracy": 0.13642423674464227, "num_tokens": 4284509.0, "step": 2215 }, { "entropy": 6.217884969711304, "epoch": 0.1290060144695935, "grad_norm": 1.03125, "learning_rate": 0.00034025768170595834, "loss": 5.9438, "mean_token_accuracy": 0.1450071580708027, "num_tokens": 4294128.0, "step": 2220 }, { "entropy": 6.103723478317261, "epoch": 0.12929656855623675, "grad_norm": 1.0, "learning_rate": 0.0003391293340557446, "loss": 5.9493, "mean_token_accuracy": 0.13352228179574013, "num_tokens": 4303779.0, "step": 2225 }, { "entropy": 6.194416570663452, "epoch": 0.12958712264287997, "grad_norm": 0.921875, "learning_rate": 0.0003379992282674431, "loss": 5.9859, "mean_token_accuracy": 0.14135119765996934, "num_tokens": 4314520.0, "step": 2230 }, { "entropy": 6.184441709518433, "epoch": 0.1298776767295232, "grad_norm": 0.99609375, "learning_rate": 0.0003368673953234749, "loss": 5.9214, "mean_token_accuracy": 0.13849867284297943, "num_tokens": 4324477.0, "step": 2235 }, { "entropy": 6.059356164932251, "epoch": 0.13016823081616644, "grad_norm": 1.0390625, "learning_rate": 0.00033573386625361176, "loss": 5.8373, "mean_token_accuracy": 0.14615851789712905, "num_tokens": 4334009.0, "step": 2240 }, { "entropy": 6.182690954208374, "epoch": 0.13045878490280965, "grad_norm": 0.94140625, "learning_rate": 0.00033459867213412567, "loss": 5.9892, "mean_token_accuracy": 0.13937190547585487, "num_tokens": 4343748.0, "step": 2245 }, { "entropy": 6.105481243133545, "epoch": 0.13074933898945287, "grad_norm": 1.0234375, "learning_rate": 0.000333461844086937, "loss": 5.9097, "mean_token_accuracy": 0.13728254288434982, "num_tokens": 4352957.0, "step": 2250 }, { "entropy": 6.125918388366699, "epoch": 0.13103989307609612, "grad_norm": 1.0703125, "learning_rate": 0.00033232341327876097, "loss": 5.9005, "mean_token_accuracy": 0.1423856124281883, "num_tokens": 4362505.0, "step": 2255 }, { "entropy": 6.138764905929565, "epoch": 0.13133044716273934, "grad_norm": 1.015625, "learning_rate": 0.0003311834109202531, "loss": 5.9093, "mean_token_accuracy": 0.14300098568201064, "num_tokens": 4371664.0, "step": 2260 }, { "entropy": 6.16502833366394, "epoch": 0.13162100124938259, "grad_norm": 0.953125, "learning_rate": 0.00033004186826515416, "loss": 6.0271, "mean_token_accuracy": 0.13194756507873534, "num_tokens": 4382600.0, "step": 2265 }, { "entropy": 6.100395345687867, "epoch": 0.1319115553360258, "grad_norm": 0.9453125, "learning_rate": 0.0003288988166094324, "loss": 5.927, "mean_token_accuracy": 0.13698131814599038, "num_tokens": 4393672.0, "step": 2270 }, { "entropy": 6.156081247329712, "epoch": 0.13220210942266902, "grad_norm": 0.98828125, "learning_rate": 0.00032775428729042656, "loss": 5.8873, "mean_token_accuracy": 0.13874078989028932, "num_tokens": 4403156.0, "step": 2275 }, { "entropy": 6.1929707527160645, "epoch": 0.13249266350931227, "grad_norm": 1.1640625, "learning_rate": 0.000326608311685986, "loss": 5.9783, "mean_token_accuracy": 0.13139391839504241, "num_tokens": 4412742.0, "step": 2280 }, { "entropy": 6.086524152755738, "epoch": 0.1327832175959555, "grad_norm": 0.99609375, "learning_rate": 0.0003254609212136108, "loss": 5.8274, "mean_token_accuracy": 0.15008396059274673, "num_tokens": 4422232.0, "step": 2285 }, { "entropy": 6.099150848388672, "epoch": 0.1330737716825987, "grad_norm": 1.015625, "learning_rate": 0.00032431214732959036, "loss": 5.8815, "mean_token_accuracy": 0.13752613663673402, "num_tokens": 4432405.0, "step": 2290 }, { "entropy": 6.194040107727051, "epoch": 0.13336432576924195, "grad_norm": 1.1328125, "learning_rate": 0.000323162021528141, "loss": 5.9082, "mean_token_accuracy": 0.1354072481393814, "num_tokens": 4441504.0, "step": 2295 }, { "entropy": 6.136425590515136, "epoch": 0.13365487985588517, "grad_norm": 0.9375, "learning_rate": 0.00032201057534054264, "loss": 5.9503, "mean_token_accuracy": 0.1408660188317299, "num_tokens": 4452478.0, "step": 2300 }, { "entropy": 6.132717275619507, "epoch": 0.1339454339425284, "grad_norm": 0.96875, "learning_rate": 0.00032085784033427414, "loss": 5.8967, "mean_token_accuracy": 0.13943730369210244, "num_tokens": 4462267.0, "step": 2305 }, { "entropy": 6.183125257492065, "epoch": 0.13423598802917164, "grad_norm": 1.0390625, "learning_rate": 0.0003197038481121478, "loss": 5.9465, "mean_token_accuracy": 0.14744184017181397, "num_tokens": 4472580.0, "step": 2310 }, { "entropy": 6.118056774139404, "epoch": 0.13452654211581486, "grad_norm": 1.0625, "learning_rate": 0.0003185486303114436, "loss": 5.9433, "mean_token_accuracy": 0.13671476170420646, "num_tokens": 4481768.0, "step": 2315 }, { "entropy": 6.0453300952911375, "epoch": 0.13481709620245808, "grad_norm": 0.96484375, "learning_rate": 0.0003173922186030409, "loss": 5.8219, "mean_token_accuracy": 0.14138804078102113, "num_tokens": 4491269.0, "step": 2320 }, { "entropy": 6.065739154815674, "epoch": 0.13510765028910132, "grad_norm": 1.015625, "learning_rate": 0.000316234644690551, "loss": 5.7896, "mean_token_accuracy": 0.14193628579378129, "num_tokens": 4501357.0, "step": 2325 }, { "entropy": 6.165984678268432, "epoch": 0.13539820437574454, "grad_norm": 0.921875, "learning_rate": 0.0003150759403094473, "loss": 5.829, "mean_token_accuracy": 0.14373186007142066, "num_tokens": 4510972.0, "step": 2330 }, { "entropy": 6.087504100799561, "epoch": 0.13568875846238776, "grad_norm": 1.015625, "learning_rate": 0.00031391613722619587, "loss": 5.8799, "mean_token_accuracy": 0.1405518189072609, "num_tokens": 4520887.0, "step": 2335 }, { "entropy": 6.0864016056060795, "epoch": 0.135979312549031, "grad_norm": 1.1015625, "learning_rate": 0.000312755267237384, "loss": 5.8226, "mean_token_accuracy": 0.14198774620890617, "num_tokens": 4529971.0, "step": 2340 }, { "entropy": 6.121823167800903, "epoch": 0.13626986663567422, "grad_norm": 0.9765625, "learning_rate": 0.0003115933621688488, "loss": 5.9209, "mean_token_accuracy": 0.1370498724281788, "num_tokens": 4540375.0, "step": 2345 }, { "entropy": 6.068475914001465, "epoch": 0.13656042072231747, "grad_norm": 1.109375, "learning_rate": 0.00031043045387480487, "loss": 5.8503, "mean_token_accuracy": 0.13427165821194648, "num_tokens": 4549554.0, "step": 2350 }, { "entropy": 6.043267726898193, "epoch": 0.1368509748089607, "grad_norm": 0.953125, "learning_rate": 0.0003092665742369703, "loss": 5.7866, "mean_token_accuracy": 0.14580736979842185, "num_tokens": 4558697.0, "step": 2355 }, { "entropy": 6.053312063217163, "epoch": 0.1371415288956039, "grad_norm": 1.0703125, "learning_rate": 0.00030810175516369343, "loss": 5.8247, "mean_token_accuracy": 0.14898887798190116, "num_tokens": 4567592.0, "step": 2360 }, { "entropy": 6.1225522518157955, "epoch": 0.13743208298224716, "grad_norm": 1.03125, "learning_rate": 0.0003069360285890775, "loss": 5.8661, "mean_token_accuracy": 0.14503989070653917, "num_tokens": 4576594.0, "step": 2365 }, { "entropy": 6.086222171783447, "epoch": 0.13772263706889037, "grad_norm": 1.078125, "learning_rate": 0.00030576942647210547, "loss": 5.8143, "mean_token_accuracy": 0.14734100848436354, "num_tokens": 4585317.0, "step": 2370 }, { "entropy": 6.101655912399292, "epoch": 0.1380131911555336, "grad_norm": 1.15625, "learning_rate": 0.00030460198079576355, "loss": 5.8265, "mean_token_accuracy": 0.15099092870950698, "num_tokens": 4593621.0, "step": 2375 }, { "entropy": 6.059905099868774, "epoch": 0.13830374524217684, "grad_norm": 1.2890625, "learning_rate": 0.0003034337235661648, "loss": 5.8365, "mean_token_accuracy": 0.13829366117715836, "num_tokens": 4603537.0, "step": 2380 }, { "entropy": 6.074821090698242, "epoch": 0.13859429932882006, "grad_norm": 1.15625, "learning_rate": 0.0003022646868116714, "loss": 5.8688, "mean_token_accuracy": 0.14191555976867676, "num_tokens": 4613085.0, "step": 2385 }, { "entropy": 6.087924957275391, "epoch": 0.13888485341546328, "grad_norm": 1.015625, "learning_rate": 0.0003010949025820163, "loss": 5.8978, "mean_token_accuracy": 0.13870447725057602, "num_tokens": 4622750.0, "step": 2390 }, { "entropy": 6.221824693679809, "epoch": 0.13917540750210652, "grad_norm": 1.0625, "learning_rate": 0.0002999244029474252, "loss": 5.9973, "mean_token_accuracy": 0.13631478250026702, "num_tokens": 4632787.0, "step": 2395 }, { "entropy": 6.084641647338867, "epoch": 0.13946596158874974, "grad_norm": 1.046875, "learning_rate": 0.00029875321999773684, "loss": 5.8022, "mean_token_accuracy": 0.14561834558844566, "num_tokens": 4642277.0, "step": 2400 }, { "entropy": 6.126396894454956, "epoch": 0.13975651567539296, "grad_norm": 1.03125, "learning_rate": 0.00029758138584152333, "loss": 5.8342, "mean_token_accuracy": 0.14577654898166656, "num_tokens": 4651764.0, "step": 2405 }, { "entropy": 6.0264387130737305, "epoch": 0.1400470697620362, "grad_norm": 1.0625, "learning_rate": 0.0002964089326052102, "loss": 5.8166, "mean_token_accuracy": 0.14705195873975754, "num_tokens": 4661938.0, "step": 2410 }, { "entropy": 6.151280355453491, "epoch": 0.14033762384867943, "grad_norm": 1.140625, "learning_rate": 0.0002952358924321949, "loss": 5.8146, "mean_token_accuracy": 0.14316702708601953, "num_tokens": 4670960.0, "step": 2415 }, { "entropy": 6.228445291519165, "epoch": 0.14062817793532267, "grad_norm": 1.0703125, "learning_rate": 0.00029406229748196657, "loss": 5.9368, "mean_token_accuracy": 0.1354992315173149, "num_tokens": 4680777.0, "step": 2420 }, { "entropy": 6.084039545059204, "epoch": 0.1409187320219659, "grad_norm": 1.1015625, "learning_rate": 0.0002928881799292235, "loss": 5.7482, "mean_token_accuracy": 0.15117157846689225, "num_tokens": 4690390.0, "step": 2425 }, { "entropy": 6.010620021820069, "epoch": 0.1412092861086091, "grad_norm": 1.015625, "learning_rate": 0.00029171357196299154, "loss": 5.9686, "mean_token_accuracy": 0.14216312393546104, "num_tokens": 4701133.0, "step": 2430 }, { "entropy": 6.182425451278687, "epoch": 0.14149984019525236, "grad_norm": 0.9609375, "learning_rate": 0.0002905385057857414, "loss": 5.9243, "mean_token_accuracy": 0.14200448989868164, "num_tokens": 4711962.0, "step": 2435 }, { "entropy": 6.141026830673217, "epoch": 0.14179039428189558, "grad_norm": 1.1171875, "learning_rate": 0.0002893630136125058, "loss": 5.8835, "mean_token_accuracy": 0.14305603951215745, "num_tokens": 4721748.0, "step": 2440 }, { "entropy": 6.0170793533325195, "epoch": 0.1420809483685388, "grad_norm": 0.98828125, "learning_rate": 0.0002881871276699967, "loss": 5.766, "mean_token_accuracy": 0.14974772036075593, "num_tokens": 4731178.0, "step": 2445 }, { "entropy": 5.9737021923065186, "epoch": 0.14237150245518204, "grad_norm": 1.046875, "learning_rate": 0.00028701088019572114, "loss": 5.7396, "mean_token_accuracy": 0.148803973197937, "num_tokens": 4739590.0, "step": 2450 }, { "entropy": 6.12122106552124, "epoch": 0.14266205654182526, "grad_norm": 1.03125, "learning_rate": 0.0002858343034370977, "loss": 5.9511, "mean_token_accuracy": 0.14803745746612548, "num_tokens": 4749840.0, "step": 2455 }, { "entropy": 6.207825994491577, "epoch": 0.14295261062846848, "grad_norm": 1.109375, "learning_rate": 0.00028465742965057267, "loss": 5.9567, "mean_token_accuracy": 0.13596878871321677, "num_tokens": 4759347.0, "step": 2460 }, { "entropy": 6.198162078857422, "epoch": 0.14324316471511173, "grad_norm": 1.0546875, "learning_rate": 0.00028348029110073533, "loss": 5.8925, "mean_token_accuracy": 0.14506246596574784, "num_tokens": 4769911.0, "step": 2465 }, { "entropy": 6.0526893615722654, "epoch": 0.14353371880175494, "grad_norm": 1.0234375, "learning_rate": 0.00028230292005943365, "loss": 5.8162, "mean_token_accuracy": 0.1422274589538574, "num_tokens": 4780775.0, "step": 2470 }, { "entropy": 6.0685014724731445, "epoch": 0.14382427288839816, "grad_norm": 1.0625, "learning_rate": 0.00028112534880488945, "loss": 5.8628, "mean_token_accuracy": 0.1423807591199875, "num_tokens": 4790845.0, "step": 2475 }, { "entropy": 6.090683555603027, "epoch": 0.1441148269750414, "grad_norm": 0.95703125, "learning_rate": 0.0002799476096208137, "loss": 5.8106, "mean_token_accuracy": 0.1501375898718834, "num_tokens": 4801214.0, "step": 2480 }, { "entropy": 6.07487530708313, "epoch": 0.14440538106168463, "grad_norm": 0.9765625, "learning_rate": 0.00027876973479552087, "loss": 5.7633, "mean_token_accuracy": 0.1493755668401718, "num_tokens": 4810720.0, "step": 2485 }, { "entropy": 6.07555193901062, "epoch": 0.14469593514832785, "grad_norm": 1.1171875, "learning_rate": 0.00027759175662104424, "loss": 5.7415, "mean_token_accuracy": 0.14737216681241988, "num_tokens": 4820078.0, "step": 2490 }, { "entropy": 6.0236917495727536, "epoch": 0.1449864892349711, "grad_norm": 1.015625, "learning_rate": 0.0002764137073922508, "loss": 5.8242, "mean_token_accuracy": 0.14522194787859916, "num_tokens": 4830561.0, "step": 2495 }, { "entropy": 6.035146951675415, "epoch": 0.1452770433216143, "grad_norm": 1.078125, "learning_rate": 0.00027523561940595505, "loss": 5.8653, "mean_token_accuracy": 0.1423221454024315, "num_tokens": 4839849.0, "step": 2500 }, { "entropy": 6.047043657302856, "epoch": 0.14556759740825756, "grad_norm": 0.984375, "learning_rate": 0.0002740575249600342, "loss": 5.809, "mean_token_accuracy": 0.14349082857370377, "num_tokens": 4850113.0, "step": 2505 }, { "entropy": 6.110679817199707, "epoch": 0.14585815149490078, "grad_norm": 1.171875, "learning_rate": 0.00027287945635254263, "loss": 5.7927, "mean_token_accuracy": 0.13830389603972434, "num_tokens": 4859572.0, "step": 2510 }, { "entropy": 6.028803539276123, "epoch": 0.146148705581544, "grad_norm": 1.03125, "learning_rate": 0.00027170144588082635, "loss": 5.8538, "mean_token_accuracy": 0.14364985525608062, "num_tokens": 4870238.0, "step": 2515 }, { "entropy": 6.05920820236206, "epoch": 0.14643925966818724, "grad_norm": 0.96875, "learning_rate": 0.00027052352584063763, "loss": 5.7673, "mean_token_accuracy": 0.14797088503837585, "num_tokens": 4879743.0, "step": 2520 }, { "entropy": 6.063130807876587, "epoch": 0.14672981375483046, "grad_norm": 1.03125, "learning_rate": 0.00026934572852524907, "loss": 5.8141, "mean_token_accuracy": 0.1519768014550209, "num_tokens": 4888874.0, "step": 2525 }, { "entropy": 6.081371688842774, "epoch": 0.14702036784147368, "grad_norm": 1.0390625, "learning_rate": 0.00026816808622456937, "loss": 5.8422, "mean_token_accuracy": 0.14229626208543777, "num_tokens": 4898244.0, "step": 2530 }, { "entropy": 6.169855117797852, "epoch": 0.14731092192811693, "grad_norm": 1.015625, "learning_rate": 0.0002669906312242569, "loss": 5.8431, "mean_token_accuracy": 0.1464947611093521, "num_tokens": 4907978.0, "step": 2535 }, { "entropy": 6.089100074768067, "epoch": 0.14760147601476015, "grad_norm": 0.953125, "learning_rate": 0.00026581339580483525, "loss": 5.8544, "mean_token_accuracy": 0.14092473834753036, "num_tokens": 4917647.0, "step": 2540 }, { "entropy": 6.051334381103516, "epoch": 0.14789203010140337, "grad_norm": 1.09375, "learning_rate": 0.0002646364122408082, "loss": 5.8866, "mean_token_accuracy": 0.1406030498445034, "num_tokens": 4927322.0, "step": 2545 }, { "entropy": 6.089036989212036, "epoch": 0.1481825841880466, "grad_norm": 0.90234375, "learning_rate": 0.0002634597127997749, "loss": 5.8912, "mean_token_accuracy": 0.14071550071239472, "num_tokens": 4938095.0, "step": 2550 }, { "entropy": 6.179394674301148, "epoch": 0.14847313827468983, "grad_norm": 0.98828125, "learning_rate": 0.0002622833297415445, "loss": 5.8789, "mean_token_accuracy": 0.14479369372129441, "num_tokens": 4947768.0, "step": 2555 }, { "entropy": 6.177506828308106, "epoch": 0.14876369236133305, "grad_norm": 1.0, "learning_rate": 0.0002611072953172531, "loss": 5.8719, "mean_token_accuracy": 0.14385495483875274, "num_tokens": 4957776.0, "step": 2560 }, { "entropy": 6.029763984680176, "epoch": 0.1490542464479763, "grad_norm": 1.09375, "learning_rate": 0.00025993164176847845, "loss": 5.8281, "mean_token_accuracy": 0.15120696425437927, "num_tokens": 4966992.0, "step": 2565 }, { "entropy": 6.071042394638061, "epoch": 0.14934480053461952, "grad_norm": 1.03125, "learning_rate": 0.0002587564013263564, "loss": 5.8395, "mean_token_accuracy": 0.14219800606369973, "num_tokens": 4976216.0, "step": 2570 }, { "entropy": 6.099312973022461, "epoch": 0.14963535462126276, "grad_norm": 1.109375, "learning_rate": 0.0002575816062106974, "loss": 5.7508, "mean_token_accuracy": 0.14777441769838334, "num_tokens": 4985143.0, "step": 2575 }, { "entropy": 6.13524899482727, "epoch": 0.14992590870790598, "grad_norm": 1.1015625, "learning_rate": 0.00025640728862910293, "loss": 5.9437, "mean_token_accuracy": 0.13997391015291213, "num_tokens": 4995058.0, "step": 2580 }, { "entropy": 6.081535530090332, "epoch": 0.1502164627945492, "grad_norm": 1.0625, "learning_rate": 0.00025523348077608285, "loss": 5.7767, "mean_token_accuracy": 0.14847566336393356, "num_tokens": 5003930.0, "step": 2585 }, { "entropy": 6.133546257019043, "epoch": 0.15050701688119245, "grad_norm": 0.98828125, "learning_rate": 0.00025406021483217225, "loss": 5.8917, "mean_token_accuracy": 0.14307338669896125, "num_tokens": 5013907.0, "step": 2590 }, { "entropy": 6.05073037147522, "epoch": 0.15079757096783566, "grad_norm": 1.03125, "learning_rate": 0.00025288752296304963, "loss": 5.7465, "mean_token_accuracy": 0.1435894712805748, "num_tokens": 5024028.0, "step": 2595 }, { "entropy": 6.054743099212646, "epoch": 0.15108812505447888, "grad_norm": 1.0546875, "learning_rate": 0.000251715437318655, "loss": 5.8135, "mean_token_accuracy": 0.14531584978103637, "num_tokens": 5033555.0, "step": 2600 }, { "entropy": 6.053986740112305, "epoch": 0.15137867914112213, "grad_norm": 1.0078125, "learning_rate": 0.0002505439900323084, "loss": 5.862, "mean_token_accuracy": 0.14667272865772246, "num_tokens": 5043180.0, "step": 2605 }, { "entropy": 6.088306331634522, "epoch": 0.15166923322776535, "grad_norm": 0.9921875, "learning_rate": 0.00024937321321982894, "loss": 5.7691, "mean_token_accuracy": 0.14470289498567582, "num_tokens": 5052220.0, "step": 2610 }, { "entropy": 6.0878173351287845, "epoch": 0.15195978731440857, "grad_norm": 1.03125, "learning_rate": 0.00024820313897865433, "loss": 5.7726, "mean_token_accuracy": 0.150884909927845, "num_tokens": 5061544.0, "step": 2615 }, { "entropy": 6.131782722473145, "epoch": 0.15225034140105181, "grad_norm": 1.09375, "learning_rate": 0.00024703379938696105, "loss": 5.9184, "mean_token_accuracy": 0.1400494635105133, "num_tokens": 5070611.0, "step": 2620 }, { "entropy": 6.153847122192383, "epoch": 0.15254089548769503, "grad_norm": 1.0703125, "learning_rate": 0.00024586522650278447, "loss": 5.874, "mean_token_accuracy": 0.1386608324944973, "num_tokens": 5080750.0, "step": 2625 }, { "entropy": 6.137750577926636, "epoch": 0.15283144957433825, "grad_norm": 0.99609375, "learning_rate": 0.00024469745236314064, "loss": 5.8592, "mean_token_accuracy": 0.13961437940597535, "num_tokens": 5090067.0, "step": 2630 }, { "entropy": 6.123914098739624, "epoch": 0.1531220036609815, "grad_norm": 1.0546875, "learning_rate": 0.00024353050898314767, "loss": 5.8592, "mean_token_accuracy": 0.14175378978252412, "num_tokens": 5100053.0, "step": 2635 }, { "entropy": 6.1782163143157955, "epoch": 0.15341255774762472, "grad_norm": 1.0, "learning_rate": 0.00024236442835514743, "loss": 5.8117, "mean_token_accuracy": 0.1458034932613373, "num_tokens": 5109296.0, "step": 2640 }, { "entropy": 6.071597719192505, "epoch": 0.15370311183426794, "grad_norm": 1.1328125, "learning_rate": 0.00024119924244782965, "loss": 5.8673, "mean_token_accuracy": 0.14649384766817092, "num_tokens": 5118744.0, "step": 2645 }, { "entropy": 6.130202531814575, "epoch": 0.15399366592091118, "grad_norm": 1.046875, "learning_rate": 0.00024003498320535462, "loss": 5.8775, "mean_token_accuracy": 0.1437153235077858, "num_tokens": 5127763.0, "step": 2650 }, { "entropy": 6.148328161239624, "epoch": 0.1542842200075544, "grad_norm": 1.171875, "learning_rate": 0.00023887168254647727, "loss": 5.9019, "mean_token_accuracy": 0.14456916153430938, "num_tokens": 5138067.0, "step": 2655 }, { "entropy": 6.150043106079101, "epoch": 0.15457477409419765, "grad_norm": 0.98046875, "learning_rate": 0.00023770937236367308, "loss": 5.8983, "mean_token_accuracy": 0.1399595282971859, "num_tokens": 5148280.0, "step": 2660 }, { "entropy": 6.082090711593628, "epoch": 0.15486532818084087, "grad_norm": 1.046875, "learning_rate": 0.00023654808452226278, "loss": 5.7799, "mean_token_accuracy": 0.15123223662376403, "num_tokens": 5158182.0, "step": 2665 }, { "entropy": 6.0570969581604, "epoch": 0.15515588226748409, "grad_norm": 1.0078125, "learning_rate": 0.00023538785085953912, "loss": 5.7383, "mean_token_accuracy": 0.14949096888303756, "num_tokens": 5167524.0, "step": 2670 }, { "entropy": 6.062791872024536, "epoch": 0.15544643635412733, "grad_norm": 0.98046875, "learning_rate": 0.00023422870318389404, "loss": 5.7904, "mean_token_accuracy": 0.13950854763388634, "num_tokens": 5177581.0, "step": 2675 }, { "entropy": 6.112806892395019, "epoch": 0.15573699044077055, "grad_norm": 0.97265625, "learning_rate": 0.0002330706732739468, "loss": 5.783, "mean_token_accuracy": 0.14393220096826553, "num_tokens": 5187156.0, "step": 2680 }, { "entropy": 6.065037822723388, "epoch": 0.15602754452741377, "grad_norm": 0.9765625, "learning_rate": 0.00023191379287767211, "loss": 5.8843, "mean_token_accuracy": 0.14143779054284095, "num_tokens": 5198015.0, "step": 2685 }, { "entropy": 6.149574279785156, "epoch": 0.15631809861405702, "grad_norm": 0.98828125, "learning_rate": 0.0002307580937115305, "loss": 5.8311, "mean_token_accuracy": 0.14658329337835313, "num_tokens": 5207961.0, "step": 2690 }, { "entropy": 6.041101455688477, "epoch": 0.15660865270070023, "grad_norm": 1.0, "learning_rate": 0.00022960360745959846, "loss": 5.8328, "mean_token_accuracy": 0.14369555339217185, "num_tokens": 5217318.0, "step": 2695 }, { "entropy": 6.05631422996521, "epoch": 0.15689920678734345, "grad_norm": 1.0234375, "learning_rate": 0.00022845036577269972, "loss": 5.6925, "mean_token_accuracy": 0.1581657573580742, "num_tokens": 5226393.0, "step": 2700 }, { "entropy": 5.980499124526977, "epoch": 0.1571897608739867, "grad_norm": 1.03125, "learning_rate": 0.00022729840026753777, "loss": 5.6844, "mean_token_accuracy": 0.14944447427988053, "num_tokens": 5236003.0, "step": 2705 }, { "entropy": 6.103996896743775, "epoch": 0.15748031496062992, "grad_norm": 1.0703125, "learning_rate": 0.0002261477425258287, "loss": 5.853, "mean_token_accuracy": 0.1508561223745346, "num_tokens": 5246472.0, "step": 2710 }, { "entropy": 6.170705938339234, "epoch": 0.15777086904727314, "grad_norm": 1.046875, "learning_rate": 0.0002249984240934358, "loss": 5.931, "mean_token_accuracy": 0.1417808599770069, "num_tokens": 5256921.0, "step": 2715 }, { "entropy": 5.98744764328003, "epoch": 0.15806142313391638, "grad_norm": 1.0234375, "learning_rate": 0.00022385047647950464, "loss": 5.7333, "mean_token_accuracy": 0.15196397304534912, "num_tokens": 5266832.0, "step": 2720 }, { "entropy": 5.991374492645264, "epoch": 0.1583519772205596, "grad_norm": 1.09375, "learning_rate": 0.0002227039311555986, "loss": 5.7585, "mean_token_accuracy": 0.1483888141810894, "num_tokens": 5276386.0, "step": 2725 }, { "entropy": 6.100140237808228, "epoch": 0.15864253130720282, "grad_norm": 1.015625, "learning_rate": 0.0002215588195548372, "loss": 5.7618, "mean_token_accuracy": 0.14937272816896438, "num_tokens": 5285959.0, "step": 2730 }, { "entropy": 6.154450845718384, "epoch": 0.15893308539384607, "grad_norm": 1.0078125, "learning_rate": 0.00022041517307103337, "loss": 5.7947, "mean_token_accuracy": 0.14946697056293487, "num_tokens": 5295457.0, "step": 2735 }, { "entropy": 6.096340894699097, "epoch": 0.1592236394804893, "grad_norm": 0.8984375, "learning_rate": 0.0002192730230578331, "loss": 5.8109, "mean_token_accuracy": 0.14488886743783952, "num_tokens": 5306092.0, "step": 2740 }, { "entropy": 6.052392101287841, "epoch": 0.15951419356713253, "grad_norm": 1.140625, "learning_rate": 0.0002181324008278559, "loss": 5.8358, "mean_token_accuracy": 0.14705842584371567, "num_tokens": 5314960.0, "step": 2745 }, { "entropy": 5.998585510253906, "epoch": 0.15980474765377575, "grad_norm": 1.0625, "learning_rate": 0.00021699333765183655, "loss": 5.7745, "mean_token_accuracy": 0.15136635154485703, "num_tokens": 5324390.0, "step": 2750 }, { "entropy": 6.0869420051574705, "epoch": 0.16009530174041897, "grad_norm": 0.98046875, "learning_rate": 0.0002158558647577673, "loss": 5.812, "mean_token_accuracy": 0.14605457559227944, "num_tokens": 5334650.0, "step": 2755 }, { "entropy": 6.197345113754272, "epoch": 0.16038585582706222, "grad_norm": 0.93359375, "learning_rate": 0.00021472001333004215, "loss": 5.8713, "mean_token_accuracy": 0.1437445230782032, "num_tokens": 5343958.0, "step": 2760 }, { "entropy": 6.1040606021881105, "epoch": 0.16067640991370544, "grad_norm": 1.0078125, "learning_rate": 0.00021358581450860186, "loss": 5.8254, "mean_token_accuracy": 0.14942396581172943, "num_tokens": 5353428.0, "step": 2765 }, { "entropy": 6.14132776260376, "epoch": 0.16096696400034866, "grad_norm": 0.9375, "learning_rate": 0.0002124532993880799, "loss": 5.8757, "mean_token_accuracy": 0.14288587495684624, "num_tokens": 5364132.0, "step": 2770 }, { "entropy": 6.148387336730957, "epoch": 0.1612575180869919, "grad_norm": 0.94921875, "learning_rate": 0.00021132249901695044, "loss": 5.853, "mean_token_accuracy": 0.14395386576652527, "num_tokens": 5374066.0, "step": 2775 }, { "entropy": 6.0949657440185545, "epoch": 0.16154807217363512, "grad_norm": 1.1015625, "learning_rate": 0.00021019344439667705, "loss": 5.8179, "mean_token_accuracy": 0.1493311658501625, "num_tokens": 5383479.0, "step": 2780 }, { "entropy": 5.99338116645813, "epoch": 0.16183862626027834, "grad_norm": 0.9921875, "learning_rate": 0.00020906616648086213, "loss": 5.683, "mean_token_accuracy": 0.15754484832286836, "num_tokens": 5392894.0, "step": 2785 }, { "entropy": 6.032156229019165, "epoch": 0.1621291803469216, "grad_norm": 0.984375, "learning_rate": 0.00020794069617439942, "loss": 5.8144, "mean_token_accuracy": 0.1486166849732399, "num_tokens": 5402886.0, "step": 2790 }, { "entropy": 6.081609296798706, "epoch": 0.1624197344335648, "grad_norm": 1.015625, "learning_rate": 0.00020681706433262593, "loss": 5.7009, "mean_token_accuracy": 0.1564814940094948, "num_tokens": 5411656.0, "step": 2795 }, { "entropy": 6.125734090805054, "epoch": 0.16271028852020802, "grad_norm": 1.046875, "learning_rate": 0.00020569530176047602, "loss": 5.8456, "mean_token_accuracy": 0.14672704488039018, "num_tokens": 5421650.0, "step": 2800 }, { "entropy": 6.053952217102051, "epoch": 0.16300084260685127, "grad_norm": 0.97265625, "learning_rate": 0.0002045754392116374, "loss": 5.7943, "mean_token_accuracy": 0.153436142206192, "num_tokens": 5431154.0, "step": 2805 }, { "entropy": 6.064789199829102, "epoch": 0.1632913966934945, "grad_norm": 1.0234375, "learning_rate": 0.00020345750738770757, "loss": 5.8505, "mean_token_accuracy": 0.14092413783073426, "num_tokens": 5441464.0, "step": 2810 }, { "entropy": 6.110472869873047, "epoch": 0.16358195078013774, "grad_norm": 0.9765625, "learning_rate": 0.00020234153693735214, "loss": 5.8195, "mean_token_accuracy": 0.14966847896575927, "num_tokens": 5452077.0, "step": 2815 }, { "entropy": 6.1211137771606445, "epoch": 0.16387250486678095, "grad_norm": 1.0625, "learning_rate": 0.0002012275584554647, "loss": 5.8616, "mean_token_accuracy": 0.14003771468997, "num_tokens": 5461792.0, "step": 2820 }, { "entropy": 6.047807073593139, "epoch": 0.16416305895342417, "grad_norm": 1.0234375, "learning_rate": 0.00020011560248232803, "loss": 5.7094, "mean_token_accuracy": 0.15667359828948973, "num_tokens": 5471637.0, "step": 2825 }, { "entropy": 6.167580080032349, "epoch": 0.16445361304006742, "grad_norm": 1.015625, "learning_rate": 0.00019900569950277692, "loss": 5.9432, "mean_token_accuracy": 0.14090102761983872, "num_tokens": 5482341.0, "step": 2830 }, { "entropy": 6.063886308670044, "epoch": 0.16474416712671064, "grad_norm": 1.078125, "learning_rate": 0.00019789787994536228, "loss": 5.7621, "mean_token_accuracy": 0.15223144590854645, "num_tokens": 5492335.0, "step": 2835 }, { "entropy": 6.057334852218628, "epoch": 0.16503472121335386, "grad_norm": 0.97265625, "learning_rate": 0.00019679217418151667, "loss": 5.7486, "mean_token_accuracy": 0.15512095093727113, "num_tokens": 5501879.0, "step": 2840 }, { "entropy": 6.074555587768555, "epoch": 0.1653252752999971, "grad_norm": 0.91015625, "learning_rate": 0.00019568861252472236, "loss": 5.7906, "mean_token_accuracy": 0.15294522792100906, "num_tokens": 5512419.0, "step": 2845 }, { "entropy": 6.052340698242188, "epoch": 0.16561582938664032, "grad_norm": 1.2578125, "learning_rate": 0.00019458722522967952, "loss": 5.6966, "mean_token_accuracy": 0.15103003978729249, "num_tokens": 5521017.0, "step": 2850 }, { "entropy": 6.01881628036499, "epoch": 0.16590638347328354, "grad_norm": 1.1015625, "learning_rate": 0.00019348804249147723, "loss": 5.7061, "mean_token_accuracy": 0.14818918108940124, "num_tokens": 5530916.0, "step": 2855 }, { "entropy": 6.014477682113648, "epoch": 0.1661969375599268, "grad_norm": 0.97265625, "learning_rate": 0.0001923910944447655, "loss": 5.6511, "mean_token_accuracy": 0.16009259968996048, "num_tokens": 5540228.0, "step": 2860 }, { "entropy": 6.070650434494018, "epoch": 0.16648749164657, "grad_norm": 1.03125, "learning_rate": 0.00019129641116292928, "loss": 5.7298, "mean_token_accuracy": 0.15569742172956466, "num_tokens": 5549921.0, "step": 2865 }, { "entropy": 6.1058845043182375, "epoch": 0.16677804573321323, "grad_norm": 1.078125, "learning_rate": 0.00019020402265726343, "loss": 5.8318, "mean_token_accuracy": 0.143881855905056, "num_tokens": 5560308.0, "step": 2870 }, { "entropy": 6.0743451595306395, "epoch": 0.16706859981985647, "grad_norm": 1.125, "learning_rate": 0.0001891139588761509, "loss": 5.7595, "mean_token_accuracy": 0.1477736845612526, "num_tokens": 5569026.0, "step": 2875 }, { "entropy": 6.015086269378662, "epoch": 0.1673591539064997, "grad_norm": 1.03125, "learning_rate": 0.00018802624970424076, "loss": 5.725, "mean_token_accuracy": 0.15312366485595702, "num_tokens": 5578812.0, "step": 2880 }, { "entropy": 6.130430459976196, "epoch": 0.1676497079931429, "grad_norm": 1.015625, "learning_rate": 0.00018694092496162945, "loss": 5.831, "mean_token_accuracy": 0.14988720864057542, "num_tokens": 5588763.0, "step": 2885 }, { "entropy": 6.03325777053833, "epoch": 0.16794026207978616, "grad_norm": 1.0859375, "learning_rate": 0.00018585801440304306, "loss": 5.6702, "mean_token_accuracy": 0.15091593116521834, "num_tokens": 5597719.0, "step": 2890 }, { "entropy": 6.071933698654175, "epoch": 0.16823081616642938, "grad_norm": 1.0390625, "learning_rate": 0.00018477754771702165, "loss": 5.7461, "mean_token_accuracy": 0.14670687392354012, "num_tokens": 5607376.0, "step": 2895 }, { "entropy": 6.061179494857788, "epoch": 0.16852137025307262, "grad_norm": 1.0078125, "learning_rate": 0.00018369955452510506, "loss": 5.757, "mean_token_accuracy": 0.14725697934627532, "num_tokens": 5617227.0, "step": 2900 }, { "entropy": 5.983553457260132, "epoch": 0.16881192433971584, "grad_norm": 1.0078125, "learning_rate": 0.0001826240643810212, "loss": 5.6661, "mean_token_accuracy": 0.1538853704929352, "num_tokens": 5626241.0, "step": 2905 }, { "entropy": 6.049111032485962, "epoch": 0.16910247842635906, "grad_norm": 0.984375, "learning_rate": 0.0001815511067698758, "loss": 5.8158, "mean_token_accuracy": 0.14741248339414598, "num_tokens": 5636969.0, "step": 2910 }, { "entropy": 5.976936292648316, "epoch": 0.1693930325130023, "grad_norm": 0.984375, "learning_rate": 0.0001804807111073436, "loss": 5.6947, "mean_token_accuracy": 0.14919188469648362, "num_tokens": 5646430.0, "step": 2915 }, { "entropy": 6.0697746753692625, "epoch": 0.16968358659964553, "grad_norm": 1.046875, "learning_rate": 0.0001794129067388625, "loss": 5.7424, "mean_token_accuracy": 0.15479477643966674, "num_tokens": 5656049.0, "step": 2920 }, { "entropy": 6.103779983520508, "epoch": 0.16997414068628874, "grad_norm": 1.0625, "learning_rate": 0.00017834772293882868, "loss": 5.7761, "mean_token_accuracy": 0.14724364280700683, "num_tokens": 5665657.0, "step": 2925 }, { "entropy": 6.004815101623535, "epoch": 0.170264694772932, "grad_norm": 1.078125, "learning_rate": 0.000177285188909794, "loss": 5.6713, "mean_token_accuracy": 0.14558330550789833, "num_tokens": 5675306.0, "step": 2930 }, { "entropy": 6.077218818664551, "epoch": 0.1705552488595752, "grad_norm": 1.0625, "learning_rate": 0.0001762253337816656, "loss": 5.8962, "mean_token_accuracy": 0.14256232976913452, "num_tokens": 5685295.0, "step": 2935 }, { "entropy": 5.960014724731446, "epoch": 0.17084580294621843, "grad_norm": 1.046875, "learning_rate": 0.00017516818661090738, "loss": 5.5849, "mean_token_accuracy": 0.153633750975132, "num_tokens": 5694626.0, "step": 2940 }, { "entropy": 6.0667417526245115, "epoch": 0.17113635703286167, "grad_norm": 0.98046875, "learning_rate": 0.0001741137763797428, "loss": 5.7658, "mean_token_accuracy": 0.1459375351667404, "num_tokens": 5704247.0, "step": 2945 }, { "entropy": 6.021193981170654, "epoch": 0.1714269111195049, "grad_norm": 1.1484375, "learning_rate": 0.00017306213199536115, "loss": 5.6436, "mean_token_accuracy": 0.16296780705451966, "num_tokens": 5712536.0, "step": 2950 }, { "entropy": 6.047375822067261, "epoch": 0.1717174652061481, "grad_norm": 1.046875, "learning_rate": 0.0001720132822891243, "loss": 5.7144, "mean_token_accuracy": 0.1537844330072403, "num_tokens": 5721641.0, "step": 2955 }, { "entropy": 5.961157178878784, "epoch": 0.17200801929279136, "grad_norm": 1.0234375, "learning_rate": 0.0001709672560157769, "loss": 5.6832, "mean_token_accuracy": 0.15097189098596572, "num_tokens": 5731404.0, "step": 2960 }, { "entropy": 6.061686420440674, "epoch": 0.17229857337943458, "grad_norm": 0.984375, "learning_rate": 0.00016992408185265758, "loss": 5.7934, "mean_token_accuracy": 0.14903522282838821, "num_tokens": 5741006.0, "step": 2965 }, { "entropy": 6.143270587921142, "epoch": 0.17258912746607782, "grad_norm": 1.0, "learning_rate": 0.00016888378839891298, "loss": 5.7955, "mean_token_accuracy": 0.14605200439691543, "num_tokens": 5751124.0, "step": 2970 }, { "entropy": 6.10756402015686, "epoch": 0.17287968155272104, "grad_norm": 1.0703125, "learning_rate": 0.0001678464041747137, "loss": 5.7795, "mean_token_accuracy": 0.15010488629341126, "num_tokens": 5761261.0, "step": 2975 }, { "entropy": 6.070679616928101, "epoch": 0.17317023563936426, "grad_norm": 1.0546875, "learning_rate": 0.00016681195762047223, "loss": 5.7228, "mean_token_accuracy": 0.1600602760910988, "num_tokens": 5769855.0, "step": 2980 }, { "entropy": 6.0531364440917965, "epoch": 0.1734607897260075, "grad_norm": 1.1015625, "learning_rate": 0.00016578047709606337, "loss": 5.7577, "mean_token_accuracy": 0.14312802702188493, "num_tokens": 5779394.0, "step": 2985 }, { "entropy": 6.0887946605682375, "epoch": 0.17375134381265073, "grad_norm": 0.98046875, "learning_rate": 0.00016475199088004678, "loss": 5.8185, "mean_token_accuracy": 0.15020564645528794, "num_tokens": 5789442.0, "step": 2990 }, { "entropy": 6.0648823261260985, "epoch": 0.17404189789929395, "grad_norm": 1.015625, "learning_rate": 0.00016372652716889163, "loss": 5.7086, "mean_token_accuracy": 0.15479273200035096, "num_tokens": 5798269.0, "step": 2995 }, { "entropy": 6.102453422546387, "epoch": 0.1743324519859372, "grad_norm": 1.015625, "learning_rate": 0.0001627041140762035, "loss": 5.7227, "mean_token_accuracy": 0.14934398829936982, "num_tokens": 5808608.0, "step": 3000 }, { "entropy": 6.100079727172852, "epoch": 0.1746230060725804, "grad_norm": 1.1484375, "learning_rate": 0.00016168477963195382, "loss": 5.777, "mean_token_accuracy": 0.14889512956142426, "num_tokens": 5818134.0, "step": 3005 }, { "entropy": 6.064034843444825, "epoch": 0.17491356015922363, "grad_norm": 0.984375, "learning_rate": 0.0001606685517817114, "loss": 5.7285, "mean_token_accuracy": 0.15235000252723693, "num_tokens": 5828237.0, "step": 3010 }, { "entropy": 6.020033311843872, "epoch": 0.17520411424586688, "grad_norm": 1.015625, "learning_rate": 0.00015965545838587592, "loss": 5.7206, "mean_token_accuracy": 0.1511564001441002, "num_tokens": 5837597.0, "step": 3015 }, { "entropy": 6.070970773696899, "epoch": 0.1754946683325101, "grad_norm": 1.0546875, "learning_rate": 0.00015864552721891467, "loss": 5.6728, "mean_token_accuracy": 0.1517861396074295, "num_tokens": 5846728.0, "step": 3020 }, { "entropy": 6.065035820007324, "epoch": 0.17578522241915331, "grad_norm": 0.91796875, "learning_rate": 0.00015763878596860076, "loss": 5.831, "mean_token_accuracy": 0.14596770107746124, "num_tokens": 5857655.0, "step": 3025 }, { "entropy": 5.956533288955688, "epoch": 0.17607577650579656, "grad_norm": 0.94921875, "learning_rate": 0.00015663526223525412, "loss": 5.6927, "mean_token_accuracy": 0.1539006546139717, "num_tokens": 5868854.0, "step": 3030 }, { "entropy": 6.1010034561157225, "epoch": 0.17636633059243978, "grad_norm": 0.9609375, "learning_rate": 0.0001556349835309848, "loss": 5.8291, "mean_token_accuracy": 0.1526801697909832, "num_tokens": 5879362.0, "step": 3035 }, { "entropy": 6.131938362121582, "epoch": 0.176656884679083, "grad_norm": 1.0, "learning_rate": 0.0001546379772789389, "loss": 5.8241, "mean_token_accuracy": 0.15398107543587686, "num_tokens": 5888753.0, "step": 3040 }, { "entropy": 6.086278247833252, "epoch": 0.17694743876572624, "grad_norm": 1.078125, "learning_rate": 0.00015364427081254622, "loss": 5.7149, "mean_token_accuracy": 0.15152743011713027, "num_tokens": 5898860.0, "step": 3045 }, { "entropy": 6.081228303909302, "epoch": 0.17723799285236946, "grad_norm": 1.09375, "learning_rate": 0.00015265389137477165, "loss": 5.7008, "mean_token_accuracy": 0.14895583540201188, "num_tokens": 5908191.0, "step": 3050 }, { "entropy": 5.977576351165771, "epoch": 0.1775285469390127, "grad_norm": 1.046875, "learning_rate": 0.00015166686611736786, "loss": 5.6745, "mean_token_accuracy": 0.15492946803569793, "num_tokens": 5918266.0, "step": 3055 }, { "entropy": 5.968826675415039, "epoch": 0.17781910102565593, "grad_norm": 0.99609375, "learning_rate": 0.00015068322210013064, "loss": 5.6773, "mean_token_accuracy": 0.15108609497547149, "num_tokens": 5927628.0, "step": 3060 }, { "entropy": 6.01093373298645, "epoch": 0.17810965511229915, "grad_norm": 1.0859375, "learning_rate": 0.0001497029862901578, "loss": 5.6803, "mean_token_accuracy": 0.15184399709105492, "num_tokens": 5937323.0, "step": 3065 }, { "entropy": 6.0069934844970705, "epoch": 0.1784002091989424, "grad_norm": 1.0, "learning_rate": 0.00014872618556110905, "loss": 5.7033, "mean_token_accuracy": 0.15416699647903442, "num_tokens": 5946767.0, "step": 3070 }, { "entropy": 6.092588567733765, "epoch": 0.1786907632855856, "grad_norm": 1.046875, "learning_rate": 0.00014775284669246992, "loss": 5.7472, "mean_token_accuracy": 0.1522138647735119, "num_tokens": 5956469.0, "step": 3075 }, { "entropy": 6.093777990341186, "epoch": 0.17898131737222883, "grad_norm": 1.0078125, "learning_rate": 0.00014678299636881716, "loss": 5.7564, "mean_token_accuracy": 0.15174834728240966, "num_tokens": 5965882.0, "step": 3080 }, { "entropy": 6.043925333023071, "epoch": 0.17927187145887208, "grad_norm": 1.0703125, "learning_rate": 0.0001458166611790873, "loss": 5.6561, "mean_token_accuracy": 0.15863914489746095, "num_tokens": 5974696.0, "step": 3085 }, { "entropy": 6.098592472076416, "epoch": 0.1795624255455153, "grad_norm": 1.0078125, "learning_rate": 0.00014485386761584773, "loss": 5.771, "mean_token_accuracy": 0.15556092113256453, "num_tokens": 5984525.0, "step": 3090 }, { "entropy": 6.02237606048584, "epoch": 0.17985297963215852, "grad_norm": 1.046875, "learning_rate": 0.00014389464207457042, "loss": 5.7049, "mean_token_accuracy": 0.1512262910604477, "num_tokens": 5993382.0, "step": 3095 }, { "entropy": 5.985553407669068, "epoch": 0.18014353371880176, "grad_norm": 1.1328125, "learning_rate": 0.00014293901085290795, "loss": 5.6662, "mean_token_accuracy": 0.15523958802223206, "num_tokens": 6003127.0, "step": 3100 }, { "entropy": 6.131969976425171, "epoch": 0.18043408780544498, "grad_norm": 1.0859375, "learning_rate": 0.00014198700014997307, "loss": 5.8075, "mean_token_accuracy": 0.1487472876906395, "num_tokens": 6013336.0, "step": 3105 }, { "entropy": 6.103825998306275, "epoch": 0.1807246418920882, "grad_norm": 0.953125, "learning_rate": 0.00014103863606562016, "loss": 5.8092, "mean_token_accuracy": 0.14103479385375978, "num_tokens": 6023446.0, "step": 3110 }, { "entropy": 6.070917987823487, "epoch": 0.18101519597873145, "grad_norm": 0.96875, "learning_rate": 0.00014009394459972964, "loss": 5.7103, "mean_token_accuracy": 0.15287835896015167, "num_tokens": 6034264.0, "step": 3115 }, { "entropy": 6.039470911026001, "epoch": 0.18130575006537467, "grad_norm": 1.1484375, "learning_rate": 0.00013915295165149513, "loss": 5.7217, "mean_token_accuracy": 0.15219166725873948, "num_tokens": 6043048.0, "step": 3120 }, { "entropy": 6.063743162155151, "epoch": 0.18159630415201788, "grad_norm": 0.96875, "learning_rate": 0.00013821568301871384, "loss": 5.7789, "mean_token_accuracy": 0.14522581547498703, "num_tokens": 6053255.0, "step": 3125 }, { "entropy": 5.9902284145355225, "epoch": 0.18188685823866113, "grad_norm": 1.015625, "learning_rate": 0.00013728216439707862, "loss": 5.6082, "mean_token_accuracy": 0.1634930595755577, "num_tokens": 6062968.0, "step": 3130 }, { "entropy": 6.016612720489502, "epoch": 0.18217741232530435, "grad_norm": 1.078125, "learning_rate": 0.00013635242137947419, "loss": 5.5827, "mean_token_accuracy": 0.16410193741321563, "num_tokens": 6072071.0, "step": 3135 }, { "entropy": 6.09333872795105, "epoch": 0.1824679664119476, "grad_norm": 0.9609375, "learning_rate": 0.00013542647945527498, "loss": 5.8434, "mean_token_accuracy": 0.1482686847448349, "num_tokens": 6081887.0, "step": 3140 }, { "entropy": 6.0756289005279545, "epoch": 0.18275852049859082, "grad_norm": 0.97265625, "learning_rate": 0.0001345043640096465, "loss": 5.7092, "mean_token_accuracy": 0.1483364373445511, "num_tokens": 6092254.0, "step": 3145 }, { "entropy": 6.047313165664673, "epoch": 0.18304907458523403, "grad_norm": 1.0234375, "learning_rate": 0.00013358610032284957, "loss": 5.72, "mean_token_accuracy": 0.15549504160881042, "num_tokens": 6102497.0, "step": 3150 }, { "entropy": 6.062089967727661, "epoch": 0.18333962867187728, "grad_norm": 1.1015625, "learning_rate": 0.000132671713569547, "loss": 5.734, "mean_token_accuracy": 0.1556983083486557, "num_tokens": 6112277.0, "step": 3155 }, { "entropy": 6.076785612106323, "epoch": 0.1836301827585205, "grad_norm": 1.1328125, "learning_rate": 0.0001317612288181136, "loss": 5.7002, "mean_token_accuracy": 0.1554645776748657, "num_tokens": 6121471.0, "step": 3160 }, { "entropy": 6.072947454452515, "epoch": 0.18392073684516372, "grad_norm": 0.953125, "learning_rate": 0.00013085467102994864, "loss": 5.7982, "mean_token_accuracy": 0.14808910414576532, "num_tokens": 6131717.0, "step": 3165 }, { "entropy": 6.053280687332153, "epoch": 0.18421129093180696, "grad_norm": 1.125, "learning_rate": 0.00012995206505879198, "loss": 5.708, "mean_token_accuracy": 0.15172735154628753, "num_tokens": 6142002.0, "step": 3170 }, { "entropy": 6.130218172073365, "epoch": 0.18450184501845018, "grad_norm": 0.9921875, "learning_rate": 0.0001290534356500421, "loss": 5.7836, "mean_token_accuracy": 0.14181277081370353, "num_tokens": 6152034.0, "step": 3175 }, { "entropy": 6.088483095169067, "epoch": 0.1847923991050934, "grad_norm": 1.0390625, "learning_rate": 0.00012815880744007827, "loss": 5.7851, "mean_token_accuracy": 0.1522969976067543, "num_tokens": 6161737.0, "step": 3180 }, { "entropy": 5.9690474510192875, "epoch": 0.18508295319173665, "grad_norm": 1.0703125, "learning_rate": 0.00012726820495558483, "loss": 5.6982, "mean_token_accuracy": 0.153187994658947, "num_tokens": 6170088.0, "step": 3185 }, { "entropy": 6.107737588882446, "epoch": 0.18537350727837987, "grad_norm": 1.046875, "learning_rate": 0.00012638165261287868, "loss": 5.8119, "mean_token_accuracy": 0.1520651862025261, "num_tokens": 6179914.0, "step": 3190 }, { "entropy": 6.0809577941894535, "epoch": 0.1856640613650231, "grad_norm": 1.0703125, "learning_rate": 0.0001254991747172402, "loss": 5.7642, "mean_token_accuracy": 0.14715377390384674, "num_tokens": 6191145.0, "step": 3195 }, { "entropy": 5.9991779804229735, "epoch": 0.18595461545166633, "grad_norm": 0.97265625, "learning_rate": 0.00012462079546224662, "loss": 5.6445, "mean_token_accuracy": 0.15297795236110687, "num_tokens": 6201587.0, "step": 3200 }, { "entropy": 6.071609449386597, "epoch": 0.18624516953830955, "grad_norm": 0.98046875, "learning_rate": 0.00012374653892910896, "loss": 5.7869, "mean_token_accuracy": 0.14952635765075684, "num_tokens": 6210659.0, "step": 3205 }, { "entropy": 6.06493353843689, "epoch": 0.1865357236249528, "grad_norm": 1.1015625, "learning_rate": 0.00012287642908601166, "loss": 5.6995, "mean_token_accuracy": 0.16741256564855575, "num_tokens": 6220179.0, "step": 3210 }, { "entropy": 6.087040662765503, "epoch": 0.18682627771159602, "grad_norm": 1.0078125, "learning_rate": 0.00012201048978745569, "loss": 5.8608, "mean_token_accuracy": 0.15103042125701904, "num_tokens": 6230568.0, "step": 3215 }, { "entropy": 6.1079224109649655, "epoch": 0.18711683179823924, "grad_norm": 1.0390625, "learning_rate": 0.00012114874477360427, "loss": 5.7732, "mean_token_accuracy": 0.15811807066202163, "num_tokens": 6240571.0, "step": 3220 }, { "entropy": 6.066962003707886, "epoch": 0.18740738588488248, "grad_norm": 1.03125, "learning_rate": 0.00012029121766963236, "loss": 5.6959, "mean_token_accuracy": 0.151812843978405, "num_tokens": 6250232.0, "step": 3225 }, { "entropy": 6.136503267288208, "epoch": 0.1876979399715257, "grad_norm": 1.09375, "learning_rate": 0.00011943793198507858, "loss": 5.8473, "mean_token_accuracy": 0.14256954118609427, "num_tokens": 6260191.0, "step": 3230 }, { "entropy": 6.1150651454925535, "epoch": 0.18798849405816892, "grad_norm": 0.94921875, "learning_rate": 0.00011858891111320104, "loss": 5.7593, "mean_token_accuracy": 0.15107759982347488, "num_tokens": 6270348.0, "step": 3235 }, { "entropy": 6.040066814422607, "epoch": 0.18827904814481217, "grad_norm": 1.046875, "learning_rate": 0.0001177441783303359, "loss": 5.7222, "mean_token_accuracy": 0.14626873433589935, "num_tokens": 6281212.0, "step": 3240 }, { "entropy": 6.0591840744018555, "epoch": 0.18856960223145539, "grad_norm": 1.03125, "learning_rate": 0.00011690375679525896, "loss": 5.7569, "mean_token_accuracy": 0.15225170105695723, "num_tokens": 6290194.0, "step": 3245 }, { "entropy": 6.092053413391113, "epoch": 0.1888601563180986, "grad_norm": 1.046875, "learning_rate": 0.00011606766954855124, "loss": 5.7817, "mean_token_accuracy": 0.15150906667113304, "num_tokens": 6299875.0, "step": 3250 }, { "entropy": 6.102940845489502, "epoch": 0.18915071040474185, "grad_norm": 1.0859375, "learning_rate": 0.00011523593951196702, "loss": 5.7722, "mean_token_accuracy": 0.15258708745241165, "num_tokens": 6309359.0, "step": 3255 }, { "entropy": 6.080491304397583, "epoch": 0.18944126449138507, "grad_norm": 0.953125, "learning_rate": 0.00011440858948780523, "loss": 5.7347, "mean_token_accuracy": 0.1517005071043968, "num_tokens": 6320783.0, "step": 3260 }, { "entropy": 5.987936544418335, "epoch": 0.1897318185780283, "grad_norm": 1.1015625, "learning_rate": 0.00011358564215828484, "loss": 5.5715, "mean_token_accuracy": 0.1621371328830719, "num_tokens": 6330150.0, "step": 3265 }, { "entropy": 5.964755249023438, "epoch": 0.19002237266467153, "grad_norm": 1.0234375, "learning_rate": 0.00011276712008492254, "loss": 5.7216, "mean_token_accuracy": 0.15560902208089827, "num_tokens": 6340421.0, "step": 3270 }, { "entropy": 6.003197145462036, "epoch": 0.19031292675131475, "grad_norm": 1.046875, "learning_rate": 0.00011195304570791451, "loss": 5.7291, "mean_token_accuracy": 0.14981625527143477, "num_tokens": 6351093.0, "step": 3275 }, { "entropy": 6.069776487350464, "epoch": 0.19060348083795797, "grad_norm": 1.0078125, "learning_rate": 0.00011114344134552094, "loss": 5.7178, "mean_token_accuracy": 0.14927603229880332, "num_tokens": 6360373.0, "step": 3280 }, { "entropy": 6.128297328948975, "epoch": 0.19089403492460122, "grad_norm": 1.1015625, "learning_rate": 0.0001103383291934545, "loss": 5.8211, "mean_token_accuracy": 0.15346538573503493, "num_tokens": 6371291.0, "step": 3285 }, { "entropy": 6.147673416137695, "epoch": 0.19118458901124444, "grad_norm": 1.125, "learning_rate": 0.00010953773132427141, "loss": 5.706, "mean_token_accuracy": 0.15231086164712906, "num_tokens": 6380869.0, "step": 3290 }, { "entropy": 6.075566673278809, "epoch": 0.19147514309788768, "grad_norm": 1.0625, "learning_rate": 0.00010874166968676677, "loss": 5.7058, "mean_token_accuracy": 0.15668356716632842, "num_tokens": 6390174.0, "step": 3295 }, { "entropy": 6.030622339248657, "epoch": 0.1917656971845309, "grad_norm": 1.0703125, "learning_rate": 0.00010795016610537251, "loss": 5.6748, "mean_token_accuracy": 0.15518099069595337, "num_tokens": 6399768.0, "step": 3300 }, { "entropy": 6.077046346664429, "epoch": 0.19205625127117412, "grad_norm": 1.0625, "learning_rate": 0.00010716324227955904, "loss": 5.802, "mean_token_accuracy": 0.1477528505027294, "num_tokens": 6409460.0, "step": 3305 }, { "entropy": 6.034036684036255, "epoch": 0.19234680535781737, "grad_norm": 1.078125, "learning_rate": 0.0001063809197832406, "loss": 5.6572, "mean_token_accuracy": 0.16222479790449143, "num_tokens": 6417968.0, "step": 3310 }, { "entropy": 6.085168313980103, "epoch": 0.1926373594444606, "grad_norm": 1.09375, "learning_rate": 0.00010560322006418368, "loss": 5.7371, "mean_token_accuracy": 0.1469581514596939, "num_tokens": 6427402.0, "step": 3315 }, { "entropy": 6.114069175720215, "epoch": 0.1929279135311038, "grad_norm": 1.0390625, "learning_rate": 0.00010483016444341887, "loss": 5.7702, "mean_token_accuracy": 0.15116747766733168, "num_tokens": 6437203.0, "step": 3320 }, { "entropy": 6.091079235076904, "epoch": 0.19321846761774705, "grad_norm": 1.0234375, "learning_rate": 0.00010406177411465654, "loss": 5.6856, "mean_token_accuracy": 0.15697493702173232, "num_tokens": 6446440.0, "step": 3325 }, { "entropy": 6.068310308456421, "epoch": 0.19350902170439027, "grad_norm": 1.0234375, "learning_rate": 0.00010329807014370562, "loss": 5.6496, "mean_token_accuracy": 0.157624289393425, "num_tokens": 6455842.0, "step": 3330 }, { "entropy": 6.069522714614868, "epoch": 0.1937995757910335, "grad_norm": 1.109375, "learning_rate": 0.00010253907346789632, "loss": 5.6689, "mean_token_accuracy": 0.15749077796936034, "num_tokens": 6464538.0, "step": 3335 }, { "entropy": 5.971843290328979, "epoch": 0.19409012987767674, "grad_norm": 0.984375, "learning_rate": 0.00010178480489550596, "loss": 5.6299, "mean_token_accuracy": 0.15191589742898942, "num_tokens": 6474646.0, "step": 3340 }, { "entropy": 6.010790205001831, "epoch": 0.19438068396431996, "grad_norm": 1.109375, "learning_rate": 0.00010103528510518836, "loss": 5.7641, "mean_token_accuracy": 0.14547111392021178, "num_tokens": 6484397.0, "step": 3345 }, { "entropy": 6.044341564178467, "epoch": 0.19467123805096317, "grad_norm": 1.046875, "learning_rate": 0.0001002905346454073, "loss": 5.7108, "mean_token_accuracy": 0.14943148642778398, "num_tokens": 6494254.0, "step": 3350 }, { "entropy": 6.041453218460083, "epoch": 0.19496179213760642, "grad_norm": 0.9921875, "learning_rate": 9.955057393387285e-05, "loss": 5.6536, "mean_token_accuracy": 0.16039068698883058, "num_tokens": 6503862.0, "step": 3355 }, { "entropy": 6.147222709655762, "epoch": 0.19525234622424964, "grad_norm": 1.0703125, "learning_rate": 9.88154232569816e-05, "loss": 5.7122, "mean_token_accuracy": 0.14973994195461274, "num_tokens": 6513644.0, "step": 3360 }, { "entropy": 6.084700441360473, "epoch": 0.1955429003108929, "grad_norm": 1.0390625, "learning_rate": 9.808510276926075e-05, "loss": 5.7, "mean_token_accuracy": 0.14390370547771453, "num_tokens": 6523991.0, "step": 3365 }, { "entropy": 6.049271821975708, "epoch": 0.1958334543975361, "grad_norm": 1.0390625, "learning_rate": 9.735963249281549e-05, "loss": 5.7439, "mean_token_accuracy": 0.14755677580833435, "num_tokens": 6533761.0, "step": 3370 }, { "entropy": 6.067014598846436, "epoch": 0.19612400848417932, "grad_norm": 1.1484375, "learning_rate": 9.663903231677974e-05, "loss": 5.7136, "mean_token_accuracy": 0.1520987056195736, "num_tokens": 6544115.0, "step": 3375 }, { "entropy": 6.102639961242676, "epoch": 0.19641456257082257, "grad_norm": 0.9375, "learning_rate": 9.592332199677145e-05, "loss": 5.7757, "mean_token_accuracy": 0.1473358005285263, "num_tokens": 6555207.0, "step": 3380 }, { "entropy": 6.06288423538208, "epoch": 0.1967051166574658, "grad_norm": 0.92578125, "learning_rate": 9.521252115435061e-05, "loss": 5.7049, "mean_token_accuracy": 0.15305796936154364, "num_tokens": 6564725.0, "step": 3385 }, { "entropy": 6.05938458442688, "epoch": 0.196995670744109, "grad_norm": 0.99609375, "learning_rate": 9.450664927648126e-05, "loss": 5.7016, "mean_token_accuracy": 0.14934950321912766, "num_tokens": 6575036.0, "step": 3390 }, { "entropy": 6.02567343711853, "epoch": 0.19728622483075225, "grad_norm": 1.0, "learning_rate": 9.380572571499758e-05, "loss": 5.7274, "mean_token_accuracy": 0.1436111845076084, "num_tokens": 6585489.0, "step": 3395 }, { "entropy": 6.0807962894439695, "epoch": 0.19757677891739547, "grad_norm": 0.93359375, "learning_rate": 9.310976968607307e-05, "loss": 5.7484, "mean_token_accuracy": 0.1487852841615677, "num_tokens": 6594848.0, "step": 3400 }, { "entropy": 6.089216613769532, "epoch": 0.1978673330040387, "grad_norm": 1.046875, "learning_rate": 9.241880026969381e-05, "loss": 5.7464, "mean_token_accuracy": 0.14339143857359887, "num_tokens": 6605983.0, "step": 3405 }, { "entropy": 6.055511140823365, "epoch": 0.19815788709068194, "grad_norm": 1.0703125, "learning_rate": 9.173283640913537e-05, "loss": 5.6737, "mean_token_accuracy": 0.1530997022986412, "num_tokens": 6615093.0, "step": 3410 }, { "entropy": 6.013196659088135, "epoch": 0.19844844117732516, "grad_norm": 1.1015625, "learning_rate": 9.10518969104436e-05, "loss": 5.6165, "mean_token_accuracy": 0.15827906131744385, "num_tokens": 6623551.0, "step": 3415 }, { "entropy": 5.9676004409790036, "epoch": 0.19873899526396838, "grad_norm": 1.0234375, "learning_rate": 9.037600044191868e-05, "loss": 5.7311, "mean_token_accuracy": 0.1509920448064804, "num_tokens": 6633611.0, "step": 3420 }, { "entropy": 6.004412841796875, "epoch": 0.19902954935061162, "grad_norm": 1.0859375, "learning_rate": 8.970516553360383e-05, "loss": 5.6986, "mean_token_accuracy": 0.1615770772099495, "num_tokens": 6642605.0, "step": 3425 }, { "entropy": 6.040467405319214, "epoch": 0.19932010343725484, "grad_norm": 1.125, "learning_rate": 8.903941057677692e-05, "loss": 5.7086, "mean_token_accuracy": 0.15024487525224686, "num_tokens": 6652398.0, "step": 3430 }, { "entropy": 6.084765291213989, "epoch": 0.19961065752389806, "grad_norm": 1.0625, "learning_rate": 8.837875382344635e-05, "loss": 5.6634, "mean_token_accuracy": 0.15238699167966843, "num_tokens": 6661667.0, "step": 3435 }, { "entropy": 6.060661029815674, "epoch": 0.1999012116105413, "grad_norm": 1.0234375, "learning_rate": 8.772321338585076e-05, "loss": 5.6415, "mean_token_accuracy": 0.16024876087903978, "num_tokens": 6670613.0, "step": 3440 }, { "entropy": 6.083744382858276, "epoch": 0.20019176569718453, "grad_norm": 1.046875, "learning_rate": 8.707280723596242e-05, "loss": 5.7538, "mean_token_accuracy": 0.1504388488829136, "num_tokens": 6679543.0, "step": 3445 }, { "entropy": 6.077122068405151, "epoch": 0.20048231978382777, "grad_norm": 0.95703125, "learning_rate": 8.64275532049944e-05, "loss": 5.8425, "mean_token_accuracy": 0.140785413980484, "num_tokens": 6689638.0, "step": 3450 }, { "entropy": 6.026986122131348, "epoch": 0.200772873870471, "grad_norm": 1.0390625, "learning_rate": 8.578746898291198e-05, "loss": 5.7096, "mean_token_accuracy": 0.15388644784688948, "num_tokens": 6699561.0, "step": 3455 }, { "entropy": 6.036118078231811, "epoch": 0.2010634279571142, "grad_norm": 1.0625, "learning_rate": 8.515257211794742e-05, "loss": 5.7424, "mean_token_accuracy": 0.1555853232741356, "num_tokens": 6709542.0, "step": 3460 }, { "entropy": 6.055869913101196, "epoch": 0.20135398204375746, "grad_norm": 0.99609375, "learning_rate": 8.452288001611896e-05, "loss": 5.6998, "mean_token_accuracy": 0.15517012774944305, "num_tokens": 6718545.0, "step": 3465 }, { "entropy": 6.0092888355255125, "epoch": 0.20164453613040068, "grad_norm": 1.1171875, "learning_rate": 8.389840994075379e-05, "loss": 5.5914, "mean_token_accuracy": 0.15963410586118698, "num_tokens": 6727491.0, "step": 3470 }, { "entropy": 6.133456993103027, "epoch": 0.2019350902170439, "grad_norm": 1.09375, "learning_rate": 8.327917901201435e-05, "loss": 5.862, "mean_token_accuracy": 0.1492785707116127, "num_tokens": 6737690.0, "step": 3475 }, { "entropy": 6.023900270462036, "epoch": 0.20222564430368714, "grad_norm": 1.078125, "learning_rate": 8.266520420642931e-05, "loss": 5.638, "mean_token_accuracy": 0.16246868669986725, "num_tokens": 6747049.0, "step": 3480 }, { "entropy": 6.069623184204102, "epoch": 0.20251619839033036, "grad_norm": 1.1796875, "learning_rate": 8.205650235642828e-05, "loss": 5.7306, "mean_token_accuracy": 0.15011803209781646, "num_tokens": 6756199.0, "step": 3485 }, { "entropy": 6.060231304168701, "epoch": 0.20280675247697358, "grad_norm": 1.125, "learning_rate": 8.145309014987978e-05, "loss": 5.6926, "mean_token_accuracy": 0.14650730416178703, "num_tokens": 6765595.0, "step": 3490 }, { "entropy": 6.029575967788697, "epoch": 0.20309730656361683, "grad_norm": 1.171875, "learning_rate": 8.085498412963437e-05, "loss": 5.628, "mean_token_accuracy": 0.16038369089365007, "num_tokens": 6775078.0, "step": 3495 }, { "entropy": 6.112934684753418, "epoch": 0.20338786065026004, "grad_norm": 0.95703125, "learning_rate": 8.026220069307078e-05, "loss": 5.7506, "mean_token_accuracy": 0.15270390585064889, "num_tokens": 6785419.0, "step": 3500 }, { "entropy": 6.068965864181519, "epoch": 0.20367841473690326, "grad_norm": 0.9609375, "learning_rate": 7.967475609164621e-05, "loss": 5.7165, "mean_token_accuracy": 0.15362876802682876, "num_tokens": 6795289.0, "step": 3505 }, { "entropy": 5.970501708984375, "epoch": 0.2039689688235465, "grad_norm": 0.99609375, "learning_rate": 7.909266643045124e-05, "loss": 5.606, "mean_token_accuracy": 0.16173766702413558, "num_tokens": 6804425.0, "step": 3510 }, { "entropy": 6.029342889785767, "epoch": 0.20425952291018973, "grad_norm": 1.0546875, "learning_rate": 7.851594766776802e-05, "loss": 5.6689, "mean_token_accuracy": 0.1656169682741165, "num_tokens": 6814102.0, "step": 3515 }, { "entropy": 6.081440544128418, "epoch": 0.20455007699683295, "grad_norm": 0.98828125, "learning_rate": 7.794461561463265e-05, "loss": 5.6685, "mean_token_accuracy": 0.16380728930234909, "num_tokens": 6824693.0, "step": 3520 }, { "entropy": 6.080014085769653, "epoch": 0.2048406310834762, "grad_norm": 0.96875, "learning_rate": 7.7378685934402e-05, "loss": 5.7272, "mean_token_accuracy": 0.1497935637831688, "num_tokens": 6834951.0, "step": 3525 }, { "entropy": 6.026291465759277, "epoch": 0.2051311851701194, "grad_norm": 1.0703125, "learning_rate": 7.68181741423242e-05, "loss": 5.6537, "mean_token_accuracy": 0.15431195497512817, "num_tokens": 6843922.0, "step": 3530 }, { "entropy": 5.994410610198974, "epoch": 0.20542173925676266, "grad_norm": 1.109375, "learning_rate": 7.626309560511313e-05, "loss": 5.5725, "mean_token_accuracy": 0.15747455805540084, "num_tokens": 6852903.0, "step": 3535 }, { "entropy": 6.06242561340332, "epoch": 0.20571229334340588, "grad_norm": 1.0859375, "learning_rate": 7.571346554052724e-05, "loss": 5.6528, "mean_token_accuracy": 0.15987856090068817, "num_tokens": 6862748.0, "step": 3540 }, { "entropy": 6.058012628555298, "epoch": 0.2060028474300491, "grad_norm": 0.984375, "learning_rate": 7.516929901695249e-05, "loss": 5.7014, "mean_token_accuracy": 0.15260830670595169, "num_tokens": 6873043.0, "step": 3545 }, { "entropy": 6.067829179763794, "epoch": 0.20629340151669234, "grad_norm": 1.0, "learning_rate": 7.463061095298893e-05, "loss": 5.6857, "mean_token_accuracy": 0.15442362278699875, "num_tokens": 6883256.0, "step": 3550 }, { "entropy": 6.019204711914062, "epoch": 0.20658395560333556, "grad_norm": 0.99609375, "learning_rate": 7.409741611704198e-05, "loss": 5.7186, "mean_token_accuracy": 0.15627539306879043, "num_tokens": 6893350.0, "step": 3555 }, { "entropy": 6.0289897441864015, "epoch": 0.20687450968997878, "grad_norm": 1.03125, "learning_rate": 7.35697291269174e-05, "loss": 5.7174, "mean_token_accuracy": 0.15497809946537017, "num_tokens": 6904378.0, "step": 3560 }, { "entropy": 6.094915437698364, "epoch": 0.20716506377662203, "grad_norm": 1.0390625, "learning_rate": 7.304756444942056e-05, "loss": 5.821, "mean_token_accuracy": 0.14591436162590982, "num_tokens": 6913293.0, "step": 3565 }, { "entropy": 6.109770011901856, "epoch": 0.20745561786326525, "grad_norm": 0.98828125, "learning_rate": 7.253093639995994e-05, "loss": 5.6726, "mean_token_accuracy": 0.1514420121908188, "num_tokens": 6922950.0, "step": 3570 }, { "entropy": 6.070370769500732, "epoch": 0.20774617194990846, "grad_norm": 1.0859375, "learning_rate": 7.20198591421544e-05, "loss": 5.6393, "mean_token_accuracy": 0.15796124786138535, "num_tokens": 6931746.0, "step": 3575 }, { "entropy": 6.085724258422852, "epoch": 0.2080367260365517, "grad_norm": 1.0390625, "learning_rate": 7.151434668744517e-05, "loss": 5.7744, "mean_token_accuracy": 0.1553097262978554, "num_tokens": 6941617.0, "step": 3580 }, { "entropy": 6.020153665542603, "epoch": 0.20832728012319493, "grad_norm": 1.0703125, "learning_rate": 7.101441289471153e-05, "loss": 5.6818, "mean_token_accuracy": 0.1518129140138626, "num_tokens": 6950743.0, "step": 3585 }, { "entropy": 6.0454991340637205, "epoch": 0.20861783420983815, "grad_norm": 1.09375, "learning_rate": 7.052007146989098e-05, "loss": 5.7299, "mean_token_accuracy": 0.145194461196661, "num_tokens": 6960665.0, "step": 3590 }, { "entropy": 6.010702991485596, "epoch": 0.2089083882964814, "grad_norm": 1.0546875, "learning_rate": 7.003133596560341e-05, "loss": 5.6009, "mean_token_accuracy": 0.15414568036794662, "num_tokens": 6970316.0, "step": 3595 }, { "entropy": 6.042399644851685, "epoch": 0.20919894238312461, "grad_norm": 1.109375, "learning_rate": 6.954821978077952e-05, "loss": 5.6891, "mean_token_accuracy": 0.15473626405000687, "num_tokens": 6980116.0, "step": 3600 }, { "entropy": 6.05302414894104, "epoch": 0.20948949646976786, "grad_norm": 1.1015625, "learning_rate": 6.907073616029356e-05, "loss": 5.7069, "mean_token_accuracy": 0.15373171120882034, "num_tokens": 6989901.0, "step": 3605 }, { "entropy": 6.045177841186524, "epoch": 0.20978005055641108, "grad_norm": 1.015625, "learning_rate": 6.85988981946002e-05, "loss": 5.7036, "mean_token_accuracy": 0.15624198466539382, "num_tokens": 6999405.0, "step": 3610 }, { "entropy": 6.048674821853638, "epoch": 0.2100706046430543, "grad_norm": 0.99609375, "learning_rate": 6.813271881937564e-05, "loss": 5.8436, "mean_token_accuracy": 0.14876155257225038, "num_tokens": 7010198.0, "step": 3615 }, { "entropy": 6.046766996383667, "epoch": 0.21036115872969754, "grad_norm": 1.0390625, "learning_rate": 6.767221081516286e-05, "loss": 5.677, "mean_token_accuracy": 0.16342443078756333, "num_tokens": 7019930.0, "step": 3620 }, { "entropy": 6.053871488571167, "epoch": 0.21065171281634076, "grad_norm": 0.9765625, "learning_rate": 6.72173868070215e-05, "loss": 5.6483, "mean_token_accuracy": 0.15757839530706405, "num_tokens": 7029485.0, "step": 3625 }, { "entropy": 6.064899587631226, "epoch": 0.21094226690298398, "grad_norm": 1.0546875, "learning_rate": 6.676825926418149e-05, "loss": 5.7667, "mean_token_accuracy": 0.15013407468795775, "num_tokens": 7038891.0, "step": 3630 }, { "entropy": 6.028255796432495, "epoch": 0.21123282098962723, "grad_norm": 1.046875, "learning_rate": 6.632484049970122e-05, "loss": 5.6891, "mean_token_accuracy": 0.1538076549768448, "num_tokens": 7048477.0, "step": 3635 }, { "entropy": 6.019302177429199, "epoch": 0.21152337507627045, "grad_norm": 1.0390625, "learning_rate": 6.588714267013019e-05, "loss": 5.5945, "mean_token_accuracy": 0.16957512646913528, "num_tokens": 7057705.0, "step": 3640 }, { "entropy": 6.048618459701538, "epoch": 0.21181392916291367, "grad_norm": 1.0703125, "learning_rate": 6.545517777517544e-05, "loss": 5.6504, "mean_token_accuracy": 0.1564537763595581, "num_tokens": 7067174.0, "step": 3645 }, { "entropy": 6.0795738697052, "epoch": 0.2121044832495569, "grad_norm": 1.1953125, "learning_rate": 6.502895765737281e-05, "loss": 5.7639, "mean_token_accuracy": 0.15077428221702577, "num_tokens": 7076250.0, "step": 3650 }, { "entropy": 6.046699380874633, "epoch": 0.21239503733620013, "grad_norm": 1.125, "learning_rate": 6.460849400176212e-05, "loss": 5.7002, "mean_token_accuracy": 0.1563662827014923, "num_tokens": 7085347.0, "step": 3655 }, { "entropy": 5.965932989120484, "epoch": 0.21268559142284335, "grad_norm": 1.0234375, "learning_rate": 6.419379833556694e-05, "loss": 5.6007, "mean_token_accuracy": 0.15481184422969818, "num_tokens": 7094639.0, "step": 3660 }, { "entropy": 6.034517765045166, "epoch": 0.2129761455094866, "grad_norm": 1.1484375, "learning_rate": 6.378488202787835e-05, "loss": 5.6765, "mean_token_accuracy": 0.1551177941262722, "num_tokens": 7103819.0, "step": 3665 }, { "entropy": 6.1038103103637695, "epoch": 0.21326669959612982, "grad_norm": 1.1484375, "learning_rate": 6.33817562893435e-05, "loss": 5.7149, "mean_token_accuracy": 0.15616475343704223, "num_tokens": 7113904.0, "step": 3670 }, { "entropy": 6.051551055908203, "epoch": 0.21355725368277303, "grad_norm": 1.078125, "learning_rate": 6.29844321718582e-05, "loss": 5.6672, "mean_token_accuracy": 0.15475056320428848, "num_tokens": 7123241.0, "step": 3675 }, { "entropy": 5.995166349411011, "epoch": 0.21384780776941628, "grad_norm": 1.1171875, "learning_rate": 6.259292056826383e-05, "loss": 5.5651, "mean_token_accuracy": 0.167102213203907, "num_tokens": 7131676.0, "step": 3680 }, { "entropy": 6.0871337890625, "epoch": 0.2141383618560595, "grad_norm": 1.015625, "learning_rate": 6.220723221204873e-05, "loss": 5.6893, "mean_token_accuracy": 0.15597724542021751, "num_tokens": 7141237.0, "step": 3685 }, { "entropy": 6.110262441635132, "epoch": 0.21442891594270275, "grad_norm": 1.0546875, "learning_rate": 6.182737767705406e-05, "loss": 5.7844, "mean_token_accuracy": 0.15404658690094947, "num_tokens": 7150696.0, "step": 3690 }, { "entropy": 6.048623514175415, "epoch": 0.21471947002934597, "grad_norm": 1.0234375, "learning_rate": 6.145336737718375e-05, "loss": 5.6907, "mean_token_accuracy": 0.1541660889983177, "num_tokens": 7160444.0, "step": 3695 }, { "entropy": 6.030835723876953, "epoch": 0.21501002411598918, "grad_norm": 1.0390625, "learning_rate": 6.10852115661191e-05, "loss": 5.7177, "mean_token_accuracy": 0.1561812475323677, "num_tokens": 7170170.0, "step": 3700 }, { "entropy": 5.982100868225098, "epoch": 0.21530057820263243, "grad_norm": 1.1796875, "learning_rate": 6.072292033703766e-05, "loss": 5.6144, "mean_token_accuracy": 0.15959885716438293, "num_tokens": 7179594.0, "step": 3705 }, { "entropy": 5.983188962936401, "epoch": 0.21559113228927565, "grad_norm": 1.125, "learning_rate": 6.036650362233648e-05, "loss": 5.5925, "mean_token_accuracy": 0.15840766131877898, "num_tokens": 7189139.0, "step": 3710 }, { "entropy": 6.049566268920898, "epoch": 0.21588168637591887, "grad_norm": 1.0546875, "learning_rate": 6.0015971193359824e-05, "loss": 5.7009, "mean_token_accuracy": 0.1592419296503067, "num_tokens": 7198259.0, "step": 3715 }, { "entropy": 6.067971229553223, "epoch": 0.21617224046256212, "grad_norm": 1.0078125, "learning_rate": 5.9671332660131306e-05, "loss": 5.6917, "mean_token_accuracy": 0.15316389575600625, "num_tokens": 7208420.0, "step": 3720 }, { "entropy": 6.0361669063568115, "epoch": 0.21646279454920533, "grad_norm": 1.1171875, "learning_rate": 5.933259747109042e-05, "loss": 5.7355, "mean_token_accuracy": 0.15207717418670655, "num_tokens": 7219045.0, "step": 3725 }, { "entropy": 6.09136929512024, "epoch": 0.21675334863584855, "grad_norm": 1.0234375, "learning_rate": 5.899977491283351e-05, "loss": 5.7985, "mean_token_accuracy": 0.1460676074028015, "num_tokens": 7229337.0, "step": 3730 }, { "entropy": 6.045521926879883, "epoch": 0.2170439027224918, "grad_norm": 1.0703125, "learning_rate": 5.867287410985908e-05, "loss": 5.676, "mean_token_accuracy": 0.15704198330640792, "num_tokens": 7239217.0, "step": 3735 }, { "entropy": 6.094875192642212, "epoch": 0.21733445680913502, "grad_norm": 0.9609375, "learning_rate": 5.835190402431779e-05, "loss": 5.7444, "mean_token_accuracy": 0.1542062446475029, "num_tokens": 7249671.0, "step": 3740 }, { "entropy": 6.0608758449554445, "epoch": 0.21762501089577824, "grad_norm": 1.0625, "learning_rate": 5.803687345576673e-05, "loss": 5.6701, "mean_token_accuracy": 0.16079277247190477, "num_tokens": 7259028.0, "step": 3745 }, { "entropy": 6.12493200302124, "epoch": 0.21791556498242148, "grad_norm": 1.140625, "learning_rate": 5.7727791040927977e-05, "loss": 5.8492, "mean_token_accuracy": 0.1423958010971546, "num_tokens": 7269519.0, "step": 3750 }, { "entropy": 6.104196643829345, "epoch": 0.2182061190690647, "grad_norm": 1.0, "learning_rate": 5.742466525345213e-05, "loss": 5.7234, "mean_token_accuracy": 0.15387822464108467, "num_tokens": 7278822.0, "step": 3755 }, { "entropy": 6.136565637588501, "epoch": 0.21849667315570795, "grad_norm": 1.0859375, "learning_rate": 5.7127504403685775e-05, "loss": 5.7549, "mean_token_accuracy": 0.159536774456501, "num_tokens": 7289431.0, "step": 3760 }, { "entropy": 6.078890037536621, "epoch": 0.21878722724235117, "grad_norm": 0.9453125, "learning_rate": 5.6836316638443664e-05, "loss": 5.7342, "mean_token_accuracy": 0.1513395741581917, "num_tokens": 7299735.0, "step": 3765 }, { "entropy": 6.022913074493408, "epoch": 0.2190777813289944, "grad_norm": 1.0625, "learning_rate": 5.655110994078553e-05, "loss": 5.64, "mean_token_accuracy": 0.15833066552877426, "num_tokens": 7308798.0, "step": 3770 }, { "entropy": 6.052260112762451, "epoch": 0.21936833541563763, "grad_norm": 1.0859375, "learning_rate": 5.6271892129797056e-05, "loss": 5.6916, "mean_token_accuracy": 0.1567830964922905, "num_tokens": 7318159.0, "step": 3775 }, { "entropy": 5.97794771194458, "epoch": 0.21965888950228085, "grad_norm": 1.015625, "learning_rate": 5.599867086037556e-05, "loss": 5.631, "mean_token_accuracy": 0.15575975477695464, "num_tokens": 7328327.0, "step": 3780 }, { "entropy": 6.021618509292603, "epoch": 0.21994944358892407, "grad_norm": 1.0546875, "learning_rate": 5.573145362302012e-05, "loss": 5.6629, "mean_token_accuracy": 0.15210975110530853, "num_tokens": 7337595.0, "step": 3785 }, { "entropy": 6.101519775390625, "epoch": 0.22023999767556732, "grad_norm": 1.0546875, "learning_rate": 5.5470247743626404e-05, "loss": 5.8443, "mean_token_accuracy": 0.14724614471197128, "num_tokens": 7347049.0, "step": 3790 }, { "entropy": 6.0834808349609375, "epoch": 0.22053055176221054, "grad_norm": 1.1328125, "learning_rate": 5.5215060383285414e-05, "loss": 5.7311, "mean_token_accuracy": 0.1475095644593239, "num_tokens": 7356908.0, "step": 3795 }, { "entropy": 6.040810489654541, "epoch": 0.22082110584885375, "grad_norm": 1.046875, "learning_rate": 5.496589853808759e-05, "loss": 5.6475, "mean_token_accuracy": 0.1548875778913498, "num_tokens": 7366823.0, "step": 3800 }, { "entropy": 6.104894399642944, "epoch": 0.221111659935497, "grad_norm": 1.0859375, "learning_rate": 5.47227690389308e-05, "loss": 5.762, "mean_token_accuracy": 0.1513790875673294, "num_tokens": 7376502.0, "step": 3805 }, { "entropy": 6.138718318939209, "epoch": 0.22140221402214022, "grad_norm": 1.0234375, "learning_rate": 5.448567855133306e-05, "loss": 5.7874, "mean_token_accuracy": 0.1471219077706337, "num_tokens": 7385835.0, "step": 3810 }, { "entropy": 6.074849843978882, "epoch": 0.22169276810878344, "grad_norm": 1.0078125, "learning_rate": 5.425463357524986e-05, "loss": 5.668, "mean_token_accuracy": 0.16173817664384843, "num_tokens": 7395524.0, "step": 3815 }, { "entropy": 6.055670976638794, "epoch": 0.22198332219542669, "grad_norm": 1.0859375, "learning_rate": 5.402964044489591e-05, "loss": 5.6701, "mean_token_accuracy": 0.15527116730809212, "num_tokens": 7404724.0, "step": 3820 }, { "entropy": 6.029075717926025, "epoch": 0.2222738762820699, "grad_norm": 1.1640625, "learning_rate": 5.381070532857153e-05, "loss": 5.6407, "mean_token_accuracy": 0.15602124780416488, "num_tokens": 7413992.0, "step": 3825 }, { "entropy": 6.046741342544555, "epoch": 0.22256443036871312, "grad_norm": 1.0234375, "learning_rate": 5.359783422849357e-05, "loss": 5.6456, "mean_token_accuracy": 0.15968140810728074, "num_tokens": 7424449.0, "step": 3830 }, { "entropy": 6.006699466705323, "epoch": 0.22285498445535637, "grad_norm": 1.046875, "learning_rate": 5.3391032980630736e-05, "loss": 5.6126, "mean_token_accuracy": 0.15793580561876297, "num_tokens": 7433151.0, "step": 3835 }, { "entropy": 6.002806901931763, "epoch": 0.2231455385419996, "grad_norm": 1.03125, "learning_rate": 5.31903072545437e-05, "loss": 5.6538, "mean_token_accuracy": 0.15995650440454484, "num_tokens": 7442773.0, "step": 3840 }, { "entropy": 6.006146192550659, "epoch": 0.22343609262864284, "grad_norm": 1.1328125, "learning_rate": 5.29956625532297e-05, "loss": 5.6508, "mean_token_accuracy": 0.15787655711174012, "num_tokens": 7451316.0, "step": 3845 }, { "entropy": 6.016618442535401, "epoch": 0.22372664671528605, "grad_norm": 1.0390625, "learning_rate": 5.280710421297146e-05, "loss": 5.6834, "mean_token_accuracy": 0.15794799476861954, "num_tokens": 7459834.0, "step": 3850 }, { "entropy": 6.0921612739562985, "epoch": 0.22401720080192927, "grad_norm": 1.0234375, "learning_rate": 5.2624637403191165e-05, "loss": 5.8168, "mean_token_accuracy": 0.14339097440242768, "num_tokens": 7470885.0, "step": 3855 }, { "entropy": 6.014931774139404, "epoch": 0.22430775488857252, "grad_norm": 1.046875, "learning_rate": 5.2448267126308605e-05, "loss": 5.662, "mean_token_accuracy": 0.1614661380648613, "num_tokens": 7480045.0, "step": 3860 }, { "entropy": 6.109454822540283, "epoch": 0.22459830897521574, "grad_norm": 0.9765625, "learning_rate": 5.2277998217603954e-05, "loss": 5.7537, "mean_token_accuracy": 0.1512547492980957, "num_tokens": 7489734.0, "step": 3865 }, { "entropy": 6.119794225692749, "epoch": 0.22488886306185896, "grad_norm": 1.1015625, "learning_rate": 5.211383534508541e-05, "loss": 5.6229, "mean_token_accuracy": 0.16408309638500213, "num_tokens": 7499443.0, "step": 3870 }, { "entropy": 6.05659670829773, "epoch": 0.2251794171485022, "grad_norm": 0.99609375, "learning_rate": 5.1955783009361044e-05, "loss": 5.6906, "mean_token_accuracy": 0.15638800263404845, "num_tokens": 7509085.0, "step": 3875 }, { "entropy": 6.054824876785278, "epoch": 0.22546997123514542, "grad_norm": 1.015625, "learning_rate": 5.180384554351543e-05, "loss": 5.6424, "mean_token_accuracy": 0.15496647357940674, "num_tokens": 7519510.0, "step": 3880 }, { "entropy": 6.047353935241699, "epoch": 0.22576052532178864, "grad_norm": 1.1640625, "learning_rate": 5.1658027112990976e-05, "loss": 5.6524, "mean_token_accuracy": 0.1555292695760727, "num_tokens": 7528004.0, "step": 3885 }, { "entropy": 6.07807502746582, "epoch": 0.2260510794084319, "grad_norm": 1.078125, "learning_rate": 5.151833171547365e-05, "loss": 5.7679, "mean_token_accuracy": 0.14742455780506133, "num_tokens": 7538394.0, "step": 3890 }, { "entropy": 6.009245443344116, "epoch": 0.2263416334950751, "grad_norm": 1.078125, "learning_rate": 5.1384763180783274e-05, "loss": 5.6188, "mean_token_accuracy": 0.1577170416712761, "num_tokens": 7547998.0, "step": 3895 }, { "entropy": 6.032008123397827, "epoch": 0.22663218758171833, "grad_norm": 1.0625, "learning_rate": 5.125732517076876e-05, "loss": 5.7206, "mean_token_accuracy": 0.15503439009189607, "num_tokens": 7557914.0, "step": 3900 }, { "entropy": 6.0673810005187985, "epoch": 0.22692274166836157, "grad_norm": 1.015625, "learning_rate": 5.113602117920747e-05, "loss": 5.7497, "mean_token_accuracy": 0.14790019690990447, "num_tokens": 7567606.0, "step": 3905 }, { "entropy": 6.01710991859436, "epoch": 0.2272132957550048, "grad_norm": 1.0078125, "learning_rate": 5.102085453170966e-05, "loss": 5.6722, "mean_token_accuracy": 0.1605480507016182, "num_tokens": 7576774.0, "step": 3910 }, { "entropy": 6.016513299942017, "epoch": 0.227503849841648, "grad_norm": 1.59375, "learning_rate": 5.091182838562709e-05, "loss": 5.7362, "mean_token_accuracy": 0.1543383590877056, "num_tokens": 7586490.0, "step": 3915 }, { "entropy": 6.041366291046143, "epoch": 0.22779440392829126, "grad_norm": 1.2265625, "learning_rate": 5.0808945729966716e-05, "loss": 5.62, "mean_token_accuracy": 0.16046071499586106, "num_tokens": 7595527.0, "step": 3920 }, { "entropy": 6.037348937988281, "epoch": 0.22808495801493447, "grad_norm": 1.015625, "learning_rate": 5.071220938530844e-05, "loss": 5.6927, "mean_token_accuracy": 0.15463001430034637, "num_tokens": 7605829.0, "step": 3925 }, { "entropy": 6.089096164703369, "epoch": 0.22837551210157772, "grad_norm": 1.046875, "learning_rate": 5.0621622003728026e-05, "loss": 5.7329, "mean_token_accuracy": 0.1547334223985672, "num_tokens": 7615363.0, "step": 3930 }, { "entropy": 6.023686742782592, "epoch": 0.22866606618822094, "grad_norm": 1.1640625, "learning_rate": 5.053718606872433e-05, "loss": 5.6753, "mean_token_accuracy": 0.15325114876031876, "num_tokens": 7624752.0, "step": 3935 }, { "entropy": 6.073769903182983, "epoch": 0.22895662027486416, "grad_norm": 1.0234375, "learning_rate": 5.0458903895151134e-05, "loss": 5.714, "mean_token_accuracy": 0.15586349815130235, "num_tokens": 7633958.0, "step": 3940 }, { "entropy": 6.045828437805175, "epoch": 0.2292471743615074, "grad_norm": 1.0390625, "learning_rate": 5.038677762915381e-05, "loss": 5.6356, "mean_token_accuracy": 0.16323170363903045, "num_tokens": 7643278.0, "step": 3945 }, { "entropy": 6.108937072753906, "epoch": 0.22953772844815062, "grad_norm": 0.99609375, "learning_rate": 5.032080924811033e-05, "loss": 5.7243, "mean_token_accuracy": 0.15668186247348787, "num_tokens": 7653378.0, "step": 3950 }, { "entropy": 6.043759202957153, "epoch": 0.22982828253479384, "grad_norm": 1.1015625, "learning_rate": 5.026100056057718e-05, "loss": 5.6752, "mean_token_accuracy": 0.16054976135492324, "num_tokens": 7663218.0, "step": 3955 }, { "entropy": 6.083030700683594, "epoch": 0.2301188366214371, "grad_norm": 1.0703125, "learning_rate": 5.0207353206239764e-05, "loss": 5.6591, "mean_token_accuracy": 0.15775054395198823, "num_tokens": 7672578.0, "step": 3960 }, { "entropy": 6.10399055480957, "epoch": 0.2304093907080803, "grad_norm": 1.09375, "learning_rate": 5.0159868655867436e-05, "loss": 5.7522, "mean_token_accuracy": 0.1491328150033951, "num_tokens": 7682335.0, "step": 3965 }, { "entropy": 6.117215824127197, "epoch": 0.23069994479472353, "grad_norm": 1.015625, "learning_rate": 5.011854821127305e-05, "loss": 5.7766, "mean_token_accuracy": 0.15085969120264053, "num_tokens": 7692363.0, "step": 3970 }, { "entropy": 6.1109333515167235, "epoch": 0.23099049888136677, "grad_norm": 1.125, "learning_rate": 5.008339300527755e-05, "loss": 5.6969, "mean_token_accuracy": 0.16164307296276093, "num_tokens": 7701880.0, "step": 3975 }, { "entropy": 6.090880966186523, "epoch": 0.23128105296801, "grad_norm": 1.0, "learning_rate": 5.005440400167859e-05, "loss": 5.722, "mean_token_accuracy": 0.15293850004673004, "num_tokens": 7711203.0, "step": 3980 }, { "entropy": 6.051671552658081, "epoch": 0.2315716070546532, "grad_norm": 1.1171875, "learning_rate": 5.003158199522442e-05, "loss": 5.6291, "mean_token_accuracy": 0.15574965327978135, "num_tokens": 7720154.0, "step": 3985 }, { "entropy": 6.046511125564575, "epoch": 0.23186216114129646, "grad_norm": 0.94140625, "learning_rate": 5.0014927611591806e-05, "loss": 5.6926, "mean_token_accuracy": 0.15550984293222428, "num_tokens": 7730469.0, "step": 3990 }, { "entropy": 6.068615293502807, "epoch": 0.23215271522793968, "grad_norm": 1.046875, "learning_rate": 5.000444130736916e-05, "loss": 5.6147, "mean_token_accuracy": 0.158310903608799, "num_tokens": 7739190.0, "step": 3995 }, { "entropy": 6.060236883163452, "epoch": 0.23244326931458292, "grad_norm": 0.9765625, "learning_rate": 5.0000123370043736e-05, "loss": 5.7513, "mean_token_accuracy": 0.15361028015613556, "num_tokens": 7749106.0, "step": 4000 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1686628301045760.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }