{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0871662259929686, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742584228515625, "epoch": 0.0002905540866432286, "grad_norm": 4.90625, "learning_rate": 2e-06, "loss": 10.7837, "mean_token_accuracy": 0.0, "num_tokens": 10156.0, "step": 5 }, { "entropy": 10.742587471008301, "epoch": 0.0005811081732864572, "grad_norm": 4.8125, "learning_rate": 4.5e-06, "loss": 10.7753, "mean_token_accuracy": 9.267840650863945e-05, "num_tokens": 20933.0, "step": 10 }, { "entropy": 10.74257869720459, "epoch": 0.0008716622599296859, "grad_norm": 4.375, "learning_rate": 7e-06, "loss": 10.7508, "mean_token_accuracy": 0.0, "num_tokens": 31298.0, "step": 15 }, { "entropy": 10.742635726928711, "epoch": 0.0011622163465729145, "grad_norm": 4.875, "learning_rate": 9.5e-06, "loss": 10.697, "mean_token_accuracy": 0.0, "num_tokens": 40913.0, "step": 20 }, { "entropy": 10.742652702331544, "epoch": 0.0014527704332161432, "grad_norm": 4.28125, "learning_rate": 1.2e-05, "loss": 10.5798, "mean_token_accuracy": 0.0007269373920280487, "num_tokens": 49901.0, "step": 25 }, { "entropy": 10.742454719543456, "epoch": 0.0017433245198593718, "grad_norm": 4.0625, "learning_rate": 1.4500000000000002e-05, "loss": 10.4688, "mean_token_accuracy": 0.01560134175233543, "num_tokens": 59328.0, "step": 30 }, { "entropy": 10.741775226593017, "epoch": 0.0020338786065026006, "grad_norm": 3.25, "learning_rate": 1.7000000000000003e-05, "loss": 10.3287, "mean_token_accuracy": 0.037073963694274424, "num_tokens": 68405.0, "step": 35 }, { "entropy": 10.740037631988525, "epoch": 0.002324432693145829, "grad_norm": 2.578125, "learning_rate": 1.95e-05, "loss": 10.2203, "mean_token_accuracy": 0.037133642472326756, "num_tokens": 77591.0, "step": 40 }, { "entropy": 10.73731575012207, "epoch": 0.0026149867797890577, "grad_norm": 2.359375, "learning_rate": 2.2e-05, "loss": 10.1202, "mean_token_accuracy": 0.03901108838617802, "num_tokens": 88186.0, "step": 45 }, { "entropy": 10.734606838226318, "epoch": 0.0029055408664322865, "grad_norm": 2.09375, "learning_rate": 2.4500000000000003e-05, "loss": 10.0211, "mean_token_accuracy": 0.04241710864007473, "num_tokens": 97594.0, "step": 50 }, { "entropy": 10.73211612701416, "epoch": 0.003196094953075515, "grad_norm": 1.9921875, "learning_rate": 2.7e-05, "loss": 9.9871, "mean_token_accuracy": 0.03826836366206408, "num_tokens": 107386.0, "step": 55 }, { "entropy": 10.73102445602417, "epoch": 0.0034866490397187436, "grad_norm": 1.9140625, "learning_rate": 2.95e-05, "loss": 9.9132, "mean_token_accuracy": 0.03943221494555473, "num_tokens": 116742.0, "step": 60 }, { "entropy": 10.729645252227783, "epoch": 0.0037772031263619723, "grad_norm": 1.859375, "learning_rate": 3.2e-05, "loss": 9.8519, "mean_token_accuracy": 0.03962419871240854, "num_tokens": 126520.0, "step": 65 }, { "entropy": 10.727834033966065, "epoch": 0.004067757213005201, "grad_norm": 1.7734375, "learning_rate": 3.4500000000000005e-05, "loss": 9.7907, "mean_token_accuracy": 0.03989919070154428, "num_tokens": 136382.0, "step": 70 }, { "entropy": 10.724947452545166, "epoch": 0.0043583112996484295, "grad_norm": 1.7421875, "learning_rate": 3.7e-05, "loss": 9.7212, "mean_token_accuracy": 0.03671109899878502, "num_tokens": 146435.0, "step": 75 }, { "entropy": 10.72182493209839, "epoch": 0.004648865386291658, "grad_norm": 1.8359375, "learning_rate": 3.95e-05, "loss": 9.6591, "mean_token_accuracy": 0.037667426839470865, "num_tokens": 156174.0, "step": 80 }, { "entropy": 10.71723222732544, "epoch": 0.004939419472934887, "grad_norm": 1.765625, "learning_rate": 4.2000000000000004e-05, "loss": 9.5783, "mean_token_accuracy": 0.04142397493124008, "num_tokens": 165118.0, "step": 85 }, { "entropy": 10.708585739135742, "epoch": 0.005229973559578115, "grad_norm": 1.875, "learning_rate": 4.45e-05, "loss": 9.5252, "mean_token_accuracy": 0.04036426953971386, "num_tokens": 174401.0, "step": 90 }, { "entropy": 10.697160243988037, "epoch": 0.005520527646221344, "grad_norm": 1.765625, "learning_rate": 4.7000000000000004e-05, "loss": 9.443, "mean_token_accuracy": 0.04118307866156101, "num_tokens": 183533.0, "step": 95 }, { "entropy": 10.683875274658202, "epoch": 0.005811081732864573, "grad_norm": 1.7734375, "learning_rate": 4.9500000000000004e-05, "loss": 9.3596, "mean_token_accuracy": 0.045602331310510634, "num_tokens": 193296.0, "step": 100 }, { "entropy": 10.665638542175293, "epoch": 0.006101635819507801, "grad_norm": 1.75, "learning_rate": 5.2e-05, "loss": 9.2242, "mean_token_accuracy": 0.055989645794034, "num_tokens": 202741.0, "step": 105 }, { "entropy": 10.637047958374023, "epoch": 0.00639218990615103, "grad_norm": 1.703125, "learning_rate": 5.45e-05, "loss": 9.1359, "mean_token_accuracy": 0.05134495124220848, "num_tokens": 212441.0, "step": 110 }, { "entropy": 10.61000461578369, "epoch": 0.006682743992794259, "grad_norm": 1.6875, "learning_rate": 5.7e-05, "loss": 8.9868, "mean_token_accuracy": 0.04918566383421421, "num_tokens": 220671.0, "step": 115 }, { "entropy": 10.56931962966919, "epoch": 0.006973298079437487, "grad_norm": 1.7578125, "learning_rate": 5.9499999999999996e-05, "loss": 8.9878, "mean_token_accuracy": 0.04560479037463665, "num_tokens": 231390.0, "step": 120 }, { "entropy": 10.515452098846435, "epoch": 0.007263852166080716, "grad_norm": 1.5859375, "learning_rate": 6.2e-05, "loss": 8.8241, "mean_token_accuracy": 0.05023673102259636, "num_tokens": 241137.0, "step": 125 }, { "entropy": 10.430156230926514, "epoch": 0.007554406252723945, "grad_norm": 1.578125, "learning_rate": 6.450000000000001e-05, "loss": 8.6778, "mean_token_accuracy": 0.05138532817363739, "num_tokens": 250627.0, "step": 130 }, { "entropy": 10.353140926361084, "epoch": 0.007844960339367173, "grad_norm": 1.5859375, "learning_rate": 6.7e-05, "loss": 8.5255, "mean_token_accuracy": 0.05529710613191128, "num_tokens": 259564.0, "step": 135 }, { "entropy": 10.262280082702636, "epoch": 0.008135514426010402, "grad_norm": 1.453125, "learning_rate": 6.950000000000001e-05, "loss": 8.4168, "mean_token_accuracy": 0.05102897398173809, "num_tokens": 268997.0, "step": 140 }, { "entropy": 10.17268762588501, "epoch": 0.00842606851265363, "grad_norm": 1.40625, "learning_rate": 7.2e-05, "loss": 8.402, "mean_token_accuracy": 0.04707291163504124, "num_tokens": 278989.0, "step": 145 }, { "entropy": 10.068906784057617, "epoch": 0.008716622599296859, "grad_norm": 1.3828125, "learning_rate": 7.45e-05, "loss": 8.2195, "mean_token_accuracy": 0.04823922924697399, "num_tokens": 288770.0, "step": 150 }, { "entropy": 9.884156227111816, "epoch": 0.009007176685940088, "grad_norm": 1.234375, "learning_rate": 7.7e-05, "loss": 8.1604, "mean_token_accuracy": 0.05296766012907028, "num_tokens": 298368.0, "step": 155 }, { "entropy": 9.749515438079834, "epoch": 0.009297730772583316, "grad_norm": 1.125, "learning_rate": 7.950000000000001e-05, "loss": 7.9887, "mean_token_accuracy": 0.054083061218261716, "num_tokens": 307437.0, "step": 160 }, { "entropy": 9.539670753479005, "epoch": 0.009588284859226545, "grad_norm": 1.3828125, "learning_rate": 8.2e-05, "loss": 7.931, "mean_token_accuracy": 0.05368399284780025, "num_tokens": 317842.0, "step": 165 }, { "entropy": 9.367785167694091, "epoch": 0.009878838945869774, "grad_norm": 0.984375, "learning_rate": 8.450000000000001e-05, "loss": 7.7746, "mean_token_accuracy": 0.056211471930146216, "num_tokens": 327455.0, "step": 170 }, { "entropy": 9.106531143188477, "epoch": 0.010169393032513002, "grad_norm": 0.96484375, "learning_rate": 8.7e-05, "loss": 7.7023, "mean_token_accuracy": 0.059121083468198776, "num_tokens": 338593.0, "step": 175 }, { "entropy": 8.891216564178468, "epoch": 0.01045994711915623, "grad_norm": 1.0, "learning_rate": 8.95e-05, "loss": 7.6717, "mean_token_accuracy": 0.060001150518655774, "num_tokens": 348278.0, "step": 180 }, { "entropy": 8.690237522125244, "epoch": 0.01075050120579946, "grad_norm": 0.91796875, "learning_rate": 9.2e-05, "loss": 7.5848, "mean_token_accuracy": 0.060652027279138564, "num_tokens": 358293.0, "step": 185 }, { "entropy": 8.500970458984375, "epoch": 0.011041055292442687, "grad_norm": 0.70703125, "learning_rate": 9.45e-05, "loss": 7.6462, "mean_token_accuracy": 0.06345079019665718, "num_tokens": 368177.0, "step": 190 }, { "entropy": 8.432841682434082, "epoch": 0.011331609379085917, "grad_norm": 0.87890625, "learning_rate": 9.7e-05, "loss": 7.5041, "mean_token_accuracy": 0.06438801400363445, "num_tokens": 377258.0, "step": 195 }, { "entropy": 8.328762531280518, "epoch": 0.011622163465729146, "grad_norm": 0.79296875, "learning_rate": 9.95e-05, "loss": 7.515, "mean_token_accuracy": 0.06462946832180023, "num_tokens": 385931.0, "step": 200 }, { "entropy": 8.228355598449706, "epoch": 0.011912717552372373, "grad_norm": 0.984375, "learning_rate": 0.000102, "loss": 7.4262, "mean_token_accuracy": 0.06731356121599674, "num_tokens": 394370.0, "step": 205 }, { "entropy": 8.163572025299072, "epoch": 0.012203271639015602, "grad_norm": 0.765625, "learning_rate": 0.00010449999999999999, "loss": 7.5127, "mean_token_accuracy": 0.06187250129878521, "num_tokens": 405167.0, "step": 210 }, { "entropy": 8.144425964355468, "epoch": 0.012493825725658832, "grad_norm": 0.90234375, "learning_rate": 0.000107, "loss": 7.4823, "mean_token_accuracy": 0.06424942426383495, "num_tokens": 414954.0, "step": 215 }, { "entropy": 8.074434852600097, "epoch": 0.01278437981230206, "grad_norm": 1.0078125, "learning_rate": 0.0001095, "loss": 7.4379, "mean_token_accuracy": 0.07021872885525227, "num_tokens": 423806.0, "step": 220 }, { "entropy": 8.100719451904297, "epoch": 0.013074933898945288, "grad_norm": 1.1015625, "learning_rate": 0.000112, "loss": 7.4049, "mean_token_accuracy": 0.07006631046533585, "num_tokens": 433416.0, "step": 225 }, { "entropy": 8.068440341949463, "epoch": 0.013365487985588518, "grad_norm": 1.015625, "learning_rate": 0.0001145, "loss": 7.4086, "mean_token_accuracy": 0.0656484205275774, "num_tokens": 443237.0, "step": 230 }, { "entropy": 8.008077144622803, "epoch": 0.013656042072231747, "grad_norm": 1.0078125, "learning_rate": 0.00011700000000000001, "loss": 7.3811, "mean_token_accuracy": 0.07138268202543259, "num_tokens": 452334.0, "step": 235 }, { "entropy": 7.95733003616333, "epoch": 0.013946596158874974, "grad_norm": 1.0390625, "learning_rate": 0.00011949999999999999, "loss": 7.451, "mean_token_accuracy": 0.07069577798247337, "num_tokens": 462604.0, "step": 240 }, { "entropy": 7.985943031311035, "epoch": 0.014237150245518203, "grad_norm": 1.109375, "learning_rate": 0.000122, "loss": 7.3342, "mean_token_accuracy": 0.07418472990393639, "num_tokens": 472105.0, "step": 245 }, { "entropy": 7.985353708267212, "epoch": 0.014527704332161433, "grad_norm": 0.89453125, "learning_rate": 0.0001245, "loss": 7.3573, "mean_token_accuracy": 0.07192124761641025, "num_tokens": 481873.0, "step": 250 }, { "entropy": 7.852858924865723, "epoch": 0.01481825841880466, "grad_norm": 0.9140625, "learning_rate": 0.000127, "loss": 7.3134, "mean_token_accuracy": 0.07094009146094322, "num_tokens": 490776.0, "step": 255 }, { "entropy": 7.97090711593628, "epoch": 0.01510881250544789, "grad_norm": 0.97265625, "learning_rate": 0.0001295, "loss": 7.3459, "mean_token_accuracy": 0.06945950090885163, "num_tokens": 500237.0, "step": 260 }, { "entropy": 7.988322401046753, "epoch": 0.015399366592091119, "grad_norm": 1.0078125, "learning_rate": 0.000132, "loss": 7.3569, "mean_token_accuracy": 0.0719369538128376, "num_tokens": 509449.0, "step": 265 }, { "entropy": 7.863973140716553, "epoch": 0.015689920678734346, "grad_norm": 0.9296875, "learning_rate": 0.00013450000000000002, "loss": 7.3463, "mean_token_accuracy": 0.07629362866282463, "num_tokens": 519335.0, "step": 270 }, { "entropy": 7.850080347061157, "epoch": 0.015980474765377575, "grad_norm": 0.828125, "learning_rate": 0.00013700000000000002, "loss": 7.269, "mean_token_accuracy": 0.07348301075398922, "num_tokens": 529108.0, "step": 275 }, { "entropy": 7.803001642227173, "epoch": 0.016271028852020804, "grad_norm": 0.9609375, "learning_rate": 0.0001395, "loss": 7.3421, "mean_token_accuracy": 0.07442944496870041, "num_tokens": 539409.0, "step": 280 }, { "entropy": 7.8401947021484375, "epoch": 0.016561582938664034, "grad_norm": 0.98828125, "learning_rate": 0.00014199999999999998, "loss": 7.269, "mean_token_accuracy": 0.07476447969675064, "num_tokens": 549790.0, "step": 285 }, { "entropy": 7.773062610626221, "epoch": 0.01685213702530726, "grad_norm": 0.90234375, "learning_rate": 0.0001445, "loss": 7.2597, "mean_token_accuracy": 0.07743276208639145, "num_tokens": 559343.0, "step": 290 }, { "entropy": 7.833370351791382, "epoch": 0.01714269111195049, "grad_norm": 1.1171875, "learning_rate": 0.000147, "loss": 7.306, "mean_token_accuracy": 0.07507650516927242, "num_tokens": 568806.0, "step": 295 }, { "entropy": 7.692620134353637, "epoch": 0.017433245198593718, "grad_norm": 1.0703125, "learning_rate": 0.0001495, "loss": 7.1532, "mean_token_accuracy": 0.07671754881739616, "num_tokens": 578988.0, "step": 300 }, { "entropy": 7.840510559082031, "epoch": 0.017723799285236947, "grad_norm": 1.015625, "learning_rate": 0.000152, "loss": 7.258, "mean_token_accuracy": 0.0767325557768345, "num_tokens": 588588.0, "step": 305 }, { "entropy": 7.740892934799194, "epoch": 0.018014353371880176, "grad_norm": 0.9453125, "learning_rate": 0.00015450000000000001, "loss": 7.2385, "mean_token_accuracy": 0.07767370343208313, "num_tokens": 597957.0, "step": 310 }, { "entropy": 7.761815309524536, "epoch": 0.018304907458523405, "grad_norm": 0.8671875, "learning_rate": 0.000157, "loss": 7.2168, "mean_token_accuracy": 0.07732245922088624, "num_tokens": 607446.0, "step": 315 }, { "entropy": 7.723113679885865, "epoch": 0.01859546154516663, "grad_norm": 0.8359375, "learning_rate": 0.0001595, "loss": 7.1559, "mean_token_accuracy": 0.07753840312361718, "num_tokens": 617064.0, "step": 320 }, { "entropy": 7.695508337020874, "epoch": 0.01888601563180986, "grad_norm": 1.03125, "learning_rate": 0.000162, "loss": 7.2008, "mean_token_accuracy": 0.08057244047522545, "num_tokens": 625927.0, "step": 325 }, { "entropy": 7.717827177047729, "epoch": 0.01917656971845309, "grad_norm": 0.97265625, "learning_rate": 0.00016450000000000001, "loss": 7.1152, "mean_token_accuracy": 0.07994545996189117, "num_tokens": 635341.0, "step": 330 }, { "entropy": 7.675025224685669, "epoch": 0.01946712380509632, "grad_norm": 2.171875, "learning_rate": 0.00016700000000000002, "loss": 7.1106, "mean_token_accuracy": 0.08988085016608238, "num_tokens": 645095.0, "step": 335 }, { "entropy": 7.714554166793823, "epoch": 0.019757677891739548, "grad_norm": 1.0234375, "learning_rate": 0.00016950000000000003, "loss": 7.1558, "mean_token_accuracy": 0.0730321068316698, "num_tokens": 654754.0, "step": 340 }, { "entropy": 7.60111026763916, "epoch": 0.020048231978382777, "grad_norm": 0.8671875, "learning_rate": 0.00017199999999999998, "loss": 7.1266, "mean_token_accuracy": 0.07690966166555882, "num_tokens": 664589.0, "step": 345 }, { "entropy": 7.6628223896026615, "epoch": 0.020338786065026003, "grad_norm": 1.0703125, "learning_rate": 0.00017449999999999999, "loss": 7.1425, "mean_token_accuracy": 0.07918459475040436, "num_tokens": 673870.0, "step": 350 }, { "entropy": 7.577814197540283, "epoch": 0.020629340151669232, "grad_norm": 1.03125, "learning_rate": 0.000177, "loss": 7.1137, "mean_token_accuracy": 0.07997918874025345, "num_tokens": 684309.0, "step": 355 }, { "entropy": 7.6769345760345455, "epoch": 0.02091989423831246, "grad_norm": 1.359375, "learning_rate": 0.0001795, "loss": 7.1629, "mean_token_accuracy": 0.07469077445566655, "num_tokens": 693702.0, "step": 360 }, { "entropy": 7.534895896911621, "epoch": 0.02121044832495569, "grad_norm": 0.94140625, "learning_rate": 0.000182, "loss": 7.0599, "mean_token_accuracy": 0.07970957532525062, "num_tokens": 702951.0, "step": 365 }, { "entropy": 7.588031339645386, "epoch": 0.02150100241159892, "grad_norm": 1.25, "learning_rate": 0.0001845, "loss": 7.0677, "mean_token_accuracy": 0.08218754455447197, "num_tokens": 712481.0, "step": 370 }, { "entropy": 7.600922870635986, "epoch": 0.02179155649824215, "grad_norm": 1.046875, "learning_rate": 0.000187, "loss": 7.0683, "mean_token_accuracy": 0.08380770459771156, "num_tokens": 721579.0, "step": 375 }, { "entropy": 7.572713327407837, "epoch": 0.022082110584885375, "grad_norm": 1.03125, "learning_rate": 0.0001895, "loss": 7.0774, "mean_token_accuracy": 0.07982454895973205, "num_tokens": 731404.0, "step": 380 }, { "entropy": 7.548839807510376, "epoch": 0.022372664671528604, "grad_norm": 0.93359375, "learning_rate": 0.000192, "loss": 7.0556, "mean_token_accuracy": 0.07496214136481286, "num_tokens": 740751.0, "step": 385 }, { "entropy": 7.523876476287842, "epoch": 0.022663218758171833, "grad_norm": 0.9765625, "learning_rate": 0.0001945, "loss": 7.0247, "mean_token_accuracy": 0.08082472011446953, "num_tokens": 751171.0, "step": 390 }, { "entropy": 7.552808237075806, "epoch": 0.022953772844815062, "grad_norm": 1.078125, "learning_rate": 0.00019700000000000002, "loss": 7.0823, "mean_token_accuracy": 0.07615064568817616, "num_tokens": 760874.0, "step": 395 }, { "entropy": 7.583486127853393, "epoch": 0.02324432693145829, "grad_norm": 1.2734375, "learning_rate": 0.00019950000000000002, "loss": 7.0585, "mean_token_accuracy": 0.08326990716159344, "num_tokens": 769652.0, "step": 400 }, { "entropy": 7.488273334503174, "epoch": 0.02353488101810152, "grad_norm": 0.98828125, "learning_rate": 0.000202, "loss": 7.0421, "mean_token_accuracy": 0.07620194889605045, "num_tokens": 779591.0, "step": 405 }, { "entropy": 7.564187002182007, "epoch": 0.023825435104744747, "grad_norm": 0.94921875, "learning_rate": 0.00020449999999999998, "loss": 7.156, "mean_token_accuracy": 0.08098742663860321, "num_tokens": 789582.0, "step": 410 }, { "entropy": 7.506245565414429, "epoch": 0.024115989191387976, "grad_norm": 1.1015625, "learning_rate": 0.000207, "loss": 7.0546, "mean_token_accuracy": 0.07765479311347008, "num_tokens": 799146.0, "step": 415 }, { "entropy": 7.4926127910614015, "epoch": 0.024406543278031205, "grad_norm": 1.0390625, "learning_rate": 0.0002095, "loss": 7.0246, "mean_token_accuracy": 0.0782523088157177, "num_tokens": 808934.0, "step": 420 }, { "entropy": 7.532363748550415, "epoch": 0.024697097364674434, "grad_norm": 0.87109375, "learning_rate": 0.000212, "loss": 7.0821, "mean_token_accuracy": 0.07597277015447616, "num_tokens": 819280.0, "step": 425 }, { "entropy": 7.457432746887207, "epoch": 0.024987651451317663, "grad_norm": 0.97265625, "learning_rate": 0.0002145, "loss": 6.9892, "mean_token_accuracy": 0.0840725652873516, "num_tokens": 828818.0, "step": 430 }, { "entropy": 7.463752698898316, "epoch": 0.025278205537960893, "grad_norm": 1.0703125, "learning_rate": 0.00021700000000000002, "loss": 6.9816, "mean_token_accuracy": 0.08661384396255016, "num_tokens": 839175.0, "step": 435 }, { "entropy": 7.5449175357818605, "epoch": 0.02556875962460412, "grad_norm": 1.125, "learning_rate": 0.0002195, "loss": 7.0777, "mean_token_accuracy": 0.07947314418852329, "num_tokens": 849965.0, "step": 440 }, { "entropy": 7.392349624633789, "epoch": 0.025859313711247348, "grad_norm": 1.0859375, "learning_rate": 0.000222, "loss": 6.9968, "mean_token_accuracy": 0.08229465186595916, "num_tokens": 859229.0, "step": 445 }, { "entropy": 7.4397971630096436, "epoch": 0.026149867797890577, "grad_norm": 1.28125, "learning_rate": 0.0002245, "loss": 6.9708, "mean_token_accuracy": 0.0816520519554615, "num_tokens": 869199.0, "step": 450 }, { "entropy": 7.399962043762207, "epoch": 0.026440421884533806, "grad_norm": 0.93359375, "learning_rate": 0.00022700000000000002, "loss": 6.9666, "mean_token_accuracy": 0.09285714998841285, "num_tokens": 879470.0, "step": 455 }, { "entropy": 7.4366514682769775, "epoch": 0.026730975971177035, "grad_norm": 0.921875, "learning_rate": 0.00022950000000000002, "loss": 6.9274, "mean_token_accuracy": 0.0792453158646822, "num_tokens": 888397.0, "step": 460 }, { "entropy": 7.370485734939575, "epoch": 0.027021530057820264, "grad_norm": 0.953125, "learning_rate": 0.00023200000000000003, "loss": 6.8202, "mean_token_accuracy": 0.0855403620749712, "num_tokens": 898321.0, "step": 465 }, { "entropy": 7.4845947265625, "epoch": 0.027312084144463494, "grad_norm": 1.0546875, "learning_rate": 0.00023449999999999998, "loss": 7.0856, "mean_token_accuracy": 0.0808610200881958, "num_tokens": 907947.0, "step": 470 }, { "entropy": 7.327203702926636, "epoch": 0.02760263823110672, "grad_norm": 1.1015625, "learning_rate": 0.000237, "loss": 6.9103, "mean_token_accuracy": 0.0954340323805809, "num_tokens": 916842.0, "step": 475 }, { "entropy": 7.380954456329346, "epoch": 0.02789319231774995, "grad_norm": 1.046875, "learning_rate": 0.0002395, "loss": 7.0098, "mean_token_accuracy": 0.08165798112750053, "num_tokens": 926431.0, "step": 480 }, { "entropy": 7.412681722640992, "epoch": 0.028183746404393178, "grad_norm": 0.98828125, "learning_rate": 0.000242, "loss": 6.9162, "mean_token_accuracy": 0.08133741281926632, "num_tokens": 935819.0, "step": 485 }, { "entropy": 7.44426212310791, "epoch": 0.028474300491036407, "grad_norm": 1.15625, "learning_rate": 0.0002445, "loss": 6.9418, "mean_token_accuracy": 0.08402741849422454, "num_tokens": 944198.0, "step": 490 }, { "entropy": 7.264917373657227, "epoch": 0.028764854577679636, "grad_norm": 0.88671875, "learning_rate": 0.000247, "loss": 6.9628, "mean_token_accuracy": 0.08352083042263984, "num_tokens": 954972.0, "step": 495 }, { "entropy": 7.385922384262085, "epoch": 0.029055408664322865, "grad_norm": 1.0703125, "learning_rate": 0.0002495, "loss": 6.9018, "mean_token_accuracy": 0.08520250022411346, "num_tokens": 964532.0, "step": 500 }, { "entropy": 7.475071048736572, "epoch": 0.02934596275096609, "grad_norm": 1.1171875, "learning_rate": 0.000252, "loss": 6.9955, "mean_token_accuracy": 0.07958225682377815, "num_tokens": 974547.0, "step": 505 }, { "entropy": 7.299204540252686, "epoch": 0.02963651683760932, "grad_norm": 0.98046875, "learning_rate": 0.0002545, "loss": 6.935, "mean_token_accuracy": 0.08022963926196099, "num_tokens": 984245.0, "step": 510 }, { "entropy": 7.318370199203491, "epoch": 0.02992707092425255, "grad_norm": 0.94140625, "learning_rate": 0.000257, "loss": 6.7766, "mean_token_accuracy": 0.08500204458832741, "num_tokens": 994400.0, "step": 515 }, { "entropy": 7.352757215499878, "epoch": 0.03021762501089578, "grad_norm": 1.2109375, "learning_rate": 0.0002595, "loss": 7.0024, "mean_token_accuracy": 0.07765024341642857, "num_tokens": 1005775.0, "step": 520 }, { "entropy": 7.312537145614624, "epoch": 0.030508179097539008, "grad_norm": 1.0859375, "learning_rate": 0.000262, "loss": 6.9055, "mean_token_accuracy": 0.08693855553865433, "num_tokens": 1015386.0, "step": 525 }, { "entropy": 7.383286190032959, "epoch": 0.030798733184182237, "grad_norm": 1.0859375, "learning_rate": 0.00026450000000000003, "loss": 6.8994, "mean_token_accuracy": 0.09188547134399414, "num_tokens": 1024963.0, "step": 530 }, { "entropy": 7.249363946914673, "epoch": 0.031089287270825463, "grad_norm": 0.9921875, "learning_rate": 0.00026700000000000004, "loss": 6.8996, "mean_token_accuracy": 0.08531768508255481, "num_tokens": 1034667.0, "step": 535 }, { "entropy": 7.265355777740479, "epoch": 0.03137984135746869, "grad_norm": 0.9921875, "learning_rate": 0.00026950000000000005, "loss": 6.8796, "mean_token_accuracy": 0.08795020580291749, "num_tokens": 1044171.0, "step": 540 }, { "entropy": 7.295146417617798, "epoch": 0.031670395444111925, "grad_norm": 1.1171875, "learning_rate": 0.00027200000000000005, "loss": 6.8538, "mean_token_accuracy": 0.08691519349813462, "num_tokens": 1053585.0, "step": 545 }, { "entropy": 7.237406063079834, "epoch": 0.03196094953075515, "grad_norm": 1.15625, "learning_rate": 0.0002745, "loss": 6.7515, "mean_token_accuracy": 0.09050033241510391, "num_tokens": 1063310.0, "step": 550 }, { "entropy": 7.263738679885864, "epoch": 0.032251503617398376, "grad_norm": 0.953125, "learning_rate": 0.000277, "loss": 6.8651, "mean_token_accuracy": 0.08824861124157905, "num_tokens": 1073529.0, "step": 555 }, { "entropy": 7.175330972671508, "epoch": 0.03254205770404161, "grad_norm": 1.109375, "learning_rate": 0.0002795, "loss": 6.8319, "mean_token_accuracy": 0.08951647505164147, "num_tokens": 1083432.0, "step": 560 }, { "entropy": 7.184946346282959, "epoch": 0.032832611790684835, "grad_norm": 0.953125, "learning_rate": 0.00028199999999999997, "loss": 6.8004, "mean_token_accuracy": 0.09656240493059158, "num_tokens": 1092453.0, "step": 565 }, { "entropy": 7.274725437164307, "epoch": 0.03312316587732807, "grad_norm": 1.03125, "learning_rate": 0.0002845, "loss": 6.8865, "mean_token_accuracy": 0.08661114051938057, "num_tokens": 1102402.0, "step": 570 }, { "entropy": 7.303795433044433, "epoch": 0.03341371996397129, "grad_norm": 1.1171875, "learning_rate": 0.000287, "loss": 6.8928, "mean_token_accuracy": 0.09610759019851685, "num_tokens": 1111907.0, "step": 575 }, { "entropy": 7.228280067443848, "epoch": 0.03370427405061452, "grad_norm": 1.125, "learning_rate": 0.0002895, "loss": 6.7846, "mean_token_accuracy": 0.09133462607860565, "num_tokens": 1120712.0, "step": 580 }, { "entropy": 7.0720751762390135, "epoch": 0.03399482813725775, "grad_norm": 1.0703125, "learning_rate": 0.000292, "loss": 6.6691, "mean_token_accuracy": 0.0894063800573349, "num_tokens": 1131165.0, "step": 585 }, { "entropy": 7.229758644104004, "epoch": 0.03428538222390098, "grad_norm": 1.0625, "learning_rate": 0.0002945, "loss": 6.8337, "mean_token_accuracy": 0.08700250834226608, "num_tokens": 1140527.0, "step": 590 }, { "entropy": 7.137591791152954, "epoch": 0.03457593631054421, "grad_norm": 1.140625, "learning_rate": 0.000297, "loss": 6.792, "mean_token_accuracy": 0.08842456936836243, "num_tokens": 1149977.0, "step": 595 }, { "entropy": 7.240325021743774, "epoch": 0.034866490397187436, "grad_norm": 1.1328125, "learning_rate": 0.0002995, "loss": 6.8153, "mean_token_accuracy": 0.08972005397081376, "num_tokens": 1159918.0, "step": 600 }, { "entropy": 7.116828918457031, "epoch": 0.03515704448383067, "grad_norm": 0.96484375, "learning_rate": 0.000302, "loss": 6.7965, "mean_token_accuracy": 0.08587550893425941, "num_tokens": 1169218.0, "step": 605 }, { "entropy": 7.1641600131988525, "epoch": 0.035447598570473894, "grad_norm": 1.1953125, "learning_rate": 0.0003045, "loss": 6.8058, "mean_token_accuracy": 0.09056585654616356, "num_tokens": 1179429.0, "step": 610 }, { "entropy": 7.0538177490234375, "epoch": 0.03573815265711712, "grad_norm": 0.953125, "learning_rate": 0.000307, "loss": 6.7051, "mean_token_accuracy": 0.0951805867254734, "num_tokens": 1189379.0, "step": 615 }, { "entropy": 7.165834856033325, "epoch": 0.03602870674376035, "grad_norm": 1.1328125, "learning_rate": 0.0003095, "loss": 6.6834, "mean_token_accuracy": 0.09452618882060052, "num_tokens": 1198643.0, "step": 620 }, { "entropy": 7.1435986995697025, "epoch": 0.03631926083040358, "grad_norm": 1.203125, "learning_rate": 0.000312, "loss": 6.8985, "mean_token_accuracy": 0.08901753202080727, "num_tokens": 1207933.0, "step": 625 }, { "entropy": 7.125590705871582, "epoch": 0.03660981491704681, "grad_norm": 1.1640625, "learning_rate": 0.0003145, "loss": 6.7771, "mean_token_accuracy": 0.09473630785942078, "num_tokens": 1217000.0, "step": 630 }, { "entropy": 7.342123746871948, "epoch": 0.03690036900369004, "grad_norm": 1.1796875, "learning_rate": 0.000317, "loss": 6.8715, "mean_token_accuracy": 0.08738602064549923, "num_tokens": 1227054.0, "step": 635 }, { "entropy": 7.0751423835754395, "epoch": 0.03719092309033326, "grad_norm": 1.0625, "learning_rate": 0.0003195, "loss": 6.8639, "mean_token_accuracy": 0.08903967961668968, "num_tokens": 1237126.0, "step": 640 }, { "entropy": 7.132748985290528, "epoch": 0.037481477176976495, "grad_norm": 1.140625, "learning_rate": 0.000322, "loss": 6.7309, "mean_token_accuracy": 0.09907565861940384, "num_tokens": 1247404.0, "step": 645 }, { "entropy": 7.105540752410889, "epoch": 0.03777203126361972, "grad_norm": 0.97265625, "learning_rate": 0.00032450000000000003, "loss": 6.6672, "mean_token_accuracy": 0.08641588017344475, "num_tokens": 1257130.0, "step": 650 }, { "entropy": 7.073269605636597, "epoch": 0.038062585350262954, "grad_norm": 1.0234375, "learning_rate": 0.00032700000000000003, "loss": 6.7423, "mean_token_accuracy": 0.09811322540044784, "num_tokens": 1266931.0, "step": 655 }, { "entropy": 7.157707405090332, "epoch": 0.03835313943690618, "grad_norm": 0.8828125, "learning_rate": 0.00032950000000000004, "loss": 6.7753, "mean_token_accuracy": 0.08842945359647274, "num_tokens": 1277770.0, "step": 660 }, { "entropy": 7.074891519546509, "epoch": 0.03864369352354941, "grad_norm": 1.1953125, "learning_rate": 0.00033200000000000005, "loss": 6.6966, "mean_token_accuracy": 0.09733218997716904, "num_tokens": 1287188.0, "step": 665 }, { "entropy": 7.035866546630859, "epoch": 0.03893424761019264, "grad_norm": 1.046875, "learning_rate": 0.00033450000000000005, "loss": 6.7408, "mean_token_accuracy": 0.09134816229343415, "num_tokens": 1297038.0, "step": 670 }, { "entropy": 7.091120624542237, "epoch": 0.03922480169683586, "grad_norm": 0.984375, "learning_rate": 0.000337, "loss": 6.6964, "mean_token_accuracy": 0.09473009631037713, "num_tokens": 1306860.0, "step": 675 }, { "entropy": 7.030598735809326, "epoch": 0.039515355783479096, "grad_norm": 0.94140625, "learning_rate": 0.0003395, "loss": 6.6668, "mean_token_accuracy": 0.09435953348875045, "num_tokens": 1316585.0, "step": 680 }, { "entropy": 7.1326805591583256, "epoch": 0.03980590987012232, "grad_norm": 1.0546875, "learning_rate": 0.000342, "loss": 6.7282, "mean_token_accuracy": 0.09551571607589722, "num_tokens": 1325601.0, "step": 685 }, { "entropy": 7.101321458816528, "epoch": 0.040096463956765555, "grad_norm": 1.0625, "learning_rate": 0.00034449999999999997, "loss": 6.7604, "mean_token_accuracy": 0.09247554913163185, "num_tokens": 1336305.0, "step": 690 }, { "entropy": 7.1049731254577635, "epoch": 0.04038701804340878, "grad_norm": 1.140625, "learning_rate": 0.000347, "loss": 6.6507, "mean_token_accuracy": 0.09341847449541092, "num_tokens": 1344820.0, "step": 695 }, { "entropy": 6.997063255310058, "epoch": 0.040677572130052006, "grad_norm": 1.125, "learning_rate": 0.0003495, "loss": 6.6331, "mean_token_accuracy": 0.09355669766664505, "num_tokens": 1353950.0, "step": 700 }, { "entropy": 7.01454758644104, "epoch": 0.04096812621669524, "grad_norm": 1.0078125, "learning_rate": 0.000352, "loss": 6.7545, "mean_token_accuracy": 0.09254956245422363, "num_tokens": 1364881.0, "step": 705 }, { "entropy": 7.0095212936401365, "epoch": 0.041258680303338464, "grad_norm": 1.078125, "learning_rate": 0.0003545, "loss": 6.7061, "mean_token_accuracy": 0.09260506108403206, "num_tokens": 1374018.0, "step": 710 }, { "entropy": 7.11537013053894, "epoch": 0.0415492343899817, "grad_norm": 1.0625, "learning_rate": 0.000357, "loss": 6.6946, "mean_token_accuracy": 0.08821133449673653, "num_tokens": 1384319.0, "step": 715 }, { "entropy": 6.958690166473389, "epoch": 0.04183978847662492, "grad_norm": 1.0546875, "learning_rate": 0.0003595, "loss": 6.5713, "mean_token_accuracy": 0.09440450817346573, "num_tokens": 1393753.0, "step": 720 }, { "entropy": 6.922836446762085, "epoch": 0.042130342563268156, "grad_norm": 1.046875, "learning_rate": 0.000362, "loss": 6.6616, "mean_token_accuracy": 0.09427325800061226, "num_tokens": 1403599.0, "step": 725 }, { "entropy": 7.020907402038574, "epoch": 0.04242089664991138, "grad_norm": 1.1484375, "learning_rate": 0.0003645, "loss": 6.6611, "mean_token_accuracy": 0.10043973848223686, "num_tokens": 1412508.0, "step": 730 }, { "entropy": 7.071925306320191, "epoch": 0.04271145073655461, "grad_norm": 1.125, "learning_rate": 0.000367, "loss": 6.8015, "mean_token_accuracy": 0.0910523734986782, "num_tokens": 1422776.0, "step": 735 }, { "entropy": 6.998428392410278, "epoch": 0.04300200482319784, "grad_norm": 1.140625, "learning_rate": 0.0003695, "loss": 6.6414, "mean_token_accuracy": 0.09633751660585403, "num_tokens": 1432901.0, "step": 740 }, { "entropy": 7.035877513885498, "epoch": 0.043292558909841065, "grad_norm": 1.0625, "learning_rate": 0.000372, "loss": 6.677, "mean_token_accuracy": 0.09542910531163215, "num_tokens": 1442916.0, "step": 745 }, { "entropy": 6.878139925003052, "epoch": 0.0435831129964843, "grad_norm": 1.0078125, "learning_rate": 0.0003745, "loss": 6.5395, "mean_token_accuracy": 0.09616116657853127, "num_tokens": 1453037.0, "step": 750 }, { "entropy": 6.96289029121399, "epoch": 0.043873667083127524, "grad_norm": 1.0703125, "learning_rate": 0.000377, "loss": 6.6196, "mean_token_accuracy": 0.10786209627985954, "num_tokens": 1461963.0, "step": 755 }, { "entropy": 7.00122447013855, "epoch": 0.04416422116977075, "grad_norm": 1.15625, "learning_rate": 0.0003795, "loss": 6.7012, "mean_token_accuracy": 0.09169812574982643, "num_tokens": 1471521.0, "step": 760 }, { "entropy": 6.930304098129272, "epoch": 0.04445477525641398, "grad_norm": 1.21875, "learning_rate": 0.000382, "loss": 6.5366, "mean_token_accuracy": 0.0987947553396225, "num_tokens": 1481438.0, "step": 765 }, { "entropy": 6.89730920791626, "epoch": 0.04474532934305721, "grad_norm": 1.1171875, "learning_rate": 0.0003845, "loss": 6.5654, "mean_token_accuracy": 0.09912522435188294, "num_tokens": 1490522.0, "step": 770 }, { "entropy": 6.994078540802002, "epoch": 0.04503588342970044, "grad_norm": 0.96875, "learning_rate": 0.00038700000000000003, "loss": 6.7343, "mean_token_accuracy": 0.09250347167253495, "num_tokens": 1501034.0, "step": 775 }, { "entropy": 6.894172525405883, "epoch": 0.045326437516343666, "grad_norm": 1.1796875, "learning_rate": 0.00038950000000000003, "loss": 6.5391, "mean_token_accuracy": 0.10528326034545898, "num_tokens": 1510390.0, "step": 780 }, { "entropy": 6.992980337142944, "epoch": 0.0456169916029869, "grad_norm": 1.21875, "learning_rate": 0.00039200000000000004, "loss": 6.6468, "mean_token_accuracy": 0.09232402816414834, "num_tokens": 1520048.0, "step": 785 }, { "entropy": 6.977211618423462, "epoch": 0.045907545689630125, "grad_norm": 1.2578125, "learning_rate": 0.00039450000000000005, "loss": 6.5275, "mean_token_accuracy": 0.10221462920308114, "num_tokens": 1529113.0, "step": 790 }, { "entropy": 6.760094785690308, "epoch": 0.04619809977627335, "grad_norm": 1.09375, "learning_rate": 0.00039700000000000005, "loss": 6.6057, "mean_token_accuracy": 0.09887640923261642, "num_tokens": 1538573.0, "step": 795 }, { "entropy": 6.975562715530396, "epoch": 0.04648865386291658, "grad_norm": 1.1640625, "learning_rate": 0.0003995, "loss": 6.6064, "mean_token_accuracy": 0.10373581051826478, "num_tokens": 1547471.0, "step": 800 }, { "entropy": 6.8805656909942625, "epoch": 0.04677920794955981, "grad_norm": 1.015625, "learning_rate": 0.000402, "loss": 6.5641, "mean_token_accuracy": 0.10285315811634063, "num_tokens": 1557259.0, "step": 805 }, { "entropy": 7.063277673721314, "epoch": 0.04706976203620304, "grad_norm": 0.98046875, "learning_rate": 0.0004045, "loss": 6.7921, "mean_token_accuracy": 0.09200607016682624, "num_tokens": 1567383.0, "step": 810 }, { "entropy": 6.87684121131897, "epoch": 0.04736031612284627, "grad_norm": 1.078125, "learning_rate": 0.00040699999999999997, "loss": 6.4826, "mean_token_accuracy": 0.11064840331673623, "num_tokens": 1577106.0, "step": 815 }, { "entropy": 6.807673025131225, "epoch": 0.04765087020948949, "grad_norm": 1.0703125, "learning_rate": 0.0004095, "loss": 6.5393, "mean_token_accuracy": 0.10080247670412064, "num_tokens": 1586100.0, "step": 820 }, { "entropy": 6.877712535858154, "epoch": 0.047941424296132726, "grad_norm": 1.0859375, "learning_rate": 0.000412, "loss": 6.6279, "mean_token_accuracy": 0.09564873427152634, "num_tokens": 1596950.0, "step": 825 }, { "entropy": 6.891899585723877, "epoch": 0.04823197838277595, "grad_norm": 1.0859375, "learning_rate": 0.0004145, "loss": 6.5837, "mean_token_accuracy": 0.09832958057522774, "num_tokens": 1606001.0, "step": 830 }, { "entropy": 6.978082180023193, "epoch": 0.048522532469419184, "grad_norm": 1.0390625, "learning_rate": 0.000417, "loss": 6.6825, "mean_token_accuracy": 0.0975476372987032, "num_tokens": 1616498.0, "step": 835 }, { "entropy": 6.831979036331177, "epoch": 0.04881308655606241, "grad_norm": 1.2109375, "learning_rate": 0.0004195, "loss": 6.5199, "mean_token_accuracy": 0.10347988307476044, "num_tokens": 1625195.0, "step": 840 }, { "entropy": 6.784482002258301, "epoch": 0.04910364064270564, "grad_norm": 1.0, "learning_rate": 0.000422, "loss": 6.4476, "mean_token_accuracy": 0.10162880271673203, "num_tokens": 1635176.0, "step": 845 }, { "entropy": 6.806185960769653, "epoch": 0.04939419472934887, "grad_norm": 1.09375, "learning_rate": 0.0004245, "loss": 6.553, "mean_token_accuracy": 0.1015662670135498, "num_tokens": 1645183.0, "step": 850 }, { "entropy": 6.801709985733032, "epoch": 0.049684748815992094, "grad_norm": 1.046875, "learning_rate": 0.000427, "loss": 6.5479, "mean_token_accuracy": 0.10148834735155106, "num_tokens": 1654226.0, "step": 855 }, { "entropy": 6.834500074386597, "epoch": 0.04997530290263533, "grad_norm": 1.0078125, "learning_rate": 0.0004295, "loss": 6.5426, "mean_token_accuracy": 0.10362305790185929, "num_tokens": 1664572.0, "step": 860 }, { "entropy": 6.950858306884766, "epoch": 0.05026585698927855, "grad_norm": 1.0234375, "learning_rate": 0.000432, "loss": 6.6472, "mean_token_accuracy": 0.09981537386775016, "num_tokens": 1674070.0, "step": 865 }, { "entropy": 6.791647720336914, "epoch": 0.050556411075921785, "grad_norm": 1.1171875, "learning_rate": 0.0004345, "loss": 6.4773, "mean_token_accuracy": 0.09943379536271095, "num_tokens": 1683473.0, "step": 870 }, { "entropy": 6.777591514587402, "epoch": 0.05084696516256501, "grad_norm": 1.1328125, "learning_rate": 0.000437, "loss": 6.4869, "mean_token_accuracy": 0.10118941962718964, "num_tokens": 1693171.0, "step": 875 }, { "entropy": 6.898639726638794, "epoch": 0.05113751924920824, "grad_norm": 0.953125, "learning_rate": 0.0004395, "loss": 6.606, "mean_token_accuracy": 0.09705074802041054, "num_tokens": 1703023.0, "step": 880 }, { "entropy": 6.73418025970459, "epoch": 0.05142807333585147, "grad_norm": 1.0703125, "learning_rate": 0.000442, "loss": 6.4984, "mean_token_accuracy": 0.1019330695271492, "num_tokens": 1712698.0, "step": 885 }, { "entropy": 6.906363248825073, "epoch": 0.051718627422494695, "grad_norm": 1.1171875, "learning_rate": 0.0004445, "loss": 6.6098, "mean_token_accuracy": 0.09838435426354408, "num_tokens": 1721502.0, "step": 890 }, { "entropy": 6.7474723815917965, "epoch": 0.05200918150913793, "grad_norm": 1.0546875, "learning_rate": 0.000447, "loss": 6.4942, "mean_token_accuracy": 0.10594057068228721, "num_tokens": 1730551.0, "step": 895 }, { "entropy": 6.808920383453369, "epoch": 0.052299735595781154, "grad_norm": 1.015625, "learning_rate": 0.00044950000000000003, "loss": 6.5645, "mean_token_accuracy": 0.10622440055012702, "num_tokens": 1739368.0, "step": 900 }, { "entropy": 6.827513933181763, "epoch": 0.052590289682424386, "grad_norm": 1.140625, "learning_rate": 0.00045200000000000004, "loss": 6.4176, "mean_token_accuracy": 0.11146403327584267, "num_tokens": 1748528.0, "step": 905 }, { "entropy": 6.713736248016358, "epoch": 0.05288084376906761, "grad_norm": 0.9765625, "learning_rate": 0.00045450000000000004, "loss": 6.5739, "mean_token_accuracy": 0.09899114519357681, "num_tokens": 1759569.0, "step": 910 }, { "entropy": 6.80773286819458, "epoch": 0.05317139785571084, "grad_norm": 1.15625, "learning_rate": 0.00045700000000000005, "loss": 6.5099, "mean_token_accuracy": 0.10788461863994599, "num_tokens": 1769366.0, "step": 915 }, { "entropy": 6.76817569732666, "epoch": 0.05346195194235407, "grad_norm": 1.046875, "learning_rate": 0.00045950000000000006, "loss": 6.6024, "mean_token_accuracy": 0.09936894476413727, "num_tokens": 1780155.0, "step": 920 }, { "entropy": 6.755830335617065, "epoch": 0.053752506028997296, "grad_norm": 1.1484375, "learning_rate": 0.000462, "loss": 6.4233, "mean_token_accuracy": 0.10512633025646209, "num_tokens": 1789436.0, "step": 925 }, { "entropy": 6.823408889770508, "epoch": 0.05404306011564053, "grad_norm": 1.1640625, "learning_rate": 0.0004645, "loss": 6.5652, "mean_token_accuracy": 0.0998048096895218, "num_tokens": 1798836.0, "step": 930 }, { "entropy": 6.751146364212036, "epoch": 0.054333614202283755, "grad_norm": 1.03125, "learning_rate": 0.000467, "loss": 6.444, "mean_token_accuracy": 0.10532717406749725, "num_tokens": 1808666.0, "step": 935 }, { "entropy": 6.8108867645263675, "epoch": 0.05462416828892699, "grad_norm": 1.1171875, "learning_rate": 0.0004695, "loss": 6.5972, "mean_token_accuracy": 0.09496863186359406, "num_tokens": 1820001.0, "step": 940 }, { "entropy": 6.751294231414795, "epoch": 0.05491472237557021, "grad_norm": 1.03125, "learning_rate": 0.000472, "loss": 6.4693, "mean_token_accuracy": 0.10566612035036087, "num_tokens": 1830284.0, "step": 945 }, { "entropy": 6.820448493957519, "epoch": 0.05520527646221344, "grad_norm": 1.1171875, "learning_rate": 0.0004745, "loss": 6.4794, "mean_token_accuracy": 0.10577797368168831, "num_tokens": 1839930.0, "step": 950 }, { "entropy": 6.629036235809326, "epoch": 0.05549583054885667, "grad_norm": 0.98046875, "learning_rate": 0.000477, "loss": 6.5675, "mean_token_accuracy": 0.10090194195508957, "num_tokens": 1850697.0, "step": 955 }, { "entropy": 6.817226839065552, "epoch": 0.0557863846354999, "grad_norm": 1.0234375, "learning_rate": 0.0004795, "loss": 6.497, "mean_token_accuracy": 0.10740380734205246, "num_tokens": 1860196.0, "step": 960 }, { "entropy": 6.774875259399414, "epoch": 0.05607693872214313, "grad_norm": 1.0546875, "learning_rate": 0.000482, "loss": 6.47, "mean_token_accuracy": 0.1075842596590519, "num_tokens": 1869000.0, "step": 965 }, { "entropy": 6.722468996047974, "epoch": 0.056367492808786356, "grad_norm": 1.1875, "learning_rate": 0.0004845, "loss": 6.469, "mean_token_accuracy": 0.10600791200995445, "num_tokens": 1878687.0, "step": 970 }, { "entropy": 6.728367662429809, "epoch": 0.05665804689542958, "grad_norm": 1.1640625, "learning_rate": 0.000487, "loss": 6.3467, "mean_token_accuracy": 0.10569515079259872, "num_tokens": 1886914.0, "step": 975 }, { "entropy": 6.671978425979614, "epoch": 0.056948600982072814, "grad_norm": 0.97265625, "learning_rate": 0.0004895, "loss": 6.5321, "mean_token_accuracy": 0.10422437414526939, "num_tokens": 1897392.0, "step": 980 }, { "entropy": 6.805356025695801, "epoch": 0.05723915506871604, "grad_norm": 1.109375, "learning_rate": 0.000492, "loss": 6.488, "mean_token_accuracy": 0.10600305423140526, "num_tokens": 1906215.0, "step": 985 }, { "entropy": 6.8313037872314455, "epoch": 0.05752970915535927, "grad_norm": 1.0859375, "learning_rate": 0.0004945, "loss": 6.5017, "mean_token_accuracy": 0.10730748698115349, "num_tokens": 1915376.0, "step": 990 }, { "entropy": 6.659111022949219, "epoch": 0.0578202632420025, "grad_norm": 1.03125, "learning_rate": 0.000497, "loss": 6.465, "mean_token_accuracy": 0.10440039038658142, "num_tokens": 1925558.0, "step": 995 }, { "entropy": 6.676358318328857, "epoch": 0.05811081732864573, "grad_norm": 1.0625, "learning_rate": 0.0004995, "loss": 6.4301, "mean_token_accuracy": 0.10430914014577866, "num_tokens": 1935176.0, "step": 1000 }, { "entropy": 6.770152616500854, "epoch": 0.05840137141528896, "grad_norm": 0.9921875, "learning_rate": 0.000499998026082006, "loss": 6.4924, "mean_token_accuracy": 0.10445862039923667, "num_tokens": 1945135.0, "step": 1005 }, { "entropy": 6.597527885437012, "epoch": 0.05869192550193218, "grad_norm": 1.1875, "learning_rate": 0.0004999900070995136, "loss": 6.4838, "mean_token_accuracy": 0.10765932872891426, "num_tokens": 1955585.0, "step": 1010 }, { "entropy": 6.867468976974488, "epoch": 0.058982479588575415, "grad_norm": 1.125, "learning_rate": 0.0004999758199023239, "loss": 6.4687, "mean_token_accuracy": 0.10314074084162712, "num_tokens": 1964750.0, "step": 1015 }, { "entropy": 6.624800300598144, "epoch": 0.05927303367521864, "grad_norm": 1.0, "learning_rate": 0.0004999554648793858, "loss": 6.5436, "mean_token_accuracy": 0.10335941463708878, "num_tokens": 1974697.0, "step": 1020 }, { "entropy": 6.7362236976623535, "epoch": 0.05956358776186187, "grad_norm": 1.09375, "learning_rate": 0.0004999289425887425, "loss": 6.4934, "mean_token_accuracy": 0.10554013177752494, "num_tokens": 1983384.0, "step": 1025 }, { "entropy": 6.754078722000122, "epoch": 0.0598541418485051, "grad_norm": 0.98828125, "learning_rate": 0.0004998962537575161, "loss": 6.5229, "mean_token_accuracy": 0.11017107889056206, "num_tokens": 1993790.0, "step": 1030 }, { "entropy": 6.697407197952271, "epoch": 0.060144695935148325, "grad_norm": 1.046875, "learning_rate": 0.0004998573992818874, "loss": 6.4027, "mean_token_accuracy": 0.10623413920402527, "num_tokens": 2003296.0, "step": 1035 }, { "entropy": 6.585323095321655, "epoch": 0.06043525002179156, "grad_norm": 1.0625, "learning_rate": 0.0004998123802270715, "loss": 6.3345, "mean_token_accuracy": 0.11027837991714477, "num_tokens": 2012481.0, "step": 1040 }, { "entropy": 6.705205965042114, "epoch": 0.06072580410843478, "grad_norm": 1.1796875, "learning_rate": 0.0004997611978272886, "loss": 6.4994, "mean_token_accuracy": 0.10490612536668778, "num_tokens": 2022382.0, "step": 1045 }, { "entropy": 6.638956928253174, "epoch": 0.061016358195078016, "grad_norm": 1.0546875, "learning_rate": 0.0004997038534857298, "loss": 6.4097, "mean_token_accuracy": 0.11042128577828407, "num_tokens": 2032290.0, "step": 1050 }, { "entropy": 6.6624797821044925, "epoch": 0.06130691228172124, "grad_norm": 0.984375, "learning_rate": 0.0004996403487745194, "loss": 6.3594, "mean_token_accuracy": 0.10972521901130676, "num_tokens": 2041094.0, "step": 1055 }, { "entropy": 6.609392881393433, "epoch": 0.061597466368364474, "grad_norm": 1.109375, "learning_rate": 0.000499570685434671, "loss": 6.5125, "mean_token_accuracy": 0.10544388592243195, "num_tokens": 2051169.0, "step": 1060 }, { "entropy": 6.6946526050567625, "epoch": 0.0618880204550077, "grad_norm": 1.03125, "learning_rate": 0.0004994948653760405, "loss": 6.3966, "mean_token_accuracy": 0.1103939101099968, "num_tokens": 2061310.0, "step": 1065 }, { "entropy": 6.619559907913208, "epoch": 0.062178574541650926, "grad_norm": 1.0390625, "learning_rate": 0.0004994128906772729, "loss": 6.3829, "mean_token_accuracy": 0.10736953839659691, "num_tokens": 2071537.0, "step": 1070 }, { "entropy": 6.6101906299591064, "epoch": 0.06246912862829416, "grad_norm": 0.9296875, "learning_rate": 0.000499324763585746, "loss": 6.4507, "mean_token_accuracy": 0.10780780464410782, "num_tokens": 2082540.0, "step": 1075 }, { "entropy": 6.621304225921631, "epoch": 0.06275968271493738, "grad_norm": 1.1328125, "learning_rate": 0.0004992304865175085, "loss": 6.4413, "mean_token_accuracy": 0.11023736447095871, "num_tokens": 2091313.0, "step": 1080 }, { "entropy": 6.691177225112915, "epoch": 0.06305023680158062, "grad_norm": 1.0234375, "learning_rate": 0.0004991300620572138, "loss": 6.4862, "mean_token_accuracy": 0.10716225057840348, "num_tokens": 2100826.0, "step": 1085 }, { "entropy": 6.671515083312988, "epoch": 0.06334079088822385, "grad_norm": 1.0625, "learning_rate": 0.0004990234929580494, "loss": 6.4177, "mean_token_accuracy": 0.10876795202493668, "num_tokens": 2109798.0, "step": 1090 }, { "entropy": 6.640522909164429, "epoch": 0.06363134497486707, "grad_norm": 0.9765625, "learning_rate": 0.0004989107821416609, "loss": 6.3138, "mean_token_accuracy": 0.11188038140535354, "num_tokens": 2119641.0, "step": 1095 }, { "entropy": 6.565330696105957, "epoch": 0.0639218990615103, "grad_norm": 1.140625, "learning_rate": 0.0004987919326980723, "loss": 6.3525, "mean_token_accuracy": 0.11164129376411439, "num_tokens": 2128724.0, "step": 1100 }, { "entropy": 6.521946573257447, "epoch": 0.06421245314815353, "grad_norm": 1.109375, "learning_rate": 0.0004986669478856011, "loss": 6.2737, "mean_token_accuracy": 0.11544388085603714, "num_tokens": 2137251.0, "step": 1105 }, { "entropy": 6.6156073093414305, "epoch": 0.06450300723479675, "grad_norm": 1.0, "learning_rate": 0.0004985358311307688, "loss": 6.3821, "mean_token_accuracy": 0.118138437718153, "num_tokens": 2146978.0, "step": 1110 }, { "entropy": 6.669202089309692, "epoch": 0.06479356132143999, "grad_norm": 0.98046875, "learning_rate": 0.0004983985860282081, "loss": 6.4636, "mean_token_accuracy": 0.10260412320494652, "num_tokens": 2157153.0, "step": 1115 }, { "entropy": 6.475356149673462, "epoch": 0.06508411540808322, "grad_norm": 0.9609375, "learning_rate": 0.0004982552163405623, "loss": 6.3599, "mean_token_accuracy": 0.11348235085606576, "num_tokens": 2166946.0, "step": 1120 }, { "entropy": 6.657857656478882, "epoch": 0.06537466949472644, "grad_norm": 1.0703125, "learning_rate": 0.0004981057259983839, "loss": 6.3772, "mean_token_accuracy": 0.11038358807563782, "num_tokens": 2177249.0, "step": 1125 }, { "entropy": 6.466132879257202, "epoch": 0.06566522358136967, "grad_norm": 0.99609375, "learning_rate": 0.0004979501191000262, "loss": 6.3098, "mean_token_accuracy": 0.11056527942419052, "num_tokens": 2187240.0, "step": 1130 }, { "entropy": 6.6453643321990965, "epoch": 0.0659557776680129, "grad_norm": 1.0625, "learning_rate": 0.0004977883999115311, "loss": 6.3145, "mean_token_accuracy": 0.11672020331025124, "num_tokens": 2196199.0, "step": 1135 }, { "entropy": 6.595391893386841, "epoch": 0.06624633175465613, "grad_norm": 1.0703125, "learning_rate": 0.0004976205728665113, "loss": 6.2689, "mean_token_accuracy": 0.11631305515766144, "num_tokens": 2205726.0, "step": 1140 }, { "entropy": 6.587292861938477, "epoch": 0.06653688584129935, "grad_norm": 0.9765625, "learning_rate": 0.0004974466425660307, "loss": 6.4457, "mean_token_accuracy": 0.10664665251970291, "num_tokens": 2216552.0, "step": 1145 }, { "entropy": 6.597306776046753, "epoch": 0.06682743992794259, "grad_norm": 0.953125, "learning_rate": 0.0004972666137784759, "loss": 6.3034, "mean_token_accuracy": 0.11342373788356781, "num_tokens": 2225935.0, "step": 1150 }, { "entropy": 6.644480466842651, "epoch": 0.06711799401458582, "grad_norm": 0.953125, "learning_rate": 0.0004970804914394271, "loss": 6.4604, "mean_token_accuracy": 0.11499964445829391, "num_tokens": 2235907.0, "step": 1155 }, { "entropy": 6.599408388137817, "epoch": 0.06740854810122904, "grad_norm": 1.1328125, "learning_rate": 0.0004968882806515225, "loss": 6.3881, "mean_token_accuracy": 0.10959212481975555, "num_tokens": 2244473.0, "step": 1160 }, { "entropy": 6.641416931152344, "epoch": 0.06769910218787227, "grad_norm": 1.1875, "learning_rate": 0.0004966899866843177, "loss": 6.4123, "mean_token_accuracy": 0.1027280792593956, "num_tokens": 2253834.0, "step": 1165 }, { "entropy": 6.5416028022766115, "epoch": 0.0679896562745155, "grad_norm": 1.015625, "learning_rate": 0.000496485614974142, "loss": 6.3413, "mean_token_accuracy": 0.11207354813814163, "num_tokens": 2263243.0, "step": 1170 }, { "entropy": 6.6198502540588375, "epoch": 0.06828021036115874, "grad_norm": 1.0859375, "learning_rate": 0.0004962751711239492, "loss": 6.3035, "mean_token_accuracy": 0.11463942378759384, "num_tokens": 2273008.0, "step": 1175 }, { "entropy": 6.430229234695434, "epoch": 0.06857076444780195, "grad_norm": 1.0078125, "learning_rate": 0.0004960586609031636, "loss": 6.3457, "mean_token_accuracy": 0.1155870608985424, "num_tokens": 2282522.0, "step": 1180 }, { "entropy": 6.601986408233643, "epoch": 0.06886131853444519, "grad_norm": 1.0625, "learning_rate": 0.0004958360902475224, "loss": 6.2529, "mean_token_accuracy": 0.12027783617377281, "num_tokens": 2292114.0, "step": 1185 }, { "entropy": 6.400939083099365, "epoch": 0.06915187262108842, "grad_norm": 0.94921875, "learning_rate": 0.0004956074652589125, "loss": 6.1978, "mean_token_accuracy": 0.12538810446858406, "num_tokens": 2301592.0, "step": 1190 }, { "entropy": 6.51713194847107, "epoch": 0.06944242670773164, "grad_norm": 0.9921875, "learning_rate": 0.0004953727922052035, "loss": 6.3201, "mean_token_accuracy": 0.11454231590032578, "num_tokens": 2310940.0, "step": 1195 }, { "entropy": 6.463452672958374, "epoch": 0.06973298079437487, "grad_norm": 1.0703125, "learning_rate": 0.0004951320775200756, "loss": 6.3959, "mean_token_accuracy": 0.1151392012834549, "num_tokens": 2320535.0, "step": 1200 }, { "entropy": 6.596390962600708, "epoch": 0.0700235348810181, "grad_norm": 0.96875, "learning_rate": 0.0004948853278028436, "loss": 6.2563, "mean_token_accuracy": 0.12523823976516724, "num_tokens": 2330431.0, "step": 1205 }, { "entropy": 6.3869446277618405, "epoch": 0.07031408896766134, "grad_norm": 1.0546875, "learning_rate": 0.0004946325498182755, "loss": 6.2036, "mean_token_accuracy": 0.12079060897231102, "num_tokens": 2339323.0, "step": 1210 }, { "entropy": 6.510322713851929, "epoch": 0.07060464305430456, "grad_norm": 1.0390625, "learning_rate": 0.0004943737504964076, "loss": 6.2992, "mean_token_accuracy": 0.11487918049097061, "num_tokens": 2349750.0, "step": 1215 }, { "entropy": 6.503530073165893, "epoch": 0.07089519714094779, "grad_norm": 1.1171875, "learning_rate": 0.000494108936932354, "loss": 6.2558, "mean_token_accuracy": 0.1210679478943348, "num_tokens": 2359147.0, "step": 1220 }, { "entropy": 6.520279359817505, "epoch": 0.07118575122759102, "grad_norm": 0.953125, "learning_rate": 0.0004938381163861124, "loss": 6.2786, "mean_token_accuracy": 0.11829182729125023, "num_tokens": 2368762.0, "step": 1225 }, { "entropy": 6.391372203826904, "epoch": 0.07147630531423424, "grad_norm": 0.9765625, "learning_rate": 0.0004935612962823645, "loss": 6.1568, "mean_token_accuracy": 0.12013374790549278, "num_tokens": 2378060.0, "step": 1230 }, { "entropy": 6.465664291381836, "epoch": 0.07176685940087747, "grad_norm": 1.0625, "learning_rate": 0.0004932784842102739, "loss": 6.2575, "mean_token_accuracy": 0.12200002744793892, "num_tokens": 2386997.0, "step": 1235 }, { "entropy": 6.6493157863616945, "epoch": 0.0720574134875207, "grad_norm": 1.2578125, "learning_rate": 0.0004929896879232758, "loss": 6.4026, "mean_token_accuracy": 0.11086667999625206, "num_tokens": 2396980.0, "step": 1240 }, { "entropy": 6.435001850128174, "epoch": 0.07234796757416392, "grad_norm": 1.0703125, "learning_rate": 0.0004926949153388668, "loss": 6.2556, "mean_token_accuracy": 0.1203616626560688, "num_tokens": 2406450.0, "step": 1245 }, { "entropy": 6.519892168045044, "epoch": 0.07263852166080716, "grad_norm": 1.03125, "learning_rate": 0.0004923941745383859, "loss": 6.2632, "mean_token_accuracy": 0.11274134442210197, "num_tokens": 2415985.0, "step": 1250 }, { "entropy": 6.457003879547119, "epoch": 0.07292907574745039, "grad_norm": 0.94921875, "learning_rate": 0.000492087473766794, "loss": 6.2928, "mean_token_accuracy": 0.11486212983727455, "num_tokens": 2425676.0, "step": 1255 }, { "entropy": 6.508018493652344, "epoch": 0.07321962983409362, "grad_norm": 1.0, "learning_rate": 0.000491774821432448, "loss": 6.2922, "mean_token_accuracy": 0.10985862240195274, "num_tokens": 2435918.0, "step": 1260 }, { "entropy": 6.5097509860992435, "epoch": 0.07351018392073684, "grad_norm": 1.0703125, "learning_rate": 0.0004914562261068693, "loss": 6.3562, "mean_token_accuracy": 0.11788229197263718, "num_tokens": 2445267.0, "step": 1265 }, { "entropy": 6.599736261367798, "epoch": 0.07380073800738007, "grad_norm": 1.140625, "learning_rate": 0.0004911316965245098, "loss": 6.3224, "mean_token_accuracy": 0.11191006749868393, "num_tokens": 2455885.0, "step": 1270 }, { "entropy": 6.489064168930054, "epoch": 0.0740912920940233, "grad_norm": 1.0234375, "learning_rate": 0.000490801241582512, "loss": 6.3483, "mean_token_accuracy": 0.11579938605427742, "num_tokens": 2465604.0, "step": 1275 }, { "entropy": 6.5532605171203615, "epoch": 0.07438184618066652, "grad_norm": 1.1015625, "learning_rate": 0.000490464870340465, "loss": 6.4458, "mean_token_accuracy": 0.10784725919365883, "num_tokens": 2475168.0, "step": 1280 }, { "entropy": 6.473039054870606, "epoch": 0.07467240026730976, "grad_norm": 1.1796875, "learning_rate": 0.0004901225920201563, "loss": 6.2243, "mean_token_accuracy": 0.12185250818729401, "num_tokens": 2484185.0, "step": 1285 }, { "entropy": 6.583461809158325, "epoch": 0.07496295435395299, "grad_norm": 1.1171875, "learning_rate": 0.000489774416005319, "loss": 6.3387, "mean_token_accuracy": 0.11904568299651146, "num_tokens": 2492992.0, "step": 1290 }, { "entropy": 6.418948078155518, "epoch": 0.07525350844059622, "grad_norm": 1.03125, "learning_rate": 0.0004894203518413742, "loss": 6.2065, "mean_token_accuracy": 0.119369375705719, "num_tokens": 2502541.0, "step": 1295 }, { "entropy": 6.468045377731324, "epoch": 0.07554406252723944, "grad_norm": 1.0546875, "learning_rate": 0.0004890604092351701, "loss": 6.2364, "mean_token_accuracy": 0.11862708181142807, "num_tokens": 2511947.0, "step": 1300 }, { "entropy": 6.385909509658814, "epoch": 0.07583461661388267, "grad_norm": 1.0703125, "learning_rate": 0.000488694598054715, "loss": 6.2525, "mean_token_accuracy": 0.12124920263886452, "num_tokens": 2521727.0, "step": 1305 }, { "entropy": 6.531244993209839, "epoch": 0.07612517070052591, "grad_norm": 1.0625, "learning_rate": 0.0004883229283289071, "loss": 6.2694, "mean_token_accuracy": 0.1218131199479103, "num_tokens": 2530680.0, "step": 1310 }, { "entropy": 6.422513055801391, "epoch": 0.07641572478716913, "grad_norm": 1.0703125, "learning_rate": 0.00048794541024725993, "loss": 6.1542, "mean_token_accuracy": 0.12266649156808854, "num_tokens": 2539414.0, "step": 1315 }, { "entropy": 6.491461181640625, "epoch": 0.07670627887381236, "grad_norm": 1.0390625, "learning_rate": 0.0004875620541596221, "loss": 6.3072, "mean_token_accuracy": 0.1141884945333004, "num_tokens": 2549609.0, "step": 1320 }, { "entropy": 6.4648158073425295, "epoch": 0.07699683296045559, "grad_norm": 1.0625, "learning_rate": 0.00048717287057589454, "loss": 6.2773, "mean_token_accuracy": 0.11799687221646309, "num_tokens": 2560081.0, "step": 1325 }, { "entropy": 6.400183534622192, "epoch": 0.07728738704709882, "grad_norm": 1.09375, "learning_rate": 0.0004867778701657417, "loss": 6.2328, "mean_token_accuracy": 0.11631238982081413, "num_tokens": 2569995.0, "step": 1330 }, { "entropy": 6.37140007019043, "epoch": 0.07757794113374204, "grad_norm": 1.046875, "learning_rate": 0.00048637706375829955, "loss": 6.1738, "mean_token_accuracy": 0.1213558554649353, "num_tokens": 2579502.0, "step": 1335 }, { "entropy": 6.476347970962524, "epoch": 0.07786849522038528, "grad_norm": 0.9921875, "learning_rate": 0.000485970462341878, "loss": 6.2553, "mean_token_accuracy": 0.12006450816988945, "num_tokens": 2589515.0, "step": 1340 }, { "entropy": 6.434140920639038, "epoch": 0.07815904930702851, "grad_norm": 1.0859375, "learning_rate": 0.00048555807706366044, "loss": 6.1897, "mean_token_accuracy": 0.12782623916864394, "num_tokens": 2598822.0, "step": 1345 }, { "entropy": 6.443134021759033, "epoch": 0.07844960339367173, "grad_norm": 0.93359375, "learning_rate": 0.00048513991922939756, "loss": 6.315, "mean_token_accuracy": 0.11421679928898812, "num_tokens": 2609169.0, "step": 1350 }, { "entropy": 6.484804105758667, "epoch": 0.07874015748031496, "grad_norm": 0.98046875, "learning_rate": 0.00048471600030309744, "loss": 6.2716, "mean_token_accuracy": 0.11644304916262627, "num_tokens": 2618683.0, "step": 1355 }, { "entropy": 6.466926431655883, "epoch": 0.07903071156695819, "grad_norm": 1.140625, "learning_rate": 0.00048428633190671186, "loss": 6.2371, "mean_token_accuracy": 0.12091248780488968, "num_tokens": 2627976.0, "step": 1360 }, { "entropy": 6.505730533599854, "epoch": 0.07932126565360141, "grad_norm": 1.0703125, "learning_rate": 0.0004838509258198167, "loss": 6.294, "mean_token_accuracy": 0.11860666498541832, "num_tokens": 2637235.0, "step": 1365 }, { "entropy": 6.393795537948608, "epoch": 0.07961181974024464, "grad_norm": 0.984375, "learning_rate": 0.00048340979397929, "loss": 6.2951, "mean_token_accuracy": 0.11754858568310737, "num_tokens": 2646698.0, "step": 1370 }, { "entropy": 6.505375099182129, "epoch": 0.07990237382688788, "grad_norm": 1.125, "learning_rate": 0.00048296294847898386, "loss": 6.2788, "mean_token_accuracy": 0.12090856656432152, "num_tokens": 2656357.0, "step": 1375 }, { "entropy": 6.434703159332275, "epoch": 0.08019292791353111, "grad_norm": 1.0859375, "learning_rate": 0.0004825104015693934, "loss": 6.1776, "mean_token_accuracy": 0.11764631941914558, "num_tokens": 2665561.0, "step": 1380 }, { "entropy": 6.437805318832398, "epoch": 0.08048348200017433, "grad_norm": 1.0859375, "learning_rate": 0.0004820521656573208, "loss": 6.1909, "mean_token_accuracy": 0.12296778410673141, "num_tokens": 2674600.0, "step": 1385 }, { "entropy": 6.368801641464233, "epoch": 0.08077403608681756, "grad_norm": 1.0234375, "learning_rate": 0.00048158825330553505, "loss": 6.1838, "mean_token_accuracy": 0.12880179584026336, "num_tokens": 2684944.0, "step": 1390 }, { "entropy": 6.461294555664063, "epoch": 0.0810645901734608, "grad_norm": 1.0078125, "learning_rate": 0.00048111867723242763, "loss": 6.1342, "mean_token_accuracy": 0.12006727010011672, "num_tokens": 2694467.0, "step": 1395 }, { "entropy": 6.442787504196167, "epoch": 0.08135514426010401, "grad_norm": 1.0546875, "learning_rate": 0.0004806434503116637, "loss": 6.2769, "mean_token_accuracy": 0.11950750723481178, "num_tokens": 2704499.0, "step": 1400 }, { "entropy": 6.378614997863769, "epoch": 0.08164569834674724, "grad_norm": 1.0, "learning_rate": 0.0004801625855718296, "loss": 6.1896, "mean_token_accuracy": 0.11940810978412628, "num_tokens": 2715424.0, "step": 1405 }, { "entropy": 6.41011266708374, "epoch": 0.08193625243339048, "grad_norm": 1.09375, "learning_rate": 0.00047967609619607477, "loss": 6.1788, "mean_token_accuracy": 0.12036006227135658, "num_tokens": 2724805.0, "step": 1410 }, { "entropy": 6.3130451202392575, "epoch": 0.08222680652003371, "grad_norm": 1.0234375, "learning_rate": 0.0004791839955217513, "loss": 6.1481, "mean_token_accuracy": 0.12863539010286332, "num_tokens": 2734216.0, "step": 1415 }, { "entropy": 6.424062490463257, "epoch": 0.08251736060667693, "grad_norm": 1.0234375, "learning_rate": 0.00047868629704004786, "loss": 6.2572, "mean_token_accuracy": 0.11476619765162469, "num_tokens": 2744146.0, "step": 1420 }, { "entropy": 6.422879314422607, "epoch": 0.08280791469332016, "grad_norm": 1.046875, "learning_rate": 0.00047818301439561965, "loss": 6.2419, "mean_token_accuracy": 0.12102322354912758, "num_tokens": 2754000.0, "step": 1425 }, { "entropy": 6.637474250793457, "epoch": 0.0830984687799634, "grad_norm": 1.046875, "learning_rate": 0.00047767416138621454, "loss": 6.288, "mean_token_accuracy": 0.11775907129049301, "num_tokens": 2763185.0, "step": 1430 }, { "entropy": 6.372423696517944, "epoch": 0.08338902286660661, "grad_norm": 1.078125, "learning_rate": 0.000477159751962295, "loss": 6.2381, "mean_token_accuracy": 0.11884959116578102, "num_tokens": 2773324.0, "step": 1435 }, { "entropy": 6.485676908493042, "epoch": 0.08367957695324985, "grad_norm": 1.0546875, "learning_rate": 0.00047663980022665507, "loss": 6.2207, "mean_token_accuracy": 0.11649533435702324, "num_tokens": 2783184.0, "step": 1440 }, { "entropy": 6.396980142593383, "epoch": 0.08397013103989308, "grad_norm": 0.9296875, "learning_rate": 0.00047611432043403437, "loss": 6.2223, "mean_token_accuracy": 0.11544240266084671, "num_tokens": 2793278.0, "step": 1445 }, { "entropy": 6.366146802902222, "epoch": 0.08426068512653631, "grad_norm": 1.0625, "learning_rate": 0.0004755833269907267, "loss": 6.1262, "mean_token_accuracy": 0.12203074395656585, "num_tokens": 2802164.0, "step": 1450 }, { "entropy": 6.457718706130981, "epoch": 0.08455123921317953, "grad_norm": 1.0078125, "learning_rate": 0.0004750468344541857, "loss": 6.1891, "mean_token_accuracy": 0.11854342371225357, "num_tokens": 2811537.0, "step": 1455 }, { "entropy": 6.381798458099365, "epoch": 0.08484179329982276, "grad_norm": 1.0546875, "learning_rate": 0.00047450485753262525, "loss": 6.2965, "mean_token_accuracy": 0.11684540212154389, "num_tokens": 2821861.0, "step": 1460 }, { "entropy": 6.412109518051148, "epoch": 0.085132347386466, "grad_norm": 0.98046875, "learning_rate": 0.00047395741108461633, "loss": 6.1718, "mean_token_accuracy": 0.12374548763036727, "num_tokens": 2831916.0, "step": 1465 }, { "entropy": 6.33392972946167, "epoch": 0.08542290147310921, "grad_norm": 1.0546875, "learning_rate": 0.00047340451011867985, "loss": 6.1604, "mean_token_accuracy": 0.12683377638459206, "num_tokens": 2840979.0, "step": 1470 }, { "entropy": 6.418259906768799, "epoch": 0.08571345555975245, "grad_norm": 1.1015625, "learning_rate": 0.00047284616979287515, "loss": 6.1782, "mean_token_accuracy": 0.11932171955704689, "num_tokens": 2851332.0, "step": 1475 }, { "entropy": 6.265405559539795, "epoch": 0.08600400964639568, "grad_norm": 1.03125, "learning_rate": 0.00047228240541438433, "loss": 6.073, "mean_token_accuracy": 0.12999156266450881, "num_tokens": 2860134.0, "step": 1480 }, { "entropy": 6.458755302429199, "epoch": 0.08629456373303891, "grad_norm": 1.1171875, "learning_rate": 0.00047171323243909257, "loss": 6.2126, "mean_token_accuracy": 0.11848914325237274, "num_tokens": 2869218.0, "step": 1485 }, { "entropy": 6.345139837265014, "epoch": 0.08658511781968213, "grad_norm": 0.98828125, "learning_rate": 0.00047113866647116457, "loss": 6.1426, "mean_token_accuracy": 0.12274593263864517, "num_tokens": 2878529.0, "step": 1490 }, { "entropy": 6.426075124740601, "epoch": 0.08687567190632536, "grad_norm": 1.0625, "learning_rate": 0.0004705587232626164, "loss": 6.1579, "mean_token_accuracy": 0.11727055683732032, "num_tokens": 2888149.0, "step": 1495 }, { "entropy": 6.3561450958251955, "epoch": 0.0871662259929686, "grad_norm": 1.03125, "learning_rate": 0.00046997341871288424, "loss": 6.1347, "mean_token_accuracy": 0.12332948073744773, "num_tokens": 2897790.0, "step": 1500 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 629831493550080.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }