{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.029055408664322865, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742584228515625, "epoch": 0.0002905540866432286, "grad_norm": 4.90625, "learning_rate": 2e-06, "loss": 10.7837, "mean_token_accuracy": 0.0, "num_tokens": 10156.0, "step": 5 }, { "entropy": 10.742587471008301, "epoch": 0.0005811081732864572, "grad_norm": 4.8125, "learning_rate": 4.5e-06, "loss": 10.7753, "mean_token_accuracy": 9.267840650863945e-05, "num_tokens": 20933.0, "step": 10 }, { "entropy": 10.74257869720459, "epoch": 0.0008716622599296859, "grad_norm": 4.375, "learning_rate": 7e-06, "loss": 10.7508, "mean_token_accuracy": 0.0, "num_tokens": 31298.0, "step": 15 }, { "entropy": 10.742635726928711, "epoch": 0.0011622163465729145, "grad_norm": 4.875, "learning_rate": 9.5e-06, "loss": 10.697, "mean_token_accuracy": 0.0, "num_tokens": 40913.0, "step": 20 }, { "entropy": 10.742652702331544, "epoch": 0.0014527704332161432, "grad_norm": 4.28125, "learning_rate": 1.2e-05, "loss": 10.5798, "mean_token_accuracy": 0.0007269373920280487, "num_tokens": 49901.0, "step": 25 }, { "entropy": 10.742454719543456, "epoch": 0.0017433245198593718, "grad_norm": 4.0625, "learning_rate": 1.4500000000000002e-05, "loss": 10.4688, "mean_token_accuracy": 0.01560134175233543, "num_tokens": 59328.0, "step": 30 }, { "entropy": 10.741775226593017, "epoch": 0.0020338786065026006, "grad_norm": 3.25, "learning_rate": 1.7000000000000003e-05, "loss": 10.3287, "mean_token_accuracy": 0.037073963694274424, "num_tokens": 68405.0, "step": 35 }, { "entropy": 10.740037631988525, "epoch": 0.002324432693145829, "grad_norm": 2.578125, "learning_rate": 1.95e-05, "loss": 10.2203, "mean_token_accuracy": 0.037133642472326756, "num_tokens": 77591.0, "step": 40 }, { "entropy": 10.73731575012207, "epoch": 0.0026149867797890577, "grad_norm": 2.359375, "learning_rate": 2.2e-05, "loss": 10.1202, "mean_token_accuracy": 0.03901108838617802, "num_tokens": 88186.0, "step": 45 }, { "entropy": 10.734606838226318, "epoch": 0.0029055408664322865, "grad_norm": 2.09375, "learning_rate": 2.4500000000000003e-05, "loss": 10.0211, "mean_token_accuracy": 0.04241710864007473, "num_tokens": 97594.0, "step": 50 }, { "entropy": 10.73211612701416, "epoch": 0.003196094953075515, "grad_norm": 1.9921875, "learning_rate": 2.7e-05, "loss": 9.9871, "mean_token_accuracy": 0.03826836366206408, "num_tokens": 107386.0, "step": 55 }, { "entropy": 10.73102445602417, "epoch": 0.0034866490397187436, "grad_norm": 1.9140625, "learning_rate": 2.95e-05, "loss": 9.9132, "mean_token_accuracy": 0.03943221494555473, "num_tokens": 116742.0, "step": 60 }, { "entropy": 10.729645252227783, "epoch": 0.0037772031263619723, "grad_norm": 1.859375, "learning_rate": 3.2e-05, "loss": 9.8519, "mean_token_accuracy": 0.03962419871240854, "num_tokens": 126520.0, "step": 65 }, { "entropy": 10.727834033966065, "epoch": 0.004067757213005201, "grad_norm": 1.7734375, "learning_rate": 3.4500000000000005e-05, "loss": 9.7907, "mean_token_accuracy": 0.03989919070154428, "num_tokens": 136382.0, "step": 70 }, { "entropy": 10.724947452545166, "epoch": 0.0043583112996484295, "grad_norm": 1.7421875, "learning_rate": 3.7e-05, "loss": 9.7212, "mean_token_accuracy": 0.03671109899878502, "num_tokens": 146435.0, "step": 75 }, { "entropy": 10.72182493209839, "epoch": 0.004648865386291658, "grad_norm": 1.8359375, "learning_rate": 3.95e-05, "loss": 9.6591, "mean_token_accuracy": 0.037667426839470865, "num_tokens": 156174.0, "step": 80 }, { "entropy": 10.71723222732544, "epoch": 0.004939419472934887, "grad_norm": 1.765625, "learning_rate": 4.2000000000000004e-05, "loss": 9.5783, "mean_token_accuracy": 0.04142397493124008, "num_tokens": 165118.0, "step": 85 }, { "entropy": 10.708585739135742, "epoch": 0.005229973559578115, "grad_norm": 1.875, "learning_rate": 4.45e-05, "loss": 9.5252, "mean_token_accuracy": 0.04036426953971386, "num_tokens": 174401.0, "step": 90 }, { "entropy": 10.697160243988037, "epoch": 0.005520527646221344, "grad_norm": 1.765625, "learning_rate": 4.7000000000000004e-05, "loss": 9.443, "mean_token_accuracy": 0.04118307866156101, "num_tokens": 183533.0, "step": 95 }, { "entropy": 10.683875274658202, "epoch": 0.005811081732864573, "grad_norm": 1.7734375, "learning_rate": 4.9500000000000004e-05, "loss": 9.3596, "mean_token_accuracy": 0.045602331310510634, "num_tokens": 193296.0, "step": 100 }, { "entropy": 10.665638542175293, "epoch": 0.006101635819507801, "grad_norm": 1.75, "learning_rate": 5.2e-05, "loss": 9.2242, "mean_token_accuracy": 0.055989645794034, "num_tokens": 202741.0, "step": 105 }, { "entropy": 10.637047958374023, "epoch": 0.00639218990615103, "grad_norm": 1.703125, "learning_rate": 5.45e-05, "loss": 9.1359, "mean_token_accuracy": 0.05134495124220848, "num_tokens": 212441.0, "step": 110 }, { "entropy": 10.61000461578369, "epoch": 0.006682743992794259, "grad_norm": 1.6875, "learning_rate": 5.7e-05, "loss": 8.9868, "mean_token_accuracy": 0.04918566383421421, "num_tokens": 220671.0, "step": 115 }, { "entropy": 10.56931962966919, "epoch": 0.006973298079437487, "grad_norm": 1.7578125, "learning_rate": 5.9499999999999996e-05, "loss": 8.9878, "mean_token_accuracy": 0.04560479037463665, "num_tokens": 231390.0, "step": 120 }, { "entropy": 10.515452098846435, "epoch": 0.007263852166080716, "grad_norm": 1.5859375, "learning_rate": 6.2e-05, "loss": 8.8241, "mean_token_accuracy": 0.05023673102259636, "num_tokens": 241137.0, "step": 125 }, { "entropy": 10.430156230926514, "epoch": 0.007554406252723945, "grad_norm": 1.578125, "learning_rate": 6.450000000000001e-05, "loss": 8.6778, "mean_token_accuracy": 0.05138532817363739, "num_tokens": 250627.0, "step": 130 }, { "entropy": 10.353140926361084, "epoch": 0.007844960339367173, "grad_norm": 1.5859375, "learning_rate": 6.7e-05, "loss": 8.5255, "mean_token_accuracy": 0.05529710613191128, "num_tokens": 259564.0, "step": 135 }, { "entropy": 10.262280082702636, "epoch": 0.008135514426010402, "grad_norm": 1.453125, "learning_rate": 6.950000000000001e-05, "loss": 8.4168, "mean_token_accuracy": 0.05102897398173809, "num_tokens": 268997.0, "step": 140 }, { "entropy": 10.17268762588501, "epoch": 0.00842606851265363, "grad_norm": 1.40625, "learning_rate": 7.2e-05, "loss": 8.402, "mean_token_accuracy": 0.04707291163504124, "num_tokens": 278989.0, "step": 145 }, { "entropy": 10.068906784057617, "epoch": 0.008716622599296859, "grad_norm": 1.3828125, "learning_rate": 7.45e-05, "loss": 8.2195, "mean_token_accuracy": 0.04823922924697399, "num_tokens": 288770.0, "step": 150 }, { "entropy": 9.884156227111816, "epoch": 0.009007176685940088, "grad_norm": 1.234375, "learning_rate": 7.7e-05, "loss": 8.1604, "mean_token_accuracy": 0.05296766012907028, "num_tokens": 298368.0, "step": 155 }, { "entropy": 9.749515438079834, "epoch": 0.009297730772583316, "grad_norm": 1.125, "learning_rate": 7.950000000000001e-05, "loss": 7.9887, "mean_token_accuracy": 0.054083061218261716, "num_tokens": 307437.0, "step": 160 }, { "entropy": 9.539670753479005, "epoch": 0.009588284859226545, "grad_norm": 1.3828125, "learning_rate": 8.2e-05, "loss": 7.931, "mean_token_accuracy": 0.05368399284780025, "num_tokens": 317842.0, "step": 165 }, { "entropy": 9.367785167694091, "epoch": 0.009878838945869774, "grad_norm": 0.984375, "learning_rate": 8.450000000000001e-05, "loss": 7.7746, "mean_token_accuracy": 0.056211471930146216, "num_tokens": 327455.0, "step": 170 }, { "entropy": 9.106531143188477, "epoch": 0.010169393032513002, "grad_norm": 0.96484375, "learning_rate": 8.7e-05, "loss": 7.7023, "mean_token_accuracy": 0.059121083468198776, "num_tokens": 338593.0, "step": 175 }, { "entropy": 8.891216564178468, "epoch": 0.01045994711915623, "grad_norm": 1.0, "learning_rate": 8.95e-05, "loss": 7.6717, "mean_token_accuracy": 0.060001150518655774, "num_tokens": 348278.0, "step": 180 }, { "entropy": 8.690237522125244, "epoch": 0.01075050120579946, "grad_norm": 0.91796875, "learning_rate": 9.2e-05, "loss": 7.5848, "mean_token_accuracy": 0.060652027279138564, "num_tokens": 358293.0, "step": 185 }, { "entropy": 8.500970458984375, "epoch": 0.011041055292442687, "grad_norm": 0.70703125, "learning_rate": 9.45e-05, "loss": 7.6462, "mean_token_accuracy": 0.06345079019665718, "num_tokens": 368177.0, "step": 190 }, { "entropy": 8.432841682434082, "epoch": 0.011331609379085917, "grad_norm": 0.87890625, "learning_rate": 9.7e-05, "loss": 7.5041, "mean_token_accuracy": 0.06438801400363445, "num_tokens": 377258.0, "step": 195 }, { "entropy": 8.328762531280518, "epoch": 0.011622163465729146, "grad_norm": 0.79296875, "learning_rate": 9.95e-05, "loss": 7.515, "mean_token_accuracy": 0.06462946832180023, "num_tokens": 385931.0, "step": 200 }, { "entropy": 8.228355598449706, "epoch": 0.011912717552372373, "grad_norm": 0.984375, "learning_rate": 0.000102, "loss": 7.4262, "mean_token_accuracy": 0.06731356121599674, "num_tokens": 394370.0, "step": 205 }, { "entropy": 8.163572025299072, "epoch": 0.012203271639015602, "grad_norm": 0.765625, "learning_rate": 0.00010449999999999999, "loss": 7.5127, "mean_token_accuracy": 0.06187250129878521, "num_tokens": 405167.0, "step": 210 }, { "entropy": 8.144425964355468, "epoch": 0.012493825725658832, "grad_norm": 0.90234375, "learning_rate": 0.000107, "loss": 7.4823, "mean_token_accuracy": 0.06424942426383495, "num_tokens": 414954.0, "step": 215 }, { "entropy": 8.074434852600097, "epoch": 0.01278437981230206, "grad_norm": 1.0078125, "learning_rate": 0.0001095, "loss": 7.4379, "mean_token_accuracy": 0.07021872885525227, "num_tokens": 423806.0, "step": 220 }, { "entropy": 8.100719451904297, "epoch": 0.013074933898945288, "grad_norm": 1.1015625, "learning_rate": 0.000112, "loss": 7.4049, "mean_token_accuracy": 0.07006631046533585, "num_tokens": 433416.0, "step": 225 }, { "entropy": 8.068440341949463, "epoch": 0.013365487985588518, "grad_norm": 1.015625, "learning_rate": 0.0001145, "loss": 7.4086, "mean_token_accuracy": 0.0656484205275774, "num_tokens": 443237.0, "step": 230 }, { "entropy": 8.008077144622803, "epoch": 0.013656042072231747, "grad_norm": 1.0078125, "learning_rate": 0.00011700000000000001, "loss": 7.3811, "mean_token_accuracy": 0.07138268202543259, "num_tokens": 452334.0, "step": 235 }, { "entropy": 7.95733003616333, "epoch": 0.013946596158874974, "grad_norm": 1.0390625, "learning_rate": 0.00011949999999999999, "loss": 7.451, "mean_token_accuracy": 0.07069577798247337, "num_tokens": 462604.0, "step": 240 }, { "entropy": 7.985943031311035, "epoch": 0.014237150245518203, "grad_norm": 1.109375, "learning_rate": 0.000122, "loss": 7.3342, "mean_token_accuracy": 0.07418472990393639, "num_tokens": 472105.0, "step": 245 }, { "entropy": 7.985353708267212, "epoch": 0.014527704332161433, "grad_norm": 0.89453125, "learning_rate": 0.0001245, "loss": 7.3573, "mean_token_accuracy": 0.07192124761641025, "num_tokens": 481873.0, "step": 250 }, { "entropy": 7.852858924865723, "epoch": 0.01481825841880466, "grad_norm": 0.9140625, "learning_rate": 0.000127, "loss": 7.3134, "mean_token_accuracy": 0.07094009146094322, "num_tokens": 490776.0, "step": 255 }, { "entropy": 7.97090711593628, "epoch": 0.01510881250544789, "grad_norm": 0.97265625, "learning_rate": 0.0001295, "loss": 7.3459, "mean_token_accuracy": 0.06945950090885163, "num_tokens": 500237.0, "step": 260 }, { "entropy": 7.988322401046753, "epoch": 0.015399366592091119, "grad_norm": 1.0078125, "learning_rate": 0.000132, "loss": 7.3569, "mean_token_accuracy": 0.0719369538128376, "num_tokens": 509449.0, "step": 265 }, { "entropy": 7.863973140716553, "epoch": 0.015689920678734346, "grad_norm": 0.9296875, "learning_rate": 0.00013450000000000002, "loss": 7.3463, "mean_token_accuracy": 0.07629362866282463, "num_tokens": 519335.0, "step": 270 }, { "entropy": 7.850080347061157, "epoch": 0.015980474765377575, "grad_norm": 0.828125, "learning_rate": 0.00013700000000000002, "loss": 7.269, "mean_token_accuracy": 0.07348301075398922, "num_tokens": 529108.0, "step": 275 }, { "entropy": 7.803001642227173, "epoch": 0.016271028852020804, "grad_norm": 0.9609375, "learning_rate": 0.0001395, "loss": 7.3421, "mean_token_accuracy": 0.07442944496870041, "num_tokens": 539409.0, "step": 280 }, { "entropy": 7.8401947021484375, "epoch": 0.016561582938664034, "grad_norm": 0.98828125, "learning_rate": 0.00014199999999999998, "loss": 7.269, "mean_token_accuracy": 0.07476447969675064, "num_tokens": 549790.0, "step": 285 }, { "entropy": 7.773062610626221, "epoch": 0.01685213702530726, "grad_norm": 0.90234375, "learning_rate": 0.0001445, "loss": 7.2597, "mean_token_accuracy": 0.07743276208639145, "num_tokens": 559343.0, "step": 290 }, { "entropy": 7.833370351791382, "epoch": 0.01714269111195049, "grad_norm": 1.1171875, "learning_rate": 0.000147, "loss": 7.306, "mean_token_accuracy": 0.07507650516927242, "num_tokens": 568806.0, "step": 295 }, { "entropy": 7.692620134353637, "epoch": 0.017433245198593718, "grad_norm": 1.0703125, "learning_rate": 0.0001495, "loss": 7.1532, "mean_token_accuracy": 0.07671754881739616, "num_tokens": 578988.0, "step": 300 }, { "entropy": 7.840510559082031, "epoch": 0.017723799285236947, "grad_norm": 1.015625, "learning_rate": 0.000152, "loss": 7.258, "mean_token_accuracy": 0.0767325557768345, "num_tokens": 588588.0, "step": 305 }, { "entropy": 7.740892934799194, "epoch": 0.018014353371880176, "grad_norm": 0.9453125, "learning_rate": 0.00015450000000000001, "loss": 7.2385, "mean_token_accuracy": 0.07767370343208313, "num_tokens": 597957.0, "step": 310 }, { "entropy": 7.761815309524536, "epoch": 0.018304907458523405, "grad_norm": 0.8671875, "learning_rate": 0.000157, "loss": 7.2168, "mean_token_accuracy": 0.07732245922088624, "num_tokens": 607446.0, "step": 315 }, { "entropy": 7.723113679885865, "epoch": 0.01859546154516663, "grad_norm": 0.8359375, "learning_rate": 0.0001595, "loss": 7.1559, "mean_token_accuracy": 0.07753840312361718, "num_tokens": 617064.0, "step": 320 }, { "entropy": 7.695508337020874, "epoch": 0.01888601563180986, "grad_norm": 1.03125, "learning_rate": 0.000162, "loss": 7.2008, "mean_token_accuracy": 0.08057244047522545, "num_tokens": 625927.0, "step": 325 }, { "entropy": 7.717827177047729, "epoch": 0.01917656971845309, "grad_norm": 0.97265625, "learning_rate": 0.00016450000000000001, "loss": 7.1152, "mean_token_accuracy": 0.07994545996189117, "num_tokens": 635341.0, "step": 330 }, { "entropy": 7.675025224685669, "epoch": 0.01946712380509632, "grad_norm": 2.171875, "learning_rate": 0.00016700000000000002, "loss": 7.1106, "mean_token_accuracy": 0.08988085016608238, "num_tokens": 645095.0, "step": 335 }, { "entropy": 7.714554166793823, "epoch": 0.019757677891739548, "grad_norm": 1.0234375, "learning_rate": 0.00016950000000000003, "loss": 7.1558, "mean_token_accuracy": 0.0730321068316698, "num_tokens": 654754.0, "step": 340 }, { "entropy": 7.60111026763916, "epoch": 0.020048231978382777, "grad_norm": 0.8671875, "learning_rate": 0.00017199999999999998, "loss": 7.1266, "mean_token_accuracy": 0.07690966166555882, "num_tokens": 664589.0, "step": 345 }, { "entropy": 7.6628223896026615, "epoch": 0.020338786065026003, "grad_norm": 1.0703125, "learning_rate": 0.00017449999999999999, "loss": 7.1425, "mean_token_accuracy": 0.07918459475040436, "num_tokens": 673870.0, "step": 350 }, { "entropy": 7.577814197540283, "epoch": 0.020629340151669232, "grad_norm": 1.03125, "learning_rate": 0.000177, "loss": 7.1137, "mean_token_accuracy": 0.07997918874025345, "num_tokens": 684309.0, "step": 355 }, { "entropy": 7.6769345760345455, "epoch": 0.02091989423831246, "grad_norm": 1.359375, "learning_rate": 0.0001795, "loss": 7.1629, "mean_token_accuracy": 0.07469077445566655, "num_tokens": 693702.0, "step": 360 }, { "entropy": 7.534895896911621, "epoch": 0.02121044832495569, "grad_norm": 0.94140625, "learning_rate": 0.000182, "loss": 7.0599, "mean_token_accuracy": 0.07970957532525062, "num_tokens": 702951.0, "step": 365 }, { "entropy": 7.588031339645386, "epoch": 0.02150100241159892, "grad_norm": 1.25, "learning_rate": 0.0001845, "loss": 7.0677, "mean_token_accuracy": 0.08218754455447197, "num_tokens": 712481.0, "step": 370 }, { "entropy": 7.600922870635986, "epoch": 0.02179155649824215, "grad_norm": 1.046875, "learning_rate": 0.000187, "loss": 7.0683, "mean_token_accuracy": 0.08380770459771156, "num_tokens": 721579.0, "step": 375 }, { "entropy": 7.572713327407837, "epoch": 0.022082110584885375, "grad_norm": 1.03125, "learning_rate": 0.0001895, "loss": 7.0774, "mean_token_accuracy": 0.07982454895973205, "num_tokens": 731404.0, "step": 380 }, { "entropy": 7.548839807510376, "epoch": 0.022372664671528604, "grad_norm": 0.93359375, "learning_rate": 0.000192, "loss": 7.0556, "mean_token_accuracy": 0.07496214136481286, "num_tokens": 740751.0, "step": 385 }, { "entropy": 7.523876476287842, "epoch": 0.022663218758171833, "grad_norm": 0.9765625, "learning_rate": 0.0001945, "loss": 7.0247, "mean_token_accuracy": 0.08082472011446953, "num_tokens": 751171.0, "step": 390 }, { "entropy": 7.552808237075806, "epoch": 0.022953772844815062, "grad_norm": 1.078125, "learning_rate": 0.00019700000000000002, "loss": 7.0823, "mean_token_accuracy": 0.07615064568817616, "num_tokens": 760874.0, "step": 395 }, { "entropy": 7.583486127853393, "epoch": 0.02324432693145829, "grad_norm": 1.2734375, "learning_rate": 0.00019950000000000002, "loss": 7.0585, "mean_token_accuracy": 0.08326990716159344, "num_tokens": 769652.0, "step": 400 }, { "entropy": 7.488273334503174, "epoch": 0.02353488101810152, "grad_norm": 0.98828125, "learning_rate": 0.000202, "loss": 7.0421, "mean_token_accuracy": 0.07620194889605045, "num_tokens": 779591.0, "step": 405 }, { "entropy": 7.564187002182007, "epoch": 0.023825435104744747, "grad_norm": 0.94921875, "learning_rate": 0.00020449999999999998, "loss": 7.156, "mean_token_accuracy": 0.08098742663860321, "num_tokens": 789582.0, "step": 410 }, { "entropy": 7.506245565414429, "epoch": 0.024115989191387976, "grad_norm": 1.1015625, "learning_rate": 0.000207, "loss": 7.0546, "mean_token_accuracy": 0.07765479311347008, "num_tokens": 799146.0, "step": 415 }, { "entropy": 7.4926127910614015, "epoch": 0.024406543278031205, "grad_norm": 1.0390625, "learning_rate": 0.0002095, "loss": 7.0246, "mean_token_accuracy": 0.0782523088157177, "num_tokens": 808934.0, "step": 420 }, { "entropy": 7.532363748550415, "epoch": 0.024697097364674434, "grad_norm": 0.87109375, "learning_rate": 0.000212, "loss": 7.0821, "mean_token_accuracy": 0.07597277015447616, "num_tokens": 819280.0, "step": 425 }, { "entropy": 7.457432746887207, "epoch": 0.024987651451317663, "grad_norm": 0.97265625, "learning_rate": 0.0002145, "loss": 6.9892, "mean_token_accuracy": 0.0840725652873516, "num_tokens": 828818.0, "step": 430 }, { "entropy": 7.463752698898316, "epoch": 0.025278205537960893, "grad_norm": 1.0703125, "learning_rate": 0.00021700000000000002, "loss": 6.9816, "mean_token_accuracy": 0.08661384396255016, "num_tokens": 839175.0, "step": 435 }, { "entropy": 7.5449175357818605, "epoch": 0.02556875962460412, "grad_norm": 1.125, "learning_rate": 0.0002195, "loss": 7.0777, "mean_token_accuracy": 0.07947314418852329, "num_tokens": 849965.0, "step": 440 }, { "entropy": 7.392349624633789, "epoch": 0.025859313711247348, "grad_norm": 1.0859375, "learning_rate": 0.000222, "loss": 6.9968, "mean_token_accuracy": 0.08229465186595916, "num_tokens": 859229.0, "step": 445 }, { "entropy": 7.4397971630096436, "epoch": 0.026149867797890577, "grad_norm": 1.28125, "learning_rate": 0.0002245, "loss": 6.9708, "mean_token_accuracy": 0.0816520519554615, "num_tokens": 869199.0, "step": 450 }, { "entropy": 7.399962043762207, "epoch": 0.026440421884533806, "grad_norm": 0.93359375, "learning_rate": 0.00022700000000000002, "loss": 6.9666, "mean_token_accuracy": 0.09285714998841285, "num_tokens": 879470.0, "step": 455 }, { "entropy": 7.4366514682769775, "epoch": 0.026730975971177035, "grad_norm": 0.921875, "learning_rate": 0.00022950000000000002, "loss": 6.9274, "mean_token_accuracy": 0.0792453158646822, "num_tokens": 888397.0, "step": 460 }, { "entropy": 7.370485734939575, "epoch": 0.027021530057820264, "grad_norm": 0.953125, "learning_rate": 0.00023200000000000003, "loss": 6.8202, "mean_token_accuracy": 0.0855403620749712, "num_tokens": 898321.0, "step": 465 }, { "entropy": 7.4845947265625, "epoch": 0.027312084144463494, "grad_norm": 1.0546875, "learning_rate": 0.00023449999999999998, "loss": 7.0856, "mean_token_accuracy": 0.0808610200881958, "num_tokens": 907947.0, "step": 470 }, { "entropy": 7.327203702926636, "epoch": 0.02760263823110672, "grad_norm": 1.1015625, "learning_rate": 0.000237, "loss": 6.9103, "mean_token_accuracy": 0.0954340323805809, "num_tokens": 916842.0, "step": 475 }, { "entropy": 7.380954456329346, "epoch": 0.02789319231774995, "grad_norm": 1.046875, "learning_rate": 0.0002395, "loss": 7.0098, "mean_token_accuracy": 0.08165798112750053, "num_tokens": 926431.0, "step": 480 }, { "entropy": 7.412681722640992, "epoch": 0.028183746404393178, "grad_norm": 0.98828125, "learning_rate": 0.000242, "loss": 6.9162, "mean_token_accuracy": 0.08133741281926632, "num_tokens": 935819.0, "step": 485 }, { "entropy": 7.44426212310791, "epoch": 0.028474300491036407, "grad_norm": 1.15625, "learning_rate": 0.0002445, "loss": 6.9418, "mean_token_accuracy": 0.08402741849422454, "num_tokens": 944198.0, "step": 490 }, { "entropy": 7.264917373657227, "epoch": 0.028764854577679636, "grad_norm": 0.88671875, "learning_rate": 0.000247, "loss": 6.9628, "mean_token_accuracy": 0.08352083042263984, "num_tokens": 954972.0, "step": 495 }, { "entropy": 7.385922384262085, "epoch": 0.029055408664322865, "grad_norm": 1.0703125, "learning_rate": 0.0002495, "loss": 6.9018, "mean_token_accuracy": 0.08520250022411346, "num_tokens": 964532.0, "step": 500 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 210168601804800.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }