{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5298481102084069, "eval_steps": 3000, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 4.790674829483033, "epoch": 0.0004415400918403391, "grad_norm": 13.0625, "learning_rate": 2e-06, "loss": 14.4349, "mean_token_accuracy": 0.0, "num_tokens": 9390.0, "step": 5 }, { "entropy": 4.818728256225586, "epoch": 0.0008830801836806782, "grad_norm": 14.1875, "learning_rate": 4.5e-06, "loss": 14.4117, "mean_token_accuracy": 0.0, "num_tokens": 18671.0, "step": 10 }, { "entropy": 4.857943296432495, "epoch": 0.0013246202755210173, "grad_norm": 16.25, "learning_rate": 7e-06, "loss": 14.1693, "mean_token_accuracy": 0.00014005602570250631, "num_tokens": 27614.0, "step": 15 }, { "entropy": 5.027469444274902, "epoch": 0.0017661603673613563, "grad_norm": 25.875, "learning_rate": 9.5e-06, "loss": 13.7713, "mean_token_accuracy": 8.547008619643747e-05, "num_tokens": 37850.0, "step": 20 }, { "entropy": 6.5380439281463625, "epoch": 0.0022077004592016957, "grad_norm": 26.375, "learning_rate": 1.2e-05, "loss": 12.2509, "mean_token_accuracy": 0.0, "num_tokens": 47166.0, "step": 25 }, { "entropy": 9.970003509521485, "epoch": 0.0026492405510420347, "grad_norm": 3.4375, "learning_rate": 1.4500000000000002e-05, "loss": 10.9371, "mean_token_accuracy": 0.00022374301915988325, "num_tokens": 55500.0, "step": 30 }, { "entropy": 10.680026626586914, "epoch": 0.0030907806428823736, "grad_norm": 3.15625, "learning_rate": 1.7000000000000003e-05, "loss": 10.6238, "mean_token_accuracy": 0.009453117521479726, "num_tokens": 63851.0, "step": 35 }, { "entropy": 10.703511428833007, "epoch": 0.0035323207347227126, "grad_norm": 3.21875, "learning_rate": 1.95e-05, "loss": 10.3602, "mean_token_accuracy": 0.029037438705563544, "num_tokens": 73697.0, "step": 40 }, { "entropy": 10.651962184906006, "epoch": 0.003973860826563052, "grad_norm": 2.625, "learning_rate": 2.2e-05, "loss": 10.0115, "mean_token_accuracy": 0.05894971713423729, "num_tokens": 83000.0, "step": 45 }, { "entropy": 10.439279747009277, "epoch": 0.004415400918403391, "grad_norm": 2.03125, "learning_rate": 2.4500000000000003e-05, "loss": 9.8132, "mean_token_accuracy": 0.05815875120460987, "num_tokens": 92982.0, "step": 50 }, { "entropy": 10.318083763122559, "epoch": 0.00485694101024373, "grad_norm": 1.8203125, "learning_rate": 2.7e-05, "loss": 9.6231, "mean_token_accuracy": 0.05530005097389221, "num_tokens": 101455.0, "step": 55 }, { "entropy": 10.381121063232422, "epoch": 0.005298481102084069, "grad_norm": 1.921875, "learning_rate": 2.95e-05, "loss": 9.5438, "mean_token_accuracy": 0.05805549845099449, "num_tokens": 110782.0, "step": 60 }, { "entropy": 10.360444736480712, "epoch": 0.005740021193924408, "grad_norm": 1.71875, "learning_rate": 3.2e-05, "loss": 9.4168, "mean_token_accuracy": 0.060499183088541034, "num_tokens": 119241.0, "step": 65 }, { "entropy": 10.300647163391114, "epoch": 0.006181561285764747, "grad_norm": 1.75, "learning_rate": 3.4500000000000005e-05, "loss": 9.4178, "mean_token_accuracy": 0.055320289358496665, "num_tokens": 127903.0, "step": 70 }, { "entropy": 10.332123184204102, "epoch": 0.006623101377605086, "grad_norm": 1.6328125, "learning_rate": 3.7e-05, "loss": 9.3721, "mean_token_accuracy": 0.05736841931939125, "num_tokens": 137370.0, "step": 75 }, { "entropy": 10.290982055664063, "epoch": 0.007064641469445425, "grad_norm": 1.625, "learning_rate": 3.95e-05, "loss": 9.2214, "mean_token_accuracy": 0.06618293710052967, "num_tokens": 146582.0, "step": 80 }, { "entropy": 10.196907424926758, "epoch": 0.007506181561285765, "grad_norm": 1.5703125, "learning_rate": 4.2000000000000004e-05, "loss": 9.1585, "mean_token_accuracy": 0.05961471572518349, "num_tokens": 154933.0, "step": 85 }, { "entropy": 10.205323791503906, "epoch": 0.007947721653126103, "grad_norm": 1.5078125, "learning_rate": 4.45e-05, "loss": 9.1026, "mean_token_accuracy": 0.072137650847435, "num_tokens": 165157.0, "step": 90 }, { "entropy": 10.10411615371704, "epoch": 0.008389261744966443, "grad_norm": 1.328125, "learning_rate": 4.7000000000000004e-05, "loss": 8.9848, "mean_token_accuracy": 0.0728946004062891, "num_tokens": 174958.0, "step": 95 }, { "entropy": 10.01873140335083, "epoch": 0.008830801836806783, "grad_norm": 1.34375, "learning_rate": 4.9500000000000004e-05, "loss": 8.8889, "mean_token_accuracy": 0.07516518756747245, "num_tokens": 184256.0, "step": 100 }, { "entropy": 9.956882572174072, "epoch": 0.009272341928647121, "grad_norm": 1.546875, "learning_rate": 5.2e-05, "loss": 8.7839, "mean_token_accuracy": 0.067950439453125, "num_tokens": 192894.0, "step": 105 }, { "entropy": 9.884513092041015, "epoch": 0.00971388202048746, "grad_norm": 1.375, "learning_rate": 5.45e-05, "loss": 8.6868, "mean_token_accuracy": 0.07383731976151467, "num_tokens": 202675.0, "step": 110 }, { "entropy": 9.810705184936523, "epoch": 0.010155422112327799, "grad_norm": 1.1953125, "learning_rate": 5.7e-05, "loss": 8.624, "mean_token_accuracy": 0.07006355635821819, "num_tokens": 212261.0, "step": 115 }, { "entropy": 9.743825721740723, "epoch": 0.010596962204168139, "grad_norm": 1.171875, "learning_rate": 5.9499999999999996e-05, "loss": 8.599, "mean_token_accuracy": 0.06874018795788288, "num_tokens": 222329.0, "step": 120 }, { "entropy": 9.528209781646728, "epoch": 0.011038502296008477, "grad_norm": 1.109375, "learning_rate": 6.2e-05, "loss": 8.4477, "mean_token_accuracy": 0.06682575456798076, "num_tokens": 231247.0, "step": 125 }, { "entropy": 9.442446994781495, "epoch": 0.011480042387848817, "grad_norm": 1.15625, "learning_rate": 6.450000000000001e-05, "loss": 8.3412, "mean_token_accuracy": 0.06921537183225154, "num_tokens": 239978.0, "step": 130 }, { "entropy": 9.272939491271973, "epoch": 0.011921582479689156, "grad_norm": 0.90625, "learning_rate": 6.7e-05, "loss": 8.3, "mean_token_accuracy": 0.06850462295114994, "num_tokens": 249735.0, "step": 135 }, { "entropy": 9.222266483306885, "epoch": 0.012363122571529495, "grad_norm": 1.15625, "learning_rate": 6.950000000000001e-05, "loss": 8.2344, "mean_token_accuracy": 0.06959122642874718, "num_tokens": 259369.0, "step": 140 }, { "entropy": 8.956540203094482, "epoch": 0.012804662663369834, "grad_norm": 0.81640625, "learning_rate": 7.2e-05, "loss": 8.2305, "mean_token_accuracy": 0.06539506763219834, "num_tokens": 268645.0, "step": 145 }, { "entropy": 8.88605546951294, "epoch": 0.013246202755210172, "grad_norm": 1.1640625, "learning_rate": 7.45e-05, "loss": 8.0685, "mean_token_accuracy": 0.07155903875827789, "num_tokens": 276667.0, "step": 150 }, { "entropy": 8.623716259002686, "epoch": 0.013687742847050512, "grad_norm": 0.703125, "learning_rate": 7.7e-05, "loss": 8.0682, "mean_token_accuracy": 0.07539896108210087, "num_tokens": 286017.0, "step": 155 }, { "entropy": 8.590844440460206, "epoch": 0.01412928293889085, "grad_norm": 0.83984375, "learning_rate": 7.950000000000001e-05, "loss": 8.0571, "mean_token_accuracy": 0.07278457470238209, "num_tokens": 295631.0, "step": 160 }, { "entropy": 8.562520027160645, "epoch": 0.01457082303073119, "grad_norm": 0.86328125, "learning_rate": 8.2e-05, "loss": 8.0486, "mean_token_accuracy": 0.06986252851784229, "num_tokens": 304704.0, "step": 165 }, { "entropy": 8.490843200683594, "epoch": 0.01501236312257153, "grad_norm": 0.7734375, "learning_rate": 8.450000000000001e-05, "loss": 8.0665, "mean_token_accuracy": 0.07160350978374481, "num_tokens": 314195.0, "step": 170 }, { "entropy": 8.416227722167969, "epoch": 0.015453903214411868, "grad_norm": 1.0078125, "learning_rate": 8.7e-05, "loss": 8.0632, "mean_token_accuracy": 0.07028085552155972, "num_tokens": 323379.0, "step": 175 }, { "entropy": 8.398184299468994, "epoch": 0.015895443306252206, "grad_norm": 0.734375, "learning_rate": 8.95e-05, "loss": 7.9637, "mean_token_accuracy": 0.08185541778802871, "num_tokens": 332322.0, "step": 180 }, { "entropy": 8.336036014556885, "epoch": 0.016336983398092548, "grad_norm": 0.79296875, "learning_rate": 9.2e-05, "loss": 7.9427, "mean_token_accuracy": 0.08073886930942535, "num_tokens": 341735.0, "step": 185 }, { "entropy": 8.337114715576172, "epoch": 0.016778523489932886, "grad_norm": 0.92578125, "learning_rate": 9.45e-05, "loss": 8.0349, "mean_token_accuracy": 0.06938613168895244, "num_tokens": 351209.0, "step": 190 }, { "entropy": 8.39198350906372, "epoch": 0.017220063581773224, "grad_norm": 0.98046875, "learning_rate": 9.7e-05, "loss": 7.9792, "mean_token_accuracy": 0.07559169828891754, "num_tokens": 360467.0, "step": 195 }, { "entropy": 8.235328102111817, "epoch": 0.017661603673613566, "grad_norm": 0.875, "learning_rate": 9.95e-05, "loss": 7.9423, "mean_token_accuracy": 0.07669526152312756, "num_tokens": 370361.0, "step": 200 }, { "entropy": 8.374059581756592, "epoch": 0.018103143765453904, "grad_norm": 1.2109375, "learning_rate": 0.000102, "loss": 8.0075, "mean_token_accuracy": 0.07567069008946418, "num_tokens": 380366.0, "step": 205 }, { "entropy": 8.206629276275635, "epoch": 0.018544683857294242, "grad_norm": 1.0, "learning_rate": 0.00010449999999999999, "loss": 7.9185, "mean_token_accuracy": 0.08159190192818641, "num_tokens": 390690.0, "step": 210 }, { "entropy": 8.24603796005249, "epoch": 0.01898622394913458, "grad_norm": 0.87109375, "learning_rate": 0.000107, "loss": 7.9601, "mean_token_accuracy": 0.0793293446302414, "num_tokens": 400722.0, "step": 215 }, { "entropy": 8.157498931884765, "epoch": 0.01942776404097492, "grad_norm": 0.84375, "learning_rate": 0.0001095, "loss": 7.8501, "mean_token_accuracy": 0.08334142193198205, "num_tokens": 410223.0, "step": 220 }, { "entropy": 8.205572509765625, "epoch": 0.01986930413281526, "grad_norm": 1.0, "learning_rate": 0.000112, "loss": 7.9021, "mean_token_accuracy": 0.07716193869709968, "num_tokens": 420214.0, "step": 225 }, { "entropy": 8.172825717926026, "epoch": 0.020310844224655598, "grad_norm": 0.88671875, "learning_rate": 0.0001145, "loss": 7.8564, "mean_token_accuracy": 0.08035471551120281, "num_tokens": 429407.0, "step": 230 }, { "entropy": 8.162760925292968, "epoch": 0.02075238431649594, "grad_norm": 1.4609375, "learning_rate": 0.00011700000000000001, "loss": 7.8246, "mean_token_accuracy": 0.07542734369635581, "num_tokens": 438403.0, "step": 235 }, { "entropy": 8.177341651916503, "epoch": 0.021193924408336277, "grad_norm": 1.2109375, "learning_rate": 0.00011949999999999999, "loss": 7.8545, "mean_token_accuracy": 0.08535856604576111, "num_tokens": 447466.0, "step": 240 }, { "entropy": 8.069422197341918, "epoch": 0.021635464500176615, "grad_norm": 0.921875, "learning_rate": 0.000122, "loss": 7.9366, "mean_token_accuracy": 0.07459555268287658, "num_tokens": 457141.0, "step": 245 }, { "entropy": 8.160084056854249, "epoch": 0.022077004592016954, "grad_norm": 0.86328125, "learning_rate": 0.0001245, "loss": 7.812, "mean_token_accuracy": 0.08223466873168946, "num_tokens": 465708.0, "step": 250 }, { "entropy": 8.142998504638673, "epoch": 0.022518544683857295, "grad_norm": 1.0859375, "learning_rate": 0.000127, "loss": 7.8339, "mean_token_accuracy": 0.07565066292881965, "num_tokens": 475369.0, "step": 255 }, { "entropy": 8.075135421752929, "epoch": 0.022960084775697633, "grad_norm": 1.0859375, "learning_rate": 0.0001295, "loss": 7.7972, "mean_token_accuracy": 0.08645984381437302, "num_tokens": 484249.0, "step": 260 }, { "entropy": 8.122587871551513, "epoch": 0.02340162486753797, "grad_norm": 1.109375, "learning_rate": 0.000132, "loss": 7.8872, "mean_token_accuracy": 0.07687325775623322, "num_tokens": 493303.0, "step": 265 }, { "entropy": 8.101485538482667, "epoch": 0.023843164959378313, "grad_norm": 1.0859375, "learning_rate": 0.00013450000000000002, "loss": 7.8664, "mean_token_accuracy": 0.0807331919670105, "num_tokens": 501503.0, "step": 270 }, { "entropy": 8.036290693283082, "epoch": 0.02428470505121865, "grad_norm": 1.2109375, "learning_rate": 0.00013700000000000002, "loss": 7.8074, "mean_token_accuracy": 0.08591768592596054, "num_tokens": 509661.0, "step": 275 }, { "entropy": 8.045488977432251, "epoch": 0.02472624514305899, "grad_norm": 1.1875, "learning_rate": 0.0001395, "loss": 7.7904, "mean_token_accuracy": 0.08441019728779793, "num_tokens": 519464.0, "step": 280 }, { "entropy": 8.107398653030396, "epoch": 0.025167785234899327, "grad_norm": 1.0625, "learning_rate": 0.00014199999999999998, "loss": 7.7489, "mean_token_accuracy": 0.08773190379142762, "num_tokens": 527968.0, "step": 285 }, { "entropy": 8.081705808639526, "epoch": 0.02560932532673967, "grad_norm": 1.1875, "learning_rate": 0.0001445, "loss": 7.7768, "mean_token_accuracy": 0.0868467777967453, "num_tokens": 537234.0, "step": 290 }, { "entropy": 7.99565052986145, "epoch": 0.026050865418580007, "grad_norm": 1.0078125, "learning_rate": 0.000147, "loss": 7.7747, "mean_token_accuracy": 0.08527034223079681, "num_tokens": 546398.0, "step": 295 }, { "entropy": 8.011523675918578, "epoch": 0.026492405510420345, "grad_norm": 1.015625, "learning_rate": 0.0001495, "loss": 7.7616, "mean_token_accuracy": 0.08982880860567093, "num_tokens": 555362.0, "step": 300 }, { "entropy": 8.107937812805176, "epoch": 0.026933945602260687, "grad_norm": 1.0, "learning_rate": 0.000152, "loss": 7.8221, "mean_token_accuracy": 0.07775180079042912, "num_tokens": 564575.0, "step": 305 }, { "entropy": 8.133016395568848, "epoch": 0.027375485694101025, "grad_norm": 1.0390625, "learning_rate": 0.00015450000000000001, "loss": 7.8384, "mean_token_accuracy": 0.08304800540208816, "num_tokens": 573915.0, "step": 310 }, { "entropy": 8.016209363937378, "epoch": 0.027817025785941363, "grad_norm": 1.0390625, "learning_rate": 0.000157, "loss": 7.7322, "mean_token_accuracy": 0.08581754639744758, "num_tokens": 583216.0, "step": 315 }, { "entropy": 7.982406425476074, "epoch": 0.0282585658777817, "grad_norm": 1.09375, "learning_rate": 0.0001595, "loss": 7.7553, "mean_token_accuracy": 0.08679840788245201, "num_tokens": 591955.0, "step": 320 }, { "entropy": 7.9430736064910885, "epoch": 0.028700105969622042, "grad_norm": 1.2265625, "learning_rate": 0.000162, "loss": 7.7588, "mean_token_accuracy": 0.08934888392686843, "num_tokens": 600999.0, "step": 325 }, { "entropy": 8.070584297180176, "epoch": 0.02914164606146238, "grad_norm": 1.2109375, "learning_rate": 0.00016450000000000001, "loss": 7.6563, "mean_token_accuracy": 0.09217674285173416, "num_tokens": 609478.0, "step": 330 }, { "entropy": 7.987708568572998, "epoch": 0.02958318615330272, "grad_norm": 1.3125, "learning_rate": 0.00016700000000000002, "loss": 7.7225, "mean_token_accuracy": 0.08663035854697228, "num_tokens": 618348.0, "step": 335 }, { "entropy": 7.911137056350708, "epoch": 0.03002472624514306, "grad_norm": 1.1484375, "learning_rate": 0.00016950000000000003, "loss": 7.7691, "mean_token_accuracy": 0.083287762850523, "num_tokens": 628548.0, "step": 340 }, { "entropy": 8.057271575927734, "epoch": 0.0304662663369834, "grad_norm": 1.0234375, "learning_rate": 0.00017199999999999998, "loss": 7.7102, "mean_token_accuracy": 0.08196588605642319, "num_tokens": 637489.0, "step": 345 }, { "entropy": 7.939978122711182, "epoch": 0.030907806428823736, "grad_norm": 0.953125, "learning_rate": 0.00017449999999999999, "loss": 7.689, "mean_token_accuracy": 0.0814521424472332, "num_tokens": 646715.0, "step": 350 }, { "entropy": 7.897878551483155, "epoch": 0.031349346520664075, "grad_norm": 0.98046875, "learning_rate": 0.000177, "loss": 7.6894, "mean_token_accuracy": 0.08998864293098449, "num_tokens": 656858.0, "step": 355 }, { "entropy": 8.019395637512208, "epoch": 0.03179088661250441, "grad_norm": 0.98828125, "learning_rate": 0.0001795, "loss": 7.6755, "mean_token_accuracy": 0.08710955381393433, "num_tokens": 665968.0, "step": 360 }, { "entropy": 8.001319217681885, "epoch": 0.03223242670434476, "grad_norm": 1.6328125, "learning_rate": 0.000182, "loss": 7.6655, "mean_token_accuracy": 0.08621685430407525, "num_tokens": 674295.0, "step": 365 }, { "entropy": 7.810992002487183, "epoch": 0.032673966796185096, "grad_norm": 1.234375, "learning_rate": 0.0001845, "loss": 7.591, "mean_token_accuracy": 0.08370614722371102, "num_tokens": 683559.0, "step": 370 }, { "entropy": 7.816927337646485, "epoch": 0.033115506888025434, "grad_norm": 1.21875, "learning_rate": 0.000187, "loss": 7.6253, "mean_token_accuracy": 0.08996079638600349, "num_tokens": 692402.0, "step": 375 }, { "entropy": 7.967683601379394, "epoch": 0.03355704697986577, "grad_norm": 1.2890625, "learning_rate": 0.0001895, "loss": 7.7304, "mean_token_accuracy": 0.08065761215984821, "num_tokens": 702052.0, "step": 380 }, { "entropy": 8.058749055862426, "epoch": 0.03399858707170611, "grad_norm": 1.6328125, "learning_rate": 0.000192, "loss": 7.6528, "mean_token_accuracy": 0.08705045655369759, "num_tokens": 711926.0, "step": 385 }, { "entropy": 7.8771873950958256, "epoch": 0.03444012716354645, "grad_norm": 0.96875, "learning_rate": 0.0001945, "loss": 7.612, "mean_token_accuracy": 0.08773906156420708, "num_tokens": 720948.0, "step": 390 }, { "entropy": 7.893786334991455, "epoch": 0.034881667255386786, "grad_norm": 1.5, "learning_rate": 0.00019700000000000002, "loss": 7.6301, "mean_token_accuracy": 0.09444142654538154, "num_tokens": 729611.0, "step": 395 }, { "entropy": 7.892533588409424, "epoch": 0.03532320734722713, "grad_norm": 1.15625, "learning_rate": 0.00019950000000000002, "loss": 7.6187, "mean_token_accuracy": 0.08193654045462609, "num_tokens": 738433.0, "step": 400 }, { "entropy": 7.945340347290039, "epoch": 0.03576474743906747, "grad_norm": 1.0859375, "learning_rate": 0.000202, "loss": 7.6397, "mean_token_accuracy": 0.08668759167194366, "num_tokens": 747310.0, "step": 405 }, { "entropy": 7.854477500915527, "epoch": 0.03620628753090781, "grad_norm": 1.1328125, "learning_rate": 0.00020449999999999998, "loss": 7.5994, "mean_token_accuracy": 0.09020926207304, "num_tokens": 756362.0, "step": 410 }, { "entropy": 7.90778489112854, "epoch": 0.036647827622748146, "grad_norm": 0.984375, "learning_rate": 0.000207, "loss": 7.6034, "mean_token_accuracy": 0.08586042672395706, "num_tokens": 764978.0, "step": 415 }, { "entropy": 7.87300386428833, "epoch": 0.037089367714588484, "grad_norm": 1.03125, "learning_rate": 0.0002095, "loss": 7.5707, "mean_token_accuracy": 0.09018185958266259, "num_tokens": 774058.0, "step": 420 }, { "entropy": 7.795767593383789, "epoch": 0.03753090780642882, "grad_norm": 1.0390625, "learning_rate": 0.000212, "loss": 7.5492, "mean_token_accuracy": 0.0897379383444786, "num_tokens": 783332.0, "step": 425 }, { "entropy": 7.853004789352417, "epoch": 0.03797244789826916, "grad_norm": 1.3203125, "learning_rate": 0.0002145, "loss": 7.6298, "mean_token_accuracy": 0.08684360906481743, "num_tokens": 792481.0, "step": 430 }, { "entropy": 7.766995525360107, "epoch": 0.038413987990109505, "grad_norm": 1.1015625, "learning_rate": 0.00021700000000000002, "loss": 7.5212, "mean_token_accuracy": 0.09301207512617111, "num_tokens": 801396.0, "step": 435 }, { "entropy": 7.8428326606750485, "epoch": 0.03885552808194984, "grad_norm": 1.0078125, "learning_rate": 0.0002195, "loss": 7.5865, "mean_token_accuracy": 0.08940735682845116, "num_tokens": 809903.0, "step": 440 }, { "entropy": 7.828377294540405, "epoch": 0.03929706817379018, "grad_norm": 0.99609375, "learning_rate": 0.000222, "loss": 7.5389, "mean_token_accuracy": 0.0962544821202755, "num_tokens": 819144.0, "step": 445 }, { "entropy": 7.7183678150177, "epoch": 0.03973860826563052, "grad_norm": 1.1328125, "learning_rate": 0.0002245, "loss": 7.5608, "mean_token_accuracy": 0.08849129751324654, "num_tokens": 828881.0, "step": 450 }, { "entropy": 7.764478397369385, "epoch": 0.04018014835747086, "grad_norm": 1.109375, "learning_rate": 0.00022700000000000002, "loss": 7.4877, "mean_token_accuracy": 0.08765893578529357, "num_tokens": 837588.0, "step": 455 }, { "entropy": 7.767373847961426, "epoch": 0.040621688449311195, "grad_norm": 1.0, "learning_rate": 0.00022950000000000002, "loss": 7.4849, "mean_token_accuracy": 0.09265839084982871, "num_tokens": 847002.0, "step": 460 }, { "entropy": 7.726333475112915, "epoch": 0.041063228541151534, "grad_norm": 1.1015625, "learning_rate": 0.00023200000000000003, "loss": 7.4945, "mean_token_accuracy": 0.09333177357912063, "num_tokens": 855791.0, "step": 465 }, { "entropy": 7.7462080955505375, "epoch": 0.04150476863299188, "grad_norm": 1.1484375, "learning_rate": 0.00023449999999999998, "loss": 7.5242, "mean_token_accuracy": 0.0911882683634758, "num_tokens": 865392.0, "step": 470 }, { "entropy": 7.736569499969482, "epoch": 0.04194630872483222, "grad_norm": 1.2109375, "learning_rate": 0.000237, "loss": 7.5873, "mean_token_accuracy": 0.08873779252171517, "num_tokens": 874807.0, "step": 475 }, { "entropy": 7.758917284011841, "epoch": 0.042387848816672555, "grad_norm": 1.1875, "learning_rate": 0.0002395, "loss": 7.5409, "mean_token_accuracy": 0.09495326653122901, "num_tokens": 883928.0, "step": 480 }, { "entropy": 7.777913904190063, "epoch": 0.04282938890851289, "grad_norm": 1.140625, "learning_rate": 0.000242, "loss": 7.4436, "mean_token_accuracy": 0.09124857932329178, "num_tokens": 893047.0, "step": 485 }, { "entropy": 7.662859010696411, "epoch": 0.04327092900035323, "grad_norm": 1.2421875, "learning_rate": 0.0002445, "loss": 7.4593, "mean_token_accuracy": 0.09315531030297279, "num_tokens": 901645.0, "step": 490 }, { "entropy": 7.743328475952149, "epoch": 0.04371246909219357, "grad_norm": 1.109375, "learning_rate": 0.000247, "loss": 7.4727, "mean_token_accuracy": 0.09244368895888329, "num_tokens": 911169.0, "step": 495 }, { "entropy": 7.7239625453948975, "epoch": 0.04415400918403391, "grad_norm": 1.4140625, "learning_rate": 0.0002495, "loss": 7.4748, "mean_token_accuracy": 0.08498905003070831, "num_tokens": 921382.0, "step": 500 }, { "entropy": 7.544922304153443, "epoch": 0.04459554927587425, "grad_norm": 1.234375, "learning_rate": 0.000252, "loss": 7.4121, "mean_token_accuracy": 0.09429771155118942, "num_tokens": 930409.0, "step": 505 }, { "entropy": 7.67856912612915, "epoch": 0.04503708936771459, "grad_norm": 1.078125, "learning_rate": 0.0002545, "loss": 7.3879, "mean_token_accuracy": 0.09879431128501892, "num_tokens": 939049.0, "step": 510 }, { "entropy": 7.718625736236572, "epoch": 0.04547862945955493, "grad_norm": 1.25, "learning_rate": 0.000257, "loss": 7.395, "mean_token_accuracy": 0.0960740551352501, "num_tokens": 947575.0, "step": 515 }, { "entropy": 7.709804058074951, "epoch": 0.045920169551395267, "grad_norm": 1.09375, "learning_rate": 0.0002595, "loss": 7.5643, "mean_token_accuracy": 0.09047991409897804, "num_tokens": 957848.0, "step": 520 }, { "entropy": 7.655015087127685, "epoch": 0.046361709643235605, "grad_norm": 1.1484375, "learning_rate": 0.000262, "loss": 7.3857, "mean_token_accuracy": 0.09998803585767746, "num_tokens": 966521.0, "step": 525 }, { "entropy": 7.688518905639649, "epoch": 0.04680324973507594, "grad_norm": 1.3359375, "learning_rate": 0.00026450000000000003, "loss": 7.4461, "mean_token_accuracy": 0.09324755370616913, "num_tokens": 975827.0, "step": 530 }, { "entropy": 7.606715154647827, "epoch": 0.04724478982691628, "grad_norm": 1.1796875, "learning_rate": 0.00026700000000000004, "loss": 7.4075, "mean_token_accuracy": 0.09566703587770461, "num_tokens": 985292.0, "step": 535 }, { "entropy": 7.616068124771118, "epoch": 0.047686329918756626, "grad_norm": 1.09375, "learning_rate": 0.00026950000000000005, "loss": 7.3841, "mean_token_accuracy": 0.09411159604787826, "num_tokens": 994791.0, "step": 540 }, { "entropy": 7.51567816734314, "epoch": 0.048127870010596964, "grad_norm": 1.1015625, "learning_rate": 0.00027200000000000005, "loss": 7.3594, "mean_token_accuracy": 0.1026044063270092, "num_tokens": 1003700.0, "step": 545 }, { "entropy": 7.510391616821289, "epoch": 0.0485694101024373, "grad_norm": 1.0, "learning_rate": 0.0002745, "loss": 7.381, "mean_token_accuracy": 0.09829011410474778, "num_tokens": 1012682.0, "step": 550 }, { "entropy": 7.683912038803101, "epoch": 0.04901095019427764, "grad_norm": 1.28125, "learning_rate": 0.000277, "loss": 7.4075, "mean_token_accuracy": 0.09179475829005242, "num_tokens": 1021018.0, "step": 555 }, { "entropy": 7.570155191421509, "epoch": 0.04945249028611798, "grad_norm": 1.3046875, "learning_rate": 0.0002795, "loss": 7.2759, "mean_token_accuracy": 0.09699172824621201, "num_tokens": 1029744.0, "step": 560 }, { "entropy": 7.495694351196289, "epoch": 0.049894030377958316, "grad_norm": 1.1484375, "learning_rate": 0.00028199999999999997, "loss": 7.3605, "mean_token_accuracy": 0.09879247918725013, "num_tokens": 1038805.0, "step": 565 }, { "entropy": 7.5144976615905765, "epoch": 0.050335570469798654, "grad_norm": 1.328125, "learning_rate": 0.0002845, "loss": 7.4131, "mean_token_accuracy": 0.0988279327750206, "num_tokens": 1047656.0, "step": 570 }, { "entropy": 7.647522783279419, "epoch": 0.050777110561639, "grad_norm": 1.140625, "learning_rate": 0.000287, "loss": 7.4048, "mean_token_accuracy": 0.09629088416695594, "num_tokens": 1056598.0, "step": 575 }, { "entropy": 7.6095935821533205, "epoch": 0.05121865065347934, "grad_norm": 1.1015625, "learning_rate": 0.0002895, "loss": 7.3994, "mean_token_accuracy": 0.09847217947244644, "num_tokens": 1065226.0, "step": 580 }, { "entropy": 7.529495334625244, "epoch": 0.051660190745319676, "grad_norm": 1.4375, "learning_rate": 0.000292, "loss": 7.3208, "mean_token_accuracy": 0.10098938867449761, "num_tokens": 1074661.0, "step": 585 }, { "entropy": 7.503559398651123, "epoch": 0.052101730837160014, "grad_norm": 1.1484375, "learning_rate": 0.0002945, "loss": 7.4251, "mean_token_accuracy": 0.09441772177815437, "num_tokens": 1083921.0, "step": 590 }, { "entropy": 7.540312194824219, "epoch": 0.05254327092900035, "grad_norm": 1.1328125, "learning_rate": 0.000297, "loss": 7.3162, "mean_token_accuracy": 0.104298634827137, "num_tokens": 1093399.0, "step": 595 }, { "entropy": 7.5133528232574465, "epoch": 0.05298481102084069, "grad_norm": 1.2734375, "learning_rate": 0.0002995, "loss": 7.391, "mean_token_accuracy": 0.09825902208685874, "num_tokens": 1104065.0, "step": 600 }, { "entropy": 7.433008003234863, "epoch": 0.05342635111268103, "grad_norm": 1.125, "learning_rate": 0.000302, "loss": 7.2778, "mean_token_accuracy": 0.1005440428853035, "num_tokens": 1112995.0, "step": 605 }, { "entropy": 7.47243971824646, "epoch": 0.05386789120452137, "grad_norm": 1.0859375, "learning_rate": 0.0003045, "loss": 7.3103, "mean_token_accuracy": 0.10175202563405036, "num_tokens": 1121637.0, "step": 610 }, { "entropy": 7.455365610122681, "epoch": 0.05430943129636171, "grad_norm": 1.140625, "learning_rate": 0.000307, "loss": 7.2549, "mean_token_accuracy": 0.09826337993144989, "num_tokens": 1131166.0, "step": 615 }, { "entropy": 7.4712036609649655, "epoch": 0.05475097138820205, "grad_norm": 1.0859375, "learning_rate": 0.0003095, "loss": 7.2562, "mean_token_accuracy": 0.10475531965494156, "num_tokens": 1140888.0, "step": 620 }, { "entropy": 7.551609897613526, "epoch": 0.05519251148004239, "grad_norm": 1.1171875, "learning_rate": 0.000312, "loss": 7.4148, "mean_token_accuracy": 0.0961816966533661, "num_tokens": 1150278.0, "step": 625 }, { "entropy": 7.433546924591065, "epoch": 0.055634051571882726, "grad_norm": 1.4375, "learning_rate": 0.0003145, "loss": 7.3742, "mean_token_accuracy": 0.0970606379210949, "num_tokens": 1159348.0, "step": 630 }, { "entropy": 7.624134588241577, "epoch": 0.056075591663723064, "grad_norm": 1.046875, "learning_rate": 0.000317, "loss": 7.3756, "mean_token_accuracy": 0.0949991799890995, "num_tokens": 1168883.0, "step": 635 }, { "entropy": 7.48681526184082, "epoch": 0.0565171317555634, "grad_norm": 1.0, "learning_rate": 0.0003195, "loss": 7.269, "mean_token_accuracy": 0.1064944364130497, "num_tokens": 1178572.0, "step": 640 }, { "entropy": 7.44178466796875, "epoch": 0.05695867184740375, "grad_norm": 1.046875, "learning_rate": 0.000322, "loss": 7.3576, "mean_token_accuracy": 0.0987204596400261, "num_tokens": 1188909.0, "step": 645 }, { "entropy": 7.466546869277954, "epoch": 0.057400211939244085, "grad_norm": 1.28125, "learning_rate": 0.00032450000000000003, "loss": 7.2649, "mean_token_accuracy": 0.09890259429812431, "num_tokens": 1197705.0, "step": 650 }, { "entropy": 7.450878572463989, "epoch": 0.05784175203108442, "grad_norm": 1.1171875, "learning_rate": 0.00032700000000000003, "loss": 7.1061, "mean_token_accuracy": 0.10522415414452553, "num_tokens": 1206351.0, "step": 655 }, { "entropy": 7.340301847457885, "epoch": 0.05828329212292476, "grad_norm": 1.3359375, "learning_rate": 0.00032950000000000004, "loss": 7.2237, "mean_token_accuracy": 0.09693196043372154, "num_tokens": 1214984.0, "step": 660 }, { "entropy": 7.4402018070220945, "epoch": 0.0587248322147651, "grad_norm": 1.1796875, "learning_rate": 0.00033200000000000005, "loss": 7.2999, "mean_token_accuracy": 0.09738482013344765, "num_tokens": 1224485.0, "step": 665 }, { "entropy": 7.435847473144531, "epoch": 0.05916637230660544, "grad_norm": 1.1796875, "learning_rate": 0.00033450000000000005, "loss": 7.3266, "mean_token_accuracy": 0.09173622950911522, "num_tokens": 1233560.0, "step": 670 }, { "entropy": 7.428315305709839, "epoch": 0.05960791239844578, "grad_norm": 1.3515625, "learning_rate": 0.000337, "loss": 7.2436, "mean_token_accuracy": 0.09967414885759354, "num_tokens": 1242628.0, "step": 675 }, { "entropy": 7.388672494888306, "epoch": 0.06004945249028612, "grad_norm": 1.0703125, "learning_rate": 0.0003395, "loss": 7.1697, "mean_token_accuracy": 0.10538085550069809, "num_tokens": 1251004.0, "step": 680 }, { "entropy": 7.459445238113403, "epoch": 0.06049099258212646, "grad_norm": 1.0703125, "learning_rate": 0.000342, "loss": 7.3463, "mean_token_accuracy": 0.09609238728880883, "num_tokens": 1260344.0, "step": 685 }, { "entropy": 7.343485164642334, "epoch": 0.0609325326739668, "grad_norm": 1.0625, "learning_rate": 0.00034449999999999997, "loss": 7.2517, "mean_token_accuracy": 0.09760257676243782, "num_tokens": 1269988.0, "step": 690 }, { "entropy": 7.340139007568359, "epoch": 0.061374072765807135, "grad_norm": 1.0859375, "learning_rate": 0.000347, "loss": 7.2126, "mean_token_accuracy": 0.10715288370847702, "num_tokens": 1280912.0, "step": 695 }, { "entropy": 7.350299119949341, "epoch": 0.06181561285764747, "grad_norm": 0.98828125, "learning_rate": 0.0003495, "loss": 7.2246, "mean_token_accuracy": 0.10604915320873261, "num_tokens": 1289684.0, "step": 700 }, { "entropy": 7.427703905105591, "epoch": 0.06225715294948781, "grad_norm": 1.0390625, "learning_rate": 0.000352, "loss": 7.2534, "mean_token_accuracy": 0.09802542477846146, "num_tokens": 1298853.0, "step": 705 }, { "entropy": 7.3198949813842775, "epoch": 0.06269869304132815, "grad_norm": 1.0390625, "learning_rate": 0.0003545, "loss": 7.214, "mean_token_accuracy": 0.10874532908201218, "num_tokens": 1309112.0, "step": 710 }, { "entropy": 7.372763156890869, "epoch": 0.0631402331331685, "grad_norm": 1.296875, "learning_rate": 0.000357, "loss": 7.1981, "mean_token_accuracy": 0.10583075731992722, "num_tokens": 1319064.0, "step": 715 }, { "entropy": 7.269387340545654, "epoch": 0.06358177322500883, "grad_norm": 1.1015625, "learning_rate": 0.0003595, "loss": 7.1748, "mean_token_accuracy": 0.1100200168788433, "num_tokens": 1327889.0, "step": 720 }, { "entropy": 7.324726533889771, "epoch": 0.06402331331684917, "grad_norm": 1.28125, "learning_rate": 0.000362, "loss": 7.1938, "mean_token_accuracy": 0.10258080512285232, "num_tokens": 1337241.0, "step": 725 }, { "entropy": 7.319574499130249, "epoch": 0.06446485340868952, "grad_norm": 1.1015625, "learning_rate": 0.0003645, "loss": 7.2533, "mean_token_accuracy": 0.10085726305842399, "num_tokens": 1346527.0, "step": 730 }, { "entropy": 7.314885807037354, "epoch": 0.06490639350052985, "grad_norm": 1.125, "learning_rate": 0.000367, "loss": 7.2315, "mean_token_accuracy": 0.10445504561066628, "num_tokens": 1355677.0, "step": 735 }, { "entropy": 7.396700429916382, "epoch": 0.06534793359237019, "grad_norm": 1.03125, "learning_rate": 0.0003695, "loss": 7.2163, "mean_token_accuracy": 0.10588330775499344, "num_tokens": 1364874.0, "step": 740 }, { "entropy": 7.285468482971192, "epoch": 0.06578947368421052, "grad_norm": 1.1015625, "learning_rate": 0.000372, "loss": 7.1378, "mean_token_accuracy": 0.1090671844780445, "num_tokens": 1373717.0, "step": 745 }, { "entropy": 7.375531625747681, "epoch": 0.06623101377605087, "grad_norm": 0.984375, "learning_rate": 0.0003745, "loss": 7.0955, "mean_token_accuracy": 0.10741576477885247, "num_tokens": 1382767.0, "step": 750 }, { "entropy": 7.1357104778289795, "epoch": 0.0666725538678912, "grad_norm": 1.3515625, "learning_rate": 0.000377, "loss": 7.1389, "mean_token_accuracy": 0.10613262876868249, "num_tokens": 1391190.0, "step": 755 }, { "entropy": 7.234480524063111, "epoch": 0.06711409395973154, "grad_norm": 1.1796875, "learning_rate": 0.0003795, "loss": 7.1509, "mean_token_accuracy": 0.10508784130215645, "num_tokens": 1400722.0, "step": 760 }, { "entropy": 7.402392435073852, "epoch": 0.06755563405157189, "grad_norm": 1.03125, "learning_rate": 0.000382, "loss": 7.1779, "mean_token_accuracy": 0.10437385067343712, "num_tokens": 1409328.0, "step": 765 }, { "entropy": 7.06873927116394, "epoch": 0.06799717414341222, "grad_norm": 1.2109375, "learning_rate": 0.0003845, "loss": 7.0531, "mean_token_accuracy": 0.11192933171987533, "num_tokens": 1418504.0, "step": 770 }, { "entropy": 7.440014839172363, "epoch": 0.06843871423525257, "grad_norm": 1.0703125, "learning_rate": 0.00038700000000000003, "loss": 7.1989, "mean_token_accuracy": 0.10317453742027283, "num_tokens": 1427690.0, "step": 775 }, { "entropy": 7.181108903884888, "epoch": 0.0688802543270929, "grad_norm": 1.125, "learning_rate": 0.00038950000000000003, "loss": 7.1806, "mean_token_accuracy": 0.10798285007476807, "num_tokens": 1436798.0, "step": 780 }, { "entropy": 7.2046185493469235, "epoch": 0.06932179441893324, "grad_norm": 0.96875, "learning_rate": 0.00039200000000000004, "loss": 7.1646, "mean_token_accuracy": 0.10358999595046044, "num_tokens": 1446357.0, "step": 785 }, { "entropy": 7.2555629253387455, "epoch": 0.06976333451077357, "grad_norm": 1.0703125, "learning_rate": 0.00039450000000000005, "loss": 7.0882, "mean_token_accuracy": 0.11000654250383377, "num_tokens": 1455998.0, "step": 790 }, { "entropy": 7.207996559143067, "epoch": 0.07020487460261392, "grad_norm": 0.984375, "learning_rate": 0.00039700000000000005, "loss": 7.145, "mean_token_accuracy": 0.09857687279582024, "num_tokens": 1465237.0, "step": 795 }, { "entropy": 7.24621729850769, "epoch": 0.07064641469445426, "grad_norm": 1.1015625, "learning_rate": 0.0003995, "loss": 7.0958, "mean_token_accuracy": 0.11087250858545303, "num_tokens": 1474363.0, "step": 800 }, { "entropy": 7.272359037399292, "epoch": 0.0710879547862946, "grad_norm": 1.3359375, "learning_rate": 0.000402, "loss": 7.1713, "mean_token_accuracy": 0.10843008160591125, "num_tokens": 1483379.0, "step": 805 }, { "entropy": 7.28739447593689, "epoch": 0.07152949487813494, "grad_norm": 1.1640625, "learning_rate": 0.0004045, "loss": 7.1265, "mean_token_accuracy": 0.10922098532319069, "num_tokens": 1492507.0, "step": 810 }, { "entropy": 7.144436979293824, "epoch": 0.07197103496997527, "grad_norm": 1.2734375, "learning_rate": 0.00040699999999999997, "loss": 7.0154, "mean_token_accuracy": 0.11775125116109848, "num_tokens": 1500888.0, "step": 815 }, { "entropy": 7.11500997543335, "epoch": 0.07241257506181561, "grad_norm": 0.9921875, "learning_rate": 0.0004095, "loss": 7.0709, "mean_token_accuracy": 0.10802061259746551, "num_tokens": 1510310.0, "step": 820 }, { "entropy": 7.1448290824890135, "epoch": 0.07285411515365595, "grad_norm": 1.1796875, "learning_rate": 0.000412, "loss": 7.0494, "mean_token_accuracy": 0.11422519460320472, "num_tokens": 1519427.0, "step": 825 }, { "entropy": 7.2035074710845945, "epoch": 0.07329565524549629, "grad_norm": 1.1328125, "learning_rate": 0.0004145, "loss": 7.0679, "mean_token_accuracy": 0.1063395880162716, "num_tokens": 1529456.0, "step": 830 }, { "entropy": 7.131991720199585, "epoch": 0.07373719533733664, "grad_norm": 1.078125, "learning_rate": 0.000417, "loss": 7.0241, "mean_token_accuracy": 0.11403456106781959, "num_tokens": 1537695.0, "step": 835 }, { "entropy": 7.203299617767334, "epoch": 0.07417873542917697, "grad_norm": 1.046875, "learning_rate": 0.0004195, "loss": 7.1456, "mean_token_accuracy": 0.10954299196600914, "num_tokens": 1547511.0, "step": 840 }, { "entropy": 7.255322885513306, "epoch": 0.07462027552101731, "grad_norm": 1.078125, "learning_rate": 0.000422, "loss": 7.1315, "mean_token_accuracy": 0.1110302060842514, "num_tokens": 1557035.0, "step": 845 }, { "entropy": 7.1888104438781735, "epoch": 0.07506181561285764, "grad_norm": 0.90234375, "learning_rate": 0.0004245, "loss": 7.0906, "mean_token_accuracy": 0.11411306262016296, "num_tokens": 1566773.0, "step": 850 }, { "entropy": 7.094766998291016, "epoch": 0.07550335570469799, "grad_norm": 1.2421875, "learning_rate": 0.000427, "loss": 7.0787, "mean_token_accuracy": 0.10884842053055763, "num_tokens": 1576873.0, "step": 855 }, { "entropy": 7.178222894668579, "epoch": 0.07594489579653832, "grad_norm": 1.03125, "learning_rate": 0.0004295, "loss": 7.111, "mean_token_accuracy": 0.10762306824326515, "num_tokens": 1586170.0, "step": 860 }, { "entropy": 7.286298131942749, "epoch": 0.07638643588837866, "grad_norm": 1.171875, "learning_rate": 0.000432, "loss": 7.154, "mean_token_accuracy": 0.10613771453499794, "num_tokens": 1596054.0, "step": 865 }, { "entropy": 7.1001307487487795, "epoch": 0.07682797598021901, "grad_norm": 1.0546875, "learning_rate": 0.0004345, "loss": 7.0262, "mean_token_accuracy": 0.11607334911823272, "num_tokens": 1604544.0, "step": 870 }, { "entropy": 7.172781848907471, "epoch": 0.07726951607205934, "grad_norm": 1.1171875, "learning_rate": 0.000437, "loss": 7.0446, "mean_token_accuracy": 0.11472792029380799, "num_tokens": 1614580.0, "step": 875 }, { "entropy": 7.132223224639892, "epoch": 0.07771105616389969, "grad_norm": 1.046875, "learning_rate": 0.0004395, "loss": 7.1232, "mean_token_accuracy": 0.1109985999763012, "num_tokens": 1624701.0, "step": 880 }, { "entropy": 7.128903436660766, "epoch": 0.07815259625574002, "grad_norm": 1.015625, "learning_rate": 0.000442, "loss": 7.0573, "mean_token_accuracy": 0.10825628340244293, "num_tokens": 1634085.0, "step": 885 }, { "entropy": 7.123282432556152, "epoch": 0.07859413634758036, "grad_norm": 1.0859375, "learning_rate": 0.0004445, "loss": 7.0322, "mean_token_accuracy": 0.11617021560668946, "num_tokens": 1643190.0, "step": 890 }, { "entropy": 7.060208940505982, "epoch": 0.0790356764394207, "grad_norm": 1.1796875, "learning_rate": 0.000447, "loss": 7.07, "mean_token_accuracy": 0.11254222765564918, "num_tokens": 1652705.0, "step": 895 }, { "entropy": 7.132848882675171, "epoch": 0.07947721653126104, "grad_norm": 1.046875, "learning_rate": 0.00044950000000000003, "loss": 7.0536, "mean_token_accuracy": 0.10692465007305145, "num_tokens": 1662210.0, "step": 900 }, { "entropy": 7.192712593078613, "epoch": 0.07991875662310138, "grad_norm": 0.984375, "learning_rate": 0.00045200000000000004, "loss": 7.1014, "mean_token_accuracy": 0.10652303621172905, "num_tokens": 1671893.0, "step": 905 }, { "entropy": 7.059550428390503, "epoch": 0.08036029671494171, "grad_norm": 1.1015625, "learning_rate": 0.00045450000000000004, "loss": 7.0402, "mean_token_accuracy": 0.11181816533207893, "num_tokens": 1681217.0, "step": 910 }, { "entropy": 7.149940156936646, "epoch": 0.08080183680678206, "grad_norm": 1.0390625, "learning_rate": 0.00045700000000000005, "loss": 7.053, "mean_token_accuracy": 0.11131602600216865, "num_tokens": 1690447.0, "step": 915 }, { "entropy": 7.081046295166016, "epoch": 0.08124337689862239, "grad_norm": 1.0, "learning_rate": 0.00045950000000000006, "loss": 7.1332, "mean_token_accuracy": 0.10568991601467133, "num_tokens": 1700355.0, "step": 920 }, { "entropy": 7.16390905380249, "epoch": 0.08168491699046274, "grad_norm": 1.015625, "learning_rate": 0.000462, "loss": 7.028, "mean_token_accuracy": 0.10254786685109138, "num_tokens": 1709449.0, "step": 925 }, { "entropy": 7.091014242172241, "epoch": 0.08212645708230307, "grad_norm": 1.1171875, "learning_rate": 0.0004645, "loss": 7.058, "mean_token_accuracy": 0.10658924430608749, "num_tokens": 1718838.0, "step": 930 }, { "entropy": 7.023260927200317, "epoch": 0.08256799717414341, "grad_norm": 1.046875, "learning_rate": 0.000467, "loss": 7.0217, "mean_token_accuracy": 0.11323517858982086, "num_tokens": 1728594.0, "step": 935 }, { "entropy": 7.1843287467956545, "epoch": 0.08300953726598376, "grad_norm": 0.984375, "learning_rate": 0.0004695, "loss": 7.0731, "mean_token_accuracy": 0.11138227805495263, "num_tokens": 1738814.0, "step": 940 }, { "entropy": 7.071042823791504, "epoch": 0.08345107735782409, "grad_norm": 1.1171875, "learning_rate": 0.000472, "loss": 7.0089, "mean_token_accuracy": 0.11532488241791725, "num_tokens": 1747644.0, "step": 945 }, { "entropy": 7.104792213439941, "epoch": 0.08389261744966443, "grad_norm": 1.21875, "learning_rate": 0.0004745, "loss": 7.0338, "mean_token_accuracy": 0.11352440416812896, "num_tokens": 1757489.0, "step": 950 }, { "entropy": 6.995518827438355, "epoch": 0.08433415754150476, "grad_norm": 1.0390625, "learning_rate": 0.000477, "loss": 6.9488, "mean_token_accuracy": 0.11878458335995674, "num_tokens": 1767546.0, "step": 955 }, { "entropy": 7.094525289535523, "epoch": 0.08477569763334511, "grad_norm": 1.3984375, "learning_rate": 0.0004795, "loss": 7.0088, "mean_token_accuracy": 0.10509251430630684, "num_tokens": 1776035.0, "step": 960 }, { "entropy": 7.100050449371338, "epoch": 0.08521723772518544, "grad_norm": 0.8984375, "learning_rate": 0.000482, "loss": 7.0869, "mean_token_accuracy": 0.10708501487970352, "num_tokens": 1786161.0, "step": 965 }, { "entropy": 7.161181020736694, "epoch": 0.08565877781702579, "grad_norm": 1.0625, "learning_rate": 0.0004845, "loss": 7.1355, "mean_token_accuracy": 0.10680384710431098, "num_tokens": 1796093.0, "step": 970 }, { "entropy": 7.064108896255493, "epoch": 0.08610031790886613, "grad_norm": 1.0390625, "learning_rate": 0.000487, "loss": 6.9829, "mean_token_accuracy": 0.1097193941473961, "num_tokens": 1805574.0, "step": 975 }, { "entropy": 7.07778491973877, "epoch": 0.08654185800070646, "grad_norm": 1.0703125, "learning_rate": 0.0004895, "loss": 7.0171, "mean_token_accuracy": 0.11008013710379601, "num_tokens": 1815175.0, "step": 980 }, { "entropy": 7.017868852615356, "epoch": 0.08698339809254681, "grad_norm": 1.03125, "learning_rate": 0.000492, "loss": 6.9322, "mean_token_accuracy": 0.11862852200865745, "num_tokens": 1824683.0, "step": 985 }, { "entropy": 7.004701948165893, "epoch": 0.08742493818438714, "grad_norm": 1.0703125, "learning_rate": 0.0004945, "loss": 6.9091, "mean_token_accuracy": 0.1145630083978176, "num_tokens": 1833174.0, "step": 990 }, { "entropy": 7.008507776260376, "epoch": 0.08786647827622748, "grad_norm": 1.203125, "learning_rate": 0.000497, "loss": 7.0124, "mean_token_accuracy": 0.1165225401520729, "num_tokens": 1842409.0, "step": 995 }, { "entropy": 6.900066137313843, "epoch": 0.08830801836806781, "grad_norm": 0.99609375, "learning_rate": 0.0004995, "loss": 6.9172, "mean_token_accuracy": 0.1189465768635273, "num_tokens": 1851441.0, "step": 1000 }, { "entropy": 7.234589004516602, "epoch": 0.08874955845990816, "grad_norm": 1.0703125, "learning_rate": 0.000499999998589561, "loss": 7.0545, "mean_token_accuracy": 0.1098681665956974, "num_tokens": 1861188.0, "step": 1005 }, { "entropy": 6.916832828521729, "epoch": 0.0891910985517485, "grad_norm": 1.1875, "learning_rate": 0.0004999999928596523, "loss": 6.9934, "mean_token_accuracy": 0.1134356640279293, "num_tokens": 1870284.0, "step": 1010 }, { "entropy": 6.8979510307312015, "epoch": 0.08963263864358884, "grad_norm": 1.046875, "learning_rate": 0.0004999999827221219, "loss": 6.9508, "mean_token_accuracy": 0.11564922854304313, "num_tokens": 1879744.0, "step": 1015 }, { "entropy": 7.141992807388306, "epoch": 0.09007417873542918, "grad_norm": 0.9921875, "learning_rate": 0.0004999999681769696, "loss": 6.9612, "mean_token_accuracy": 0.1163177601993084, "num_tokens": 1889241.0, "step": 1020 }, { "entropy": 6.971645736694336, "epoch": 0.09051571882726951, "grad_norm": 1.09375, "learning_rate": 0.000499999949224196, "loss": 6.969, "mean_token_accuracy": 0.11504201143980027, "num_tokens": 1898247.0, "step": 1025 }, { "entropy": 7.006195020675659, "epoch": 0.09095725891910986, "grad_norm": 1.0078125, "learning_rate": 0.0004999999258638013, "loss": 6.9244, "mean_token_accuracy": 0.11498644798994065, "num_tokens": 1907559.0, "step": 1030 }, { "entropy": 6.903467321395874, "epoch": 0.09139879901095019, "grad_norm": 1.09375, "learning_rate": 0.0004999998980957861, "loss": 6.924, "mean_token_accuracy": 0.11912157312035561, "num_tokens": 1917072.0, "step": 1035 }, { "entropy": 6.983310127258301, "epoch": 0.09184033910279053, "grad_norm": 0.94921875, "learning_rate": 0.0004999998659201508, "loss": 6.8753, "mean_token_accuracy": 0.11874028518795968, "num_tokens": 1926597.0, "step": 1040 }, { "entropy": 7.006656980514526, "epoch": 0.09228187919463088, "grad_norm": 0.96484375, "learning_rate": 0.0004999998293368961, "loss": 6.918, "mean_token_accuracy": 0.11443031057715417, "num_tokens": 1935978.0, "step": 1045 }, { "entropy": 6.924140882492066, "epoch": 0.09272341928647121, "grad_norm": 0.9765625, "learning_rate": 0.0004999997883460227, "loss": 6.864, "mean_token_accuracy": 0.1171707384288311, "num_tokens": 1944424.0, "step": 1050 }, { "entropy": 6.984135913848877, "epoch": 0.09316495937831155, "grad_norm": 0.98828125, "learning_rate": 0.0004999997429475314, "loss": 6.8384, "mean_token_accuracy": 0.12138021439313888, "num_tokens": 1953844.0, "step": 1055 }, { "entropy": 6.947112941741944, "epoch": 0.09360649947015189, "grad_norm": 1.0234375, "learning_rate": 0.0004999996931414232, "loss": 6.9207, "mean_token_accuracy": 0.12120825350284577, "num_tokens": 1963974.0, "step": 1060 }, { "entropy": 6.977735805511474, "epoch": 0.09404803956199223, "grad_norm": 1.03125, "learning_rate": 0.0004999996389276988, "loss": 6.8969, "mean_token_accuracy": 0.12291403263807296, "num_tokens": 1973466.0, "step": 1065 }, { "entropy": 6.803595685958863, "epoch": 0.09448957965383256, "grad_norm": 1.09375, "learning_rate": 0.0004999995803063596, "loss": 6.918, "mean_token_accuracy": 0.12123456448316575, "num_tokens": 1983478.0, "step": 1070 }, { "entropy": 7.003172111511231, "epoch": 0.0949311197456729, "grad_norm": 1.0390625, "learning_rate": 0.0004999995172774065, "loss": 6.9879, "mean_token_accuracy": 0.11715293675661087, "num_tokens": 1992775.0, "step": 1075 }, { "entropy": 7.030760860443115, "epoch": 0.09537265983751325, "grad_norm": 1.0625, "learning_rate": 0.0004999994498408408, "loss": 6.9419, "mean_token_accuracy": 0.11398354098200798, "num_tokens": 2002526.0, "step": 1080 }, { "entropy": 6.936420059204101, "epoch": 0.09581419992935358, "grad_norm": 0.9765625, "learning_rate": 0.0004999993779966639, "loss": 6.9592, "mean_token_accuracy": 0.11412434950470925, "num_tokens": 2012476.0, "step": 1085 }, { "entropy": 6.838730955123902, "epoch": 0.09625574002119393, "grad_norm": 1.1953125, "learning_rate": 0.0004999993017448771, "loss": 6.7924, "mean_token_accuracy": 0.13063410446047782, "num_tokens": 2021252.0, "step": 1090 }, { "entropy": 7.061969757080078, "epoch": 0.09669728011303426, "grad_norm": 1.0, "learning_rate": 0.0004999992210854821, "loss": 6.9411, "mean_token_accuracy": 0.11837697625160218, "num_tokens": 2031438.0, "step": 1095 }, { "entropy": 6.9324125289917, "epoch": 0.0971388202048746, "grad_norm": 1.1015625, "learning_rate": 0.0004999991360184801, "loss": 6.9789, "mean_token_accuracy": 0.11443927884101868, "num_tokens": 2041319.0, "step": 1100 }, { "entropy": 6.95938081741333, "epoch": 0.09758036029671494, "grad_norm": 0.859375, "learning_rate": 0.0004999990465438731, "loss": 6.9746, "mean_token_accuracy": 0.11487890034914017, "num_tokens": 2052060.0, "step": 1105 }, { "entropy": 6.891486358642578, "epoch": 0.09802190038855528, "grad_norm": 1.1640625, "learning_rate": 0.0004999989526616628, "loss": 6.8643, "mean_token_accuracy": 0.12554761841893197, "num_tokens": 2061331.0, "step": 1110 }, { "entropy": 6.952145195007324, "epoch": 0.09846344048039563, "grad_norm": 1.0390625, "learning_rate": 0.0004999988543718509, "loss": 6.8733, "mean_token_accuracy": 0.11660940647125244, "num_tokens": 2070006.0, "step": 1115 }, { "entropy": 7.005694484710693, "epoch": 0.09890498057223596, "grad_norm": 1.171875, "learning_rate": 0.0004999987516744394, "loss": 6.8403, "mean_token_accuracy": 0.12942354679107665, "num_tokens": 2079089.0, "step": 1120 }, { "entropy": 6.814391231536865, "epoch": 0.0993465206640763, "grad_norm": 1.109375, "learning_rate": 0.0004999986445694303, "loss": 6.8164, "mean_token_accuracy": 0.12300374433398246, "num_tokens": 2087237.0, "step": 1125 }, { "entropy": 6.877581930160522, "epoch": 0.09978806075591663, "grad_norm": 1.109375, "learning_rate": 0.0004999985330568258, "loss": 6.7838, "mean_token_accuracy": 0.12274843603372573, "num_tokens": 2096598.0, "step": 1130 }, { "entropy": 6.867826128005982, "epoch": 0.10022960084775698, "grad_norm": 1.1015625, "learning_rate": 0.0004999984171366278, "loss": 6.8802, "mean_token_accuracy": 0.11098882853984833, "num_tokens": 2106106.0, "step": 1135 }, { "entropy": 6.928639554977417, "epoch": 0.10067114093959731, "grad_norm": 0.98828125, "learning_rate": 0.000499998296808839, "loss": 6.856, "mean_token_accuracy": 0.11591004803776742, "num_tokens": 2115478.0, "step": 1140 }, { "entropy": 6.942514848709107, "epoch": 0.10111268103143765, "grad_norm": 1.25, "learning_rate": 0.0004999981720734615, "loss": 6.9007, "mean_token_accuracy": 0.11097749546170235, "num_tokens": 2124439.0, "step": 1145 }, { "entropy": 6.928429985046387, "epoch": 0.101554221123278, "grad_norm": 1.1015625, "learning_rate": 0.0004999980429304977, "loss": 6.8983, "mean_token_accuracy": 0.11434343308210373, "num_tokens": 2133718.0, "step": 1150 }, { "entropy": 6.835229969024658, "epoch": 0.10199576121511833, "grad_norm": 1.1171875, "learning_rate": 0.0004999979093799502, "loss": 6.801, "mean_token_accuracy": 0.12133207321166992, "num_tokens": 2142978.0, "step": 1155 }, { "entropy": 6.8165655612945555, "epoch": 0.10243730130695868, "grad_norm": 1.1484375, "learning_rate": 0.0004999977714218217, "loss": 6.863, "mean_token_accuracy": 0.117961073666811, "num_tokens": 2152250.0, "step": 1160 }, { "entropy": 6.972959232330322, "epoch": 0.102878841398799, "grad_norm": 0.9765625, "learning_rate": 0.0004999976290561147, "loss": 6.8608, "mean_token_accuracy": 0.11624824330210685, "num_tokens": 2161620.0, "step": 1165 }, { "entropy": 6.921932792663574, "epoch": 0.10332038149063935, "grad_norm": 0.9375, "learning_rate": 0.0004999974822828322, "loss": 6.8726, "mean_token_accuracy": 0.1173239678144455, "num_tokens": 2170856.0, "step": 1170 }, { "entropy": 6.885772609710694, "epoch": 0.10376192158247968, "grad_norm": 0.98828125, "learning_rate": 0.000499997331101977, "loss": 6.8626, "mean_token_accuracy": 0.11568826138973236, "num_tokens": 2180926.0, "step": 1175 }, { "entropy": 6.911135244369507, "epoch": 0.10420346167432003, "grad_norm": 1.1171875, "learning_rate": 0.000499997175513552, "loss": 6.8419, "mean_token_accuracy": 0.11412648186087608, "num_tokens": 2190248.0, "step": 1180 }, { "entropy": 6.86896710395813, "epoch": 0.10464500176616037, "grad_norm": 0.96484375, "learning_rate": 0.0004999970155175603, "loss": 6.855, "mean_token_accuracy": 0.12222710996866226, "num_tokens": 2199833.0, "step": 1185 }, { "entropy": 6.856761360168457, "epoch": 0.1050865418580007, "grad_norm": 0.98828125, "learning_rate": 0.000499996851114005, "loss": 6.8115, "mean_token_accuracy": 0.12742498219013215, "num_tokens": 2208240.0, "step": 1190 }, { "entropy": 6.837878751754761, "epoch": 0.10552808194984105, "grad_norm": 0.93359375, "learning_rate": 0.0004999966823028894, "loss": 6.8776, "mean_token_accuracy": 0.11117666661739349, "num_tokens": 2218758.0, "step": 1195 }, { "entropy": 6.94004077911377, "epoch": 0.10596962204168138, "grad_norm": 0.984375, "learning_rate": 0.0004999965090842168, "loss": 6.8665, "mean_token_accuracy": 0.12320348769426345, "num_tokens": 2228443.0, "step": 1200 }, { "entropy": 6.74250750541687, "epoch": 0.10641116213352173, "grad_norm": 1.0625, "learning_rate": 0.0004999963314579905, "loss": 6.7084, "mean_token_accuracy": 0.1319122113287449, "num_tokens": 2236787.0, "step": 1205 }, { "entropy": 6.956686353683471, "epoch": 0.10685270222536206, "grad_norm": 1.0234375, "learning_rate": 0.0004999961494242139, "loss": 6.8901, "mean_token_accuracy": 0.11468368023633957, "num_tokens": 2247089.0, "step": 1210 }, { "entropy": 6.789958381652832, "epoch": 0.1072942423172024, "grad_norm": 1.1328125, "learning_rate": 0.0004999959629828908, "loss": 6.816, "mean_token_accuracy": 0.11569953635334969, "num_tokens": 2256045.0, "step": 1215 }, { "entropy": 6.929527616500854, "epoch": 0.10773578240904275, "grad_norm": 0.96875, "learning_rate": 0.0004999957721340248, "loss": 6.8769, "mean_token_accuracy": 0.12202595993876457, "num_tokens": 2265835.0, "step": 1220 }, { "entropy": 6.807823610305786, "epoch": 0.10817732250088308, "grad_norm": 1.2109375, "learning_rate": 0.0004999955768776195, "loss": 6.8076, "mean_token_accuracy": 0.11702087000012398, "num_tokens": 2275318.0, "step": 1225 }, { "entropy": 6.869472360610962, "epoch": 0.10861886259272342, "grad_norm": 1.09375, "learning_rate": 0.0004999953772136788, "loss": 6.7978, "mean_token_accuracy": 0.12102394551038742, "num_tokens": 2284821.0, "step": 1230 }, { "entropy": 6.885239315032959, "epoch": 0.10906040268456375, "grad_norm": 1.015625, "learning_rate": 0.0004999951731422068, "loss": 6.7645, "mean_token_accuracy": 0.12054353281855583, "num_tokens": 2294013.0, "step": 1235 }, { "entropy": 6.847450494766235, "epoch": 0.1095019427764041, "grad_norm": 1.125, "learning_rate": 0.0004999949646632072, "loss": 6.7626, "mean_token_accuracy": 0.12177001982927323, "num_tokens": 2302727.0, "step": 1240 }, { "entropy": 6.8077342987060545, "epoch": 0.10994348286824443, "grad_norm": 1.078125, "learning_rate": 0.0004999947517766842, "loss": 6.8031, "mean_token_accuracy": 0.12163913846015931, "num_tokens": 2312032.0, "step": 1245 }, { "entropy": 6.986685180664063, "epoch": 0.11038502296008477, "grad_norm": 1.1640625, "learning_rate": 0.000499994534482642, "loss": 6.8748, "mean_token_accuracy": 0.1190544456243515, "num_tokens": 2321839.0, "step": 1250 }, { "entropy": 6.811014032363891, "epoch": 0.11082656305192512, "grad_norm": 0.95703125, "learning_rate": 0.0004999943127810847, "loss": 6.8536, "mean_token_accuracy": 0.1122577242553234, "num_tokens": 2331255.0, "step": 1255 }, { "entropy": 6.794656276702881, "epoch": 0.11126810314376545, "grad_norm": 1.015625, "learning_rate": 0.0004999940866720169, "loss": 6.6705, "mean_token_accuracy": 0.12881582453846932, "num_tokens": 2340038.0, "step": 1260 }, { "entropy": 6.71243953704834, "epoch": 0.1117096432356058, "grad_norm": 1.0234375, "learning_rate": 0.0004999938561554429, "loss": 6.7797, "mean_token_accuracy": 0.12242485880851746, "num_tokens": 2348901.0, "step": 1265 }, { "entropy": 6.858448314666748, "epoch": 0.11215118332744613, "grad_norm": 0.97265625, "learning_rate": 0.0004999936212313672, "loss": 6.8659, "mean_token_accuracy": 0.11461173072457313, "num_tokens": 2358842.0, "step": 1270 }, { "entropy": 6.8239977836608885, "epoch": 0.11259272341928647, "grad_norm": 1.0625, "learning_rate": 0.0004999933818997943, "loss": 6.7596, "mean_token_accuracy": 0.12424605414271354, "num_tokens": 2368650.0, "step": 1275 }, { "entropy": 6.831825399398804, "epoch": 0.1130342635111268, "grad_norm": 0.96875, "learning_rate": 0.0004999931381607292, "loss": 6.8252, "mean_token_accuracy": 0.12058763056993485, "num_tokens": 2377916.0, "step": 1280 }, { "entropy": 6.818245553970337, "epoch": 0.11347580360296715, "grad_norm": 1.09375, "learning_rate": 0.0004999928900141764, "loss": 6.7698, "mean_token_accuracy": 0.12198482304811478, "num_tokens": 2387507.0, "step": 1285 }, { "entropy": 6.819052505493164, "epoch": 0.1139173436948075, "grad_norm": 1.109375, "learning_rate": 0.000499992637460141, "loss": 6.8052, "mean_token_accuracy": 0.12523134648799897, "num_tokens": 2396148.0, "step": 1290 }, { "entropy": 6.816552209854126, "epoch": 0.11435888378664782, "grad_norm": 0.9921875, "learning_rate": 0.0004999923804986275, "loss": 6.693, "mean_token_accuracy": 0.11803872361779214, "num_tokens": 2404891.0, "step": 1295 }, { "entropy": 6.840794086456299, "epoch": 0.11480042387848817, "grad_norm": 0.98828125, "learning_rate": 0.0004999921191296415, "loss": 6.7153, "mean_token_accuracy": 0.12199744880199433, "num_tokens": 2414406.0, "step": 1300 }, { "entropy": 6.7452630519866945, "epoch": 0.1152419639703285, "grad_norm": 1.1171875, "learning_rate": 0.0004999918533531877, "loss": 6.8046, "mean_token_accuracy": 0.1228412576019764, "num_tokens": 2424363.0, "step": 1305 }, { "entropy": 6.852883148193359, "epoch": 0.11568350406216885, "grad_norm": 0.9921875, "learning_rate": 0.0004999915831692714, "loss": 6.7419, "mean_token_accuracy": 0.1251549780368805, "num_tokens": 2433753.0, "step": 1310 }, { "entropy": 6.7218766689300535, "epoch": 0.11612504415400918, "grad_norm": 1.0, "learning_rate": 0.0004999913085778981, "loss": 6.7685, "mean_token_accuracy": 0.1185051940381527, "num_tokens": 2443275.0, "step": 1315 }, { "entropy": 6.873374080657959, "epoch": 0.11656658424584952, "grad_norm": 1.0625, "learning_rate": 0.0004999910295790729, "loss": 6.7937, "mean_token_accuracy": 0.11835979968309403, "num_tokens": 2452510.0, "step": 1320 }, { "entropy": 6.8684648990631105, "epoch": 0.11700812433768987, "grad_norm": 1.078125, "learning_rate": 0.0004999907461728014, "loss": 6.8746, "mean_token_accuracy": 0.1169828750193119, "num_tokens": 2462742.0, "step": 1325 }, { "entropy": 6.740426445007325, "epoch": 0.1174496644295302, "grad_norm": 0.96875, "learning_rate": 0.0004999904583590893, "loss": 6.7434, "mean_token_accuracy": 0.12029099017381668, "num_tokens": 2471409.0, "step": 1330 }, { "entropy": 6.839800691604614, "epoch": 0.11789120452137054, "grad_norm": 0.984375, "learning_rate": 0.0004999901661379418, "loss": 6.6931, "mean_token_accuracy": 0.12773663252592088, "num_tokens": 2481011.0, "step": 1335 }, { "entropy": 6.690527105331421, "epoch": 0.11833274461321087, "grad_norm": 0.953125, "learning_rate": 0.0004999898695093652, "loss": 6.7866, "mean_token_accuracy": 0.12104339599609375, "num_tokens": 2490664.0, "step": 1340 }, { "entropy": 6.818962049484253, "epoch": 0.11877428470505122, "grad_norm": 1.046875, "learning_rate": 0.0004999895684733648, "loss": 6.7279, "mean_token_accuracy": 0.12824407517910003, "num_tokens": 2499799.0, "step": 1345 }, { "entropy": 6.8539299964904785, "epoch": 0.11921582479689156, "grad_norm": 1.53125, "learning_rate": 0.0004999892630299467, "loss": 6.7045, "mean_token_accuracy": 0.1257259279489517, "num_tokens": 2508780.0, "step": 1350 }, { "entropy": 6.706318616867065, "epoch": 0.1196573648887319, "grad_norm": 1.0546875, "learning_rate": 0.0004999889531791171, "loss": 6.7138, "mean_token_accuracy": 0.12127138078212737, "num_tokens": 2517741.0, "step": 1355 }, { "entropy": 6.766215896606445, "epoch": 0.12009890498057224, "grad_norm": 0.9921875, "learning_rate": 0.0004999886389208817, "loss": 6.7972, "mean_token_accuracy": 0.11826895922422409, "num_tokens": 2528742.0, "step": 1360 }, { "entropy": 6.717579746246338, "epoch": 0.12054044507241257, "grad_norm": 0.94140625, "learning_rate": 0.0004999883202552468, "loss": 6.7345, "mean_token_accuracy": 0.12455343306064606, "num_tokens": 2538609.0, "step": 1365 }, { "entropy": 6.882754898071289, "epoch": 0.12098198516425292, "grad_norm": 1.046875, "learning_rate": 0.0004999879971822189, "loss": 6.7157, "mean_token_accuracy": 0.11966117843985558, "num_tokens": 2547772.0, "step": 1370 }, { "entropy": 6.69037971496582, "epoch": 0.12142352525609325, "grad_norm": 1.1015625, "learning_rate": 0.0004999876697018038, "loss": 6.6897, "mean_token_accuracy": 0.12769502475857736, "num_tokens": 2556114.0, "step": 1375 }, { "entropy": 6.8372406482696535, "epoch": 0.1218650653479336, "grad_norm": 1.1328125, "learning_rate": 0.0004999873378140085, "loss": 6.7182, "mean_token_accuracy": 0.12253274098038673, "num_tokens": 2566814.0, "step": 1380 }, { "entropy": 6.714360284805298, "epoch": 0.12230660543977394, "grad_norm": 1.1484375, "learning_rate": 0.0004999870015188389, "loss": 6.6914, "mean_token_accuracy": 0.12260655164718628, "num_tokens": 2576030.0, "step": 1385 }, { "entropy": 6.743939208984375, "epoch": 0.12274814553161427, "grad_norm": 0.96484375, "learning_rate": 0.0004999866608163021, "loss": 6.7176, "mean_token_accuracy": 0.1260794699192047, "num_tokens": 2585756.0, "step": 1390 }, { "entropy": 6.800289154052734, "epoch": 0.12318968562345461, "grad_norm": 1.0703125, "learning_rate": 0.0004999863157064045, "loss": 6.7797, "mean_token_accuracy": 0.1238692507147789, "num_tokens": 2595676.0, "step": 1395 }, { "entropy": 6.8109955310821535, "epoch": 0.12363122571529495, "grad_norm": 0.96484375, "learning_rate": 0.0004999859661891529, "loss": 6.7624, "mean_token_accuracy": 0.1246532566845417, "num_tokens": 2606197.0, "step": 1400 }, { "entropy": 6.76338677406311, "epoch": 0.12407276580713529, "grad_norm": 1.1328125, "learning_rate": 0.0004999856122645543, "loss": 6.6854, "mean_token_accuracy": 0.12518818601965903, "num_tokens": 2615311.0, "step": 1405 }, { "entropy": 6.725164937973022, "epoch": 0.12451430589897562, "grad_norm": 1.1484375, "learning_rate": 0.0004999852539326154, "loss": 6.5931, "mean_token_accuracy": 0.12619537115097046, "num_tokens": 2624074.0, "step": 1410 }, { "entropy": 6.681712102890015, "epoch": 0.12495584599081597, "grad_norm": 1.0859375, "learning_rate": 0.0004999848911933434, "loss": 6.7411, "mean_token_accuracy": 0.13295672461390495, "num_tokens": 2633877.0, "step": 1415 }, { "entropy": 6.7498420715332035, "epoch": 0.1253973860826563, "grad_norm": 0.92578125, "learning_rate": 0.0004999845240467453, "loss": 6.6702, "mean_token_accuracy": 0.12471742331981658, "num_tokens": 2643330.0, "step": 1420 }, { "entropy": 6.74535722732544, "epoch": 0.12583892617449666, "grad_norm": 0.98046875, "learning_rate": 0.0004999841524928282, "loss": 6.6543, "mean_token_accuracy": 0.12766205966472627, "num_tokens": 2652070.0, "step": 1425 }, { "entropy": 6.722480249404907, "epoch": 0.126280466266337, "grad_norm": 1.1171875, "learning_rate": 0.0004999837765315997, "loss": 6.697, "mean_token_accuracy": 0.1277957484126091, "num_tokens": 2660546.0, "step": 1430 }, { "entropy": 6.598982477188111, "epoch": 0.12672200635817732, "grad_norm": 0.9453125, "learning_rate": 0.0004999833961630669, "loss": 6.5999, "mean_token_accuracy": 0.1297621488571167, "num_tokens": 2669938.0, "step": 1435 }, { "entropy": 6.852873420715332, "epoch": 0.12716354645001765, "grad_norm": 0.94140625, "learning_rate": 0.0004999830113872374, "loss": 6.7248, "mean_token_accuracy": 0.12213384285569191, "num_tokens": 2679814.0, "step": 1440 }, { "entropy": 6.585267829895019, "epoch": 0.127605086541858, "grad_norm": 1.046875, "learning_rate": 0.0004999826222041186, "loss": 6.6355, "mean_token_accuracy": 0.13383011817932128, "num_tokens": 2688733.0, "step": 1445 }, { "entropy": 6.706393527984619, "epoch": 0.12804662663369834, "grad_norm": 1.1015625, "learning_rate": 0.0004999822286137182, "loss": 6.6188, "mean_token_accuracy": 0.12774292454123498, "num_tokens": 2697744.0, "step": 1450 }, { "entropy": 6.633390617370606, "epoch": 0.12848816672553867, "grad_norm": 0.98828125, "learning_rate": 0.0004999818306160439, "loss": 6.5827, "mean_token_accuracy": 0.13278514444828032, "num_tokens": 2707037.0, "step": 1455 }, { "entropy": 6.808015394210815, "epoch": 0.12892970681737903, "grad_norm": 0.90625, "learning_rate": 0.0004999814282111034, "loss": 6.7453, "mean_token_accuracy": 0.12486135885119438, "num_tokens": 2717345.0, "step": 1460 }, { "entropy": 6.7304778575897215, "epoch": 0.12937124690921936, "grad_norm": 1.0390625, "learning_rate": 0.0004999810213989047, "loss": 6.7017, "mean_token_accuracy": 0.12036777138710023, "num_tokens": 2726892.0, "step": 1465 }, { "entropy": 6.667453098297119, "epoch": 0.1298127870010597, "grad_norm": 1.0234375, "learning_rate": 0.0004999806101794558, "loss": 6.6615, "mean_token_accuracy": 0.12705308422446251, "num_tokens": 2736479.0, "step": 1470 }, { "entropy": 6.809973049163818, "epoch": 0.13025432709290002, "grad_norm": 1.09375, "learning_rate": 0.0004999801945527648, "loss": 6.7078, "mean_token_accuracy": 0.12117967531085014, "num_tokens": 2745998.0, "step": 1475 }, { "entropy": 6.725075721740723, "epoch": 0.13069586718474038, "grad_norm": 1.03125, "learning_rate": 0.0004999797745188395, "loss": 6.6906, "mean_token_accuracy": 0.12821464985609055, "num_tokens": 2754346.0, "step": 1480 }, { "entropy": 6.686868619918823, "epoch": 0.13113740727658071, "grad_norm": 1.0703125, "learning_rate": 0.0004999793500776886, "loss": 6.6285, "mean_token_accuracy": 0.1294437274336815, "num_tokens": 2763391.0, "step": 1485 }, { "entropy": 6.674964761734008, "epoch": 0.13157894736842105, "grad_norm": 1.1015625, "learning_rate": 0.0004999789212293201, "loss": 6.6827, "mean_token_accuracy": 0.12898893728852273, "num_tokens": 2772764.0, "step": 1490 }, { "entropy": 6.775622892379761, "epoch": 0.1320204874602614, "grad_norm": 1.1640625, "learning_rate": 0.0004999784879737423, "loss": 6.78, "mean_token_accuracy": 0.12160285860300064, "num_tokens": 2782312.0, "step": 1495 }, { "entropy": 6.745266675949097, "epoch": 0.13246202755210174, "grad_norm": 1.0625, "learning_rate": 0.0004999780503109642, "loss": 6.6798, "mean_token_accuracy": 0.1227384127676487, "num_tokens": 2791159.0, "step": 1500 }, { "entropy": 6.650141859054566, "epoch": 0.13290356764394207, "grad_norm": 1.046875, "learning_rate": 0.0004999776082409939, "loss": 6.5068, "mean_token_accuracy": 0.13456878885626794, "num_tokens": 2799319.0, "step": 1505 }, { "entropy": 6.6320771217346195, "epoch": 0.1333451077357824, "grad_norm": 1.078125, "learning_rate": 0.0004999771617638401, "loss": 6.6316, "mean_token_accuracy": 0.12712259590625763, "num_tokens": 2807401.0, "step": 1510 }, { "entropy": 6.758873414993286, "epoch": 0.13378664782762276, "grad_norm": 1.078125, "learning_rate": 0.0004999767108795118, "loss": 6.6961, "mean_token_accuracy": 0.12330949455499648, "num_tokens": 2817734.0, "step": 1515 }, { "entropy": 6.644004774093628, "epoch": 0.1342281879194631, "grad_norm": 1.0390625, "learning_rate": 0.0004999762555880176, "loss": 6.6783, "mean_token_accuracy": 0.12152940481901169, "num_tokens": 2828235.0, "step": 1520 }, { "entropy": 6.743390846252441, "epoch": 0.13466972801130342, "grad_norm": 1.046875, "learning_rate": 0.0004999757958893666, "loss": 6.7124, "mean_token_accuracy": 0.1237283930182457, "num_tokens": 2837453.0, "step": 1525 }, { "entropy": 6.60786280632019, "epoch": 0.13511126810314378, "grad_norm": 0.9453125, "learning_rate": 0.0004999753317835677, "loss": 6.6795, "mean_token_accuracy": 0.12087962031364441, "num_tokens": 2847055.0, "step": 1530 }, { "entropy": 6.7429241180419925, "epoch": 0.1355528081949841, "grad_norm": 1.0, "learning_rate": 0.0004999748632706299, "loss": 6.6568, "mean_token_accuracy": 0.13167096227407454, "num_tokens": 2857101.0, "step": 1535 }, { "entropy": 6.673015403747558, "epoch": 0.13599434828682444, "grad_norm": 1.140625, "learning_rate": 0.0004999743903505626, "loss": 6.553, "mean_token_accuracy": 0.1336723633110523, "num_tokens": 2866685.0, "step": 1540 }, { "entropy": 6.676457214355469, "epoch": 0.13643588837866477, "grad_norm": 1.0234375, "learning_rate": 0.0004999739130233749, "loss": 6.6731, "mean_token_accuracy": 0.12713466510176658, "num_tokens": 2876022.0, "step": 1545 }, { "entropy": 6.783061361312866, "epoch": 0.13687742847050513, "grad_norm": 0.96484375, "learning_rate": 0.0004999734312890761, "loss": 6.6062, "mean_token_accuracy": 0.12626957073807715, "num_tokens": 2885560.0, "step": 1550 }, { "entropy": 6.59228024482727, "epoch": 0.13731896856234546, "grad_norm": 0.9375, "learning_rate": 0.0004999729451476757, "loss": 6.6439, "mean_token_accuracy": 0.12623701319098474, "num_tokens": 2894686.0, "step": 1555 }, { "entropy": 6.730476140975952, "epoch": 0.1377605086541858, "grad_norm": 0.98046875, "learning_rate": 0.0004999724545991835, "loss": 6.6588, "mean_token_accuracy": 0.13341889455914496, "num_tokens": 2904390.0, "step": 1560 }, { "entropy": 6.662082195281982, "epoch": 0.13820204874602615, "grad_norm": 1.1171875, "learning_rate": 0.0004999719596436086, "loss": 6.6982, "mean_token_accuracy": 0.12678939029574393, "num_tokens": 2913311.0, "step": 1565 }, { "entropy": 6.736054372787476, "epoch": 0.13864358883786648, "grad_norm": 0.95703125, "learning_rate": 0.0004999714602809611, "loss": 6.578, "mean_token_accuracy": 0.13448369055986403, "num_tokens": 2923196.0, "step": 1570 }, { "entropy": 6.637621927261352, "epoch": 0.1390851289297068, "grad_norm": 1.0859375, "learning_rate": 0.0004999709565112506, "loss": 6.6428, "mean_token_accuracy": 0.12785085365176202, "num_tokens": 2932813.0, "step": 1575 }, { "entropy": 6.658017587661743, "epoch": 0.13952666902154715, "grad_norm": 0.95703125, "learning_rate": 0.000499970448334487, "loss": 6.5634, "mean_token_accuracy": 0.119098000228405, "num_tokens": 2942694.0, "step": 1580 }, { "entropy": 6.5801304340362545, "epoch": 0.1399682091133875, "grad_norm": 1.0859375, "learning_rate": 0.0004999699357506803, "loss": 6.5185, "mean_token_accuracy": 0.1330326519906521, "num_tokens": 2950932.0, "step": 1585 }, { "entropy": 6.701116371154785, "epoch": 0.14040974920522784, "grad_norm": 1.0703125, "learning_rate": 0.0004999694187598406, "loss": 6.6085, "mean_token_accuracy": 0.1314128704369068, "num_tokens": 2960102.0, "step": 1590 }, { "entropy": 6.627189350128174, "epoch": 0.14085128929706817, "grad_norm": 1.578125, "learning_rate": 0.0004999688973619777, "loss": 6.6088, "mean_token_accuracy": 0.12604895159602164, "num_tokens": 2969968.0, "step": 1595 }, { "entropy": 6.588514184951782, "epoch": 0.14129282938890853, "grad_norm": 0.9296875, "learning_rate": 0.0004999683715571022, "loss": 6.5542, "mean_token_accuracy": 0.1347218669950962, "num_tokens": 2978880.0, "step": 1600 }, { "entropy": 6.644829893112183, "epoch": 0.14173436948074886, "grad_norm": 0.93359375, "learning_rate": 0.0004999678413452242, "loss": 6.5863, "mean_token_accuracy": 0.12890450209379195, "num_tokens": 2988369.0, "step": 1605 }, { "entropy": 6.667511320114135, "epoch": 0.1421759095725892, "grad_norm": 0.953125, "learning_rate": 0.0004999673067263542, "loss": 6.6373, "mean_token_accuracy": 0.12620072290301323, "num_tokens": 2997070.0, "step": 1610 }, { "entropy": 6.682367372512817, "epoch": 0.14261744966442952, "grad_norm": 0.9921875, "learning_rate": 0.0004999667677005026, "loss": 6.5749, "mean_token_accuracy": 0.13310741856694222, "num_tokens": 3006547.0, "step": 1615 }, { "entropy": 6.584122562408448, "epoch": 0.14305898975626988, "grad_norm": 0.953125, "learning_rate": 0.0004999662242676799, "loss": 6.5986, "mean_token_accuracy": 0.1310425490140915, "num_tokens": 3015821.0, "step": 1620 }, { "entropy": 6.665965223312378, "epoch": 0.1435005298481102, "grad_norm": 0.96875, "learning_rate": 0.0004999656764278968, "loss": 6.5655, "mean_token_accuracy": 0.1309148021042347, "num_tokens": 3024750.0, "step": 1625 }, { "entropy": 6.623423147201538, "epoch": 0.14394206993995054, "grad_norm": 1.0546875, "learning_rate": 0.0004999651241811642, "loss": 6.5389, "mean_token_accuracy": 0.1308048278093338, "num_tokens": 3033345.0, "step": 1630 }, { "entropy": 6.59770941734314, "epoch": 0.1443836100317909, "grad_norm": 1.03125, "learning_rate": 0.0004999645675274925, "loss": 6.5209, "mean_token_accuracy": 0.13443350791931152, "num_tokens": 3042060.0, "step": 1635 }, { "entropy": 6.660798358917236, "epoch": 0.14482515012363123, "grad_norm": 1.0625, "learning_rate": 0.0004999640064668931, "loss": 6.6684, "mean_token_accuracy": 0.12563745751976968, "num_tokens": 3052490.0, "step": 1640 }, { "entropy": 6.726674222946167, "epoch": 0.14526669021547156, "grad_norm": 1.046875, "learning_rate": 0.0004999634409993766, "loss": 6.6441, "mean_token_accuracy": 0.12772516757249833, "num_tokens": 3061934.0, "step": 1645 }, { "entropy": 6.556687307357788, "epoch": 0.1457082303073119, "grad_norm": 1.046875, "learning_rate": 0.0004999628711249544, "loss": 6.5591, "mean_token_accuracy": 0.13611432090401648, "num_tokens": 3070890.0, "step": 1650 }, { "entropy": 6.703302097320557, "epoch": 0.14614977039915225, "grad_norm": 0.93359375, "learning_rate": 0.0004999622968436373, "loss": 6.5614, "mean_token_accuracy": 0.12631918862462044, "num_tokens": 3079933.0, "step": 1655 }, { "entropy": 6.5565461158752445, "epoch": 0.14659131049099258, "grad_norm": 0.96484375, "learning_rate": 0.0004999617181554369, "loss": 6.6078, "mean_token_accuracy": 0.13254086449742317, "num_tokens": 3089910.0, "step": 1660 }, { "entropy": 6.718549633026123, "epoch": 0.1470328505828329, "grad_norm": 1.1875, "learning_rate": 0.0004999611350603643, "loss": 6.5916, "mean_token_accuracy": 0.13437702059745787, "num_tokens": 3098676.0, "step": 1665 }, { "entropy": 6.595283174514771, "epoch": 0.14747439067467327, "grad_norm": 1.0078125, "learning_rate": 0.000499960547558431, "loss": 6.7159, "mean_token_accuracy": 0.1256335400044918, "num_tokens": 3109055.0, "step": 1670 }, { "entropy": 6.624332904815674, "epoch": 0.1479159307665136, "grad_norm": 1.1328125, "learning_rate": 0.0004999599556496486, "loss": 6.5472, "mean_token_accuracy": 0.13295547068119049, "num_tokens": 3117517.0, "step": 1675 }, { "entropy": 6.573185825347901, "epoch": 0.14835747085835393, "grad_norm": 1.0390625, "learning_rate": 0.0004999593593340286, "loss": 6.5544, "mean_token_accuracy": 0.13249536529183387, "num_tokens": 3126606.0, "step": 1680 }, { "entropy": 6.704092359542846, "epoch": 0.14879901095019427, "grad_norm": 0.95703125, "learning_rate": 0.0004999587586115826, "loss": 6.5733, "mean_token_accuracy": 0.12993996366858482, "num_tokens": 3135444.0, "step": 1685 }, { "entropy": 6.57456955909729, "epoch": 0.14924055104203462, "grad_norm": 0.94140625, "learning_rate": 0.0004999581534823226, "loss": 6.4927, "mean_token_accuracy": 0.14061653688549997, "num_tokens": 3144967.0, "step": 1690 }, { "entropy": 6.607442092895508, "epoch": 0.14968209113387496, "grad_norm": 0.99609375, "learning_rate": 0.0004999575439462601, "loss": 6.522, "mean_token_accuracy": 0.12690635845065118, "num_tokens": 3153898.0, "step": 1695 }, { "entropy": 6.536995601654053, "epoch": 0.1501236312257153, "grad_norm": 1.0546875, "learning_rate": 0.0004999569300034075, "loss": 6.5921, "mean_token_accuracy": 0.12649724259972572, "num_tokens": 3162341.0, "step": 1700 }, { "entropy": 6.60729718208313, "epoch": 0.15056517131755565, "grad_norm": 1.0234375, "learning_rate": 0.0004999563116537764, "loss": 6.3826, "mean_token_accuracy": 0.14100785404443741, "num_tokens": 3171420.0, "step": 1705 }, { "entropy": 6.54091591835022, "epoch": 0.15100671140939598, "grad_norm": 1.0078125, "learning_rate": 0.0004999556888973792, "loss": 6.4366, "mean_token_accuracy": 0.13033056780695915, "num_tokens": 3180221.0, "step": 1710 }, { "entropy": 6.6625391960144045, "epoch": 0.1514482515012363, "grad_norm": 1.0859375, "learning_rate": 0.0004999550617342279, "loss": 6.587, "mean_token_accuracy": 0.12798949852585792, "num_tokens": 3189156.0, "step": 1715 }, { "entropy": 6.539492607116699, "epoch": 0.15188979159307664, "grad_norm": 1.0390625, "learning_rate": 0.000499954430164335, "loss": 6.5149, "mean_token_accuracy": 0.13213447630405425, "num_tokens": 3199870.0, "step": 1720 }, { "entropy": 6.559759998321534, "epoch": 0.152331331684917, "grad_norm": 1.0234375, "learning_rate": 0.0004999537941877127, "loss": 6.4883, "mean_token_accuracy": 0.13440720960497857, "num_tokens": 3208815.0, "step": 1725 }, { "entropy": 6.604632616043091, "epoch": 0.15277287177675733, "grad_norm": 0.921875, "learning_rate": 0.0004999531538043735, "loss": 6.5717, "mean_token_accuracy": 0.13279605731368066, "num_tokens": 3218692.0, "step": 1730 }, { "entropy": 6.584544372558594, "epoch": 0.15321441186859766, "grad_norm": 1.0078125, "learning_rate": 0.0004999525090143298, "loss": 6.5367, "mean_token_accuracy": 0.14099612906575204, "num_tokens": 3227604.0, "step": 1735 }, { "entropy": 6.629879665374756, "epoch": 0.15365595196043802, "grad_norm": 1.03125, "learning_rate": 0.0004999518598175946, "loss": 6.5446, "mean_token_accuracy": 0.14111834466457368, "num_tokens": 3237912.0, "step": 1740 }, { "entropy": 6.453984022140503, "epoch": 0.15409749205227835, "grad_norm": 1.0546875, "learning_rate": 0.0004999512062141805, "loss": 6.4239, "mean_token_accuracy": 0.13701695203781128, "num_tokens": 3246003.0, "step": 1745 }, { "entropy": 6.6436468124389645, "epoch": 0.15453903214411868, "grad_norm": 0.85546875, "learning_rate": 0.0004999505482040999, "loss": 6.5558, "mean_token_accuracy": 0.13121686428785323, "num_tokens": 3256363.0, "step": 1750 }, { "entropy": 6.583334445953369, "epoch": 0.154980572235959, "grad_norm": 1.046875, "learning_rate": 0.0004999498857873662, "loss": 6.5518, "mean_token_accuracy": 0.13320463374257088, "num_tokens": 3265822.0, "step": 1755 }, { "entropy": 6.478305387496948, "epoch": 0.15542211232779937, "grad_norm": 1.03125, "learning_rate": 0.0004999492189639921, "loss": 6.4224, "mean_token_accuracy": 0.13528963476419448, "num_tokens": 3274614.0, "step": 1760 }, { "entropy": 6.61256685256958, "epoch": 0.1558636524196397, "grad_norm": 1.0078125, "learning_rate": 0.0004999485477339907, "loss": 6.5201, "mean_token_accuracy": 0.13317029252648355, "num_tokens": 3283800.0, "step": 1765 }, { "entropy": 6.526872968673706, "epoch": 0.15630519251148003, "grad_norm": 1.078125, "learning_rate": 0.0004999478720973753, "loss": 6.4319, "mean_token_accuracy": 0.14009243845939637, "num_tokens": 3293221.0, "step": 1770 }, { "entropy": 6.5570995807647705, "epoch": 0.1567467326033204, "grad_norm": 0.97265625, "learning_rate": 0.000499947192054159, "loss": 6.6029, "mean_token_accuracy": 0.1324251540005207, "num_tokens": 3302852.0, "step": 1775 }, { "entropy": 6.650819587707519, "epoch": 0.15718827269516072, "grad_norm": 1.1484375, "learning_rate": 0.000499946507604355, "loss": 6.4641, "mean_token_accuracy": 0.14009604677557946, "num_tokens": 3311752.0, "step": 1780 }, { "entropy": 6.473453664779663, "epoch": 0.15762981278700106, "grad_norm": 1.078125, "learning_rate": 0.000499945818747977, "loss": 6.5492, "mean_token_accuracy": 0.13722263872623444, "num_tokens": 3321339.0, "step": 1785 }, { "entropy": 6.671073293685913, "epoch": 0.1580713528788414, "grad_norm": 1.0546875, "learning_rate": 0.0004999451254850383, "loss": 6.5514, "mean_token_accuracy": 0.1324629843235016, "num_tokens": 3330269.0, "step": 1790 }, { "entropy": 6.550148677825928, "epoch": 0.15851289297068175, "grad_norm": 0.98828125, "learning_rate": 0.0004999444278155525, "loss": 6.4576, "mean_token_accuracy": 0.1376182422041893, "num_tokens": 3340770.0, "step": 1795 }, { "entropy": 6.544644594192505, "epoch": 0.15895443306252208, "grad_norm": 1.0234375, "learning_rate": 0.0004999437257395333, "loss": 6.5753, "mean_token_accuracy": 0.13476464301347732, "num_tokens": 3349976.0, "step": 1800 }, { "entropy": 6.597910165786743, "epoch": 0.1593959731543624, "grad_norm": 0.99609375, "learning_rate": 0.0004999430192569944, "loss": 6.6158, "mean_token_accuracy": 0.12608520165085793, "num_tokens": 3359764.0, "step": 1805 }, { "entropy": 6.539244031906128, "epoch": 0.15983751324620277, "grad_norm": 0.96484375, "learning_rate": 0.0004999423083679498, "loss": 6.4868, "mean_token_accuracy": 0.13115186169743537, "num_tokens": 3369939.0, "step": 1810 }, { "entropy": 6.639003801345825, "epoch": 0.1602790533380431, "grad_norm": 1.03125, "learning_rate": 0.0004999415930724133, "loss": 6.6346, "mean_token_accuracy": 0.12994891554117202, "num_tokens": 3381326.0, "step": 1815 }, { "entropy": 6.659589242935181, "epoch": 0.16072059342988343, "grad_norm": 0.9296875, "learning_rate": 0.0004999408733703988, "loss": 6.5949, "mean_token_accuracy": 0.12143847793340683, "num_tokens": 3391016.0, "step": 1820 }, { "entropy": 6.533053731918335, "epoch": 0.16116213352172376, "grad_norm": 0.953125, "learning_rate": 0.0004999401492619207, "loss": 6.3366, "mean_token_accuracy": 0.13945143148303032, "num_tokens": 3400294.0, "step": 1825 }, { "entropy": 6.587634897232055, "epoch": 0.16160367361356412, "grad_norm": 0.99609375, "learning_rate": 0.0004999394207469928, "loss": 6.5447, "mean_token_accuracy": 0.12779992073774338, "num_tokens": 3409685.0, "step": 1830 }, { "entropy": 6.636557579040527, "epoch": 0.16204521370540445, "grad_norm": 1.0546875, "learning_rate": 0.0004999386878256297, "loss": 6.4611, "mean_token_accuracy": 0.1320968374609947, "num_tokens": 3418946.0, "step": 1835 }, { "entropy": 6.423011112213135, "epoch": 0.16248675379724478, "grad_norm": 1.0078125, "learning_rate": 0.0004999379504978457, "loss": 6.468, "mean_token_accuracy": 0.13338307663798332, "num_tokens": 3428245.0, "step": 1840 }, { "entropy": 6.578200912475586, "epoch": 0.16292829388908514, "grad_norm": 1.046875, "learning_rate": 0.000499937208763655, "loss": 6.4741, "mean_token_accuracy": 0.1402788795530796, "num_tokens": 3437580.0, "step": 1845 }, { "entropy": 6.617180633544922, "epoch": 0.16336983398092547, "grad_norm": 1.0390625, "learning_rate": 0.0004999364626230724, "loss": 6.4472, "mean_token_accuracy": 0.14007550328969956, "num_tokens": 3446393.0, "step": 1850 }, { "entropy": 6.451139450073242, "epoch": 0.1638113740727658, "grad_norm": 0.9921875, "learning_rate": 0.0004999357120761124, "loss": 6.551, "mean_token_accuracy": 0.13571736514568328, "num_tokens": 3455956.0, "step": 1855 }, { "entropy": 6.6568724632263185, "epoch": 0.16425291416460613, "grad_norm": 1.0078125, "learning_rate": 0.0004999349571227898, "loss": 6.5495, "mean_token_accuracy": 0.13430711701512338, "num_tokens": 3465722.0, "step": 1860 }, { "entropy": 6.55973629951477, "epoch": 0.1646944542564465, "grad_norm": 0.953125, "learning_rate": 0.0004999341977631193, "loss": 6.52, "mean_token_accuracy": 0.12885117009282113, "num_tokens": 3475467.0, "step": 1865 }, { "entropy": 6.513189268112183, "epoch": 0.16513599434828682, "grad_norm": 1.0, "learning_rate": 0.0004999334339971157, "loss": 6.4162, "mean_token_accuracy": 0.13661258816719055, "num_tokens": 3484931.0, "step": 1870 }, { "entropy": 6.572188520431519, "epoch": 0.16557753444012716, "grad_norm": 0.95703125, "learning_rate": 0.0004999326658247942, "loss": 6.5161, "mean_token_accuracy": 0.13407543525099755, "num_tokens": 3494001.0, "step": 1875 }, { "entropy": 6.522925567626953, "epoch": 0.16601907453196751, "grad_norm": 1.0859375, "learning_rate": 0.0004999318932461696, "loss": 6.4558, "mean_token_accuracy": 0.1386028841137886, "num_tokens": 3503021.0, "step": 1880 }, { "entropy": 6.46082649230957, "epoch": 0.16646061462380785, "grad_norm": 1.2421875, "learning_rate": 0.0004999311162612571, "loss": 6.4843, "mean_token_accuracy": 0.13625017702579498, "num_tokens": 3513005.0, "step": 1885 }, { "entropy": 6.661849021911621, "epoch": 0.16690215471564818, "grad_norm": 0.9609375, "learning_rate": 0.000499930334870072, "loss": 6.5629, "mean_token_accuracy": 0.13853515014052392, "num_tokens": 3523219.0, "step": 1890 }, { "entropy": 6.584684419631958, "epoch": 0.1673436948074885, "grad_norm": 0.97265625, "learning_rate": 0.0004999295490726296, "loss": 6.51, "mean_token_accuracy": 0.13359814062714576, "num_tokens": 3532917.0, "step": 1895 }, { "entropy": 6.513212633132935, "epoch": 0.16778523489932887, "grad_norm": 1.1015625, "learning_rate": 0.0004999287588689453, "loss": 6.5245, "mean_token_accuracy": 0.1389150969684124, "num_tokens": 3542845.0, "step": 1900 }, { "entropy": 6.599180459976196, "epoch": 0.1682267749911692, "grad_norm": 1.0546875, "learning_rate": 0.0004999279642590344, "loss": 6.5438, "mean_token_accuracy": 0.13201173320412635, "num_tokens": 3552126.0, "step": 1905 }, { "entropy": 6.5960267066955565, "epoch": 0.16866831508300953, "grad_norm": 1.0703125, "learning_rate": 0.0004999271652429127, "loss": 6.5561, "mean_token_accuracy": 0.12684691920876504, "num_tokens": 3561254.0, "step": 1910 }, { "entropy": 6.528163957595825, "epoch": 0.1691098551748499, "grad_norm": 0.98046875, "learning_rate": 0.0004999263618205958, "loss": 6.3933, "mean_token_accuracy": 0.1403527893126011, "num_tokens": 3569781.0, "step": 1915 }, { "entropy": 6.555511140823365, "epoch": 0.16955139526669022, "grad_norm": 1.0703125, "learning_rate": 0.0004999255539920993, "loss": 6.4866, "mean_token_accuracy": 0.13409090787172318, "num_tokens": 3579664.0, "step": 1920 }, { "entropy": 6.567546367645264, "epoch": 0.16999293535853055, "grad_norm": 1.015625, "learning_rate": 0.0004999247417574391, "loss": 6.5376, "mean_token_accuracy": 0.13570395410060881, "num_tokens": 3588671.0, "step": 1925 }, { "entropy": 6.54329285621643, "epoch": 0.17043447545037088, "grad_norm": 0.97265625, "learning_rate": 0.0004999239251166312, "loss": 6.4281, "mean_token_accuracy": 0.13799721151590347, "num_tokens": 3597656.0, "step": 1930 }, { "entropy": 6.480014228820801, "epoch": 0.17087601554221124, "grad_norm": 0.9609375, "learning_rate": 0.0004999231040696914, "loss": 6.4491, "mean_token_accuracy": 0.13938046917319297, "num_tokens": 3608017.0, "step": 1935 }, { "entropy": 6.529258060455322, "epoch": 0.17131755563405157, "grad_norm": 0.98046875, "learning_rate": 0.0004999222786166361, "loss": 6.5236, "mean_token_accuracy": 0.1355483777821064, "num_tokens": 3618189.0, "step": 1940 }, { "entropy": 6.60916223526001, "epoch": 0.1717590957258919, "grad_norm": 1.125, "learning_rate": 0.0004999214487574812, "loss": 6.4772, "mean_token_accuracy": 0.13526797890663148, "num_tokens": 3627211.0, "step": 1945 }, { "entropy": 6.498088455200195, "epoch": 0.17220063581773226, "grad_norm": 1.0859375, "learning_rate": 0.0004999206144922431, "loss": 6.4181, "mean_token_accuracy": 0.13236208409070968, "num_tokens": 3636781.0, "step": 1950 }, { "entropy": 6.4924522876739506, "epoch": 0.1726421759095726, "grad_norm": 1.0859375, "learning_rate": 0.000499919775820938, "loss": 6.4985, "mean_token_accuracy": 0.13709900975227357, "num_tokens": 3644891.0, "step": 1955 }, { "entropy": 6.591422748565674, "epoch": 0.17308371600141292, "grad_norm": 1.0078125, "learning_rate": 0.0004999189327435825, "loss": 6.5295, "mean_token_accuracy": 0.13574066162109374, "num_tokens": 3655477.0, "step": 1960 }, { "entropy": 6.5104138374328615, "epoch": 0.17352525609325326, "grad_norm": 1.015625, "learning_rate": 0.0004999180852601929, "loss": 6.5386, "mean_token_accuracy": 0.13653166219592094, "num_tokens": 3664542.0, "step": 1965 }, { "entropy": 6.577597141265869, "epoch": 0.17396679618509361, "grad_norm": 1.2265625, "learning_rate": 0.000499917233370786, "loss": 6.4609, "mean_token_accuracy": 0.1293606199324131, "num_tokens": 3673806.0, "step": 1970 }, { "entropy": 6.484398126602173, "epoch": 0.17440833627693395, "grad_norm": 0.98828125, "learning_rate": 0.0004999163770753784, "loss": 6.4524, "mean_token_accuracy": 0.13555625528097154, "num_tokens": 3683238.0, "step": 1975 }, { "entropy": 6.558987331390381, "epoch": 0.17484987636877428, "grad_norm": 1.2265625, "learning_rate": 0.0004999155163739869, "loss": 6.4372, "mean_token_accuracy": 0.13345005139708518, "num_tokens": 3692161.0, "step": 1980 }, { "entropy": 6.510820627212524, "epoch": 0.17529141646061464, "grad_norm": 1.0703125, "learning_rate": 0.0004999146512666284, "loss": 6.4535, "mean_token_accuracy": 0.13496344164013863, "num_tokens": 3701431.0, "step": 1985 }, { "entropy": 6.530817365646362, "epoch": 0.17573295655245497, "grad_norm": 0.890625, "learning_rate": 0.0004999137817533197, "loss": 6.4293, "mean_token_accuracy": 0.1400221474468708, "num_tokens": 3710963.0, "step": 1990 }, { "entropy": 6.48544750213623, "epoch": 0.1761744966442953, "grad_norm": 1.125, "learning_rate": 0.0004999129078340779, "loss": 6.4221, "mean_token_accuracy": 0.14005866199731826, "num_tokens": 3720177.0, "step": 1995 }, { "entropy": 6.465721511840821, "epoch": 0.17661603673613563, "grad_norm": 1.09375, "learning_rate": 0.0004999120295089202, "loss": 6.3795, "mean_token_accuracy": 0.14514245688915253, "num_tokens": 3728123.0, "step": 2000 }, { "entropy": 6.571569442749023, "epoch": 0.177057576827976, "grad_norm": 0.9921875, "learning_rate": 0.0004999111467778639, "loss": 6.4978, "mean_token_accuracy": 0.1349009484052658, "num_tokens": 3736869.0, "step": 2005 }, { "entropy": 6.4819518566131595, "epoch": 0.17749911691981632, "grad_norm": 1.109375, "learning_rate": 0.000499910259640926, "loss": 6.3989, "mean_token_accuracy": 0.1346891440451145, "num_tokens": 3745830.0, "step": 2010 }, { "entropy": 6.486936283111572, "epoch": 0.17794065701165665, "grad_norm": 0.9921875, "learning_rate": 0.000499909368098124, "loss": 6.4328, "mean_token_accuracy": 0.13926308006048202, "num_tokens": 3755019.0, "step": 2015 }, { "entropy": 6.517996597290039, "epoch": 0.178382197103497, "grad_norm": 0.98828125, "learning_rate": 0.0004999084721494754, "loss": 6.4076, "mean_token_accuracy": 0.13306454047560692, "num_tokens": 3764814.0, "step": 2020 }, { "entropy": 6.465504217147827, "epoch": 0.17882373719533734, "grad_norm": 1.0546875, "learning_rate": 0.0004999075717949978, "loss": 6.384, "mean_token_accuracy": 0.14004691764712335, "num_tokens": 3774258.0, "step": 2025 }, { "entropy": 6.422070741653442, "epoch": 0.17926527728717767, "grad_norm": 0.9375, "learning_rate": 0.0004999066670347089, "loss": 6.4155, "mean_token_accuracy": 0.14247353076934816, "num_tokens": 3783961.0, "step": 2030 }, { "entropy": 6.416653776168824, "epoch": 0.179706817379018, "grad_norm": 0.953125, "learning_rate": 0.0004999057578686261, "loss": 6.3804, "mean_token_accuracy": 0.1390853337943554, "num_tokens": 3792417.0, "step": 2035 }, { "entropy": 6.496200704574585, "epoch": 0.18014835747085836, "grad_norm": 1.1171875, "learning_rate": 0.0004999048442967675, "loss": 6.3592, "mean_token_accuracy": 0.13822411596775055, "num_tokens": 3801479.0, "step": 2040 }, { "entropy": 6.456766891479492, "epoch": 0.1805898975626987, "grad_norm": 0.96875, "learning_rate": 0.0004999039263191508, "loss": 6.4305, "mean_token_accuracy": 0.133939179033041, "num_tokens": 3810799.0, "step": 2045 }, { "entropy": 6.5264753818511965, "epoch": 0.18103143765453902, "grad_norm": 1.03125, "learning_rate": 0.0004999030039357943, "loss": 6.48, "mean_token_accuracy": 0.1298608623445034, "num_tokens": 3820966.0, "step": 2050 }, { "entropy": 6.493787717819214, "epoch": 0.18147297774637938, "grad_norm": 1.0234375, "learning_rate": 0.0004999020771467158, "loss": 6.4572, "mean_token_accuracy": 0.1327923409640789, "num_tokens": 3829247.0, "step": 2055 }, { "entropy": 6.596197843551636, "epoch": 0.1819145178382197, "grad_norm": 0.98046875, "learning_rate": 0.0004999011459519335, "loss": 6.3948, "mean_token_accuracy": 0.13827238082885743, "num_tokens": 3838114.0, "step": 2060 }, { "entropy": 6.390679597854614, "epoch": 0.18235605793006004, "grad_norm": 0.96875, "learning_rate": 0.0004999002103514655, "loss": 6.4735, "mean_token_accuracy": 0.13869670927524566, "num_tokens": 3848075.0, "step": 2065 }, { "entropy": 6.585737419128418, "epoch": 0.18279759802190038, "grad_norm": 1.0234375, "learning_rate": 0.0004998992703453304, "loss": 6.5106, "mean_token_accuracy": 0.13122646436095237, "num_tokens": 3857934.0, "step": 2070 }, { "entropy": 6.450649833679199, "epoch": 0.18323913811374073, "grad_norm": 0.953125, "learning_rate": 0.0004998983259335466, "loss": 6.3519, "mean_token_accuracy": 0.14236594662070273, "num_tokens": 3866707.0, "step": 2075 }, { "entropy": 6.58008508682251, "epoch": 0.18368067820558107, "grad_norm": 1.046875, "learning_rate": 0.0004998973771161324, "loss": 6.4244, "mean_token_accuracy": 0.1374949462711811, "num_tokens": 3875233.0, "step": 2080 }, { "entropy": 6.389654541015625, "epoch": 0.1841222182974214, "grad_norm": 1.015625, "learning_rate": 0.0004998964238931065, "loss": 6.4131, "mean_token_accuracy": 0.1403422772884369, "num_tokens": 3885173.0, "step": 2085 }, { "entropy": 6.564711236953736, "epoch": 0.18456375838926176, "grad_norm": 1.0390625, "learning_rate": 0.0004998954662644876, "loss": 6.3803, "mean_token_accuracy": 0.13195990920066833, "num_tokens": 3894198.0, "step": 2090 }, { "entropy": 6.44653811454773, "epoch": 0.1850052984811021, "grad_norm": 1.0546875, "learning_rate": 0.0004998945042302943, "loss": 6.382, "mean_token_accuracy": 0.1373509407043457, "num_tokens": 3904076.0, "step": 2095 }, { "entropy": 6.4586262702941895, "epoch": 0.18544683857294242, "grad_norm": 1.0625, "learning_rate": 0.0004998935377905457, "loss": 6.4943, "mean_token_accuracy": 0.13231708630919456, "num_tokens": 3913204.0, "step": 2100 }, { "entropy": 6.587808132171631, "epoch": 0.18588837866478275, "grad_norm": 0.984375, "learning_rate": 0.0004998925669452605, "loss": 6.4565, "mean_token_accuracy": 0.1339179016649723, "num_tokens": 3922148.0, "step": 2105 }, { "entropy": 6.401728963851928, "epoch": 0.1863299187566231, "grad_norm": 0.921875, "learning_rate": 0.0004998915916944579, "loss": 6.4234, "mean_token_accuracy": 0.1407448723912239, "num_tokens": 3931333.0, "step": 2110 }, { "entropy": 6.545709466934204, "epoch": 0.18677145884846344, "grad_norm": 1.078125, "learning_rate": 0.0004998906120381568, "loss": 6.3789, "mean_token_accuracy": 0.1448886923491955, "num_tokens": 3941061.0, "step": 2115 }, { "entropy": 6.505582046508789, "epoch": 0.18721299894030377, "grad_norm": 1.078125, "learning_rate": 0.0004998896279763766, "loss": 6.4761, "mean_token_accuracy": 0.13257319629192352, "num_tokens": 3950075.0, "step": 2120 }, { "entropy": 6.455861520767212, "epoch": 0.18765453903214413, "grad_norm": 1.0234375, "learning_rate": 0.0004998886395091365, "loss": 6.3345, "mean_token_accuracy": 0.1409289576113224, "num_tokens": 3958885.0, "step": 2125 }, { "entropy": 6.417616128921509, "epoch": 0.18809607912398446, "grad_norm": 1.03125, "learning_rate": 0.0004998876466364559, "loss": 6.437, "mean_token_accuracy": 0.13843559697270394, "num_tokens": 3968218.0, "step": 2130 }, { "entropy": 6.400978994369507, "epoch": 0.1885376192158248, "grad_norm": 1.046875, "learning_rate": 0.0004998866493583541, "loss": 6.364, "mean_token_accuracy": 0.14499804973602295, "num_tokens": 3977435.0, "step": 2135 }, { "entropy": 6.388528203964233, "epoch": 0.18897915930766512, "grad_norm": 0.99609375, "learning_rate": 0.0004998856476748509, "loss": 6.349, "mean_token_accuracy": 0.14326094537973405, "num_tokens": 3986608.0, "step": 2140 }, { "entropy": 6.440869951248169, "epoch": 0.18942069939950548, "grad_norm": 1.0625, "learning_rate": 0.0004998846415859656, "loss": 6.3602, "mean_token_accuracy": 0.14130929261445999, "num_tokens": 3996087.0, "step": 2145 }, { "entropy": 6.476541662216187, "epoch": 0.1898622394913458, "grad_norm": 1.0, "learning_rate": 0.0004998836310917182, "loss": 6.4058, "mean_token_accuracy": 0.13654726892709732, "num_tokens": 4006257.0, "step": 2150 }, { "entropy": 6.478487300872803, "epoch": 0.19030377958318614, "grad_norm": 0.9765625, "learning_rate": 0.0004998826161921282, "loss": 6.4072, "mean_token_accuracy": 0.14593140706419944, "num_tokens": 4015904.0, "step": 2155 }, { "entropy": 6.427832221984863, "epoch": 0.1907453196750265, "grad_norm": 0.94921875, "learning_rate": 0.0004998815968872157, "loss": 6.4181, "mean_token_accuracy": 0.13653010204434396, "num_tokens": 4025417.0, "step": 2160 }, { "entropy": 6.392728614807129, "epoch": 0.19118685976686683, "grad_norm": 0.984375, "learning_rate": 0.0004998805731770007, "loss": 6.2628, "mean_token_accuracy": 0.15659967064857483, "num_tokens": 4035181.0, "step": 2165 }, { "entropy": 6.4461814880371096, "epoch": 0.19162839985870717, "grad_norm": 0.9765625, "learning_rate": 0.000499879545061503, "loss": 6.4504, "mean_token_accuracy": 0.13371687456965448, "num_tokens": 4045112.0, "step": 2170 }, { "entropy": 6.5883321285247805, "epoch": 0.1920699399505475, "grad_norm": 0.9609375, "learning_rate": 0.0004998785125407432, "loss": 6.5425, "mean_token_accuracy": 0.12125966772437095, "num_tokens": 4054566.0, "step": 2175 }, { "entropy": 6.552111434936523, "epoch": 0.19251148004238786, "grad_norm": 1.0078125, "learning_rate": 0.000499877475614741, "loss": 6.3758, "mean_token_accuracy": 0.1340583384037018, "num_tokens": 4063960.0, "step": 2180 }, { "entropy": 6.3515486240386965, "epoch": 0.1929530201342282, "grad_norm": 1.015625, "learning_rate": 0.0004998764342835169, "loss": 6.3157, "mean_token_accuracy": 0.14617449343204497, "num_tokens": 4072620.0, "step": 2185 }, { "entropy": 6.406283712387085, "epoch": 0.19339456022606852, "grad_norm": 1.0390625, "learning_rate": 0.0004998753885470915, "loss": 6.2789, "mean_token_accuracy": 0.14062159806489943, "num_tokens": 4081590.0, "step": 2190 }, { "entropy": 6.425132322311401, "epoch": 0.19383610031790888, "grad_norm": 1.03125, "learning_rate": 0.0004998743384054851, "loss": 6.4202, "mean_token_accuracy": 0.14044143706560136, "num_tokens": 4090758.0, "step": 2195 }, { "entropy": 6.3962568759918215, "epoch": 0.1942776404097492, "grad_norm": 1.015625, "learning_rate": 0.0004998732838587183, "loss": 6.2458, "mean_token_accuracy": 0.149021477997303, "num_tokens": 4099281.0, "step": 2200 }, { "entropy": 6.40768404006958, "epoch": 0.19471918050158954, "grad_norm": 1.0234375, "learning_rate": 0.0004998722249068118, "loss": 6.3451, "mean_token_accuracy": 0.1404854990541935, "num_tokens": 4108953.0, "step": 2205 }, { "entropy": 6.396564531326294, "epoch": 0.19516072059342987, "grad_norm": 1.0078125, "learning_rate": 0.0004998711615497863, "loss": 6.4535, "mean_token_accuracy": 0.13740591406822206, "num_tokens": 4118799.0, "step": 2210 }, { "entropy": 6.579476261138916, "epoch": 0.19560226068527023, "grad_norm": 1.0078125, "learning_rate": 0.0004998700937876626, "loss": 6.4122, "mean_token_accuracy": 0.13571444600820542, "num_tokens": 4127862.0, "step": 2215 }, { "entropy": 6.4028857231140135, "epoch": 0.19604380077711056, "grad_norm": 1.078125, "learning_rate": 0.0004998690216204615, "loss": 6.5068, "mean_token_accuracy": 0.12704429849982263, "num_tokens": 4139029.0, "step": 2220 }, { "entropy": 6.5047478675842285, "epoch": 0.1964853408689509, "grad_norm": 1.078125, "learning_rate": 0.0004998679450482043, "loss": 6.3028, "mean_token_accuracy": 0.14782762974500657, "num_tokens": 4148257.0, "step": 2225 }, { "entropy": 6.341253662109375, "epoch": 0.19692688096079125, "grad_norm": 0.9921875, "learning_rate": 0.000499866864070912, "loss": 6.2627, "mean_token_accuracy": 0.14017492160201073, "num_tokens": 4157626.0, "step": 2230 }, { "entropy": 6.429310417175293, "epoch": 0.19736842105263158, "grad_norm": 1.09375, "learning_rate": 0.0004998657786886056, "loss": 6.4713, "mean_token_accuracy": 0.13993202298879623, "num_tokens": 4166804.0, "step": 2235 }, { "entropy": 6.554042720794678, "epoch": 0.1978099611444719, "grad_norm": 0.98828125, "learning_rate": 0.0004998646889013066, "loss": 6.3607, "mean_token_accuracy": 0.14240839183330536, "num_tokens": 4175701.0, "step": 2240 }, { "entropy": 6.4098762512207035, "epoch": 0.19825150123631224, "grad_norm": 0.96875, "learning_rate": 0.0004998635947090362, "loss": 6.4425, "mean_token_accuracy": 0.14030273035168647, "num_tokens": 4184711.0, "step": 2245 }, { "entropy": 6.475190782546997, "epoch": 0.1986930413281526, "grad_norm": 1.2421875, "learning_rate": 0.0004998624961118158, "loss": 6.4017, "mean_token_accuracy": 0.14099944159388542, "num_tokens": 4193931.0, "step": 2250 }, { "entropy": 6.44319109916687, "epoch": 0.19913458141999293, "grad_norm": 1.0390625, "learning_rate": 0.000499861393109667, "loss": 6.2899, "mean_token_accuracy": 0.14828752726316452, "num_tokens": 4203000.0, "step": 2255 }, { "entropy": 6.274944400787353, "epoch": 0.19957612151183327, "grad_norm": 0.9765625, "learning_rate": 0.0004998602857026114, "loss": 6.2991, "mean_token_accuracy": 0.1458234503865242, "num_tokens": 4211977.0, "step": 2260 }, { "entropy": 6.472422647476196, "epoch": 0.20001766160367362, "grad_norm": 1.0625, "learning_rate": 0.0004998591738906708, "loss": 6.3814, "mean_token_accuracy": 0.15182094275951385, "num_tokens": 4220375.0, "step": 2265 }, { "entropy": 6.494894886016846, "epoch": 0.20045920169551396, "grad_norm": 1.03125, "learning_rate": 0.0004998580576738668, "loss": 6.3793, "mean_token_accuracy": 0.1367909237742424, "num_tokens": 4230506.0, "step": 2270 }, { "entropy": 6.404332447052002, "epoch": 0.2009007417873543, "grad_norm": 1.0234375, "learning_rate": 0.0004998569370522213, "loss": 6.3524, "mean_token_accuracy": 0.13977290093898773, "num_tokens": 4240235.0, "step": 2275 }, { "entropy": 6.448247480392456, "epoch": 0.20134228187919462, "grad_norm": 0.96484375, "learning_rate": 0.0004998558120257563, "loss": 6.4189, "mean_token_accuracy": 0.14339498728513717, "num_tokens": 4249775.0, "step": 2280 }, { "entropy": 6.419510173797607, "epoch": 0.20178382197103498, "grad_norm": 1.0625, "learning_rate": 0.0004998546825944938, "loss": 6.3504, "mean_token_accuracy": 0.141066338121891, "num_tokens": 4258523.0, "step": 2285 }, { "entropy": 6.419132709503174, "epoch": 0.2022253620628753, "grad_norm": 1.046875, "learning_rate": 0.000499853548758456, "loss": 6.358, "mean_token_accuracy": 0.13891511633992196, "num_tokens": 4267683.0, "step": 2290 }, { "entropy": 6.495290946960449, "epoch": 0.20266690215471564, "grad_norm": 1.3828125, "learning_rate": 0.000499852410517665, "loss": 6.417, "mean_token_accuracy": 0.13713881745934486, "num_tokens": 4276903.0, "step": 2295 }, { "entropy": 6.426017999649048, "epoch": 0.203108442246556, "grad_norm": 1.4296875, "learning_rate": 0.0004998512678721431, "loss": 6.3661, "mean_token_accuracy": 0.135909353941679, "num_tokens": 4287257.0, "step": 2300 }, { "entropy": 6.447468948364258, "epoch": 0.20354998233839633, "grad_norm": 1.0625, "learning_rate": 0.000499850120821913, "loss": 6.3875, "mean_token_accuracy": 0.13645304143428802, "num_tokens": 4297345.0, "step": 2305 }, { "entropy": 6.528428220748902, "epoch": 0.20399152243023666, "grad_norm": 1.09375, "learning_rate": 0.0004998489693669967, "loss": 6.3613, "mean_token_accuracy": 0.13837311565876007, "num_tokens": 4306533.0, "step": 2310 }, { "entropy": 6.392482471466065, "epoch": 0.204433062522077, "grad_norm": 1.046875, "learning_rate": 0.000499847813507417, "loss": 6.4023, "mean_token_accuracy": 0.14094773977994918, "num_tokens": 4316338.0, "step": 2315 }, { "entropy": 6.399602174758911, "epoch": 0.20487460261391735, "grad_norm": 1.0, "learning_rate": 0.0004998466532431966, "loss": 6.3549, "mean_token_accuracy": 0.13948202207684518, "num_tokens": 4326585.0, "step": 2320 }, { "entropy": 6.544098567962647, "epoch": 0.20531614270575768, "grad_norm": 0.92578125, "learning_rate": 0.0004998454885743581, "loss": 6.4795, "mean_token_accuracy": 0.1351695440709591, "num_tokens": 4336490.0, "step": 2325 }, { "entropy": 6.363640975952149, "epoch": 0.205757682797598, "grad_norm": 1.046875, "learning_rate": 0.0004998443195009242, "loss": 6.3589, "mean_token_accuracy": 0.1348782531917095, "num_tokens": 4346264.0, "step": 2330 }, { "entropy": 6.461185693740845, "epoch": 0.20619922288943837, "grad_norm": 1.0078125, "learning_rate": 0.0004998431460229182, "loss": 6.3318, "mean_token_accuracy": 0.1448797807097435, "num_tokens": 4355102.0, "step": 2335 }, { "entropy": 6.482104396820068, "epoch": 0.2066407629812787, "grad_norm": 1.125, "learning_rate": 0.0004998419681403627, "loss": 6.5133, "mean_token_accuracy": 0.13086750581860543, "num_tokens": 4365569.0, "step": 2340 }, { "entropy": 6.507464361190796, "epoch": 0.20708230307311903, "grad_norm": 0.91796875, "learning_rate": 0.0004998407858532809, "loss": 6.36, "mean_token_accuracy": 0.14296017587184906, "num_tokens": 4375437.0, "step": 2345 }, { "entropy": 6.37006025314331, "epoch": 0.20752384316495937, "grad_norm": 1.0546875, "learning_rate": 0.000499839599161696, "loss": 6.3389, "mean_token_accuracy": 0.1450774312019348, "num_tokens": 4384420.0, "step": 2350 }, { "entropy": 6.460700035095215, "epoch": 0.20796538325679972, "grad_norm": 1.1015625, "learning_rate": 0.0004998384080656314, "loss": 6.3106, "mean_token_accuracy": 0.14050144031643869, "num_tokens": 4393730.0, "step": 2355 }, { "entropy": 6.332716035842895, "epoch": 0.20840692334864006, "grad_norm": 1.0625, "learning_rate": 0.00049983721256511, "loss": 6.2798, "mean_token_accuracy": 0.14351205080747603, "num_tokens": 4402731.0, "step": 2360 }, { "entropy": 6.3846518993377686, "epoch": 0.2088484634404804, "grad_norm": 1.078125, "learning_rate": 0.0004998360126601556, "loss": 6.3369, "mean_token_accuracy": 0.14289727210998535, "num_tokens": 4411606.0, "step": 2365 }, { "entropy": 6.3337644100189205, "epoch": 0.20929000353232075, "grad_norm": 1.0625, "learning_rate": 0.0004998348083507916, "loss": 6.4062, "mean_token_accuracy": 0.14081210866570473, "num_tokens": 4421685.0, "step": 2370 }, { "entropy": 6.558507633209229, "epoch": 0.20973154362416108, "grad_norm": 1.078125, "learning_rate": 0.0004998335996370416, "loss": 6.3782, "mean_token_accuracy": 0.14399294778704644, "num_tokens": 4431765.0, "step": 2375 }, { "entropy": 6.321137380599976, "epoch": 0.2101730837160014, "grad_norm": 1.0703125, "learning_rate": 0.0004998323865189291, "loss": 6.3523, "mean_token_accuracy": 0.13661579713225364, "num_tokens": 4441191.0, "step": 2380 }, { "entropy": 6.481552457809448, "epoch": 0.21061462380784174, "grad_norm": 1.1015625, "learning_rate": 0.0004998311689964781, "loss": 6.4322, "mean_token_accuracy": 0.13680859059095382, "num_tokens": 4450156.0, "step": 2385 }, { "entropy": 6.4807047843933105, "epoch": 0.2110561638996821, "grad_norm": 1.1015625, "learning_rate": 0.0004998299470697125, "loss": 6.4163, "mean_token_accuracy": 0.14260546639561653, "num_tokens": 4459466.0, "step": 2390 }, { "entropy": 6.461113500595093, "epoch": 0.21149770399152243, "grad_norm": 0.984375, "learning_rate": 0.0004998287207386559, "loss": 6.4156, "mean_token_accuracy": 0.14247968047857285, "num_tokens": 4468539.0, "step": 2395 }, { "entropy": 6.525749444961548, "epoch": 0.21193924408336276, "grad_norm": 1.1015625, "learning_rate": 0.0004998274900033326, "loss": 6.315, "mean_token_accuracy": 0.14835015684366226, "num_tokens": 4477579.0, "step": 2400 }, { "entropy": 6.236664247512818, "epoch": 0.21238078417520312, "grad_norm": 0.984375, "learning_rate": 0.0004998262548637667, "loss": 6.2842, "mean_token_accuracy": 0.14991160482168198, "num_tokens": 4486800.0, "step": 2405 }, { "entropy": 6.444223546981812, "epoch": 0.21282232426704345, "grad_norm": 1.03125, "learning_rate": 0.0004998250153199822, "loss": 6.2465, "mean_token_accuracy": 0.14562757611274718, "num_tokens": 4495985.0, "step": 2410 }, { "entropy": 6.417825555801391, "epoch": 0.21326386435888378, "grad_norm": 0.98046875, "learning_rate": 0.0004998237713720036, "loss": 6.4031, "mean_token_accuracy": 0.14113787487149237, "num_tokens": 4504944.0, "step": 2415 }, { "entropy": 6.344033908843994, "epoch": 0.2137054044507241, "grad_norm": 0.9296875, "learning_rate": 0.0004998225230198552, "loss": 6.2875, "mean_token_accuracy": 0.14928205609321593, "num_tokens": 4515402.0, "step": 2420 }, { "entropy": 6.412223052978516, "epoch": 0.21414694454256447, "grad_norm": 0.9921875, "learning_rate": 0.0004998212702635614, "loss": 6.355, "mean_token_accuracy": 0.14224686175584794, "num_tokens": 4525009.0, "step": 2425 }, { "entropy": 6.537710332870484, "epoch": 0.2145884846344048, "grad_norm": 1.1484375, "learning_rate": 0.0004998200131031469, "loss": 6.4066, "mean_token_accuracy": 0.142412006855011, "num_tokens": 4534460.0, "step": 2430 }, { "entropy": 6.355252885818482, "epoch": 0.21503002472624513, "grad_norm": 1.0078125, "learning_rate": 0.0004998187515386361, "loss": 6.2239, "mean_token_accuracy": 0.15292632952332497, "num_tokens": 4543748.0, "step": 2435 }, { "entropy": 6.32159743309021, "epoch": 0.2154715648180855, "grad_norm": 1.046875, "learning_rate": 0.0004998174855700538, "loss": 6.3515, "mean_token_accuracy": 0.14536840543150903, "num_tokens": 4552722.0, "step": 2440 }, { "entropy": 6.455835342407227, "epoch": 0.21591310490992582, "grad_norm": 1.1953125, "learning_rate": 0.0004998162151974248, "loss": 6.2244, "mean_token_accuracy": 0.1433302193880081, "num_tokens": 4561607.0, "step": 2445 }, { "entropy": 6.376346635818481, "epoch": 0.21635464500176615, "grad_norm": 0.89453125, "learning_rate": 0.000499814940420774, "loss": 6.4926, "mean_token_accuracy": 0.13043315410614015, "num_tokens": 4572524.0, "step": 2450 }, { "entropy": 6.473890399932861, "epoch": 0.21679618509360649, "grad_norm": 1.0078125, "learning_rate": 0.0004998136612401266, "loss": 6.306, "mean_token_accuracy": 0.13808697760105132, "num_tokens": 4581601.0, "step": 2455 }, { "entropy": 6.362298917770386, "epoch": 0.21723772518544684, "grad_norm": 1.0625, "learning_rate": 0.0004998123776555071, "loss": 6.355, "mean_token_accuracy": 0.13434374257922171, "num_tokens": 4591795.0, "step": 2460 }, { "entropy": 6.4316198348999025, "epoch": 0.21767926527728718, "grad_norm": 1.046875, "learning_rate": 0.0004998110896669412, "loss": 6.3127, "mean_token_accuracy": 0.14232389852404595, "num_tokens": 4600745.0, "step": 2465 }, { "entropy": 6.403714847564697, "epoch": 0.2181208053691275, "grad_norm": 0.953125, "learning_rate": 0.0004998097972744539, "loss": 6.3668, "mean_token_accuracy": 0.13958390951156616, "num_tokens": 4610490.0, "step": 2470 }, { "entropy": 6.421542739868164, "epoch": 0.21856234546096787, "grad_norm": 1.0546875, "learning_rate": 0.0004998085004780705, "loss": 6.3027, "mean_token_accuracy": 0.14644000679254532, "num_tokens": 4619511.0, "step": 2475 }, { "entropy": 6.411759996414185, "epoch": 0.2190038855528082, "grad_norm": 1.0546875, "learning_rate": 0.0004998071992778164, "loss": 6.3536, "mean_token_accuracy": 0.13926490917801856, "num_tokens": 4628186.0, "step": 2480 }, { "entropy": 6.416973400115967, "epoch": 0.21944542564464853, "grad_norm": 1.0390625, "learning_rate": 0.000499805893673717, "loss": 6.3574, "mean_token_accuracy": 0.14341954439878463, "num_tokens": 4637431.0, "step": 2485 }, { "entropy": 6.347478723526001, "epoch": 0.21988696573648886, "grad_norm": 0.96484375, "learning_rate": 0.0004998045836657982, "loss": 6.2093, "mean_token_accuracy": 0.14213306605815887, "num_tokens": 4646627.0, "step": 2490 }, { "entropy": 6.368200349807739, "epoch": 0.22032850582832922, "grad_norm": 1.0859375, "learning_rate": 0.0004998032692540853, "loss": 6.3012, "mean_token_accuracy": 0.14045739471912383, "num_tokens": 4656095.0, "step": 2495 }, { "entropy": 6.476345205307007, "epoch": 0.22077004592016955, "grad_norm": 1.03125, "learning_rate": 0.0004998019504386044, "loss": 6.3876, "mean_token_accuracy": 0.13640450164675713, "num_tokens": 4665807.0, "step": 2500 }, { "entropy": 6.444914436340332, "epoch": 0.22121158601200988, "grad_norm": 1.09375, "learning_rate": 0.0004998006272193809, "loss": 6.3493, "mean_token_accuracy": 0.13855071663856505, "num_tokens": 4674459.0, "step": 2505 }, { "entropy": 6.361084604263306, "epoch": 0.22165312610385024, "grad_norm": 0.92578125, "learning_rate": 0.0004997992995964412, "loss": 6.4325, "mean_token_accuracy": 0.13779560700058938, "num_tokens": 4684063.0, "step": 2510 }, { "entropy": 6.45024299621582, "epoch": 0.22209466619569057, "grad_norm": 1.15625, "learning_rate": 0.0004997979675698109, "loss": 6.3029, "mean_token_accuracy": 0.14212062656879426, "num_tokens": 4692807.0, "step": 2515 }, { "entropy": 6.438331031799317, "epoch": 0.2225362062875309, "grad_norm": 1.0625, "learning_rate": 0.0004997966311395164, "loss": 6.2422, "mean_token_accuracy": 0.14746622294187545, "num_tokens": 4701100.0, "step": 2520 }, { "entropy": 6.284497547149658, "epoch": 0.22297774637937123, "grad_norm": 1.0390625, "learning_rate": 0.0004997952903055836, "loss": 6.3071, "mean_token_accuracy": 0.13971827551722527, "num_tokens": 4710697.0, "step": 2525 }, { "entropy": 6.4242840766906735, "epoch": 0.2234192864712116, "grad_norm": 1.125, "learning_rate": 0.000499793945068039, "loss": 6.2875, "mean_token_accuracy": 0.14202770590782166, "num_tokens": 4718745.0, "step": 2530 }, { "entropy": 6.358328342437744, "epoch": 0.22386082656305192, "grad_norm": 0.9765625, "learning_rate": 0.0004997925954269088, "loss": 6.2493, "mean_token_accuracy": 0.15010830983519555, "num_tokens": 4728056.0, "step": 2535 }, { "entropy": 6.390234851837159, "epoch": 0.22430236665489225, "grad_norm": 0.9765625, "learning_rate": 0.0004997912413822196, "loss": 6.3892, "mean_token_accuracy": 0.14226726815104485, "num_tokens": 4737605.0, "step": 2540 }, { "entropy": 6.333422613143921, "epoch": 0.2247439067467326, "grad_norm": 1.0078125, "learning_rate": 0.0004997898829339979, "loss": 6.216, "mean_token_accuracy": 0.15019772350788116, "num_tokens": 4746168.0, "step": 2545 }, { "entropy": 6.414393615722656, "epoch": 0.22518544683857294, "grad_norm": 1.015625, "learning_rate": 0.00049978852008227, "loss": 6.2827, "mean_token_accuracy": 0.14390757903456688, "num_tokens": 4755072.0, "step": 2550 }, { "entropy": 6.292112064361572, "epoch": 0.22562698693041328, "grad_norm": 0.96484375, "learning_rate": 0.000499787152827063, "loss": 6.2887, "mean_token_accuracy": 0.14927180036902427, "num_tokens": 4764580.0, "step": 2555 }, { "entropy": 6.362084197998047, "epoch": 0.2260685270222536, "grad_norm": 1.03125, "learning_rate": 0.0004997857811684035, "loss": 6.3109, "mean_token_accuracy": 0.14619807451963424, "num_tokens": 4774135.0, "step": 2560 }, { "entropy": 6.443705081939697, "epoch": 0.22651006711409397, "grad_norm": 1.03125, "learning_rate": 0.0004997844051063183, "loss": 6.3931, "mean_token_accuracy": 0.1407366193830967, "num_tokens": 4784733.0, "step": 2565 }, { "entropy": 6.393408250808716, "epoch": 0.2269516072059343, "grad_norm": 0.9140625, "learning_rate": 0.0004997830246408346, "loss": 6.3304, "mean_token_accuracy": 0.14791636019945145, "num_tokens": 4795327.0, "step": 2570 }, { "entropy": 6.342170095443725, "epoch": 0.22739314729777463, "grad_norm": 0.98046875, "learning_rate": 0.0004997816397719791, "loss": 6.3028, "mean_token_accuracy": 0.14114121049642564, "num_tokens": 4804314.0, "step": 2575 }, { "entropy": 6.458865118026734, "epoch": 0.227834687389615, "grad_norm": 1.0078125, "learning_rate": 0.0004997802504997792, "loss": 6.3913, "mean_token_accuracy": 0.13652418628335, "num_tokens": 4813637.0, "step": 2580 }, { "entropy": 6.382553291320801, "epoch": 0.22827622748145532, "grad_norm": 1.015625, "learning_rate": 0.0004997788568242621, "loss": 6.2591, "mean_token_accuracy": 0.14338775500655174, "num_tokens": 4823094.0, "step": 2585 }, { "entropy": 6.327674865722656, "epoch": 0.22871776757329565, "grad_norm": 0.8984375, "learning_rate": 0.000499777458745455, "loss": 6.1969, "mean_token_accuracy": 0.1477431207895279, "num_tokens": 4833199.0, "step": 2590 }, { "entropy": 6.392719554901123, "epoch": 0.22915930766513598, "grad_norm": 1.0234375, "learning_rate": 0.0004997760562633853, "loss": 6.2909, "mean_token_accuracy": 0.14219107255339622, "num_tokens": 4842970.0, "step": 2595 }, { "entropy": 6.394578695297241, "epoch": 0.22960084775697634, "grad_norm": 1.015625, "learning_rate": 0.0004997746493780804, "loss": 6.3788, "mean_token_accuracy": 0.13738251477479935, "num_tokens": 4852043.0, "step": 2600 }, { "entropy": 6.376160097122193, "epoch": 0.23004238784881667, "grad_norm": 1.0390625, "learning_rate": 0.000499773238089568, "loss": 6.2693, "mean_token_accuracy": 0.13766007199883462, "num_tokens": 4862232.0, "step": 2605 }, { "entropy": 6.345012950897217, "epoch": 0.230483927940657, "grad_norm": 1.125, "learning_rate": 0.0004997718223978758, "loss": 6.2081, "mean_token_accuracy": 0.1466532751917839, "num_tokens": 4871186.0, "step": 2610 }, { "entropy": 6.343753385543823, "epoch": 0.23092546803249736, "grad_norm": 1.0078125, "learning_rate": 0.0004997704023030315, "loss": 6.3059, "mean_token_accuracy": 0.150559451431036, "num_tokens": 4879974.0, "step": 2615 }, { "entropy": 6.393126726150513, "epoch": 0.2313670081243377, "grad_norm": 1.0078125, "learning_rate": 0.0004997689778050627, "loss": 6.3617, "mean_token_accuracy": 0.14292784333229064, "num_tokens": 4890300.0, "step": 2620 }, { "entropy": 6.369229030609131, "epoch": 0.23180854821617802, "grad_norm": 0.9921875, "learning_rate": 0.0004997675489039975, "loss": 6.3301, "mean_token_accuracy": 0.1426799289882183, "num_tokens": 4900428.0, "step": 2625 }, { "entropy": 6.3974145412445065, "epoch": 0.23225008830801835, "grad_norm": 1.03125, "learning_rate": 0.0004997661155998638, "loss": 6.3245, "mean_token_accuracy": 0.1442883849143982, "num_tokens": 4910092.0, "step": 2630 }, { "entropy": 6.384046411514282, "epoch": 0.2326916283998587, "grad_norm": 1.03125, "learning_rate": 0.0004997646778926898, "loss": 6.3247, "mean_token_accuracy": 0.13738622814416884, "num_tokens": 4919593.0, "step": 2635 }, { "entropy": 6.373585891723633, "epoch": 0.23313316849169904, "grad_norm": 1.0390625, "learning_rate": 0.0004997632357825035, "loss": 6.3001, "mean_token_accuracy": 0.14202155098319053, "num_tokens": 4929098.0, "step": 2640 }, { "entropy": 6.43012547492981, "epoch": 0.23357470858353938, "grad_norm": 1.0859375, "learning_rate": 0.0004997617892693333, "loss": 6.3657, "mean_token_accuracy": 0.1421157017350197, "num_tokens": 4938265.0, "step": 2645 }, { "entropy": 6.394748878479004, "epoch": 0.23401624867537973, "grad_norm": 1.0390625, "learning_rate": 0.0004997603383532075, "loss": 6.3016, "mean_token_accuracy": 0.14679210409522056, "num_tokens": 4946694.0, "step": 2650 }, { "entropy": 6.368328046798706, "epoch": 0.23445778876722007, "grad_norm": 1.078125, "learning_rate": 0.0004997588830341545, "loss": 6.3132, "mean_token_accuracy": 0.1434150867164135, "num_tokens": 4955296.0, "step": 2655 }, { "entropy": 6.32787938117981, "epoch": 0.2348993288590604, "grad_norm": 1.015625, "learning_rate": 0.0004997574233122028, "loss": 6.2759, "mean_token_accuracy": 0.14597226828336715, "num_tokens": 4964409.0, "step": 2660 }, { "entropy": 6.3884584426879885, "epoch": 0.23534086895090076, "grad_norm": 1.0234375, "learning_rate": 0.0004997559591873809, "loss": 6.284, "mean_token_accuracy": 0.1481903851032257, "num_tokens": 4973449.0, "step": 2665 }, { "entropy": 6.345009279251099, "epoch": 0.2357824090427411, "grad_norm": 0.99609375, "learning_rate": 0.0004997544906597178, "loss": 6.2779, "mean_token_accuracy": 0.1470661997795105, "num_tokens": 4983057.0, "step": 2670 }, { "entropy": 6.2613893985748295, "epoch": 0.23622394913458142, "grad_norm": 2.03125, "learning_rate": 0.0004997530177292418, "loss": 6.3532, "mean_token_accuracy": 0.13861697241663934, "num_tokens": 4991950.0, "step": 2675 }, { "entropy": 6.461032247543335, "epoch": 0.23666548922642175, "grad_norm": 0.9921875, "learning_rate": 0.0004997515403959823, "loss": 6.2857, "mean_token_accuracy": 0.14603266417980193, "num_tokens": 5001042.0, "step": 2680 }, { "entropy": 6.356680679321289, "epoch": 0.2371070293182621, "grad_norm": 1.0625, "learning_rate": 0.0004997500586599677, "loss": 6.2198, "mean_token_accuracy": 0.15022996366024016, "num_tokens": 5009827.0, "step": 2685 }, { "entropy": 6.292784547805786, "epoch": 0.23754856941010244, "grad_norm": 0.99609375, "learning_rate": 0.0004997485725212274, "loss": 6.2662, "mean_token_accuracy": 0.1465997129678726, "num_tokens": 5018708.0, "step": 2690 }, { "entropy": 6.334398937225342, "epoch": 0.23799010950194277, "grad_norm": 1.046875, "learning_rate": 0.0004997470819797903, "loss": 6.1678, "mean_token_accuracy": 0.149826068431139, "num_tokens": 5027522.0, "step": 2695 }, { "entropy": 6.312096786499024, "epoch": 0.23843164959378313, "grad_norm": 1.0078125, "learning_rate": 0.0004997455870356857, "loss": 6.2858, "mean_token_accuracy": 0.14754335582256317, "num_tokens": 5035755.0, "step": 2700 }, { "entropy": 6.38718318939209, "epoch": 0.23887318968562346, "grad_norm": 1.078125, "learning_rate": 0.0004997440876889429, "loss": 6.2373, "mean_token_accuracy": 0.14902258217334746, "num_tokens": 5045289.0, "step": 2705 }, { "entropy": 6.2602025985717775, "epoch": 0.2393147297774638, "grad_norm": 1.03125, "learning_rate": 0.0004997425839395913, "loss": 6.2623, "mean_token_accuracy": 0.14851141721010208, "num_tokens": 5053774.0, "step": 2710 }, { "entropy": 6.4446056365966795, "epoch": 0.23975626986930412, "grad_norm": 0.98828125, "learning_rate": 0.0004997410757876602, "loss": 6.3368, "mean_token_accuracy": 0.13768139705061913, "num_tokens": 5062911.0, "step": 2715 }, { "entropy": 6.34313178062439, "epoch": 0.24019780996114448, "grad_norm": 1.0234375, "learning_rate": 0.0004997395632331793, "loss": 6.1974, "mean_token_accuracy": 0.15056394785642624, "num_tokens": 5072107.0, "step": 2720 }, { "entropy": 6.232982730865478, "epoch": 0.2406393500529848, "grad_norm": 1.078125, "learning_rate": 0.0004997380462761781, "loss": 6.1744, "mean_token_accuracy": 0.15013156086206436, "num_tokens": 5080588.0, "step": 2725 }, { "entropy": 6.360136985778809, "epoch": 0.24108089014482514, "grad_norm": 1.046875, "learning_rate": 0.0004997365249166864, "loss": 6.3571, "mean_token_accuracy": 0.1455472856760025, "num_tokens": 5090262.0, "step": 2730 }, { "entropy": 6.398047304153442, "epoch": 0.2415224302366655, "grad_norm": 1.0078125, "learning_rate": 0.0004997349991547342, "loss": 6.2776, "mean_token_accuracy": 0.15021264627575875, "num_tokens": 5099285.0, "step": 2735 }, { "entropy": 6.356108903884888, "epoch": 0.24196397032850583, "grad_norm": 1.0625, "learning_rate": 0.0004997334689903509, "loss": 6.3226, "mean_token_accuracy": 0.14855852872133254, "num_tokens": 5109115.0, "step": 2740 }, { "entropy": 6.388672256469727, "epoch": 0.24240551042034617, "grad_norm": 1.2265625, "learning_rate": 0.0004997319344235668, "loss": 6.3429, "mean_token_accuracy": 0.14180680066347123, "num_tokens": 5117977.0, "step": 2745 }, { "entropy": 6.393019914627075, "epoch": 0.2428470505121865, "grad_norm": 1.015625, "learning_rate": 0.000499730395454412, "loss": 6.3025, "mean_token_accuracy": 0.1451731264591217, "num_tokens": 5127457.0, "step": 2750 }, { "entropy": 6.300089502334595, "epoch": 0.24328859060402686, "grad_norm": 1.0625, "learning_rate": 0.0004997288520829166, "loss": 6.3466, "mean_token_accuracy": 0.14065672382712363, "num_tokens": 5137310.0, "step": 2755 }, { "entropy": 6.394161605834961, "epoch": 0.2437301306958672, "grad_norm": 1.0625, "learning_rate": 0.0004997273043091107, "loss": 6.2725, "mean_token_accuracy": 0.14155926927924156, "num_tokens": 5146963.0, "step": 2760 }, { "entropy": 6.31079797744751, "epoch": 0.24417167078770752, "grad_norm": 1.0390625, "learning_rate": 0.0004997257521330248, "loss": 6.2601, "mean_token_accuracy": 0.14022860154509545, "num_tokens": 5155521.0, "step": 2765 }, { "entropy": 6.40408935546875, "epoch": 0.24461321087954788, "grad_norm": 1.0625, "learning_rate": 0.0004997241955546892, "loss": 6.237, "mean_token_accuracy": 0.14580907300114632, "num_tokens": 5165182.0, "step": 2770 }, { "entropy": 6.289978647232056, "epoch": 0.2450547509713882, "grad_norm": 1.0, "learning_rate": 0.0004997226345741343, "loss": 6.2649, "mean_token_accuracy": 0.13975519686937332, "num_tokens": 5175511.0, "step": 2775 }, { "entropy": 6.354159593582153, "epoch": 0.24549629106322854, "grad_norm": 1.1015625, "learning_rate": 0.000499721069191391, "loss": 6.2126, "mean_token_accuracy": 0.14699607565999032, "num_tokens": 5184772.0, "step": 2780 }, { "entropy": 6.288126516342163, "epoch": 0.24593783115506887, "grad_norm": 0.96484375, "learning_rate": 0.0004997194994064896, "loss": 6.3014, "mean_token_accuracy": 0.14003223031759263, "num_tokens": 5195136.0, "step": 2785 }, { "entropy": 6.391946315765381, "epoch": 0.24637937124690923, "grad_norm": 1.140625, "learning_rate": 0.000499717925219461, "loss": 6.2279, "mean_token_accuracy": 0.15093553364276885, "num_tokens": 5203163.0, "step": 2790 }, { "entropy": 6.346267318725586, "epoch": 0.24682091133874956, "grad_norm": 1.015625, "learning_rate": 0.0004997163466303362, "loss": 6.3224, "mean_token_accuracy": 0.14383373707532882, "num_tokens": 5213233.0, "step": 2795 }, { "entropy": 6.329722881317139, "epoch": 0.2472624514305899, "grad_norm": 0.96875, "learning_rate": 0.000499714763639146, "loss": 6.2346, "mean_token_accuracy": 0.14386766105890275, "num_tokens": 5222940.0, "step": 2800 }, { "entropy": 6.36411566734314, "epoch": 0.24770399152243025, "grad_norm": 1.125, "learning_rate": 0.0004997131762459211, "loss": 6.2596, "mean_token_accuracy": 0.14432956129312516, "num_tokens": 5232263.0, "step": 2805 }, { "entropy": 6.369926023483276, "epoch": 0.24814553161427058, "grad_norm": 1.1484375, "learning_rate": 0.0004997115844506932, "loss": 6.2334, "mean_token_accuracy": 0.14295720756053926, "num_tokens": 5241536.0, "step": 2810 }, { "entropy": 6.348195028305054, "epoch": 0.2485870717061109, "grad_norm": 1.078125, "learning_rate": 0.0004997099882534929, "loss": 6.2732, "mean_token_accuracy": 0.14211497604846954, "num_tokens": 5250702.0, "step": 2815 }, { "entropy": 6.359058141708374, "epoch": 0.24902861179795124, "grad_norm": 1.015625, "learning_rate": 0.0004997083876543519, "loss": 6.2763, "mean_token_accuracy": 0.14498503208160402, "num_tokens": 5259811.0, "step": 2820 }, { "entropy": 6.397128582000732, "epoch": 0.2494701518897916, "grad_norm": 0.95703125, "learning_rate": 0.0004997067826533014, "loss": 6.3615, "mean_token_accuracy": 0.13723283037543296, "num_tokens": 5270518.0, "step": 2825 }, { "entropy": 6.375804328918457, "epoch": 0.24991169198163193, "grad_norm": 1.421875, "learning_rate": 0.0004997051732503726, "loss": 6.2458, "mean_token_accuracy": 0.14747673273086548, "num_tokens": 5279538.0, "step": 2830 }, { "entropy": 6.313772678375244, "epoch": 0.25035323207347226, "grad_norm": 1.1015625, "learning_rate": 0.0004997035594455975, "loss": 6.2702, "mean_token_accuracy": 0.13872402533888817, "num_tokens": 5289633.0, "step": 2835 }, { "entropy": 6.370833015441894, "epoch": 0.2507947721653126, "grad_norm": 1.109375, "learning_rate": 0.0004997019412390074, "loss": 6.3603, "mean_token_accuracy": 0.1444901555776596, "num_tokens": 5299148.0, "step": 2840 }, { "entropy": 6.36065092086792, "epoch": 0.2512363122571529, "grad_norm": 1.015625, "learning_rate": 0.000499700318630634, "loss": 6.2527, "mean_token_accuracy": 0.1457889422774315, "num_tokens": 5309090.0, "step": 2845 }, { "entropy": 6.384716939926148, "epoch": 0.2516778523489933, "grad_norm": 1.0546875, "learning_rate": 0.0004996986916205092, "loss": 6.3297, "mean_token_accuracy": 0.14129810705780982, "num_tokens": 5318798.0, "step": 2850 }, { "entropy": 6.313976621627807, "epoch": 0.25211939244083365, "grad_norm": 1.0390625, "learning_rate": 0.0004996970602086648, "loss": 6.1848, "mean_token_accuracy": 0.15023760497570038, "num_tokens": 5327915.0, "step": 2855 }, { "entropy": 6.251888847351074, "epoch": 0.252560932532674, "grad_norm": 1.078125, "learning_rate": 0.0004996954243951327, "loss": 6.2192, "mean_token_accuracy": 0.15385923832654952, "num_tokens": 5336970.0, "step": 2860 }, { "entropy": 6.299870014190674, "epoch": 0.2530024726245143, "grad_norm": 1.0625, "learning_rate": 0.0004996937841799451, "loss": 6.1821, "mean_token_accuracy": 0.15226729065179825, "num_tokens": 5345167.0, "step": 2865 }, { "entropy": 6.1627833366394045, "epoch": 0.25344401271635464, "grad_norm": 1.28125, "learning_rate": 0.0004996921395631342, "loss": 6.1711, "mean_token_accuracy": 0.14804726019501685, "num_tokens": 5353399.0, "step": 2870 }, { "entropy": 6.380198335647583, "epoch": 0.25388555280819497, "grad_norm": 1.046875, "learning_rate": 0.000499690490544732, "loss": 6.285, "mean_token_accuracy": 0.146748573333025, "num_tokens": 5363224.0, "step": 2875 }, { "entropy": 6.372404766082764, "epoch": 0.2543270929000353, "grad_norm": 1.015625, "learning_rate": 0.0004996888371247707, "loss": 6.2862, "mean_token_accuracy": 0.14108646661043167, "num_tokens": 5372274.0, "step": 2880 }, { "entropy": 6.302175998687744, "epoch": 0.2547686329918757, "grad_norm": 1.0390625, "learning_rate": 0.000499687179303283, "loss": 6.2746, "mean_token_accuracy": 0.15104661732912064, "num_tokens": 5380240.0, "step": 2885 }, { "entropy": 6.269204902648926, "epoch": 0.255210173083716, "grad_norm": 0.99609375, "learning_rate": 0.0004996855170803012, "loss": 6.138, "mean_token_accuracy": 0.15041064321994782, "num_tokens": 5389390.0, "step": 2890 }, { "entropy": 6.343308639526367, "epoch": 0.25565171317555635, "grad_norm": 1.03125, "learning_rate": 0.0004996838504558581, "loss": 6.2986, "mean_token_accuracy": 0.14492825120687486, "num_tokens": 5399425.0, "step": 2895 }, { "entropy": 6.379653215408325, "epoch": 0.2560932532673967, "grad_norm": 1.1640625, "learning_rate": 0.000499682179429986, "loss": 6.3089, "mean_token_accuracy": 0.14139388352632523, "num_tokens": 5408717.0, "step": 2900 }, { "entropy": 6.292103576660156, "epoch": 0.256534793359237, "grad_norm": 1.0078125, "learning_rate": 0.0004996805040027178, "loss": 6.2399, "mean_token_accuracy": 0.1403766691684723, "num_tokens": 5418475.0, "step": 2905 }, { "entropy": 6.395513296127319, "epoch": 0.25697633345107734, "grad_norm": 1.0625, "learning_rate": 0.0004996788241740863, "loss": 6.2884, "mean_token_accuracy": 0.143946073949337, "num_tokens": 5428403.0, "step": 2910 }, { "entropy": 6.366812467575073, "epoch": 0.2574178735429177, "grad_norm": 1.0625, "learning_rate": 0.0004996771399441243, "loss": 6.3188, "mean_token_accuracy": 0.14125285297632217, "num_tokens": 5437347.0, "step": 2915 }, { "entropy": 6.395107555389404, "epoch": 0.25785941363475806, "grad_norm": 0.99609375, "learning_rate": 0.0004996754513128652, "loss": 6.2216, "mean_token_accuracy": 0.1553879424929619, "num_tokens": 5446804.0, "step": 2920 }, { "entropy": 6.245992279052734, "epoch": 0.2583009537265984, "grad_norm": 1.0078125, "learning_rate": 0.0004996737582803416, "loss": 6.1701, "mean_token_accuracy": 0.14774591475725174, "num_tokens": 5455888.0, "step": 2925 }, { "entropy": 6.349690961837768, "epoch": 0.2587424938184387, "grad_norm": 1.0859375, "learning_rate": 0.0004996720608465868, "loss": 6.1785, "mean_token_accuracy": 0.14454589933156967, "num_tokens": 5463977.0, "step": 2930 }, { "entropy": 6.251695680618286, "epoch": 0.25918403391027905, "grad_norm": 0.953125, "learning_rate": 0.0004996703590116342, "loss": 6.2901, "mean_token_accuracy": 0.1413638859987259, "num_tokens": 5473780.0, "step": 2935 }, { "entropy": 6.343139219284057, "epoch": 0.2596255740021194, "grad_norm": 1.1328125, "learning_rate": 0.0004996686527755171, "loss": 6.1747, "mean_token_accuracy": 0.15054369121789932, "num_tokens": 5482151.0, "step": 2940 }, { "entropy": 6.287330961227417, "epoch": 0.2600671140939597, "grad_norm": 1.015625, "learning_rate": 0.0004996669421382687, "loss": 6.181, "mean_token_accuracy": 0.15408090725541115, "num_tokens": 5491103.0, "step": 2945 }, { "entropy": 6.23843822479248, "epoch": 0.26050865418580005, "grad_norm": 1.25, "learning_rate": 0.0004996652270999228, "loss": 6.2051, "mean_token_accuracy": 0.1455566719174385, "num_tokens": 5500367.0, "step": 2950 }, { "entropy": 6.401996898651123, "epoch": 0.26095019427764043, "grad_norm": 1.125, "learning_rate": 0.0004996635076605128, "loss": 6.2392, "mean_token_accuracy": 0.1509515941143036, "num_tokens": 5509631.0, "step": 2955 }, { "entropy": 6.3384003162384035, "epoch": 0.26139173436948077, "grad_norm": 1.15625, "learning_rate": 0.0004996617838200725, "loss": 6.2572, "mean_token_accuracy": 0.14331620335578918, "num_tokens": 5518635.0, "step": 2960 }, { "entropy": 6.241027069091797, "epoch": 0.2618332744613211, "grad_norm": 1.125, "learning_rate": 0.0004996600555786357, "loss": 6.2142, "mean_token_accuracy": 0.1464727446436882, "num_tokens": 5527696.0, "step": 2965 }, { "entropy": 6.348132085800171, "epoch": 0.26227481455316143, "grad_norm": 1.1328125, "learning_rate": 0.0004996583229362362, "loss": 6.1834, "mean_token_accuracy": 0.14780823439359664, "num_tokens": 5536632.0, "step": 2970 }, { "entropy": 6.378821849822998, "epoch": 0.26271635464500176, "grad_norm": 1.1171875, "learning_rate": 0.0004996565858929078, "loss": 6.2627, "mean_token_accuracy": 0.14528179541230202, "num_tokens": 5545825.0, "step": 2975 }, { "entropy": 6.259585618972778, "epoch": 0.2631578947368421, "grad_norm": 1.03125, "learning_rate": 0.0004996548444486847, "loss": 6.1389, "mean_token_accuracy": 0.15060140788555146, "num_tokens": 5555158.0, "step": 2980 }, { "entropy": 6.116889953613281, "epoch": 0.2635994348286824, "grad_norm": 1.0234375, "learning_rate": 0.0004996530986036008, "loss": 6.0795, "mean_token_accuracy": 0.15272270664572715, "num_tokens": 5564218.0, "step": 2985 }, { "entropy": 6.255494451522827, "epoch": 0.2640409749205228, "grad_norm": 1.0703125, "learning_rate": 0.0004996513483576907, "loss": 6.2219, "mean_token_accuracy": 0.14951637461781503, "num_tokens": 5572760.0, "step": 2990 }, { "entropy": 6.423755645751953, "epoch": 0.26448251501236314, "grad_norm": 1.09375, "learning_rate": 0.0004996495937109884, "loss": 6.2825, "mean_token_accuracy": 0.14191085398197173, "num_tokens": 5581660.0, "step": 2995 }, { "entropy": 6.250067615509034, "epoch": 0.26492405510420347, "grad_norm": 1.046875, "learning_rate": 0.0004996478346635283, "loss": 6.1968, "mean_token_accuracy": 0.1436440147459507, "num_tokens": 5590664.0, "step": 3000 }, { "epoch": 0.26492405510420347, "eval_entropy": 6.077911184598204, "eval_loss": 6.2711029052734375, "eval_mean_token_accuracy": 0.15016848111384373, "eval_num_tokens": 5590664.0, "eval_runtime": 26.2453, "eval_samples_per_second": 1345.574, "eval_steps_per_second": 168.221, "step": 3000 }, { "entropy": 6.211904573440552, "epoch": 0.2653655951960438, "grad_norm": 1.0703125, "learning_rate": 0.0004996460712153448, "loss": 6.0603, "mean_token_accuracy": 0.1621384307742119, "num_tokens": 5598727.0, "step": 3005 }, { "entropy": 6.220605993270874, "epoch": 0.26580713528788413, "grad_norm": 1.3984375, "learning_rate": 0.0004996443033664726, "loss": 6.2746, "mean_token_accuracy": 0.1402523137629032, "num_tokens": 5608549.0, "step": 3010 }, { "entropy": 6.372675228118896, "epoch": 0.26624867537972446, "grad_norm": 1.125, "learning_rate": 0.0004996425311169463, "loss": 6.1935, "mean_token_accuracy": 0.15231838524341584, "num_tokens": 5617628.0, "step": 3015 }, { "entropy": 6.362924861907959, "epoch": 0.2666902154715648, "grad_norm": 1.0703125, "learning_rate": 0.0004996407544668005, "loss": 6.2649, "mean_token_accuracy": 0.14236303716897963, "num_tokens": 5627182.0, "step": 3020 }, { "entropy": 6.325074291229248, "epoch": 0.2671317555634052, "grad_norm": 1.109375, "learning_rate": 0.0004996389734160701, "loss": 6.293, "mean_token_accuracy": 0.14348414838314055, "num_tokens": 5636413.0, "step": 3025 }, { "entropy": 6.389656829833984, "epoch": 0.2675732956552455, "grad_norm": 1.078125, "learning_rate": 0.00049963718796479, "loss": 6.2355, "mean_token_accuracy": 0.14448917284607887, "num_tokens": 5646025.0, "step": 3030 }, { "entropy": 6.272923135757447, "epoch": 0.26801483574708584, "grad_norm": 1.0078125, "learning_rate": 0.0004996353981129952, "loss": 6.2316, "mean_token_accuracy": 0.15121424347162246, "num_tokens": 5656577.0, "step": 3035 }, { "entropy": 6.341066169738769, "epoch": 0.2684563758389262, "grad_norm": 1.09375, "learning_rate": 0.0004996336038607206, "loss": 6.2194, "mean_token_accuracy": 0.1483485922217369, "num_tokens": 5665163.0, "step": 3040 }, { "entropy": 6.198571634292603, "epoch": 0.2688979159307665, "grad_norm": 1.09375, "learning_rate": 0.0004996318052080015, "loss": 6.2208, "mean_token_accuracy": 0.14988780170679092, "num_tokens": 5675196.0, "step": 3045 }, { "entropy": 6.395411729812622, "epoch": 0.26933945602260684, "grad_norm": 1.1328125, "learning_rate": 0.0004996300021548731, "loss": 6.163, "mean_token_accuracy": 0.14734217673540115, "num_tokens": 5684361.0, "step": 3050 }, { "entropy": 6.142444658279419, "epoch": 0.26978099611444717, "grad_norm": 1.03125, "learning_rate": 0.0004996281947013707, "loss": 6.1584, "mean_token_accuracy": 0.1533804029226303, "num_tokens": 5694187.0, "step": 3055 }, { "entropy": 6.307489728927612, "epoch": 0.27022253620628756, "grad_norm": 1.2265625, "learning_rate": 0.0004996263828475298, "loss": 6.2235, "mean_token_accuracy": 0.1459271177649498, "num_tokens": 5702858.0, "step": 3060 }, { "entropy": 6.287201023101806, "epoch": 0.2706640762981279, "grad_norm": 1.0390625, "learning_rate": 0.0004996245665933857, "loss": 6.124, "mean_token_accuracy": 0.1559446483850479, "num_tokens": 5712727.0, "step": 3065 }, { "entropy": 6.197343635559082, "epoch": 0.2711056163899682, "grad_norm": 1.1484375, "learning_rate": 0.0004996227459389741, "loss": 6.2573, "mean_token_accuracy": 0.14863042607903482, "num_tokens": 5722904.0, "step": 3070 }, { "entropy": 6.400126695632935, "epoch": 0.27154715648180855, "grad_norm": 1.0859375, "learning_rate": 0.0004996209208843307, "loss": 6.3578, "mean_token_accuracy": 0.14644555673003196, "num_tokens": 5732228.0, "step": 3075 }, { "entropy": 6.322457504272461, "epoch": 0.2719886965736489, "grad_norm": 0.94921875, "learning_rate": 0.0004996190914294912, "loss": 6.2574, "mean_token_accuracy": 0.1481010966002941, "num_tokens": 5743548.0, "step": 3080 }, { "entropy": 6.271878862380982, "epoch": 0.2724302366654892, "grad_norm": 1.015625, "learning_rate": 0.0004996172575744914, "loss": 6.2867, "mean_token_accuracy": 0.13876855000853539, "num_tokens": 5754112.0, "step": 3085 }, { "entropy": 6.337986612319947, "epoch": 0.27287177675732954, "grad_norm": 0.99609375, "learning_rate": 0.0004996154193193673, "loss": 6.0993, "mean_token_accuracy": 0.15423450618982315, "num_tokens": 5763377.0, "step": 3090 }, { "entropy": 6.137074375152588, "epoch": 0.27331331684916993, "grad_norm": 1.0546875, "learning_rate": 0.0004996135766641549, "loss": 6.0806, "mean_token_accuracy": 0.15594624429941178, "num_tokens": 5772000.0, "step": 3095 }, { "entropy": 6.32679500579834, "epoch": 0.27375485694101026, "grad_norm": 1.078125, "learning_rate": 0.0004996117296088903, "loss": 6.2131, "mean_token_accuracy": 0.14269956201314926, "num_tokens": 5782192.0, "step": 3100 }, { "entropy": 6.341950845718384, "epoch": 0.2741963970328506, "grad_norm": 1.0546875, "learning_rate": 0.0004996098781536095, "loss": 6.2614, "mean_token_accuracy": 0.1522984981536865, "num_tokens": 5791163.0, "step": 3105 }, { "entropy": 6.3308337211608885, "epoch": 0.2746379371246909, "grad_norm": 1.0546875, "learning_rate": 0.0004996080222983492, "loss": 6.259, "mean_token_accuracy": 0.14087159857153891, "num_tokens": 5801815.0, "step": 3110 }, { "entropy": 6.410363006591797, "epoch": 0.27507947721653125, "grad_norm": 1.078125, "learning_rate": 0.0004996061620431453, "loss": 6.2805, "mean_token_accuracy": 0.1418459102511406, "num_tokens": 5811545.0, "step": 3115 }, { "entropy": 6.3260101795196535, "epoch": 0.2755210173083716, "grad_norm": 1.0546875, "learning_rate": 0.0004996042973880344, "loss": 6.2846, "mean_token_accuracy": 0.14676327556371688, "num_tokens": 5820285.0, "step": 3120 }, { "entropy": 6.260831451416015, "epoch": 0.2759625574002119, "grad_norm": 1.0234375, "learning_rate": 0.0004996024283330532, "loss": 6.1242, "mean_token_accuracy": 0.14824864715337754, "num_tokens": 5829284.0, "step": 3125 }, { "entropy": 6.268897342681885, "epoch": 0.2764040974920523, "grad_norm": 1.03125, "learning_rate": 0.000499600554878238, "loss": 6.2951, "mean_token_accuracy": 0.1431872047483921, "num_tokens": 5838973.0, "step": 3130 }, { "entropy": 6.278724241256714, "epoch": 0.27684563758389263, "grad_norm": 1.2578125, "learning_rate": 0.0004995986770236258, "loss": 6.1837, "mean_token_accuracy": 0.15050409361720085, "num_tokens": 5847434.0, "step": 3135 }, { "entropy": 6.304096031188965, "epoch": 0.27728717767573297, "grad_norm": 1.1015625, "learning_rate": 0.0004995967947692533, "loss": 6.1808, "mean_token_accuracy": 0.14390211701393127, "num_tokens": 5856464.0, "step": 3140 }, { "entropy": 6.311007452011109, "epoch": 0.2777287177675733, "grad_norm": 1.1171875, "learning_rate": 0.0004995949081151571, "loss": 6.2402, "mean_token_accuracy": 0.14435049369931222, "num_tokens": 5865892.0, "step": 3145 }, { "entropy": 6.353293609619141, "epoch": 0.2781702578594136, "grad_norm": 1.0625, "learning_rate": 0.0004995930170613746, "loss": 6.2075, "mean_token_accuracy": 0.1576365649700165, "num_tokens": 5874606.0, "step": 3150 }, { "entropy": 6.275713014602661, "epoch": 0.27861179795125396, "grad_norm": 1.0078125, "learning_rate": 0.0004995911216079425, "loss": 6.2116, "mean_token_accuracy": 0.15098029375076294, "num_tokens": 5883853.0, "step": 3155 }, { "entropy": 6.34206337928772, "epoch": 0.2790533380430943, "grad_norm": 1.1484375, "learning_rate": 0.0004995892217548981, "loss": 6.2474, "mean_token_accuracy": 0.1448906570672989, "num_tokens": 5892841.0, "step": 3160 }, { "entropy": 6.324929285049438, "epoch": 0.2794948781349347, "grad_norm": 1.0078125, "learning_rate": 0.0004995873175022786, "loss": 6.2295, "mean_token_accuracy": 0.15006719902157784, "num_tokens": 5901966.0, "step": 3165 }, { "entropy": 6.260264158248901, "epoch": 0.279936418226775, "grad_norm": 1.109375, "learning_rate": 0.0004995854088501213, "loss": 6.1281, "mean_token_accuracy": 0.14603292495012282, "num_tokens": 5910003.0, "step": 3170 }, { "entropy": 6.189469957351685, "epoch": 0.28037795831861534, "grad_norm": 1.0, "learning_rate": 0.0004995834957984634, "loss": 6.2606, "mean_token_accuracy": 0.14485765993595123, "num_tokens": 5920307.0, "step": 3175 }, { "entropy": 6.290289306640625, "epoch": 0.28081949841045567, "grad_norm": 1.015625, "learning_rate": 0.0004995815783473428, "loss": 6.1153, "mean_token_accuracy": 0.15536403357982637, "num_tokens": 5929875.0, "step": 3180 }, { "entropy": 6.247414398193359, "epoch": 0.281261038502296, "grad_norm": 1.1953125, "learning_rate": 0.0004995796564967967, "loss": 6.1021, "mean_token_accuracy": 0.14264860302209853, "num_tokens": 5938570.0, "step": 3185 }, { "entropy": 6.209772968292237, "epoch": 0.28170257859413633, "grad_norm": 1.078125, "learning_rate": 0.0004995777302468628, "loss": 6.2353, "mean_token_accuracy": 0.14704401940107345, "num_tokens": 5947693.0, "step": 3190 }, { "entropy": 6.332046413421631, "epoch": 0.28214411868597666, "grad_norm": 1.1171875, "learning_rate": 0.0004995757995975789, "loss": 6.3286, "mean_token_accuracy": 0.14671236276626587, "num_tokens": 5957377.0, "step": 3195 }, { "entropy": 6.426817464828491, "epoch": 0.28258565877781705, "grad_norm": 1.1171875, "learning_rate": 0.0004995738645489828, "loss": 6.2285, "mean_token_accuracy": 0.14824536591768264, "num_tokens": 5966443.0, "step": 3200 }, { "entropy": 6.258016681671142, "epoch": 0.2830271988696574, "grad_norm": 1.1875, "learning_rate": 0.0004995719251011124, "loss": 6.1471, "mean_token_accuracy": 0.14848615527153014, "num_tokens": 5975027.0, "step": 3205 }, { "entropy": 6.288780975341797, "epoch": 0.2834687389614977, "grad_norm": 1.1171875, "learning_rate": 0.0004995699812540058, "loss": 6.2244, "mean_token_accuracy": 0.15162927508354188, "num_tokens": 5983722.0, "step": 3210 }, { "entropy": 6.282262182235717, "epoch": 0.28391027905333804, "grad_norm": 0.9921875, "learning_rate": 0.000499568033007701, "loss": 6.2324, "mean_token_accuracy": 0.1441471680998802, "num_tokens": 5993358.0, "step": 3215 }, { "entropy": 6.377847146987915, "epoch": 0.2843518191451784, "grad_norm": 0.95703125, "learning_rate": 0.0004995660803622361, "loss": 6.161, "mean_token_accuracy": 0.1480330415070057, "num_tokens": 6002743.0, "step": 3220 }, { "entropy": 6.323432493209839, "epoch": 0.2847933592370187, "grad_norm": 1.09375, "learning_rate": 0.0004995641233176494, "loss": 6.3205, "mean_token_accuracy": 0.14193187803030013, "num_tokens": 6013727.0, "step": 3225 }, { "entropy": 6.404472589492798, "epoch": 0.28523489932885904, "grad_norm": 1.0234375, "learning_rate": 0.0004995621618739792, "loss": 6.2652, "mean_token_accuracy": 0.14536072462797164, "num_tokens": 6023397.0, "step": 3230 }, { "entropy": 6.271181678771972, "epoch": 0.2856764394206994, "grad_norm": 1.0625, "learning_rate": 0.000499560196031264, "loss": 6.1638, "mean_token_accuracy": 0.15034203678369523, "num_tokens": 6032453.0, "step": 3235 }, { "entropy": 6.19854884147644, "epoch": 0.28611797951253976, "grad_norm": 0.96875, "learning_rate": 0.0004995582257895423, "loss": 6.1258, "mean_token_accuracy": 0.14395371079444885, "num_tokens": 6042441.0, "step": 3240 }, { "entropy": 6.274320507049561, "epoch": 0.2865595196043801, "grad_norm": 1.140625, "learning_rate": 0.0004995562511488528, "loss": 6.1197, "mean_token_accuracy": 0.15483347177505494, "num_tokens": 6050881.0, "step": 3245 }, { "entropy": 6.220722341537476, "epoch": 0.2870010596962204, "grad_norm": 1.0546875, "learning_rate": 0.0004995542721092337, "loss": 6.1872, "mean_token_accuracy": 0.15040701180696486, "num_tokens": 6060804.0, "step": 3250 }, { "entropy": 6.368199014663697, "epoch": 0.28744259978806075, "grad_norm": 1.1015625, "learning_rate": 0.0004995522886707244, "loss": 6.2847, "mean_token_accuracy": 0.14250023737549783, "num_tokens": 6070774.0, "step": 3255 }, { "entropy": 6.330855846405029, "epoch": 0.2878841398799011, "grad_norm": 1.0234375, "learning_rate": 0.0004995503008333634, "loss": 6.2491, "mean_token_accuracy": 0.14379776269197464, "num_tokens": 6080158.0, "step": 3260 }, { "entropy": 6.32779221534729, "epoch": 0.2883256799717414, "grad_norm": 1.09375, "learning_rate": 0.0004995483085971897, "loss": 6.16, "mean_token_accuracy": 0.15112278908491134, "num_tokens": 6089183.0, "step": 3265 }, { "entropy": 6.200159549713135, "epoch": 0.2887672200635818, "grad_norm": 1.078125, "learning_rate": 0.0004995463119622424, "loss": 6.2524, "mean_token_accuracy": 0.14461245387792587, "num_tokens": 6098536.0, "step": 3270 }, { "entropy": 6.196738433837891, "epoch": 0.28920876015542213, "grad_norm": 0.98046875, "learning_rate": 0.0004995443109285604, "loss": 6.0112, "mean_token_accuracy": 0.1629092276096344, "num_tokens": 6107745.0, "step": 3275 }, { "entropy": 6.289895725250244, "epoch": 0.28965030024726246, "grad_norm": 0.98046875, "learning_rate": 0.0004995423054961832, "loss": 6.2158, "mean_token_accuracy": 0.15729496926069259, "num_tokens": 6117512.0, "step": 3280 }, { "entropy": 6.227946424484253, "epoch": 0.2900918403391028, "grad_norm": 1.078125, "learning_rate": 0.00049954029566515, "loss": 6.2178, "mean_token_accuracy": 0.15285916179418563, "num_tokens": 6126030.0, "step": 3285 }, { "entropy": 6.346371364593506, "epoch": 0.2905333804309431, "grad_norm": 0.98828125, "learning_rate": 0.0004995382814355, "loss": 6.2625, "mean_token_accuracy": 0.14053603783249854, "num_tokens": 6134888.0, "step": 3290 }, { "entropy": 6.351688432693481, "epoch": 0.29097492052278345, "grad_norm": 1.09375, "learning_rate": 0.0004995362628072728, "loss": 6.2117, "mean_token_accuracy": 0.1513692669570446, "num_tokens": 6144274.0, "step": 3295 }, { "entropy": 6.218261671066284, "epoch": 0.2914164606146238, "grad_norm": 1.0078125, "learning_rate": 0.0004995342397805078, "loss": 6.2217, "mean_token_accuracy": 0.1503726065158844, "num_tokens": 6153406.0, "step": 3300 }, { "entropy": 6.262264680862427, "epoch": 0.29185800070646417, "grad_norm": 1.0859375, "learning_rate": 0.0004995322123552448, "loss": 6.1233, "mean_token_accuracy": 0.1533094823360443, "num_tokens": 6162743.0, "step": 3305 }, { "entropy": 6.2753763675689695, "epoch": 0.2922995407983045, "grad_norm": 1.1015625, "learning_rate": 0.0004995301805315235, "loss": 6.1201, "mean_token_accuracy": 0.15415377020835877, "num_tokens": 6171997.0, "step": 3310 }, { "entropy": 6.181772375106812, "epoch": 0.29274108089014483, "grad_norm": 1.1328125, "learning_rate": 0.0004995281443093837, "loss": 6.1422, "mean_token_accuracy": 0.15062253326177596, "num_tokens": 6181275.0, "step": 3315 }, { "entropy": 6.289641571044922, "epoch": 0.29318262098198516, "grad_norm": 1.0234375, "learning_rate": 0.0004995261036888652, "loss": 6.2135, "mean_token_accuracy": 0.14647497087717057, "num_tokens": 6191640.0, "step": 3320 }, { "entropy": 6.357338809967041, "epoch": 0.2936241610738255, "grad_norm": 1.0078125, "learning_rate": 0.0004995240586700081, "loss": 6.2137, "mean_token_accuracy": 0.14479927867650985, "num_tokens": 6201508.0, "step": 3325 }, { "entropy": 6.220925617218017, "epoch": 0.2940657011656658, "grad_norm": 1.0234375, "learning_rate": 0.0004995220092528522, "loss": 6.1079, "mean_token_accuracy": 0.15776659697294235, "num_tokens": 6209902.0, "step": 3330 }, { "entropy": 6.257762432098389, "epoch": 0.29450724125750616, "grad_norm": 1.0546875, "learning_rate": 0.000499519955437438, "loss": 6.2514, "mean_token_accuracy": 0.14023935049772263, "num_tokens": 6219760.0, "step": 3335 }, { "entropy": 6.248654699325561, "epoch": 0.29494878134934654, "grad_norm": 1.109375, "learning_rate": 0.0004995178972238054, "loss": 6.2308, "mean_token_accuracy": 0.14278148710727692, "num_tokens": 6228721.0, "step": 3340 }, { "entropy": 6.228511571884155, "epoch": 0.2953903214411869, "grad_norm": 1.1171875, "learning_rate": 0.000499515834611995, "loss": 6.0645, "mean_token_accuracy": 0.16055997535586358, "num_tokens": 6237070.0, "step": 3345 }, { "entropy": 6.2931300640106205, "epoch": 0.2958318615330272, "grad_norm": 1.046875, "learning_rate": 0.0004995137676020472, "loss": 6.2051, "mean_token_accuracy": 0.1466339647769928, "num_tokens": 6245659.0, "step": 3350 }, { "entropy": 6.231089639663696, "epoch": 0.29627340162486754, "grad_norm": 1.09375, "learning_rate": 0.0004995116961940023, "loss": 6.1736, "mean_token_accuracy": 0.15321153849363328, "num_tokens": 6255175.0, "step": 3355 }, { "entropy": 6.241381883621216, "epoch": 0.29671494171670787, "grad_norm": 0.96875, "learning_rate": 0.0004995096203879009, "loss": 6.1761, "mean_token_accuracy": 0.1443149983882904, "num_tokens": 6264962.0, "step": 3360 }, { "entropy": 6.29106593132019, "epoch": 0.2971564818085482, "grad_norm": 1.09375, "learning_rate": 0.0004995075401837837, "loss": 6.0989, "mean_token_accuracy": 0.15684471875429154, "num_tokens": 6273411.0, "step": 3365 }, { "entropy": 6.135526657104492, "epoch": 0.29759802190038853, "grad_norm": 1.078125, "learning_rate": 0.0004995054555816915, "loss": 6.0795, "mean_token_accuracy": 0.15481019616127015, "num_tokens": 6282618.0, "step": 3370 }, { "entropy": 6.155969190597534, "epoch": 0.2980395619922289, "grad_norm": 1.0625, "learning_rate": 0.0004995033665816651, "loss": 6.0726, "mean_token_accuracy": 0.15434197783470155, "num_tokens": 6292008.0, "step": 3375 }, { "entropy": 6.147941923141479, "epoch": 0.29848110208406925, "grad_norm": 1.1328125, "learning_rate": 0.0004995012731837454, "loss": 6.0441, "mean_token_accuracy": 0.15588683634996414, "num_tokens": 6301079.0, "step": 3380 }, { "entropy": 6.323877429962158, "epoch": 0.2989226421759096, "grad_norm": 1.046875, "learning_rate": 0.0004994991753879736, "loss": 6.2273, "mean_token_accuracy": 0.14481185078620912, "num_tokens": 6310543.0, "step": 3385 }, { "entropy": 6.276103544235229, "epoch": 0.2993641822677499, "grad_norm": 1.296875, "learning_rate": 0.0004994970731943904, "loss": 6.1692, "mean_token_accuracy": 0.1476268857717514, "num_tokens": 6320037.0, "step": 3390 }, { "entropy": 6.177714109420776, "epoch": 0.29980572235959024, "grad_norm": 1.1015625, "learning_rate": 0.0004994949666030374, "loss": 6.1795, "mean_token_accuracy": 0.15479264855384828, "num_tokens": 6328948.0, "step": 3395 }, { "entropy": 6.212888717651367, "epoch": 0.3002472624514306, "grad_norm": 1.0703125, "learning_rate": 0.0004994928556139557, "loss": 6.0425, "mean_token_accuracy": 0.1581359773874283, "num_tokens": 6337449.0, "step": 3400 }, { "entropy": 6.162185001373291, "epoch": 0.3006888025432709, "grad_norm": 1.109375, "learning_rate": 0.0004994907402271865, "loss": 6.1028, "mean_token_accuracy": 0.15499115511775016, "num_tokens": 6346130.0, "step": 3405 }, { "entropy": 6.15036244392395, "epoch": 0.3011303426351113, "grad_norm": 1.1484375, "learning_rate": 0.0004994886204427715, "loss": 6.1344, "mean_token_accuracy": 0.15851569175720215, "num_tokens": 6354973.0, "step": 3410 }, { "entropy": 6.328536748886108, "epoch": 0.3015718827269516, "grad_norm": 1.1328125, "learning_rate": 0.0004994864962607519, "loss": 6.2148, "mean_token_accuracy": 0.14824963808059693, "num_tokens": 6364107.0, "step": 3415 }, { "entropy": 6.26984658241272, "epoch": 0.30201342281879195, "grad_norm": 1.1484375, "learning_rate": 0.0004994843676811697, "loss": 6.1438, "mean_token_accuracy": 0.14578014612197876, "num_tokens": 6372859.0, "step": 3420 }, { "entropy": 6.184666395187378, "epoch": 0.3024549629106323, "grad_norm": 1.1015625, "learning_rate": 0.0004994822347040664, "loss": 6.0282, "mean_token_accuracy": 0.1636571153998375, "num_tokens": 6381818.0, "step": 3425 }, { "entropy": 6.257387351989746, "epoch": 0.3028965030024726, "grad_norm": 1.09375, "learning_rate": 0.0004994800973294837, "loss": 6.1797, "mean_token_accuracy": 0.1482721135020256, "num_tokens": 6391460.0, "step": 3430 }, { "entropy": 6.25637993812561, "epoch": 0.30333804309431295, "grad_norm": 1.0625, "learning_rate": 0.0004994779555574636, "loss": 6.2086, "mean_token_accuracy": 0.15168215036392213, "num_tokens": 6401461.0, "step": 3435 }, { "entropy": 6.262447023391724, "epoch": 0.3037795831861533, "grad_norm": 1.2265625, "learning_rate": 0.000499475809388048, "loss": 6.0842, "mean_token_accuracy": 0.14643362984061242, "num_tokens": 6410435.0, "step": 3440 }, { "entropy": 6.279139566421509, "epoch": 0.30422112327799367, "grad_norm": 1.125, "learning_rate": 0.000499473658821279, "loss": 6.2844, "mean_token_accuracy": 0.14141586795449257, "num_tokens": 6420710.0, "step": 3445 }, { "entropy": 6.305064296722412, "epoch": 0.304662663369834, "grad_norm": 1.0859375, "learning_rate": 0.0004994715038571986, "loss": 6.1276, "mean_token_accuracy": 0.149975299090147, "num_tokens": 6429882.0, "step": 3450 }, { "entropy": 6.167029666900635, "epoch": 0.30510420346167433, "grad_norm": 1.1171875, "learning_rate": 0.0004994693444958493, "loss": 6.1142, "mean_token_accuracy": 0.15782218649983407, "num_tokens": 6439183.0, "step": 3455 }, { "entropy": 6.307245588302612, "epoch": 0.30554574355351466, "grad_norm": 1.0390625, "learning_rate": 0.000499467180737273, "loss": 6.288, "mean_token_accuracy": 0.14309904649853705, "num_tokens": 6448460.0, "step": 3460 }, { "entropy": 6.270366525650024, "epoch": 0.305987283645355, "grad_norm": 1.1015625, "learning_rate": 0.0004994650125815124, "loss": 6.1656, "mean_token_accuracy": 0.14977657794952393, "num_tokens": 6457687.0, "step": 3465 }, { "entropy": 6.241862058639526, "epoch": 0.3064288237371953, "grad_norm": 1.1484375, "learning_rate": 0.0004994628400286097, "loss": 6.1063, "mean_token_accuracy": 0.15189075246453285, "num_tokens": 6467057.0, "step": 3470 }, { "entropy": 6.209890508651734, "epoch": 0.30687036382903565, "grad_norm": 1.1328125, "learning_rate": 0.0004994606630786078, "loss": 6.1745, "mean_token_accuracy": 0.15533267706632614, "num_tokens": 6477484.0, "step": 3475 }, { "entropy": 6.248384809494018, "epoch": 0.30731190392087604, "grad_norm": 1.0234375, "learning_rate": 0.0004994584817315492, "loss": 6.233, "mean_token_accuracy": 0.14967372938990592, "num_tokens": 6488381.0, "step": 3480 }, { "entropy": 6.30739917755127, "epoch": 0.30775344401271637, "grad_norm": 1.1015625, "learning_rate": 0.0004994562959874765, "loss": 6.1277, "mean_token_accuracy": 0.1523176297545433, "num_tokens": 6498279.0, "step": 3485 }, { "entropy": 6.166545104980469, "epoch": 0.3081949841045567, "grad_norm": 1.0390625, "learning_rate": 0.0004994541058464326, "loss": 6.1476, "mean_token_accuracy": 0.14837286472320557, "num_tokens": 6508008.0, "step": 3490 }, { "entropy": 6.321815013885498, "epoch": 0.30863652419639703, "grad_norm": 1.1015625, "learning_rate": 0.0004994519113084605, "loss": 6.1947, "mean_token_accuracy": 0.1496584579348564, "num_tokens": 6517263.0, "step": 3495 }, { "entropy": 6.262754631042481, "epoch": 0.30907806428823736, "grad_norm": 1.171875, "learning_rate": 0.0004994497123736029, "loss": 6.2631, "mean_token_accuracy": 0.1426304429769516, "num_tokens": 6527682.0, "step": 3500 }, { "entropy": 6.267001819610596, "epoch": 0.3095196043800777, "grad_norm": 1.0546875, "learning_rate": 0.0004994475090419034, "loss": 6.1298, "mean_token_accuracy": 0.15242742300033568, "num_tokens": 6537143.0, "step": 3505 }, { "entropy": 6.250511837005615, "epoch": 0.309961144471918, "grad_norm": 1.09375, "learning_rate": 0.0004994453013134047, "loss": 6.1352, "mean_token_accuracy": 0.14925057888031007, "num_tokens": 6546561.0, "step": 3510 }, { "entropy": 6.185484886169434, "epoch": 0.3104026845637584, "grad_norm": 1.0546875, "learning_rate": 0.0004994430891881502, "loss": 6.124, "mean_token_accuracy": 0.15090959072113036, "num_tokens": 6555806.0, "step": 3515 }, { "entropy": 6.202518653869629, "epoch": 0.31084422465559874, "grad_norm": 1.0546875, "learning_rate": 0.0004994408726661832, "loss": 6.166, "mean_token_accuracy": 0.1501057654619217, "num_tokens": 6564836.0, "step": 3520 }, { "entropy": 6.245743179321289, "epoch": 0.3112857647474391, "grad_norm": 1.046875, "learning_rate": 0.0004994386517475472, "loss": 6.1713, "mean_token_accuracy": 0.1496300369501114, "num_tokens": 6574997.0, "step": 3525 }, { "entropy": 6.246868562698364, "epoch": 0.3117273048392794, "grad_norm": 1.15625, "learning_rate": 0.0004994364264322856, "loss": 6.1715, "mean_token_accuracy": 0.14895060658454895, "num_tokens": 6584589.0, "step": 3530 }, { "entropy": 6.162112808227539, "epoch": 0.31216884493111974, "grad_norm": 1.0703125, "learning_rate": 0.0004994341967204421, "loss": 6.1199, "mean_token_accuracy": 0.15284974724054337, "num_tokens": 6592898.0, "step": 3535 }, { "entropy": 6.326304626464844, "epoch": 0.31261038502296007, "grad_norm": 0.984375, "learning_rate": 0.0004994319626120603, "loss": 6.1268, "mean_token_accuracy": 0.16072259843349457, "num_tokens": 6602454.0, "step": 3540 }, { "entropy": 6.1308966159820555, "epoch": 0.3130519251148004, "grad_norm": 1.1328125, "learning_rate": 0.0004994297241071841, "loss": 6.1467, "mean_token_accuracy": 0.1551404133439064, "num_tokens": 6612454.0, "step": 3545 }, { "entropy": 6.29251446723938, "epoch": 0.3134934652066408, "grad_norm": 1.0703125, "learning_rate": 0.000499427481205857, "loss": 6.1423, "mean_token_accuracy": 0.15539032369852065, "num_tokens": 6621286.0, "step": 3550 }, { "entropy": 6.253758335113526, "epoch": 0.3139350052984811, "grad_norm": 1.265625, "learning_rate": 0.0004994252339081234, "loss": 6.0651, "mean_token_accuracy": 0.15716515630483627, "num_tokens": 6629684.0, "step": 3555 }, { "entropy": 6.167656517028808, "epoch": 0.31437654539032145, "grad_norm": 1.0703125, "learning_rate": 0.000499422982214027, "loss": 6.1775, "mean_token_accuracy": 0.15429823398590087, "num_tokens": 6638836.0, "step": 3560 }, { "entropy": 6.254745101928711, "epoch": 0.3148180854821618, "grad_norm": 1.09375, "learning_rate": 0.0004994207261236121, "loss": 6.1189, "mean_token_accuracy": 0.14906142503023148, "num_tokens": 6647337.0, "step": 3565 }, { "entropy": 6.145832633972168, "epoch": 0.3152596255740021, "grad_norm": 1.1015625, "learning_rate": 0.0004994184656369227, "loss": 6.1026, "mean_token_accuracy": 0.145336801558733, "num_tokens": 6657246.0, "step": 3570 }, { "entropy": 6.310924911499024, "epoch": 0.31570116566584244, "grad_norm": 1.1328125, "learning_rate": 0.0004994162007540033, "loss": 6.2505, "mean_token_accuracy": 0.14196238815784454, "num_tokens": 6667981.0, "step": 3575 }, { "entropy": 6.2675220489501955, "epoch": 0.3161427057576828, "grad_norm": 1.140625, "learning_rate": 0.0004994139314748981, "loss": 6.0974, "mean_token_accuracy": 0.15776362121105195, "num_tokens": 6677462.0, "step": 3580 }, { "entropy": 6.107239675521851, "epoch": 0.31658424584952316, "grad_norm": 1.0234375, "learning_rate": 0.0004994116577996517, "loss": 6.1152, "mean_token_accuracy": 0.15776521414518357, "num_tokens": 6687225.0, "step": 3585 }, { "entropy": 6.2434509754180905, "epoch": 0.3170257859413635, "grad_norm": 1.1171875, "learning_rate": 0.0004994093797283084, "loss": 6.2122, "mean_token_accuracy": 0.1487804666161537, "num_tokens": 6696345.0, "step": 3590 }, { "entropy": 6.313143348693847, "epoch": 0.3174673260332038, "grad_norm": 1.1015625, "learning_rate": 0.0004994070972609132, "loss": 6.1867, "mean_token_accuracy": 0.14744829311966895, "num_tokens": 6706207.0, "step": 3595 }, { "entropy": 6.24591326713562, "epoch": 0.31790886612504415, "grad_norm": 1.1953125, "learning_rate": 0.0004994048103975103, "loss": 6.0904, "mean_token_accuracy": 0.15303746610879898, "num_tokens": 6714430.0, "step": 3600 }, { "entropy": 6.204165840148926, "epoch": 0.3183504062168845, "grad_norm": 1.0703125, "learning_rate": 0.000499402519138145, "loss": 6.1197, "mean_token_accuracy": 0.1557806834578514, "num_tokens": 6724139.0, "step": 3605 }, { "entropy": 6.151504611968994, "epoch": 0.3187919463087248, "grad_norm": 1.078125, "learning_rate": 0.0004994002234828619, "loss": 6.1566, "mean_token_accuracy": 0.1508852459490299, "num_tokens": 6733525.0, "step": 3610 }, { "entropy": 6.321462392807007, "epoch": 0.31923348640056515, "grad_norm": 1.0859375, "learning_rate": 0.000499397923431706, "loss": 6.2666, "mean_token_accuracy": 0.1465996690094471, "num_tokens": 6743738.0, "step": 3615 }, { "entropy": 6.337309741973877, "epoch": 0.31967502649240553, "grad_norm": 1.21875, "learning_rate": 0.0004993956189847226, "loss": 6.1012, "mean_token_accuracy": 0.15459158420562744, "num_tokens": 6752900.0, "step": 3620 }, { "entropy": 6.150611686706543, "epoch": 0.32011656658424587, "grad_norm": 1.140625, "learning_rate": 0.0004993933101419565, "loss": 6.106, "mean_token_accuracy": 0.1578878253698349, "num_tokens": 6761776.0, "step": 3625 }, { "entropy": 6.23021969795227, "epoch": 0.3205581066760862, "grad_norm": 1.0390625, "learning_rate": 0.0004993909969034531, "loss": 6.2486, "mean_token_accuracy": 0.1432959534227848, "num_tokens": 6771543.0, "step": 3630 }, { "entropy": 6.232846355438232, "epoch": 0.3209996467679265, "grad_norm": 1.1796875, "learning_rate": 0.0004993886792692576, "loss": 6.1261, "mean_token_accuracy": 0.15022226572036743, "num_tokens": 6780010.0, "step": 3635 }, { "entropy": 6.273227500915527, "epoch": 0.32144118685976686, "grad_norm": 1.09375, "learning_rate": 0.0004993863572394156, "loss": 6.186, "mean_token_accuracy": 0.14826097190380097, "num_tokens": 6788830.0, "step": 3640 }, { "entropy": 6.266057395935059, "epoch": 0.3218827269516072, "grad_norm": 1.03125, "learning_rate": 0.0004993840308139724, "loss": 6.163, "mean_token_accuracy": 0.15327871441841126, "num_tokens": 6799121.0, "step": 3645 }, { "entropy": 6.232186365127563, "epoch": 0.3223242670434475, "grad_norm": 1.0703125, "learning_rate": 0.0004993816999929738, "loss": 6.1253, "mean_token_accuracy": 0.15162717401981354, "num_tokens": 6808519.0, "step": 3650 }, { "entropy": 6.129374217987061, "epoch": 0.3227658071352879, "grad_norm": 1.015625, "learning_rate": 0.0004993793647764651, "loss": 6.1462, "mean_token_accuracy": 0.16090914756059646, "num_tokens": 6817583.0, "step": 3655 }, { "entropy": 6.254270458221436, "epoch": 0.32320734722712824, "grad_norm": 1.078125, "learning_rate": 0.0004993770251644923, "loss": 6.1315, "mean_token_accuracy": 0.15279360860586166, "num_tokens": 6827732.0, "step": 3660 }, { "entropy": 6.188901424407959, "epoch": 0.32364888731896857, "grad_norm": 1.1953125, "learning_rate": 0.0004993746811571013, "loss": 6.1101, "mean_token_accuracy": 0.15757904648780824, "num_tokens": 6836903.0, "step": 3665 }, { "entropy": 6.073100471496582, "epoch": 0.3240904274108089, "grad_norm": 1.0625, "learning_rate": 0.0004993723327543379, "loss": 6.119, "mean_token_accuracy": 0.15876784324645996, "num_tokens": 6845684.0, "step": 3670 }, { "entropy": 6.200423860549927, "epoch": 0.32453196750264923, "grad_norm": 1.0703125, "learning_rate": 0.000499369979956248, "loss": 6.1035, "mean_token_accuracy": 0.14955462887883186, "num_tokens": 6854650.0, "step": 3675 }, { "entropy": 6.267183446884156, "epoch": 0.32497350759448956, "grad_norm": 1.125, "learning_rate": 0.0004993676227628779, "loss": 6.0977, "mean_token_accuracy": 0.1460177183151245, "num_tokens": 6864340.0, "step": 3680 }, { "entropy": 6.208888530731201, "epoch": 0.3254150476863299, "grad_norm": 1.171875, "learning_rate": 0.0004993652611742736, "loss": 6.1343, "mean_token_accuracy": 0.15321332961320877, "num_tokens": 6873244.0, "step": 3685 }, { "entropy": 6.15152006149292, "epoch": 0.3258565877781703, "grad_norm": 0.95703125, "learning_rate": 0.0004993628951904815, "loss": 6.0176, "mean_token_accuracy": 0.15752123296260834, "num_tokens": 6882392.0, "step": 3690 }, { "entropy": 6.2300208568572994, "epoch": 0.3262981278700106, "grad_norm": 1.15625, "learning_rate": 0.0004993605248115479, "loss": 6.2446, "mean_token_accuracy": 0.1435020685195923, "num_tokens": 6891515.0, "step": 3695 }, { "entropy": 6.330176734924317, "epoch": 0.32673966796185094, "grad_norm": 1.0859375, "learning_rate": 0.0004993581500375191, "loss": 6.1177, "mean_token_accuracy": 0.14757276177406312, "num_tokens": 6900104.0, "step": 3700 }, { "entropy": 6.2001354694366455, "epoch": 0.3271812080536913, "grad_norm": 1.3203125, "learning_rate": 0.0004993557708684417, "loss": 6.2227, "mean_token_accuracy": 0.14519331306219102, "num_tokens": 6910443.0, "step": 3705 }, { "entropy": 6.208211469650268, "epoch": 0.3276227481455316, "grad_norm": 1.2109375, "learning_rate": 0.0004993533873043625, "loss": 6.1062, "mean_token_accuracy": 0.15348025262355805, "num_tokens": 6919882.0, "step": 3710 }, { "entropy": 6.281739521026611, "epoch": 0.32806428823737194, "grad_norm": 1.1015625, "learning_rate": 0.000499350999345328, "loss": 6.2142, "mean_token_accuracy": 0.14149869233369827, "num_tokens": 6929757.0, "step": 3715 }, { "entropy": 6.222444868087768, "epoch": 0.32850582832921227, "grad_norm": 1.1171875, "learning_rate": 0.000499348606991385, "loss": 6.201, "mean_token_accuracy": 0.1466899633407593, "num_tokens": 6938578.0, "step": 3720 }, { "entropy": 6.181999588012696, "epoch": 0.32894736842105265, "grad_norm": 1.1953125, "learning_rate": 0.0004993462102425805, "loss": 6.0957, "mean_token_accuracy": 0.14744215086102486, "num_tokens": 6947920.0, "step": 3725 }, { "entropy": 6.170785808563233, "epoch": 0.329388908512893, "grad_norm": 1.125, "learning_rate": 0.0004993438090989612, "loss": 6.0325, "mean_token_accuracy": 0.15373560339212416, "num_tokens": 6957291.0, "step": 3730 }, { "entropy": 6.204143905639649, "epoch": 0.3298304486047333, "grad_norm": 1.1875, "learning_rate": 0.0004993414035605743, "loss": 6.0944, "mean_token_accuracy": 0.16135938167572023, "num_tokens": 6966504.0, "step": 3735 }, { "entropy": 6.26701283454895, "epoch": 0.33027198869657365, "grad_norm": 1.1015625, "learning_rate": 0.0004993389936274669, "loss": 6.1381, "mean_token_accuracy": 0.15607366263866423, "num_tokens": 6976234.0, "step": 3740 }, { "entropy": 6.179308748245239, "epoch": 0.330713528788414, "grad_norm": 1.0859375, "learning_rate": 0.0004993365792996862, "loss": 6.1882, "mean_token_accuracy": 0.1475161299109459, "num_tokens": 6986372.0, "step": 3745 }, { "entropy": 6.258656454086304, "epoch": 0.3311550688802543, "grad_norm": 1.046875, "learning_rate": 0.0004993341605772795, "loss": 6.0768, "mean_token_accuracy": 0.155860435962677, "num_tokens": 6995553.0, "step": 3750 }, { "entropy": 6.148722696304321, "epoch": 0.33159660897209464, "grad_norm": 1.1015625, "learning_rate": 0.0004993317374602941, "loss": 5.9902, "mean_token_accuracy": 0.16303742080926895, "num_tokens": 7005675.0, "step": 3755 }, { "entropy": 6.081928777694702, "epoch": 0.33203814906393503, "grad_norm": 1.2734375, "learning_rate": 0.0004993293099487777, "loss": 6.1198, "mean_token_accuracy": 0.14885973036289216, "num_tokens": 7014953.0, "step": 3760 }, { "entropy": 6.263726997375488, "epoch": 0.33247968915577536, "grad_norm": 1.1328125, "learning_rate": 0.0004993268780427776, "loss": 6.1445, "mean_token_accuracy": 0.1564795732498169, "num_tokens": 7025075.0, "step": 3765 }, { "entropy": 6.201370096206665, "epoch": 0.3329212292476157, "grad_norm": 1.28125, "learning_rate": 0.0004993244417423416, "loss": 6.1224, "mean_token_accuracy": 0.15349680185317993, "num_tokens": 7034286.0, "step": 3770 }, { "entropy": 6.201808977127075, "epoch": 0.333362769339456, "grad_norm": 1.1796875, "learning_rate": 0.0004993220010475174, "loss": 6.1767, "mean_token_accuracy": 0.14727241545915604, "num_tokens": 7043957.0, "step": 3775 }, { "entropy": 6.2423442840576175, "epoch": 0.33380430943129635, "grad_norm": 1.109375, "learning_rate": 0.0004993195559583526, "loss": 6.091, "mean_token_accuracy": 0.15680659636855127, "num_tokens": 7053170.0, "step": 3780 }, { "entropy": 6.163934850692749, "epoch": 0.3342458495231367, "grad_norm": 1.0859375, "learning_rate": 0.0004993171064748954, "loss": 6.0909, "mean_token_accuracy": 0.15337430387735368, "num_tokens": 7062469.0, "step": 3785 }, { "entropy": 6.245288276672364, "epoch": 0.334687389614977, "grad_norm": 1.109375, "learning_rate": 0.0004993146525971937, "loss": 6.0751, "mean_token_accuracy": 0.15703266561031343, "num_tokens": 7071463.0, "step": 3790 }, { "entropy": 6.30460000038147, "epoch": 0.3351289297068174, "grad_norm": 1.03125, "learning_rate": 0.0004993121943252955, "loss": 6.1685, "mean_token_accuracy": 0.1484996944665909, "num_tokens": 7081574.0, "step": 3795 }, { "entropy": 6.146984243392945, "epoch": 0.33557046979865773, "grad_norm": 1.1171875, "learning_rate": 0.0004993097316592489, "loss": 6.0826, "mean_token_accuracy": 0.15749146193265914, "num_tokens": 7090835.0, "step": 3800 }, { "entropy": 6.072944211959839, "epoch": 0.33601200989049806, "grad_norm": 1.5859375, "learning_rate": 0.0004993072645991023, "loss": 6.0092, "mean_token_accuracy": 0.14467609971761702, "num_tokens": 7100521.0, "step": 3805 }, { "entropy": 6.172339153289795, "epoch": 0.3364535499823384, "grad_norm": 1.2421875, "learning_rate": 0.000499304793144904, "loss": 6.108, "mean_token_accuracy": 0.15104963332414628, "num_tokens": 7109011.0, "step": 3810 }, { "entropy": 6.258471536636352, "epoch": 0.3368950900741787, "grad_norm": 1.109375, "learning_rate": 0.0004993023172967022, "loss": 6.1451, "mean_token_accuracy": 0.14512295573949813, "num_tokens": 7119128.0, "step": 3815 }, { "entropy": 6.266326761245727, "epoch": 0.33733663016601906, "grad_norm": 1.1328125, "learning_rate": 0.0004992998370545458, "loss": 6.2073, "mean_token_accuracy": 0.14363647550344466, "num_tokens": 7128313.0, "step": 3820 }, { "entropy": 6.234633827209473, "epoch": 0.3377781702578594, "grad_norm": 1.21875, "learning_rate": 0.0004992973524184831, "loss": 6.1686, "mean_token_accuracy": 0.14797609224915503, "num_tokens": 7137567.0, "step": 3825 }, { "entropy": 6.209169197082519, "epoch": 0.3382197103496998, "grad_norm": 1.265625, "learning_rate": 0.0004992948633885627, "loss": 6.1355, "mean_token_accuracy": 0.15170362889766692, "num_tokens": 7147254.0, "step": 3830 }, { "entropy": 6.123815822601318, "epoch": 0.3386612504415401, "grad_norm": 1.125, "learning_rate": 0.0004992923699648335, "loss": 6.055, "mean_token_accuracy": 0.15891509354114533, "num_tokens": 7156215.0, "step": 3835 }, { "entropy": 6.286255836486816, "epoch": 0.33910279053338044, "grad_norm": 1.2265625, "learning_rate": 0.0004992898721473445, "loss": 6.1341, "mean_token_accuracy": 0.14742862731218337, "num_tokens": 7165653.0, "step": 3840 }, { "entropy": 6.107093811035156, "epoch": 0.33954433062522077, "grad_norm": 1.2265625, "learning_rate": 0.0004992873699361444, "loss": 6.0116, "mean_token_accuracy": 0.15364348739385605, "num_tokens": 7174797.0, "step": 3845 }, { "entropy": 6.231772947311401, "epoch": 0.3399858707170611, "grad_norm": 1.1015625, "learning_rate": 0.0004992848633312822, "loss": 6.076, "mean_token_accuracy": 0.15550567209720612, "num_tokens": 7184040.0, "step": 3850 }, { "entropy": 6.206028461456299, "epoch": 0.34042741080890143, "grad_norm": 1.21875, "learning_rate": 0.0004992823523328071, "loss": 6.0162, "mean_token_accuracy": 0.156949782371521, "num_tokens": 7193533.0, "step": 3855 }, { "entropy": 6.040725946426392, "epoch": 0.34086895090074176, "grad_norm": 1.4296875, "learning_rate": 0.0004992798369407684, "loss": 6.1271, "mean_token_accuracy": 0.1540011927485466, "num_tokens": 7203355.0, "step": 3860 }, { "entropy": 6.231026697158813, "epoch": 0.34131049099258215, "grad_norm": 1.140625, "learning_rate": 0.0004992773171552152, "loss": 6.0662, "mean_token_accuracy": 0.15278246700763704, "num_tokens": 7212569.0, "step": 3865 }, { "entropy": 6.1147346019744875, "epoch": 0.3417520310844225, "grad_norm": 1.1015625, "learning_rate": 0.0004992747929761968, "loss": 6.0862, "mean_token_accuracy": 0.1568584769964218, "num_tokens": 7221886.0, "step": 3870 }, { "entropy": 6.20992078781128, "epoch": 0.3421935711762628, "grad_norm": 1.234375, "learning_rate": 0.0004992722644037628, "loss": 6.1399, "mean_token_accuracy": 0.14878576919436454, "num_tokens": 7232065.0, "step": 3875 }, { "entropy": 6.2117961883544925, "epoch": 0.34263511126810314, "grad_norm": 0.98828125, "learning_rate": 0.0004992697314379628, "loss": 6.0802, "mean_token_accuracy": 0.1546872690320015, "num_tokens": 7242287.0, "step": 3880 }, { "entropy": 6.163637351989746, "epoch": 0.3430766513599435, "grad_norm": 1.203125, "learning_rate": 0.0004992671940788462, "loss": 6.0744, "mean_token_accuracy": 0.15107770562171935, "num_tokens": 7250638.0, "step": 3885 }, { "entropy": 6.146958065032959, "epoch": 0.3435181914517838, "grad_norm": 1.1171875, "learning_rate": 0.0004992646523264628, "loss": 6.0492, "mean_token_accuracy": 0.1611901268362999, "num_tokens": 7259526.0, "step": 3890 }, { "entropy": 6.119075059890747, "epoch": 0.34395973154362414, "grad_norm": 1.125, "learning_rate": 0.0004992621061808625, "loss": 6.0604, "mean_token_accuracy": 0.15892861932516097, "num_tokens": 7269260.0, "step": 3895 }, { "entropy": 6.161959505081176, "epoch": 0.3444012716354645, "grad_norm": 1.1484375, "learning_rate": 0.000499259555642095, "loss": 6.0786, "mean_token_accuracy": 0.15186367481946944, "num_tokens": 7279061.0, "step": 3900 }, { "entropy": 6.21967043876648, "epoch": 0.34484281172730485, "grad_norm": 1.1328125, "learning_rate": 0.0004992570007102104, "loss": 6.0831, "mean_token_accuracy": 0.15683933049440385, "num_tokens": 7289721.0, "step": 3905 }, { "entropy": 6.214530658721924, "epoch": 0.3452843518191452, "grad_norm": 1.265625, "learning_rate": 0.0004992544413852587, "loss": 6.2136, "mean_token_accuracy": 0.14554179906845094, "num_tokens": 7299836.0, "step": 3910 }, { "entropy": 6.225806951522827, "epoch": 0.3457258919109855, "grad_norm": 1.125, "learning_rate": 0.00049925187766729, "loss": 6.2249, "mean_token_accuracy": 0.14765020608901977, "num_tokens": 7309306.0, "step": 3915 }, { "entropy": 6.249141550064087, "epoch": 0.34616743200282585, "grad_norm": 1.203125, "learning_rate": 0.0004992493095563545, "loss": 6.1238, "mean_token_accuracy": 0.16257761269807816, "num_tokens": 7318486.0, "step": 3920 }, { "entropy": 6.111485767364502, "epoch": 0.3466089720946662, "grad_norm": 1.1484375, "learning_rate": 0.0004992467370525026, "loss": 6.083, "mean_token_accuracy": 0.1557182028889656, "num_tokens": 7327764.0, "step": 3925 }, { "entropy": 6.248839044570923, "epoch": 0.3470505121865065, "grad_norm": 1.140625, "learning_rate": 0.0004992441601557848, "loss": 6.168, "mean_token_accuracy": 0.15208059847354888, "num_tokens": 7337118.0, "step": 3930 }, { "entropy": 6.197002506256103, "epoch": 0.3474920522783469, "grad_norm": 1.203125, "learning_rate": 0.0004992415788662514, "loss": 6.0958, "mean_token_accuracy": 0.1525207430124283, "num_tokens": 7345973.0, "step": 3935 }, { "entropy": 6.194469833374024, "epoch": 0.34793359237018723, "grad_norm": 1.0546875, "learning_rate": 0.0004992389931839529, "loss": 6.0928, "mean_token_accuracy": 0.1534672871232033, "num_tokens": 7355776.0, "step": 3940 }, { "entropy": 6.168269205093384, "epoch": 0.34837513246202756, "grad_norm": 1.1640625, "learning_rate": 0.0004992364031089401, "loss": 6.1161, "mean_token_accuracy": 0.1555505856871605, "num_tokens": 7364820.0, "step": 3945 }, { "entropy": 6.2373566150665285, "epoch": 0.3488166725538679, "grad_norm": 1.234375, "learning_rate": 0.0004992338086412636, "loss": 6.1218, "mean_token_accuracy": 0.14930969327688218, "num_tokens": 7373881.0, "step": 3950 }, { "entropy": 6.135814476013183, "epoch": 0.3492582126457082, "grad_norm": 1.2109375, "learning_rate": 0.0004992312097809744, "loss": 6.0892, "mean_token_accuracy": 0.15562115162611007, "num_tokens": 7383095.0, "step": 3955 }, { "entropy": 6.195784330368042, "epoch": 0.34969975273754855, "grad_norm": 1.125, "learning_rate": 0.0004992286065281234, "loss": 6.1216, "mean_token_accuracy": 0.15000923871994018, "num_tokens": 7392702.0, "step": 3960 }, { "entropy": 6.247405576705932, "epoch": 0.3501412928293889, "grad_norm": 1.1015625, "learning_rate": 0.0004992259988827614, "loss": 6.2769, "mean_token_accuracy": 0.14470015615224838, "num_tokens": 7402560.0, "step": 3965 }, { "entropy": 6.284795522689819, "epoch": 0.35058283292122927, "grad_norm": 1.125, "learning_rate": 0.0004992233868449397, "loss": 6.051, "mean_token_accuracy": 0.15953975468873977, "num_tokens": 7412375.0, "step": 3970 }, { "entropy": 6.195076513290405, "epoch": 0.3510243730130696, "grad_norm": 1.0625, "learning_rate": 0.0004992207704147093, "loss": 6.0928, "mean_token_accuracy": 0.15187776535749437, "num_tokens": 7420734.0, "step": 3975 }, { "entropy": 6.101401853561401, "epoch": 0.35146591310490993, "grad_norm": 1.109375, "learning_rate": 0.0004992181495921216, "loss": 6.0919, "mean_token_accuracy": 0.15785269439220428, "num_tokens": 7430156.0, "step": 3980 }, { "entropy": 6.222190999984742, "epoch": 0.35190745319675026, "grad_norm": 1.15625, "learning_rate": 0.0004992155243772277, "loss": 6.1007, "mean_token_accuracy": 0.1574430137872696, "num_tokens": 7439189.0, "step": 3985 }, { "entropy": 6.2206672668457035, "epoch": 0.3523489932885906, "grad_norm": 1.0703125, "learning_rate": 0.0004992128947700795, "loss": 6.1249, "mean_token_accuracy": 0.1455397441983223, "num_tokens": 7448768.0, "step": 3990 }, { "entropy": 6.180650806427002, "epoch": 0.3527905333804309, "grad_norm": 1.1171875, "learning_rate": 0.000499210260770728, "loss": 6.064, "mean_token_accuracy": 0.15497558265924455, "num_tokens": 7458468.0, "step": 3995 }, { "entropy": 6.150880622863769, "epoch": 0.35323207347227126, "grad_norm": 1.1328125, "learning_rate": 0.000499207622379225, "loss": 6.1475, "mean_token_accuracy": 0.1480465464293957, "num_tokens": 7467351.0, "step": 4000 }, { "entropy": 6.2564455509185795, "epoch": 0.35367361356411164, "grad_norm": 1.0703125, "learning_rate": 0.0004992049795956222, "loss": 6.1012, "mean_token_accuracy": 0.14711347222328186, "num_tokens": 7477116.0, "step": 4005 }, { "entropy": 6.234343814849853, "epoch": 0.354115153655952, "grad_norm": 1.2734375, "learning_rate": 0.0004992023324199715, "loss": 6.0249, "mean_token_accuracy": 0.15656134784221648, "num_tokens": 7485216.0, "step": 4010 }, { "entropy": 6.075209522247315, "epoch": 0.3545566937477923, "grad_norm": 1.03125, "learning_rate": 0.0004991996808523245, "loss": 6.1798, "mean_token_accuracy": 0.1469147637486458, "num_tokens": 7495084.0, "step": 4015 }, { "entropy": 6.306225442886353, "epoch": 0.35499823383963264, "grad_norm": 1.09375, "learning_rate": 0.0004991970248927332, "loss": 6.1973, "mean_token_accuracy": 0.14604026302695275, "num_tokens": 7503219.0, "step": 4020 }, { "entropy": 6.196821165084839, "epoch": 0.35543977393147297, "grad_norm": 1.234375, "learning_rate": 0.0004991943645412498, "loss": 5.9766, "mean_token_accuracy": 0.1642112761735916, "num_tokens": 7511961.0, "step": 4025 }, { "entropy": 5.985270690917969, "epoch": 0.3558813140233133, "grad_norm": 1.1015625, "learning_rate": 0.0004991916997979263, "loss": 6.1096, "mean_token_accuracy": 0.1546097069978714, "num_tokens": 7521927.0, "step": 4030 }, { "entropy": 6.103000640869141, "epoch": 0.35632285411515363, "grad_norm": 1.171875, "learning_rate": 0.0004991890306628149, "loss": 6.0451, "mean_token_accuracy": 0.15554122179746627, "num_tokens": 7531093.0, "step": 4035 }, { "entropy": 6.198269557952881, "epoch": 0.356764394206994, "grad_norm": 1.125, "learning_rate": 0.0004991863571359678, "loss": 6.0078, "mean_token_accuracy": 0.15710966363549234, "num_tokens": 7539813.0, "step": 4040 }, { "entropy": 6.244335699081421, "epoch": 0.35720593429883435, "grad_norm": 1.3125, "learning_rate": 0.0004991836792174376, "loss": 6.1875, "mean_token_accuracy": 0.1476308137178421, "num_tokens": 7548954.0, "step": 4045 }, { "entropy": 6.100079536437988, "epoch": 0.3576474743906747, "grad_norm": 1.09375, "learning_rate": 0.0004991809969072765, "loss": 6.0322, "mean_token_accuracy": 0.1548540085554123, "num_tokens": 7558275.0, "step": 4050 }, { "entropy": 6.13811764717102, "epoch": 0.358089014482515, "grad_norm": 1.234375, "learning_rate": 0.0004991783102055371, "loss": 6.0688, "mean_token_accuracy": 0.15873494520783424, "num_tokens": 7567870.0, "step": 4055 }, { "entropy": 6.249936914443969, "epoch": 0.35853055457435534, "grad_norm": 1.1015625, "learning_rate": 0.0004991756191122723, "loss": 6.0385, "mean_token_accuracy": 0.15568251237273217, "num_tokens": 7577485.0, "step": 4060 }, { "entropy": 6.142808294296264, "epoch": 0.3589720946661957, "grad_norm": 1.109375, "learning_rate": 0.0004991729236275346, "loss": 6.0888, "mean_token_accuracy": 0.150615693628788, "num_tokens": 7587398.0, "step": 4065 }, { "entropy": 6.10182294845581, "epoch": 0.359413634758036, "grad_norm": 1.125, "learning_rate": 0.0004991702237513768, "loss": 6.0896, "mean_token_accuracy": 0.1515856146812439, "num_tokens": 7596310.0, "step": 4070 }, { "entropy": 6.236705541610718, "epoch": 0.3598551748498764, "grad_norm": 1.078125, "learning_rate": 0.0004991675194838517, "loss": 6.1214, "mean_token_accuracy": 0.14487750828266144, "num_tokens": 7606971.0, "step": 4075 }, { "entropy": 6.157118654251098, "epoch": 0.3602967149417167, "grad_norm": 1.265625, "learning_rate": 0.0004991648108250125, "loss": 6.1073, "mean_token_accuracy": 0.15116591304540633, "num_tokens": 7616636.0, "step": 4080 }, { "entropy": 6.23088059425354, "epoch": 0.36073825503355705, "grad_norm": 1.1953125, "learning_rate": 0.000499162097774912, "loss": 6.1325, "mean_token_accuracy": 0.15135489255189896, "num_tokens": 7625128.0, "step": 4085 }, { "entropy": 6.168088626861572, "epoch": 0.3611797951253974, "grad_norm": 1.09375, "learning_rate": 0.0004991593803336037, "loss": 6.0464, "mean_token_accuracy": 0.15914286002516748, "num_tokens": 7634224.0, "step": 4090 }, { "entropy": 6.109174299240112, "epoch": 0.3616213352172377, "grad_norm": 1.1015625, "learning_rate": 0.0004991566585011405, "loss": 6.0204, "mean_token_accuracy": 0.1594786301255226, "num_tokens": 7643392.0, "step": 4095 }, { "entropy": 6.046089458465576, "epoch": 0.36206287530907805, "grad_norm": 1.265625, "learning_rate": 0.0004991539322775758, "loss": 6.0402, "mean_token_accuracy": 0.15390734672546386, "num_tokens": 7652491.0, "step": 4100 }, { "entropy": 6.333108568191529, "epoch": 0.3625044154009184, "grad_norm": 1.140625, "learning_rate": 0.0004991512016629632, "loss": 6.2631, "mean_token_accuracy": 0.1374726377427578, "num_tokens": 7663192.0, "step": 4105 }, { "entropy": 6.233335781097412, "epoch": 0.36294595549275877, "grad_norm": 1.140625, "learning_rate": 0.0004991484666573558, "loss": 6.0951, "mean_token_accuracy": 0.1491971492767334, "num_tokens": 7672801.0, "step": 4110 }, { "entropy": 6.150677967071533, "epoch": 0.3633874955845991, "grad_norm": 1.0859375, "learning_rate": 0.0004991457272608077, "loss": 6.1107, "mean_token_accuracy": 0.1492188058793545, "num_tokens": 7683013.0, "step": 4115 }, { "entropy": 6.215946054458618, "epoch": 0.3638290356764394, "grad_norm": 1.1484375, "learning_rate": 0.0004991429834733721, "loss": 6.1377, "mean_token_accuracy": 0.14726671427488328, "num_tokens": 7692760.0, "step": 4120 }, { "entropy": 6.212368583679199, "epoch": 0.36427057576827976, "grad_norm": 1.390625, "learning_rate": 0.000499140235295103, "loss": 6.0806, "mean_token_accuracy": 0.15191175639629365, "num_tokens": 7702005.0, "step": 4125 }, { "entropy": 6.164025735855103, "epoch": 0.3647121158601201, "grad_norm": 1.296875, "learning_rate": 0.0004991374827260542, "loss": 6.129, "mean_token_accuracy": 0.1487259477376938, "num_tokens": 7711604.0, "step": 4130 }, { "entropy": 6.113088750839234, "epoch": 0.3651536559519604, "grad_norm": 1.296875, "learning_rate": 0.0004991347257662795, "loss": 6.0471, "mean_token_accuracy": 0.153825144469738, "num_tokens": 7720434.0, "step": 4135 }, { "entropy": 6.218751907348633, "epoch": 0.36559519604380075, "grad_norm": 1.109375, "learning_rate": 0.000499131964415833, "loss": 6.0877, "mean_token_accuracy": 0.1532296895980835, "num_tokens": 7730733.0, "step": 4140 }, { "entropy": 6.158688402175903, "epoch": 0.36603673613564114, "grad_norm": 1.3828125, "learning_rate": 0.0004991291986747689, "loss": 6.1581, "mean_token_accuracy": 0.153475009649992, "num_tokens": 7740394.0, "step": 4145 }, { "entropy": 6.191565799713135, "epoch": 0.36647827622748147, "grad_norm": 1.15625, "learning_rate": 0.0004991264285431412, "loss": 6.1045, "mean_token_accuracy": 0.15052587389945984, "num_tokens": 7748919.0, "step": 4150 }, { "entropy": 6.293946027755737, "epoch": 0.3669198163193218, "grad_norm": 1.171875, "learning_rate": 0.0004991236540210041, "loss": 6.1009, "mean_token_accuracy": 0.1537343256175518, "num_tokens": 7758591.0, "step": 4155 }, { "entropy": 6.244499015808105, "epoch": 0.36736135641116213, "grad_norm": 1.15625, "learning_rate": 0.0004991208751084122, "loss": 6.1485, "mean_token_accuracy": 0.1491132453083992, "num_tokens": 7768269.0, "step": 4160 }, { "entropy": 6.249349975585938, "epoch": 0.36780289650300246, "grad_norm": 1.203125, "learning_rate": 0.0004991180918054199, "loss": 6.1767, "mean_token_accuracy": 0.1463681861758232, "num_tokens": 7776980.0, "step": 4165 }, { "entropy": 6.171654605865479, "epoch": 0.3682444365948428, "grad_norm": 1.1953125, "learning_rate": 0.0004991153041120815, "loss": 6.0811, "mean_token_accuracy": 0.15733396261930466, "num_tokens": 7786664.0, "step": 4170 }, { "entropy": 6.12093939781189, "epoch": 0.3686859766866831, "grad_norm": 1.171875, "learning_rate": 0.0004991125120284519, "loss": 5.9525, "mean_token_accuracy": 0.1650825932621956, "num_tokens": 7794981.0, "step": 4175 }, { "entropy": 6.110823106765747, "epoch": 0.3691275167785235, "grad_norm": 1.0859375, "learning_rate": 0.0004991097155545856, "loss": 6.0387, "mean_token_accuracy": 0.1560913234949112, "num_tokens": 7804850.0, "step": 4180 }, { "entropy": 6.106446075439453, "epoch": 0.36956905687036384, "grad_norm": 1.28125, "learning_rate": 0.0004991069146905374, "loss": 6.052, "mean_token_accuracy": 0.15863640755414962, "num_tokens": 7814117.0, "step": 4185 }, { "entropy": 6.183217334747314, "epoch": 0.3700105969622042, "grad_norm": 1.140625, "learning_rate": 0.0004991041094363621, "loss": 6.0928, "mean_token_accuracy": 0.15649753212928771, "num_tokens": 7823384.0, "step": 4190 }, { "entropy": 6.2000589847564695, "epoch": 0.3704521370540445, "grad_norm": 1.0703125, "learning_rate": 0.0004991012997921149, "loss": 6.0863, "mean_token_accuracy": 0.154317145049572, "num_tokens": 7834072.0, "step": 4195 }, { "entropy": 6.225698471069336, "epoch": 0.37089367714588484, "grad_norm": 1.1953125, "learning_rate": 0.0004990984857578506, "loss": 6.0718, "mean_token_accuracy": 0.1523341119289398, "num_tokens": 7843334.0, "step": 4200 }, { "entropy": 6.040940427780152, "epoch": 0.37133521723772517, "grad_norm": 1.5234375, "learning_rate": 0.0004990956673336245, "loss": 6.0132, "mean_token_accuracy": 0.15985633432865143, "num_tokens": 7853187.0, "step": 4205 }, { "entropy": 6.121350336074829, "epoch": 0.3717767573295655, "grad_norm": 1.1484375, "learning_rate": 0.0004990928445194917, "loss": 6.0836, "mean_token_accuracy": 0.1519768550992012, "num_tokens": 7862262.0, "step": 4210 }, { "entropy": 6.15352463722229, "epoch": 0.3722182974214059, "grad_norm": 1.1640625, "learning_rate": 0.0004990900173155074, "loss": 6.0184, "mean_token_accuracy": 0.15704632550477982, "num_tokens": 7870725.0, "step": 4215 }, { "entropy": 6.124208784103393, "epoch": 0.3726598375132462, "grad_norm": 1.2109375, "learning_rate": 0.0004990871857217273, "loss": 5.9447, "mean_token_accuracy": 0.15963577330112458, "num_tokens": 7879492.0, "step": 4220 }, { "entropy": 6.087597894668579, "epoch": 0.37310137760508655, "grad_norm": 1.375, "learning_rate": 0.0004990843497382066, "loss": 6.0649, "mean_token_accuracy": 0.15454517751932145, "num_tokens": 7888710.0, "step": 4225 }, { "entropy": 6.171204519271851, "epoch": 0.3735429176969269, "grad_norm": 1.109375, "learning_rate": 0.0004990815093650009, "loss": 6.0846, "mean_token_accuracy": 0.15838453769683838, "num_tokens": 7898313.0, "step": 4230 }, { "entropy": 6.163542985916138, "epoch": 0.3739844577887672, "grad_norm": 1.3203125, "learning_rate": 0.0004990786646021659, "loss": 6.0972, "mean_token_accuracy": 0.15132492929697036, "num_tokens": 7906726.0, "step": 4235 }, { "entropy": 6.143102693557739, "epoch": 0.37442599788060754, "grad_norm": 1.0859375, "learning_rate": 0.0004990758154497573, "loss": 6.0083, "mean_token_accuracy": 0.16228035539388658, "num_tokens": 7915554.0, "step": 4240 }, { "entropy": 6.224862670898437, "epoch": 0.3748675379724479, "grad_norm": 1.1875, "learning_rate": 0.0004990729619078309, "loss": 6.1357, "mean_token_accuracy": 0.15337878912687303, "num_tokens": 7924732.0, "step": 4245 }, { "entropy": 6.230613327026367, "epoch": 0.37530907806428826, "grad_norm": 1.0703125, "learning_rate": 0.0004990701039764427, "loss": 6.061, "mean_token_accuracy": 0.1594015821814537, "num_tokens": 7933808.0, "step": 4250 }, { "entropy": 6.101580095291138, "epoch": 0.3757506181561286, "grad_norm": 1.1171875, "learning_rate": 0.0004990672416556485, "loss": 6.0319, "mean_token_accuracy": 0.15845490843057633, "num_tokens": 7942558.0, "step": 4255 }, { "entropy": 6.129454803466797, "epoch": 0.3761921582479689, "grad_norm": 1.2265625, "learning_rate": 0.0004990643749455045, "loss": 6.1321, "mean_token_accuracy": 0.14685340449213982, "num_tokens": 7951951.0, "step": 4260 }, { "entropy": 6.2848834037780765, "epoch": 0.37663369833980925, "grad_norm": 1.1484375, "learning_rate": 0.0004990615038460667, "loss": 6.0247, "mean_token_accuracy": 0.1576920345425606, "num_tokens": 7960543.0, "step": 4265 }, { "entropy": 6.126399230957031, "epoch": 0.3770752384316496, "grad_norm": 1.109375, "learning_rate": 0.0004990586283573916, "loss": 6.007, "mean_token_accuracy": 0.155757275223732, "num_tokens": 7970273.0, "step": 4270 }, { "entropy": 6.158202123641968, "epoch": 0.3775167785234899, "grad_norm": 1.2109375, "learning_rate": 0.0004990557484795355, "loss": 6.1038, "mean_token_accuracy": 0.1517634019255638, "num_tokens": 7978888.0, "step": 4275 }, { "entropy": 6.186517333984375, "epoch": 0.37795831861533025, "grad_norm": 1.1953125, "learning_rate": 0.0004990528642125545, "loss": 6.0938, "mean_token_accuracy": 0.15008659660816193, "num_tokens": 7988843.0, "step": 4280 }, { "entropy": 6.1609944820404055, "epoch": 0.37839985870717063, "grad_norm": 1.171875, "learning_rate": 0.0004990499755565055, "loss": 6.0944, "mean_token_accuracy": 0.15281789302825927, "num_tokens": 7998403.0, "step": 4285 }, { "entropy": 6.190595436096191, "epoch": 0.37884139879901096, "grad_norm": 1.1015625, "learning_rate": 0.0004990470825114448, "loss": 6.1108, "mean_token_accuracy": 0.15413309782743453, "num_tokens": 8007493.0, "step": 4290 }, { "entropy": 6.112000417709351, "epoch": 0.3792829388908513, "grad_norm": 1.109375, "learning_rate": 0.0004990441850774292, "loss": 6.0581, "mean_token_accuracy": 0.1584509640932083, "num_tokens": 8016399.0, "step": 4295 }, { "entropy": 6.215829849243164, "epoch": 0.3797244789826916, "grad_norm": 1.09375, "learning_rate": 0.0004990412832545155, "loss": 6.0591, "mean_token_accuracy": 0.1544952630996704, "num_tokens": 8025891.0, "step": 4300 }, { "entropy": 6.077403736114502, "epoch": 0.38016601907453196, "grad_norm": 1.21875, "learning_rate": 0.0004990383770427603, "loss": 6.0097, "mean_token_accuracy": 0.15796091556549072, "num_tokens": 8034733.0, "step": 4305 }, { "entropy": 6.135886240005493, "epoch": 0.3806075591663723, "grad_norm": 1.109375, "learning_rate": 0.0004990354664422209, "loss": 6.0527, "mean_token_accuracy": 0.15982279628515245, "num_tokens": 8044443.0, "step": 4310 }, { "entropy": 6.160210800170899, "epoch": 0.3810490992582126, "grad_norm": 1.3125, "learning_rate": 0.0004990325514529541, "loss": 6.0403, "mean_token_accuracy": 0.16555903106927872, "num_tokens": 8053920.0, "step": 4315 }, { "entropy": 6.1733404159545895, "epoch": 0.381490639350053, "grad_norm": 1.1484375, "learning_rate": 0.0004990296320750169, "loss": 6.0242, "mean_token_accuracy": 0.15823598951101303, "num_tokens": 8063455.0, "step": 4320 }, { "entropy": 6.164728498458862, "epoch": 0.38193217944189334, "grad_norm": 1.1640625, "learning_rate": 0.0004990267083084667, "loss": 6.1096, "mean_token_accuracy": 0.15378031879663467, "num_tokens": 8073353.0, "step": 4325 }, { "entropy": 6.216868829727173, "epoch": 0.38237371953373367, "grad_norm": 1.171875, "learning_rate": 0.0004990237801533607, "loss": 6.1198, "mean_token_accuracy": 0.1568579077720642, "num_tokens": 8082164.0, "step": 4330 }, { "entropy": 6.222427320480347, "epoch": 0.382815259625574, "grad_norm": 1.21875, "learning_rate": 0.0004990208476097562, "loss": 6.126, "mean_token_accuracy": 0.15226055085659027, "num_tokens": 8090790.0, "step": 4335 }, { "entropy": 6.293479156494141, "epoch": 0.38325679971741433, "grad_norm": 1.15625, "learning_rate": 0.0004990179106777109, "loss": 6.1346, "mean_token_accuracy": 0.14660235047340392, "num_tokens": 8100412.0, "step": 4340 }, { "entropy": 6.121231889724731, "epoch": 0.38369833980925466, "grad_norm": 1.2265625, "learning_rate": 0.0004990149693572819, "loss": 6.1328, "mean_token_accuracy": 0.14986878782510757, "num_tokens": 8110002.0, "step": 4345 }, { "entropy": 6.1944701194763185, "epoch": 0.384139879901095, "grad_norm": 1.46875, "learning_rate": 0.0004990120236485271, "loss": 6.0262, "mean_token_accuracy": 0.15945157259702683, "num_tokens": 8118898.0, "step": 4350 }, { "entropy": 6.093376398086548, "epoch": 0.3845814199929354, "grad_norm": 1.1953125, "learning_rate": 0.0004990090735515043, "loss": 6.0696, "mean_token_accuracy": 0.1521653488278389, "num_tokens": 8128207.0, "step": 4355 }, { "entropy": 6.162809753417969, "epoch": 0.3850229600847757, "grad_norm": 1.21875, "learning_rate": 0.000499006119066271, "loss": 6.0659, "mean_token_accuracy": 0.15488530248403548, "num_tokens": 8137317.0, "step": 4360 }, { "entropy": 6.234450483322144, "epoch": 0.38546450017661604, "grad_norm": 1.125, "learning_rate": 0.0004990031601928854, "loss": 6.0901, "mean_token_accuracy": 0.15392402857542037, "num_tokens": 8146519.0, "step": 4365 }, { "entropy": 6.081244421005249, "epoch": 0.3859060402684564, "grad_norm": 1.1640625, "learning_rate": 0.0004990001969314051, "loss": 6.1156, "mean_token_accuracy": 0.150378455221653, "num_tokens": 8155916.0, "step": 4370 }, { "entropy": 6.188456583023071, "epoch": 0.3863475803602967, "grad_norm": 1.125, "learning_rate": 0.0004989972292818884, "loss": 6.1306, "mean_token_accuracy": 0.15578001141548156, "num_tokens": 8165251.0, "step": 4375 }, { "entropy": 6.154488277435303, "epoch": 0.38678912045213704, "grad_norm": 1.1328125, "learning_rate": 0.0004989942572443934, "loss": 6.1046, "mean_token_accuracy": 0.154154072701931, "num_tokens": 8174761.0, "step": 4380 }, { "entropy": 6.2399732112884525, "epoch": 0.38723066054397737, "grad_norm": 1.1328125, "learning_rate": 0.0004989912808189784, "loss": 6.0624, "mean_token_accuracy": 0.1586390733718872, "num_tokens": 8182980.0, "step": 4385 }, { "entropy": 6.286262083053589, "epoch": 0.38767220063581775, "grad_norm": 1.1015625, "learning_rate": 0.0004989883000057013, "loss": 6.1013, "mean_token_accuracy": 0.15117012858390808, "num_tokens": 8193306.0, "step": 4390 }, { "entropy": 6.023512887954712, "epoch": 0.3881137407276581, "grad_norm": 1.203125, "learning_rate": 0.000498985314804621, "loss": 5.9827, "mean_token_accuracy": 0.16275162994861603, "num_tokens": 8203446.0, "step": 4395 }, { "entropy": 6.129458618164063, "epoch": 0.3885552808194984, "grad_norm": 1.1015625, "learning_rate": 0.0004989823252157958, "loss": 6.0411, "mean_token_accuracy": 0.14927417337894439, "num_tokens": 8212414.0, "step": 4400 }, { "entropy": 6.107169580459595, "epoch": 0.38899682091133875, "grad_norm": 1.125, "learning_rate": 0.0004989793312392841, "loss": 5.9633, "mean_token_accuracy": 0.16139311194419861, "num_tokens": 8221509.0, "step": 4405 }, { "entropy": 6.121396112442016, "epoch": 0.3894383610031791, "grad_norm": 1.46875, "learning_rate": 0.0004989763328751448, "loss": 6.0825, "mean_token_accuracy": 0.15815991312265396, "num_tokens": 8230017.0, "step": 4410 }, { "entropy": 6.1887977600097654, "epoch": 0.3898799010950194, "grad_norm": 1.265625, "learning_rate": 0.0004989733301234365, "loss": 6.0249, "mean_token_accuracy": 0.15967779457569123, "num_tokens": 8238853.0, "step": 4415 }, { "entropy": 6.224899578094482, "epoch": 0.39032144118685974, "grad_norm": 1.078125, "learning_rate": 0.000498970322984218, "loss": 6.1023, "mean_token_accuracy": 0.1519481733441353, "num_tokens": 8248112.0, "step": 4420 }, { "entropy": 6.203225469589233, "epoch": 0.39076298127870013, "grad_norm": 1.3125, "learning_rate": 0.0004989673114575483, "loss": 6.0589, "mean_token_accuracy": 0.1536906696856022, "num_tokens": 8257166.0, "step": 4425 }, { "entropy": 6.128660726547241, "epoch": 0.39120452137054046, "grad_norm": 1.2265625, "learning_rate": 0.0004989642955434863, "loss": 6.0714, "mean_token_accuracy": 0.1515656217932701, "num_tokens": 8266229.0, "step": 4430 }, { "entropy": 6.163424253463745, "epoch": 0.3916460614623808, "grad_norm": 1.171875, "learning_rate": 0.0004989612752420912, "loss": 6.039, "mean_token_accuracy": 0.15465213656425475, "num_tokens": 8275102.0, "step": 4435 }, { "entropy": 6.022084140777588, "epoch": 0.3920876015542211, "grad_norm": 1.203125, "learning_rate": 0.000498958250553422, "loss": 5.9631, "mean_token_accuracy": 0.16303362101316451, "num_tokens": 8283847.0, "step": 4440 }, { "entropy": 6.193567132949829, "epoch": 0.39252914164606145, "grad_norm": 1.1640625, "learning_rate": 0.0004989552214775381, "loss": 6.0871, "mean_token_accuracy": 0.15131851583719252, "num_tokens": 8292622.0, "step": 4445 }, { "entropy": 6.194511032104492, "epoch": 0.3929706817379018, "grad_norm": 1.2734375, "learning_rate": 0.0004989521880144988, "loss": 5.9982, "mean_token_accuracy": 0.17004551142454147, "num_tokens": 8301026.0, "step": 4450 }, { "entropy": 6.184074640274048, "epoch": 0.3934122218297421, "grad_norm": 1.359375, "learning_rate": 0.0004989491501643635, "loss": 6.2578, "mean_token_accuracy": 0.14575279951095582, "num_tokens": 8310977.0, "step": 4455 }, { "entropy": 6.20936131477356, "epoch": 0.3938537619215825, "grad_norm": 1.0859375, "learning_rate": 0.0004989461079271916, "loss": 6.0296, "mean_token_accuracy": 0.16169211864471436, "num_tokens": 8319391.0, "step": 4460 }, { "entropy": 6.169527339935303, "epoch": 0.39429530201342283, "grad_norm": 1.1796875, "learning_rate": 0.0004989430613030429, "loss": 5.9922, "mean_token_accuracy": 0.15554805994033813, "num_tokens": 8328639.0, "step": 4465 }, { "entropy": 6.117289400100708, "epoch": 0.39473684210526316, "grad_norm": 1.2890625, "learning_rate": 0.000498940010291977, "loss": 6.096, "mean_token_accuracy": 0.15155849754810333, "num_tokens": 8338190.0, "step": 4470 }, { "entropy": 6.100512361526489, "epoch": 0.3951783821971035, "grad_norm": 1.171875, "learning_rate": 0.0004989369548940536, "loss": 6.0395, "mean_token_accuracy": 0.15572706907987593, "num_tokens": 8346874.0, "step": 4475 }, { "entropy": 6.190318775177002, "epoch": 0.3956199222889438, "grad_norm": 1.2265625, "learning_rate": 0.0004989338951093327, "loss": 6.0178, "mean_token_accuracy": 0.16222874522209169, "num_tokens": 8356446.0, "step": 4480 }, { "entropy": 6.1148622035980225, "epoch": 0.39606146238078416, "grad_norm": 1.140625, "learning_rate": 0.0004989308309378741, "loss": 6.0199, "mean_token_accuracy": 0.15970418155193328, "num_tokens": 8365694.0, "step": 4485 }, { "entropy": 6.157066297531128, "epoch": 0.3965030024726245, "grad_norm": 1.296875, "learning_rate": 0.0004989277623797379, "loss": 6.0754, "mean_token_accuracy": 0.1534503474831581, "num_tokens": 8374282.0, "step": 4490 }, { "entropy": 6.160027027130127, "epoch": 0.3969445425644649, "grad_norm": 1.15625, "learning_rate": 0.0004989246894349841, "loss": 6.0372, "mean_token_accuracy": 0.1563117504119873, "num_tokens": 8383315.0, "step": 4495 }, { "entropy": 6.214316082000733, "epoch": 0.3973860826563052, "grad_norm": 1.1796875, "learning_rate": 0.0004989216121036732, "loss": 6.0236, "mean_token_accuracy": 0.15833714008331298, "num_tokens": 8392263.0, "step": 4500 }, { "entropy": 6.106395578384399, "epoch": 0.39782762274814554, "grad_norm": 1.3671875, "learning_rate": 0.0004989185303858651, "loss": 6.0696, "mean_token_accuracy": 0.14981550127267837, "num_tokens": 8400734.0, "step": 4505 }, { "entropy": 6.156090450286865, "epoch": 0.39826916283998587, "grad_norm": 1.21875, "learning_rate": 0.0004989154442816203, "loss": 6.0793, "mean_token_accuracy": 0.15283239632844925, "num_tokens": 8410635.0, "step": 4510 }, { "entropy": 6.205539274215698, "epoch": 0.3987107029318262, "grad_norm": 1.3046875, "learning_rate": 0.0004989123537909994, "loss": 6.0664, "mean_token_accuracy": 0.15485348254442216, "num_tokens": 8420111.0, "step": 4515 }, { "entropy": 6.103611946105957, "epoch": 0.39915224302366653, "grad_norm": 1.2421875, "learning_rate": 0.0004989092589140629, "loss": 6.0177, "mean_token_accuracy": 0.1484901040792465, "num_tokens": 8429459.0, "step": 4520 }, { "entropy": 6.117227792739868, "epoch": 0.39959378311550686, "grad_norm": 1.3671875, "learning_rate": 0.0004989061596508712, "loss": 6.0403, "mean_token_accuracy": 0.16086821481585503, "num_tokens": 8438083.0, "step": 4525 }, { "entropy": 6.07325234413147, "epoch": 0.40003532320734725, "grad_norm": 1.1484375, "learning_rate": 0.0004989030560014853, "loss": 6.0506, "mean_token_accuracy": 0.15744656324386597, "num_tokens": 8447713.0, "step": 4530 }, { "entropy": 6.1523223400115965, "epoch": 0.4004768632991876, "grad_norm": 1.2421875, "learning_rate": 0.0004988999479659657, "loss": 6.0226, "mean_token_accuracy": 0.158653824031353, "num_tokens": 8457394.0, "step": 4535 }, { "entropy": 6.117883396148682, "epoch": 0.4009184033910279, "grad_norm": 1.296875, "learning_rate": 0.0004988968355443737, "loss": 5.9913, "mean_token_accuracy": 0.16181344538927078, "num_tokens": 8467333.0, "step": 4540 }, { "entropy": 6.124732971191406, "epoch": 0.40135994348286824, "grad_norm": 1.2421875, "learning_rate": 0.0004988937187367699, "loss": 6.1032, "mean_token_accuracy": 0.15110900700092317, "num_tokens": 8477530.0, "step": 4545 }, { "entropy": 6.217570829391479, "epoch": 0.4018014835747086, "grad_norm": 1.1328125, "learning_rate": 0.0004988905975432154, "loss": 6.1802, "mean_token_accuracy": 0.15269524306058885, "num_tokens": 8486861.0, "step": 4550 }, { "entropy": 6.170765733718872, "epoch": 0.4022430236665489, "grad_norm": 1.171875, "learning_rate": 0.0004988874719637715, "loss": 5.9867, "mean_token_accuracy": 0.158355513215065, "num_tokens": 8496541.0, "step": 4555 }, { "entropy": 6.052766561508179, "epoch": 0.40268456375838924, "grad_norm": 1.5234375, "learning_rate": 0.0004988843419984994, "loss": 6.064, "mean_token_accuracy": 0.15494307354092599, "num_tokens": 8505667.0, "step": 4560 }, { "entropy": 6.24229063987732, "epoch": 0.4031261038502296, "grad_norm": 1.1484375, "learning_rate": 0.0004988812076474604, "loss": 6.052, "mean_token_accuracy": 0.15319542214274406, "num_tokens": 8515133.0, "step": 4565 }, { "entropy": 6.170625066757202, "epoch": 0.40356764394206995, "grad_norm": 1.4140625, "learning_rate": 0.0004988780689107158, "loss": 6.007, "mean_token_accuracy": 0.1620650038123131, "num_tokens": 8524797.0, "step": 4570 }, { "entropy": 6.0161412239074705, "epoch": 0.4040091840339103, "grad_norm": 1.1015625, "learning_rate": 0.0004988749257883271, "loss": 6.0288, "mean_token_accuracy": 0.15016194060444832, "num_tokens": 8534667.0, "step": 4575 }, { "entropy": 6.143600559234619, "epoch": 0.4044507241257506, "grad_norm": 1.09375, "learning_rate": 0.000498871778280356, "loss": 6.0166, "mean_token_accuracy": 0.15943924337625504, "num_tokens": 8543874.0, "step": 4580 }, { "entropy": 6.1106805324554445, "epoch": 0.40489226421759095, "grad_norm": 1.1796875, "learning_rate": 0.0004988686263868641, "loss": 6.0353, "mean_token_accuracy": 0.1529652863740921, "num_tokens": 8553620.0, "step": 4585 }, { "entropy": 6.17731556892395, "epoch": 0.4053338043094313, "grad_norm": 1.2265625, "learning_rate": 0.0004988654701079131, "loss": 6.1113, "mean_token_accuracy": 0.15474483817815782, "num_tokens": 8563857.0, "step": 4590 }, { "entropy": 6.2202249526977536, "epoch": 0.4057753444012716, "grad_norm": 1.140625, "learning_rate": 0.0004988623094435649, "loss": 6.0898, "mean_token_accuracy": 0.1531184583902359, "num_tokens": 8572677.0, "step": 4595 }, { "entropy": 6.140727758407593, "epoch": 0.406216884493112, "grad_norm": 1.1796875, "learning_rate": 0.0004988591443938813, "loss": 6.016, "mean_token_accuracy": 0.15535678565502167, "num_tokens": 8581907.0, "step": 4600 }, { "entropy": 6.138199281692505, "epoch": 0.4066584245849523, "grad_norm": 1.1875, "learning_rate": 0.0004988559749589244, "loss": 6.0609, "mean_token_accuracy": 0.15539143681526185, "num_tokens": 8591701.0, "step": 4605 }, { "entropy": 6.19168872833252, "epoch": 0.40709996467679266, "grad_norm": 1.1328125, "learning_rate": 0.0004988528011387563, "loss": 6.1128, "mean_token_accuracy": 0.15955741629004477, "num_tokens": 8601054.0, "step": 4610 }, { "entropy": 6.115007019042968, "epoch": 0.407541504768633, "grad_norm": 1.2109375, "learning_rate": 0.0004988496229334392, "loss": 6.0089, "mean_token_accuracy": 0.16271338164806365, "num_tokens": 8610346.0, "step": 4615 }, { "entropy": 6.082312822341919, "epoch": 0.4079830448604733, "grad_norm": 1.3828125, "learning_rate": 0.0004988464403430352, "loss": 6.0114, "mean_token_accuracy": 0.1530932977795601, "num_tokens": 8620823.0, "step": 4620 }, { "entropy": 6.228973913192749, "epoch": 0.40842458495231365, "grad_norm": 1.109375, "learning_rate": 0.0004988432533676067, "loss": 6.1457, "mean_token_accuracy": 0.14871701523661612, "num_tokens": 8630184.0, "step": 4625 }, { "entropy": 6.265304517745972, "epoch": 0.408866125044154, "grad_norm": 1.390625, "learning_rate": 0.0004988400620072163, "loss": 6.1541, "mean_token_accuracy": 0.14276653826236724, "num_tokens": 8640064.0, "step": 4630 }, { "entropy": 6.173982954025268, "epoch": 0.40930766513599437, "grad_norm": 1.2109375, "learning_rate": 0.0004988368662619263, "loss": 6.0413, "mean_token_accuracy": 0.15715423077344895, "num_tokens": 8650503.0, "step": 4635 }, { "entropy": 6.191125011444091, "epoch": 0.4097492052278347, "grad_norm": 1.4296875, "learning_rate": 0.0004988336661317994, "loss": 6.0652, "mean_token_accuracy": 0.15463789254426957, "num_tokens": 8659125.0, "step": 4640 }, { "entropy": 6.157907199859619, "epoch": 0.41019074531967503, "grad_norm": 1.109375, "learning_rate": 0.0004988304616168984, "loss": 6.0841, "mean_token_accuracy": 0.15441161543130874, "num_tokens": 8668193.0, "step": 4645 }, { "entropy": 6.116786527633667, "epoch": 0.41063228541151536, "grad_norm": 1.21875, "learning_rate": 0.0004988272527172858, "loss": 6.0384, "mean_token_accuracy": 0.15524010509252548, "num_tokens": 8677515.0, "step": 4650 }, { "entropy": 6.21679277420044, "epoch": 0.4110738255033557, "grad_norm": 1.3515625, "learning_rate": 0.0004988240394330246, "loss": 6.0764, "mean_token_accuracy": 0.14973534047603607, "num_tokens": 8687549.0, "step": 4655 }, { "entropy": 6.157021522521973, "epoch": 0.411515365595196, "grad_norm": 1.375, "learning_rate": 0.0004988208217641778, "loss": 6.0639, "mean_token_accuracy": 0.15594548732042313, "num_tokens": 8697371.0, "step": 4660 }, { "entropy": 6.13969612121582, "epoch": 0.41195690568703636, "grad_norm": 1.5, "learning_rate": 0.0004988175997108086, "loss": 6.0735, "mean_token_accuracy": 0.15609194859862327, "num_tokens": 8707193.0, "step": 4665 }, { "entropy": 6.1602945804595945, "epoch": 0.41239844577887674, "grad_norm": 1.1953125, "learning_rate": 0.0004988143732729797, "loss": 6.0644, "mean_token_accuracy": 0.15450926274061202, "num_tokens": 8716052.0, "step": 4670 }, { "entropy": 6.081143188476562, "epoch": 0.4128399858707171, "grad_norm": 1.3671875, "learning_rate": 0.0004988111424507546, "loss": 6.0209, "mean_token_accuracy": 0.1590244859457016, "num_tokens": 8726140.0, "step": 4675 }, { "entropy": 6.134521245956421, "epoch": 0.4132815259625574, "grad_norm": 1.3515625, "learning_rate": 0.0004988079072441964, "loss": 6.0028, "mean_token_accuracy": 0.16453344523906707, "num_tokens": 8735299.0, "step": 4680 }, { "entropy": 6.157465314865112, "epoch": 0.41372306605439774, "grad_norm": 1.3046875, "learning_rate": 0.0004988046676533687, "loss": 6.0908, "mean_token_accuracy": 0.15370513945817948, "num_tokens": 8744686.0, "step": 4685 }, { "entropy": 6.140842294692993, "epoch": 0.41416460614623807, "grad_norm": 1.203125, "learning_rate": 0.0004988014236783347, "loss": 6.063, "mean_token_accuracy": 0.1632717102766037, "num_tokens": 8754942.0, "step": 4690 }, { "entropy": 6.232805824279785, "epoch": 0.4146061462380784, "grad_norm": 1.3203125, "learning_rate": 0.0004987981753191582, "loss": 6.0523, "mean_token_accuracy": 0.15246548503637314, "num_tokens": 8764054.0, "step": 4695 }, { "entropy": 6.0288361549377445, "epoch": 0.41504768632991873, "grad_norm": 1.296875, "learning_rate": 0.0004987949225759027, "loss": 5.9405, "mean_token_accuracy": 0.15906094312667846, "num_tokens": 8773050.0, "step": 4700 }, { "entropy": 6.1974766731262205, "epoch": 0.4154892264217591, "grad_norm": 1.3359375, "learning_rate": 0.0004987916654486321, "loss": 6.0544, "mean_token_accuracy": 0.1537775442004204, "num_tokens": 8782476.0, "step": 4705 }, { "entropy": 6.254334783554077, "epoch": 0.41593076651359945, "grad_norm": 1.21875, "learning_rate": 0.0004987884039374099, "loss": 6.1212, "mean_token_accuracy": 0.1502728283405304, "num_tokens": 8791147.0, "step": 4710 }, { "entropy": 6.237736749649048, "epoch": 0.4163723066054398, "grad_norm": 1.3203125, "learning_rate": 0.0004987851380423001, "loss": 6.1069, "mean_token_accuracy": 0.1571350358426571, "num_tokens": 8801151.0, "step": 4715 }, { "entropy": 6.067666149139404, "epoch": 0.4168138466972801, "grad_norm": 1.40625, "learning_rate": 0.0004987818677633668, "loss": 6.0577, "mean_token_accuracy": 0.14970119222998618, "num_tokens": 8809587.0, "step": 4720 }, { "entropy": 6.103376960754394, "epoch": 0.41725538678912044, "grad_norm": 1.3359375, "learning_rate": 0.000498778593100674, "loss": 5.94, "mean_token_accuracy": 0.15202507078647615, "num_tokens": 8818168.0, "step": 4725 }, { "entropy": 6.131261110305786, "epoch": 0.4176969268809608, "grad_norm": 1.171875, "learning_rate": 0.0004987753140542857, "loss": 6.0105, "mean_token_accuracy": 0.16188293397426606, "num_tokens": 8827477.0, "step": 4730 }, { "entropy": 6.104963493347168, "epoch": 0.4181384669728011, "grad_norm": 1.28125, "learning_rate": 0.0004987720306242664, "loss": 5.9847, "mean_token_accuracy": 0.16286925673484803, "num_tokens": 8837067.0, "step": 4735 }, { "entropy": 6.133073472976685, "epoch": 0.4185800070646415, "grad_norm": 1.3046875, "learning_rate": 0.0004987687428106803, "loss": 6.0872, "mean_token_accuracy": 0.16172488033771515, "num_tokens": 8845790.0, "step": 4740 }, { "entropy": 6.185379123687744, "epoch": 0.4190215471564818, "grad_norm": 1.5703125, "learning_rate": 0.0004987654506135917, "loss": 6.0226, "mean_token_accuracy": 0.15944662541151047, "num_tokens": 8855242.0, "step": 4745 }, { "entropy": 6.192673587799073, "epoch": 0.41946308724832215, "grad_norm": 1.265625, "learning_rate": 0.0004987621540330652, "loss": 6.0723, "mean_token_accuracy": 0.15567026063799858, "num_tokens": 8864459.0, "step": 4750 }, { "entropy": 6.19939341545105, "epoch": 0.4199046273401625, "grad_norm": 1.4296875, "learning_rate": 0.0004987588530691653, "loss": 6.1385, "mean_token_accuracy": 0.14711003005504608, "num_tokens": 8875028.0, "step": 4755 }, { "entropy": 6.135413789749146, "epoch": 0.4203461674320028, "grad_norm": 1.6640625, "learning_rate": 0.0004987555477219569, "loss": 5.9786, "mean_token_accuracy": 0.15931818783283233, "num_tokens": 8883857.0, "step": 4760 }, { "entropy": 6.110503196716309, "epoch": 0.42078770752384315, "grad_norm": 1.3125, "learning_rate": 0.0004987522379915045, "loss": 5.9705, "mean_token_accuracy": 0.15814343243837356, "num_tokens": 8893499.0, "step": 4765 }, { "entropy": 6.095895624160766, "epoch": 0.4212292476156835, "grad_norm": 1.25, "learning_rate": 0.000498748923877873, "loss": 5.9891, "mean_token_accuracy": 0.1591852620244026, "num_tokens": 8903368.0, "step": 4770 }, { "entropy": 6.126567029953003, "epoch": 0.42167078770752386, "grad_norm": 1.3984375, "learning_rate": 0.0004987456053811273, "loss": 6.0966, "mean_token_accuracy": 0.1539273589849472, "num_tokens": 8912701.0, "step": 4775 }, { "entropy": 6.1158490657806395, "epoch": 0.4221123277993642, "grad_norm": 1.2578125, "learning_rate": 0.0004987422825013325, "loss": 6.0444, "mean_token_accuracy": 0.15109995752573013, "num_tokens": 8921962.0, "step": 4780 }, { "entropy": 6.1833864688873295, "epoch": 0.4225538678912045, "grad_norm": 1.3515625, "learning_rate": 0.0004987389552385536, "loss": 6.0307, "mean_token_accuracy": 0.15406385958194732, "num_tokens": 8931923.0, "step": 4785 }, { "entropy": 6.131430625915527, "epoch": 0.42299540798304486, "grad_norm": 1.71875, "learning_rate": 0.0004987356235928558, "loss": 6.0635, "mean_token_accuracy": 0.1470622941851616, "num_tokens": 8940403.0, "step": 4790 }, { "entropy": 6.131550025939942, "epoch": 0.4234369480748852, "grad_norm": 1.1953125, "learning_rate": 0.0004987322875643044, "loss": 5.9887, "mean_token_accuracy": 0.16142310500144957, "num_tokens": 8949377.0, "step": 4795 }, { "entropy": 6.2069591045379635, "epoch": 0.4238784881667255, "grad_norm": 1.46875, "learning_rate": 0.0004987289471529647, "loss": 6.1304, "mean_token_accuracy": 0.1479768604040146, "num_tokens": 8958719.0, "step": 4800 }, { "entropy": 6.165348720550537, "epoch": 0.42432002825856585, "grad_norm": 1.234375, "learning_rate": 0.0004987256023589022, "loss": 6.1048, "mean_token_accuracy": 0.14835046380758285, "num_tokens": 8968226.0, "step": 4805 }, { "entropy": 6.13749942779541, "epoch": 0.42476156835040624, "grad_norm": 1.296875, "learning_rate": 0.0004987222531821824, "loss": 6.0528, "mean_token_accuracy": 0.14842675924301146, "num_tokens": 8976670.0, "step": 4810 }, { "entropy": 6.09095401763916, "epoch": 0.42520310844224657, "grad_norm": 1.171875, "learning_rate": 0.0004987188996228709, "loss": 5.9901, "mean_token_accuracy": 0.16065036058425902, "num_tokens": 8986185.0, "step": 4815 }, { "entropy": 6.155765628814697, "epoch": 0.4256446485340869, "grad_norm": 1.2578125, "learning_rate": 0.0004987155416810334, "loss": 5.9644, "mean_token_accuracy": 0.16488435715436936, "num_tokens": 8995124.0, "step": 4820 }, { "entropy": 6.094462633132935, "epoch": 0.42608618862592723, "grad_norm": 1.1796875, "learning_rate": 0.0004987121793567356, "loss": 6.0704, "mean_token_accuracy": 0.15586088821291924, "num_tokens": 9004380.0, "step": 4825 }, { "entropy": 6.142639970779419, "epoch": 0.42652772871776756, "grad_norm": 1.25, "learning_rate": 0.0004987088126500436, "loss": 5.963, "mean_token_accuracy": 0.16008084118366242, "num_tokens": 9013791.0, "step": 4830 }, { "entropy": 6.069281530380249, "epoch": 0.4269692688096079, "grad_norm": 1.234375, "learning_rate": 0.000498705441561023, "loss": 5.9998, "mean_token_accuracy": 0.16714757829904556, "num_tokens": 9023076.0, "step": 4835 }, { "entropy": 6.315524005889893, "epoch": 0.4274108089014482, "grad_norm": 1.21875, "learning_rate": 0.0004987020660897401, "loss": 6.1089, "mean_token_accuracy": 0.14946657419204712, "num_tokens": 9032720.0, "step": 4840 }, { "entropy": 6.199988222122192, "epoch": 0.4278523489932886, "grad_norm": 1.1953125, "learning_rate": 0.000498698686236261, "loss": 6.0577, "mean_token_accuracy": 0.1480626255273819, "num_tokens": 9042652.0, "step": 4845 }, { "entropy": 6.085298490524292, "epoch": 0.42829388908512894, "grad_norm": 1.3203125, "learning_rate": 0.0004986953020006519, "loss": 6.1376, "mean_token_accuracy": 0.14969860166311263, "num_tokens": 9052172.0, "step": 4850 }, { "entropy": 6.134743070602417, "epoch": 0.4287354291769693, "grad_norm": 1.140625, "learning_rate": 0.0004986919133829788, "loss": 5.9956, "mean_token_accuracy": 0.15985623747110367, "num_tokens": 9061798.0, "step": 4855 }, { "entropy": 6.040613985061645, "epoch": 0.4291769692688096, "grad_norm": 1.2109375, "learning_rate": 0.0004986885203833086, "loss": 5.9499, "mean_token_accuracy": 0.15626863837242128, "num_tokens": 9070429.0, "step": 4860 }, { "entropy": 6.098531293869018, "epoch": 0.42961850936064994, "grad_norm": 1.1953125, "learning_rate": 0.0004986851230017075, "loss": 6.0492, "mean_token_accuracy": 0.16342882812023163, "num_tokens": 9080200.0, "step": 4865 }, { "entropy": 6.163259077072143, "epoch": 0.43006004945249027, "grad_norm": 1.21875, "learning_rate": 0.0004986817212382419, "loss": 6.0382, "mean_token_accuracy": 0.15508455336093901, "num_tokens": 9089555.0, "step": 4870 }, { "entropy": 6.126989889144897, "epoch": 0.4305015895443306, "grad_norm": 1.234375, "learning_rate": 0.0004986783150929786, "loss": 6.0263, "mean_token_accuracy": 0.15248029232025145, "num_tokens": 9099091.0, "step": 4875 }, { "entropy": 6.172487115859985, "epoch": 0.430943129636171, "grad_norm": 1.234375, "learning_rate": 0.0004986749045659845, "loss": 6.1075, "mean_token_accuracy": 0.14817112535238267, "num_tokens": 9109008.0, "step": 4880 }, { "entropy": 6.152895927429199, "epoch": 0.4313846697280113, "grad_norm": 1.4921875, "learning_rate": 0.0004986714896573261, "loss": 5.9513, "mean_token_accuracy": 0.16084639877080917, "num_tokens": 9117882.0, "step": 4885 }, { "entropy": 6.139704322814941, "epoch": 0.43182620981985165, "grad_norm": 1.3125, "learning_rate": 0.0004986680703670704, "loss": 6.1215, "mean_token_accuracy": 0.15487379878759383, "num_tokens": 9126860.0, "step": 4890 }, { "entropy": 6.141199541091919, "epoch": 0.432267749911692, "grad_norm": 1.3828125, "learning_rate": 0.0004986646466952845, "loss": 6.0119, "mean_token_accuracy": 0.15635189563035964, "num_tokens": 9135819.0, "step": 4895 }, { "entropy": 6.10592885017395, "epoch": 0.4327092900035323, "grad_norm": 1.1640625, "learning_rate": 0.0004986612186420353, "loss": 5.9934, "mean_token_accuracy": 0.15510470867156984, "num_tokens": 9145302.0, "step": 4900 }, { "entropy": 6.044864749908447, "epoch": 0.43315083009537264, "grad_norm": 1.234375, "learning_rate": 0.0004986577862073901, "loss": 6.0575, "mean_token_accuracy": 0.15394357293844224, "num_tokens": 9154667.0, "step": 4905 }, { "entropy": 6.2363903522491455, "epoch": 0.43359237018721297, "grad_norm": 1.2578125, "learning_rate": 0.0004986543493914159, "loss": 6.098, "mean_token_accuracy": 0.14959411323070526, "num_tokens": 9164562.0, "step": 4910 }, { "entropy": 6.208088731765747, "epoch": 0.43403391027905336, "grad_norm": 1.296875, "learning_rate": 0.0004986509081941805, "loss": 6.1071, "mean_token_accuracy": 0.15276289731264114, "num_tokens": 9174872.0, "step": 4915 }, { "entropy": 6.079789733886718, "epoch": 0.4344754503708937, "grad_norm": 1.28125, "learning_rate": 0.0004986474626157507, "loss": 5.862, "mean_token_accuracy": 0.17531196177005767, "num_tokens": 9184322.0, "step": 4920 }, { "entropy": 6.058652734756469, "epoch": 0.434916990462734, "grad_norm": 1.25, "learning_rate": 0.0004986440126561945, "loss": 5.9894, "mean_token_accuracy": 0.15689075142145156, "num_tokens": 9194450.0, "step": 4925 }, { "entropy": 6.151858711242676, "epoch": 0.43535853055457435, "grad_norm": 1.4140625, "learning_rate": 0.0004986405583155792, "loss": 6.0305, "mean_token_accuracy": 0.15203123837709426, "num_tokens": 9203658.0, "step": 4930 }, { "entropy": 6.100604057312012, "epoch": 0.4358000706464147, "grad_norm": 1.1328125, "learning_rate": 0.0004986370995939725, "loss": 6.0239, "mean_token_accuracy": 0.15403898507356645, "num_tokens": 9213609.0, "step": 4935 }, { "entropy": 6.112157011032105, "epoch": 0.436241610738255, "grad_norm": 1.171875, "learning_rate": 0.0004986336364914423, "loss": 6.0414, "mean_token_accuracy": 0.15200137123465537, "num_tokens": 9222704.0, "step": 4940 }, { "entropy": 6.147404766082763, "epoch": 0.43668315083009535, "grad_norm": 1.484375, "learning_rate": 0.0004986301690080564, "loss": 6.0472, "mean_token_accuracy": 0.1494756668806076, "num_tokens": 9231599.0, "step": 4945 }, { "entropy": 6.176404428482056, "epoch": 0.43712469092193573, "grad_norm": 1.234375, "learning_rate": 0.0004986266971438826, "loss": 6.0761, "mean_token_accuracy": 0.15215079635381698, "num_tokens": 9241886.0, "step": 4950 }, { "entropy": 6.142639207839966, "epoch": 0.43756623101377606, "grad_norm": 1.34375, "learning_rate": 0.000498623220898989, "loss": 6.0651, "mean_token_accuracy": 0.1492701292037964, "num_tokens": 9251085.0, "step": 4955 }, { "entropy": 6.155983543395996, "epoch": 0.4380077711056164, "grad_norm": 1.4609375, "learning_rate": 0.0004986197402734436, "loss": 6.0272, "mean_token_accuracy": 0.15382544845342636, "num_tokens": 9259601.0, "step": 4960 }, { "entropy": 6.18567385673523, "epoch": 0.4384493111974567, "grad_norm": 1.2578125, "learning_rate": 0.0004986162552673148, "loss": 6.0699, "mean_token_accuracy": 0.15705521255731583, "num_tokens": 9268935.0, "step": 4965 }, { "entropy": 6.133043241500855, "epoch": 0.43889085128929706, "grad_norm": 1.4921875, "learning_rate": 0.0004986127658806706, "loss": 6.0813, "mean_token_accuracy": 0.15293170362710953, "num_tokens": 9277647.0, "step": 4970 }, { "entropy": 6.120587873458862, "epoch": 0.4393323913811374, "grad_norm": 1.2109375, "learning_rate": 0.0004986092721135796, "loss": 6.0199, "mean_token_accuracy": 0.15239207521080972, "num_tokens": 9286610.0, "step": 4975 }, { "entropy": 6.135066556930542, "epoch": 0.4397739314729777, "grad_norm": 1.2421875, "learning_rate": 0.0004986057739661101, "loss": 6.1032, "mean_token_accuracy": 0.1508208692073822, "num_tokens": 9295946.0, "step": 4980 }, { "entropy": 6.1115028858184814, "epoch": 0.4402154715648181, "grad_norm": 1.2109375, "learning_rate": 0.0004986022714383307, "loss": 6.0042, "mean_token_accuracy": 0.1543491631746292, "num_tokens": 9304903.0, "step": 4985 }, { "entropy": 6.15944766998291, "epoch": 0.44065701165665844, "grad_norm": 1.265625, "learning_rate": 0.0004985987645303099, "loss": 5.9185, "mean_token_accuracy": 0.16130532771348954, "num_tokens": 9313606.0, "step": 4990 }, { "entropy": 6.016908359527588, "epoch": 0.44109855174849877, "grad_norm": 1.296875, "learning_rate": 0.0004985952532421164, "loss": 6.0275, "mean_token_accuracy": 0.15621849447488784, "num_tokens": 9322815.0, "step": 4995 }, { "entropy": 6.17736701965332, "epoch": 0.4415400918403391, "grad_norm": 1.2109375, "learning_rate": 0.0004985917375738193, "loss": 5.9593, "mean_token_accuracy": 0.15536014586687089, "num_tokens": 9332630.0, "step": 5000 }, { "entropy": 6.067361211776733, "epoch": 0.44198163193217943, "grad_norm": 1.2109375, "learning_rate": 0.0004985882175254871, "loss": 5.9846, "mean_token_accuracy": 0.15410226881504058, "num_tokens": 9342216.0, "step": 5005 }, { "entropy": 6.146738815307617, "epoch": 0.44242317202401976, "grad_norm": 1.1484375, "learning_rate": 0.0004985846930971887, "loss": 6.0431, "mean_token_accuracy": 0.1559235379099846, "num_tokens": 9352295.0, "step": 5010 }, { "entropy": 6.107598447799683, "epoch": 0.4428647121158601, "grad_norm": 1.328125, "learning_rate": 0.0004985811642889937, "loss": 6.0348, "mean_token_accuracy": 0.15718846172094345, "num_tokens": 9361462.0, "step": 5015 }, { "entropy": 6.144611740112305, "epoch": 0.4433062522077005, "grad_norm": 1.234375, "learning_rate": 0.0004985776311009705, "loss": 5.9684, "mean_token_accuracy": 0.15345567613840103, "num_tokens": 9370379.0, "step": 5020 }, { "entropy": 6.103248453140258, "epoch": 0.4437477922995408, "grad_norm": 1.3046875, "learning_rate": 0.0004985740935331888, "loss": 6.0098, "mean_token_accuracy": 0.15823583900928498, "num_tokens": 9379255.0, "step": 5025 }, { "entropy": 6.085464525222778, "epoch": 0.44418933239138114, "grad_norm": 1.2421875, "learning_rate": 0.0004985705515857177, "loss": 5.99, "mean_token_accuracy": 0.14899933189153672, "num_tokens": 9389313.0, "step": 5030 }, { "entropy": 6.020923233032226, "epoch": 0.4446308724832215, "grad_norm": 1.203125, "learning_rate": 0.0004985670052586268, "loss": 5.7916, "mean_token_accuracy": 0.17029385417699813, "num_tokens": 9397778.0, "step": 5035 }, { "entropy": 6.082318449020386, "epoch": 0.4450724125750618, "grad_norm": 1.234375, "learning_rate": 0.0004985634545519853, "loss": 6.0589, "mean_token_accuracy": 0.15298160612583162, "num_tokens": 9407831.0, "step": 5040 }, { "entropy": 6.0772803783416744, "epoch": 0.44551395266690214, "grad_norm": 1.53125, "learning_rate": 0.0004985598994658629, "loss": 5.9669, "mean_token_accuracy": 0.15956881046295165, "num_tokens": 9418458.0, "step": 5045 }, { "entropy": 6.016619396209717, "epoch": 0.44595549275874247, "grad_norm": 1.21875, "learning_rate": 0.0004985563400003291, "loss": 5.8707, "mean_token_accuracy": 0.17422997653484346, "num_tokens": 9426911.0, "step": 5050 }, { "entropy": 6.14707236289978, "epoch": 0.44639703285058285, "grad_norm": 1.2421875, "learning_rate": 0.0004985527761554539, "loss": 6.0541, "mean_token_accuracy": 0.1595864400267601, "num_tokens": 9435896.0, "step": 5055 }, { "entropy": 6.207449245452881, "epoch": 0.4468385729424232, "grad_norm": 1.296875, "learning_rate": 0.000498549207931307, "loss": 6.0612, "mean_token_accuracy": 0.15011950582265854, "num_tokens": 9445272.0, "step": 5060 }, { "entropy": 6.06727409362793, "epoch": 0.4472801130342635, "grad_norm": 1.3359375, "learning_rate": 0.0004985456353279581, "loss": 6.0228, "mean_token_accuracy": 0.15027147233486177, "num_tokens": 9455377.0, "step": 5065 }, { "entropy": 6.108924150466919, "epoch": 0.44772165312610385, "grad_norm": 1.2734375, "learning_rate": 0.0004985420583454774, "loss": 6.0477, "mean_token_accuracy": 0.1528654247522354, "num_tokens": 9464918.0, "step": 5070 }, { "entropy": 6.118135070800781, "epoch": 0.4481631932179442, "grad_norm": 1.3203125, "learning_rate": 0.0004985384769839349, "loss": 6.0416, "mean_token_accuracy": 0.15041410326957702, "num_tokens": 9473322.0, "step": 5075 }, { "entropy": 6.191469526290893, "epoch": 0.4486047333097845, "grad_norm": 1.203125, "learning_rate": 0.0004985348912434008, "loss": 5.969, "mean_token_accuracy": 0.16189600080251693, "num_tokens": 9482255.0, "step": 5080 }, { "entropy": 6.092324352264404, "epoch": 0.44904627340162484, "grad_norm": 1.328125, "learning_rate": 0.0004985313011239452, "loss": 5.9959, "mean_token_accuracy": 0.1581245869398117, "num_tokens": 9491709.0, "step": 5085 }, { "entropy": 6.024058151245117, "epoch": 0.4494878134934652, "grad_norm": 1.296875, "learning_rate": 0.0004985277066256388, "loss": 5.9807, "mean_token_accuracy": 0.16034325063228608, "num_tokens": 9500594.0, "step": 5090 }, { "entropy": 6.112436008453369, "epoch": 0.44992935358530556, "grad_norm": 1.296875, "learning_rate": 0.0004985241077485515, "loss": 6.0018, "mean_token_accuracy": 0.15738717019557952, "num_tokens": 9509088.0, "step": 5095 }, { "entropy": 6.149496126174927, "epoch": 0.4503708936771459, "grad_norm": 1.4609375, "learning_rate": 0.0004985205044927541, "loss": 5.9725, "mean_token_accuracy": 0.15938366055488587, "num_tokens": 9517776.0, "step": 5100 }, { "entropy": 6.069735097885132, "epoch": 0.4508124337689862, "grad_norm": 1.2578125, "learning_rate": 0.0004985168968583173, "loss": 6.0348, "mean_token_accuracy": 0.15837667435407637, "num_tokens": 9527080.0, "step": 5105 }, { "entropy": 6.106137084960937, "epoch": 0.45125397386082655, "grad_norm": 1.265625, "learning_rate": 0.0004985132848453114, "loss": 5.8952, "mean_token_accuracy": 0.16738586127758026, "num_tokens": 9536358.0, "step": 5110 }, { "entropy": 6.04980616569519, "epoch": 0.4516955139526669, "grad_norm": 1.296875, "learning_rate": 0.0004985096684538075, "loss": 5.9813, "mean_token_accuracy": 0.15662433505058287, "num_tokens": 9545528.0, "step": 5115 }, { "entropy": 6.060785722732544, "epoch": 0.4521370540445072, "grad_norm": 1.3515625, "learning_rate": 0.0004985060476838763, "loss": 6.0113, "mean_token_accuracy": 0.15891691744327546, "num_tokens": 9554433.0, "step": 5120 }, { "entropy": 6.073312520980835, "epoch": 0.4525785941363476, "grad_norm": 1.3125, "learning_rate": 0.0004985024225355887, "loss": 5.9656, "mean_token_accuracy": 0.1543935567140579, "num_tokens": 9563932.0, "step": 5125 }, { "entropy": 6.100546646118164, "epoch": 0.45302013422818793, "grad_norm": 1.1953125, "learning_rate": 0.0004984987930090158, "loss": 5.9552, "mean_token_accuracy": 0.15897123962640763, "num_tokens": 9572829.0, "step": 5130 }, { "entropy": 6.025969839096069, "epoch": 0.45346167432002826, "grad_norm": 1.203125, "learning_rate": 0.0004984951591042285, "loss": 6.0045, "mean_token_accuracy": 0.16426790058612822, "num_tokens": 9583597.0, "step": 5135 }, { "entropy": 6.24551477432251, "epoch": 0.4539032144118686, "grad_norm": 1.25, "learning_rate": 0.0004984915208212983, "loss": 6.0912, "mean_token_accuracy": 0.15156230181455613, "num_tokens": 9593114.0, "step": 5140 }, { "entropy": 6.167973184585572, "epoch": 0.4543447545037089, "grad_norm": 1.1953125, "learning_rate": 0.0004984878781602964, "loss": 5.9722, "mean_token_accuracy": 0.1515617176890373, "num_tokens": 9601454.0, "step": 5145 }, { "entropy": 6.117550992965699, "epoch": 0.45478629459554926, "grad_norm": 1.203125, "learning_rate": 0.0004984842311212939, "loss": 6.0266, "mean_token_accuracy": 0.1547075927257538, "num_tokens": 9611994.0, "step": 5150 }, { "entropy": 6.099160146713257, "epoch": 0.4552278346873896, "grad_norm": 1.2890625, "learning_rate": 0.0004984805797043625, "loss": 5.9819, "mean_token_accuracy": 0.1554704263806343, "num_tokens": 9621318.0, "step": 5155 }, { "entropy": 6.086650943756103, "epoch": 0.45566937477923, "grad_norm": 1.3359375, "learning_rate": 0.0004984769239095736, "loss": 5.9871, "mean_token_accuracy": 0.1622908428311348, "num_tokens": 9630270.0, "step": 5160 }, { "entropy": 6.161284017562866, "epoch": 0.4561109148710703, "grad_norm": 1.1953125, "learning_rate": 0.0004984732637369989, "loss": 6.0164, "mean_token_accuracy": 0.15295830443501474, "num_tokens": 9640391.0, "step": 5165 }, { "entropy": 6.111026573181152, "epoch": 0.45655245496291064, "grad_norm": 1.828125, "learning_rate": 0.0004984695991867099, "loss": 6.0302, "mean_token_accuracy": 0.15423648655414582, "num_tokens": 9648827.0, "step": 5170 }, { "entropy": 6.066336679458618, "epoch": 0.45699399505475097, "grad_norm": 1.6484375, "learning_rate": 0.0004984659302587788, "loss": 5.9651, "mean_token_accuracy": 0.1540757015347481, "num_tokens": 9657940.0, "step": 5175 }, { "entropy": 6.132708024978638, "epoch": 0.4574355351465913, "grad_norm": 1.25, "learning_rate": 0.000498462256953277, "loss": 6.0864, "mean_token_accuracy": 0.15138714611530305, "num_tokens": 9668296.0, "step": 5180 }, { "entropy": 6.187513446807861, "epoch": 0.45787707523843163, "grad_norm": 1.421875, "learning_rate": 0.0004984585792702767, "loss": 6.0401, "mean_token_accuracy": 0.14838093519210815, "num_tokens": 9677914.0, "step": 5185 }, { "entropy": 6.183677721023559, "epoch": 0.45831861533027196, "grad_norm": 1.34375, "learning_rate": 0.0004984548972098501, "loss": 6.174, "mean_token_accuracy": 0.13970668166875838, "num_tokens": 9687339.0, "step": 5190 }, { "entropy": 6.133589363098144, "epoch": 0.45876015542211235, "grad_norm": 1.546875, "learning_rate": 0.000498451210772069, "loss": 5.9454, "mean_token_accuracy": 0.16496401354670526, "num_tokens": 9696889.0, "step": 5195 }, { "entropy": 6.0567436695098875, "epoch": 0.4592016955139527, "grad_norm": 1.4375, "learning_rate": 0.0004984475199570058, "loss": 5.9136, "mean_token_accuracy": 0.1610390767455101, "num_tokens": 9705498.0, "step": 5200 }, { "entropy": 6.099144554138183, "epoch": 0.459643235605793, "grad_norm": 2.046875, "learning_rate": 0.0004984438247647329, "loss": 5.9902, "mean_token_accuracy": 0.15816803127527237, "num_tokens": 9713936.0, "step": 5205 }, { "entropy": 6.152586174011231, "epoch": 0.46008477569763334, "grad_norm": 1.3671875, "learning_rate": 0.0004984401251953223, "loss": 6.06, "mean_token_accuracy": 0.15645882338285447, "num_tokens": 9723924.0, "step": 5210 }, { "entropy": 6.179743766784668, "epoch": 0.4605263157894737, "grad_norm": 1.21875, "learning_rate": 0.0004984364212488469, "loss": 6.0873, "mean_token_accuracy": 0.15282048285007477, "num_tokens": 9733427.0, "step": 5215 }, { "entropy": 6.035888767242431, "epoch": 0.460967855881314, "grad_norm": 1.3828125, "learning_rate": 0.0004984327129253789, "loss": 5.8837, "mean_token_accuracy": 0.16696648448705673, "num_tokens": 9742054.0, "step": 5220 }, { "entropy": 6.049908781051636, "epoch": 0.46140939597315433, "grad_norm": 1.25, "learning_rate": 0.0004984290002249914, "loss": 6.063, "mean_token_accuracy": 0.1488596171140671, "num_tokens": 9752447.0, "step": 5225 }, { "entropy": 6.131543159484863, "epoch": 0.4618509360649947, "grad_norm": 1.5, "learning_rate": 0.0004984252831477567, "loss": 5.8886, "mean_token_accuracy": 0.16800648123025894, "num_tokens": 9760878.0, "step": 5230 }, { "entropy": 5.97862868309021, "epoch": 0.46229247615683505, "grad_norm": 1.2890625, "learning_rate": 0.0004984215616937477, "loss": 5.9523, "mean_token_accuracy": 0.1603931352496147, "num_tokens": 9770200.0, "step": 5235 }, { "entropy": 6.214956617355346, "epoch": 0.4627340162486754, "grad_norm": 1.2265625, "learning_rate": 0.0004984178358630374, "loss": 6.0469, "mean_token_accuracy": 0.15920519679784775, "num_tokens": 9780303.0, "step": 5240 }, { "entropy": 6.130382776260376, "epoch": 0.4631755563405157, "grad_norm": 1.296875, "learning_rate": 0.0004984141056556989, "loss": 5.9529, "mean_token_accuracy": 0.16305534839630126, "num_tokens": 9790248.0, "step": 5245 }, { "entropy": 6.11019229888916, "epoch": 0.46361709643235605, "grad_norm": 1.3046875, "learning_rate": 0.0004984103710718051, "loss": 6.0503, "mean_token_accuracy": 0.1460764303803444, "num_tokens": 9799345.0, "step": 5250 }, { "entropy": 6.091036796569824, "epoch": 0.4640586365241964, "grad_norm": 1.390625, "learning_rate": 0.000498406632111429, "loss": 5.9308, "mean_token_accuracy": 0.15862552225589752, "num_tokens": 9808561.0, "step": 5255 }, { "entropy": 6.106261253356934, "epoch": 0.4645001766160367, "grad_norm": 1.2421875, "learning_rate": 0.0004984028887746443, "loss": 5.9739, "mean_token_accuracy": 0.16280067563056946, "num_tokens": 9818324.0, "step": 5260 }, { "entropy": 5.984898710250855, "epoch": 0.4649417167078771, "grad_norm": 1.4296875, "learning_rate": 0.0004983991410615239, "loss": 5.9645, "mean_token_accuracy": 0.16320008635520936, "num_tokens": 9827935.0, "step": 5265 }, { "entropy": 6.089527177810669, "epoch": 0.4653832567997174, "grad_norm": 1.3359375, "learning_rate": 0.0004983953889721414, "loss": 5.9782, "mean_token_accuracy": 0.15890799909830094, "num_tokens": 9837118.0, "step": 5270 }, { "entropy": 6.149358510971069, "epoch": 0.46582479689155776, "grad_norm": 1.4375, "learning_rate": 0.0004983916325065703, "loss": 6.0455, "mean_token_accuracy": 0.1533641681075096, "num_tokens": 9846197.0, "step": 5275 }, { "entropy": 6.196761894226074, "epoch": 0.4662663369833981, "grad_norm": 1.2421875, "learning_rate": 0.0004983878716648842, "loss": 6.107, "mean_token_accuracy": 0.14858163744211197, "num_tokens": 9856120.0, "step": 5280 }, { "entropy": 6.119464254379272, "epoch": 0.4667078770752384, "grad_norm": 1.15625, "learning_rate": 0.0004983841064471567, "loss": 6.056, "mean_token_accuracy": 0.1537718027830124, "num_tokens": 9865599.0, "step": 5285 }, { "entropy": 6.096899557113647, "epoch": 0.46714941716707875, "grad_norm": 1.203125, "learning_rate": 0.0004983803368534617, "loss": 5.9688, "mean_token_accuracy": 0.15628019720315933, "num_tokens": 9876471.0, "step": 5290 }, { "entropy": 6.196185064315796, "epoch": 0.4675909572589191, "grad_norm": 1.4453125, "learning_rate": 0.0004983765628838728, "loss": 6.0124, "mean_token_accuracy": 0.15632506608963012, "num_tokens": 9887680.0, "step": 5295 }, { "entropy": 6.205327701568604, "epoch": 0.46803249735075947, "grad_norm": 1.59375, "learning_rate": 0.0004983727845384641, "loss": 6.0798, "mean_token_accuracy": 0.15046066045761108, "num_tokens": 9897366.0, "step": 5300 }, { "entropy": 6.112229824066162, "epoch": 0.4684740374425998, "grad_norm": 1.328125, "learning_rate": 0.0004983690018173096, "loss": 6.0473, "mean_token_accuracy": 0.15282203108072281, "num_tokens": 9907437.0, "step": 5305 }, { "entropy": 6.03530478477478, "epoch": 0.46891557753444013, "grad_norm": 1.3125, "learning_rate": 0.0004983652147204834, "loss": 5.8493, "mean_token_accuracy": 0.16888897120952606, "num_tokens": 9915766.0, "step": 5310 }, { "entropy": 6.039136505126953, "epoch": 0.46935711762628046, "grad_norm": 1.328125, "learning_rate": 0.0004983614232480598, "loss": 6.0557, "mean_token_accuracy": 0.1539039731025696, "num_tokens": 9925389.0, "step": 5315 }, { "entropy": 6.1289163589477536, "epoch": 0.4697986577181208, "grad_norm": 1.234375, "learning_rate": 0.0004983576274001127, "loss": 6.0973, "mean_token_accuracy": 0.15069840773940085, "num_tokens": 9935798.0, "step": 5320 }, { "entropy": 6.164807653427124, "epoch": 0.4702401978099611, "grad_norm": 1.2578125, "learning_rate": 0.000498353827176717, "loss": 5.9409, "mean_token_accuracy": 0.16221913695335388, "num_tokens": 9945579.0, "step": 5325 }, { "entropy": 6.1347403049469, "epoch": 0.4706817379018015, "grad_norm": 1.328125, "learning_rate": 0.0004983500225779466, "loss": 5.9683, "mean_token_accuracy": 0.15579652935266494, "num_tokens": 9955248.0, "step": 5330 }, { "entropy": 6.085616779327393, "epoch": 0.47112327799364184, "grad_norm": 1.2890625, "learning_rate": 0.0004983462136038764, "loss": 5.9757, "mean_token_accuracy": 0.16684675961732864, "num_tokens": 9965078.0, "step": 5335 }, { "entropy": 6.127761745452881, "epoch": 0.4715648180854822, "grad_norm": 1.6953125, "learning_rate": 0.0004983424002545809, "loss": 5.9417, "mean_token_accuracy": 0.16493815779685975, "num_tokens": 9973932.0, "step": 5340 }, { "entropy": 6.168215322494507, "epoch": 0.4720063581773225, "grad_norm": 1.328125, "learning_rate": 0.0004983385825301348, "loss": 6.0002, "mean_token_accuracy": 0.1548996612429619, "num_tokens": 9983315.0, "step": 5345 }, { "entropy": 6.058854579925537, "epoch": 0.47244789826916284, "grad_norm": 2.0625, "learning_rate": 0.0004983347604306129, "loss": 5.9641, "mean_token_accuracy": 0.15613725483417512, "num_tokens": 9992987.0, "step": 5350 }, { "entropy": 6.019079732894897, "epoch": 0.47288943836100317, "grad_norm": 1.453125, "learning_rate": 0.0004983309339560899, "loss": 6.0237, "mean_token_accuracy": 0.15275818705558777, "num_tokens": 10002268.0, "step": 5355 }, { "entropy": 6.169502782821655, "epoch": 0.4733309784528435, "grad_norm": 1.3359375, "learning_rate": 0.0004983271031066412, "loss": 6.0223, "mean_token_accuracy": 0.14990446120500564, "num_tokens": 10011772.0, "step": 5360 }, { "entropy": 6.224719524383545, "epoch": 0.4737725185446839, "grad_norm": 1.5234375, "learning_rate": 0.0004983232678823414, "loss": 6.0393, "mean_token_accuracy": 0.15731042325496675, "num_tokens": 10021069.0, "step": 5365 }, { "entropy": 6.063888311386108, "epoch": 0.4742140586365242, "grad_norm": 1.3125, "learning_rate": 0.0004983194282832657, "loss": 5.9557, "mean_token_accuracy": 0.161125111579895, "num_tokens": 10029706.0, "step": 5370 }, { "entropy": 5.960548067092896, "epoch": 0.47465559872836455, "grad_norm": 1.421875, "learning_rate": 0.0004983155843094895, "loss": 5.8997, "mean_token_accuracy": 0.16574549674987793, "num_tokens": 10039633.0, "step": 5375 }, { "entropy": 6.088515663146973, "epoch": 0.4750971388202049, "grad_norm": 1.8125, "learning_rate": 0.0004983117359610881, "loss": 5.9683, "mean_token_accuracy": 0.1593565970659256, "num_tokens": 10048675.0, "step": 5380 }, { "entropy": 6.142480039596558, "epoch": 0.4755386789120452, "grad_norm": 1.1640625, "learning_rate": 0.0004983078832381367, "loss": 5.9571, "mean_token_accuracy": 0.16286074072122575, "num_tokens": 10057447.0, "step": 5385 }, { "entropy": 6.040254259109497, "epoch": 0.47598021900388554, "grad_norm": 1.75, "learning_rate": 0.0004983040261407109, "loss": 6.0401, "mean_token_accuracy": 0.15268651247024537, "num_tokens": 10067020.0, "step": 5390 }, { "entropy": 6.183468341827393, "epoch": 0.47642175909572587, "grad_norm": 1.5078125, "learning_rate": 0.0004983001646688863, "loss": 6.0737, "mean_token_accuracy": 0.15229557305574418, "num_tokens": 10076466.0, "step": 5395 }, { "entropy": 6.156057071685791, "epoch": 0.47686329918756626, "grad_norm": 1.40625, "learning_rate": 0.0004982962988227383, "loss": 5.9383, "mean_token_accuracy": 0.1562927931547165, "num_tokens": 10085384.0, "step": 5400 }, { "entropy": 6.085453033447266, "epoch": 0.4773048392794066, "grad_norm": 1.3828125, "learning_rate": 0.000498292428602343, "loss": 6.0543, "mean_token_accuracy": 0.15518272593617438, "num_tokens": 10094015.0, "step": 5405 }, { "entropy": 6.131287050247193, "epoch": 0.4777463793712469, "grad_norm": 1.40625, "learning_rate": 0.0004982885540077758, "loss": 5.9489, "mean_token_accuracy": 0.16021449863910675, "num_tokens": 10103107.0, "step": 5410 }, { "entropy": 6.061293983459473, "epoch": 0.47818791946308725, "grad_norm": 1.21875, "learning_rate": 0.0004982846750391129, "loss": 5.883, "mean_token_accuracy": 0.165081886947155, "num_tokens": 10112408.0, "step": 5415 }, { "entropy": 6.103114128112793, "epoch": 0.4786294595549276, "grad_norm": 1.3125, "learning_rate": 0.0004982807916964303, "loss": 5.9587, "mean_token_accuracy": 0.15787963271141053, "num_tokens": 10121605.0, "step": 5420 }, { "entropy": 6.012549638748169, "epoch": 0.4790709996467679, "grad_norm": 1.4609375, "learning_rate": 0.000498276903979804, "loss": 5.9534, "mean_token_accuracy": 0.1598804622888565, "num_tokens": 10130484.0, "step": 5425 }, { "entropy": 5.984193134307861, "epoch": 0.47951253973860825, "grad_norm": 1.2890625, "learning_rate": 0.00049827301188931, "loss": 5.906, "mean_token_accuracy": 0.16575224399566652, "num_tokens": 10140002.0, "step": 5430 }, { "entropy": 6.203652048110962, "epoch": 0.47995407983044863, "grad_norm": 1.4140625, "learning_rate": 0.0004982691154250247, "loss": 6.0681, "mean_token_accuracy": 0.1568043977022171, "num_tokens": 10150287.0, "step": 5435 }, { "entropy": 6.098856639862061, "epoch": 0.48039561992228896, "grad_norm": 1.328125, "learning_rate": 0.0004982652145870245, "loss": 5.9027, "mean_token_accuracy": 0.16656555682420732, "num_tokens": 10160615.0, "step": 5440 }, { "entropy": 5.997969436645508, "epoch": 0.4808371600141293, "grad_norm": 1.2890625, "learning_rate": 0.0004982613093753856, "loss": 5.9701, "mean_token_accuracy": 0.16098668649792672, "num_tokens": 10170271.0, "step": 5445 }, { "entropy": 6.051625394821167, "epoch": 0.4812787001059696, "grad_norm": 1.34375, "learning_rate": 0.0004982573997901847, "loss": 5.9498, "mean_token_accuracy": 0.16217143833637238, "num_tokens": 10179663.0, "step": 5450 }, { "entropy": 6.167967081069946, "epoch": 0.48172024019780996, "grad_norm": 1.265625, "learning_rate": 0.0004982534858314982, "loss": 6.0302, "mean_token_accuracy": 0.1533224031329155, "num_tokens": 10188608.0, "step": 5455 }, { "entropy": 6.123821449279785, "epoch": 0.4821617802896503, "grad_norm": 1.34375, "learning_rate": 0.0004982495674994031, "loss": 5.9888, "mean_token_accuracy": 0.1592209592461586, "num_tokens": 10198462.0, "step": 5460 }, { "entropy": 5.982554721832275, "epoch": 0.4826033203814906, "grad_norm": 1.3359375, "learning_rate": 0.0004982456447939758, "loss": 5.9323, "mean_token_accuracy": 0.16437650620937347, "num_tokens": 10208362.0, "step": 5465 }, { "entropy": 6.075630807876587, "epoch": 0.483044860473331, "grad_norm": 1.328125, "learning_rate": 0.0004982417177152933, "loss": 5.955, "mean_token_accuracy": 0.1612927421927452, "num_tokens": 10217915.0, "step": 5470 }, { "entropy": 6.136414337158203, "epoch": 0.48348640056517134, "grad_norm": 1.484375, "learning_rate": 0.0004982377862634325, "loss": 6.0141, "mean_token_accuracy": 0.16026580333709717, "num_tokens": 10226913.0, "step": 5475 }, { "entropy": 6.0771276473999025, "epoch": 0.48392794065701167, "grad_norm": 1.28125, "learning_rate": 0.0004982338504384705, "loss": 5.9819, "mean_token_accuracy": 0.15748471468687059, "num_tokens": 10236516.0, "step": 5480 }, { "entropy": 6.1290271282196045, "epoch": 0.484369480748852, "grad_norm": 1.296875, "learning_rate": 0.0004982299102404843, "loss": 5.9977, "mean_token_accuracy": 0.15856396406888962, "num_tokens": 10245492.0, "step": 5485 }, { "entropy": 5.977679443359375, "epoch": 0.48481102084069233, "grad_norm": 1.734375, "learning_rate": 0.000498225965669551, "loss": 5.8679, "mean_token_accuracy": 0.17316461503505706, "num_tokens": 10254094.0, "step": 5490 }, { "entropy": 6.03317437171936, "epoch": 0.48525256093253266, "grad_norm": 1.4296875, "learning_rate": 0.0004982220167257482, "loss": 5.9069, "mean_token_accuracy": 0.1637963816523552, "num_tokens": 10263645.0, "step": 5495 }, { "entropy": 6.013968801498413, "epoch": 0.485694101024373, "grad_norm": 1.2578125, "learning_rate": 0.0004982180634091529, "loss": 5.8919, "mean_token_accuracy": 0.16816270351409912, "num_tokens": 10273416.0, "step": 5500 }, { "entropy": 6.0709892272949215, "epoch": 0.4861356411162134, "grad_norm": 1.25, "learning_rate": 0.0004982141057198427, "loss": 5.9987, "mean_token_accuracy": 0.15827764123678206, "num_tokens": 10283194.0, "step": 5505 }, { "entropy": 6.133906984329224, "epoch": 0.4865771812080537, "grad_norm": 1.4375, "learning_rate": 0.0004982101436578952, "loss": 5.9737, "mean_token_accuracy": 0.15922945886850357, "num_tokens": 10292188.0, "step": 5510 }, { "entropy": 6.0081017971038815, "epoch": 0.48701872129989404, "grad_norm": 1.65625, "learning_rate": 0.0004982061772233878, "loss": 5.9009, "mean_token_accuracy": 0.17360175549983978, "num_tokens": 10301539.0, "step": 5515 }, { "entropy": 6.000823640823365, "epoch": 0.4874602613917344, "grad_norm": 1.3203125, "learning_rate": 0.0004982022064163984, "loss": 5.9093, "mean_token_accuracy": 0.16490670889616013, "num_tokens": 10310379.0, "step": 5520 }, { "entropy": 6.19787015914917, "epoch": 0.4879018014835747, "grad_norm": 1.296875, "learning_rate": 0.0004981982312370047, "loss": 6.1108, "mean_token_accuracy": 0.1488179437816143, "num_tokens": 10320622.0, "step": 5525 }, { "entropy": 6.114509153366089, "epoch": 0.48834334157541504, "grad_norm": 1.171875, "learning_rate": 0.0004981942516852847, "loss": 5.9356, "mean_token_accuracy": 0.15839738100767137, "num_tokens": 10330525.0, "step": 5530 }, { "entropy": 6.041282749176025, "epoch": 0.48878488166725537, "grad_norm": 1.203125, "learning_rate": 0.0004981902677613161, "loss": 5.9671, "mean_token_accuracy": 0.1586508110165596, "num_tokens": 10340288.0, "step": 5535 }, { "entropy": 6.1574572086334225, "epoch": 0.48922642175909575, "grad_norm": 1.21875, "learning_rate": 0.0004981862794651771, "loss": 6.0482, "mean_token_accuracy": 0.15438254177570343, "num_tokens": 10349836.0, "step": 5540 }, { "entropy": 6.124531698226929, "epoch": 0.4896679618509361, "grad_norm": 1.5078125, "learning_rate": 0.0004981822867969459, "loss": 5.9835, "mean_token_accuracy": 0.1561306193470955, "num_tokens": 10359481.0, "step": 5545 }, { "entropy": 6.077211523056031, "epoch": 0.4901095019427764, "grad_norm": 1.3203125, "learning_rate": 0.0004981782897567006, "loss": 5.9606, "mean_token_accuracy": 0.16075108498334884, "num_tokens": 10369837.0, "step": 5550 }, { "entropy": 6.108371019363403, "epoch": 0.49055104203461675, "grad_norm": 1.671875, "learning_rate": 0.0004981742883445195, "loss": 5.927, "mean_token_accuracy": 0.16440120637416838, "num_tokens": 10379276.0, "step": 5555 }, { "entropy": 6.127094459533692, "epoch": 0.4909925821264571, "grad_norm": 1.4453125, "learning_rate": 0.000498170282560481, "loss": 6.0562, "mean_token_accuracy": 0.15287164598703384, "num_tokens": 10388878.0, "step": 5560 }, { "entropy": 6.158031272888183, "epoch": 0.4914341222182974, "grad_norm": 1.3359375, "learning_rate": 0.0004981662724046637, "loss": 6.0416, "mean_token_accuracy": 0.15418365895748137, "num_tokens": 10399148.0, "step": 5565 }, { "entropy": 6.072212409973145, "epoch": 0.49187566231013774, "grad_norm": 1.2265625, "learning_rate": 0.000498162257877146, "loss": 5.9705, "mean_token_accuracy": 0.16276238560676576, "num_tokens": 10409002.0, "step": 5570 }, { "entropy": 6.052406740188599, "epoch": 0.4923172024019781, "grad_norm": 1.4296875, "learning_rate": 0.0004981582389780065, "loss": 6.0182, "mean_token_accuracy": 0.15543360412120819, "num_tokens": 10418083.0, "step": 5575 }, { "entropy": 6.2516388416290285, "epoch": 0.49275874249381846, "grad_norm": 1.3671875, "learning_rate": 0.0004981542157073241, "loss": 6.0216, "mean_token_accuracy": 0.15745319724082946, "num_tokens": 10428134.0, "step": 5580 }, { "entropy": 6.102121114730835, "epoch": 0.4932002825856588, "grad_norm": 1.3984375, "learning_rate": 0.0004981501880651775, "loss": 6.0144, "mean_token_accuracy": 0.15700092390179635, "num_tokens": 10437436.0, "step": 5585 }, { "entropy": 6.011652183532715, "epoch": 0.4936418226774991, "grad_norm": 1.2734375, "learning_rate": 0.0004981461560516457, "loss": 5.9437, "mean_token_accuracy": 0.15731440335512162, "num_tokens": 10446176.0, "step": 5590 }, { "entropy": 6.185711860656738, "epoch": 0.49408336276933945, "grad_norm": 1.3125, "learning_rate": 0.0004981421196668075, "loss": 6.1086, "mean_token_accuracy": 0.15340851247310638, "num_tokens": 10455953.0, "step": 5595 }, { "entropy": 6.20690336227417, "epoch": 0.4945249028611798, "grad_norm": 1.546875, "learning_rate": 0.0004981380789107422, "loss": 5.9589, "mean_token_accuracy": 0.15672969669103623, "num_tokens": 10465129.0, "step": 5600 }, { "entropy": 6.003274393081665, "epoch": 0.4949664429530201, "grad_norm": 1.40625, "learning_rate": 0.0004981340337835287, "loss": 5.9608, "mean_token_accuracy": 0.1587096706032753, "num_tokens": 10474376.0, "step": 5605 }, { "entropy": 6.04667649269104, "epoch": 0.4954079830448605, "grad_norm": 1.78125, "learning_rate": 0.0004981299842852464, "loss": 6.02, "mean_token_accuracy": 0.1601344585418701, "num_tokens": 10484849.0, "step": 5610 }, { "entropy": 6.119048452377319, "epoch": 0.49584952313670083, "grad_norm": 1.3671875, "learning_rate": 0.0004981259304159747, "loss": 5.9104, "mean_token_accuracy": 0.17004732936620712, "num_tokens": 10493950.0, "step": 5615 }, { "entropy": 6.0551595211029055, "epoch": 0.49629106322854116, "grad_norm": 1.3671875, "learning_rate": 0.0004981218721757929, "loss": 5.877, "mean_token_accuracy": 0.16873401552438735, "num_tokens": 10502890.0, "step": 5620 }, { "entropy": 5.9873803615570065, "epoch": 0.4967326033203815, "grad_norm": 1.421875, "learning_rate": 0.0004981178095647805, "loss": 5.8707, "mean_token_accuracy": 0.160433566570282, "num_tokens": 10511702.0, "step": 5625 }, { "entropy": 6.003235149383545, "epoch": 0.4971741434122218, "grad_norm": 1.328125, "learning_rate": 0.0004981137425830171, "loss": 5.9481, "mean_token_accuracy": 0.16074998080730438, "num_tokens": 10520212.0, "step": 5630 }, { "entropy": 6.066579341888428, "epoch": 0.49761568350406216, "grad_norm": 1.359375, "learning_rate": 0.0004981096712305825, "loss": 5.971, "mean_token_accuracy": 0.1613484501838684, "num_tokens": 10529652.0, "step": 5635 }, { "entropy": 6.099881601333618, "epoch": 0.4980572235959025, "grad_norm": 1.296875, "learning_rate": 0.0004981055955075561, "loss": 6.0225, "mean_token_accuracy": 0.158574703335762, "num_tokens": 10537980.0, "step": 5640 }, { "entropy": 6.089739847183227, "epoch": 0.4984987636877429, "grad_norm": 1.3125, "learning_rate": 0.0004981015154140181, "loss": 5.8995, "mean_token_accuracy": 0.16764541566371918, "num_tokens": 10546854.0, "step": 5645 }, { "entropy": 5.998034429550171, "epoch": 0.4989403037795832, "grad_norm": 1.359375, "learning_rate": 0.0004980974309500483, "loss": 5.923, "mean_token_accuracy": 0.16142944097518921, "num_tokens": 10555691.0, "step": 5650 }, { "entropy": 6.080042123794556, "epoch": 0.49938184387142354, "grad_norm": 1.90625, "learning_rate": 0.0004980933421157267, "loss": 6.0191, "mean_token_accuracy": 0.1503999724984169, "num_tokens": 10564750.0, "step": 5655 }, { "entropy": 6.1378124237060545, "epoch": 0.49982338396326387, "grad_norm": 1.3203125, "learning_rate": 0.0004980892489111334, "loss": 6.0389, "mean_token_accuracy": 0.15747978240251542, "num_tokens": 10573515.0, "step": 5660 }, { "entropy": 6.055855131149292, "epoch": 0.5002649240551043, "grad_norm": 1.3515625, "learning_rate": 0.0004980851513363486, "loss": 5.9161, "mean_token_accuracy": 0.1634524017572403, "num_tokens": 10582730.0, "step": 5665 }, { "entropy": 5.963370323181152, "epoch": 0.5007064641469445, "grad_norm": 5.15625, "learning_rate": 0.0004980810493914526, "loss": 5.8141, "mean_token_accuracy": 0.18174145370721817, "num_tokens": 10591267.0, "step": 5670 }, { "entropy": 6.040016841888428, "epoch": 0.5011480042387849, "grad_norm": 1.3046875, "learning_rate": 0.0004980769430765256, "loss": 5.918, "mean_token_accuracy": 0.15779468268156052, "num_tokens": 10599776.0, "step": 5675 }, { "entropy": 6.018471956253052, "epoch": 0.5015895443306252, "grad_norm": 2.03125, "learning_rate": 0.0004980728323916484, "loss": 5.9294, "mean_token_accuracy": 0.15198549777269363, "num_tokens": 10609670.0, "step": 5680 }, { "entropy": 6.0253712177276615, "epoch": 0.5020310844224656, "grad_norm": 1.21875, "learning_rate": 0.0004980687173369009, "loss": 5.8857, "mean_token_accuracy": 0.16054306030273438, "num_tokens": 10619032.0, "step": 5685 }, { "entropy": 6.13008975982666, "epoch": 0.5024726245143059, "grad_norm": 1.5390625, "learning_rate": 0.0004980645979123644, "loss": 5.9696, "mean_token_accuracy": 0.15761574804782869, "num_tokens": 10628851.0, "step": 5690 }, { "entropy": 6.1254706382751465, "epoch": 0.5029141646061462, "grad_norm": 1.3671875, "learning_rate": 0.0004980604741181192, "loss": 5.9384, "mean_token_accuracy": 0.1661163553595543, "num_tokens": 10638335.0, "step": 5695 }, { "entropy": 6.104246044158936, "epoch": 0.5033557046979866, "grad_norm": 1.265625, "learning_rate": 0.0004980563459542461, "loss": 6.0288, "mean_token_accuracy": 0.15877759456634521, "num_tokens": 10648450.0, "step": 5700 }, { "entropy": 6.129976463317871, "epoch": 0.5037972447898269, "grad_norm": 1.203125, "learning_rate": 0.0004980522134208261, "loss": 6.0499, "mean_token_accuracy": 0.15289226770401002, "num_tokens": 10658700.0, "step": 5705 }, { "entropy": 6.125590181350708, "epoch": 0.5042387848816673, "grad_norm": 1.34375, "learning_rate": 0.0004980480765179401, "loss": 6.0498, "mean_token_accuracy": 0.15848255753517151, "num_tokens": 10667952.0, "step": 5710 }, { "entropy": 6.015667390823364, "epoch": 0.5046803249735076, "grad_norm": 1.46875, "learning_rate": 0.0004980439352456692, "loss": 5.8856, "mean_token_accuracy": 0.16176477670669556, "num_tokens": 10677228.0, "step": 5715 }, { "entropy": 6.1053516387939455, "epoch": 0.505121865065348, "grad_norm": 1.203125, "learning_rate": 0.0004980397896040944, "loss": 5.974, "mean_token_accuracy": 0.16118671298027037, "num_tokens": 10686183.0, "step": 5720 }, { "entropy": 6.190397262573242, "epoch": 0.5055634051571882, "grad_norm": 1.265625, "learning_rate": 0.0004980356395932969, "loss": 5.9997, "mean_token_accuracy": 0.15813823491334916, "num_tokens": 10695293.0, "step": 5725 }, { "entropy": 6.0877281665802006, "epoch": 0.5060049452490286, "grad_norm": 1.21875, "learning_rate": 0.0004980314852133581, "loss": 5.9647, "mean_token_accuracy": 0.1628525137901306, "num_tokens": 10704853.0, "step": 5730 }, { "entropy": 6.098628950119019, "epoch": 0.506446485340869, "grad_norm": 1.2578125, "learning_rate": 0.0004980273264643594, "loss": 5.9596, "mean_token_accuracy": 0.15651024580001832, "num_tokens": 10714307.0, "step": 5735 }, { "entropy": 5.91804838180542, "epoch": 0.5068880254327093, "grad_norm": 1.2109375, "learning_rate": 0.0004980231633463822, "loss": 5.8195, "mean_token_accuracy": 0.16056904792785645, "num_tokens": 10723513.0, "step": 5740 }, { "entropy": 6.041822719573974, "epoch": 0.5073295655245497, "grad_norm": 1.453125, "learning_rate": 0.0004980189958595081, "loss": 6.009, "mean_token_accuracy": 0.15504895150661469, "num_tokens": 10732809.0, "step": 5745 }, { "entropy": 6.180126619338989, "epoch": 0.5077711056163899, "grad_norm": 1.1796875, "learning_rate": 0.0004980148240038186, "loss": 6.0285, "mean_token_accuracy": 0.16252532303333284, "num_tokens": 10742960.0, "step": 5750 }, { "entropy": 6.167002391815186, "epoch": 0.5082126457082303, "grad_norm": 1.5625, "learning_rate": 0.0004980106477793957, "loss": 6.0962, "mean_token_accuracy": 0.15922853201627732, "num_tokens": 10752835.0, "step": 5755 }, { "entropy": 6.090708923339844, "epoch": 0.5086541858000706, "grad_norm": 1.40625, "learning_rate": 0.0004980064671863209, "loss": 5.9568, "mean_token_accuracy": 0.16255878955125808, "num_tokens": 10762139.0, "step": 5760 }, { "entropy": 6.121759986877441, "epoch": 0.509095725891911, "grad_norm": 1.4296875, "learning_rate": 0.0004980022822246763, "loss": 6.0284, "mean_token_accuracy": 0.1527095004916191, "num_tokens": 10771961.0, "step": 5765 }, { "entropy": 6.189744853973389, "epoch": 0.5095372659837514, "grad_norm": 1.2421875, "learning_rate": 0.0004979980928945439, "loss": 6.0799, "mean_token_accuracy": 0.14973168522119523, "num_tokens": 10780163.0, "step": 5770 }, { "entropy": 6.200748443603516, "epoch": 0.5099788060755917, "grad_norm": 1.4921875, "learning_rate": 0.0004979938991960056, "loss": 6.0327, "mean_token_accuracy": 0.156923408806324, "num_tokens": 10788339.0, "step": 5775 }, { "entropy": 6.119477415084839, "epoch": 0.510420346167432, "grad_norm": 1.4296875, "learning_rate": 0.0004979897011291436, "loss": 6.0257, "mean_token_accuracy": 0.15160492956638336, "num_tokens": 10797315.0, "step": 5780 }, { "entropy": 6.097667026519775, "epoch": 0.5108618862592723, "grad_norm": 1.3515625, "learning_rate": 0.0004979854986940402, "loss": 5.9344, "mean_token_accuracy": 0.15801928341388702, "num_tokens": 10806864.0, "step": 5785 }, { "entropy": 6.078370189666748, "epoch": 0.5113034263511127, "grad_norm": 1.4921875, "learning_rate": 0.0004979812918907777, "loss": 5.971, "mean_token_accuracy": 0.15703734010457993, "num_tokens": 10816417.0, "step": 5790 }, { "entropy": 6.111645221710205, "epoch": 0.511744966442953, "grad_norm": 1.3984375, "learning_rate": 0.0004979770807194385, "loss": 6.0491, "mean_token_accuracy": 0.15200948417186738, "num_tokens": 10825692.0, "step": 5795 }, { "entropy": 6.125649499893188, "epoch": 0.5121865065347934, "grad_norm": 1.171875, "learning_rate": 0.0004979728651801051, "loss": 6.0184, "mean_token_accuracy": 0.1593027725815773, "num_tokens": 10834652.0, "step": 5800 }, { "entropy": 6.142418766021729, "epoch": 0.5126280466266337, "grad_norm": 1.296875, "learning_rate": 0.0004979686452728602, "loss": 6.0045, "mean_token_accuracy": 0.15610153079032899, "num_tokens": 10844491.0, "step": 5805 }, { "entropy": 6.076530647277832, "epoch": 0.513069586718474, "grad_norm": 1.3828125, "learning_rate": 0.0004979644209977863, "loss": 5.9576, "mean_token_accuracy": 0.16532048285007478, "num_tokens": 10853948.0, "step": 5810 }, { "entropy": 6.112010383605957, "epoch": 0.5135111268103144, "grad_norm": 1.515625, "learning_rate": 0.0004979601923549661, "loss": 5.9547, "mean_token_accuracy": 0.16036146879196167, "num_tokens": 10861745.0, "step": 5815 }, { "entropy": 6.1195940494537355, "epoch": 0.5139526669021547, "grad_norm": 1.2734375, "learning_rate": 0.0004979559593444826, "loss": 6.0383, "mean_token_accuracy": 0.1518269196152687, "num_tokens": 10871239.0, "step": 5820 }, { "entropy": 6.060505533218384, "epoch": 0.5143942069939951, "grad_norm": 1.3359375, "learning_rate": 0.0004979517219664187, "loss": 5.8893, "mean_token_accuracy": 0.16240942627191543, "num_tokens": 10880883.0, "step": 5825 }, { "entropy": 6.087244081497192, "epoch": 0.5148357470858353, "grad_norm": 1.34375, "learning_rate": 0.0004979474802208572, "loss": 5.9569, "mean_token_accuracy": 0.16193697676062585, "num_tokens": 10890002.0, "step": 5830 }, { "entropy": 6.015149307250977, "epoch": 0.5152772871776757, "grad_norm": 1.71875, "learning_rate": 0.0004979432341078816, "loss": 5.8864, "mean_token_accuracy": 0.15554073452949524, "num_tokens": 10898299.0, "step": 5835 }, { "entropy": 6.089050912857056, "epoch": 0.5157188272695161, "grad_norm": 1.3359375, "learning_rate": 0.0004979389836275746, "loss": 6.0774, "mean_token_accuracy": 0.14996080696582795, "num_tokens": 10907918.0, "step": 5840 }, { "entropy": 6.133493900299072, "epoch": 0.5161603673613564, "grad_norm": 1.296875, "learning_rate": 0.0004979347287800198, "loss": 5.9596, "mean_token_accuracy": 0.15501177459955215, "num_tokens": 10917318.0, "step": 5845 }, { "entropy": 6.060728597640991, "epoch": 0.5166019074531968, "grad_norm": 1.484375, "learning_rate": 0.0004979304695653005, "loss": 5.9144, "mean_token_accuracy": 0.16151465103030205, "num_tokens": 10926338.0, "step": 5850 }, { "entropy": 5.939118814468384, "epoch": 0.5170434475450371, "grad_norm": 1.3828125, "learning_rate": 0.0004979262059835001, "loss": 5.9432, "mean_token_accuracy": 0.16033163890242577, "num_tokens": 10935244.0, "step": 5855 }, { "entropy": 6.143280649185181, "epoch": 0.5174849876368774, "grad_norm": 1.3828125, "learning_rate": 0.0004979219380347021, "loss": 5.9905, "mean_token_accuracy": 0.15961788594722748, "num_tokens": 10943764.0, "step": 5860 }, { "entropy": 6.160701942443848, "epoch": 0.5179265277287177, "grad_norm": 1.3203125, "learning_rate": 0.0004979176657189901, "loss": 5.9547, "mean_token_accuracy": 0.16057219803333284, "num_tokens": 10952580.0, "step": 5865 }, { "entropy": 5.9959807872772215, "epoch": 0.5183680678205581, "grad_norm": 1.171875, "learning_rate": 0.0004979133890364477, "loss": 5.9825, "mean_token_accuracy": 0.1650010645389557, "num_tokens": 10961745.0, "step": 5870 }, { "entropy": 5.959691524505615, "epoch": 0.5188096079123985, "grad_norm": 1.296875, "learning_rate": 0.000497909107987159, "loss": 5.9087, "mean_token_accuracy": 0.15640757530927657, "num_tokens": 10971018.0, "step": 5875 }, { "entropy": 6.110277318954468, "epoch": 0.5192511480042388, "grad_norm": 1.2734375, "learning_rate": 0.0004979048225712074, "loss": 5.9408, "mean_token_accuracy": 0.1647150531411171, "num_tokens": 10980070.0, "step": 5880 }, { "entropy": 6.112304973602295, "epoch": 0.5196926880960792, "grad_norm": 1.265625, "learning_rate": 0.0004979005327886772, "loss": 5.9115, "mean_token_accuracy": 0.15777890086174012, "num_tokens": 10989090.0, "step": 5885 }, { "entropy": 6.028247594833374, "epoch": 0.5201342281879194, "grad_norm": 1.3828125, "learning_rate": 0.0004978962386396525, "loss": 5.9676, "mean_token_accuracy": 0.16010279804468155, "num_tokens": 10998652.0, "step": 5890 }, { "entropy": 5.936372900009156, "epoch": 0.5205757682797598, "grad_norm": 1.2578125, "learning_rate": 0.0004978919401242171, "loss": 5.8702, "mean_token_accuracy": 0.17004906684160231, "num_tokens": 11006323.0, "step": 5895 }, { "entropy": 5.953699111938477, "epoch": 0.5210173083716001, "grad_norm": 1.390625, "learning_rate": 0.0004978876372424553, "loss": 5.8013, "mean_token_accuracy": 0.17337664365768432, "num_tokens": 11015054.0, "step": 5900 }, { "entropy": 6.08972134590149, "epoch": 0.5214588484634405, "grad_norm": 1.2265625, "learning_rate": 0.0004978833299944515, "loss": 5.9875, "mean_token_accuracy": 0.15530108660459518, "num_tokens": 11025787.0, "step": 5905 }, { "entropy": 6.201382303237915, "epoch": 0.5219003885552809, "grad_norm": 1.2421875, "learning_rate": 0.0004978790183802901, "loss": 6.0957, "mean_token_accuracy": 0.15121813490986824, "num_tokens": 11036091.0, "step": 5910 }, { "entropy": 6.078470230102539, "epoch": 0.5223419286471211, "grad_norm": 1.328125, "learning_rate": 0.0004978747024000554, "loss": 6.0798, "mean_token_accuracy": 0.15142182558774947, "num_tokens": 11046442.0, "step": 5915 }, { "entropy": 6.137741947174073, "epoch": 0.5227834687389615, "grad_norm": 1.1875, "learning_rate": 0.0004978703820538321, "loss": 6.0767, "mean_token_accuracy": 0.15347778424620628, "num_tokens": 11056907.0, "step": 5920 }, { "entropy": 6.088362789154052, "epoch": 0.5232250088308018, "grad_norm": 1.4140625, "learning_rate": 0.0004978660573417048, "loss": 5.9226, "mean_token_accuracy": 0.165810264647007, "num_tokens": 11065298.0, "step": 5925 }, { "entropy": 6.02788200378418, "epoch": 0.5236665489226422, "grad_norm": 1.4609375, "learning_rate": 0.000497861728263758, "loss": 5.9277, "mean_token_accuracy": 0.16316543370485306, "num_tokens": 11074509.0, "step": 5930 }, { "entropy": 6.140137195587158, "epoch": 0.5241080890144825, "grad_norm": 1.4375, "learning_rate": 0.0004978573948200769, "loss": 5.9728, "mean_token_accuracy": 0.16266270875930786, "num_tokens": 11082433.0, "step": 5935 }, { "entropy": 6.069718503952027, "epoch": 0.5245496291063229, "grad_norm": 1.296875, "learning_rate": 0.0004978530570107461, "loss": 5.9336, "mean_token_accuracy": 0.16215700507164002, "num_tokens": 11090838.0, "step": 5940 }, { "entropy": 6.054668140411377, "epoch": 0.5249911691981632, "grad_norm": 1.3125, "learning_rate": 0.0004978487148358506, "loss": 5.9288, "mean_token_accuracy": 0.1637731358408928, "num_tokens": 11100048.0, "step": 5945 }, { "entropy": 5.982475614547729, "epoch": 0.5254327092900035, "grad_norm": 1.21875, "learning_rate": 0.0004978443682954756, "loss": 5.9389, "mean_token_accuracy": 0.16071433573961258, "num_tokens": 11109871.0, "step": 5950 }, { "entropy": 6.093014669418335, "epoch": 0.5258742493818439, "grad_norm": 1.171875, "learning_rate": 0.0004978400173897061, "loss": 5.924, "mean_token_accuracy": 0.1567403718829155, "num_tokens": 11119450.0, "step": 5955 }, { "entropy": 6.094651889801026, "epoch": 0.5263157894736842, "grad_norm": 1.4609375, "learning_rate": 0.0004978356621186275, "loss": 5.8755, "mean_token_accuracy": 0.17035853564739228, "num_tokens": 11128321.0, "step": 5960 }, { "entropy": 6.019528484344482, "epoch": 0.5267573295655246, "grad_norm": 1.2734375, "learning_rate": 0.0004978313024823249, "loss": 5.9397, "mean_token_accuracy": 0.15585954636335372, "num_tokens": 11138657.0, "step": 5965 }, { "entropy": 6.009906387329101, "epoch": 0.5271988696573648, "grad_norm": 1.2734375, "learning_rate": 0.0004978269384808839, "loss": 5.9209, "mean_token_accuracy": 0.16706853806972505, "num_tokens": 11147704.0, "step": 5970 }, { "entropy": 6.089446401596069, "epoch": 0.5276404097492052, "grad_norm": 1.2578125, "learning_rate": 0.0004978225701143898, "loss": 6.0138, "mean_token_accuracy": 0.1562186986207962, "num_tokens": 11156979.0, "step": 5975 }, { "entropy": 6.104236221313476, "epoch": 0.5280819498410456, "grad_norm": 1.3046875, "learning_rate": 0.0004978181973829284, "loss": 5.8985, "mean_token_accuracy": 0.15810604989528657, "num_tokens": 11165539.0, "step": 5980 }, { "entropy": 6.11846137046814, "epoch": 0.5285234899328859, "grad_norm": 1.1796875, "learning_rate": 0.0004978138202865851, "loss": 5.9758, "mean_token_accuracy": 0.1503559224307537, "num_tokens": 11175911.0, "step": 5985 }, { "entropy": 6.050204277038574, "epoch": 0.5289650300247263, "grad_norm": 1.390625, "learning_rate": 0.0004978094388254459, "loss": 5.9883, "mean_token_accuracy": 0.15720559507608414, "num_tokens": 11185365.0, "step": 5990 }, { "entropy": 6.095414018630981, "epoch": 0.5294065701165666, "grad_norm": 1.3125, "learning_rate": 0.0004978050529995965, "loss": 5.8979, "mean_token_accuracy": 0.16393667906522752, "num_tokens": 11195318.0, "step": 5995 }, { "entropy": 6.074192810058594, "epoch": 0.5298481102084069, "grad_norm": 1.234375, "learning_rate": 0.0004978006628091228, "loss": 6.0079, "mean_token_accuracy": 0.15748494416475295, "num_tokens": 11205246.0, "step": 6000 }, { "epoch": 0.5298481102084069, "eval_entropy": 5.929435700441406, "eval_loss": 5.984138011932373, "eval_mean_token_accuracy": 0.16603976909265278, "eval_num_tokens": 11205246.0, "eval_runtime": 26.1397, "eval_samples_per_second": 1351.009, "eval_steps_per_second": 168.9, "step": 6000 } ], "logging_steps": 5, "max_steps": 113230, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6542104592384e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }