{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 3820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.4903042420744896, "epoch": 0.005235944760782774, "grad_norm": 7.59375, "learning_rate": 1.1780104712041885e-06, "loss": 1.6362, "mean_token_accuracy": 0.6602190021425486, "num_tokens": 148667.0, "step": 10 }, { "entropy": 1.5531679213047027, "epoch": 0.010471889521565548, "grad_norm": 6.9375, "learning_rate": 2.486910994764398e-06, "loss": 1.7071, "mean_token_accuracy": 0.6444286152720451, "num_tokens": 289380.0, "step": 20 }, { "entropy": 1.5901738341897727, "epoch": 0.015707834282348322, "grad_norm": 5.65625, "learning_rate": 3.7958115183246074e-06, "loss": 1.6751, "mean_token_accuracy": 0.6434450890868902, "num_tokens": 433015.0, "step": 30 }, { "entropy": 1.5159442014992237, "epoch": 0.020943779043131095, "grad_norm": 5.0625, "learning_rate": 5.104712041884817e-06, "loss": 1.5104, "mean_token_accuracy": 0.6608107829466462, "num_tokens": 586196.0, "step": 40 }, { "entropy": 1.5390321850776671, "epoch": 0.026179723803913868, "grad_norm": 3.0, "learning_rate": 6.4136125654450265e-06, "loss": 1.4851, "mean_token_accuracy": 0.664670011587441, "num_tokens": 732500.0, "step": 50 }, { "entropy": 1.486197516322136, "epoch": 0.031415668564696644, "grad_norm": 2.703125, "learning_rate": 7.722513089005236e-06, "loss": 1.4375, "mean_token_accuracy": 0.6728810863569379, "num_tokens": 871796.0, "step": 60 }, { "entropy": 1.4761194687336683, "epoch": 0.036651613325479414, "grad_norm": 2.265625, "learning_rate": 9.031413612565446e-06, "loss": 1.4175, "mean_token_accuracy": 0.6700865641236305, "num_tokens": 1015760.0, "step": 70 }, { "entropy": 1.397418873384595, "epoch": 0.04188755808626219, "grad_norm": 2.25, "learning_rate": 1.0340314136125655e-05, "loss": 1.3627, "mean_token_accuracy": 0.6793193189427257, "num_tokens": 1162521.0, "step": 80 }, { "entropy": 1.4198294993489982, "epoch": 0.04712350284704497, "grad_norm": 2.25, "learning_rate": 1.1649214659685865e-05, "loss": 1.3982, "mean_token_accuracy": 0.678496933169663, "num_tokens": 1304606.0, "step": 90 }, { "entropy": 1.3636814955621959, "epoch": 0.052359447607827736, "grad_norm": 1.9140625, "learning_rate": 1.2958115183246074e-05, "loss": 1.3157, "mean_token_accuracy": 0.6834567856043577, "num_tokens": 1459672.0, "step": 100 }, { "entropy": 1.371455504372716, "epoch": 0.05759539236861051, "grad_norm": 2.578125, "learning_rate": 1.4267015706806284e-05, "loss": 1.337, "mean_token_accuracy": 0.680043394304812, "num_tokens": 1614513.0, "step": 110 }, { "entropy": 1.4191205739974975, "epoch": 0.06283133712939329, "grad_norm": 2.4375, "learning_rate": 1.5575916230366495e-05, "loss": 1.3601, "mean_token_accuracy": 0.6790978884324431, "num_tokens": 1764120.0, "step": 120 }, { "entropy": 1.4203862166032195, "epoch": 0.06806728189017607, "grad_norm": 2.421875, "learning_rate": 1.68848167539267e-05, "loss": 1.3384, "mean_token_accuracy": 0.6819259503856301, "num_tokens": 1905542.0, "step": 130 }, { "entropy": 1.4149893302470447, "epoch": 0.07330322665095883, "grad_norm": 2.484375, "learning_rate": 1.8193717277486914e-05, "loss": 1.3124, "mean_token_accuracy": 0.6826399000361562, "num_tokens": 2051546.0, "step": 140 }, { "entropy": 1.319907895848155, "epoch": 0.0785391714117416, "grad_norm": 2.046875, "learning_rate": 1.950261780104712e-05, "loss": 1.2291, "mean_token_accuracy": 0.6927984276786446, "num_tokens": 2197518.0, "step": 150 }, { "entropy": 1.3174231912940741, "epoch": 0.08377511617252438, "grad_norm": 1.9453125, "learning_rate": 2.0811518324607333e-05, "loss": 1.2863, "mean_token_accuracy": 0.6850969936698675, "num_tokens": 2348210.0, "step": 160 }, { "entropy": 1.2859620593488217, "epoch": 0.08901106093330716, "grad_norm": 2.5, "learning_rate": 2.212041884816754e-05, "loss": 1.2469, "mean_token_accuracy": 0.694661033526063, "num_tokens": 2488974.0, "step": 170 }, { "entropy": 1.3225422732532024, "epoch": 0.09424700569408993, "grad_norm": 2.203125, "learning_rate": 2.3429319371727752e-05, "loss": 1.3216, "mean_token_accuracy": 0.6867891995236277, "num_tokens": 2638402.0, "step": 180 }, { "entropy": 1.2810360241681338, "epoch": 0.0994829504548727, "grad_norm": 2.09375, "learning_rate": 2.473821989528796e-05, "loss": 1.2682, "mean_token_accuracy": 0.6961364889517426, "num_tokens": 2780719.0, "step": 190 }, { "entropy": 1.30760794095695, "epoch": 0.10471889521565547, "grad_norm": 4.0625, "learning_rate": 2.604712041884817e-05, "loss": 1.3035, "mean_token_accuracy": 0.6898889668285847, "num_tokens": 2926604.0, "step": 200 }, { "entropy": 1.264478771481663, "epoch": 0.10995483997643825, "grad_norm": 1.90625, "learning_rate": 2.7356020942408378e-05, "loss": 1.2493, "mean_token_accuracy": 0.6941331747919322, "num_tokens": 3070548.0, "step": 210 }, { "entropy": 1.2868385933339597, "epoch": 0.11519078473722102, "grad_norm": 1.9296875, "learning_rate": 2.8664921465968587e-05, "loss": 1.2975, "mean_token_accuracy": 0.6899457449093461, "num_tokens": 3222578.0, "step": 220 }, { "entropy": 1.2868557438254355, "epoch": 0.1204267294980038, "grad_norm": 2.0625, "learning_rate": 2.99738219895288e-05, "loss": 1.2843, "mean_token_accuracy": 0.6913154577836395, "num_tokens": 3367648.0, "step": 230 }, { "entropy": 1.2696480546146631, "epoch": 0.12566267425878658, "grad_norm": 2.171875, "learning_rate": 3.1282722513089006e-05, "loss": 1.271, "mean_token_accuracy": 0.6934641852974892, "num_tokens": 3513611.0, "step": 240 }, { "entropy": 1.2611329367384314, "epoch": 0.13089861901956934, "grad_norm": 2.515625, "learning_rate": 3.2591623036649216e-05, "loss": 1.27, "mean_token_accuracy": 0.6962274318560958, "num_tokens": 3654075.0, "step": 250 }, { "entropy": 1.3185498464852572, "epoch": 0.13613456378035213, "grad_norm": 2.0625, "learning_rate": 3.3900523560209426e-05, "loss": 1.3081, "mean_token_accuracy": 0.6832702022045851, "num_tokens": 3799547.0, "step": 260 }, { "entropy": 1.2299459297209978, "epoch": 0.1413705085411349, "grad_norm": 2.09375, "learning_rate": 3.5209424083769635e-05, "loss": 1.2355, "mean_token_accuracy": 0.6997983153909445, "num_tokens": 3940094.0, "step": 270 }, { "entropy": 1.2170744601637125, "epoch": 0.14660645330191766, "grad_norm": 1.9609375, "learning_rate": 3.6518324607329845e-05, "loss": 1.2097, "mean_token_accuracy": 0.7026064315810799, "num_tokens": 4084450.0, "step": 280 }, { "entropy": 1.2503085616976022, "epoch": 0.15184239806270045, "grad_norm": 1.9375, "learning_rate": 3.7827225130890054e-05, "loss": 1.2366, "mean_token_accuracy": 0.6935763908550143, "num_tokens": 4235094.0, "step": 290 }, { "entropy": 1.2002368062734603, "epoch": 0.1570783428234832, "grad_norm": 2.953125, "learning_rate": 3.9136125654450264e-05, "loss": 1.1993, "mean_token_accuracy": 0.7031140483915805, "num_tokens": 4378493.0, "step": 300 }, { "entropy": 1.2433323854580522, "epoch": 0.162314287584266, "grad_norm": 1.828125, "learning_rate": 4.044502617801047e-05, "loss": 1.2437, "mean_token_accuracy": 0.6959625506773591, "num_tokens": 4513351.0, "step": 310 }, { "entropy": 1.2667765196412801, "epoch": 0.16755023234504876, "grad_norm": 2.078125, "learning_rate": 4.175392670157068e-05, "loss": 1.2609, "mean_token_accuracy": 0.6903780495747924, "num_tokens": 4654221.0, "step": 320 }, { "entropy": 1.257903415709734, "epoch": 0.17278617710583152, "grad_norm": 2.171875, "learning_rate": 4.306282722513089e-05, "loss": 1.2591, "mean_token_accuracy": 0.6927119480445981, "num_tokens": 4793001.0, "step": 330 }, { "entropy": 1.2474724128842354, "epoch": 0.1780221218666143, "grad_norm": 1.8828125, "learning_rate": 4.43717277486911e-05, "loss": 1.258, "mean_token_accuracy": 0.6958453560248017, "num_tokens": 4938800.0, "step": 340 }, { "entropy": 1.271824512630701, "epoch": 0.18325806662739708, "grad_norm": 2.3125, "learning_rate": 4.568062827225131e-05, "loss": 1.2777, "mean_token_accuracy": 0.6864122781902552, "num_tokens": 5088676.0, "step": 350 }, { "entropy": 1.206870013102889, "epoch": 0.18849401138817987, "grad_norm": 2.1875, "learning_rate": 4.698952879581152e-05, "loss": 1.2145, "mean_token_accuracy": 0.7025035681203008, "num_tokens": 5233017.0, "step": 360 }, { "entropy": 1.2822908700443805, "epoch": 0.19372995614896263, "grad_norm": 1.6953125, "learning_rate": 4.829842931937173e-05, "loss": 1.281, "mean_token_accuracy": 0.685633241944015, "num_tokens": 5383911.0, "step": 370 }, { "entropy": 1.2813241746276618, "epoch": 0.1989659009097454, "grad_norm": 2.0, "learning_rate": 4.960732984293194e-05, "loss": 1.2896, "mean_token_accuracy": 0.6846682282164693, "num_tokens": 5532300.0, "step": 380 }, { "entropy": 1.2817171201109887, "epoch": 0.20420184567052818, "grad_norm": 1.8359375, "learning_rate": 4.999948856244768e-05, "loss": 1.2811, "mean_token_accuracy": 0.6898531707003712, "num_tokens": 5673323.0, "step": 390 }, { "entropy": 1.2512086292728781, "epoch": 0.20943779043131094, "grad_norm": 2.21875, "learning_rate": 4.9996983612565773e-05, "loss": 1.2605, "mean_token_accuracy": 0.690356932580471, "num_tokens": 5818798.0, "step": 400 }, { "entropy": 1.2130850929766894, "epoch": 0.21467373519209373, "grad_norm": 1.6640625, "learning_rate": 4.999239142174581e-05, "loss": 1.2191, "mean_token_accuracy": 0.6989637283608318, "num_tokens": 5967139.0, "step": 410 }, { "entropy": 1.2118938906118273, "epoch": 0.2199096799528765, "grad_norm": 1.96875, "learning_rate": 4.99857123734344e-05, "loss": 1.2075, "mean_token_accuracy": 0.6999017883092165, "num_tokens": 6125485.0, "step": 420 }, { "entropy": 1.2652900835499168, "epoch": 0.22514562471365926, "grad_norm": 1.9765625, "learning_rate": 4.9976947025330155e-05, "loss": 1.2729, "mean_token_accuracy": 0.6881730291992426, "num_tokens": 6271940.0, "step": 430 }, { "entropy": 1.2763973344117403, "epoch": 0.23038156947444205, "grad_norm": 2.125, "learning_rate": 4.9966096109337125e-05, "loss": 1.3224, "mean_token_accuracy": 0.6868822824209928, "num_tokens": 6415021.0, "step": 440 }, { "entropy": 1.2256551414728165, "epoch": 0.2356175142352248, "grad_norm": 1.6796875, "learning_rate": 4.995316053150366e-05, "loss": 1.2271, "mean_token_accuracy": 0.6921151876449585, "num_tokens": 6558653.0, "step": 450 }, { "entropy": 1.233339687436819, "epoch": 0.2408534589960076, "grad_norm": 2.34375, "learning_rate": 4.993814137194681e-05, "loss": 1.2679, "mean_token_accuracy": 0.6938443537801504, "num_tokens": 6698611.0, "step": 460 }, { "entropy": 1.2777669046074152, "epoch": 0.24608940375679036, "grad_norm": 1.6640625, "learning_rate": 4.9921039884762057e-05, "loss": 1.2873, "mean_token_accuracy": 0.6864250931888819, "num_tokens": 6847385.0, "step": 470 }, { "entropy": 1.197480170428753, "epoch": 0.25132534851757315, "grad_norm": 2.03125, "learning_rate": 4.9901857497918655e-05, "loss": 1.2159, "mean_token_accuracy": 0.7006905306130647, "num_tokens": 6990503.0, "step": 480 }, { "entropy": 1.2612487450242043, "epoch": 0.2565612932783559, "grad_norm": 2.140625, "learning_rate": 4.98805958131404e-05, "loss": 1.2667, "mean_token_accuracy": 0.6904748784378171, "num_tokens": 7138114.0, "step": 490 }, { "entropy": 1.2380808498710394, "epoch": 0.2617972380391387, "grad_norm": 3.03125, "learning_rate": 4.985725660577184e-05, "loss": 1.2627, "mean_token_accuracy": 0.693475303426385, "num_tokens": 7295788.0, "step": 500 }, { "entropy": 1.2590212849900126, "epoch": 0.26703318279992144, "grad_norm": 2.03125, "learning_rate": 4.983184182463009e-05, "loss": 1.252, "mean_token_accuracy": 0.6881766313686967, "num_tokens": 7431663.0, "step": 510 }, { "entropy": 1.2515497665852309, "epoch": 0.27226912756070426, "grad_norm": 3.703125, "learning_rate": 4.980435359184204e-05, "loss": 1.2831, "mean_token_accuracy": 0.693330561555922, "num_tokens": 7569801.0, "step": 520 }, { "entropy": 1.184147422760725, "epoch": 0.277505072321487, "grad_norm": 1.6328125, "learning_rate": 4.977479420266723e-05, "loss": 1.1646, "mean_token_accuracy": 0.703077656775713, "num_tokens": 7713092.0, "step": 530 }, { "entropy": 1.2746197815984488, "epoch": 0.2827410170822698, "grad_norm": 2.0625, "learning_rate": 4.974316612530615e-05, "loss": 1.2765, "mean_token_accuracy": 0.6863818326964974, "num_tokens": 7854137.0, "step": 540 }, { "entropy": 1.2566815540194511, "epoch": 0.28797696184305255, "grad_norm": 1.90625, "learning_rate": 4.970947200069416e-05, "loss": 1.2648, "mean_token_accuracy": 0.6892008159309626, "num_tokens": 7992676.0, "step": 550 }, { "entropy": 1.2914229419082404, "epoch": 0.2932129066038353, "grad_norm": 1.640625, "learning_rate": 4.967371464228096e-05, "loss": 1.3179, "mean_token_accuracy": 0.6854391321539879, "num_tokens": 8149846.0, "step": 560 }, { "entropy": 1.2183680593967439, "epoch": 0.29844885136461813, "grad_norm": 1.8046875, "learning_rate": 4.963589703579569e-05, "loss": 1.24, "mean_token_accuracy": 0.6942471470683813, "num_tokens": 8288016.0, "step": 570 }, { "entropy": 1.3149288706481457, "epoch": 0.3036847961254009, "grad_norm": 1.75, "learning_rate": 4.959602233899762e-05, "loss": 1.3198, "mean_token_accuracy": 0.679352731257677, "num_tokens": 8435936.0, "step": 580 }, { "entropy": 1.234706364199519, "epoch": 0.30892074088618365, "grad_norm": 1.7265625, "learning_rate": 4.955409388141243e-05, "loss": 1.2742, "mean_token_accuracy": 0.6961829710751772, "num_tokens": 8576679.0, "step": 590 }, { "entropy": 1.2482819214463234, "epoch": 0.3141566856469664, "grad_norm": 2.1875, "learning_rate": 4.9510115164054297e-05, "loss": 1.2703, "mean_token_accuracy": 0.6918804241344333, "num_tokens": 8724159.0, "step": 600 }, { "entropy": 1.2805782459676265, "epoch": 0.3193926304077492, "grad_norm": 1.7578125, "learning_rate": 4.946408985913344e-05, "loss": 1.3043, "mean_token_accuracy": 0.6854454703629017, "num_tokens": 8871635.0, "step": 610 }, { "entropy": 1.2219614367932081, "epoch": 0.324628575168532, "grad_norm": 1.578125, "learning_rate": 4.941602180974958e-05, "loss": 1.2703, "mean_token_accuracy": 0.6931755296885967, "num_tokens": 9025175.0, "step": 620 }, { "entropy": 1.2477784302085637, "epoch": 0.32986451992931476, "grad_norm": 1.5625, "learning_rate": 4.9365915029571007e-05, "loss": 1.2585, "mean_token_accuracy": 0.6917018702253699, "num_tokens": 9162826.0, "step": 630 }, { "entropy": 1.230723000690341, "epoch": 0.3351004646900975, "grad_norm": 2.140625, "learning_rate": 4.9313773702499455e-05, "loss": 1.2391, "mean_token_accuracy": 0.6946986148133873, "num_tokens": 9312989.0, "step": 640 }, { "entropy": 1.2285594891756773, "epoch": 0.3403364094508803, "grad_norm": 1.7578125, "learning_rate": 4.925960218232073e-05, "loss": 1.2367, "mean_token_accuracy": 0.6967854388058186, "num_tokens": 9455995.0, "step": 650 }, { "entropy": 1.3053442865610123, "epoch": 0.34557235421166305, "grad_norm": 2.328125, "learning_rate": 4.920340499234116e-05, "loss": 1.311, "mean_token_accuracy": 0.6871177144348621, "num_tokens": 9595268.0, "step": 660 }, { "entropy": 1.2352213632315396, "epoch": 0.35080829897244586, "grad_norm": 2.125, "learning_rate": 4.914518682500995e-05, "loss": 1.2577, "mean_token_accuracy": 0.69151939060539, "num_tokens": 9737073.0, "step": 670 }, { "entropy": 1.2220519341528415, "epoch": 0.3560442437332286, "grad_norm": 2.0625, "learning_rate": 4.908495254152731e-05, "loss": 1.2419, "mean_token_accuracy": 0.7000335277989507, "num_tokens": 9880500.0, "step": 680 }, { "entropy": 1.2571235705167054, "epoch": 0.3612801884940114, "grad_norm": 1.890625, "learning_rate": 4.902270717143859e-05, "loss": 1.2772, "mean_token_accuracy": 0.6950660437345505, "num_tokens": 10029677.0, "step": 690 }, { "entropy": 1.1837443890050054, "epoch": 0.36651613325479415, "grad_norm": 2.125, "learning_rate": 4.895845591221426e-05, "loss": 1.1883, "mean_token_accuracy": 0.7034358236938715, "num_tokens": 10169257.0, "step": 700 }, { "entropy": 1.1898296054452657, "epoch": 0.3717520780155769, "grad_norm": 1.96875, "learning_rate": 4.8892204128816e-05, "loss": 1.2018, "mean_token_accuracy": 0.6991554461419582, "num_tokens": 10312298.0, "step": 710 }, { "entropy": 1.2473948691040277, "epoch": 0.37698802277635973, "grad_norm": 1.75, "learning_rate": 4.882395735324864e-05, "loss": 1.2845, "mean_token_accuracy": 0.6941293969750404, "num_tokens": 10455848.0, "step": 720 }, { "entropy": 1.217660278454423, "epoch": 0.3822239675371425, "grad_norm": 1.921875, "learning_rate": 4.87537212840983e-05, "loss": 1.2161, "mean_token_accuracy": 0.6975388413295149, "num_tokens": 10582901.0, "step": 730 }, { "entropy": 1.2198585540056228, "epoch": 0.38745991229792526, "grad_norm": 1.8671875, "learning_rate": 4.8681501786056544e-05, "loss": 1.2394, "mean_token_accuracy": 0.6949820145964622, "num_tokens": 10724617.0, "step": 740 }, { "entropy": 1.2234349481761455, "epoch": 0.392695857058708, "grad_norm": 1.7734375, "learning_rate": 4.860730488943068e-05, "loss": 1.2335, "mean_token_accuracy": 0.6962696801871061, "num_tokens": 10869452.0, "step": 750 }, { "entropy": 1.2394332230091094, "epoch": 0.3979318018194908, "grad_norm": 2.296875, "learning_rate": 4.8531136789640216e-05, "loss": 1.2645, "mean_token_accuracy": 0.695160668157041, "num_tokens": 11017465.0, "step": 760 }, { "entropy": 1.219304683059454, "epoch": 0.4031677465802736, "grad_norm": 1.8125, "learning_rate": 4.845300384669958e-05, "loss": 1.2453, "mean_token_accuracy": 0.6994039881974459, "num_tokens": 11158876.0, "step": 770 }, { "entropy": 1.256004797667265, "epoch": 0.40840369134105636, "grad_norm": 1.953125, "learning_rate": 4.837291258468701e-05, "loss": 1.2915, "mean_token_accuracy": 0.6878539452329278, "num_tokens": 11297155.0, "step": 780 }, { "entropy": 1.201148072630167, "epoch": 0.4136396361018391, "grad_norm": 1.7734375, "learning_rate": 4.8290869691199834e-05, "loss": 1.22, "mean_token_accuracy": 0.7018963057547808, "num_tokens": 11439337.0, "step": 790 }, { "entropy": 1.233640456199646, "epoch": 0.4188755808626219, "grad_norm": 1.6796875, "learning_rate": 4.820688201679605e-05, "loss": 1.2485, "mean_token_accuracy": 0.6938533913344145, "num_tokens": 11585266.0, "step": 800 }, { "entropy": 1.2794130560010673, "epoch": 0.42411152562340465, "grad_norm": 1.75, "learning_rate": 4.812095657442231e-05, "loss": 1.2894, "mean_token_accuracy": 0.6880532244220376, "num_tokens": 11725643.0, "step": 810 }, { "entropy": 1.2399780409410597, "epoch": 0.42934747038418747, "grad_norm": 1.7265625, "learning_rate": 4.803310053882831e-05, "loss": 1.2887, "mean_token_accuracy": 0.6891330601647496, "num_tokens": 11878982.0, "step": 820 }, { "entropy": 1.2244506664574146, "epoch": 0.43458341514497023, "grad_norm": 1.6953125, "learning_rate": 4.794332124596775e-05, "loss": 1.2513, "mean_token_accuracy": 0.6966191967949271, "num_tokens": 12025844.0, "step": 830 }, { "entropy": 1.2087435230612755, "epoch": 0.439819359905753, "grad_norm": 1.5, "learning_rate": 4.7851626192385745e-05, "loss": 1.2347, "mean_token_accuracy": 0.7009090483188629, "num_tokens": 12177345.0, "step": 840 }, { "entropy": 1.2146162753924727, "epoch": 0.44505530466653576, "grad_norm": 1.8515625, "learning_rate": 4.775802303459288e-05, "loss": 1.2465, "mean_token_accuracy": 0.6959697719663381, "num_tokens": 12320998.0, "step": 850 }, { "entropy": 1.198814813606441, "epoch": 0.4502912494273185, "grad_norm": 1.875, "learning_rate": 4.76625195884259e-05, "loss": 1.2194, "mean_token_accuracy": 0.6998335400596261, "num_tokens": 12470540.0, "step": 860 }, { "entropy": 1.1972925199195743, "epoch": 0.45552719418810134, "grad_norm": 1.5859375, "learning_rate": 4.7565123828395064e-05, "loss": 1.2199, "mean_token_accuracy": 0.7027627993375063, "num_tokens": 12605966.0, "step": 870 }, { "entropy": 1.2037498267367481, "epoch": 0.4607631389488841, "grad_norm": 1.6484375, "learning_rate": 4.7465843887018316e-05, "loss": 1.2338, "mean_token_accuracy": 0.7029284704476595, "num_tokens": 12753176.0, "step": 880 }, { "entropy": 1.2665604405105113, "epoch": 0.46599908370966686, "grad_norm": 1.953125, "learning_rate": 4.736468805414218e-05, "loss": 1.2826, "mean_token_accuracy": 0.6867102902382612, "num_tokens": 12891112.0, "step": 890 }, { "entropy": 1.2413052493706345, "epoch": 0.4712350284704496, "grad_norm": 1.6953125, "learning_rate": 4.72616647762496e-05, "loss": 1.2708, "mean_token_accuracy": 0.6939191322773695, "num_tokens": 13037695.0, "step": 900 }, { "entropy": 1.2334762597456574, "epoch": 0.4764709732312324, "grad_norm": 1.8984375, "learning_rate": 4.7156782655754625e-05, "loss": 1.2801, "mean_token_accuracy": 0.6962197717279196, "num_tokens": 13188396.0, "step": 910 }, { "entropy": 1.2926313485950232, "epoch": 0.4817069179920152, "grad_norm": 2.34375, "learning_rate": 4.7050050450284147e-05, "loss": 1.2973, "mean_token_accuracy": 0.6834666855633259, "num_tokens": 13334753.0, "step": 920 }, { "entropy": 1.2605458820238709, "epoch": 0.48694286275279797, "grad_norm": 1.6953125, "learning_rate": 4.6941477071946594e-05, "loss": 1.273, "mean_token_accuracy": 0.6905643936246634, "num_tokens": 13481005.0, "step": 930 }, { "entropy": 1.2309048125520348, "epoch": 0.49217880751358073, "grad_norm": 1.7421875, "learning_rate": 4.683107158658781e-05, "loss": 1.2438, "mean_token_accuracy": 0.6907353041693568, "num_tokens": 13618213.0, "step": 940 }, { "entropy": 1.2219564571976662, "epoch": 0.4974147522743635, "grad_norm": 2.28125, "learning_rate": 4.6718843213034067e-05, "loss": 1.228, "mean_token_accuracy": 0.6988438554108143, "num_tokens": 13751965.0, "step": 950 }, { "entropy": 1.1836062878370286, "epoch": 0.5026506970351463, "grad_norm": 6.375, "learning_rate": 4.660480132232225e-05, "loss": 1.2209, "mean_token_accuracy": 0.6979983827099204, "num_tokens": 13901640.0, "step": 960 }, { "entropy": 1.1964640978723764, "epoch": 0.507886641795929, "grad_norm": 1.8984375, "learning_rate": 4.648895543691741e-05, "loss": 1.2005, "mean_token_accuracy": 0.6987225420773029, "num_tokens": 14047322.0, "step": 970 }, { "entropy": 1.2625075351446866, "epoch": 0.5131225865567118, "grad_norm": 1.75, "learning_rate": 4.637131522991764e-05, "loss": 1.2557, "mean_token_accuracy": 0.6897652598097921, "num_tokens": 14188766.0, "step": 980 }, { "entropy": 1.2458597056567668, "epoch": 0.5183585313174947, "grad_norm": 1.75, "learning_rate": 4.625189052424638e-05, "loss": 1.2676, "mean_token_accuracy": 0.6945432106032967, "num_tokens": 14329591.0, "step": 990 }, { "entropy": 1.245772442035377, "epoch": 0.5235944760782774, "grad_norm": 1.5625, "learning_rate": 4.613069129183218e-05, "loss": 1.2665, "mean_token_accuracy": 0.6959560567513108, "num_tokens": 14477258.0, "step": 1000 }, { "entropy": 1.2078047215938568, "epoch": 0.5288304208390602, "grad_norm": 1.6484375, "learning_rate": 4.600772765277607e-05, "loss": 1.2176, "mean_token_accuracy": 0.697881168872118, "num_tokens": 14633237.0, "step": 1010 }, { "entropy": 1.2214165650308133, "epoch": 0.5340663655998429, "grad_norm": 1.765625, "learning_rate": 4.588300987450652e-05, "loss": 1.2345, "mean_token_accuracy": 0.6999158889055253, "num_tokens": 14776068.0, "step": 1020 }, { "entropy": 1.2305742222815752, "epoch": 0.5393023103606257, "grad_norm": 2.34375, "learning_rate": 4.575654837092214e-05, "loss": 1.2398, "mean_token_accuracy": 0.6949134254828095, "num_tokens": 14913719.0, "step": 1030 }, { "entropy": 1.2660383846610785, "epoch": 0.5445382551214085, "grad_norm": 1.6640625, "learning_rate": 4.5628353701522055e-05, "loss": 1.2685, "mean_token_accuracy": 0.6927320031449199, "num_tokens": 15066188.0, "step": 1040 }, { "entropy": 1.2355303570628167, "epoch": 0.5497741998821912, "grad_norm": 1.8125, "learning_rate": 4.5498436570524296e-05, "loss": 1.2563, "mean_token_accuracy": 0.6943851266056299, "num_tokens": 15211662.0, "step": 1050 }, { "entropy": 1.2411348339170218, "epoch": 0.555010144642974, "grad_norm": 1.6171875, "learning_rate": 4.536680782597191e-05, "loss": 1.2807, "mean_token_accuracy": 0.6922256585210562, "num_tokens": 15361477.0, "step": 1060 }, { "entropy": 1.2601716944947838, "epoch": 0.5602460894037568, "grad_norm": 3.46875, "learning_rate": 4.5233478458827176e-05, "loss": 1.2964, "mean_token_accuracy": 0.6930005580186844, "num_tokens": 15512017.0, "step": 1070 }, { "entropy": 1.3123046960681677, "epoch": 0.5654820341645396, "grad_norm": 1.6796875, "learning_rate": 4.509845960205389e-05, "loss": 1.3435, "mean_token_accuracy": 0.6821131203323603, "num_tokens": 15666802.0, "step": 1080 }, { "entropy": 1.1636217474937438, "epoch": 0.5707179789253224, "grad_norm": 1.9609375, "learning_rate": 4.496176252968774e-05, "loss": 1.1911, "mean_token_accuracy": 0.7117271330207586, "num_tokens": 15811306.0, "step": 1090 }, { "entropy": 1.1874678194522859, "epoch": 0.5759539236861051, "grad_norm": 2.25, "learning_rate": 4.4823398655894924e-05, "loss": 1.1878, "mean_token_accuracy": 0.7028247270733118, "num_tokens": 15957078.0, "step": 1100 }, { "entropy": 1.247788500599563, "epoch": 0.5811898684468879, "grad_norm": 2.0625, "learning_rate": 4.468337953401908e-05, "loss": 1.2483, "mean_token_accuracy": 0.6906874619424344, "num_tokens": 16099300.0, "step": 1110 }, { "entropy": 1.2340633975341917, "epoch": 0.5864258132076706, "grad_norm": 1.734375, "learning_rate": 4.45417168556166e-05, "loss": 1.2385, "mean_token_accuracy": 0.6969642581418156, "num_tokens": 16241998.0, "step": 1120 }, { "entropy": 1.1755719013512134, "epoch": 0.5916617579684534, "grad_norm": 1.7109375, "learning_rate": 4.4398422449480356e-05, "loss": 1.203, "mean_token_accuracy": 0.7083542978391051, "num_tokens": 16387522.0, "step": 1130 }, { "entropy": 1.2200544375926257, "epoch": 0.5968977027292363, "grad_norm": 1.9375, "learning_rate": 4.425350828065204e-05, "loss": 1.2805, "mean_token_accuracy": 0.6941867485642433, "num_tokens": 16529425.0, "step": 1140 }, { "entropy": 1.2577802315354347, "epoch": 0.602133647490019, "grad_norm": 2.265625, "learning_rate": 4.410698644942303e-05, "loss": 1.2505, "mean_token_accuracy": 0.692890228703618, "num_tokens": 16670311.0, "step": 1150 }, { "entropy": 1.1618805171921849, "epoch": 0.6073695922508018, "grad_norm": 1.4921875, "learning_rate": 4.395886919032406e-05, "loss": 1.1903, "mean_token_accuracy": 0.7042877223342657, "num_tokens": 16813681.0, "step": 1160 }, { "entropy": 1.2381837129592896, "epoch": 0.6126055370115845, "grad_norm": 1.9609375, "learning_rate": 4.380916887110366e-05, "loss": 1.2605, "mean_token_accuracy": 0.6947620201855897, "num_tokens": 16958167.0, "step": 1170 }, { "entropy": 1.2540216479450463, "epoch": 0.6178414817723673, "grad_norm": 1.7890625, "learning_rate": 4.365789799169539e-05, "loss": 1.2586, "mean_token_accuracy": 0.6949862573295832, "num_tokens": 17101042.0, "step": 1180 }, { "entropy": 1.2213780038058757, "epoch": 0.6230774265331501, "grad_norm": 1.6171875, "learning_rate": 4.350506918317416e-05, "loss": 1.2509, "mean_token_accuracy": 0.6928936781361699, "num_tokens": 17241395.0, "step": 1190 }, { "entropy": 1.1887108445167542, "epoch": 0.6283133712939328, "grad_norm": 1.671875, "learning_rate": 4.335069520670149e-05, "loss": 1.2134, "mean_token_accuracy": 0.7078650841489434, "num_tokens": 17382621.0, "step": 1200 }, { "entropy": 1.1683177448809148, "epoch": 0.6335493160547156, "grad_norm": 1.8671875, "learning_rate": 4.3194788952459996e-05, "loss": 1.1862, "mean_token_accuracy": 0.7075361222028732, "num_tokens": 17524511.0, "step": 1210 }, { "entropy": 1.2176016632467508, "epoch": 0.6387852608154984, "grad_norm": 1.5703125, "learning_rate": 4.303736343857704e-05, "loss": 1.2635, "mean_token_accuracy": 0.6953268457204104, "num_tokens": 17674442.0, "step": 1220 }, { "entropy": 1.2287296935915948, "epoch": 0.6440212055762812, "grad_norm": 1.5234375, "learning_rate": 4.2878431810037724e-05, "loss": 1.239, "mean_token_accuracy": 0.7013807725161314, "num_tokens": 17827391.0, "step": 1230 }, { "entropy": 1.236463399976492, "epoch": 0.649257150337064, "grad_norm": 2.0, "learning_rate": 4.27180073375873e-05, "loss": 1.2571, "mean_token_accuracy": 0.6894650906324387, "num_tokens": 17972502.0, "step": 1240 }, { "entropy": 1.1414937254041433, "epoch": 0.6544930950978467, "grad_norm": 1.640625, "learning_rate": 4.255610341662304e-05, "loss": 1.1546, "mean_token_accuracy": 0.7119367253035307, "num_tokens": 18115408.0, "step": 1250 }, { "entropy": 1.193948952294886, "epoch": 0.6597290398586295, "grad_norm": 1.8671875, "learning_rate": 4.239273356607576e-05, "loss": 1.2245, "mean_token_accuracy": 0.7001152852550149, "num_tokens": 18262489.0, "step": 1260 }, { "entropy": 1.246325920522213, "epoch": 0.6649649846194122, "grad_norm": 1.71875, "learning_rate": 4.222791142728097e-05, "loss": 1.2505, "mean_token_accuracy": 0.6927321873605251, "num_tokens": 18399068.0, "step": 1270 }, { "entropy": 1.1776666756719352, "epoch": 0.670200929380195, "grad_norm": 1.8203125, "learning_rate": 4.2061650762839825e-05, "loss": 1.2147, "mean_token_accuracy": 0.7039317097514868, "num_tokens": 18543913.0, "step": 1280 }, { "entropy": 1.2006115175783634, "epoch": 0.6754368741409779, "grad_norm": 1.5078125, "learning_rate": 4.189396545546995e-05, "loss": 1.2154, "mean_token_accuracy": 0.7036881025880575, "num_tokens": 18678858.0, "step": 1290 }, { "entropy": 1.1709488430991768, "epoch": 0.6806728189017606, "grad_norm": 1.78125, "learning_rate": 4.1724869506846267e-05, "loss": 1.1949, "mean_token_accuracy": 0.7089729970321059, "num_tokens": 18834916.0, "step": 1300 }, { "entropy": 1.2211165010929108, "epoch": 0.6859087636625434, "grad_norm": 1.46875, "learning_rate": 4.1554377036431816e-05, "loss": 1.2437, "mean_token_accuracy": 0.6957355309277773, "num_tokens": 18987694.0, "step": 1310 }, { "entropy": 1.2230720650404692, "epoch": 0.6911447084233261, "grad_norm": 1.7265625, "learning_rate": 4.138250228029882e-05, "loss": 1.2417, "mean_token_accuracy": 0.6949638992547988, "num_tokens": 19137646.0, "step": 1320 }, { "entropy": 1.1824469189159572, "epoch": 0.6963806531841089, "grad_norm": 1.5546875, "learning_rate": 4.120925958993994e-05, "loss": 1.2038, "mean_token_accuracy": 0.7080136310309172, "num_tokens": 19285574.0, "step": 1330 }, { "entropy": 1.1995828442275525, "epoch": 0.7016165979448917, "grad_norm": 1.8125, "learning_rate": 4.103466343106999e-05, "loss": 1.2104, "mean_token_accuracy": 0.7012225743383169, "num_tokens": 19432213.0, "step": 1340 }, { "entropy": 1.2089691065251826, "epoch": 0.7068525427056744, "grad_norm": 1.8046875, "learning_rate": 4.0858728382417966e-05, "loss": 1.2252, "mean_token_accuracy": 0.6981374306604267, "num_tokens": 19586405.0, "step": 1350 }, { "entropy": 1.2061294008046388, "epoch": 0.7120884874664573, "grad_norm": 1.6015625, "learning_rate": 4.06814691345098e-05, "loss": 1.2052, "mean_token_accuracy": 0.6995515301823616, "num_tokens": 19736389.0, "step": 1360 }, { "entropy": 1.2196388389915227, "epoch": 0.71732443222724, "grad_norm": 1.6953125, "learning_rate": 4.0502900488441706e-05, "loss": 1.2295, "mean_token_accuracy": 0.6974983751773834, "num_tokens": 19884434.0, "step": 1370 }, { "entropy": 1.1925444403663277, "epoch": 0.7225603769880228, "grad_norm": 1.65625, "learning_rate": 4.032303735464422e-05, "loss": 1.1885, "mean_token_accuracy": 0.7031191129237413, "num_tokens": 20019339.0, "step": 1380 }, { "entropy": 1.184141149930656, "epoch": 0.7277963217488056, "grad_norm": 1.515625, "learning_rate": 4.0141894751637264e-05, "loss": 1.2061, "mean_token_accuracy": 0.7056893218308687, "num_tokens": 20172036.0, "step": 1390 }, { "entropy": 1.18991824015975, "epoch": 0.7330322665095883, "grad_norm": 1.8125, "learning_rate": 3.995948780477605e-05, "loss": 1.1945, "mean_token_accuracy": 0.7036307500675321, "num_tokens": 20316279.0, "step": 1400 }, { "entropy": 1.221291032806039, "epoch": 0.7382682112703711, "grad_norm": 2.234375, "learning_rate": 3.977583174498816e-05, "loss": 1.2229, "mean_token_accuracy": 0.6996599985286593, "num_tokens": 20457700.0, "step": 1410 }, { "entropy": 1.2056787729263305, "epoch": 0.7435041560311538, "grad_norm": 1.9765625, "learning_rate": 3.959094190750172e-05, "loss": 1.2209, "mean_token_accuracy": 0.699485157802701, "num_tokens": 20601526.0, "step": 1420 }, { "entropy": 1.214169954136014, "epoch": 0.7487401007919366, "grad_norm": 1.640625, "learning_rate": 3.940483373056498e-05, "loss": 1.2251, "mean_token_accuracy": 0.7045085027813911, "num_tokens": 20740478.0, "step": 1430 }, { "entropy": 1.2199758583679796, "epoch": 0.7539760455527195, "grad_norm": 1.421875, "learning_rate": 3.921752275415712e-05, "loss": 1.232, "mean_token_accuracy": 0.6983612652868032, "num_tokens": 20889352.0, "step": 1440 }, { "entropy": 1.1763014759868384, "epoch": 0.7592119903135022, "grad_norm": 1.6875, "learning_rate": 3.902902461869079e-05, "loss": 1.1852, "mean_token_accuracy": 0.706533107161522, "num_tokens": 21047325.0, "step": 1450 }, { "entropy": 1.2274865956045686, "epoch": 0.764447935074285, "grad_norm": 1.515625, "learning_rate": 3.883935506370605e-05, "loss": 1.2559, "mean_token_accuracy": 0.6984432989731431, "num_tokens": 21193313.0, "step": 1460 }, { "entropy": 1.2031854771077632, "epoch": 0.7696838798350677, "grad_norm": 1.4140625, "learning_rate": 3.864852992655617e-05, "loss": 1.2106, "mean_token_accuracy": 0.7018306776881218, "num_tokens": 21340786.0, "step": 1470 }, { "entropy": 1.2027717839926482, "epoch": 0.7749198245958505, "grad_norm": 1.671875, "learning_rate": 3.845656514108515e-05, "loss": 1.22, "mean_token_accuracy": 0.7030729129910469, "num_tokens": 21484126.0, "step": 1480 }, { "entropy": 1.1977868607267737, "epoch": 0.7801557693566333, "grad_norm": 1.5, "learning_rate": 3.8263476736297374e-05, "loss": 1.1941, "mean_token_accuracy": 0.7007357392460107, "num_tokens": 21629712.0, "step": 1490 }, { "entropy": 1.280288253352046, "epoch": 0.785391714117416, "grad_norm": 1.53125, "learning_rate": 3.806928083501906e-05, "loss": 1.3073, "mean_token_accuracy": 0.6855264658108353, "num_tokens": 21769811.0, "step": 1500 }, { "entropy": 1.2234169896692038, "epoch": 0.7906276588781989, "grad_norm": 1.640625, "learning_rate": 3.787399365255207e-05, "loss": 1.2603, "mean_token_accuracy": 0.6935047794133424, "num_tokens": 21919583.0, "step": 1510 }, { "entropy": 1.2192364005371927, "epoch": 0.7958636036389816, "grad_norm": 1.71875, "learning_rate": 3.7677631495319956e-05, "loss": 1.2092, "mean_token_accuracy": 0.702324446476996, "num_tokens": 22067375.0, "step": 1520 }, { "entropy": 1.1928664781153202, "epoch": 0.8010995483997644, "grad_norm": 1.640625, "learning_rate": 3.748021075950633e-05, "loss": 1.2297, "mean_token_accuracy": 0.6969239924103021, "num_tokens": 22210682.0, "step": 1530 }, { "entropy": 1.2294584538787603, "epoch": 0.8063354931605472, "grad_norm": 1.6328125, "learning_rate": 3.728174792968582e-05, "loss": 1.253, "mean_token_accuracy": 0.6952543262392282, "num_tokens": 22362042.0, "step": 1540 }, { "entropy": 1.2315257797017694, "epoch": 0.8115714379213299, "grad_norm": 1.796875, "learning_rate": 3.7082259577447605e-05, "loss": 1.2633, "mean_token_accuracy": 0.6920363411307335, "num_tokens": 22518138.0, "step": 1550 }, { "entropy": 1.2164895705878735, "epoch": 0.8168073826821127, "grad_norm": 1.4375, "learning_rate": 3.688176236001168e-05, "loss": 1.2215, "mean_token_accuracy": 0.6953978851437569, "num_tokens": 22670347.0, "step": 1560 }, { "entropy": 1.1897184619680048, "epoch": 0.8220433274428954, "grad_norm": 1.6796875, "learning_rate": 3.668027301883802e-05, "loss": 1.2025, "mean_token_accuracy": 0.7067706823348999, "num_tokens": 22816471.0, "step": 1570 }, { "entropy": 1.2356853460893036, "epoch": 0.8272792722036783, "grad_norm": 1.5078125, "learning_rate": 3.6477808378228604e-05, "loss": 1.2735, "mean_token_accuracy": 0.6951639795675874, "num_tokens": 22969371.0, "step": 1580 }, { "entropy": 1.172374564781785, "epoch": 0.8325152169644611, "grad_norm": 1.6640625, "learning_rate": 3.6274385343922677e-05, "loss": 1.1798, "mean_token_accuracy": 0.7071554753929377, "num_tokens": 23112802.0, "step": 1590 }, { "entropy": 1.2456749143078922, "epoch": 0.8377511617252438, "grad_norm": 2.359375, "learning_rate": 3.607002090168506e-05, "loss": 1.2789, "mean_token_accuracy": 0.6919172059744596, "num_tokens": 23251431.0, "step": 1600 }, { "entropy": 1.2133430268615484, "epoch": 0.8429871064860266, "grad_norm": 1.75, "learning_rate": 3.5864732115887866e-05, "loss": 1.2482, "mean_token_accuracy": 0.7028827562928199, "num_tokens": 23398750.0, "step": 1610 }, { "entropy": 1.2265673983842134, "epoch": 0.8482230512468093, "grad_norm": 1.5625, "learning_rate": 3.565853612808562e-05, "loss": 1.2666, "mean_token_accuracy": 0.6953880734741688, "num_tokens": 23541708.0, "step": 1620 }, { "entropy": 1.1701237484812737, "epoch": 0.8534589960075921, "grad_norm": 1.609375, "learning_rate": 3.545145015558399e-05, "loss": 1.1376, "mean_token_accuracy": 0.7052776984870434, "num_tokens": 23689211.0, "step": 1630 }, { "entropy": 1.227908807620406, "epoch": 0.8586949407683749, "grad_norm": 1.578125, "learning_rate": 3.524349149000206e-05, "loss": 1.2574, "mean_token_accuracy": 0.6968840681016445, "num_tokens": 23830779.0, "step": 1640 }, { "entropy": 1.1823246696963907, "epoch": 0.8639308855291576, "grad_norm": 1.7109375, "learning_rate": 3.503467749582857e-05, "loss": 1.176, "mean_token_accuracy": 0.7053217653185129, "num_tokens": 23966735.0, "step": 1650 }, { "entropy": 1.1737437251955272, "epoch": 0.8691668302899405, "grad_norm": 1.6015625, "learning_rate": 3.482502560897195e-05, "loss": 1.202, "mean_token_accuracy": 0.7063202302902937, "num_tokens": 24114205.0, "step": 1660 }, { "entropy": 1.2372199261561037, "epoch": 0.8744027750507232, "grad_norm": 1.46875, "learning_rate": 3.4614553335304406e-05, "loss": 1.2486, "mean_token_accuracy": 0.694911016151309, "num_tokens": 24257335.0, "step": 1670 }, { "entropy": 1.2101770553737878, "epoch": 0.879638719811506, "grad_norm": 1.5, "learning_rate": 3.440327824920022e-05, "loss": 1.1971, "mean_token_accuracy": 0.6996189601719379, "num_tokens": 24405338.0, "step": 1680 }, { "entropy": 1.1508656131103634, "epoch": 0.8848746645722888, "grad_norm": 1.6171875, "learning_rate": 3.419121799206829e-05, "loss": 1.17, "mean_token_accuracy": 0.7079865211620927, "num_tokens": 24550534.0, "step": 1690 }, { "entropy": 1.2128825964406134, "epoch": 0.8901106093330715, "grad_norm": 1.71875, "learning_rate": 3.3978390270879055e-05, "loss": 1.248, "mean_token_accuracy": 0.6985778672620654, "num_tokens": 24699673.0, "step": 1700 }, { "entropy": 1.1805501360446216, "epoch": 0.8953465540938543, "grad_norm": 1.6171875, "learning_rate": 3.3764812856686e-05, "loss": 1.1703, "mean_token_accuracy": 0.7011347938328981, "num_tokens": 24848326.0, "step": 1710 }, { "entropy": 1.211562325246632, "epoch": 0.900582498854637, "grad_norm": 1.7421875, "learning_rate": 3.355050358314172e-05, "loss": 1.2425, "mean_token_accuracy": 0.702686908468604, "num_tokens": 24992122.0, "step": 1720 }, { "entropy": 1.1974574619904161, "epoch": 0.9058184436154199, "grad_norm": 2.15625, "learning_rate": 3.3335480345008905e-05, "loss": 1.2259, "mean_token_accuracy": 0.6980314027518034, "num_tokens": 25139079.0, "step": 1730 }, { "entropy": 1.2152755599468947, "epoch": 0.9110543883762027, "grad_norm": 1.8046875, "learning_rate": 3.311976109666605e-05, "loss": 1.251, "mean_token_accuracy": 0.6992737432941795, "num_tokens": 25286349.0, "step": 1740 }, { "entropy": 1.2275765413418411, "epoch": 0.9162903331369854, "grad_norm": 1.4453125, "learning_rate": 3.290336385060832e-05, "loss": 1.2348, "mean_token_accuracy": 0.6949001539498567, "num_tokens": 25431706.0, "step": 1750 }, { "entropy": 1.1663161270320415, "epoch": 0.9215262778977682, "grad_norm": 1.4765625, "learning_rate": 3.268630667594348e-05, "loss": 1.1854, "mean_token_accuracy": 0.7103127352893353, "num_tokens": 25575146.0, "step": 1760 }, { "entropy": 1.2058376437053084, "epoch": 0.9267622226585509, "grad_norm": 1.4765625, "learning_rate": 3.2468607696883146e-05, "loss": 1.2425, "mean_token_accuracy": 0.7038685705512762, "num_tokens": 25726777.0, "step": 1770 }, { "entropy": 1.189345240779221, "epoch": 0.9319981674193337, "grad_norm": 2.03125, "learning_rate": 3.225028509122944e-05, "loss": 1.2287, "mean_token_accuracy": 0.7018254602327942, "num_tokens": 25875049.0, "step": 1780 }, { "entropy": 1.1771747019141912, "epoch": 0.9372341121801165, "grad_norm": 1.515625, "learning_rate": 3.2031357088857085e-05, "loss": 1.1915, "mean_token_accuracy": 0.7071957625448704, "num_tokens": 26026216.0, "step": 1790 }, { "entropy": 1.220152474567294, "epoch": 0.9424700569408992, "grad_norm": 1.8828125, "learning_rate": 3.181184197019127e-05, "loss": 1.2299, "mean_token_accuracy": 0.7012155883014202, "num_tokens": 26170779.0, "step": 1800 }, { "entropy": 1.1854938926175236, "epoch": 0.9477060017016821, "grad_norm": 1.625, "learning_rate": 3.159175806468126e-05, "loss": 1.1874, "mean_token_accuracy": 0.705037958547473, "num_tokens": 26319322.0, "step": 1810 }, { "entropy": 1.2158665416762233, "epoch": 0.9529419464624648, "grad_norm": 1.3984375, "learning_rate": 3.1371123749269805e-05, "loss": 1.2222, "mean_token_accuracy": 0.6985156688839197, "num_tokens": 26465154.0, "step": 1820 }, { "entropy": 1.2151709901168943, "epoch": 0.9581778912232476, "grad_norm": 1.6953125, "learning_rate": 3.114995744685877e-05, "loss": 1.2667, "mean_token_accuracy": 0.6974882191047073, "num_tokens": 26611774.0, "step": 1830 }, { "entropy": 1.1511426636949182, "epoch": 0.9634138359840304, "grad_norm": 1.765625, "learning_rate": 3.092827762477074e-05, "loss": 1.197, "mean_token_accuracy": 0.7165371583774686, "num_tokens": 26760701.0, "step": 1840 }, { "entropy": 1.2074316812679171, "epoch": 0.9686497807448131, "grad_norm": 1.5234375, "learning_rate": 3.070610279320707e-05, "loss": 1.2485, "mean_token_accuracy": 0.7016795247793197, "num_tokens": 26902851.0, "step": 1850 }, { "entropy": 1.1557184986770153, "epoch": 0.9738857255055959, "grad_norm": 1.5, "learning_rate": 3.0483451503702264e-05, "loss": 1.1973, "mean_token_accuracy": 0.7114167800173163, "num_tokens": 27053159.0, "step": 1860 }, { "entropy": 1.145219304971397, "epoch": 0.9791216702663786, "grad_norm": 1.6015625, "learning_rate": 3.0260342347574915e-05, "loss": 1.1371, "mean_token_accuracy": 0.7116047518327833, "num_tokens": 27195594.0, "step": 1870 }, { "entropy": 1.1833709230646492, "epoch": 0.9843576150271615, "grad_norm": 1.578125, "learning_rate": 3.003679395437536e-05, "loss": 1.1913, "mean_token_accuracy": 0.7063193745911122, "num_tokens": 27335855.0, "step": 1880 }, { "entropy": 1.2038549520075321, "epoch": 0.9895935597879443, "grad_norm": 1.7578125, "learning_rate": 2.981282499033009e-05, "loss": 1.2036, "mean_token_accuracy": 0.6992362190037966, "num_tokens": 27485225.0, "step": 1890 }, { "entropy": 1.1822845570743084, "epoch": 0.994829504548727, "grad_norm": 1.6328125, "learning_rate": 2.9588454156783163e-05, "loss": 1.2295, "mean_token_accuracy": 0.7060987044125795, "num_tokens": 27634788.0, "step": 1900 }, { "entropy": 1.1678016036748886, "epoch": 1.0, "grad_norm": 1.96875, "learning_rate": 2.9363700188634598e-05, "loss": 1.1579, "mean_token_accuracy": 0.7110490016167677, "num_tokens": 27776648.0, "step": 1910 }, { "epoch": 1.0, "eval_entropy": 1.1545482213497162, "eval_loss": 1.1723113059997559, "eval_mean_token_accuracy": 0.7067238150835037, "eval_num_tokens": 27776648.0, "eval_runtime": 60.9052, "eval_samples_per_second": 32.838, "eval_steps_per_second": 16.419, "step": 1910 }, { "entropy": 1.092913302220404, "epoch": 1.0052359447607828, "grad_norm": 1.609375, "learning_rate": 2.9138581852776055e-05, "loss": 1.0676, "mean_token_accuracy": 0.7341579392552375, "num_tokens": 27920450.0, "step": 1920 }, { "entropy": 1.076912895217538, "epoch": 1.0104718895215656, "grad_norm": 2.53125, "learning_rate": 2.8913117946523803e-05, "loss": 1.0904, "mean_token_accuracy": 0.7310005821287632, "num_tokens": 28067925.0, "step": 1930 }, { "entropy": 1.0259252307936548, "epoch": 1.0157078342823482, "grad_norm": 1.7578125, "learning_rate": 2.8687327296049128e-05, "loss": 0.9952, "mean_token_accuracy": 0.7427292361855506, "num_tokens": 28212860.0, "step": 1940 }, { "entropy": 1.0669849675148726, "epoch": 1.020943779043131, "grad_norm": 1.6171875, "learning_rate": 2.8461228754806375e-05, "loss": 1.0461, "mean_token_accuracy": 0.7347980726510286, "num_tokens": 28356365.0, "step": 1950 }, { "entropy": 1.0898705100640655, "epoch": 1.0261797238039139, "grad_norm": 1.6171875, "learning_rate": 2.823484120195865e-05, "loss": 1.0786, "mean_token_accuracy": 0.7276619732379913, "num_tokens": 28495047.0, "step": 1960 }, { "entropy": 1.0758044727146625, "epoch": 1.0314156685646967, "grad_norm": 1.59375, "learning_rate": 2.8008183540801484e-05, "loss": 1.0782, "mean_token_accuracy": 0.7319583360105753, "num_tokens": 28633968.0, "step": 1970 }, { "entropy": 1.031227163411677, "epoch": 1.0366516133254795, "grad_norm": 1.5859375, "learning_rate": 2.7781274697184352e-05, "loss": 1.0339, "mean_token_accuracy": 0.7387025002390146, "num_tokens": 28778453.0, "step": 1980 }, { "entropy": 1.1043319918215275, "epoch": 1.041887558086262, "grad_norm": 2.1875, "learning_rate": 2.7554133617930394e-05, "loss": 1.1214, "mean_token_accuracy": 0.7271257348358631, "num_tokens": 28919232.0, "step": 1990 }, { "entropy": 1.0519304445013404, "epoch": 1.047123502847045, "grad_norm": 1.7421875, "learning_rate": 2.732677926925436e-05, "loss": 1.059, "mean_token_accuracy": 0.7360297767445445, "num_tokens": 29050240.0, "step": 2000 }, { "entropy": 1.074704316444695, "epoch": 1.0523594476078277, "grad_norm": 1.3203125, "learning_rate": 2.709923063517895e-05, "loss": 1.0769, "mean_token_accuracy": 0.7276826776564121, "num_tokens": 29198739.0, "step": 2010 }, { "entropy": 1.1091003093868494, "epoch": 1.0575953923686106, "grad_norm": 1.4765625, "learning_rate": 2.6871506715949606e-05, "loss": 1.1141, "mean_token_accuracy": 0.7292679946869611, "num_tokens": 29343130.0, "step": 2020 }, { "entropy": 1.1154426285997032, "epoch": 1.0628313371293934, "grad_norm": 1.625, "learning_rate": 2.664362652644806e-05, "loss": 1.1339, "mean_token_accuracy": 0.7209920965135097, "num_tokens": 29489679.0, "step": 2030 }, { "entropy": 1.0308820417150855, "epoch": 1.068067281890176, "grad_norm": 1.5234375, "learning_rate": 2.641560909460456e-05, "loss": 1.0344, "mean_token_accuracy": 0.737670699879527, "num_tokens": 29630133.0, "step": 2040 }, { "entropy": 1.1122422970831394, "epoch": 1.0733032266509588, "grad_norm": 1.515625, "learning_rate": 2.6187473459809043e-05, "loss": 1.1265, "mean_token_accuracy": 0.7212287154048681, "num_tokens": 29781663.0, "step": 2050 }, { "entropy": 1.0620404394343494, "epoch": 1.0785391714117416, "grad_norm": 1.5, "learning_rate": 2.595923867132136e-05, "loss": 1.0485, "mean_token_accuracy": 0.7329498503357172, "num_tokens": 29930945.0, "step": 2060 }, { "entropy": 1.0296688327565788, "epoch": 1.0837751161725244, "grad_norm": 1.6484375, "learning_rate": 2.573092378668067e-05, "loss": 1.0277, "mean_token_accuracy": 0.7400956619530916, "num_tokens": 30074139.0, "step": 2070 }, { "entropy": 1.071678783558309, "epoch": 1.0890110609333072, "grad_norm": 2.203125, "learning_rate": 2.5502547870114135e-05, "loss": 1.0915, "mean_token_accuracy": 0.7298048492521048, "num_tokens": 30216099.0, "step": 2080 }, { "entropy": 1.032330046594143, "epoch": 1.0942470056940898, "grad_norm": 2.203125, "learning_rate": 2.5274129990945067e-05, "loss": 1.0344, "mean_token_accuracy": 0.7430270429700613, "num_tokens": 30368171.0, "step": 2090 }, { "entropy": 1.074818222783506, "epoch": 1.0994829504548727, "grad_norm": 1.578125, "learning_rate": 2.504568922200064e-05, "loss": 1.0715, "mean_token_accuracy": 0.726781240105629, "num_tokens": 30510708.0, "step": 2100 }, { "entropy": 1.0945234788581728, "epoch": 1.1047188952156555, "grad_norm": 1.6171875, "learning_rate": 2.481724463801933e-05, "loss": 1.1047, "mean_token_accuracy": 0.7265350595116615, "num_tokens": 30662144.0, "step": 2110 }, { "entropy": 1.0475447980687023, "epoch": 1.1099548399764383, "grad_norm": 1.40625, "learning_rate": 2.4588815314058155e-05, "loss": 1.0611, "mean_token_accuracy": 0.7366870004683733, "num_tokens": 30810485.0, "step": 2120 }, { "entropy": 1.0811086906120182, "epoch": 1.115190784737221, "grad_norm": 1.5625, "learning_rate": 2.436042032389992e-05, "loss": 1.0993, "mean_token_accuracy": 0.7298943884670734, "num_tokens": 30963681.0, "step": 2130 }, { "entropy": 1.0684187861159444, "epoch": 1.1204267294980037, "grad_norm": 1.328125, "learning_rate": 2.4132078738460588e-05, "loss": 1.0798, "mean_token_accuracy": 0.7327737433835864, "num_tokens": 31116018.0, "step": 2140 }, { "entropy": 1.0509660685434938, "epoch": 1.1256626742587865, "grad_norm": 1.71875, "learning_rate": 2.3903809624196825e-05, "loss": 1.0373, "mean_token_accuracy": 0.73740617595613, "num_tokens": 31248950.0, "step": 2150 }, { "entropy": 1.0591240156441928, "epoch": 1.1308986190195693, "grad_norm": 1.6171875, "learning_rate": 2.3675632041513978e-05, "loss": 1.0743, "mean_token_accuracy": 0.7330824228003621, "num_tokens": 31386629.0, "step": 2160 }, { "entropy": 1.0668636929243802, "epoch": 1.1361345637803522, "grad_norm": 1.8828125, "learning_rate": 2.3447565043174533e-05, "loss": 1.0589, "mean_token_accuracy": 0.7297359511256218, "num_tokens": 31535751.0, "step": 2170 }, { "entropy": 1.0401808319613337, "epoch": 1.141370508541135, "grad_norm": 1.4609375, "learning_rate": 2.321962767270724e-05, "loss": 1.0402, "mean_token_accuracy": 0.7368248403072357, "num_tokens": 31676167.0, "step": 2180 }, { "entropy": 1.0865553246811033, "epoch": 1.1466064533019176, "grad_norm": 1.828125, "learning_rate": 2.299183896281692e-05, "loss": 1.1019, "mean_token_accuracy": 0.7275375993922353, "num_tokens": 31831724.0, "step": 2190 }, { "entropy": 1.0097376400604845, "epoch": 1.1518423980627004, "grad_norm": 1.5703125, "learning_rate": 2.27642179337953e-05, "loss": 1.0072, "mean_token_accuracy": 0.7466676604002714, "num_tokens": 31966434.0, "step": 2200 }, { "entropy": 1.086430662125349, "epoch": 1.1570783428234832, "grad_norm": 1.671875, "learning_rate": 2.2536783591932784e-05, "loss": 1.1011, "mean_token_accuracy": 0.7281720124185085, "num_tokens": 32110771.0, "step": 2210 }, { "entropy": 1.049152427725494, "epoch": 1.162314287584266, "grad_norm": 1.546875, "learning_rate": 2.2309554927931493e-05, "loss": 1.0408, "mean_token_accuracy": 0.7368430346250534, "num_tokens": 32251257.0, "step": 2220 }, { "entropy": 1.0445547673851252, "epoch": 1.1675502323450488, "grad_norm": 2.46875, "learning_rate": 2.208255091531947e-05, "loss": 1.0657, "mean_token_accuracy": 0.7318288933485746, "num_tokens": 32392959.0, "step": 2230 }, { "entropy": 1.075485266186297, "epoch": 1.1727861771058314, "grad_norm": 1.8125, "learning_rate": 2.1855790508866435e-05, "loss": 1.09, "mean_token_accuracy": 0.7345754325389862, "num_tokens": 32528087.0, "step": 2240 }, { "entropy": 1.041004289314151, "epoch": 1.1780221218666143, "grad_norm": 1.5390625, "learning_rate": 2.162929264300107e-05, "loss": 1.031, "mean_token_accuracy": 0.7372262746095657, "num_tokens": 32670677.0, "step": 2250 }, { "entropy": 1.053070001862943, "epoch": 1.183258066627397, "grad_norm": 2.0, "learning_rate": 2.1403076230230006e-05, "loss": 1.0635, "mean_token_accuracy": 0.7337488930672407, "num_tokens": 32817744.0, "step": 2260 }, { "entropy": 1.1205989433452488, "epoch": 1.18849401138818, "grad_norm": 1.5, "learning_rate": 2.11771601595586e-05, "loss": 1.1453, "mean_token_accuracy": 0.7223296284675598, "num_tokens": 32959765.0, "step": 2270 }, { "entropy": 1.0392343305051326, "epoch": 1.1937299561489627, "grad_norm": 1.5703125, "learning_rate": 2.0951563294913738e-05, "loss": 1.0358, "mean_token_accuracy": 0.7359498247504235, "num_tokens": 33097626.0, "step": 2280 }, { "entropy": 1.059504010900855, "epoch": 1.1989659009097453, "grad_norm": 1.984375, "learning_rate": 2.0726304473568693e-05, "loss": 1.081, "mean_token_accuracy": 0.735545065253973, "num_tokens": 33239983.0, "step": 2290 }, { "entropy": 1.0819577634334565, "epoch": 1.2042018456705281, "grad_norm": 1.6953125, "learning_rate": 2.0501402504570234e-05, "loss": 1.094, "mean_token_accuracy": 0.7268587298691273, "num_tokens": 33384613.0, "step": 2300 }, { "entropy": 1.0227667864412069, "epoch": 1.209437790431311, "grad_norm": 1.953125, "learning_rate": 2.0276876167168044e-05, "loss": 1.027, "mean_token_accuracy": 0.7428420815616846, "num_tokens": 33530796.0, "step": 2310 }, { "entropy": 1.0223766604438425, "epoch": 1.2146737351920938, "grad_norm": 1.640625, "learning_rate": 2.005274420924668e-05, "loss": 1.031, "mean_token_accuracy": 0.7396607849746942, "num_tokens": 33666957.0, "step": 2320 }, { "entropy": 1.0607954716309904, "epoch": 1.2199096799528766, "grad_norm": 1.4375, "learning_rate": 1.9829025345760124e-05, "loss": 1.071, "mean_token_accuracy": 0.7329499468207359, "num_tokens": 33812151.0, "step": 2330 }, { "entropy": 1.038625803217292, "epoch": 1.2251456247136592, "grad_norm": 1.6015625, "learning_rate": 1.960573825716911e-05, "loss": 1.0433, "mean_token_accuracy": 0.7345146417617798, "num_tokens": 33958267.0, "step": 2340 }, { "entropy": 1.0953392999246716, "epoch": 1.230381569474442, "grad_norm": 1.3515625, "learning_rate": 1.9382901587881275e-05, "loss": 1.0805, "mean_token_accuracy": 0.7263612521812319, "num_tokens": 34104819.0, "step": 2350 }, { "entropy": 1.0933478716760874, "epoch": 1.2356175142352248, "grad_norm": 1.6484375, "learning_rate": 1.9160533944694366e-05, "loss": 1.1009, "mean_token_accuracy": 0.7227355781942606, "num_tokens": 34238326.0, "step": 2360 }, { "entropy": 1.0345038840547205, "epoch": 1.2408534589960076, "grad_norm": 1.7578125, "learning_rate": 1.8938653895242604e-05, "loss": 1.0354, "mean_token_accuracy": 0.739068279415369, "num_tokens": 34376182.0, "step": 2370 }, { "entropy": 1.0739264035597444, "epoch": 1.2460894037567904, "grad_norm": 1.4140625, "learning_rate": 1.8717279966446267e-05, "loss": 1.0603, "mean_token_accuracy": 0.7294208355247974, "num_tokens": 34519957.0, "step": 2380 }, { "entropy": 1.0502849434502424, "epoch": 1.251325348517573, "grad_norm": 1.65625, "learning_rate": 1.8496430642964696e-05, "loss": 1.0985, "mean_token_accuracy": 0.7344448113813996, "num_tokens": 34674907.0, "step": 2390 }, { "entropy": 1.0143306592479349, "epoch": 1.2565612932783559, "grad_norm": 1.453125, "learning_rate": 1.827612436565286e-05, "loss": 1.0286, "mean_token_accuracy": 0.7477799784392118, "num_tokens": 34821308.0, "step": 2400 }, { "entropy": 1.0486345458775759, "epoch": 1.2617972380391387, "grad_norm": 1.6484375, "learning_rate": 1.8056379530021493e-05, "loss": 1.0472, "mean_token_accuracy": 0.7336918633431196, "num_tokens": 34965952.0, "step": 2410 }, { "entropy": 1.0630970790982246, "epoch": 1.2670331827999215, "grad_norm": 1.515625, "learning_rate": 1.7837214484701154e-05, "loss": 1.079, "mean_token_accuracy": 0.7314780931919813, "num_tokens": 35110373.0, "step": 2420 }, { "entropy": 1.0528906928375363, "epoch": 1.2722691275607043, "grad_norm": 1.6640625, "learning_rate": 1.7618647529910042e-05, "loss": 1.0483, "mean_token_accuracy": 0.7345411021262407, "num_tokens": 35258495.0, "step": 2430 }, { "entropy": 1.018369135260582, "epoch": 1.2775050723214871, "grad_norm": 1.71875, "learning_rate": 1.7400696915925996e-05, "loss": 1.0174, "mean_token_accuracy": 0.7416710961610079, "num_tokens": 35404823.0, "step": 2440 }, { "entropy": 1.0643933141604065, "epoch": 1.2827410170822697, "grad_norm": 1.59375, "learning_rate": 1.718338084156254e-05, "loss": 1.0795, "mean_token_accuracy": 0.7312329623848199, "num_tokens": 35555059.0, "step": 2450 }, { "entropy": 1.0236325599253178, "epoch": 1.2879769618430525, "grad_norm": 1.4921875, "learning_rate": 1.6966717452649373e-05, "loss": 1.0252, "mean_token_accuracy": 0.7439669661223889, "num_tokens": 35704746.0, "step": 2460 }, { "entropy": 1.0227021113038064, "epoch": 1.2932129066038354, "grad_norm": 1.3828125, "learning_rate": 1.67507248405171e-05, "loss": 1.0337, "mean_token_accuracy": 0.7387756012380123, "num_tokens": 35852823.0, "step": 2470 }, { "entropy": 1.1011345129460097, "epoch": 1.2984488513646182, "grad_norm": 1.59375, "learning_rate": 1.6535421040486686e-05, "loss": 1.0906, "mean_token_accuracy": 0.7245030000805854, "num_tokens": 36004677.0, "step": 2480 }, { "entropy": 1.0781516009941696, "epoch": 1.3036847961254008, "grad_norm": 1.7734375, "learning_rate": 1.6320824030363458e-05, "loss": 1.11, "mean_token_accuracy": 0.7256867518648505, "num_tokens": 36141496.0, "step": 2490 }, { "entropy": 1.0459418123587967, "epoch": 1.3089207408861836, "grad_norm": 1.5546875, "learning_rate": 1.6106951728936025e-05, "loss": 1.0554, "mean_token_accuracy": 0.734595287963748, "num_tokens": 36292155.0, "step": 2500 }, { "entropy": 1.0505487740039825, "epoch": 1.3141566856469664, "grad_norm": 1.703125, "learning_rate": 1.5893821994479995e-05, "loss": 1.0537, "mean_token_accuracy": 0.7354540932923556, "num_tokens": 36438145.0, "step": 2510 }, { "entropy": 1.0511848462745548, "epoch": 1.3193926304077492, "grad_norm": 1.5546875, "learning_rate": 1.5681452623266867e-05, "loss": 1.068, "mean_token_accuracy": 0.7358380068093539, "num_tokens": 36577572.0, "step": 2520 }, { "entropy": 1.0947185611352324, "epoch": 1.324628575168532, "grad_norm": 1.5859375, "learning_rate": 1.5469861348078014e-05, "loss": 1.1011, "mean_token_accuracy": 0.7275305841118097, "num_tokens": 36722937.0, "step": 2530 }, { "entropy": 1.0880089100450276, "epoch": 1.3298645199293149, "grad_norm": 1.703125, "learning_rate": 1.5259065836724033e-05, "loss": 1.0962, "mean_token_accuracy": 0.7264829911291599, "num_tokens": 36872221.0, "step": 2540 }, { "entropy": 1.0425203360617161, "epoch": 1.3351004646900975, "grad_norm": 1.5703125, "learning_rate": 1.5049083690569455e-05, "loss": 1.047, "mean_token_accuracy": 0.7343699801713228, "num_tokens": 37016594.0, "step": 2550 }, { "entropy": 1.0769597385078669, "epoch": 1.3403364094508803, "grad_norm": 1.6640625, "learning_rate": 1.4839932443063056e-05, "loss": 1.0833, "mean_token_accuracy": 0.7293191211298108, "num_tokens": 37165697.0, "step": 2560 }, { "entropy": 1.0958488559350372, "epoch": 1.345572354211663, "grad_norm": 1.453125, "learning_rate": 1.4631629558273801e-05, "loss": 1.1182, "mean_token_accuracy": 0.7267538867890835, "num_tokens": 37315842.0, "step": 2570 }, { "entropy": 1.0367282923310994, "epoch": 1.350808298972446, "grad_norm": 1.53125, "learning_rate": 1.4424192429432656e-05, "loss": 1.0612, "mean_token_accuracy": 0.736408182233572, "num_tokens": 37459644.0, "step": 2580 }, { "entropy": 1.0303277362138032, "epoch": 1.3560442437332285, "grad_norm": 1.671875, "learning_rate": 1.4217638377480158e-05, "loss": 1.0507, "mean_token_accuracy": 0.7396169764921069, "num_tokens": 37607638.0, "step": 2590 }, { "entropy": 1.089581909775734, "epoch": 1.3612801884940113, "grad_norm": 1.6875, "learning_rate": 1.4011984649620211e-05, "loss": 1.0904, "mean_token_accuracy": 0.7292284790426493, "num_tokens": 37754738.0, "step": 2600 }, { "entropy": 1.0630993578583001, "epoch": 1.3665161332547942, "grad_norm": 1.6328125, "learning_rate": 1.3807248417879895e-05, "loss": 1.0852, "mean_token_accuracy": 0.731200734898448, "num_tokens": 37894410.0, "step": 2610 }, { "entropy": 1.0982538178563117, "epoch": 1.371752078015577, "grad_norm": 1.7109375, "learning_rate": 1.3603446777675665e-05, "loss": 1.1, "mean_token_accuracy": 0.7262859750539065, "num_tokens": 38038356.0, "step": 2620 }, { "entropy": 1.0451888531446456, "epoch": 1.3769880227763598, "grad_norm": 2.03125, "learning_rate": 1.3400596746385815e-05, "loss": 1.0298, "mean_token_accuracy": 0.7341227237135172, "num_tokens": 38184874.0, "step": 2630 }, { "entropy": 1.0088561842218042, "epoch": 1.3822239675371426, "grad_norm": 1.53125, "learning_rate": 1.3198715261929586e-05, "loss": 0.9888, "mean_token_accuracy": 0.7419910099357366, "num_tokens": 38327082.0, "step": 2640 }, { "entropy": 1.062550875172019, "epoch": 1.3874599122979252, "grad_norm": 1.6015625, "learning_rate": 1.2997819181352822e-05, "loss": 1.0731, "mean_token_accuracy": 0.7318339478224516, "num_tokens": 38476686.0, "step": 2650 }, { "entropy": 1.066984947770834, "epoch": 1.392695857058708, "grad_norm": 1.4375, "learning_rate": 1.2797925279420453e-05, "loss": 1.0677, "mean_token_accuracy": 0.7307767707854509, "num_tokens": 38635267.0, "step": 2660 }, { "entropy": 1.08070537019521, "epoch": 1.3979318018194908, "grad_norm": 1.53125, "learning_rate": 1.2599050247215764e-05, "loss": 1.0787, "mean_token_accuracy": 0.7295586479827761, "num_tokens": 38782523.0, "step": 2670 }, { "entropy": 1.1134451285004616, "epoch": 1.4031677465802737, "grad_norm": 1.6015625, "learning_rate": 1.2401210690746703e-05, "loss": 1.1551, "mean_token_accuracy": 0.7224681507796049, "num_tokens": 38934863.0, "step": 2680 }, { "entropy": 1.006534701772034, "epoch": 1.4084036913410563, "grad_norm": 1.421875, "learning_rate": 1.2204423129559306e-05, "loss": 1.0215, "mean_token_accuracy": 0.7462513867765665, "num_tokens": 39088409.0, "step": 2690 }, { "entropy": 1.0482663962990046, "epoch": 1.413639636101839, "grad_norm": 1.578125, "learning_rate": 1.20087039953583e-05, "loss": 1.0421, "mean_token_accuracy": 0.7341501908376813, "num_tokens": 39236900.0, "step": 2700 }, { "entropy": 1.0507450453937053, "epoch": 1.4188755808626219, "grad_norm": 1.5390625, "learning_rate": 1.1814069630635068e-05, "loss": 1.0481, "mean_token_accuracy": 0.7352609395980835, "num_tokens": 39386493.0, "step": 2710 }, { "entropy": 1.0338343350216745, "epoch": 1.4241115256234047, "grad_norm": 1.4453125, "learning_rate": 1.1620536287303052e-05, "loss": 1.0533, "mean_token_accuracy": 0.7419755831360817, "num_tokens": 39541549.0, "step": 2720 }, { "entropy": 1.0351802745833993, "epoch": 1.4293474703841875, "grad_norm": 1.4375, "learning_rate": 1.1428120125340716e-05, "loss": 1.055, "mean_token_accuracy": 0.7351777728646993, "num_tokens": 39694300.0, "step": 2730 }, { "entropy": 1.0450334103778005, "epoch": 1.4345834151449703, "grad_norm": 1.546875, "learning_rate": 1.1236837211442231e-05, "loss": 1.0436, "mean_token_accuracy": 0.739201857149601, "num_tokens": 39833319.0, "step": 2740 }, { "entropy": 1.0565732188522816, "epoch": 1.439819359905753, "grad_norm": 1.6796875, "learning_rate": 1.1046703517675846e-05, "loss": 1.0437, "mean_token_accuracy": 0.7327644564211369, "num_tokens": 39977306.0, "step": 2750 }, { "entropy": 1.0406348885968328, "epoch": 1.4450553046665358, "grad_norm": 1.984375, "learning_rate": 1.085773492015028e-05, "loss": 1.0215, "mean_token_accuracy": 0.739136977866292, "num_tokens": 40117010.0, "step": 2760 }, { "entropy": 1.0673470385372639, "epoch": 1.4502912494273186, "grad_norm": 1.734375, "learning_rate": 1.0669947197689034e-05, "loss": 1.088, "mean_token_accuracy": 0.7327257882803678, "num_tokens": 40258345.0, "step": 2770 }, { "entropy": 1.108323130570352, "epoch": 1.4555271941881014, "grad_norm": 1.5703125, "learning_rate": 1.0483356030512912e-05, "loss": 1.0889, "mean_token_accuracy": 0.7247421193867922, "num_tokens": 40401612.0, "step": 2780 }, { "entropy": 1.0475279117003082, "epoch": 1.460763138948884, "grad_norm": 1.4609375, "learning_rate": 1.0297976998930664e-05, "loss": 1.0514, "mean_token_accuracy": 0.7359228234738111, "num_tokens": 40540992.0, "step": 2790 }, { "entropy": 1.0940048353746534, "epoch": 1.4659990837096668, "grad_norm": 1.6640625, "learning_rate": 1.0113825582038078e-05, "loss": 1.0891, "mean_token_accuracy": 0.7249374518170952, "num_tokens": 40681843.0, "step": 2800 }, { "entropy": 1.0349202129989863, "epoch": 1.4712350284704496, "grad_norm": 1.8203125, "learning_rate": 9.930917156425476e-06, "loss": 1.034, "mean_token_accuracy": 0.7375153541564942, "num_tokens": 40819621.0, "step": 2810 }, { "entropy": 1.0569802735000848, "epoch": 1.4764709732312324, "grad_norm": 1.671875, "learning_rate": 9.749266994893755e-06, "loss": 1.0714, "mean_token_accuracy": 0.7332708152011037, "num_tokens": 40963825.0, "step": 2820 }, { "entropy": 1.0443452363833785, "epoch": 1.4817069179920153, "grad_norm": 1.4609375, "learning_rate": 9.568890265179128e-06, "loss": 1.0527, "mean_token_accuracy": 0.7357666999101639, "num_tokens": 41115227.0, "step": 2830 }, { "entropy": 1.0533791413530706, "epoch": 1.486942862752798, "grad_norm": 1.7734375, "learning_rate": 9.389802028686617e-06, "loss": 1.0627, "mean_token_accuracy": 0.7320496127009392, "num_tokens": 41263145.0, "step": 2840 }, { "entropy": 1.0201024271547794, "epoch": 1.4921788075135807, "grad_norm": 1.6171875, "learning_rate": 9.212017239232425e-06, "loss": 1.0202, "mean_token_accuracy": 0.7403682049363851, "num_tokens": 41407284.0, "step": 2850 }, { "entropy": 1.072772230580449, "epoch": 1.4974147522743635, "grad_norm": 1.8125, "learning_rate": 9.03555074179533e-06, "loss": 1.0785, "mean_token_accuracy": 0.7321215584874153, "num_tokens": 41554005.0, "step": 2860 }, { "entropy": 1.0840026365593076, "epoch": 1.5026506970351463, "grad_norm": 1.6015625, "learning_rate": 8.860417271277066e-06, "loss": 1.0806, "mean_token_accuracy": 0.7338382225483656, "num_tokens": 41694676.0, "step": 2870 }, { "entropy": 1.0635898549109697, "epoch": 1.507886641795929, "grad_norm": 1.4375, "learning_rate": 8.68663145127203e-06, "loss": 1.0874, "mean_token_accuracy": 0.7339150093495845, "num_tokens": 41852022.0, "step": 2880 }, { "entropy": 1.0565571097657085, "epoch": 1.5131225865567117, "grad_norm": 1.7265625, "learning_rate": 8.514207792846169e-06, "loss": 1.0684, "mean_token_accuracy": 0.735099958628416, "num_tokens": 41997358.0, "step": 2890 }, { "entropy": 1.0264921691268682, "epoch": 1.5183585313174945, "grad_norm": 1.734375, "learning_rate": 8.343160693325355e-06, "loss": 1.0521, "mean_token_accuracy": 0.7389906920492649, "num_tokens": 42142495.0, "step": 2900 }, { "entropy": 1.0624020613729954, "epoch": 1.5235944760782774, "grad_norm": 1.5390625, "learning_rate": 8.173504435093174e-06, "loss": 1.0369, "mean_token_accuracy": 0.7302116710692644, "num_tokens": 42278534.0, "step": 2910 }, { "entropy": 1.066250941902399, "epoch": 1.5288304208390602, "grad_norm": 1.78125, "learning_rate": 8.005253184398359e-06, "loss": 1.0605, "mean_token_accuracy": 0.7321529988199472, "num_tokens": 42414588.0, "step": 2920 }, { "entropy": 1.0273714432492853, "epoch": 1.534066365599843, "grad_norm": 1.7890625, "learning_rate": 7.838420990171928e-06, "loss": 1.0421, "mean_token_accuracy": 0.7360200606286526, "num_tokens": 42558263.0, "step": 2930 }, { "entropy": 1.0550982277840375, "epoch": 1.5393023103606258, "grad_norm": 1.59375, "learning_rate": 7.673021782854084e-06, "loss": 1.0462, "mean_token_accuracy": 0.734701413474977, "num_tokens": 42699208.0, "step": 2940 }, { "entropy": 1.0459831846877932, "epoch": 1.5445382551214086, "grad_norm": 1.4609375, "learning_rate": 7.50906937323104e-06, "loss": 1.071, "mean_token_accuracy": 0.735860938206315, "num_tokens": 42843543.0, "step": 2950 }, { "entropy": 1.0140997383743524, "epoch": 1.5497741998821912, "grad_norm": 1.46875, "learning_rate": 7.346577451281822e-06, "loss": 1.0028, "mean_token_accuracy": 0.7434132274240255, "num_tokens": 42991376.0, "step": 2960 }, { "entropy": 1.0464172219857573, "epoch": 1.555010144642974, "grad_norm": 1.6484375, "learning_rate": 7.185559585035137e-06, "loss": 1.0408, "mean_token_accuracy": 0.7328146979212761, "num_tokens": 43138825.0, "step": 2970 }, { "entropy": 1.0702244764193893, "epoch": 1.5602460894037566, "grad_norm": 1.6953125, "learning_rate": 7.026029219436503e-06, "loss": 1.0959, "mean_token_accuracy": 0.732689993456006, "num_tokens": 43283731.0, "step": 2980 }, { "entropy": 1.0607844032347202, "epoch": 1.5654820341645395, "grad_norm": 1.328125, "learning_rate": 6.8679996752255224e-06, "loss": 1.0445, "mean_token_accuracy": 0.731873894110322, "num_tokens": 43438468.0, "step": 2990 }, { "entropy": 1.0623069098219275, "epoch": 1.5707179789253223, "grad_norm": 1.8984375, "learning_rate": 6.711484147823663e-06, "loss": 1.0859, "mean_token_accuracy": 0.7330159761011601, "num_tokens": 43586062.0, "step": 3000 }, { "entropy": 1.0011677112430335, "epoch": 1.575953923686105, "grad_norm": 1.7578125, "learning_rate": 6.556495706232412e-06, "loss": 1.004, "mean_token_accuracy": 0.7418603513389825, "num_tokens": 43729361.0, "step": 3010 }, { "entropy": 1.0570779686793685, "epoch": 1.581189868446888, "grad_norm": 1.7109375, "learning_rate": 6.403047291942057e-06, "loss": 1.0487, "mean_token_accuracy": 0.7325568657368422, "num_tokens": 43866642.0, "step": 3020 }, { "entropy": 1.075397195480764, "epoch": 1.5864258132076707, "grad_norm": 1.546875, "learning_rate": 6.251151717851023e-06, "loss": 1.0957, "mean_token_accuracy": 0.731479388475418, "num_tokens": 44012594.0, "step": 3030 }, { "entropy": 1.0650788258761168, "epoch": 1.5916617579684535, "grad_norm": 1.5703125, "learning_rate": 6.100821667196041e-06, "loss": 1.0668, "mean_token_accuracy": 0.7329257596284151, "num_tokens": 44156219.0, "step": 3040 }, { "entropy": 1.075978034362197, "epoch": 1.5968977027292364, "grad_norm": 1.6875, "learning_rate": 5.952069692493062e-06, "loss": 1.0978, "mean_token_accuracy": 0.7265279643237591, "num_tokens": 44300766.0, "step": 3050 }, { "entropy": 1.074189928546548, "epoch": 1.602133647490019, "grad_norm": 1.46875, "learning_rate": 5.80490821448918e-06, "loss": 1.0844, "mean_token_accuracy": 0.731312808021903, "num_tokens": 44457805.0, "step": 3060 }, { "entropy": 1.0550312519073486, "epoch": 1.6073695922508018, "grad_norm": 1.8671875, "learning_rate": 5.65934952112546e-06, "loss": 1.0639, "mean_token_accuracy": 0.7341483242809772, "num_tokens": 44602720.0, "step": 3070 }, { "entropy": 1.0775770872831345, "epoch": 1.6126055370115844, "grad_norm": 1.6171875, "learning_rate": 5.5154057665109e-06, "loss": 1.0682, "mean_token_accuracy": 0.7288901913911104, "num_tokens": 44754337.0, "step": 3080 }, { "entropy": 1.0554250160232186, "epoch": 1.6178414817723672, "grad_norm": 1.7265625, "learning_rate": 5.373088969907586e-06, "loss": 1.0413, "mean_token_accuracy": 0.7333483207970858, "num_tokens": 44901493.0, "step": 3090 }, { "entropy": 1.0390476867556573, "epoch": 1.62307742653315, "grad_norm": 1.6015625, "learning_rate": 5.23241101472709e-06, "loss": 1.0654, "mean_token_accuracy": 0.7387387953698635, "num_tokens": 45051644.0, "step": 3100 }, { "entropy": 1.0286037972196937, "epoch": 1.6283133712939328, "grad_norm": 1.9140625, "learning_rate": 5.09338364753818e-06, "loss": 1.0294, "mean_token_accuracy": 0.7419391922652722, "num_tokens": 45206269.0, "step": 3110 }, { "entropy": 1.0362283935770393, "epoch": 1.6335493160547156, "grad_norm": 1.3828125, "learning_rate": 4.956018477086005e-06, "loss": 1.0556, "mean_token_accuracy": 0.7357694737613201, "num_tokens": 45352368.0, "step": 3120 }, { "entropy": 1.0753269331529736, "epoch": 1.6387852608154985, "grad_norm": 1.421875, "learning_rate": 4.820326973322764e-06, "loss": 1.0746, "mean_token_accuracy": 0.7283272542059421, "num_tokens": 45499839.0, "step": 3130 }, { "entropy": 1.0553035859018565, "epoch": 1.6440212055762813, "grad_norm": 1.671875, "learning_rate": 4.686320466449981e-06, "loss": 1.1012, "mean_token_accuracy": 0.7353771705180406, "num_tokens": 45638917.0, "step": 3140 }, { "entropy": 1.0884598640725016, "epoch": 1.649257150337064, "grad_norm": 1.765625, "learning_rate": 4.554010145972417e-06, "loss": 1.1183, "mean_token_accuracy": 0.7301896862685681, "num_tokens": 45789557.0, "step": 3150 }, { "entropy": 1.01657194532454, "epoch": 1.6544930950978467, "grad_norm": 1.5625, "learning_rate": 4.423407059763745e-06, "loss": 1.0208, "mean_token_accuracy": 0.740845986828208, "num_tokens": 45932715.0, "step": 3160 }, { "entropy": 1.0645186068490147, "epoch": 1.6597290398586295, "grad_norm": 1.5, "learning_rate": 4.294522113144078e-06, "loss": 1.0814, "mean_token_accuracy": 0.7319622810930013, "num_tokens": 46085727.0, "step": 3170 }, { "entropy": 1.0936360348947347, "epoch": 1.6649649846194121, "grad_norm": 1.5703125, "learning_rate": 4.16736606796938e-06, "loss": 1.0958, "mean_token_accuracy": 0.7290900621563197, "num_tokens": 46237949.0, "step": 3180 }, { "entropy": 1.0574020750820636, "epoch": 1.670200929380195, "grad_norm": 1.5234375, "learning_rate": 4.041949541732826e-06, "loss": 1.0481, "mean_token_accuracy": 0.734756362810731, "num_tokens": 46383620.0, "step": 3190 }, { "entropy": 1.0825668659992516, "epoch": 1.6754368741409777, "grad_norm": 1.5859375, "learning_rate": 3.9182830066782614e-06, "loss": 1.0925, "mean_token_accuracy": 0.7287914883345366, "num_tokens": 46522204.0, "step": 3200 }, { "entropy": 1.0611087726429105, "epoch": 1.6806728189017606, "grad_norm": 1.5859375, "learning_rate": 3.7963767889257704e-06, "loss": 1.0773, "mean_token_accuracy": 0.7307531669735908, "num_tokens": 46664305.0, "step": 3210 }, { "entropy": 1.0411738075315953, "epoch": 1.6859087636625434, "grad_norm": 1.6484375, "learning_rate": 3.676241067609465e-06, "loss": 1.035, "mean_token_accuracy": 0.7382205333560705, "num_tokens": 46810702.0, "step": 3220 }, { "entropy": 1.023532929085195, "epoch": 1.6911447084233262, "grad_norm": 1.546875, "learning_rate": 3.5578858740274973e-06, "loss": 1.0167, "mean_token_accuracy": 0.743023120239377, "num_tokens": 46959451.0, "step": 3230 }, { "entropy": 1.092939823679626, "epoch": 1.696380653184109, "grad_norm": 2.015625, "learning_rate": 3.4413210908044696e-06, "loss": 1.1151, "mean_token_accuracy": 0.7277210278436541, "num_tokens": 47108206.0, "step": 3240 }, { "entropy": 1.07600337844342, "epoch": 1.7016165979448918, "grad_norm": 2.203125, "learning_rate": 3.3265564510662343e-06, "loss": 1.102, "mean_token_accuracy": 0.7284684276208282, "num_tokens": 47262079.0, "step": 3250 }, { "entropy": 1.0967259481549263, "epoch": 1.7068525427056744, "grad_norm": 1.4609375, "learning_rate": 3.213601537627195e-06, "loss": 1.1053, "mean_token_accuracy": 0.72360435500741, "num_tokens": 47414104.0, "step": 3260 }, { "entropy": 1.029846752807498, "epoch": 1.7120884874664573, "grad_norm": 2.125, "learning_rate": 3.102465782190106e-06, "loss": 1.0467, "mean_token_accuracy": 0.7382533248513937, "num_tokens": 47555455.0, "step": 3270 }, { "entropy": 1.090262323245406, "epoch": 1.7173244322272398, "grad_norm": 1.6484375, "learning_rate": 2.9931584645585654e-06, "loss": 1.0854, "mean_token_accuracy": 0.7280427444726228, "num_tokens": 47689549.0, "step": 3280 }, { "entropy": 1.0471955848857761, "epoch": 1.7225603769880227, "grad_norm": 1.7421875, "learning_rate": 2.8856887118621364e-06, "loss": 1.0303, "mean_token_accuracy": 0.7392646053805947, "num_tokens": 47828675.0, "step": 3290 }, { "entropy": 1.0756644216366111, "epoch": 1.7277963217488055, "grad_norm": 1.5390625, "learning_rate": 2.7800654977942488e-06, "loss": 1.0728, "mean_token_accuracy": 0.7287148278206587, "num_tokens": 47977441.0, "step": 3300 }, { "entropy": 1.0582644551992417, "epoch": 1.7330322665095883, "grad_norm": 1.4765625, "learning_rate": 2.676297641862879e-06, "loss": 1.051, "mean_token_accuracy": 0.7314374148845673, "num_tokens": 48130026.0, "step": 3310 }, { "entropy": 1.0880960457026958, "epoch": 1.7382682112703711, "grad_norm": 1.7890625, "learning_rate": 2.5743938086541354e-06, "loss": 1.1199, "mean_token_accuracy": 0.7294365499168635, "num_tokens": 48279615.0, "step": 3320 }, { "entropy": 1.0541712949052453, "epoch": 1.743504156031154, "grad_norm": 1.765625, "learning_rate": 2.4743625071087574e-06, "loss": 1.0614, "mean_token_accuracy": 0.7369356131181121, "num_tokens": 48435016.0, "step": 3330 }, { "entropy": 1.0605900973081588, "epoch": 1.7487401007919368, "grad_norm": 1.7109375, "learning_rate": 2.3762120898116498e-06, "loss": 1.0477, "mean_token_accuracy": 0.7361986979842186, "num_tokens": 48575627.0, "step": 3340 }, { "entropy": 1.1169460522010923, "epoch": 1.7539760455527196, "grad_norm": 1.6640625, "learning_rate": 2.2799507522944048e-06, "loss": 1.143, "mean_token_accuracy": 0.7201281778514386, "num_tokens": 48712687.0, "step": 3350 }, { "entropy": 1.0832444079220296, "epoch": 1.7592119903135022, "grad_norm": 1.671875, "learning_rate": 2.1855865323510055e-06, "loss": 1.0956, "mean_token_accuracy": 0.7290066111832857, "num_tokens": 48857330.0, "step": 3360 }, { "entropy": 1.0283724040724338, "epoch": 1.764447935074285, "grad_norm": 1.5859375, "learning_rate": 2.0931273093666575e-06, "loss": 1.0341, "mean_token_accuracy": 0.743080747872591, "num_tokens": 48998035.0, "step": 3370 }, { "entropy": 1.027001916244626, "epoch": 1.7696838798350676, "grad_norm": 1.71875, "learning_rate": 2.002580803659873e-06, "loss": 1.0041, "mean_token_accuracy": 0.7414408419281244, "num_tokens": 49139070.0, "step": 3380 }, { "entropy": 0.9976796295493842, "epoch": 1.7749198245958504, "grad_norm": 1.7421875, "learning_rate": 1.9139545758378256e-06, "loss": 0.9828, "mean_token_accuracy": 0.7511645819991827, "num_tokens": 49276032.0, "step": 3390 }, { "entropy": 1.0575186382979154, "epoch": 1.7801557693566332, "grad_norm": 1.6171875, "learning_rate": 1.8272560261650279e-06, "loss": 1.0549, "mean_token_accuracy": 0.7337985582649708, "num_tokens": 49412690.0, "step": 3400 }, { "entropy": 1.0142314087599515, "epoch": 1.785391714117416, "grad_norm": 1.53125, "learning_rate": 1.7424923939454273e-06, "loss": 1.0415, "mean_token_accuracy": 0.7428950823843479, "num_tokens": 49553666.0, "step": 3410 }, { "entropy": 1.0519614189863205, "epoch": 1.7906276588781989, "grad_norm": 1.5703125, "learning_rate": 1.6596707569179304e-06, "loss": 1.0709, "mean_token_accuracy": 0.7382104344666004, "num_tokens": 49704519.0, "step": 3420 }, { "entropy": 1.0671229269355536, "epoch": 1.7958636036389817, "grad_norm": 1.6015625, "learning_rate": 1.578798030665385e-06, "loss": 1.0751, "mean_token_accuracy": 0.7311564918607474, "num_tokens": 49843624.0, "step": 3430 }, { "entropy": 1.0378224339336157, "epoch": 1.8010995483997645, "grad_norm": 1.609375, "learning_rate": 1.499880968037165e-06, "loss": 1.0427, "mean_token_accuracy": 0.7399079620838165, "num_tokens": 49980104.0, "step": 3440 }, { "entropy": 1.1307296685874462, "epoch": 1.8063354931605473, "grad_norm": 1.4140625, "learning_rate": 1.4229261585852805e-06, "loss": 1.1653, "mean_token_accuracy": 0.7181130038574338, "num_tokens": 50135452.0, "step": 3450 }, { "entropy": 1.0895096741616725, "epoch": 1.81157143792133, "grad_norm": 1.578125, "learning_rate": 1.3479400280141884e-06, "loss": 1.106, "mean_token_accuracy": 0.7267964135855436, "num_tokens": 50285568.0, "step": 3460 }, { "entropy": 1.1072740200906992, "epoch": 1.8168073826821127, "grad_norm": 1.5703125, "learning_rate": 1.2749288376442043e-06, "loss": 1.1295, "mean_token_accuracy": 0.7238930713385343, "num_tokens": 50431109.0, "step": 3470 }, { "entropy": 1.0873282797634602, "epoch": 1.8220433274428953, "grad_norm": 1.8828125, "learning_rate": 1.203898683888713e-06, "loss": 1.1206, "mean_token_accuracy": 0.7258367579430341, "num_tokens": 50572278.0, "step": 3480 }, { "entropy": 1.074448931775987, "epoch": 1.8272792722036781, "grad_norm": 1.7890625, "learning_rate": 1.134855497745113e-06, "loss": 1.0828, "mean_token_accuracy": 0.728118471056223, "num_tokens": 50715803.0, "step": 3490 }, { "entropy": 1.0785529548302293, "epoch": 1.832515216964461, "grad_norm": 1.4453125, "learning_rate": 1.0678050442995801e-06, "loss": 1.0723, "mean_token_accuracy": 0.7296169890090823, "num_tokens": 50860742.0, "step": 3500 }, { "entropy": 1.070153540931642, "epoch": 1.8377511617252438, "grad_norm": 1.703125, "learning_rate": 1.0027529222456756e-06, "loss": 1.0884, "mean_token_accuracy": 0.7305432733148336, "num_tokens": 51006683.0, "step": 3510 }, { "entropy": 1.0611914629116654, "epoch": 1.8429871064860266, "grad_norm": 1.7109375, "learning_rate": 9.397045634168766e-07, "loss": 1.043, "mean_token_accuracy": 0.7299406290054321, "num_tokens": 51154218.0, "step": 3520 }, { "entropy": 1.0603390594944357, "epoch": 1.8482230512468094, "grad_norm": 1.5390625, "learning_rate": 8.78665232332998e-07, "loss": 1.0571, "mean_token_accuracy": 0.7298200543969869, "num_tokens": 51299870.0, "step": 3530 }, { "entropy": 1.0776327732950448, "epoch": 1.8534589960075922, "grad_norm": 1.625, "learning_rate": 8.196400257606207e-07, "loss": 1.1094, "mean_token_accuracy": 0.7307827772572637, "num_tokens": 51449416.0, "step": 3540 }, { "entropy": 1.1012815684080124, "epoch": 1.858694940768375, "grad_norm": 1.5390625, "learning_rate": 7.626338722875076e-07, "loss": 1.135, "mean_token_accuracy": 0.7262665273621678, "num_tokens": 51589483.0, "step": 3550 }, { "entropy": 1.0277717508375646, "epoch": 1.8639308855291576, "grad_norm": 1.8046875, "learning_rate": 7.076515319110688e-07, "loss": 1.0289, "mean_token_accuracy": 0.7407132972031831, "num_tokens": 51743472.0, "step": 3560 }, { "entropy": 0.9922656198963523, "epoch": 1.8691668302899405, "grad_norm": 1.9453125, "learning_rate": 6.54697595640899e-07, "loss": 0.9894, "mean_token_accuracy": 0.7477642957121133, "num_tokens": 51890190.0, "step": 3570 }, { "entropy": 1.08709951415658, "epoch": 1.874402775050723, "grad_norm": 1.671875, "learning_rate": 6.037764851154426e-07, "loss": 1.1142, "mean_token_accuracy": 0.7220855403691531, "num_tokens": 52036734.0, "step": 3580 }, { "entropy": 1.0564101081341506, "epoch": 1.8796387198115059, "grad_norm": 1.8125, "learning_rate": 5.548924522327747e-07, "loss": 1.0584, "mean_token_accuracy": 0.7324411410838365, "num_tokens": 52169348.0, "step": 3590 }, { "entropy": 1.0335981843993067, "epoch": 1.8848746645722887, "grad_norm": 1.6484375, "learning_rate": 5.080495787955691e-07, "loss": 1.0412, "mean_token_accuracy": 0.739568930119276, "num_tokens": 52311669.0, "step": 3600 }, { "entropy": 1.044294580630958, "epoch": 1.8901106093330715, "grad_norm": 1.59375, "learning_rate": 4.632517761702815e-07, "loss": 1.02, "mean_token_accuracy": 0.7376545470207929, "num_tokens": 52468463.0, "step": 3610 }, { "entropy": 1.0835541209205986, "epoch": 1.8953465540938543, "grad_norm": 1.5, "learning_rate": 4.2050278496053587e-07, "loss": 1.1012, "mean_token_accuracy": 0.729495657980442, "num_tokens": 52626044.0, "step": 3620 }, { "entropy": 1.0643612802028657, "epoch": 1.9005824988546371, "grad_norm": 1.6953125, "learning_rate": 3.7980617469479953e-07, "loss": 1.0706, "mean_token_accuracy": 0.7347536141052842, "num_tokens": 52772867.0, "step": 3630 }, { "entropy": 1.0667219148948788, "epoch": 1.90581844361542, "grad_norm": 1.71875, "learning_rate": 3.4116534352831576e-07, "loss": 1.0627, "mean_token_accuracy": 0.7293199263513088, "num_tokens": 52908004.0, "step": 3640 }, { "entropy": 1.135395216010511, "epoch": 1.9110543883762028, "grad_norm": 1.6171875, "learning_rate": 3.0458351795936703e-07, "loss": 1.1365, "mean_token_accuracy": 0.7202159762382507, "num_tokens": 53044041.0, "step": 3650 }, { "entropy": 1.0965589692816138, "epoch": 1.9162903331369854, "grad_norm": 1.8203125, "learning_rate": 2.7006375255985985e-07, "loss": 1.0917, "mean_token_accuracy": 0.7249834679067135, "num_tokens": 53186581.0, "step": 3660 }, { "entropy": 1.032194511592388, "epoch": 1.9215262778977682, "grad_norm": 2.1875, "learning_rate": 2.3760892972027328e-07, "loss": 1.0481, "mean_token_accuracy": 0.7402662597596645, "num_tokens": 53337558.0, "step": 3670 }, { "entropy": 1.1025461964309216, "epoch": 1.9267622226585508, "grad_norm": 1.46875, "learning_rate": 2.072217594089765e-07, "loss": 1.1038, "mean_token_accuracy": 0.7263487908989191, "num_tokens": 53474564.0, "step": 3680 }, { "entropy": 1.0789904015138745, "epoch": 1.9319981674193336, "grad_norm": 1.8671875, "learning_rate": 1.7890477894593748e-07, "loss": 1.0865, "mean_token_accuracy": 0.7280137140303851, "num_tokens": 53614875.0, "step": 3690 }, { "entropy": 1.0704231640323996, "epoch": 1.9372341121801164, "grad_norm": 2.15625, "learning_rate": 1.5266035279088708e-07, "loss": 1.0724, "mean_token_accuracy": 0.7300405781716108, "num_tokens": 53764538.0, "step": 3700 }, { "entropy": 1.0188416039571166, "epoch": 1.9424700569408992, "grad_norm": 1.515625, "learning_rate": 1.284906723458462e-07, "loss": 1.0245, "mean_token_accuracy": 0.738005406036973, "num_tokens": 53908241.0, "step": 3710 }, { "entropy": 1.033022477477789, "epoch": 1.947706001701682, "grad_norm": 1.6328125, "learning_rate": 1.0639775577218625e-07, "loss": 1.0267, "mean_token_accuracy": 0.7387443576008081, "num_tokens": 54051083.0, "step": 3720 }, { "entropy": 1.0919482603669166, "epoch": 1.9529419464624649, "grad_norm": 1.578125, "learning_rate": 8.638344782207486e-08, "loss": 1.0833, "mean_token_accuracy": 0.7268906071782112, "num_tokens": 54203892.0, "step": 3730 }, { "entropy": 1.0021182408556342, "epoch": 1.9581778912232477, "grad_norm": 1.734375, "learning_rate": 6.84494196844715e-08, "loss": 0.9942, "mean_token_accuracy": 0.7441616494208574, "num_tokens": 54355138.0, "step": 3740 }, { "entropy": 1.0863986648619175, "epoch": 1.9634138359840305, "grad_norm": 1.5703125, "learning_rate": 5.2597168845561206e-08, "loss": 1.0861, "mean_token_accuracy": 0.7288298228755593, "num_tokens": 54503031.0, "step": 3750 }, { "entropy": 1.0323819531127811, "epoch": 1.9686497807448131, "grad_norm": 1.578125, "learning_rate": 3.882801896372967e-08, "loss": 1.0337, "mean_token_accuracy": 0.7384911965578794, "num_tokens": 54656622.0, "step": 3760 }, { "entropy": 1.0538762006908655, "epoch": 1.973885725505596, "grad_norm": 1.6875, "learning_rate": 2.7143119759026613e-08, "loss": 1.0898, "mean_token_accuracy": 0.738424026966095, "num_tokens": 54815724.0, "step": 3770 }, { "entropy": 1.1061828639358282, "epoch": 1.9791216702663785, "grad_norm": 1.6953125, "learning_rate": 1.754344691717591e-08, "loss": 1.0847, "mean_token_accuracy": 0.7238223964348436, "num_tokens": 54962285.0, "step": 3780 }, { "entropy": 1.062331521883607, "epoch": 1.9843576150271613, "grad_norm": 2.125, "learning_rate": 1.0029802008096334e-08, "loss": 1.0895, "mean_token_accuracy": 0.7333987768739462, "num_tokens": 55112914.0, "step": 3790 }, { "entropy": 1.0460052080452442, "epoch": 1.9895935597879442, "grad_norm": 1.90625, "learning_rate": 4.602812418974534e-09, "loss": 1.0321, "mean_token_accuracy": 0.7373423630371689, "num_tokens": 55256115.0, "step": 3800 }, { "entropy": 1.0018283769488334, "epoch": 1.994829504548727, "grad_norm": 1.625, "learning_rate": 1.2629313018819311e-09, "loss": 0.9884, "mean_token_accuracy": 0.7454275876283646, "num_tokens": 55398525.0, "step": 3810 }, { "entropy": 1.0823518706462052, "epoch": 2.0, "grad_norm": 1.6875, "learning_rate": 1.0437535929996856e-11, "loss": 1.1044, "mean_token_accuracy": 0.7266742470143717, "num_tokens": 55553296.0, "step": 3820 }, { "epoch": 2.0, "eval_entropy": 1.0679908826947213, "eval_loss": 1.1749457120895386, "eval_mean_token_accuracy": 0.707340413838625, "eval_num_tokens": 55553296.0, "eval_runtime": 60.9411, "eval_samples_per_second": 32.819, "eval_steps_per_second": 16.409, "step": 3820 } ], "logging_steps": 10, "max_steps": 3820, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.9198789959923e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }