{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.880103137086378, "eval_steps": 500, "global_step": 11500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 7.6312949657440186, "epoch": 0.004297378599054577, "grad_norm": 0.94921875, "learning_rate": 2e-06, "loss": 7.384, "mean_token_accuracy": 0.09047168418765068, "num_tokens": 10107.0, "step": 5 }, { "entropy": 7.674387979507446, "epoch": 0.008594757198109154, "grad_norm": 1.1484375, "learning_rate": 4.5e-06, "loss": 7.3814, "mean_token_accuracy": 0.09915048182010651, "num_tokens": 18391.0, "step": 10 }, { "entropy": 7.658490705490112, "epoch": 0.01289213579716373, "grad_norm": 1.015625, "learning_rate": 7e-06, "loss": 7.4194, "mean_token_accuracy": 0.09372682273387908, "num_tokens": 27061.0, "step": 15 }, { "entropy": 7.6485553741455075, "epoch": 0.017189514396218308, "grad_norm": 1.09375, "learning_rate": 9.5e-06, "loss": 7.4387, "mean_token_accuracy": 0.09950413554906845, "num_tokens": 36339.0, "step": 20 }, { "entropy": 7.655299663543701, "epoch": 0.021486892995272882, "grad_norm": 0.95703125, "learning_rate": 1.2e-05, "loss": 7.4336, "mean_token_accuracy": 0.09199422970414162, "num_tokens": 45770.0, "step": 25 }, { "entropy": 7.707321071624756, "epoch": 0.02578427159432746, "grad_norm": 0.96875, "learning_rate": 1.4500000000000002e-05, "loss": 7.4406, "mean_token_accuracy": 0.09267855286598206, "num_tokens": 54575.0, "step": 30 }, { "entropy": 7.718957376480103, "epoch": 0.030081650193382038, "grad_norm": 0.97265625, "learning_rate": 1.7000000000000003e-05, "loss": 7.5222, "mean_token_accuracy": 0.08976790606975556, "num_tokens": 66403.0, "step": 35 }, { "entropy": 7.742082262039185, "epoch": 0.034379028792436615, "grad_norm": 0.87890625, "learning_rate": 1.95e-05, "loss": 7.4377, "mean_token_accuracy": 0.09164252653717994, "num_tokens": 76510.0, "step": 40 }, { "entropy": 7.745701646804809, "epoch": 0.03867640739149119, "grad_norm": 0.99609375, "learning_rate": 2.2e-05, "loss": 7.358, "mean_token_accuracy": 0.0955798089504242, "num_tokens": 84836.0, "step": 45 }, { "entropy": 7.780595874786377, "epoch": 0.042973785990545764, "grad_norm": 0.984375, "learning_rate": 2.4500000000000003e-05, "loss": 7.3289, "mean_token_accuracy": 0.10552914068102837, "num_tokens": 93197.0, "step": 50 }, { "entropy": 7.764179325103759, "epoch": 0.047271164589600345, "grad_norm": 0.98828125, "learning_rate": 2.7e-05, "loss": 7.3234, "mean_token_accuracy": 0.09917277097702026, "num_tokens": 101546.0, "step": 55 }, { "entropy": 7.719727945327759, "epoch": 0.05156854318865492, "grad_norm": 0.8515625, "learning_rate": 2.95e-05, "loss": 7.4172, "mean_token_accuracy": 0.0928034670650959, "num_tokens": 111703.0, "step": 60 }, { "entropy": 7.748228645324707, "epoch": 0.055865921787709494, "grad_norm": 0.95703125, "learning_rate": 3.2e-05, "loss": 7.3403, "mean_token_accuracy": 0.10037123262882233, "num_tokens": 119894.0, "step": 65 }, { "entropy": 7.714352416992187, "epoch": 0.060163300386764075, "grad_norm": 0.89453125, "learning_rate": 3.4500000000000005e-05, "loss": 7.2915, "mean_token_accuracy": 0.1022428810596466, "num_tokens": 128885.0, "step": 70 }, { "entropy": 7.679376173019409, "epoch": 0.06446067898581866, "grad_norm": 0.8984375, "learning_rate": 3.7e-05, "loss": 7.4226, "mean_token_accuracy": 0.0972097434103489, "num_tokens": 138106.0, "step": 75 }, { "entropy": 7.72790002822876, "epoch": 0.06875805758487323, "grad_norm": 1.140625, "learning_rate": 3.95e-05, "loss": 7.3294, "mean_token_accuracy": 0.1022751808166504, "num_tokens": 146691.0, "step": 80 }, { "entropy": 7.730126142501831, "epoch": 0.0730554361839278, "grad_norm": 0.99609375, "learning_rate": 4.2000000000000004e-05, "loss": 7.382, "mean_token_accuracy": 0.09973402544856072, "num_tokens": 155792.0, "step": 85 }, { "entropy": 7.727601718902588, "epoch": 0.07735281478298238, "grad_norm": 0.89453125, "learning_rate": 4.45e-05, "loss": 7.4474, "mean_token_accuracy": 0.08758748695254326, "num_tokens": 166944.0, "step": 90 }, { "entropy": 7.782265329360962, "epoch": 0.08165019338203695, "grad_norm": 0.98828125, "learning_rate": 4.7000000000000004e-05, "loss": 7.2886, "mean_token_accuracy": 0.1041356198489666, "num_tokens": 175303.0, "step": 95 }, { "entropy": 7.751953029632569, "epoch": 0.08594757198109153, "grad_norm": 1.0078125, "learning_rate": 4.9500000000000004e-05, "loss": 7.3403, "mean_token_accuracy": 0.09793160557746887, "num_tokens": 184708.0, "step": 100 }, { "entropy": 7.702822208404541, "epoch": 0.09024495058014612, "grad_norm": 0.921875, "learning_rate": 5.2e-05, "loss": 7.3117, "mean_token_accuracy": 0.09851032048463822, "num_tokens": 193835.0, "step": 105 }, { "entropy": 7.686660861968994, "epoch": 0.09454232917920069, "grad_norm": 1.1328125, "learning_rate": 5.45e-05, "loss": 7.3479, "mean_token_accuracy": 0.0979080393910408, "num_tokens": 203344.0, "step": 110 }, { "entropy": 7.698584461212159, "epoch": 0.09883970777825526, "grad_norm": 0.9296875, "learning_rate": 5.7e-05, "loss": 7.4586, "mean_token_accuracy": 0.09130895733833314, "num_tokens": 213048.0, "step": 115 }, { "entropy": 7.781258678436279, "epoch": 0.10313708637730984, "grad_norm": 1.109375, "learning_rate": 5.9499999999999996e-05, "loss": 7.3094, "mean_token_accuracy": 0.10353164449334144, "num_tokens": 221784.0, "step": 120 }, { "entropy": 7.650211572647095, "epoch": 0.10743446497636441, "grad_norm": 1.0078125, "learning_rate": 6.2e-05, "loss": 7.3189, "mean_token_accuracy": 0.09726176261901856, "num_tokens": 230971.0, "step": 125 }, { "entropy": 7.655170726776123, "epoch": 0.11173184357541899, "grad_norm": 0.96484375, "learning_rate": 6.450000000000001e-05, "loss": 7.2818, "mean_token_accuracy": 0.1042576052248478, "num_tokens": 240524.0, "step": 130 }, { "entropy": 7.7341550350189205, "epoch": 0.11602922217447358, "grad_norm": 0.88671875, "learning_rate": 6.7e-05, "loss": 7.2512, "mean_token_accuracy": 0.1007460281252861, "num_tokens": 249220.0, "step": 135 }, { "entropy": 7.745693302154541, "epoch": 0.12032660077352815, "grad_norm": 1.0234375, "learning_rate": 6.950000000000001e-05, "loss": 7.3688, "mean_token_accuracy": 0.10030856803059578, "num_tokens": 258934.0, "step": 140 }, { "entropy": 7.694993305206299, "epoch": 0.12462397937258272, "grad_norm": 1.0234375, "learning_rate": 7.2e-05, "loss": 7.2936, "mean_token_accuracy": 0.10321335718035698, "num_tokens": 267680.0, "step": 145 }, { "entropy": 7.719129991531372, "epoch": 0.1289213579716373, "grad_norm": 1.0078125, "learning_rate": 7.45e-05, "loss": 7.3236, "mean_token_accuracy": 0.10207543894648552, "num_tokens": 276227.0, "step": 150 }, { "entropy": 7.648375129699707, "epoch": 0.1332187365706919, "grad_norm": 0.94921875, "learning_rate": 7.7e-05, "loss": 7.2203, "mean_token_accuracy": 0.1059327855706215, "num_tokens": 286342.0, "step": 155 }, { "entropy": 7.674158382415771, "epoch": 0.13751611516974646, "grad_norm": 1.0625, "learning_rate": 7.950000000000001e-05, "loss": 7.2988, "mean_token_accuracy": 0.09665355160832405, "num_tokens": 294994.0, "step": 160 }, { "entropy": 7.717900514602661, "epoch": 0.14181349376880104, "grad_norm": 1.046875, "learning_rate": 8.2e-05, "loss": 7.2704, "mean_token_accuracy": 0.10349940955638885, "num_tokens": 303882.0, "step": 165 }, { "entropy": 7.6729988098144535, "epoch": 0.1461108723678556, "grad_norm": 0.9609375, "learning_rate": 8.450000000000001e-05, "loss": 7.3104, "mean_token_accuracy": 0.10128599181771278, "num_tokens": 312515.0, "step": 170 }, { "entropy": 7.739007139205933, "epoch": 0.15040825096691018, "grad_norm": 1.2109375, "learning_rate": 8.7e-05, "loss": 7.27, "mean_token_accuracy": 0.10081852003931999, "num_tokens": 320801.0, "step": 175 }, { "entropy": 7.720875406265259, "epoch": 0.15470562956596476, "grad_norm": 1.015625, "learning_rate": 8.95e-05, "loss": 7.2872, "mean_token_accuracy": 0.10100285485386848, "num_tokens": 329382.0, "step": 180 }, { "entropy": 7.66646089553833, "epoch": 0.15900300816501933, "grad_norm": 1.0390625, "learning_rate": 9.2e-05, "loss": 7.2814, "mean_token_accuracy": 0.1028428927063942, "num_tokens": 337894.0, "step": 185 }, { "entropy": 7.772510719299317, "epoch": 0.1633003867640739, "grad_norm": 1.125, "learning_rate": 9.45e-05, "loss": 7.2803, "mean_token_accuracy": 0.10378619506955147, "num_tokens": 346380.0, "step": 190 }, { "entropy": 7.690706968307495, "epoch": 0.16759776536312848, "grad_norm": 0.890625, "learning_rate": 9.7e-05, "loss": 7.3588, "mean_token_accuracy": 0.09733301475644111, "num_tokens": 356305.0, "step": 195 }, { "entropy": 7.79454927444458, "epoch": 0.17189514396218306, "grad_norm": 1.0078125, "learning_rate": 9.95e-05, "loss": 7.306, "mean_token_accuracy": 0.09683404862880707, "num_tokens": 364899.0, "step": 200 }, { "entropy": 7.694888687133789, "epoch": 0.17619252256123766, "grad_norm": 1.015625, "learning_rate": 0.000102, "loss": 7.2938, "mean_token_accuracy": 0.09810400977730752, "num_tokens": 373663.0, "step": 205 }, { "entropy": 7.748025798797608, "epoch": 0.18048990116029223, "grad_norm": 1.1640625, "learning_rate": 0.00010449999999999999, "loss": 7.2566, "mean_token_accuracy": 0.10043591782450675, "num_tokens": 382730.0, "step": 210 }, { "entropy": 7.706165361404419, "epoch": 0.1847872797593468, "grad_norm": 1.1328125, "learning_rate": 0.000107, "loss": 7.3157, "mean_token_accuracy": 0.09612104147672654, "num_tokens": 392676.0, "step": 215 }, { "entropy": 7.760982656478882, "epoch": 0.18908465835840138, "grad_norm": 1.2265625, "learning_rate": 0.0001095, "loss": 7.2955, "mean_token_accuracy": 0.10281639397144318, "num_tokens": 401050.0, "step": 220 }, { "entropy": 7.626513719558716, "epoch": 0.19338203695745596, "grad_norm": 1.078125, "learning_rate": 0.000112, "loss": 7.2692, "mean_token_accuracy": 0.10119878426194191, "num_tokens": 410009.0, "step": 225 }, { "entropy": 7.726489019393921, "epoch": 0.19767941555651053, "grad_norm": 0.98828125, "learning_rate": 0.0001145, "loss": 7.2683, "mean_token_accuracy": 0.10186234638094901, "num_tokens": 419302.0, "step": 230 }, { "entropy": 7.643717670440674, "epoch": 0.2019767941555651, "grad_norm": 1.109375, "learning_rate": 0.00011700000000000001, "loss": 7.1665, "mean_token_accuracy": 0.10647615045309067, "num_tokens": 427296.0, "step": 235 }, { "entropy": 7.666737127304077, "epoch": 0.20627417275461968, "grad_norm": 1.125, "learning_rate": 0.00011949999999999999, "loss": 7.3139, "mean_token_accuracy": 0.10131902173161507, "num_tokens": 436368.0, "step": 240 }, { "entropy": 7.772911167144775, "epoch": 0.21057155135367425, "grad_norm": 1.046875, "learning_rate": 0.000122, "loss": 7.2112, "mean_token_accuracy": 0.1055280588567257, "num_tokens": 445535.0, "step": 245 }, { "entropy": 7.602903366088867, "epoch": 0.21486892995272883, "grad_norm": 1.046875, "learning_rate": 0.0001245, "loss": 7.2153, "mean_token_accuracy": 0.10406075567007064, "num_tokens": 454769.0, "step": 250 }, { "entropy": 7.693030595779419, "epoch": 0.2191663085517834, "grad_norm": 1.125, "learning_rate": 0.000127, "loss": 7.2315, "mean_token_accuracy": 0.10270996242761612, "num_tokens": 463975.0, "step": 255 }, { "entropy": 7.637308835983276, "epoch": 0.22346368715083798, "grad_norm": 1.109375, "learning_rate": 0.0001295, "loss": 7.2542, "mean_token_accuracy": 0.10225536078214645, "num_tokens": 472899.0, "step": 260 }, { "entropy": 7.740519666671753, "epoch": 0.22776106574989258, "grad_norm": 1.09375, "learning_rate": 0.000132, "loss": 7.229, "mean_token_accuracy": 0.1005932256579399, "num_tokens": 481556.0, "step": 265 }, { "entropy": 7.654651689529419, "epoch": 0.23205844434894715, "grad_norm": 1.0625, "learning_rate": 0.00013450000000000002, "loss": 7.2258, "mean_token_accuracy": 0.10702893435955048, "num_tokens": 490253.0, "step": 270 }, { "entropy": 7.660864973068238, "epoch": 0.23635582294800173, "grad_norm": 1.2265625, "learning_rate": 0.00013700000000000002, "loss": 7.2451, "mean_token_accuracy": 0.10333684608340263, "num_tokens": 498444.0, "step": 275 }, { "entropy": 7.637535953521729, "epoch": 0.2406532015470563, "grad_norm": 0.98046875, "learning_rate": 0.0001395, "loss": 7.191, "mean_token_accuracy": 0.10794568434357643, "num_tokens": 508330.0, "step": 280 }, { "entropy": 7.6566917419433596, "epoch": 0.24495058014611087, "grad_norm": 1.234375, "learning_rate": 0.00014199999999999998, "loss": 7.3004, "mean_token_accuracy": 0.10417937636375427, "num_tokens": 517900.0, "step": 285 }, { "entropy": 7.670303010940552, "epoch": 0.24924795874516545, "grad_norm": 1.1484375, "learning_rate": 0.0001445, "loss": 7.2276, "mean_token_accuracy": 0.10308908969163895, "num_tokens": 527808.0, "step": 290 }, { "entropy": 7.719700765609741, "epoch": 0.25354533734422, "grad_norm": 1.1484375, "learning_rate": 0.000147, "loss": 7.2415, "mean_token_accuracy": 0.10010977610945701, "num_tokens": 536931.0, "step": 295 }, { "entropy": 7.668509387969971, "epoch": 0.2578427159432746, "grad_norm": 1.1796875, "learning_rate": 0.0001495, "loss": 7.279, "mean_token_accuracy": 0.10248880609869956, "num_tokens": 545758.0, "step": 300 }, { "entropy": 7.700217819213867, "epoch": 0.26214009454232917, "grad_norm": 1.0390625, "learning_rate": 0.000152, "loss": 7.2819, "mean_token_accuracy": 0.10198702886700631, "num_tokens": 555165.0, "step": 305 }, { "entropy": 7.6267822265625, "epoch": 0.2664374731413838, "grad_norm": 1.1171875, "learning_rate": 0.00015450000000000001, "loss": 7.2035, "mean_token_accuracy": 0.10117841735482216, "num_tokens": 564719.0, "step": 310 }, { "entropy": 7.646708202362061, "epoch": 0.2707348517404383, "grad_norm": 1.0859375, "learning_rate": 0.000157, "loss": 7.1638, "mean_token_accuracy": 0.10670615658164025, "num_tokens": 573572.0, "step": 315 }, { "entropy": 7.759027910232544, "epoch": 0.2750322303394929, "grad_norm": 1.3984375, "learning_rate": 0.0001595, "loss": 7.3476, "mean_token_accuracy": 0.10210367739200592, "num_tokens": 581497.0, "step": 320 }, { "entropy": 7.590592908859253, "epoch": 0.27932960893854747, "grad_norm": 1.125, "learning_rate": 0.000162, "loss": 7.2138, "mean_token_accuracy": 0.10664469674229622, "num_tokens": 591107.0, "step": 325 }, { "entropy": 7.70356388092041, "epoch": 0.28362698753760207, "grad_norm": 1.0546875, "learning_rate": 0.00016450000000000001, "loss": 7.2482, "mean_token_accuracy": 0.1050640620291233, "num_tokens": 600241.0, "step": 330 }, { "entropy": 7.639587259292602, "epoch": 0.2879243661366566, "grad_norm": 1.0703125, "learning_rate": 0.00016700000000000002, "loss": 7.161, "mean_token_accuracy": 0.1065776713192463, "num_tokens": 608697.0, "step": 335 }, { "entropy": 7.602131795883179, "epoch": 0.2922217447357112, "grad_norm": 1.1484375, "learning_rate": 0.00016950000000000003, "loss": 7.1698, "mean_token_accuracy": 0.1098954938352108, "num_tokens": 617275.0, "step": 340 }, { "entropy": 7.669042348861694, "epoch": 0.29651912333476577, "grad_norm": 1.0859375, "learning_rate": 0.00017199999999999998, "loss": 7.2602, "mean_token_accuracy": 0.1007254920899868, "num_tokens": 626644.0, "step": 345 }, { "entropy": 7.623440217971802, "epoch": 0.30081650193382037, "grad_norm": 1.1171875, "learning_rate": 0.00017449999999999999, "loss": 7.1639, "mean_token_accuracy": 0.1080157920718193, "num_tokens": 635110.0, "step": 350 }, { "entropy": 7.711002826690674, "epoch": 0.30511388053287497, "grad_norm": 0.97265625, "learning_rate": 0.000177, "loss": 7.3139, "mean_token_accuracy": 0.10216462090611458, "num_tokens": 644746.0, "step": 355 }, { "entropy": 7.708708238601685, "epoch": 0.3094112591319295, "grad_norm": 1.234375, "learning_rate": 0.0001795, "loss": 7.2216, "mean_token_accuracy": 0.1021303728222847, "num_tokens": 654281.0, "step": 360 }, { "entropy": 7.534019136428833, "epoch": 0.3137086377309841, "grad_norm": 1.234375, "learning_rate": 0.000182, "loss": 7.2333, "mean_token_accuracy": 0.10576817691326142, "num_tokens": 663174.0, "step": 365 }, { "entropy": 7.660452365875244, "epoch": 0.31800601633003867, "grad_norm": 1.0625, "learning_rate": 0.0001845, "loss": 7.1525, "mean_token_accuracy": 0.10541519671678543, "num_tokens": 672178.0, "step": 370 }, { "entropy": 7.651990938186645, "epoch": 0.32230339492909327, "grad_norm": 1.1484375, "learning_rate": 0.000187, "loss": 7.1748, "mean_token_accuracy": 0.10421534106135369, "num_tokens": 681323.0, "step": 375 }, { "entropy": 7.537337684631348, "epoch": 0.3266007735281478, "grad_norm": 0.98046875, "learning_rate": 0.0001895, "loss": 7.1001, "mean_token_accuracy": 0.11140918657183647, "num_tokens": 690461.0, "step": 380 }, { "entropy": 7.596573305130005, "epoch": 0.3308981521272024, "grad_norm": 1.2734375, "learning_rate": 0.000192, "loss": 7.1461, "mean_token_accuracy": 0.10594902262091636, "num_tokens": 699199.0, "step": 385 }, { "entropy": 7.566946506500244, "epoch": 0.33519553072625696, "grad_norm": 1.2265625, "learning_rate": 0.0001945, "loss": 7.109, "mean_token_accuracy": 0.11522968709468842, "num_tokens": 707949.0, "step": 390 }, { "entropy": 7.66830849647522, "epoch": 0.33949290932531156, "grad_norm": 1.15625, "learning_rate": 0.00019700000000000002, "loss": 7.1843, "mean_token_accuracy": 0.10416831225156784, "num_tokens": 715752.0, "step": 395 }, { "entropy": 7.619978666305542, "epoch": 0.3437902879243661, "grad_norm": 1.2734375, "learning_rate": 0.00019950000000000002, "loss": 7.1119, "mean_token_accuracy": 0.11198346018791198, "num_tokens": 724416.0, "step": 400 }, { "entropy": 7.594716548919678, "epoch": 0.3480876665234207, "grad_norm": 1.3203125, "learning_rate": 0.000202, "loss": 7.1774, "mean_token_accuracy": 0.10296614542603492, "num_tokens": 733116.0, "step": 405 }, { "entropy": 7.614369249343872, "epoch": 0.3523850451224753, "grad_norm": 1.265625, "learning_rate": 0.00020449999999999998, "loss": 7.1639, "mean_token_accuracy": 0.10737873241305351, "num_tokens": 742093.0, "step": 410 }, { "entropy": 7.532227945327759, "epoch": 0.35668242372152986, "grad_norm": 1.1640625, "learning_rate": 0.000207, "loss": 7.1385, "mean_token_accuracy": 0.11264142915606498, "num_tokens": 750402.0, "step": 415 }, { "entropy": 7.510246276855469, "epoch": 0.36097980232058446, "grad_norm": 1.0625, "learning_rate": 0.0002095, "loss": 7.1129, "mean_token_accuracy": 0.11108387559652329, "num_tokens": 760961.0, "step": 420 }, { "entropy": 7.720337963104248, "epoch": 0.365277180919639, "grad_norm": 1.171875, "learning_rate": 0.000212, "loss": 7.2042, "mean_token_accuracy": 0.10612902790307999, "num_tokens": 770554.0, "step": 425 }, { "entropy": 7.437310361862183, "epoch": 0.3695745595186936, "grad_norm": 1.328125, "learning_rate": 0.0002145, "loss": 7.1596, "mean_token_accuracy": 0.11299800872802734, "num_tokens": 779172.0, "step": 430 }, { "entropy": 7.663910818099976, "epoch": 0.37387193811774816, "grad_norm": 1.1953125, "learning_rate": 0.00021700000000000002, "loss": 7.2239, "mean_token_accuracy": 0.10290571823716163, "num_tokens": 788040.0, "step": 435 }, { "entropy": 7.589281415939331, "epoch": 0.37816931671680276, "grad_norm": 1.125, "learning_rate": 0.0002195, "loss": 7.1461, "mean_token_accuracy": 0.10722599253058433, "num_tokens": 796786.0, "step": 440 }, { "entropy": 7.543337059020996, "epoch": 0.3824666953158573, "grad_norm": 1.4296875, "learning_rate": 0.000222, "loss": 7.1192, "mean_token_accuracy": 0.10885161831974983, "num_tokens": 805520.0, "step": 445 }, { "entropy": 7.486078453063965, "epoch": 0.3867640739149119, "grad_norm": 1.3125, "learning_rate": 0.0002245, "loss": 7.074, "mean_token_accuracy": 0.10658745989203453, "num_tokens": 814939.0, "step": 450 }, { "entropy": 7.534557342529297, "epoch": 0.39106145251396646, "grad_norm": 1.2421875, "learning_rate": 0.00022700000000000002, "loss": 7.0766, "mean_token_accuracy": 0.11227057129144669, "num_tokens": 823862.0, "step": 455 }, { "entropy": 7.5476549625396725, "epoch": 0.39535883111302106, "grad_norm": 1.15625, "learning_rate": 0.00022950000000000002, "loss": 7.1124, "mean_token_accuracy": 0.10576009079813957, "num_tokens": 832820.0, "step": 460 }, { "entropy": 7.601094675064087, "epoch": 0.39965620971207566, "grad_norm": 1.234375, "learning_rate": 0.00023200000000000003, "loss": 7.0697, "mean_token_accuracy": 0.11121490225195885, "num_tokens": 841538.0, "step": 465 }, { "entropy": 7.544060945510864, "epoch": 0.4039535883111302, "grad_norm": 1.1953125, "learning_rate": 0.00023449999999999998, "loss": 7.2069, "mean_token_accuracy": 0.10181558132171631, "num_tokens": 851123.0, "step": 470 }, { "entropy": 7.549469089508056, "epoch": 0.4082509669101848, "grad_norm": 1.1875, "learning_rate": 0.000237, "loss": 7.1633, "mean_token_accuracy": 0.11091246008872986, "num_tokens": 860357.0, "step": 475 }, { "entropy": 7.547894096374511, "epoch": 0.41254834550923936, "grad_norm": 1.234375, "learning_rate": 0.0002395, "loss": 7.0874, "mean_token_accuracy": 0.10722309574484826, "num_tokens": 869980.0, "step": 480 }, { "entropy": 7.507503604888916, "epoch": 0.41684572410829396, "grad_norm": 1.2421875, "learning_rate": 0.000242, "loss": 7.0572, "mean_token_accuracy": 0.11242355704307556, "num_tokens": 878250.0, "step": 485 }, { "entropy": 7.5191121101379395, "epoch": 0.4211431027073485, "grad_norm": 1.125, "learning_rate": 0.0002445, "loss": 7.1411, "mean_token_accuracy": 0.11158529818058013, "num_tokens": 887624.0, "step": 490 }, { "entropy": 7.454204320907593, "epoch": 0.4254404813064031, "grad_norm": 1.1640625, "learning_rate": 0.000247, "loss": 7.1159, "mean_token_accuracy": 0.11260272860527039, "num_tokens": 897120.0, "step": 495 }, { "entropy": 7.495032835006714, "epoch": 0.42973785990545765, "grad_norm": 1.140625, "learning_rate": 0.0002495, "loss": 7.0795, "mean_token_accuracy": 0.11134620234370232, "num_tokens": 906215.0, "step": 500 }, { "epoch": 0.42973785990545765, "eval_entropy": 7.203803374960616, "eval_loss": 7.096514701843262, "eval_mean_token_accuracy": 0.11462040213649874, "eval_num_tokens": 906215.0, "eval_runtime": 2.0645, "eval_samples_per_second": 1719.022, "eval_steps_per_second": 215.059, "step": 500 }, { "entropy": 7.447824621200562, "epoch": 0.43403523850451226, "grad_norm": 1.15625, "learning_rate": 0.000252, "loss": 7.0811, "mean_token_accuracy": 0.1122453585267067, "num_tokens": 915181.0, "step": 505 }, { "entropy": 7.498021125793457, "epoch": 0.4383326171035668, "grad_norm": 1.328125, "learning_rate": 0.0002545, "loss": 7.1044, "mean_token_accuracy": 0.10958386138081551, "num_tokens": 924377.0, "step": 510 }, { "entropy": 7.607626008987427, "epoch": 0.4426299957026214, "grad_norm": 1.1796875, "learning_rate": 0.000257, "loss": 7.1944, "mean_token_accuracy": 0.10655399709939957, "num_tokens": 933114.0, "step": 515 }, { "entropy": 7.6139122486114506, "epoch": 0.44692737430167595, "grad_norm": 1.0625, "learning_rate": 0.0002595, "loss": 7.1453, "mean_token_accuracy": 0.11119715198874473, "num_tokens": 943306.0, "step": 520 }, { "entropy": 7.436026573181152, "epoch": 0.45122475290073055, "grad_norm": 1.2578125, "learning_rate": 0.000262, "loss": 7.0354, "mean_token_accuracy": 0.11904665902256965, "num_tokens": 951515.0, "step": 525 }, { "entropy": 7.494698238372803, "epoch": 0.45552213149978515, "grad_norm": 1.2578125, "learning_rate": 0.00026450000000000003, "loss": 7.1519, "mean_token_accuracy": 0.10504961535334587, "num_tokens": 962686.0, "step": 530 }, { "entropy": 7.572213172912598, "epoch": 0.4598195100988397, "grad_norm": 1.125, "learning_rate": 0.00026700000000000004, "loss": 7.1449, "mean_token_accuracy": 0.11348244249820709, "num_tokens": 972136.0, "step": 535 }, { "entropy": 7.405817127227783, "epoch": 0.4641168886978943, "grad_norm": 1.2734375, "learning_rate": 0.00026950000000000005, "loss": 7.0518, "mean_token_accuracy": 0.1100372053682804, "num_tokens": 981301.0, "step": 540 }, { "entropy": 7.484500360488892, "epoch": 0.46841426729694885, "grad_norm": 1.390625, "learning_rate": 0.00027200000000000005, "loss": 7.0823, "mean_token_accuracy": 0.1120329774916172, "num_tokens": 990360.0, "step": 545 }, { "entropy": 7.573296546936035, "epoch": 0.47271164589600345, "grad_norm": 1.21875, "learning_rate": 0.0002745, "loss": 7.1293, "mean_token_accuracy": 0.10760239511728287, "num_tokens": 999415.0, "step": 550 }, { "entropy": 7.419287919998169, "epoch": 0.477009024495058, "grad_norm": 1.0859375, "learning_rate": 0.000277, "loss": 7.057, "mean_token_accuracy": 0.10999582111835479, "num_tokens": 1008762.0, "step": 555 }, { "entropy": 7.44342451095581, "epoch": 0.4813064030941126, "grad_norm": 1.2890625, "learning_rate": 0.0002795, "loss": 7.0505, "mean_token_accuracy": 0.11702658385038375, "num_tokens": 1017704.0, "step": 560 }, { "entropy": 7.457871007919311, "epoch": 0.48560378169316715, "grad_norm": 1.234375, "learning_rate": 0.00028199999999999997, "loss": 7.018, "mean_token_accuracy": 0.11318592131137847, "num_tokens": 1026251.0, "step": 565 }, { "entropy": 7.356105470657349, "epoch": 0.48990116029222175, "grad_norm": 1.0859375, "learning_rate": 0.0002845, "loss": 7.0083, "mean_token_accuracy": 0.11355392187833786, "num_tokens": 1036191.0, "step": 570 }, { "entropy": 7.5119133472442625, "epoch": 0.4941985388912763, "grad_norm": 1.1953125, "learning_rate": 0.000287, "loss": 7.0501, "mean_token_accuracy": 0.11168754398822785, "num_tokens": 1044936.0, "step": 575 }, { "entropy": 7.406773805618286, "epoch": 0.4984959174903309, "grad_norm": 1.171875, "learning_rate": 0.0002895, "loss": 7.0476, "mean_token_accuracy": 0.1135815680027008, "num_tokens": 1053683.0, "step": 580 }, { "entropy": 7.3828895568847654, "epoch": 0.5027932960893855, "grad_norm": 1.15625, "learning_rate": 0.000292, "loss": 7.0283, "mean_token_accuracy": 0.11782724559307098, "num_tokens": 1062932.0, "step": 585 }, { "entropy": 7.4789910316467285, "epoch": 0.50709067468844, "grad_norm": 1.0859375, "learning_rate": 0.0002945, "loss": 7.0524, "mean_token_accuracy": 0.11150057762861251, "num_tokens": 1072313.0, "step": 590 }, { "entropy": 7.458136653900146, "epoch": 0.5113880532874946, "grad_norm": 1.078125, "learning_rate": 0.000297, "loss": 7.033, "mean_token_accuracy": 0.10738502442836761, "num_tokens": 1081675.0, "step": 595 }, { "entropy": 7.437460470199585, "epoch": 0.5156854318865493, "grad_norm": 1.1875, "learning_rate": 0.0002995, "loss": 7.0392, "mean_token_accuracy": 0.11078862249851226, "num_tokens": 1091541.0, "step": 600 }, { "entropy": 7.43347053527832, "epoch": 0.5199828104856038, "grad_norm": 1.1171875, "learning_rate": 0.000302, "loss": 7.0467, "mean_token_accuracy": 0.11545747444033623, "num_tokens": 1100724.0, "step": 605 }, { "entropy": 7.34070782661438, "epoch": 0.5242801890846583, "grad_norm": 1.265625, "learning_rate": 0.0003045, "loss": 7.0062, "mean_token_accuracy": 0.11681902781128883, "num_tokens": 1108869.0, "step": 610 }, { "entropy": 7.513333511352539, "epoch": 0.5285775676837129, "grad_norm": 1.2109375, "learning_rate": 0.000307, "loss": 7.0303, "mean_token_accuracy": 0.11391275599598885, "num_tokens": 1117314.0, "step": 615 }, { "entropy": 7.237616014480591, "epoch": 0.5328749462827675, "grad_norm": 1.1875, "learning_rate": 0.0003095, "loss": 6.969, "mean_token_accuracy": 0.11866867989301681, "num_tokens": 1126786.0, "step": 620 }, { "entropy": 7.403380393981934, "epoch": 0.5371723248818221, "grad_norm": 1.3515625, "learning_rate": 0.000312, "loss": 6.983, "mean_token_accuracy": 0.11322688534855843, "num_tokens": 1136013.0, "step": 625 }, { "entropy": 7.355997228622437, "epoch": 0.5414697034808766, "grad_norm": 1.15625, "learning_rate": 0.0003145, "loss": 7.0163, "mean_token_accuracy": 0.1159099243581295, "num_tokens": 1144970.0, "step": 630 }, { "entropy": 7.416441440582275, "epoch": 0.5457670820799312, "grad_norm": 1.3046875, "learning_rate": 0.000317, "loss": 6.9784, "mean_token_accuracy": 0.12343248203396798, "num_tokens": 1153810.0, "step": 635 }, { "entropy": 7.320913982391358, "epoch": 0.5500644606789858, "grad_norm": 1.234375, "learning_rate": 0.0003195, "loss": 6.96, "mean_token_accuracy": 0.11895549520850182, "num_tokens": 1162498.0, "step": 640 }, { "entropy": 7.383200359344483, "epoch": 0.5543618392780404, "grad_norm": 1.15625, "learning_rate": 0.000322, "loss": 7.0441, "mean_token_accuracy": 0.11171148270368576, "num_tokens": 1172091.0, "step": 645 }, { "entropy": 7.465569925308228, "epoch": 0.5586592178770949, "grad_norm": 1.1875, "learning_rate": 0.00032450000000000003, "loss": 7.0379, "mean_token_accuracy": 0.1126454509794712, "num_tokens": 1181400.0, "step": 650 }, { "entropy": 7.29718279838562, "epoch": 0.5629565964761496, "grad_norm": 1.3671875, "learning_rate": 0.00032700000000000003, "loss": 7.0066, "mean_token_accuracy": 0.11692977026104927, "num_tokens": 1189780.0, "step": 655 }, { "entropy": 7.376112461090088, "epoch": 0.5672539750752041, "grad_norm": 1.234375, "learning_rate": 0.00032950000000000004, "loss": 6.9708, "mean_token_accuracy": 0.11179102137684822, "num_tokens": 1198671.0, "step": 660 }, { "entropy": 7.406812715530395, "epoch": 0.5715513536742587, "grad_norm": 1.140625, "learning_rate": 0.00033200000000000005, "loss": 6.9887, "mean_token_accuracy": 0.11439693570137024, "num_tokens": 1207173.0, "step": 665 }, { "entropy": 7.267558336257935, "epoch": 0.5758487322733132, "grad_norm": 1.328125, "learning_rate": 0.00033450000000000005, "loss": 6.9252, "mean_token_accuracy": 0.11824023947119713, "num_tokens": 1216387.0, "step": 670 }, { "entropy": 7.466721105575561, "epoch": 0.5801461108723679, "grad_norm": 1.1640625, "learning_rate": 0.000337, "loss": 6.9093, "mean_token_accuracy": 0.11586858034133911, "num_tokens": 1224461.0, "step": 675 }, { "entropy": 7.260802936553955, "epoch": 0.5844434894714224, "grad_norm": 1.2265625, "learning_rate": 0.0003395, "loss": 6.9855, "mean_token_accuracy": 0.1176436722278595, "num_tokens": 1233774.0, "step": 680 }, { "entropy": 7.267514610290528, "epoch": 0.588740868070477, "grad_norm": 1.2109375, "learning_rate": 0.000342, "loss": 6.9319, "mean_token_accuracy": 0.12313097864389419, "num_tokens": 1242812.0, "step": 685 }, { "entropy": 7.451924133300781, "epoch": 0.5930382466695315, "grad_norm": 1.1640625, "learning_rate": 0.00034449999999999997, "loss": 7.0445, "mean_token_accuracy": 0.1125735655426979, "num_tokens": 1252872.0, "step": 690 }, { "entropy": 7.1216278076171875, "epoch": 0.5973356252685862, "grad_norm": 1.21875, "learning_rate": 0.000347, "loss": 6.8314, "mean_token_accuracy": 0.1210754469037056, "num_tokens": 1260852.0, "step": 695 }, { "entropy": 7.292500305175781, "epoch": 0.6016330038676407, "grad_norm": 1.21875, "learning_rate": 0.0003495, "loss": 6.9419, "mean_token_accuracy": 0.1167706459760666, "num_tokens": 1268925.0, "step": 700 }, { "entropy": 7.384844732284546, "epoch": 0.6059303824666953, "grad_norm": 1.1484375, "learning_rate": 0.000352, "loss": 6.9849, "mean_token_accuracy": 0.11300796419382095, "num_tokens": 1278994.0, "step": 705 }, { "entropy": 7.286926889419556, "epoch": 0.6102277610657499, "grad_norm": 1.1875, "learning_rate": 0.0003545, "loss": 6.9847, "mean_token_accuracy": 0.11259545534849166, "num_tokens": 1287698.0, "step": 710 }, { "entropy": 7.337662601470948, "epoch": 0.6145251396648045, "grad_norm": 1.125, "learning_rate": 0.000357, "loss": 6.9117, "mean_token_accuracy": 0.12028303518891334, "num_tokens": 1297475.0, "step": 715 }, { "entropy": 7.265739297866821, "epoch": 0.618822518263859, "grad_norm": 1.234375, "learning_rate": 0.0003595, "loss": 6.9558, "mean_token_accuracy": 0.11790136769413948, "num_tokens": 1306836.0, "step": 720 }, { "entropy": 7.3774675846099855, "epoch": 0.6231198968629136, "grad_norm": 1.140625, "learning_rate": 0.000362, "loss": 6.9932, "mean_token_accuracy": 0.11299360319972038, "num_tokens": 1315872.0, "step": 725 }, { "entropy": 7.3129335880279545, "epoch": 0.6274172754619682, "grad_norm": 1.28125, "learning_rate": 0.0003645, "loss": 6.9353, "mean_token_accuracy": 0.12453719973564148, "num_tokens": 1324624.0, "step": 730 }, { "entropy": 7.300215101242065, "epoch": 0.6317146540610228, "grad_norm": 1.34375, "learning_rate": 0.000367, "loss": 6.9246, "mean_token_accuracy": 0.12120431885123253, "num_tokens": 1333058.0, "step": 735 }, { "entropy": 7.065497016906738, "epoch": 0.6360120326600773, "grad_norm": 1.0703125, "learning_rate": 0.0003695, "loss": 6.8904, "mean_token_accuracy": 0.11625659838318825, "num_tokens": 1342376.0, "step": 740 }, { "entropy": 7.412401533126831, "epoch": 0.6403094112591319, "grad_norm": 1.2578125, "learning_rate": 0.000372, "loss": 6.9293, "mean_token_accuracy": 0.11268759667873382, "num_tokens": 1351386.0, "step": 745 }, { "entropy": 7.194233036041259, "epoch": 0.6446067898581865, "grad_norm": 1.3359375, "learning_rate": 0.0003745, "loss": 6.8338, "mean_token_accuracy": 0.12849506586790085, "num_tokens": 1358958.0, "step": 750 }, { "entropy": 7.3347986221313475, "epoch": 0.6489041684572411, "grad_norm": 1.2109375, "learning_rate": 0.000377, "loss": 6.988, "mean_token_accuracy": 0.11507417485117913, "num_tokens": 1368599.0, "step": 755 }, { "entropy": 7.380126667022705, "epoch": 0.6532015470562956, "grad_norm": 1.984375, "learning_rate": 0.0003795, "loss": 7.0127, "mean_token_accuracy": 0.111283528059721, "num_tokens": 1378529.0, "step": 760 }, { "entropy": 7.157611989974976, "epoch": 0.6574989256553503, "grad_norm": 1.3984375, "learning_rate": 0.000382, "loss": 6.8052, "mean_token_accuracy": 0.1265752285718918, "num_tokens": 1386993.0, "step": 765 }, { "entropy": 7.21686282157898, "epoch": 0.6617963042544048, "grad_norm": 1.4296875, "learning_rate": 0.0003845, "loss": 6.8936, "mean_token_accuracy": 0.12180712148547172, "num_tokens": 1395790.0, "step": 770 }, { "entropy": 7.166302919387817, "epoch": 0.6660936828534594, "grad_norm": 1.1875, "learning_rate": 0.00038700000000000003, "loss": 6.9063, "mean_token_accuracy": 0.11845313757658005, "num_tokens": 1405587.0, "step": 775 }, { "entropy": 7.20961365699768, "epoch": 0.6703910614525139, "grad_norm": 1.1875, "learning_rate": 0.00038950000000000003, "loss": 6.8702, "mean_token_accuracy": 0.12274195328354835, "num_tokens": 1414478.0, "step": 780 }, { "entropy": 7.319825458526611, "epoch": 0.6746884400515686, "grad_norm": 1.4296875, "learning_rate": 0.00039200000000000004, "loss": 6.9317, "mean_token_accuracy": 0.12083822339773179, "num_tokens": 1423791.0, "step": 785 }, { "entropy": 7.313541460037231, "epoch": 0.6789858186506231, "grad_norm": 1.328125, "learning_rate": 0.00039450000000000005, "loss": 6.975, "mean_token_accuracy": 0.11185284182429314, "num_tokens": 1432955.0, "step": 790 }, { "entropy": 7.242367315292358, "epoch": 0.6832831972496777, "grad_norm": 1.03125, "learning_rate": 0.00039700000000000005, "loss": 6.9394, "mean_token_accuracy": 0.11529579535126686, "num_tokens": 1441907.0, "step": 795 }, { "entropy": 7.173644304275513, "epoch": 0.6875805758487322, "grad_norm": 1.2734375, "learning_rate": 0.0003995, "loss": 6.8059, "mean_token_accuracy": 0.12198502644896507, "num_tokens": 1451062.0, "step": 800 }, { "entropy": 7.2840491771698, "epoch": 0.6918779544477869, "grad_norm": 1.109375, "learning_rate": 0.000402, "loss": 6.8894, "mean_token_accuracy": 0.11644295528531075, "num_tokens": 1460132.0, "step": 805 }, { "entropy": 7.085446500778199, "epoch": 0.6961753330468414, "grad_norm": 1.078125, "learning_rate": 0.0004045, "loss": 6.7896, "mean_token_accuracy": 0.12437586709856988, "num_tokens": 1469582.0, "step": 810 }, { "entropy": 7.180881690979004, "epoch": 0.700472711645896, "grad_norm": 1.4453125, "learning_rate": 0.00040699999999999997, "loss": 6.8844, "mean_token_accuracy": 0.11694586053490638, "num_tokens": 1479053.0, "step": 815 }, { "entropy": 7.176044559478759, "epoch": 0.7047700902449506, "grad_norm": 1.21875, "learning_rate": 0.0004095, "loss": 6.8874, "mean_token_accuracy": 0.11812442615628242, "num_tokens": 1488189.0, "step": 820 }, { "entropy": 7.071721315383911, "epoch": 0.7090674688440052, "grad_norm": 1.2578125, "learning_rate": 0.000412, "loss": 6.7495, "mean_token_accuracy": 0.12273769155144691, "num_tokens": 1497324.0, "step": 825 }, { "entropy": 7.243275499343872, "epoch": 0.7133648474430597, "grad_norm": 1.0546875, "learning_rate": 0.0004145, "loss": 6.8631, "mean_token_accuracy": 0.12297548577189446, "num_tokens": 1506543.0, "step": 830 }, { "entropy": 7.1102629661560055, "epoch": 0.7176622260421143, "grad_norm": 1.171875, "learning_rate": 0.000417, "loss": 6.8571, "mean_token_accuracy": 0.1257997862994671, "num_tokens": 1516737.0, "step": 835 }, { "entropy": 7.015081739425659, "epoch": 0.7219596046411689, "grad_norm": 1.1015625, "learning_rate": 0.0004195, "loss": 6.7311, "mean_token_accuracy": 0.12102818563580513, "num_tokens": 1525561.0, "step": 840 }, { "entropy": 7.17170901298523, "epoch": 0.7262569832402235, "grad_norm": 1.203125, "learning_rate": 0.000422, "loss": 6.757, "mean_token_accuracy": 0.12571127861738204, "num_tokens": 1533323.0, "step": 845 }, { "entropy": 7.173940944671631, "epoch": 0.730554361839278, "grad_norm": 1.2109375, "learning_rate": 0.0004245, "loss": 6.821, "mean_token_accuracy": 0.12750849053263663, "num_tokens": 1542632.0, "step": 850 }, { "entropy": 7.148316097259522, "epoch": 0.7348517404383326, "grad_norm": 1.296875, "learning_rate": 0.000427, "loss": 6.7649, "mean_token_accuracy": 0.12507490813732147, "num_tokens": 1551236.0, "step": 855 }, { "entropy": 6.981910467147827, "epoch": 0.7391491190373872, "grad_norm": 1.21875, "learning_rate": 0.0004295, "loss": 6.7641, "mean_token_accuracy": 0.12514904662966728, "num_tokens": 1559674.0, "step": 860 }, { "entropy": 7.186282157897949, "epoch": 0.7434464976364418, "grad_norm": 1.1484375, "learning_rate": 0.000432, "loss": 6.8498, "mean_token_accuracy": 0.1250532478094101, "num_tokens": 1569481.0, "step": 865 }, { "entropy": 7.118600702285766, "epoch": 0.7477438762354963, "grad_norm": 1.1796875, "learning_rate": 0.0004345, "loss": 6.8888, "mean_token_accuracy": 0.1209896370768547, "num_tokens": 1578488.0, "step": 870 }, { "entropy": 7.105226039886475, "epoch": 0.752041254834551, "grad_norm": 1.078125, "learning_rate": 0.000437, "loss": 6.7736, "mean_token_accuracy": 0.12527675032615662, "num_tokens": 1586675.0, "step": 875 }, { "entropy": 7.185068035125733, "epoch": 0.7563386334336055, "grad_norm": 1.1015625, "learning_rate": 0.0004395, "loss": 6.8782, "mean_token_accuracy": 0.1180253192782402, "num_tokens": 1595411.0, "step": 880 }, { "entropy": 7.179415893554688, "epoch": 0.7606360120326601, "grad_norm": 1.2734375, "learning_rate": 0.000442, "loss": 6.8619, "mean_token_accuracy": 0.12292847484350204, "num_tokens": 1604046.0, "step": 885 }, { "entropy": 7.130577564239502, "epoch": 0.7649333906317146, "grad_norm": 1.15625, "learning_rate": 0.0004445, "loss": 6.8566, "mean_token_accuracy": 0.11715829819440841, "num_tokens": 1613759.0, "step": 890 }, { "entropy": 7.111226511001587, "epoch": 0.7692307692307693, "grad_norm": 1.09375, "learning_rate": 0.000447, "loss": 6.8191, "mean_token_accuracy": 0.1252148814499378, "num_tokens": 1623323.0, "step": 895 }, { "entropy": 7.097943353652954, "epoch": 0.7735281478298238, "grad_norm": 1.21875, "learning_rate": 0.00044950000000000003, "loss": 6.7922, "mean_token_accuracy": 0.11943844705820084, "num_tokens": 1631727.0, "step": 900 }, { "entropy": 7.073408317565918, "epoch": 0.7778255264288784, "grad_norm": 1.21875, "learning_rate": 0.00045200000000000004, "loss": 6.7454, "mean_token_accuracy": 0.12582483813166617, "num_tokens": 1639544.0, "step": 905 }, { "entropy": 7.1905022144317625, "epoch": 0.7821229050279329, "grad_norm": 1.2421875, "learning_rate": 0.00045450000000000004, "loss": 6.8716, "mean_token_accuracy": 0.11673429310321808, "num_tokens": 1648931.0, "step": 910 }, { "entropy": 7.032827425003052, "epoch": 0.7864202836269876, "grad_norm": 1.140625, "learning_rate": 0.00045700000000000005, "loss": 6.7325, "mean_token_accuracy": 0.12737771049141883, "num_tokens": 1657688.0, "step": 915 }, { "entropy": 7.160619735717773, "epoch": 0.7907176622260421, "grad_norm": 1.0859375, "learning_rate": 0.00045950000000000006, "loss": 6.8191, "mean_token_accuracy": 0.11969996094703675, "num_tokens": 1666879.0, "step": 920 }, { "entropy": 7.016655492782593, "epoch": 0.7950150408250967, "grad_norm": 1.125, "learning_rate": 0.000462, "loss": 6.7912, "mean_token_accuracy": 0.12404834032058716, "num_tokens": 1676773.0, "step": 925 }, { "entropy": 7.205742454528808, "epoch": 0.7993124194241513, "grad_norm": 1.140625, "learning_rate": 0.0004645, "loss": 6.8942, "mean_token_accuracy": 0.11682869419455529, "num_tokens": 1686144.0, "step": 930 }, { "entropy": 7.093483018875122, "epoch": 0.8036097980232059, "grad_norm": 1.09375, "learning_rate": 0.000467, "loss": 6.8555, "mean_token_accuracy": 0.11735839322209358, "num_tokens": 1695476.0, "step": 935 }, { "entropy": 7.090408611297607, "epoch": 0.8079071766222604, "grad_norm": 1.1171875, "learning_rate": 0.0004695, "loss": 6.7525, "mean_token_accuracy": 0.12118161767721176, "num_tokens": 1704907.0, "step": 940 }, { "entropy": 7.016019344329834, "epoch": 0.812204555221315, "grad_norm": 1.0078125, "learning_rate": 0.000472, "loss": 6.7924, "mean_token_accuracy": 0.12617168575525284, "num_tokens": 1714564.0, "step": 945 }, { "entropy": 7.132166576385498, "epoch": 0.8165019338203696, "grad_norm": 1.1328125, "learning_rate": 0.0004745, "loss": 6.8135, "mean_token_accuracy": 0.12022659555077553, "num_tokens": 1725285.0, "step": 950 }, { "entropy": 7.00044469833374, "epoch": 0.8207993124194242, "grad_norm": 1.1015625, "learning_rate": 0.000477, "loss": 6.8177, "mean_token_accuracy": 0.12241263464093208, "num_tokens": 1734331.0, "step": 955 }, { "entropy": 7.126689529418945, "epoch": 0.8250966910184787, "grad_norm": 1.28125, "learning_rate": 0.0004795, "loss": 6.749, "mean_token_accuracy": 0.11530287116765976, "num_tokens": 1742340.0, "step": 960 }, { "entropy": 7.05500750541687, "epoch": 0.8293940696175333, "grad_norm": 1.15625, "learning_rate": 0.000482, "loss": 6.7383, "mean_token_accuracy": 0.12545244619250298, "num_tokens": 1751725.0, "step": 965 }, { "entropy": 6.894489717483521, "epoch": 0.8336914482165879, "grad_norm": 1.1796875, "learning_rate": 0.0004845, "loss": 6.6736, "mean_token_accuracy": 0.12856126353144645, "num_tokens": 1760294.0, "step": 970 }, { "entropy": 7.036704349517822, "epoch": 0.8379888268156425, "grad_norm": 1.0859375, "learning_rate": 0.000487, "loss": 6.7265, "mean_token_accuracy": 0.1231304183602333, "num_tokens": 1768912.0, "step": 975 }, { "entropy": 7.092654848098755, "epoch": 0.842286205414697, "grad_norm": 1.140625, "learning_rate": 0.0004895, "loss": 6.9187, "mean_token_accuracy": 0.12804483920335769, "num_tokens": 1778633.0, "step": 980 }, { "entropy": 7.090839195251465, "epoch": 0.8465835840137517, "grad_norm": 1.140625, "learning_rate": 0.000492, "loss": 6.7883, "mean_token_accuracy": 0.12408955544233322, "num_tokens": 1787275.0, "step": 985 }, { "entropy": 7.0695414543151855, "epoch": 0.8508809626128062, "grad_norm": 1.2734375, "learning_rate": 0.0004945, "loss": 6.7844, "mean_token_accuracy": 0.12348324134945869, "num_tokens": 1795994.0, "step": 990 }, { "entropy": 6.964667177200317, "epoch": 0.8551783412118608, "grad_norm": 0.94921875, "learning_rate": 0.000497, "loss": 6.7175, "mean_token_accuracy": 0.12602235972881318, "num_tokens": 1806379.0, "step": 995 }, { "entropy": 7.061655473709107, "epoch": 0.8594757198109153, "grad_norm": 1.09375, "learning_rate": 0.0004995, "loss": 6.7479, "mean_token_accuracy": 0.13024335727095604, "num_tokens": 1816135.0, "step": 1000 }, { "epoch": 0.8594757198109153, "eval_entropy": 6.75515693050247, "eval_loss": 6.752710819244385, "eval_mean_token_accuracy": 0.12811107195175445, "eval_num_tokens": 1816135.0, "eval_runtime": 2.0604, "eval_samples_per_second": 1722.442, "eval_steps_per_second": 215.487, "step": 1000 }, { "entropy": 6.9897054672241214, "epoch": 0.86377309840997, "grad_norm": 1.2890625, "learning_rate": 0.0004999998427807679, "loss": 6.7314, "mean_token_accuracy": 0.12282020673155784, "num_tokens": 1824777.0, "step": 1005 }, { "entropy": 6.925821113586426, "epoch": 0.8680704770090245, "grad_norm": 1.4296875, "learning_rate": 0.0004999992040780138, "loss": 6.8085, "mean_token_accuracy": 0.1247783549129963, "num_tokens": 1833807.0, "step": 1010 }, { "entropy": 7.123036670684814, "epoch": 0.8723678556080791, "grad_norm": 1.078125, "learning_rate": 0.0004999980740669294, "loss": 6.754, "mean_token_accuracy": 0.12499897480010987, "num_tokens": 1843375.0, "step": 1015 }, { "entropy": 7.027141857147217, "epoch": 0.8766652342071336, "grad_norm": 1.1796875, "learning_rate": 0.0004999964527499823, "loss": 6.8155, "mean_token_accuracy": 0.12067028507590294, "num_tokens": 1853036.0, "step": 1020 }, { "entropy": 7.018357038497925, "epoch": 0.8809626128061883, "grad_norm": 1.1328125, "learning_rate": 0.0004999943401307127, "loss": 6.7605, "mean_token_accuracy": 0.12497071847319603, "num_tokens": 1862041.0, "step": 1025 }, { "entropy": 6.984006929397583, "epoch": 0.8852599914052428, "grad_norm": 1.2421875, "learning_rate": 0.0004999917362137337, "loss": 6.6885, "mean_token_accuracy": 0.12735832259058952, "num_tokens": 1870707.0, "step": 1030 }, { "entropy": 6.964999151229859, "epoch": 0.8895573700042974, "grad_norm": 1.140625, "learning_rate": 0.0004999886410047312, "loss": 6.6849, "mean_token_accuracy": 0.12543184384703637, "num_tokens": 1879787.0, "step": 1035 }, { "entropy": 7.046022748947143, "epoch": 0.8938547486033519, "grad_norm": 1.1171875, "learning_rate": 0.0004999850545104638, "loss": 6.7336, "mean_token_accuracy": 0.12585699930787086, "num_tokens": 1889413.0, "step": 1040 }, { "entropy": 6.9450146675109865, "epoch": 0.8981521272024066, "grad_norm": 1.265625, "learning_rate": 0.0004999809767387633, "loss": 6.7291, "mean_token_accuracy": 0.12462790235877037, "num_tokens": 1898283.0, "step": 1045 }, { "entropy": 6.982704973220825, "epoch": 0.9024495058014611, "grad_norm": 1.109375, "learning_rate": 0.0004999764076985337, "loss": 6.7474, "mean_token_accuracy": 0.12953734770417213, "num_tokens": 1907175.0, "step": 1050 }, { "entropy": 6.947793340682983, "epoch": 0.9067468844005157, "grad_norm": 1.109375, "learning_rate": 0.0004999713473997519, "loss": 6.7933, "mean_token_accuracy": 0.12337937280535698, "num_tokens": 1918223.0, "step": 1055 }, { "entropy": 7.053569555282593, "epoch": 0.9110442629995703, "grad_norm": 1.109375, "learning_rate": 0.0004999657958534677, "loss": 6.7435, "mean_token_accuracy": 0.11936211958527565, "num_tokens": 1928801.0, "step": 1060 }, { "entropy": 6.874362564086914, "epoch": 0.9153416415986249, "grad_norm": 1.1171875, "learning_rate": 0.0004999597530718034, "loss": 6.7076, "mean_token_accuracy": 0.12535862401127815, "num_tokens": 1937406.0, "step": 1065 }, { "entropy": 6.924251508712769, "epoch": 0.9196390201976794, "grad_norm": 1.1171875, "learning_rate": 0.000499953219067954, "loss": 6.7025, "mean_token_accuracy": 0.12463184967637062, "num_tokens": 1947184.0, "step": 1070 }, { "entropy": 7.056308698654175, "epoch": 0.923936398796734, "grad_norm": 1.15625, "learning_rate": 0.0004999461938561873, "loss": 6.7241, "mean_token_accuracy": 0.12476856112480164, "num_tokens": 1956293.0, "step": 1075 }, { "entropy": 6.90220274925232, "epoch": 0.9282337773957886, "grad_norm": 1.1328125, "learning_rate": 0.0004999386774518432, "loss": 6.6968, "mean_token_accuracy": 0.12625648751854895, "num_tokens": 1964791.0, "step": 1080 }, { "entropy": 6.965981435775757, "epoch": 0.9325311559948432, "grad_norm": 1.0546875, "learning_rate": 0.0004999306698713349, "loss": 6.616, "mean_token_accuracy": 0.12837354317307473, "num_tokens": 1973754.0, "step": 1085 }, { "entropy": 6.929974555969238, "epoch": 0.9368285345938977, "grad_norm": 1.1015625, "learning_rate": 0.0004999221711321477, "loss": 6.6857, "mean_token_accuracy": 0.12695353776216506, "num_tokens": 1983035.0, "step": 1090 }, { "entropy": 6.804391956329345, "epoch": 0.9411259131929522, "grad_norm": 1.0859375, "learning_rate": 0.0004999131812528393, "loss": 6.7126, "mean_token_accuracy": 0.12742481231689454, "num_tokens": 1992584.0, "step": 1095 }, { "entropy": 7.0129533290863035, "epoch": 0.9454232917920069, "grad_norm": 0.94140625, "learning_rate": 0.00049990370025304, "loss": 6.745, "mean_token_accuracy": 0.1250165306031704, "num_tokens": 2001876.0, "step": 1100 }, { "entropy": 6.9361108303070065, "epoch": 0.9497206703910615, "grad_norm": 1.015625, "learning_rate": 0.0004998937281534526, "loss": 6.6354, "mean_token_accuracy": 0.1352070689201355, "num_tokens": 2011067.0, "step": 1105 }, { "entropy": 7.00281867980957, "epoch": 0.954018048990116, "grad_norm": 1.140625, "learning_rate": 0.0004998832649758521, "loss": 6.7191, "mean_token_accuracy": 0.12910578772425652, "num_tokens": 2020763.0, "step": 1110 }, { "entropy": 6.846075534820557, "epoch": 0.9583154275891707, "grad_norm": 1.2421875, "learning_rate": 0.0004998723107430862, "loss": 6.702, "mean_token_accuracy": 0.12597106099128724, "num_tokens": 2029534.0, "step": 1115 }, { "entropy": 6.979312801361084, "epoch": 0.9626128061882252, "grad_norm": 1.109375, "learning_rate": 0.0004998608654790741, "loss": 6.6576, "mean_token_accuracy": 0.12685178518295287, "num_tokens": 2039143.0, "step": 1120 }, { "entropy": 6.840395832061768, "epoch": 0.9669101847872797, "grad_norm": 1.1953125, "learning_rate": 0.000499848929208808, "loss": 6.619, "mean_token_accuracy": 0.13090287074446677, "num_tokens": 2048253.0, "step": 1125 }, { "entropy": 6.833210182189942, "epoch": 0.9712075633863343, "grad_norm": 1.234375, "learning_rate": 0.0004998365019583519, "loss": 6.6747, "mean_token_accuracy": 0.13630941957235337, "num_tokens": 2057234.0, "step": 1130 }, { "entropy": 7.008919525146484, "epoch": 0.975504941985389, "grad_norm": 1.203125, "learning_rate": 0.0004998235837548417, "loss": 6.7058, "mean_token_accuracy": 0.12927891165018082, "num_tokens": 2065431.0, "step": 1135 }, { "entropy": 6.887974071502685, "epoch": 0.9798023205844435, "grad_norm": 1.1015625, "learning_rate": 0.000499810174626486, "loss": 6.7146, "mean_token_accuracy": 0.1267981804907322, "num_tokens": 2074723.0, "step": 1140 }, { "entropy": 6.909135150909424, "epoch": 0.984099699183498, "grad_norm": 1.2265625, "learning_rate": 0.0004997962746025646, "loss": 6.5835, "mean_token_accuracy": 0.13582983165979384, "num_tokens": 2084509.0, "step": 1145 }, { "entropy": 6.8790112972259525, "epoch": 0.9883970777825526, "grad_norm": 1.1875, "learning_rate": 0.0004997818837134298, "loss": 6.7192, "mean_token_accuracy": 0.13046733066439628, "num_tokens": 2093110.0, "step": 1150 }, { "entropy": 6.820547676086425, "epoch": 0.9926944563816072, "grad_norm": 1.1484375, "learning_rate": 0.0004997670019905057, "loss": 6.5939, "mean_token_accuracy": 0.12773325443267822, "num_tokens": 2102355.0, "step": 1155 }, { "entropy": 6.849571800231933, "epoch": 0.9969918349806618, "grad_norm": 1.2109375, "learning_rate": 0.0004997516294662876, "loss": 6.6207, "mean_token_accuracy": 0.1278907351195812, "num_tokens": 2110418.0, "step": 1160 }, { "entropy": 6.932281441158718, "epoch": 1.0008594757198108, "grad_norm": 1.1796875, "learning_rate": 0.0004997357661743433, "loss": 6.6076, "mean_token_accuracy": 0.13429299659199184, "num_tokens": 2117866.0, "step": 1165 }, { "entropy": 6.776707983016967, "epoch": 1.0051568543188656, "grad_norm": 1.1171875, "learning_rate": 0.0004997194121493118, "loss": 6.4353, "mean_token_accuracy": 0.14019777849316598, "num_tokens": 2126082.0, "step": 1170 }, { "entropy": 6.887734413146973, "epoch": 1.0094542329179201, "grad_norm": 1.0859375, "learning_rate": 0.0004997025674269037, "loss": 6.4211, "mean_token_accuracy": 0.13955733701586723, "num_tokens": 2134042.0, "step": 1175 }, { "entropy": 6.774314117431641, "epoch": 1.0137516115169747, "grad_norm": 1.2109375, "learning_rate": 0.0004996852320439013, "loss": 6.4895, "mean_token_accuracy": 0.13937605321407318, "num_tokens": 2142570.0, "step": 1180 }, { "entropy": 6.8031017780303955, "epoch": 1.0180489901160292, "grad_norm": 1.015625, "learning_rate": 0.0004996674060381578, "loss": 6.4187, "mean_token_accuracy": 0.13786159604787826, "num_tokens": 2151310.0, "step": 1185 }, { "entropy": 6.884524583816528, "epoch": 1.0223463687150838, "grad_norm": 1.2109375, "learning_rate": 0.0004996490894485985, "loss": 6.4993, "mean_token_accuracy": 0.1331610009074211, "num_tokens": 2160662.0, "step": 1190 }, { "entropy": 6.801689147949219, "epoch": 1.0266437473141383, "grad_norm": 1.1484375, "learning_rate": 0.0004996302823152193, "loss": 6.445, "mean_token_accuracy": 0.13591438457369803, "num_tokens": 2170067.0, "step": 1195 }, { "entropy": 6.76284008026123, "epoch": 1.0309411259131929, "grad_norm": 1.15625, "learning_rate": 0.0004996109846790873, "loss": 6.4084, "mean_token_accuracy": 0.14033972024917601, "num_tokens": 2178850.0, "step": 1200 }, { "entropy": 6.71863865852356, "epoch": 1.0352385045122476, "grad_norm": 1.0, "learning_rate": 0.0004995911965823412, "loss": 6.4263, "mean_token_accuracy": 0.1453915849328041, "num_tokens": 2188307.0, "step": 1205 }, { "entropy": 6.847736549377442, "epoch": 1.0395358831113022, "grad_norm": 1.21875, "learning_rate": 0.0004995709180681899, "loss": 6.4144, "mean_token_accuracy": 0.1416982263326645, "num_tokens": 2197026.0, "step": 1210 }, { "entropy": 6.729686546325683, "epoch": 1.0438332617103567, "grad_norm": 1.125, "learning_rate": 0.000499550149180914, "loss": 6.4003, "mean_token_accuracy": 0.13990466818213462, "num_tokens": 2205537.0, "step": 1215 }, { "entropy": 6.780020618438721, "epoch": 1.0481306403094113, "grad_norm": 1.15625, "learning_rate": 0.0004995288899658641, "loss": 6.4298, "mean_token_accuracy": 0.1448238343000412, "num_tokens": 2214508.0, "step": 1220 }, { "entropy": 6.842759847640991, "epoch": 1.0524280189084658, "grad_norm": 1.171875, "learning_rate": 0.0004995071404694619, "loss": 6.5391, "mean_token_accuracy": 0.1354886084794998, "num_tokens": 2223084.0, "step": 1225 }, { "entropy": 6.7924669742584225, "epoch": 1.0567253975075204, "grad_norm": 1.078125, "learning_rate": 0.0004994849007391996, "loss": 6.4679, "mean_token_accuracy": 0.13138427063822747, "num_tokens": 2231406.0, "step": 1230 }, { "entropy": 6.731750345230102, "epoch": 1.061022776106575, "grad_norm": 1.1328125, "learning_rate": 0.0004994621708236401, "loss": 6.3805, "mean_token_accuracy": 0.14119497835636138, "num_tokens": 2239867.0, "step": 1235 }, { "entropy": 6.745153379440308, "epoch": 1.0653201547056295, "grad_norm": 1.2265625, "learning_rate": 0.000499438950772416, "loss": 6.4467, "mean_token_accuracy": 0.1372622825205326, "num_tokens": 2248844.0, "step": 1240 }, { "entropy": 6.710582876205445, "epoch": 1.0696175333046842, "grad_norm": 1.078125, "learning_rate": 0.0004994152406362311, "loss": 6.3633, "mean_token_accuracy": 0.14102791994810104, "num_tokens": 2257599.0, "step": 1245 }, { "entropy": 6.773756074905395, "epoch": 1.0739149119037388, "grad_norm": 1.296875, "learning_rate": 0.0004993910404668586, "loss": 6.418, "mean_token_accuracy": 0.13638516888022423, "num_tokens": 2266510.0, "step": 1250 }, { "entropy": 6.720381832122802, "epoch": 1.0782122905027933, "grad_norm": 1.03125, "learning_rate": 0.000499366350317142, "loss": 6.4145, "mean_token_accuracy": 0.1418795846402645, "num_tokens": 2275462.0, "step": 1255 }, { "entropy": 6.712311601638794, "epoch": 1.0825096691018479, "grad_norm": 1.15625, "learning_rate": 0.0004993411702409948, "loss": 6.3874, "mean_token_accuracy": 0.1354715533554554, "num_tokens": 2283826.0, "step": 1260 }, { "entropy": 6.76007399559021, "epoch": 1.0868070477009024, "grad_norm": 1.3203125, "learning_rate": 0.0004993155002934002, "loss": 6.3997, "mean_token_accuracy": 0.13856483697891236, "num_tokens": 2292967.0, "step": 1265 }, { "entropy": 6.8389280319213865, "epoch": 1.091104426299957, "grad_norm": 1.7109375, "learning_rate": 0.0004992893405304111, "loss": 6.5262, "mean_token_accuracy": 0.13781826868653296, "num_tokens": 2302336.0, "step": 1270 }, { "entropy": 6.64991979598999, "epoch": 1.0954018048990115, "grad_norm": 1.078125, "learning_rate": 0.00049926269100915, "loss": 6.4293, "mean_token_accuracy": 0.1432204395532608, "num_tokens": 2311465.0, "step": 1275 }, { "entropy": 6.792691707611084, "epoch": 1.0996991834980663, "grad_norm": 1.140625, "learning_rate": 0.0004992355517878087, "loss": 6.542, "mean_token_accuracy": 0.13071493357419967, "num_tokens": 2320281.0, "step": 1280 }, { "entropy": 6.689556837081909, "epoch": 1.1039965620971208, "grad_norm": 1.171875, "learning_rate": 0.0004992079229256484, "loss": 6.4431, "mean_token_accuracy": 0.1360026031732559, "num_tokens": 2329755.0, "step": 1285 }, { "entropy": 6.6757041931152346, "epoch": 1.1082939406961754, "grad_norm": 1.0546875, "learning_rate": 0.0004991798044829996, "loss": 6.3861, "mean_token_accuracy": 0.1369478650391102, "num_tokens": 2338807.0, "step": 1290 }, { "entropy": 6.7733612060546875, "epoch": 1.11259131929523, "grad_norm": 1.171875, "learning_rate": 0.0004991511965212618, "loss": 6.4719, "mean_token_accuracy": 0.13780709579586983, "num_tokens": 2348056.0, "step": 1295 }, { "entropy": 6.688971424102784, "epoch": 1.1168886978942845, "grad_norm": 1.1171875, "learning_rate": 0.0004991220991029032, "loss": 6.4868, "mean_token_accuracy": 0.13366840407252312, "num_tokens": 2357780.0, "step": 1300 }, { "entropy": 6.773650407791138, "epoch": 1.121186076493339, "grad_norm": 1.3046875, "learning_rate": 0.000499092512291461, "loss": 6.4446, "mean_token_accuracy": 0.13651487827301026, "num_tokens": 2367060.0, "step": 1305 }, { "entropy": 6.7718230247497555, "epoch": 1.1254834550923936, "grad_norm": 1.0703125, "learning_rate": 0.000499062436151541, "loss": 6.441, "mean_token_accuracy": 0.1382215812802315, "num_tokens": 2375751.0, "step": 1310 }, { "entropy": 6.800968360900879, "epoch": 1.129780833691448, "grad_norm": 1.1640625, "learning_rate": 0.0004990318707488173, "loss": 6.5069, "mean_token_accuracy": 0.13017478883266448, "num_tokens": 2385013.0, "step": 1315 }, { "entropy": 6.692961692810059, "epoch": 1.1340782122905029, "grad_norm": 1.1953125, "learning_rate": 0.0004990008161500327, "loss": 6.3937, "mean_token_accuracy": 0.14006393477320672, "num_tokens": 2392935.0, "step": 1320 }, { "entropy": 6.706206512451172, "epoch": 1.1383755908895574, "grad_norm": 1.2578125, "learning_rate": 0.000498969272422998, "loss": 6.4188, "mean_token_accuracy": 0.1468452200293541, "num_tokens": 2401560.0, "step": 1325 }, { "entropy": 6.711210012435913, "epoch": 1.142672969488612, "grad_norm": 1.1328125, "learning_rate": 0.0004989372396365921, "loss": 6.3447, "mean_token_accuracy": 0.1455326870083809, "num_tokens": 2410050.0, "step": 1330 }, { "entropy": 6.756243276596069, "epoch": 1.1469703480876665, "grad_norm": 1.1796875, "learning_rate": 0.0004989047178607618, "loss": 6.4505, "mean_token_accuracy": 0.13842038065195084, "num_tokens": 2418980.0, "step": 1335 }, { "entropy": 6.671654081344604, "epoch": 1.151267726686721, "grad_norm": 1.1328125, "learning_rate": 0.0004988717071665215, "loss": 6.4407, "mean_token_accuracy": 0.13684784546494483, "num_tokens": 2427992.0, "step": 1340 }, { "entropy": 6.762688112258911, "epoch": 1.1555651052857756, "grad_norm": 1.046875, "learning_rate": 0.0004988382076259537, "loss": 6.3572, "mean_token_accuracy": 0.14135119169950486, "num_tokens": 2436368.0, "step": 1345 }, { "entropy": 6.5892657279968265, "epoch": 1.1598624838848304, "grad_norm": 1.0546875, "learning_rate": 0.0004988042193122077, "loss": 6.3456, "mean_token_accuracy": 0.14492984861135483, "num_tokens": 2445499.0, "step": 1350 }, { "entropy": 6.752876138687133, "epoch": 1.164159862483885, "grad_norm": 1.2265625, "learning_rate": 0.0004987697422995005, "loss": 6.3818, "mean_token_accuracy": 0.13490121066570282, "num_tokens": 2454312.0, "step": 1355 }, { "entropy": 6.647862577438355, "epoch": 1.1684572410829395, "grad_norm": 1.109375, "learning_rate": 0.0004987347766631161, "loss": 6.4437, "mean_token_accuracy": 0.1407245770096779, "num_tokens": 2462922.0, "step": 1360 }, { "entropy": 6.755164289474488, "epoch": 1.172754619681994, "grad_norm": 1.0703125, "learning_rate": 0.0004986993224794055, "loss": 6.4781, "mean_token_accuracy": 0.13789629712700843, "num_tokens": 2472195.0, "step": 1365 }, { "entropy": 6.6456316947937015, "epoch": 1.1770519982810486, "grad_norm": 1.1953125, "learning_rate": 0.0004986633798257865, "loss": 6.3829, "mean_token_accuracy": 0.14376115351915358, "num_tokens": 2481021.0, "step": 1370 }, { "entropy": 6.657115125656128, "epoch": 1.181349376880103, "grad_norm": 1.15625, "learning_rate": 0.0004986269487807434, "loss": 6.405, "mean_token_accuracy": 0.13883866667747496, "num_tokens": 2490250.0, "step": 1375 }, { "entropy": 6.763047981262207, "epoch": 1.1856467554791577, "grad_norm": 1.0859375, "learning_rate": 0.000498590029423827, "loss": 6.4581, "mean_token_accuracy": 0.14272229447960855, "num_tokens": 2499122.0, "step": 1380 }, { "entropy": 6.686977815628052, "epoch": 1.1899441340782122, "grad_norm": 1.109375, "learning_rate": 0.0004985526218356546, "loss": 6.4227, "mean_token_accuracy": 0.13726608753204345, "num_tokens": 2508454.0, "step": 1385 }, { "entropy": 6.699887418746949, "epoch": 1.1942415126772667, "grad_norm": 1.1328125, "learning_rate": 0.0004985147260979093, "loss": 6.3632, "mean_token_accuracy": 0.1465839110314846, "num_tokens": 2517353.0, "step": 1390 }, { "entropy": 6.691904354095459, "epoch": 1.1985388912763215, "grad_norm": 1.1796875, "learning_rate": 0.0004984763422933402, "loss": 6.3821, "mean_token_accuracy": 0.14337702393531798, "num_tokens": 2526321.0, "step": 1395 }, { "entropy": 6.6859358787536625, "epoch": 1.202836269875376, "grad_norm": 1.0078125, "learning_rate": 0.0004984374705057623, "loss": 6.4144, "mean_token_accuracy": 0.14242582842707635, "num_tokens": 2535924.0, "step": 1400 }, { "entropy": 6.640392780303955, "epoch": 1.2071336484744306, "grad_norm": 1.171875, "learning_rate": 0.0004983981108200561, "loss": 6.3922, "mean_token_accuracy": 0.1401688925921917, "num_tokens": 2545606.0, "step": 1405 }, { "entropy": 6.649671459197998, "epoch": 1.2114310270734852, "grad_norm": 1.171875, "learning_rate": 0.0004983582633221672, "loss": 6.3859, "mean_token_accuracy": 0.1407300591468811, "num_tokens": 2554947.0, "step": 1410 }, { "entropy": 6.765527582168579, "epoch": 1.2157284056725397, "grad_norm": 1.0234375, "learning_rate": 0.0004983179280991068, "loss": 6.5354, "mean_token_accuracy": 0.13627680763602257, "num_tokens": 2564462.0, "step": 1415 }, { "entropy": 6.688222122192383, "epoch": 1.2200257842715942, "grad_norm": 1.1328125, "learning_rate": 0.0004982771052389508, "loss": 6.3743, "mean_token_accuracy": 0.1444454774260521, "num_tokens": 2573124.0, "step": 1420 }, { "entropy": 6.700618696212769, "epoch": 1.224323162870649, "grad_norm": 1.1484375, "learning_rate": 0.0004982357948308401, "loss": 6.4798, "mean_token_accuracy": 0.13040754944086075, "num_tokens": 2581829.0, "step": 1425 }, { "entropy": 6.7136975765228275, "epoch": 1.2286205414697036, "grad_norm": 1.1328125, "learning_rate": 0.0004981939969649799, "loss": 6.3405, "mean_token_accuracy": 0.1422662131488323, "num_tokens": 2590631.0, "step": 1430 }, { "entropy": 6.661464500427246, "epoch": 1.232917920068758, "grad_norm": 1.1796875, "learning_rate": 0.0004981517117326404, "loss": 6.4484, "mean_token_accuracy": 0.13987314701080322, "num_tokens": 2600684.0, "step": 1435 }, { "entropy": 6.6479767799377445, "epoch": 1.2372152986678127, "grad_norm": 1.0859375, "learning_rate": 0.0004981089392261553, "loss": 6.3605, "mean_token_accuracy": 0.14449947997927665, "num_tokens": 2609667.0, "step": 1440 }, { "entropy": 6.643135976791382, "epoch": 1.2415126772668672, "grad_norm": 1.0, "learning_rate": 0.000498065679538923, "loss": 6.4317, "mean_token_accuracy": 0.14703501164913177, "num_tokens": 2620025.0, "step": 1445 }, { "entropy": 6.672731685638428, "epoch": 1.2458100558659218, "grad_norm": 1.1484375, "learning_rate": 0.0004980219327654049, "loss": 6.351, "mean_token_accuracy": 0.14008775800466539, "num_tokens": 2629032.0, "step": 1450 }, { "entropy": 6.605780506134034, "epoch": 1.2501074344649763, "grad_norm": 1.15625, "learning_rate": 0.000497977699001127, "loss": 6.3357, "mean_token_accuracy": 0.1428795799612999, "num_tokens": 2638303.0, "step": 1455 }, { "entropy": 6.698618459701538, "epoch": 1.2544048130640308, "grad_norm": 1.1328125, "learning_rate": 0.0004979329783426778, "loss": 6.3527, "mean_token_accuracy": 0.14518981352448462, "num_tokens": 2647902.0, "step": 1460 }, { "entropy": 6.619544601440429, "epoch": 1.2587021916630854, "grad_norm": 1.1015625, "learning_rate": 0.0004978877708877094, "loss": 6.4046, "mean_token_accuracy": 0.1414396196603775, "num_tokens": 2657902.0, "step": 1465 }, { "entropy": 6.67303991317749, "epoch": 1.2629995702621402, "grad_norm": 1.09375, "learning_rate": 0.0004978420767349368, "loss": 6.3504, "mean_token_accuracy": 0.14340997561812402, "num_tokens": 2667082.0, "step": 1470 }, { "entropy": 6.647952270507813, "epoch": 1.2672969488611947, "grad_norm": 1.0546875, "learning_rate": 0.0004977958959841379, "loss": 6.4223, "mean_token_accuracy": 0.1364084042608738, "num_tokens": 2676855.0, "step": 1475 }, { "entropy": 6.6442427158355715, "epoch": 1.2715943274602493, "grad_norm": 1.1015625, "learning_rate": 0.000497749228736153, "loss": 6.3546, "mean_token_accuracy": 0.145116026699543, "num_tokens": 2685750.0, "step": 1480 }, { "entropy": 6.597840929031372, "epoch": 1.2758917060593038, "grad_norm": 1.1953125, "learning_rate": 0.0004977020750928845, "loss": 6.4075, "mean_token_accuracy": 0.14761355221271516, "num_tokens": 2695272.0, "step": 1485 }, { "entropy": 6.709882497787476, "epoch": 1.2801890846583583, "grad_norm": 1.0703125, "learning_rate": 0.0004976544351572973, "loss": 6.3504, "mean_token_accuracy": 0.1418570265173912, "num_tokens": 2704806.0, "step": 1490 }, { "entropy": 6.533363771438599, "epoch": 1.2844864632574131, "grad_norm": 1.09375, "learning_rate": 0.0004976063090334179, "loss": 6.4036, "mean_token_accuracy": 0.1452034071087837, "num_tokens": 2713521.0, "step": 1495 }, { "entropy": 6.7042053699493405, "epoch": 1.2887838418564677, "grad_norm": 1.171875, "learning_rate": 0.0004975576968263346, "loss": 6.3966, "mean_token_accuracy": 0.1381194919347763, "num_tokens": 2721848.0, "step": 1500 }, { "epoch": 1.2887838418564677, "eval_entropy": 6.494678375957249, "eval_loss": 6.482933044433594, "eval_mean_token_accuracy": 0.14236528785513328, "eval_num_tokens": 2721848.0, "eval_runtime": 2.0538, "eval_samples_per_second": 1728.039, "eval_steps_per_second": 216.187, "step": 1500 }, { "entropy": 6.592136430740356, "epoch": 1.2930812204555222, "grad_norm": 0.9921875, "learning_rate": 0.000497508598642197, "loss": 6.3613, "mean_token_accuracy": 0.14413030222058296, "num_tokens": 2731473.0, "step": 1505 }, { "entropy": 6.610020494461059, "epoch": 1.2973785990545768, "grad_norm": 1.09375, "learning_rate": 0.000497459014588216, "loss": 6.4326, "mean_token_accuracy": 0.141157578676939, "num_tokens": 2739867.0, "step": 1510 }, { "entropy": 6.684322929382324, "epoch": 1.3016759776536313, "grad_norm": 1.15625, "learning_rate": 0.000497408944772663, "loss": 6.3442, "mean_token_accuracy": 0.14187844544649125, "num_tokens": 2748903.0, "step": 1515 }, { "entropy": 6.512551116943359, "epoch": 1.3059733562526858, "grad_norm": 1.1015625, "learning_rate": 0.0004973583893048707, "loss": 6.3389, "mean_token_accuracy": 0.14152248129248618, "num_tokens": 2757711.0, "step": 1520 }, { "entropy": 6.74653639793396, "epoch": 1.3102707348517404, "grad_norm": 1.1328125, "learning_rate": 0.0004973073482952321, "loss": 6.358, "mean_token_accuracy": 0.140853676199913, "num_tokens": 2765633.0, "step": 1525 }, { "entropy": 6.572407674789429, "epoch": 1.314568113450795, "grad_norm": 1.3203125, "learning_rate": 0.0004972558218552004, "loss": 6.3982, "mean_token_accuracy": 0.14053191468119622, "num_tokens": 2774495.0, "step": 1530 }, { "entropy": 6.645643854141236, "epoch": 1.3188654920498495, "grad_norm": 1.1640625, "learning_rate": 0.0004972038100972885, "loss": 6.4066, "mean_token_accuracy": 0.1426756389439106, "num_tokens": 2782665.0, "step": 1535 }, { "entropy": 6.549836540222168, "epoch": 1.323162870648904, "grad_norm": 1.3671875, "learning_rate": 0.0004971513131350697, "loss": 6.356, "mean_token_accuracy": 0.13861292153596877, "num_tokens": 2791394.0, "step": 1540 }, { "entropy": 6.566079998016358, "epoch": 1.3274602492479588, "grad_norm": 1.2265625, "learning_rate": 0.0004970983310831759, "loss": 6.3437, "mean_token_accuracy": 0.1422226123511791, "num_tokens": 2800488.0, "step": 1545 }, { "entropy": 6.6656012535095215, "epoch": 1.3317576278470133, "grad_norm": 1.03125, "learning_rate": 0.0004970448640572989, "loss": 6.4644, "mean_token_accuracy": 0.14133307337760925, "num_tokens": 2810116.0, "step": 1550 }, { "entropy": 6.59561824798584, "epoch": 1.336055006446068, "grad_norm": 0.984375, "learning_rate": 0.0004969909121741895, "loss": 6.2592, "mean_token_accuracy": 0.14750397205352783, "num_tokens": 2819205.0, "step": 1555 }, { "entropy": 6.559555625915527, "epoch": 1.3403523850451224, "grad_norm": 1.140625, "learning_rate": 0.0004969364755516569, "loss": 6.3311, "mean_token_accuracy": 0.14398850798606871, "num_tokens": 2828017.0, "step": 1560 }, { "entropy": 6.688138008117676, "epoch": 1.344649763644177, "grad_norm": 1.1484375, "learning_rate": 0.0004968815543085689, "loss": 6.3815, "mean_token_accuracy": 0.145321074873209, "num_tokens": 2837125.0, "step": 1565 }, { "entropy": 6.569426822662353, "epoch": 1.3489471422432318, "grad_norm": 1.1015625, "learning_rate": 0.0004968261485648516, "loss": 6.3921, "mean_token_accuracy": 0.14212561994791031, "num_tokens": 2845438.0, "step": 1570 }, { "entropy": 6.608628225326538, "epoch": 1.3532445208422863, "grad_norm": 1.0546875, "learning_rate": 0.000496770258441489, "loss": 6.3689, "mean_token_accuracy": 0.1471138596534729, "num_tokens": 2854389.0, "step": 1575 }, { "entropy": 6.556783771514892, "epoch": 1.3575418994413408, "grad_norm": 1.0859375, "learning_rate": 0.0004967138840605228, "loss": 6.3281, "mean_token_accuracy": 0.14712274819612503, "num_tokens": 2863654.0, "step": 1580 }, { "entropy": 6.517911720275879, "epoch": 1.3618392780403954, "grad_norm": 1.1171875, "learning_rate": 0.000496657025545052, "loss": 6.2482, "mean_token_accuracy": 0.15075734853744507, "num_tokens": 2872871.0, "step": 1585 }, { "entropy": 6.5070977210998535, "epoch": 1.36613665663945, "grad_norm": 1.15625, "learning_rate": 0.000496599683019233, "loss": 6.3373, "mean_token_accuracy": 0.1449791297316551, "num_tokens": 2881140.0, "step": 1590 }, { "entropy": 6.6506085872650145, "epoch": 1.3704340352385045, "grad_norm": 1.09375, "learning_rate": 0.000496541856608279, "loss": 6.3251, "mean_token_accuracy": 0.14629032611846923, "num_tokens": 2889945.0, "step": 1595 }, { "entropy": 6.464802026748657, "epoch": 1.374731413837559, "grad_norm": 0.9921875, "learning_rate": 0.0004964835464384595, "loss": 6.254, "mean_token_accuracy": 0.14956037551164628, "num_tokens": 2898897.0, "step": 1600 }, { "entropy": 6.606829452514648, "epoch": 1.3790287924366136, "grad_norm": 1.1484375, "learning_rate": 0.000496424752637101, "loss": 6.2819, "mean_token_accuracy": 0.15412394404411317, "num_tokens": 2907717.0, "step": 1605 }, { "entropy": 6.513754224777221, "epoch": 1.3833261710356681, "grad_norm": 1.109375, "learning_rate": 0.0004963654753325853, "loss": 6.2693, "mean_token_accuracy": 0.1435668349266052, "num_tokens": 2916213.0, "step": 1610 }, { "entropy": 6.6343999862670895, "epoch": 1.387623549634723, "grad_norm": 1.03125, "learning_rate": 0.0004963057146543505, "loss": 6.4423, "mean_token_accuracy": 0.13986597284674646, "num_tokens": 2925706.0, "step": 1615 }, { "entropy": 6.570179843902588, "epoch": 1.3919209282337774, "grad_norm": 1.0546875, "learning_rate": 0.00049624547073289, "loss": 6.3511, "mean_token_accuracy": 0.13794696033000947, "num_tokens": 2934464.0, "step": 1620 }, { "entropy": 6.570999479293823, "epoch": 1.396218306832832, "grad_norm": 1.171875, "learning_rate": 0.0004961847436997526, "loss": 6.2482, "mean_token_accuracy": 0.1511821575462818, "num_tokens": 2944095.0, "step": 1625 }, { "entropy": 6.450803470611572, "epoch": 1.4005156854318865, "grad_norm": 1.1484375, "learning_rate": 0.0004961235336875416, "loss": 6.249, "mean_token_accuracy": 0.1513315513730049, "num_tokens": 2953357.0, "step": 1630 }, { "entropy": 6.5238546371459964, "epoch": 1.404813064030941, "grad_norm": 1.1484375, "learning_rate": 0.0004960618408299154, "loss": 6.4089, "mean_token_accuracy": 0.1346985176205635, "num_tokens": 2963020.0, "step": 1635 }, { "entropy": 6.61925859451294, "epoch": 1.4091104426299956, "grad_norm": 1.0859375, "learning_rate": 0.0004959996652615865, "loss": 6.2427, "mean_token_accuracy": 0.1468616619706154, "num_tokens": 2971955.0, "step": 1640 }, { "entropy": 6.584984397888183, "epoch": 1.4134078212290504, "grad_norm": 1.1015625, "learning_rate": 0.0004959370071183216, "loss": 6.3097, "mean_token_accuracy": 0.14391712918877603, "num_tokens": 2980662.0, "step": 1645 }, { "entropy": 6.6156212329864506, "epoch": 1.417705199828105, "grad_norm": 1.2734375, "learning_rate": 0.0004958738665369407, "loss": 6.439, "mean_token_accuracy": 0.12904247269034386, "num_tokens": 2990038.0, "step": 1650 }, { "entropy": 6.566392660140991, "epoch": 1.4220025784271595, "grad_norm": 1.1875, "learning_rate": 0.0004958102436553179, "loss": 6.3627, "mean_token_accuracy": 0.1401166081428528, "num_tokens": 2999835.0, "step": 1655 }, { "entropy": 6.622867441177368, "epoch": 1.426299957026214, "grad_norm": 1.0234375, "learning_rate": 0.00049574613861238, "loss": 6.3528, "mean_token_accuracy": 0.1401872843503952, "num_tokens": 3009593.0, "step": 1660 }, { "entropy": 6.564433908462524, "epoch": 1.4305973356252686, "grad_norm": 1.0546875, "learning_rate": 0.0004956815515481069, "loss": 6.3748, "mean_token_accuracy": 0.14576212018728257, "num_tokens": 3019187.0, "step": 1665 }, { "entropy": 6.528054189682007, "epoch": 1.4348947142243231, "grad_norm": 1.1171875, "learning_rate": 0.0004956164826035309, "loss": 6.2839, "mean_token_accuracy": 0.14402172416448594, "num_tokens": 3027875.0, "step": 1670 }, { "entropy": 6.481614637374878, "epoch": 1.4391920928233777, "grad_norm": 1.1484375, "learning_rate": 0.0004955509319207363, "loss": 6.3184, "mean_token_accuracy": 0.14420104324817656, "num_tokens": 3036902.0, "step": 1675 }, { "entropy": 6.46042537689209, "epoch": 1.4434894714224322, "grad_norm": 0.96875, "learning_rate": 0.0004954848996428601, "loss": 6.2969, "mean_token_accuracy": 0.1498032405972481, "num_tokens": 3046653.0, "step": 1680 }, { "entropy": 6.64046082496643, "epoch": 1.4477868500214868, "grad_norm": 1.3203125, "learning_rate": 0.00049541838591409, "loss": 6.3977, "mean_token_accuracy": 0.14052897915244103, "num_tokens": 3056273.0, "step": 1685 }, { "entropy": 6.529829502105713, "epoch": 1.4520842286205415, "grad_norm": 1.078125, "learning_rate": 0.0004953513908796657, "loss": 6.2999, "mean_token_accuracy": 0.13732842430472375, "num_tokens": 3065662.0, "step": 1690 }, { "entropy": 6.594562721252442, "epoch": 1.456381607219596, "grad_norm": 1.1953125, "learning_rate": 0.0004952839146858773, "loss": 6.3277, "mean_token_accuracy": 0.14757051467895507, "num_tokens": 3073970.0, "step": 1695 }, { "entropy": 6.531829500198365, "epoch": 1.4606789858186506, "grad_norm": 1.1875, "learning_rate": 0.0004952159574800658, "loss": 6.3209, "mean_token_accuracy": 0.14381522089242935, "num_tokens": 3082500.0, "step": 1700 }, { "entropy": 6.566446447372437, "epoch": 1.4649763644177052, "grad_norm": 1.1171875, "learning_rate": 0.0004951475194106229, "loss": 6.2777, "mean_token_accuracy": 0.14633866250514985, "num_tokens": 3091574.0, "step": 1705 }, { "entropy": 6.512380361557007, "epoch": 1.4692737430167597, "grad_norm": 1.046875, "learning_rate": 0.0004950786006269898, "loss": 6.3852, "mean_token_accuracy": 0.13938545510172845, "num_tokens": 3102402.0, "step": 1710 }, { "entropy": 6.59727463722229, "epoch": 1.4735711216158143, "grad_norm": 1.1640625, "learning_rate": 0.0004950092012796576, "loss": 6.2072, "mean_token_accuracy": 0.15373199433088303, "num_tokens": 3111347.0, "step": 1715 }, { "entropy": 6.486224889755249, "epoch": 1.477868500214869, "grad_norm": 1.1640625, "learning_rate": 0.0004949393215201666, "loss": 6.2833, "mean_token_accuracy": 0.14614666104316712, "num_tokens": 3120018.0, "step": 1720 }, { "entropy": 6.4936051845550535, "epoch": 1.4821658788139236, "grad_norm": 1.0, "learning_rate": 0.0004948689615011065, "loss": 6.3484, "mean_token_accuracy": 0.13831731379032136, "num_tokens": 3129669.0, "step": 1725 }, { "entropy": 6.6139086246490475, "epoch": 1.4864632574129781, "grad_norm": 0.98828125, "learning_rate": 0.0004947981213761154, "loss": 6.2794, "mean_token_accuracy": 0.15020036697387695, "num_tokens": 3139112.0, "step": 1730 }, { "entropy": 6.510036754608154, "epoch": 1.4907606360120327, "grad_norm": 1.09375, "learning_rate": 0.0004947268012998797, "loss": 6.2427, "mean_token_accuracy": 0.15479698032140732, "num_tokens": 3148437.0, "step": 1735 }, { "entropy": 6.490271472930909, "epoch": 1.4950580146110872, "grad_norm": 0.984375, "learning_rate": 0.000494655001428134, "loss": 6.2146, "mean_token_accuracy": 0.15289759933948516, "num_tokens": 3158165.0, "step": 1740 }, { "entropy": 6.521289396286011, "epoch": 1.4993553932101418, "grad_norm": 1.09375, "learning_rate": 0.0004945827219176604, "loss": 6.3026, "mean_token_accuracy": 0.1522263005375862, "num_tokens": 3167262.0, "step": 1745 }, { "entropy": 6.448360395431519, "epoch": 1.5036527718091963, "grad_norm": 1.03125, "learning_rate": 0.0004945099629262888, "loss": 6.2841, "mean_token_accuracy": 0.14779476150870324, "num_tokens": 3176696.0, "step": 1750 }, { "entropy": 6.6200721740722654, "epoch": 1.5079501504082509, "grad_norm": 1.109375, "learning_rate": 0.0004944367246128954, "loss": 6.3626, "mean_token_accuracy": 0.1411810874938965, "num_tokens": 3185857.0, "step": 1755 }, { "entropy": 6.497649145126343, "epoch": 1.5122475290073054, "grad_norm": 1.09375, "learning_rate": 0.0004943630071374036, "loss": 6.2129, "mean_token_accuracy": 0.15686369836330413, "num_tokens": 3194687.0, "step": 1760 }, { "entropy": 6.447890901565552, "epoch": 1.51654490760636, "grad_norm": 1.03125, "learning_rate": 0.0004942888106607828, "loss": 6.2715, "mean_token_accuracy": 0.14421172440052032, "num_tokens": 3204913.0, "step": 1765 }, { "entropy": 6.556134462356567, "epoch": 1.5208422862054147, "grad_norm": 1.0625, "learning_rate": 0.0004942141353450486, "loss": 6.2587, "mean_token_accuracy": 0.14712465703487396, "num_tokens": 3213312.0, "step": 1770 }, { "entropy": 6.4831544876098635, "epoch": 1.5251396648044693, "grad_norm": 0.9921875, "learning_rate": 0.0004941389813532619, "loss": 6.1822, "mean_token_accuracy": 0.1586100459098816, "num_tokens": 3222992.0, "step": 1775 }, { "entropy": 6.385056638717652, "epoch": 1.5294370434035238, "grad_norm": 1.0625, "learning_rate": 0.000494063348849529, "loss": 6.2213, "mean_token_accuracy": 0.15424711555242537, "num_tokens": 3232836.0, "step": 1780 }, { "entropy": 6.574507141113282, "epoch": 1.5337344220025786, "grad_norm": 0.98046875, "learning_rate": 0.0004939872379990011, "loss": 6.3769, "mean_token_accuracy": 0.14118290394544603, "num_tokens": 3243171.0, "step": 1785 }, { "entropy": 6.56547212600708, "epoch": 1.5380318006016331, "grad_norm": 1.203125, "learning_rate": 0.0004939106489678739, "loss": 6.2954, "mean_token_accuracy": 0.15190573930740356, "num_tokens": 3251995.0, "step": 1790 }, { "entropy": 6.440187692642212, "epoch": 1.5423291792006877, "grad_norm": 1.0390625, "learning_rate": 0.000493833581923387, "loss": 6.2474, "mean_token_accuracy": 0.14897289276123046, "num_tokens": 3260841.0, "step": 1795 }, { "entropy": 6.5475788593292235, "epoch": 1.5466265577997422, "grad_norm": 1.078125, "learning_rate": 0.0004937560370338244, "loss": 6.382, "mean_token_accuracy": 0.14083073958754538, "num_tokens": 3270979.0, "step": 1800 }, { "entropy": 6.536606645584106, "epoch": 1.5509239363987968, "grad_norm": 1.1015625, "learning_rate": 0.000493678014468513, "loss": 6.307, "mean_token_accuracy": 0.1528750814497471, "num_tokens": 3279848.0, "step": 1805 }, { "entropy": 6.46652889251709, "epoch": 1.5552213149978513, "grad_norm": 0.9921875, "learning_rate": 0.0004935995143978227, "loss": 6.311, "mean_token_accuracy": 0.14874453395605086, "num_tokens": 3289172.0, "step": 1810 }, { "entropy": 6.480955171585083, "epoch": 1.5595186935969059, "grad_norm": 1.1796875, "learning_rate": 0.0004935205369931664, "loss": 6.2107, "mean_token_accuracy": 0.15236888080835342, "num_tokens": 3297432.0, "step": 1815 }, { "entropy": 6.62280158996582, "epoch": 1.5638160721959604, "grad_norm": 0.96875, "learning_rate": 0.0004934410824269992, "loss": 6.2391, "mean_token_accuracy": 0.1460478588938713, "num_tokens": 3307486.0, "step": 1820 }, { "entropy": 6.396580219268799, "epoch": 1.568113450795015, "grad_norm": 1.0703125, "learning_rate": 0.0004933611508728182, "loss": 6.2234, "mean_token_accuracy": 0.15543457493185997, "num_tokens": 3316296.0, "step": 1825 }, { "entropy": 6.48117151260376, "epoch": 1.5724108293940695, "grad_norm": 1.0390625, "learning_rate": 0.000493280742505162, "loss": 6.2496, "mean_token_accuracy": 0.14565204530954362, "num_tokens": 3326080.0, "step": 1830 }, { "entropy": 6.399107646942139, "epoch": 1.576708207993124, "grad_norm": 1.1328125, "learning_rate": 0.0004931998574996102, "loss": 6.1637, "mean_token_accuracy": 0.1557439833879471, "num_tokens": 3334826.0, "step": 1835 }, { "entropy": 6.395985460281372, "epoch": 1.5810055865921788, "grad_norm": 1.1171875, "learning_rate": 0.0004931184960327832, "loss": 6.166, "mean_token_accuracy": 0.15891503393650055, "num_tokens": 3343261.0, "step": 1840 }, { "entropy": 6.439464569091797, "epoch": 1.5853029651912334, "grad_norm": 1.6953125, "learning_rate": 0.0004930366582823421, "loss": 6.2095, "mean_token_accuracy": 0.14784578159451484, "num_tokens": 3352513.0, "step": 1845 }, { "entropy": 6.446910238265991, "epoch": 1.589600343790288, "grad_norm": 1.203125, "learning_rate": 0.0004929543444269879, "loss": 6.2679, "mean_token_accuracy": 0.15295199751853944, "num_tokens": 3361577.0, "step": 1850 }, { "entropy": 6.4689103126525875, "epoch": 1.5938977223893425, "grad_norm": 1.171875, "learning_rate": 0.000492871554646461, "loss": 6.327, "mean_token_accuracy": 0.14370332658290863, "num_tokens": 3370591.0, "step": 1855 }, { "entropy": 6.443254470825195, "epoch": 1.5981951009883972, "grad_norm": 1.0859375, "learning_rate": 0.0004927882891215413, "loss": 6.2437, "mean_token_accuracy": 0.14615294709801674, "num_tokens": 3379761.0, "step": 1860 }, { "entropy": 6.549100685119629, "epoch": 1.6024924795874518, "grad_norm": 1.203125, "learning_rate": 0.0004927045480340475, "loss": 6.3212, "mean_token_accuracy": 0.1414845257997513, "num_tokens": 3388974.0, "step": 1865 }, { "entropy": 6.428477334976196, "epoch": 1.6067898581865063, "grad_norm": 1.015625, "learning_rate": 0.0004926203315668363, "loss": 6.2385, "mean_token_accuracy": 0.15081687197089194, "num_tokens": 3398339.0, "step": 1870 }, { "entropy": 6.499061059951782, "epoch": 1.6110872367855609, "grad_norm": 1.0625, "learning_rate": 0.0004925356399038032, "loss": 6.2121, "mean_token_accuracy": 0.15119217038154603, "num_tokens": 3408292.0, "step": 1875 }, { "entropy": 6.460348415374756, "epoch": 1.6153846153846154, "grad_norm": 1.09375, "learning_rate": 0.0004924504732298808, "loss": 6.1809, "mean_token_accuracy": 0.15673429295420646, "num_tokens": 3417057.0, "step": 1880 }, { "entropy": 6.498525190353393, "epoch": 1.61968199398367, "grad_norm": 1.1171875, "learning_rate": 0.0004923648317310391, "loss": 6.2886, "mean_token_accuracy": 0.15057691931724548, "num_tokens": 3425830.0, "step": 1885 }, { "entropy": 6.466361808776855, "epoch": 1.6239793725827245, "grad_norm": 1.015625, "learning_rate": 0.0004922787155942849, "loss": 6.3261, "mean_token_accuracy": 0.14087508171796798, "num_tokens": 3435513.0, "step": 1890 }, { "entropy": 6.480417251586914, "epoch": 1.628276751181779, "grad_norm": 1.046875, "learning_rate": 0.0004921921250076611, "loss": 6.2319, "mean_token_accuracy": 0.1488749422132969, "num_tokens": 3444684.0, "step": 1895 }, { "entropy": 6.398703765869141, "epoch": 1.6325741297808336, "grad_norm": 1.15625, "learning_rate": 0.0004921050601602475, "loss": 6.309, "mean_token_accuracy": 0.15032647401094437, "num_tokens": 3453454.0, "step": 1900 }, { "entropy": 6.512422227859497, "epoch": 1.6368715083798882, "grad_norm": 1.125, "learning_rate": 0.0004920175212421587, "loss": 6.2317, "mean_token_accuracy": 0.1462756022810936, "num_tokens": 3463228.0, "step": 1905 }, { "entropy": 6.298534250259399, "epoch": 1.6411688869789427, "grad_norm": 1.0546875, "learning_rate": 0.0004919295084445445, "loss": 6.1203, "mean_token_accuracy": 0.15622290521860122, "num_tokens": 3472131.0, "step": 1910 }, { "entropy": 6.46199779510498, "epoch": 1.6454662655779975, "grad_norm": 1.03125, "learning_rate": 0.0004918410219595899, "loss": 6.1947, "mean_token_accuracy": 0.15805622637271882, "num_tokens": 3480642.0, "step": 1915 }, { "entropy": 6.536061143875122, "epoch": 1.649763644177052, "grad_norm": 1.0078125, "learning_rate": 0.000491752061980514, "loss": 6.1748, "mean_token_accuracy": 0.15212914645671843, "num_tokens": 3489346.0, "step": 1920 }, { "entropy": 6.385542201995849, "epoch": 1.6540610227761066, "grad_norm": 1.125, "learning_rate": 0.0004916626287015697, "loss": 6.2236, "mean_token_accuracy": 0.1506744407117367, "num_tokens": 3498473.0, "step": 1925 }, { "entropy": 6.4339292526245115, "epoch": 1.658358401375161, "grad_norm": 1.03125, "learning_rate": 0.0004915727223180436, "loss": 6.2184, "mean_token_accuracy": 0.1503354400396347, "num_tokens": 3507415.0, "step": 1930 }, { "entropy": 6.472232723236084, "epoch": 1.6626557799742159, "grad_norm": 1.03125, "learning_rate": 0.0004914823430262554, "loss": 6.3466, "mean_token_accuracy": 0.13937689363956451, "num_tokens": 3516873.0, "step": 1935 }, { "entropy": 6.475211191177368, "epoch": 1.6669531585732704, "grad_norm": 1.1796875, "learning_rate": 0.0004913914910235573, "loss": 6.2023, "mean_token_accuracy": 0.15309734791517257, "num_tokens": 3525047.0, "step": 1940 }, { "entropy": 6.334531784057617, "epoch": 1.671250537172325, "grad_norm": 1.1015625, "learning_rate": 0.0004913001665083337, "loss": 6.2098, "mean_token_accuracy": 0.1510941930115223, "num_tokens": 3534354.0, "step": 1945 }, { "entropy": 6.499793291091919, "epoch": 1.6755479157713795, "grad_norm": 1.3203125, "learning_rate": 0.0004912083696800008, "loss": 6.2384, "mean_token_accuracy": 0.14515842348337174, "num_tokens": 3543830.0, "step": 1950 }, { "entropy": 6.334777593612671, "epoch": 1.679845294370434, "grad_norm": 1.125, "learning_rate": 0.0004911161007390063, "loss": 6.1344, "mean_token_accuracy": 0.1552545964717865, "num_tokens": 3552314.0, "step": 1955 }, { "entropy": 6.398986530303955, "epoch": 1.6841426729694886, "grad_norm": 1.203125, "learning_rate": 0.0004910233598868287, "loss": 6.2232, "mean_token_accuracy": 0.14675267040729523, "num_tokens": 3561656.0, "step": 1960 }, { "entropy": 6.426092958450317, "epoch": 1.6884400515685432, "grad_norm": 1.09375, "learning_rate": 0.0004909301473259769, "loss": 6.2232, "mean_token_accuracy": 0.14848204478621482, "num_tokens": 3571784.0, "step": 1965 }, { "entropy": 6.454012489318847, "epoch": 1.6927374301675977, "grad_norm": 1.0859375, "learning_rate": 0.0004908364632599899, "loss": 6.1775, "mean_token_accuracy": 0.15773458033800125, "num_tokens": 3580626.0, "step": 1970 }, { "entropy": 6.337477779388427, "epoch": 1.6970348087666522, "grad_norm": 1.0703125, "learning_rate": 0.0004907423078934362, "loss": 6.2001, "mean_token_accuracy": 0.14792972654104233, "num_tokens": 3589916.0, "step": 1975 }, { "entropy": 6.395978736877441, "epoch": 1.7013321873657068, "grad_norm": 1.0546875, "learning_rate": 0.0004906476814319134, "loss": 6.2045, "mean_token_accuracy": 0.15436216294765473, "num_tokens": 3599128.0, "step": 1980 }, { "entropy": 6.384798145294189, "epoch": 1.7056295659647613, "grad_norm": 0.890625, "learning_rate": 0.0004905525840820481, "loss": 6.2156, "mean_token_accuracy": 0.1487440824508667, "num_tokens": 3608764.0, "step": 1985 }, { "entropy": 6.519760847091675, "epoch": 1.709926944563816, "grad_norm": 0.984375, "learning_rate": 0.0004904570160514948, "loss": 6.2587, "mean_token_accuracy": 0.14064486026763917, "num_tokens": 3619082.0, "step": 1990 }, { "entropy": 6.396596527099609, "epoch": 1.7142243231628707, "grad_norm": 1.171875, "learning_rate": 0.0004903609775489358, "loss": 6.2232, "mean_token_accuracy": 0.14829822033643722, "num_tokens": 3628695.0, "step": 1995 }, { "entropy": 6.453386020660401, "epoch": 1.7185217017619252, "grad_norm": 1.1484375, "learning_rate": 0.0004902644687840809, "loss": 6.2106, "mean_token_accuracy": 0.14628567397594452, "num_tokens": 3637599.0, "step": 2000 }, { "epoch": 1.7185217017619252, "eval_entropy": 6.120181280213433, "eval_loss": 6.287801742553711, "eval_mean_token_accuracy": 0.15146609128931085, "eval_num_tokens": 3637599.0, "eval_runtime": 2.0623, "eval_samples_per_second": 1720.853, "eval_steps_per_second": 215.289, "step": 2000 }, { "entropy": 6.389330768585205, "epoch": 1.7228190803609797, "grad_norm": 1.1640625, "learning_rate": 0.0004901674899676667, "loss": 6.189, "mean_token_accuracy": 0.15087567865848542, "num_tokens": 3647406.0, "step": 2005 }, { "entropy": 6.32288761138916, "epoch": 1.7271164589600345, "grad_norm": 1.0546875, "learning_rate": 0.0004900700413114561, "loss": 6.0845, "mean_token_accuracy": 0.15229684859514236, "num_tokens": 3656531.0, "step": 2010 }, { "entropy": 6.2982823848724365, "epoch": 1.731413837559089, "grad_norm": 1.0078125, "learning_rate": 0.000489972123028238, "loss": 6.1711, "mean_token_accuracy": 0.14639344438910484, "num_tokens": 3664922.0, "step": 2015 }, { "entropy": 6.42927360534668, "epoch": 1.7357112161581436, "grad_norm": 1.0625, "learning_rate": 0.0004898737353318268, "loss": 6.114, "mean_token_accuracy": 0.15603691339492798, "num_tokens": 3673283.0, "step": 2020 }, { "entropy": 6.379903554916382, "epoch": 1.7400085947571982, "grad_norm": 1.1796875, "learning_rate": 0.000489774878437062, "loss": 6.2432, "mean_token_accuracy": 0.1512456476688385, "num_tokens": 3681760.0, "step": 2025 }, { "entropy": 6.362637662887574, "epoch": 1.7443059733562527, "grad_norm": 1.0859375, "learning_rate": 0.0004896755525598074, "loss": 6.0576, "mean_token_accuracy": 0.15525488257408143, "num_tokens": 3689408.0, "step": 2030 }, { "entropy": 6.350458097457886, "epoch": 1.7486033519553073, "grad_norm": 1.1484375, "learning_rate": 0.0004895757579169511, "loss": 6.1868, "mean_token_accuracy": 0.1519346058368683, "num_tokens": 3697904.0, "step": 2035 }, { "entropy": 6.549949407577515, "epoch": 1.7529007305543618, "grad_norm": 1.0234375, "learning_rate": 0.0004894754947264047, "loss": 6.2025, "mean_token_accuracy": 0.1540897861123085, "num_tokens": 3706704.0, "step": 2040 }, { "entropy": 6.33614501953125, "epoch": 1.7571981091534163, "grad_norm": 1.140625, "learning_rate": 0.000489374763207103, "loss": 6.2858, "mean_token_accuracy": 0.14759851694107057, "num_tokens": 3715690.0, "step": 2045 }, { "entropy": 6.4482136249542235, "epoch": 1.761495487752471, "grad_norm": 1.1484375, "learning_rate": 0.0004892735635790033, "loss": 6.0651, "mean_token_accuracy": 0.16219264268875122, "num_tokens": 3724835.0, "step": 2050 }, { "entropy": 6.303025627136231, "epoch": 1.7657928663515254, "grad_norm": 0.96484375, "learning_rate": 0.000489171896063085, "loss": 6.0978, "mean_token_accuracy": 0.1608540341258049, "num_tokens": 3733977.0, "step": 2055 }, { "entropy": 6.440810489654541, "epoch": 1.77009024495058, "grad_norm": 1.0859375, "learning_rate": 0.0004890697608813495, "loss": 6.2166, "mean_token_accuracy": 0.14737534075975417, "num_tokens": 3742665.0, "step": 2060 }, { "entropy": 6.50860743522644, "epoch": 1.7743876235496348, "grad_norm": 1.1171875, "learning_rate": 0.0004889671582568193, "loss": 6.2866, "mean_token_accuracy": 0.15046041160821916, "num_tokens": 3751647.0, "step": 2065 }, { "entropy": 6.323904037475586, "epoch": 1.7786850021486893, "grad_norm": 1.1640625, "learning_rate": 0.0004888640884135374, "loss": 6.1804, "mean_token_accuracy": 0.14905625879764556, "num_tokens": 3760852.0, "step": 2070 }, { "entropy": 6.3692279815673825, "epoch": 1.7829823807477438, "grad_norm": 1.2734375, "learning_rate": 0.0004887605515765671, "loss": 6.146, "mean_token_accuracy": 0.1545763321220875, "num_tokens": 3768640.0, "step": 2075 }, { "entropy": 6.432651662826538, "epoch": 1.7872797593467986, "grad_norm": 1.09375, "learning_rate": 0.0004886565479719914, "loss": 6.1701, "mean_token_accuracy": 0.1504896029829979, "num_tokens": 3776504.0, "step": 2080 }, { "entropy": 6.4639040470123295, "epoch": 1.7915771379458532, "grad_norm": 1.15625, "learning_rate": 0.0004885520778269128, "loss": 6.1968, "mean_token_accuracy": 0.15468488037586212, "num_tokens": 3786353.0, "step": 2085 }, { "entropy": 6.380429744720459, "epoch": 1.7958745165449077, "grad_norm": 1.125, "learning_rate": 0.0004884471413694523, "loss": 6.2326, "mean_token_accuracy": 0.14940588921308517, "num_tokens": 3795902.0, "step": 2090 }, { "entropy": 6.3466850280761715, "epoch": 1.8001718951439623, "grad_norm": 0.9453125, "learning_rate": 0.0004883417388287491, "loss": 6.1431, "mean_token_accuracy": 0.14718958735466003, "num_tokens": 3805986.0, "step": 2095 }, { "entropy": 6.3597740650177, "epoch": 1.8044692737430168, "grad_norm": 1.1796875, "learning_rate": 0.0004882358704349603, "loss": 6.2747, "mean_token_accuracy": 0.15220490992069244, "num_tokens": 3814915.0, "step": 2100 }, { "entropy": 6.366986703872681, "epoch": 1.8087666523420713, "grad_norm": 1.1640625, "learning_rate": 0.0004881295364192601, "loss": 6.1506, "mean_token_accuracy": 0.15957469791173934, "num_tokens": 3823966.0, "step": 2105 }, { "entropy": 6.475821685791016, "epoch": 1.813064030941126, "grad_norm": 1.03125, "learning_rate": 0.0004880227370138394, "loss": 6.212, "mean_token_accuracy": 0.14951324909925462, "num_tokens": 3832775.0, "step": 2110 }, { "entropy": 6.301672267913818, "epoch": 1.8173614095401804, "grad_norm": 0.93359375, "learning_rate": 0.0004879154724519057, "loss": 6.1316, "mean_token_accuracy": 0.15576981902122497, "num_tokens": 3842808.0, "step": 2115 }, { "entropy": 6.454287385940551, "epoch": 1.821658788139235, "grad_norm": 1.0703125, "learning_rate": 0.0004878077429676816, "loss": 6.2649, "mean_token_accuracy": 0.14898920953273773, "num_tokens": 3853303.0, "step": 2120 }, { "entropy": 6.3901642799377445, "epoch": 1.8259561667382895, "grad_norm": 1.0703125, "learning_rate": 0.0004876995487964054, "loss": 6.1853, "mean_token_accuracy": 0.14685731381177902, "num_tokens": 3862462.0, "step": 2125 }, { "entropy": 6.411391401290894, "epoch": 1.830253545337344, "grad_norm": 1.046875, "learning_rate": 0.00048759089017432996, "loss": 6.293, "mean_token_accuracy": 0.14782755076885223, "num_tokens": 3871596.0, "step": 2130 }, { "entropy": 6.432213401794433, "epoch": 1.8345509239363988, "grad_norm": 1.0390625, "learning_rate": 0.0004874817673387222, "loss": 6.1972, "mean_token_accuracy": 0.15025533735752106, "num_tokens": 3881276.0, "step": 2135 }, { "entropy": 6.357042551040649, "epoch": 1.8388483025354534, "grad_norm": 0.984375, "learning_rate": 0.00048737218052786275, "loss": 6.2863, "mean_token_accuracy": 0.14599718973040582, "num_tokens": 3891610.0, "step": 2140 }, { "entropy": 6.469332885742188, "epoch": 1.843145681134508, "grad_norm": 0.984375, "learning_rate": 0.00048726212998104554, "loss": 6.2036, "mean_token_accuracy": 0.1476315975189209, "num_tokens": 3900584.0, "step": 2145 }, { "entropy": 6.3720924854278564, "epoch": 1.8474430597335625, "grad_norm": 1.0390625, "learning_rate": 0.0004871516159385768, "loss": 6.1288, "mean_token_accuracy": 0.15351544842123985, "num_tokens": 3910208.0, "step": 2150 }, { "entropy": 6.2077491760253904, "epoch": 1.8517404383326173, "grad_norm": 1.140625, "learning_rate": 0.0004870406386417752, "loss": 6.0609, "mean_token_accuracy": 0.16224084943532943, "num_tokens": 3918424.0, "step": 2155 }, { "entropy": 6.278759956359863, "epoch": 1.8560378169316718, "grad_norm": 1.1328125, "learning_rate": 0.0004869291983329707, "loss": 5.9946, "mean_token_accuracy": 0.16720272302627565, "num_tokens": 3926206.0, "step": 2160 }, { "entropy": 6.399888753890991, "epoch": 1.8603351955307263, "grad_norm": 1.078125, "learning_rate": 0.0004868172952555044, "loss": 6.0991, "mean_token_accuracy": 0.1470145635306835, "num_tokens": 3935769.0, "step": 2165 }, { "entropy": 6.315070724487304, "epoch": 1.864632574129781, "grad_norm": 0.98046875, "learning_rate": 0.0004867049296537278, "loss": 6.0903, "mean_token_accuracy": 0.15458065569400786, "num_tokens": 3945118.0, "step": 2170 }, { "entropy": 6.344206809997559, "epoch": 1.8689299527288354, "grad_norm": 1.40625, "learning_rate": 0.0004865921017730027, "loss": 6.1791, "mean_token_accuracy": 0.15464479327201844, "num_tokens": 3954012.0, "step": 2175 }, { "entropy": 6.429828739166259, "epoch": 1.87322733132789, "grad_norm": 0.96484375, "learning_rate": 0.00048647881185969995, "loss": 6.1931, "mean_token_accuracy": 0.14908381700515747, "num_tokens": 3964239.0, "step": 2180 }, { "entropy": 6.33826584815979, "epoch": 1.8775247099269445, "grad_norm": 1.0390625, "learning_rate": 0.0004863650601611994, "loss": 6.0996, "mean_token_accuracy": 0.1615213319659233, "num_tokens": 3973694.0, "step": 2185 }, { "entropy": 6.327886295318604, "epoch": 1.881822088525999, "grad_norm": 1.078125, "learning_rate": 0.00048625084692588937, "loss": 6.1415, "mean_token_accuracy": 0.1605108693242073, "num_tokens": 3982706.0, "step": 2190 }, { "entropy": 6.265936231613159, "epoch": 1.8861194671250536, "grad_norm": 1.1171875, "learning_rate": 0.00048613617240316593, "loss": 6.0825, "mean_token_accuracy": 0.15816196352243422, "num_tokens": 3990934.0, "step": 2195 }, { "entropy": 6.391205978393555, "epoch": 1.8904168457241082, "grad_norm": 1.0859375, "learning_rate": 0.0004860210368434323, "loss": 6.1513, "mean_token_accuracy": 0.15758474171161652, "num_tokens": 3999864.0, "step": 2200 }, { "entropy": 6.334363603591919, "epoch": 1.8947142243231627, "grad_norm": 0.984375, "learning_rate": 0.00048590544049809857, "loss": 6.1514, "mean_token_accuracy": 0.15803639888763427, "num_tokens": 4008273.0, "step": 2205 }, { "entropy": 6.388755893707275, "epoch": 1.8990116029222175, "grad_norm": 1.03125, "learning_rate": 0.000485789383619581, "loss": 6.1719, "mean_token_accuracy": 0.15583823770284652, "num_tokens": 4017697.0, "step": 2210 }, { "entropy": 6.345981502532959, "epoch": 1.903308981521272, "grad_norm": 1.140625, "learning_rate": 0.0004856728664613015, "loss": 6.1881, "mean_token_accuracy": 0.14975374042987824, "num_tokens": 4026775.0, "step": 2215 }, { "entropy": 6.3457728862762455, "epoch": 1.9076063601203266, "grad_norm": 1.0859375, "learning_rate": 0.00048555588927768674, "loss": 6.1523, "mean_token_accuracy": 0.15700841099023818, "num_tokens": 4036476.0, "step": 2220 }, { "entropy": 6.4198205947875975, "epoch": 1.9119037387193811, "grad_norm": 1.1328125, "learning_rate": 0.0004854384523241683, "loss": 6.1336, "mean_token_accuracy": 0.1571663163602352, "num_tokens": 4045221.0, "step": 2225 }, { "entropy": 6.218144416809082, "epoch": 1.916201117318436, "grad_norm": 1.0390625, "learning_rate": 0.00048532055585718143, "loss": 6.0619, "mean_token_accuracy": 0.15748531818389894, "num_tokens": 4053754.0, "step": 2230 }, { "entropy": 6.358814668655396, "epoch": 1.9204984959174904, "grad_norm": 1.1015625, "learning_rate": 0.00048520220013416505, "loss": 6.103, "mean_token_accuracy": 0.1605447456240654, "num_tokens": 4061730.0, "step": 2235 }, { "entropy": 6.3438163757324215, "epoch": 1.924795874516545, "grad_norm": 1.0390625, "learning_rate": 0.0004850833854135607, "loss": 6.1491, "mean_token_accuracy": 0.15683530494570733, "num_tokens": 4070501.0, "step": 2240 }, { "entropy": 6.367244625091553, "epoch": 1.9290932531155995, "grad_norm": 0.95703125, "learning_rate": 0.0004849641119548122, "loss": 6.2334, "mean_token_accuracy": 0.14961420446634294, "num_tokens": 4079621.0, "step": 2245 }, { "entropy": 6.398000574111938, "epoch": 1.933390631714654, "grad_norm": 1.09375, "learning_rate": 0.000484844380018365, "loss": 6.2167, "mean_token_accuracy": 0.15164064317941667, "num_tokens": 4090106.0, "step": 2250 }, { "entropy": 6.375770425796508, "epoch": 1.9376880103137086, "grad_norm": 1.03125, "learning_rate": 0.000484724189865666, "loss": 6.1578, "mean_token_accuracy": 0.15246021896600723, "num_tokens": 4099269.0, "step": 2255 }, { "entropy": 6.217796373367309, "epoch": 1.9419853889127632, "grad_norm": 1.0625, "learning_rate": 0.0004846035417591624, "loss": 6.0917, "mean_token_accuracy": 0.15897612571716307, "num_tokens": 4108414.0, "step": 2260 }, { "entropy": 6.376744508743286, "epoch": 1.9462827675118177, "grad_norm": 1.109375, "learning_rate": 0.0004844824359623014, "loss": 6.2234, "mean_token_accuracy": 0.14845603406429292, "num_tokens": 4117731.0, "step": 2265 }, { "entropy": 6.413218784332275, "epoch": 1.9505801461108723, "grad_norm": 1.109375, "learning_rate": 0.00048436087273952966, "loss": 6.2001, "mean_token_accuracy": 0.14851808845996856, "num_tokens": 4127194.0, "step": 2270 }, { "entropy": 6.277564620971679, "epoch": 1.9548775247099268, "grad_norm": 1.1171875, "learning_rate": 0.00048423885235629265, "loss": 6.1488, "mean_token_accuracy": 0.15824481397867202, "num_tokens": 4135594.0, "step": 2275 }, { "entropy": 6.359572219848633, "epoch": 1.9591749033089814, "grad_norm": 1.0234375, "learning_rate": 0.0004841163750790342, "loss": 6.1804, "mean_token_accuracy": 0.15617654621601104, "num_tokens": 4145027.0, "step": 2280 }, { "entropy": 6.301140403747558, "epoch": 1.9634722819080361, "grad_norm": 1.015625, "learning_rate": 0.00048399344117519555, "loss": 6.0431, "mean_token_accuracy": 0.15682056695222854, "num_tokens": 4153754.0, "step": 2285 }, { "entropy": 6.266417598724365, "epoch": 1.9677696605070907, "grad_norm": 0.96875, "learning_rate": 0.00048387005091321544, "loss": 6.1066, "mean_token_accuracy": 0.16042741984128953, "num_tokens": 4162765.0, "step": 2290 }, { "entropy": 6.3823741436004635, "epoch": 1.9720670391061452, "grad_norm": 1.1328125, "learning_rate": 0.00048374620456252877, "loss": 6.1293, "mean_token_accuracy": 0.15764901116490365, "num_tokens": 4171589.0, "step": 2295 }, { "entropy": 6.2937760829925535, "epoch": 1.9763644177052, "grad_norm": 1.0625, "learning_rate": 0.00048362190239356644, "loss": 6.1393, "mean_token_accuracy": 0.15565742254257203, "num_tokens": 4181817.0, "step": 2300 }, { "entropy": 6.305324840545654, "epoch": 1.9806617963042545, "grad_norm": 0.9609375, "learning_rate": 0.00048349714467775474, "loss": 6.0995, "mean_token_accuracy": 0.14838732779026031, "num_tokens": 4191350.0, "step": 2305 }, { "entropy": 6.278328514099121, "epoch": 1.984959174903309, "grad_norm": 1.0625, "learning_rate": 0.00048337193168751464, "loss": 6.1486, "mean_token_accuracy": 0.15034544318914414, "num_tokens": 4199888.0, "step": 2310 }, { "entropy": 6.390921354293823, "epoch": 1.9892565535023636, "grad_norm": 1.171875, "learning_rate": 0.0004832462636962613, "loss": 6.1298, "mean_token_accuracy": 0.1492708593606949, "num_tokens": 4209509.0, "step": 2315 }, { "entropy": 6.316125965118408, "epoch": 1.9935539321014182, "grad_norm": 1.140625, "learning_rate": 0.0004831201409784034, "loss": 6.072, "mean_token_accuracy": 0.1605883792042732, "num_tokens": 4218496.0, "step": 2320 }, { "entropy": 6.255798053741455, "epoch": 1.9978513107004727, "grad_norm": 1.0078125, "learning_rate": 0.0004829935638093424, "loss": 6.1087, "mean_token_accuracy": 0.15721286535263063, "num_tokens": 4227504.0, "step": 2325 }, { "entropy": 6.355179150899251, "epoch": 2.0017189514396216, "grad_norm": 1.078125, "learning_rate": 0.0004828665324654724, "loss": 6.0277, "mean_token_accuracy": 0.15705766446060604, "num_tokens": 4235338.0, "step": 2330 }, { "entropy": 6.347105932235718, "epoch": 2.006016330038676, "grad_norm": 0.98828125, "learning_rate": 0.0004827390472241791, "loss": 5.7915, "mean_token_accuracy": 0.16444853693246841, "num_tokens": 4244905.0, "step": 2335 }, { "entropy": 6.278535604476929, "epoch": 2.010313708637731, "grad_norm": 0.95703125, "learning_rate": 0.0004826111083638392, "loss": 5.8696, "mean_token_accuracy": 0.16559881940484047, "num_tokens": 4254533.0, "step": 2340 }, { "entropy": 6.316350841522217, "epoch": 2.0146110872367857, "grad_norm": 0.984375, "learning_rate": 0.00048248271616382, "loss": 5.8463, "mean_token_accuracy": 0.16606585830450057, "num_tokens": 4264023.0, "step": 2345 }, { "entropy": 6.242398643493653, "epoch": 2.0189084658358403, "grad_norm": 1.046875, "learning_rate": 0.00048235387090447894, "loss": 5.876, "mean_token_accuracy": 0.16416406631469727, "num_tokens": 4273298.0, "step": 2350 }, { "entropy": 6.315939903259277, "epoch": 2.023205844434895, "grad_norm": 1.078125, "learning_rate": 0.00048222457286716235, "loss": 5.8197, "mean_token_accuracy": 0.16837385147809983, "num_tokens": 4283244.0, "step": 2355 }, { "entropy": 6.246707153320313, "epoch": 2.0275032230339494, "grad_norm": 1.1796875, "learning_rate": 0.00048209482233420564, "loss": 5.7698, "mean_token_accuracy": 0.17663903087377547, "num_tokens": 4291677.0, "step": 2360 }, { "entropy": 6.277777862548828, "epoch": 2.031800601633004, "grad_norm": 1.0625, "learning_rate": 0.000481964619588932, "loss": 5.8196, "mean_token_accuracy": 0.170748533308506, "num_tokens": 4300822.0, "step": 2365 }, { "entropy": 6.263119220733643, "epoch": 2.0360979802320585, "grad_norm": 1.0625, "learning_rate": 0.0004818339649156523, "loss": 5.8415, "mean_token_accuracy": 0.17079205960035324, "num_tokens": 4310149.0, "step": 2370 }, { "entropy": 6.12338604927063, "epoch": 2.040395358831113, "grad_norm": 1.03125, "learning_rate": 0.00048170285859966395, "loss": 5.7423, "mean_token_accuracy": 0.17794516831636428, "num_tokens": 4319109.0, "step": 2375 }, { "entropy": 6.281350469589233, "epoch": 2.0446927374301676, "grad_norm": 0.99609375, "learning_rate": 0.00048157130092725087, "loss": 5.7302, "mean_token_accuracy": 0.17352935224771499, "num_tokens": 4327921.0, "step": 2380 }, { "entropy": 6.251600027084351, "epoch": 2.048990116029222, "grad_norm": 1.046875, "learning_rate": 0.0004814392921856824, "loss": 5.8821, "mean_token_accuracy": 0.17077834606170655, "num_tokens": 4338026.0, "step": 2385 }, { "entropy": 6.208239459991455, "epoch": 2.0532874946282766, "grad_norm": 0.984375, "learning_rate": 0.0004813068326632128, "loss": 5.7272, "mean_token_accuracy": 0.17698098421096803, "num_tokens": 4347794.0, "step": 2390 }, { "entropy": 6.288045644760132, "epoch": 2.057584873227331, "grad_norm": 1.078125, "learning_rate": 0.0004811739226490809, "loss": 5.917, "mean_token_accuracy": 0.16798364371061325, "num_tokens": 4357249.0, "step": 2395 }, { "entropy": 6.22069525718689, "epoch": 2.0618822518263857, "grad_norm": 1.0546875, "learning_rate": 0.00048104056243350896, "loss": 5.8434, "mean_token_accuracy": 0.16641683727502823, "num_tokens": 4366053.0, "step": 2400 }, { "entropy": 6.237291955947876, "epoch": 2.0661796304254403, "grad_norm": 1.0234375, "learning_rate": 0.0004809067523077023, "loss": 5.8614, "mean_token_accuracy": 0.1700182244181633, "num_tokens": 4375543.0, "step": 2405 }, { "entropy": 6.230785751342774, "epoch": 2.0704770090244953, "grad_norm": 1.1015625, "learning_rate": 0.00048077249256384884, "loss": 5.7564, "mean_token_accuracy": 0.17438603639602662, "num_tokens": 4384332.0, "step": 2410 }, { "entropy": 6.19397988319397, "epoch": 2.07477438762355, "grad_norm": 1.2109375, "learning_rate": 0.0004806377834951182, "loss": 5.8466, "mean_token_accuracy": 0.1659790098667145, "num_tokens": 4393670.0, "step": 2415 }, { "entropy": 6.280642461776734, "epoch": 2.0790717662226044, "grad_norm": 1.125, "learning_rate": 0.00048050262539566104, "loss": 5.8543, "mean_token_accuracy": 0.17281297594308853, "num_tokens": 4402763.0, "step": 2420 }, { "entropy": 6.234371662139893, "epoch": 2.083369144821659, "grad_norm": 1.0234375, "learning_rate": 0.0004803670185606087, "loss": 5.766, "mean_token_accuracy": 0.17708782404661177, "num_tokens": 4411863.0, "step": 2425 }, { "entropy": 6.2064672946929935, "epoch": 2.0876665234207135, "grad_norm": 1.078125, "learning_rate": 0.0004802309632860724, "loss": 5.8476, "mean_token_accuracy": 0.17201682031154633, "num_tokens": 4421110.0, "step": 2430 }, { "entropy": 6.299275922775268, "epoch": 2.091963902019768, "grad_norm": 1.0625, "learning_rate": 0.00048009445986914236, "loss": 5.8416, "mean_token_accuracy": 0.165672005712986, "num_tokens": 4430249.0, "step": 2435 }, { "entropy": 6.197122812271118, "epoch": 2.0962612806188226, "grad_norm": 1.0703125, "learning_rate": 0.00047995750860788756, "loss": 5.8269, "mean_token_accuracy": 0.16112401485443115, "num_tokens": 4439686.0, "step": 2440 }, { "entropy": 6.239287519454956, "epoch": 2.100558659217877, "grad_norm": 1.1875, "learning_rate": 0.0004798201098013547, "loss": 5.8073, "mean_token_accuracy": 0.17286145985126494, "num_tokens": 4448645.0, "step": 2445 }, { "entropy": 6.17987093925476, "epoch": 2.1048560378169316, "grad_norm": 1.0078125, "learning_rate": 0.00047968226374956797, "loss": 5.785, "mean_token_accuracy": 0.1676485523581505, "num_tokens": 4456870.0, "step": 2450 }, { "entropy": 6.148839092254638, "epoch": 2.109153416415986, "grad_norm": 1.0390625, "learning_rate": 0.00047954397075352794, "loss": 5.804, "mean_token_accuracy": 0.1790194794535637, "num_tokens": 4466287.0, "step": 2455 }, { "entropy": 6.161268472671509, "epoch": 2.1134507950150407, "grad_norm": 1.09375, "learning_rate": 0.00047940523111521136, "loss": 5.7069, "mean_token_accuracy": 0.17733812779188157, "num_tokens": 4474461.0, "step": 2460 }, { "entropy": 6.201317834854126, "epoch": 2.1177481736140953, "grad_norm": 1.21875, "learning_rate": 0.0004792660451375701, "loss": 5.7722, "mean_token_accuracy": 0.17279447317123414, "num_tokens": 4483002.0, "step": 2465 }, { "entropy": 6.191226196289063, "epoch": 2.12204555221315, "grad_norm": 1.109375, "learning_rate": 0.00047912641312453064, "loss": 5.7874, "mean_token_accuracy": 0.1739989399909973, "num_tokens": 4492061.0, "step": 2470 }, { "entropy": 6.240914964675904, "epoch": 2.1263429308122044, "grad_norm": 0.96484375, "learning_rate": 0.00047898633538099363, "loss": 5.8403, "mean_token_accuracy": 0.16375242471694945, "num_tokens": 4501829.0, "step": 2475 }, { "entropy": 6.218794155120849, "epoch": 2.130640309411259, "grad_norm": 1.015625, "learning_rate": 0.0004788458122128327, "loss": 5.8683, "mean_token_accuracy": 0.16395576894283295, "num_tokens": 4511539.0, "step": 2480 }, { "entropy": 6.197416687011719, "epoch": 2.134937688010314, "grad_norm": 1.09375, "learning_rate": 0.00047870484392689434, "loss": 5.7256, "mean_token_accuracy": 0.1732256680727005, "num_tokens": 4520425.0, "step": 2485 }, { "entropy": 6.153204393386841, "epoch": 2.1392350666093685, "grad_norm": 1.0859375, "learning_rate": 0.000478563430830997, "loss": 5.8237, "mean_token_accuracy": 0.1660924568772316, "num_tokens": 4529474.0, "step": 2490 }, { "entropy": 6.238351488113404, "epoch": 2.143532445208423, "grad_norm": 1.0859375, "learning_rate": 0.00047842157323393035, "loss": 5.7621, "mean_token_accuracy": 0.17213856130838395, "num_tokens": 4538082.0, "step": 2495 }, { "entropy": 6.185346221923828, "epoch": 2.1478298238074776, "grad_norm": 1.046875, "learning_rate": 0.0004782792714454547, "loss": 5.9584, "mean_token_accuracy": 0.1624767854809761, "num_tokens": 4547340.0, "step": 2500 }, { "epoch": 2.1478298238074776, "eval_entropy": 5.975759233440365, "eval_loss": 6.17199182510376, "eval_mean_token_accuracy": 0.15897784858673542, "eval_num_tokens": 4547340.0, "eval_runtime": 2.0476, "eval_samples_per_second": 1733.228, "eval_steps_per_second": 216.837, "step": 2500 }, { "entropy": 6.180462265014649, "epoch": 2.152127202406532, "grad_norm": 1.15625, "learning_rate": 0.0004781365257763002, "loss": 5.787, "mean_token_accuracy": 0.17266686409711837, "num_tokens": 4556415.0, "step": 2505 }, { "entropy": 6.12634973526001, "epoch": 2.1564245810055866, "grad_norm": 1.359375, "learning_rate": 0.00047799333653816633, "loss": 5.6887, "mean_token_accuracy": 0.18215310871601104, "num_tokens": 4565156.0, "step": 2510 }, { "entropy": 6.176189327239991, "epoch": 2.160721959604641, "grad_norm": 1.0, "learning_rate": 0.00047784970404372124, "loss": 5.7844, "mean_token_accuracy": 0.1708789974451065, "num_tokens": 4574678.0, "step": 2515 }, { "entropy": 6.102172183990478, "epoch": 2.1650193382036957, "grad_norm": 1.1328125, "learning_rate": 0.00047770562860660083, "loss": 5.811, "mean_token_accuracy": 0.16656892001628876, "num_tokens": 4583253.0, "step": 2520 }, { "entropy": 6.217717409133911, "epoch": 2.1693167168027503, "grad_norm": 0.9609375, "learning_rate": 0.0004775611105414083, "loss": 5.88, "mean_token_accuracy": 0.16319236159324646, "num_tokens": 4594042.0, "step": 2525 }, { "entropy": 6.179096984863281, "epoch": 2.173614095401805, "grad_norm": 0.99609375, "learning_rate": 0.0004774161501637133, "loss": 5.8208, "mean_token_accuracy": 0.16849268227815628, "num_tokens": 4603128.0, "step": 2530 }, { "entropy": 6.120770788192749, "epoch": 2.1779114740008594, "grad_norm": 1.25, "learning_rate": 0.0004772707477900514, "loss": 5.805, "mean_token_accuracy": 0.17015553265810013, "num_tokens": 4611537.0, "step": 2535 }, { "entropy": 6.254914093017578, "epoch": 2.182208852599914, "grad_norm": 1.1171875, "learning_rate": 0.0004771249037379232, "loss": 5.8984, "mean_token_accuracy": 0.16680168211460114, "num_tokens": 4622481.0, "step": 2540 }, { "entropy": 6.1392961025238035, "epoch": 2.1865062311989685, "grad_norm": 1.109375, "learning_rate": 0.0004769786183257939, "loss": 5.7985, "mean_token_accuracy": 0.17476972788572312, "num_tokens": 4631259.0, "step": 2545 }, { "entropy": 6.153097438812256, "epoch": 2.190803609798023, "grad_norm": 1.1484375, "learning_rate": 0.0004768318918730924, "loss": 5.7586, "mean_token_accuracy": 0.17578112632036208, "num_tokens": 4640266.0, "step": 2550 }, { "entropy": 6.1481832504272464, "epoch": 2.195100988397078, "grad_norm": 1.0625, "learning_rate": 0.00047668472470021044, "loss": 5.8113, "mean_token_accuracy": 0.16784311085939407, "num_tokens": 4649520.0, "step": 2555 }, { "entropy": 6.236191320419311, "epoch": 2.1993983669961326, "grad_norm": 1.0625, "learning_rate": 0.0004765371171285025, "loss": 5.7573, "mean_token_accuracy": 0.17908186167478563, "num_tokens": 4658501.0, "step": 2560 }, { "entropy": 6.072221088409424, "epoch": 2.203695745595187, "grad_norm": 1.078125, "learning_rate": 0.00047638906948028445, "loss": 5.8192, "mean_token_accuracy": 0.16885218769311905, "num_tokens": 4667567.0, "step": 2565 }, { "entropy": 6.158999061584472, "epoch": 2.2079931241942417, "grad_norm": 1.1640625, "learning_rate": 0.00047624058207883317, "loss": 5.8154, "mean_token_accuracy": 0.16924346387386321, "num_tokens": 4676618.0, "step": 2570 }, { "entropy": 6.275905895233154, "epoch": 2.212290502793296, "grad_norm": 1.03125, "learning_rate": 0.00047609165524838576, "loss": 5.8748, "mean_token_accuracy": 0.16927455067634584, "num_tokens": 4685967.0, "step": 2575 }, { "entropy": 6.100191926956176, "epoch": 2.2165878813923507, "grad_norm": 1.265625, "learning_rate": 0.0004759422893141389, "loss": 5.766, "mean_token_accuracy": 0.17206043750047684, "num_tokens": 4694568.0, "step": 2580 }, { "entropy": 6.1515192031860355, "epoch": 2.2208852599914053, "grad_norm": 1.09375, "learning_rate": 0.0004757924846022482, "loss": 5.826, "mean_token_accuracy": 0.17090158760547638, "num_tokens": 4703648.0, "step": 2585 }, { "entropy": 6.158021259307861, "epoch": 2.22518263859046, "grad_norm": 1.1796875, "learning_rate": 0.00047564224143982714, "loss": 5.6863, "mean_token_accuracy": 0.1804699569940567, "num_tokens": 4712444.0, "step": 2590 }, { "entropy": 6.184865522384643, "epoch": 2.2294800171895144, "grad_norm": 1.171875, "learning_rate": 0.00047549156015494676, "loss": 5.8404, "mean_token_accuracy": 0.1678527906537056, "num_tokens": 4722034.0, "step": 2595 }, { "entropy": 6.146687793731689, "epoch": 2.233777395788569, "grad_norm": 1.1484375, "learning_rate": 0.00047534044107663484, "loss": 5.8616, "mean_token_accuracy": 0.1660257250070572, "num_tokens": 4731344.0, "step": 2600 }, { "entropy": 6.193604421615601, "epoch": 2.2380747743876235, "grad_norm": 1.1796875, "learning_rate": 0.00047518888453487496, "loss": 5.7742, "mean_token_accuracy": 0.18199435770511627, "num_tokens": 4739302.0, "step": 2605 }, { "entropy": 6.141710472106934, "epoch": 2.242372152986678, "grad_norm": 0.99609375, "learning_rate": 0.0004750368908606061, "loss": 5.8788, "mean_token_accuracy": 0.16633899062871932, "num_tokens": 4748848.0, "step": 2610 }, { "entropy": 6.243501758575439, "epoch": 2.2466695315857326, "grad_norm": 1.015625, "learning_rate": 0.00047488446038572164, "loss": 5.9385, "mean_token_accuracy": 0.16254912167787552, "num_tokens": 4758194.0, "step": 2615 }, { "entropy": 6.16309700012207, "epoch": 2.250966910184787, "grad_norm": 1.1640625, "learning_rate": 0.0004747315934430688, "loss": 5.8537, "mean_token_accuracy": 0.16617514342069625, "num_tokens": 4768081.0, "step": 2620 }, { "entropy": 6.1246990203857425, "epoch": 2.2552642887838417, "grad_norm": 1.1328125, "learning_rate": 0.000474578290366448, "loss": 5.7741, "mean_token_accuracy": 0.1719846934080124, "num_tokens": 4776471.0, "step": 2625 }, { "entropy": 6.14447546005249, "epoch": 2.259561667382896, "grad_norm": 1.1328125, "learning_rate": 0.0004744245514906117, "loss": 5.7832, "mean_token_accuracy": 0.17195819914340973, "num_tokens": 4784403.0, "step": 2630 }, { "entropy": 6.066154432296753, "epoch": 2.263859045981951, "grad_norm": 1.125, "learning_rate": 0.00047427037715126426, "loss": 5.7561, "mean_token_accuracy": 0.17391669005155563, "num_tokens": 4792779.0, "step": 2635 }, { "entropy": 6.112756013870239, "epoch": 2.2681564245810057, "grad_norm": 1.0390625, "learning_rate": 0.0004741157676850608, "loss": 5.7404, "mean_token_accuracy": 0.17708691954612732, "num_tokens": 4801426.0, "step": 2640 }, { "entropy": 6.151607942581177, "epoch": 2.2724538031800603, "grad_norm": 1.2890625, "learning_rate": 0.00047396072342960663, "loss": 5.7871, "mean_token_accuracy": 0.16531864404678345, "num_tokens": 4810329.0, "step": 2645 }, { "entropy": 6.167152023315429, "epoch": 2.276751181779115, "grad_norm": 1.0546875, "learning_rate": 0.00047380524472345645, "loss": 5.8465, "mean_token_accuracy": 0.1671397715806961, "num_tokens": 4819544.0, "step": 2650 }, { "entropy": 6.143870735168457, "epoch": 2.2810485603781694, "grad_norm": 1.1171875, "learning_rate": 0.0004736493319061134, "loss": 5.8444, "mean_token_accuracy": 0.16567971110343932, "num_tokens": 4828113.0, "step": 2655 }, { "entropy": 6.107415771484375, "epoch": 2.285345938977224, "grad_norm": 1.0, "learning_rate": 0.0004734929853180291, "loss": 5.8272, "mean_token_accuracy": 0.1672320321202278, "num_tokens": 4836989.0, "step": 2660 }, { "entropy": 6.207330274581909, "epoch": 2.2896433175762785, "grad_norm": 0.96875, "learning_rate": 0.00047333620530060175, "loss": 5.8623, "mean_token_accuracy": 0.16721852421760558, "num_tokens": 4847103.0, "step": 2665 }, { "entropy": 6.162304496765136, "epoch": 2.293940696175333, "grad_norm": 1.125, "learning_rate": 0.0004731789921961764, "loss": 5.8736, "mean_token_accuracy": 0.17004917562007904, "num_tokens": 4856238.0, "step": 2670 }, { "entropy": 6.193357849121094, "epoch": 2.2982380747743876, "grad_norm": 1.1484375, "learning_rate": 0.0004730213463480434, "loss": 5.7792, "mean_token_accuracy": 0.17748985737562178, "num_tokens": 4864608.0, "step": 2675 }, { "entropy": 6.143442440032959, "epoch": 2.302535453373442, "grad_norm": 1.0546875, "learning_rate": 0.00047286326810043857, "loss": 5.7374, "mean_token_accuracy": 0.17498592138290406, "num_tokens": 4873889.0, "step": 2680 }, { "entropy": 6.103658771514892, "epoch": 2.3068328319724967, "grad_norm": 1.15625, "learning_rate": 0.00047270475779854137, "loss": 5.7804, "mean_token_accuracy": 0.1758432075381279, "num_tokens": 4882902.0, "step": 2685 }, { "entropy": 6.238151502609253, "epoch": 2.311130210571551, "grad_norm": 1.125, "learning_rate": 0.00047254581578847507, "loss": 5.7985, "mean_token_accuracy": 0.16956950426101686, "num_tokens": 4892390.0, "step": 2690 }, { "entropy": 6.069392299652099, "epoch": 2.3154275891706058, "grad_norm": 1.109375, "learning_rate": 0.0004723864424173055, "loss": 5.9313, "mean_token_accuracy": 0.16988434195518493, "num_tokens": 4901625.0, "step": 2695 }, { "entropy": 6.1605274200439455, "epoch": 2.3197249677696608, "grad_norm": 1.078125, "learning_rate": 0.0004722266380330403, "loss": 5.7187, "mean_token_accuracy": 0.18315426409244537, "num_tokens": 4910804.0, "step": 2700 }, { "entropy": 6.102558517456055, "epoch": 2.3240223463687153, "grad_norm": 1.1015625, "learning_rate": 0.00047206640298462857, "loss": 5.8022, "mean_token_accuracy": 0.1743517056107521, "num_tokens": 4920441.0, "step": 2705 }, { "entropy": 6.1133277893066404, "epoch": 2.32831972496777, "grad_norm": 1.1171875, "learning_rate": 0.00047190573762195945, "loss": 5.8433, "mean_token_accuracy": 0.17087312042713165, "num_tokens": 4930204.0, "step": 2710 }, { "entropy": 6.14404034614563, "epoch": 2.3326171035668244, "grad_norm": 0.94140625, "learning_rate": 0.00047174464229586186, "loss": 5.9433, "mean_token_accuracy": 0.16308000683784485, "num_tokens": 4941191.0, "step": 2715 }, { "entropy": 6.279735326766968, "epoch": 2.336914482165879, "grad_norm": 1.265625, "learning_rate": 0.0004715831173581036, "loss": 5.9209, "mean_token_accuracy": 0.16662475615739822, "num_tokens": 4951825.0, "step": 2720 }, { "entropy": 6.089169502258301, "epoch": 2.3412118607649335, "grad_norm": 0.96484375, "learning_rate": 0.00047142116316139073, "loss": 5.8464, "mean_token_accuracy": 0.1715902417898178, "num_tokens": 4960632.0, "step": 2725 }, { "entropy": 6.170309162139892, "epoch": 2.345509239363988, "grad_norm": 0.9765625, "learning_rate": 0.0004712587800593663, "loss": 5.8798, "mean_token_accuracy": 0.16993365585803985, "num_tokens": 4969455.0, "step": 2730 }, { "entropy": 6.12351393699646, "epoch": 2.3498066179630426, "grad_norm": 1.2890625, "learning_rate": 0.0004710959684066102, "loss": 5.7859, "mean_token_accuracy": 0.1763747364282608, "num_tokens": 4978997.0, "step": 2735 }, { "entropy": 6.158613014221191, "epoch": 2.354103996562097, "grad_norm": 1.0546875, "learning_rate": 0.00047093272855863803, "loss": 5.8433, "mean_token_accuracy": 0.17122582197189332, "num_tokens": 4988305.0, "step": 2740 }, { "entropy": 6.098289728164673, "epoch": 2.3584013751611517, "grad_norm": 1.0546875, "learning_rate": 0.0004707690608719003, "loss": 5.7826, "mean_token_accuracy": 0.1741472825407982, "num_tokens": 4997022.0, "step": 2745 }, { "entropy": 6.148240280151367, "epoch": 2.362698753760206, "grad_norm": 1.1484375, "learning_rate": 0.0004706049657037818, "loss": 5.8468, "mean_token_accuracy": 0.1676038146018982, "num_tokens": 5005664.0, "step": 2750 }, { "entropy": 6.128693151473999, "epoch": 2.3669961323592608, "grad_norm": 1.03125, "learning_rate": 0.0004704404434126009, "loss": 5.7993, "mean_token_accuracy": 0.1630128264427185, "num_tokens": 5014769.0, "step": 2755 }, { "entropy": 6.173029994964599, "epoch": 2.3712935109583153, "grad_norm": 1.015625, "learning_rate": 0.00047027549435760843, "loss": 5.869, "mean_token_accuracy": 0.16782066822052003, "num_tokens": 5024060.0, "step": 2760 }, { "entropy": 6.2035542011260985, "epoch": 2.37559088955737, "grad_norm": 1.140625, "learning_rate": 0.0004701101188989872, "loss": 5.9029, "mean_token_accuracy": 0.16563379466533662, "num_tokens": 5033046.0, "step": 2765 }, { "entropy": 6.131078624725342, "epoch": 2.3798882681564244, "grad_norm": 1.2421875, "learning_rate": 0.00046994431739785114, "loss": 5.7511, "mean_token_accuracy": 0.18578273057937622, "num_tokens": 5040894.0, "step": 2770 }, { "entropy": 6.131838798522949, "epoch": 2.384185646755479, "grad_norm": 1.046875, "learning_rate": 0.00046977809021624454, "loss": 5.9175, "mean_token_accuracy": 0.16830566823482512, "num_tokens": 5050961.0, "step": 2775 }, { "entropy": 6.217153167724609, "epoch": 2.3884830253545335, "grad_norm": 1.140625, "learning_rate": 0.0004696114377171409, "loss": 5.8447, "mean_token_accuracy": 0.166962693631649, "num_tokens": 5060226.0, "step": 2780 }, { "entropy": 6.121598243713379, "epoch": 2.3927804039535885, "grad_norm": 1.1015625, "learning_rate": 0.0004694443602644429, "loss": 5.8063, "mean_token_accuracy": 0.16657978445291519, "num_tokens": 5069225.0, "step": 2785 }, { "entropy": 6.101100969314575, "epoch": 2.397077782552643, "grad_norm": 1.0703125, "learning_rate": 0.0004692768582229808, "loss": 5.7858, "mean_token_accuracy": 0.17338948845863342, "num_tokens": 5078386.0, "step": 2790 }, { "entropy": 6.113597059249878, "epoch": 2.4013751611516976, "grad_norm": 0.98046875, "learning_rate": 0.00046910893195851213, "loss": 5.7198, "mean_token_accuracy": 0.1726340189576149, "num_tokens": 5087161.0, "step": 2795 }, { "entropy": 6.0919578075408936, "epoch": 2.405672539750752, "grad_norm": 1.0546875, "learning_rate": 0.00046894058183772074, "loss": 5.892, "mean_token_accuracy": 0.16800693273544312, "num_tokens": 5096613.0, "step": 2800 }, { "entropy": 6.136939573287964, "epoch": 2.4099699183498067, "grad_norm": 1.1484375, "learning_rate": 0.000468771808228216, "loss": 5.8324, "mean_token_accuracy": 0.16829856783151625, "num_tokens": 5106534.0, "step": 2805 }, { "entropy": 6.107866430282593, "epoch": 2.414267296948861, "grad_norm": 1.1171875, "learning_rate": 0.00046860261149853197, "loss": 5.8647, "mean_token_accuracy": 0.16899872124195098, "num_tokens": 5115975.0, "step": 2810 }, { "entropy": 6.077492666244507, "epoch": 2.4185646755479158, "grad_norm": 1.1640625, "learning_rate": 0.0004684329920181268, "loss": 5.751, "mean_token_accuracy": 0.1743526503443718, "num_tokens": 5124635.0, "step": 2815 }, { "entropy": 6.131529140472412, "epoch": 2.4228620541469703, "grad_norm": 1.1875, "learning_rate": 0.00046826295015738154, "loss": 5.7277, "mean_token_accuracy": 0.18355693519115449, "num_tokens": 5133226.0, "step": 2820 }, { "entropy": 6.0389073371887205, "epoch": 2.427159432746025, "grad_norm": 1.0390625, "learning_rate": 0.0004680924862875996, "loss": 5.8249, "mean_token_accuracy": 0.17242621779441833, "num_tokens": 5142257.0, "step": 2825 }, { "entropy": 6.1347551345825195, "epoch": 2.4314568113450794, "grad_norm": 1.015625, "learning_rate": 0.00046792160078100605, "loss": 5.803, "mean_token_accuracy": 0.1744050070643425, "num_tokens": 5150752.0, "step": 2830 }, { "entropy": 6.121770191192627, "epoch": 2.435754189944134, "grad_norm": 1.0546875, "learning_rate": 0.00046775029401074653, "loss": 5.7301, "mean_token_accuracy": 0.18050158619880677, "num_tokens": 5160237.0, "step": 2835 }, { "entropy": 6.1292938709259035, "epoch": 2.4400515685431885, "grad_norm": 1.125, "learning_rate": 0.00046757856635088645, "loss": 5.8006, "mean_token_accuracy": 0.1770539328455925, "num_tokens": 5169752.0, "step": 2840 }, { "entropy": 6.102383279800415, "epoch": 2.444348947142243, "grad_norm": 1.0234375, "learning_rate": 0.0004674064181764105, "loss": 5.8474, "mean_token_accuracy": 0.17131757587194443, "num_tokens": 5178892.0, "step": 2845 }, { "entropy": 6.149413537979126, "epoch": 2.448646325741298, "grad_norm": 0.9765625, "learning_rate": 0.00046723384986322147, "loss": 5.8235, "mean_token_accuracy": 0.1708175078034401, "num_tokens": 5188468.0, "step": 2850 }, { "entropy": 6.077389192581177, "epoch": 2.4529437043403526, "grad_norm": 1.0625, "learning_rate": 0.0004670608617881395, "loss": 5.7566, "mean_token_accuracy": 0.17592367827892302, "num_tokens": 5197565.0, "step": 2855 }, { "entropy": 6.030880689620972, "epoch": 2.457241082939407, "grad_norm": 1.1171875, "learning_rate": 0.0004668874543289014, "loss": 5.7533, "mean_token_accuracy": 0.17984721809625626, "num_tokens": 5205791.0, "step": 2860 }, { "entropy": 6.102017164230347, "epoch": 2.4615384615384617, "grad_norm": 1.078125, "learning_rate": 0.00046671362786415986, "loss": 5.7546, "mean_token_accuracy": 0.18463018238544465, "num_tokens": 5214773.0, "step": 2865 }, { "entropy": 6.001052331924439, "epoch": 2.465835840137516, "grad_norm": 1.015625, "learning_rate": 0.00046653938277348237, "loss": 5.7784, "mean_token_accuracy": 0.178888800740242, "num_tokens": 5223734.0, "step": 2870 }, { "entropy": 6.224011754989624, "epoch": 2.4701332187365708, "grad_norm": 1.203125, "learning_rate": 0.0004663647194373505, "loss": 5.8493, "mean_token_accuracy": 0.1654559224843979, "num_tokens": 5231742.0, "step": 2875 }, { "entropy": 6.084821510314941, "epoch": 2.4744305973356253, "grad_norm": 1.046875, "learning_rate": 0.00046618963823715913, "loss": 5.8114, "mean_token_accuracy": 0.1739817038178444, "num_tokens": 5241673.0, "step": 2880 }, { "entropy": 6.109079217910766, "epoch": 2.47872797593468, "grad_norm": 1.15625, "learning_rate": 0.00046601413955521575, "loss": 5.7746, "mean_token_accuracy": 0.1694352611899376, "num_tokens": 5250082.0, "step": 2885 }, { "entropy": 6.092094850540161, "epoch": 2.4830253545337344, "grad_norm": 1.1640625, "learning_rate": 0.0004658382237747393, "loss": 5.8386, "mean_token_accuracy": 0.1698906749486923, "num_tokens": 5259680.0, "step": 2890 }, { "entropy": 6.122028970718384, "epoch": 2.487322733132789, "grad_norm": 0.98046875, "learning_rate": 0.00046566189127985946, "loss": 5.8246, "mean_token_accuracy": 0.17455327808856963, "num_tokens": 5269561.0, "step": 2895 }, { "entropy": 6.137590980529785, "epoch": 2.4916201117318435, "grad_norm": 0.98828125, "learning_rate": 0.000465485142455616, "loss": 5.7746, "mean_token_accuracy": 0.18081652075052262, "num_tokens": 5278659.0, "step": 2900 }, { "entropy": 6.009606838226318, "epoch": 2.495917490330898, "grad_norm": 1.046875, "learning_rate": 0.00046530797768795765, "loss": 5.7732, "mean_token_accuracy": 0.180580236017704, "num_tokens": 5287619.0, "step": 2905 }, { "entropy": 6.08758659362793, "epoch": 2.5002148689299526, "grad_norm": 1.03125, "learning_rate": 0.00046513039736374153, "loss": 5.8748, "mean_token_accuracy": 0.16648129373788834, "num_tokens": 5297334.0, "step": 2910 }, { "entropy": 6.145037269592285, "epoch": 2.504512247529007, "grad_norm": 1.1484375, "learning_rate": 0.0004649524018707319, "loss": 5.8028, "mean_token_accuracy": 0.17371988743543626, "num_tokens": 5306208.0, "step": 2915 }, { "entropy": 6.061826229095459, "epoch": 2.5088096261280617, "grad_norm": 1.1875, "learning_rate": 0.00046477399159759996, "loss": 5.7262, "mean_token_accuracy": 0.17741942554712295, "num_tokens": 5314754.0, "step": 2920 }, { "entropy": 5.948053169250488, "epoch": 2.5131070047271162, "grad_norm": 1.21875, "learning_rate": 0.00046459516693392246, "loss": 5.751, "mean_token_accuracy": 0.17220668345689774, "num_tokens": 5324000.0, "step": 2925 }, { "entropy": 6.165147161483764, "epoch": 2.517404383326171, "grad_norm": 1.09375, "learning_rate": 0.0004644159282701808, "loss": 5.7977, "mean_token_accuracy": 0.17256153672933577, "num_tokens": 5332478.0, "step": 2930 }, { "entropy": 6.164068746566772, "epoch": 2.5217017619252258, "grad_norm": 1.03125, "learning_rate": 0.00046423627599776076, "loss": 5.881, "mean_token_accuracy": 0.16446806192398072, "num_tokens": 5341635.0, "step": 2935 }, { "entropy": 6.083116579055786, "epoch": 2.5259991405242803, "grad_norm": 1.046875, "learning_rate": 0.000464056210508951, "loss": 5.8562, "mean_token_accuracy": 0.17011034190654756, "num_tokens": 5350144.0, "step": 2940 }, { "entropy": 6.115471887588501, "epoch": 2.530296519123335, "grad_norm": 1.1171875, "learning_rate": 0.0004638757321969426, "loss": 5.7867, "mean_token_accuracy": 0.17099311649799348, "num_tokens": 5358788.0, "step": 2945 }, { "entropy": 6.1144256591796875, "epoch": 2.5345938977223894, "grad_norm": 1.0859375, "learning_rate": 0.00046369484145582815, "loss": 5.8724, "mean_token_accuracy": 0.16514718383550644, "num_tokens": 5368057.0, "step": 2950 }, { "entropy": 6.012566184997558, "epoch": 2.538891276321444, "grad_norm": 1.0546875, "learning_rate": 0.00046351353868060054, "loss": 5.7114, "mean_token_accuracy": 0.18013515770435334, "num_tokens": 5376739.0, "step": 2955 }, { "entropy": 6.148663520812988, "epoch": 2.5431886549204985, "grad_norm": 1.078125, "learning_rate": 0.00046333182426715273, "loss": 5.8321, "mean_token_accuracy": 0.17200638502836227, "num_tokens": 5385967.0, "step": 2960 }, { "entropy": 6.130731725692749, "epoch": 2.547486033519553, "grad_norm": 1.046875, "learning_rate": 0.00046314969861227626, "loss": 5.8503, "mean_token_accuracy": 0.16146385669708252, "num_tokens": 5395192.0, "step": 2965 }, { "entropy": 6.0897301197052, "epoch": 2.5517834121186076, "grad_norm": 0.94921875, "learning_rate": 0.0004629671621136608, "loss": 5.8185, "mean_token_accuracy": 0.1707317277789116, "num_tokens": 5404694.0, "step": 2970 }, { "entropy": 6.1051109790802, "epoch": 2.556080790717662, "grad_norm": 1.1328125, "learning_rate": 0.0004627842151698931, "loss": 5.8208, "mean_token_accuracy": 0.1680385336279869, "num_tokens": 5413102.0, "step": 2975 }, { "entropy": 6.055868244171142, "epoch": 2.5603781693167167, "grad_norm": 1.0703125, "learning_rate": 0.00046260085818045625, "loss": 5.8548, "mean_token_accuracy": 0.17059293240308762, "num_tokens": 5423339.0, "step": 2980 }, { "entropy": 6.159391784667969, "epoch": 2.5646755479157712, "grad_norm": 1.078125, "learning_rate": 0.0004624170915457284, "loss": 5.8092, "mean_token_accuracy": 0.1737206295132637, "num_tokens": 5432377.0, "step": 2985 }, { "entropy": 6.093228054046631, "epoch": 2.5689729265148262, "grad_norm": 1.2109375, "learning_rate": 0.00046223291566698264, "loss": 5.7337, "mean_token_accuracy": 0.17590138092637062, "num_tokens": 5441038.0, "step": 2990 }, { "entropy": 6.048309326171875, "epoch": 2.5732703051138808, "grad_norm": 1.0234375, "learning_rate": 0.0004620483309463855, "loss": 5.752, "mean_token_accuracy": 0.18060761988162993, "num_tokens": 5449557.0, "step": 2995 }, { "entropy": 6.118342542648316, "epoch": 2.5775676837129353, "grad_norm": 1.078125, "learning_rate": 0.0004618633377869961, "loss": 5.8908, "mean_token_accuracy": 0.17027267962694168, "num_tokens": 5458931.0, "step": 3000 }, { "epoch": 2.5775676837129353, "eval_entropy": 5.922512502283664, "eval_loss": 6.089641094207764, "eval_mean_token_accuracy": 0.16509396373084537, "eval_num_tokens": 5458931.0, "eval_runtime": 2.0449, "eval_samples_per_second": 1735.564, "eval_steps_per_second": 217.129, "step": 3000 }, { "entropy": 6.080532598495483, "epoch": 2.58186506231199, "grad_norm": 1.0078125, "learning_rate": 0.0004616779365927656, "loss": 5.715, "mean_token_accuracy": 0.18329965472221374, "num_tokens": 5468539.0, "step": 3005 }, { "entropy": 5.97086615562439, "epoch": 2.5861624409110444, "grad_norm": 1.2734375, "learning_rate": 0.0004614921277685361, "loss": 5.6562, "mean_token_accuracy": 0.18940457850694656, "num_tokens": 5475710.0, "step": 3010 }, { "entropy": 6.014264154434204, "epoch": 2.590459819510099, "grad_norm": 1.0390625, "learning_rate": 0.00046130591172003976, "loss": 5.8105, "mean_token_accuracy": 0.17161315381526948, "num_tokens": 5484597.0, "step": 3015 }, { "entropy": 6.152267932891846, "epoch": 2.5947571981091535, "grad_norm": 1.0390625, "learning_rate": 0.0004611192888538981, "loss": 5.8783, "mean_token_accuracy": 0.16452959179878235, "num_tokens": 5493213.0, "step": 3020 }, { "entropy": 6.160617208480835, "epoch": 2.599054576708208, "grad_norm": 1.2890625, "learning_rate": 0.00046093225957762084, "loss": 5.8774, "mean_token_accuracy": 0.1677599936723709, "num_tokens": 5502556.0, "step": 3025 }, { "entropy": 6.121158790588379, "epoch": 2.6033519553072626, "grad_norm": 1.078125, "learning_rate": 0.0004607448242996051, "loss": 5.7783, "mean_token_accuracy": 0.17667657732963563, "num_tokens": 5511779.0, "step": 3030 }, { "entropy": 6.09659161567688, "epoch": 2.607649333906317, "grad_norm": 1.0859375, "learning_rate": 0.0004605569834291347, "loss": 5.7762, "mean_token_accuracy": 0.17719159871339799, "num_tokens": 5520836.0, "step": 3035 }, { "entropy": 6.045120048522949, "epoch": 2.6119467125053717, "grad_norm": 1.1953125, "learning_rate": 0.00046036873737637904, "loss": 5.7919, "mean_token_accuracy": 0.1743898034095764, "num_tokens": 5529285.0, "step": 3040 }, { "entropy": 6.032482624053955, "epoch": 2.6162440911044262, "grad_norm": 1.0859375, "learning_rate": 0.0004601800865523921, "loss": 5.795, "mean_token_accuracy": 0.171583154797554, "num_tokens": 5538160.0, "step": 3045 }, { "entropy": 6.106417560577393, "epoch": 2.620541469703481, "grad_norm": 1.0859375, "learning_rate": 0.00045999103136911204, "loss": 5.801, "mean_token_accuracy": 0.16901974827051164, "num_tokens": 5547355.0, "step": 3050 }, { "entropy": 6.082255268096924, "epoch": 2.6248388483025353, "grad_norm": 1.0390625, "learning_rate": 0.00045980157223935965, "loss": 5.8134, "mean_token_accuracy": 0.17005283683538436, "num_tokens": 5557299.0, "step": 3055 }, { "entropy": 6.022762966156006, "epoch": 2.62913622690159, "grad_norm": 1.015625, "learning_rate": 0.00045961170957683806, "loss": 5.7335, "mean_token_accuracy": 0.17649647146463393, "num_tokens": 5565469.0, "step": 3060 }, { "entropy": 6.093750953674316, "epoch": 2.6334336055006444, "grad_norm": 1.0390625, "learning_rate": 0.00045942144379613147, "loss": 5.8526, "mean_token_accuracy": 0.17166510373353958, "num_tokens": 5574740.0, "step": 3065 }, { "entropy": 6.13080358505249, "epoch": 2.637730984099699, "grad_norm": 1.0703125, "learning_rate": 0.00045923077531270426, "loss": 5.8407, "mean_token_accuracy": 0.1691308617591858, "num_tokens": 5583438.0, "step": 3070 }, { "entropy": 6.099384212493897, "epoch": 2.6420283626987535, "grad_norm": 1.078125, "learning_rate": 0.0004590397045429001, "loss": 5.8094, "mean_token_accuracy": 0.17814612835645677, "num_tokens": 5592389.0, "step": 3075 }, { "entropy": 6.034897947311402, "epoch": 2.646325741297808, "grad_norm": 0.984375, "learning_rate": 0.00045884823190394134, "loss": 5.7097, "mean_token_accuracy": 0.18135049045085908, "num_tokens": 5601598.0, "step": 3080 }, { "entropy": 6.026404762268067, "epoch": 2.650623119896863, "grad_norm": 1.1484375, "learning_rate": 0.0004586563578139275, "loss": 5.7991, "mean_token_accuracy": 0.16875589936971663, "num_tokens": 5610498.0, "step": 3085 }, { "entropy": 6.030868768692017, "epoch": 2.6549204984959176, "grad_norm": 1.1875, "learning_rate": 0.00045846408269183505, "loss": 5.7054, "mean_token_accuracy": 0.1822717770934105, "num_tokens": 5620082.0, "step": 3090 }, { "entropy": 6.124869680404663, "epoch": 2.659217877094972, "grad_norm": 1.0546875, "learning_rate": 0.00045827140695751603, "loss": 5.7925, "mean_token_accuracy": 0.17561250925064087, "num_tokens": 5630291.0, "step": 3095 }, { "entropy": 6.034296226501465, "epoch": 2.6635152556940267, "grad_norm": 1.1875, "learning_rate": 0.0004580783310316971, "loss": 5.7735, "mean_token_accuracy": 0.1745203271508217, "num_tokens": 5638784.0, "step": 3100 }, { "entropy": 5.955547714233399, "epoch": 2.6678126342930812, "grad_norm": 1.09375, "learning_rate": 0.00045788485533597895, "loss": 5.6462, "mean_token_accuracy": 0.1883938804268837, "num_tokens": 5647968.0, "step": 3105 }, { "entropy": 6.074917793273926, "epoch": 2.672110012892136, "grad_norm": 1.0625, "learning_rate": 0.00045769098029283526, "loss": 5.8675, "mean_token_accuracy": 0.16595781594514847, "num_tokens": 5657543.0, "step": 3110 }, { "entropy": 6.098982810974121, "epoch": 2.6764073914911903, "grad_norm": 1.1328125, "learning_rate": 0.0004574967063256115, "loss": 5.7801, "mean_token_accuracy": 0.17747018337249756, "num_tokens": 5666535.0, "step": 3115 }, { "entropy": 6.071979904174805, "epoch": 2.680704770090245, "grad_norm": 1.125, "learning_rate": 0.00045730203385852447, "loss": 5.8624, "mean_token_accuracy": 0.17044119387865067, "num_tokens": 5676273.0, "step": 3120 }, { "entropy": 5.99071159362793, "epoch": 2.6850021486892994, "grad_norm": 1.0546875, "learning_rate": 0.000457106963316661, "loss": 5.7612, "mean_token_accuracy": 0.177787147462368, "num_tokens": 5684888.0, "step": 3125 }, { "entropy": 6.075513315200806, "epoch": 2.689299527288354, "grad_norm": 1.0859375, "learning_rate": 0.00045691149512597717, "loss": 5.8228, "mean_token_accuracy": 0.17054860144853592, "num_tokens": 5693626.0, "step": 3130 }, { "entropy": 6.097319412231445, "epoch": 2.6935969058874085, "grad_norm": 1.3671875, "learning_rate": 0.00045671562971329736, "loss": 5.7252, "mean_token_accuracy": 0.18006587252020836, "num_tokens": 5702542.0, "step": 3135 }, { "entropy": 5.990765905380249, "epoch": 2.6978942844864635, "grad_norm": 1.1875, "learning_rate": 0.00045651936750631337, "loss": 5.7717, "mean_token_accuracy": 0.17453034669160844, "num_tokens": 5711440.0, "step": 3140 }, { "entropy": 6.161540746688843, "epoch": 2.702191663085518, "grad_norm": 1.0546875, "learning_rate": 0.00045632270893358333, "loss": 5.8408, "mean_token_accuracy": 0.17016429752111434, "num_tokens": 5721495.0, "step": 3145 }, { "entropy": 6.113433980941773, "epoch": 2.7064890416845726, "grad_norm": 1.1171875, "learning_rate": 0.0004561256544245312, "loss": 5.8702, "mean_token_accuracy": 0.16516700088977815, "num_tokens": 5730664.0, "step": 3150 }, { "entropy": 6.01002688407898, "epoch": 2.710786420283627, "grad_norm": 1.0859375, "learning_rate": 0.000455928204409445, "loss": 5.7398, "mean_token_accuracy": 0.18062053769826888, "num_tokens": 5740229.0, "step": 3155 }, { "entropy": 6.008193063735962, "epoch": 2.7150837988826817, "grad_norm": 1.1484375, "learning_rate": 0.00045573035931947684, "loss": 5.7378, "mean_token_accuracy": 0.1744979053735733, "num_tokens": 5748549.0, "step": 3160 }, { "entropy": 6.039789533615112, "epoch": 2.7193811774817362, "grad_norm": 1.125, "learning_rate": 0.0004555321195866411, "loss": 5.6806, "mean_token_accuracy": 0.17732828259468078, "num_tokens": 5757603.0, "step": 3165 }, { "entropy": 6.080458354949951, "epoch": 2.723678556080791, "grad_norm": 1.265625, "learning_rate": 0.0004553334856438143, "loss": 5.8618, "mean_token_accuracy": 0.16976358145475387, "num_tokens": 5767520.0, "step": 3170 }, { "entropy": 6.09289813041687, "epoch": 2.7279759346798453, "grad_norm": 1.0078125, "learning_rate": 0.00045513445792473356, "loss": 5.8583, "mean_token_accuracy": 0.1657076820731163, "num_tokens": 5776778.0, "step": 3175 }, { "entropy": 6.129758882522583, "epoch": 2.7322733132789, "grad_norm": 1.1328125, "learning_rate": 0.0004549350368639958, "loss": 5.8808, "mean_token_accuracy": 0.16651461273431778, "num_tokens": 5785652.0, "step": 3180 }, { "entropy": 6.144531726837158, "epoch": 2.7365706918779544, "grad_norm": 1.078125, "learning_rate": 0.00045473522289705693, "loss": 5.849, "mean_token_accuracy": 0.1734338730573654, "num_tokens": 5795766.0, "step": 3185 }, { "entropy": 5.992430114746094, "epoch": 2.740868070477009, "grad_norm": 1.140625, "learning_rate": 0.00045453501646023085, "loss": 5.8822, "mean_token_accuracy": 0.16669443398714065, "num_tokens": 5804504.0, "step": 3190 }, { "entropy": 6.04656753540039, "epoch": 2.7451654490760635, "grad_norm": 0.94921875, "learning_rate": 0.00045433441799068837, "loss": 5.7879, "mean_token_accuracy": 0.17372047603130342, "num_tokens": 5814161.0, "step": 3195 }, { "entropy": 6.094403076171875, "epoch": 2.749462827675118, "grad_norm": 1.0625, "learning_rate": 0.0004541334279264562, "loss": 5.6942, "mean_token_accuracy": 0.18637760877609252, "num_tokens": 5822235.0, "step": 3200 }, { "entropy": 6.033037233352661, "epoch": 2.7537602062741726, "grad_norm": 1.15625, "learning_rate": 0.00045393204670641656, "loss": 5.7009, "mean_token_accuracy": 0.17470744848251343, "num_tokens": 5831572.0, "step": 3205 }, { "entropy": 5.930374097824097, "epoch": 2.758057584873227, "grad_norm": 1.046875, "learning_rate": 0.0004537302747703055, "loss": 5.7328, "mean_token_accuracy": 0.18409457355737685, "num_tokens": 5839694.0, "step": 3210 }, { "entropy": 6.124579620361328, "epoch": 2.7623549634722817, "grad_norm": 1.1640625, "learning_rate": 0.00045352811255871216, "loss": 5.8448, "mean_token_accuracy": 0.17230847403407096, "num_tokens": 5849131.0, "step": 3215 }, { "entropy": 6.174058246612549, "epoch": 2.7666523420713363, "grad_norm": 0.93359375, "learning_rate": 0.00045332556051307804, "loss": 5.7711, "mean_token_accuracy": 0.1720232903957367, "num_tokens": 5858861.0, "step": 3220 }, { "entropy": 6.060689687728882, "epoch": 2.770949720670391, "grad_norm": 1.078125, "learning_rate": 0.00045312261907569585, "loss": 5.7833, "mean_token_accuracy": 0.17473076432943344, "num_tokens": 5867585.0, "step": 3225 }, { "entropy": 6.015534782409668, "epoch": 2.775247099269446, "grad_norm": 1.078125, "learning_rate": 0.00045291928868970867, "loss": 5.7865, "mean_token_accuracy": 0.16985856741666794, "num_tokens": 5876256.0, "step": 3230 }, { "entropy": 6.038353967666626, "epoch": 2.7795444778685003, "grad_norm": 1.0703125, "learning_rate": 0.0004527155697991087, "loss": 5.8471, "mean_token_accuracy": 0.16595111042261124, "num_tokens": 5885302.0, "step": 3235 }, { "entropy": 6.021387720108033, "epoch": 2.783841856467555, "grad_norm": 0.94921875, "learning_rate": 0.0004525114628487365, "loss": 5.8628, "mean_token_accuracy": 0.16938397139310837, "num_tokens": 5895066.0, "step": 3240 }, { "entropy": 6.151897144317627, "epoch": 2.7881392350666094, "grad_norm": 1.0546875, "learning_rate": 0.00045230696828428026, "loss": 5.8557, "mean_token_accuracy": 0.16848236918449402, "num_tokens": 5903258.0, "step": 3245 }, { "entropy": 6.039195203781128, "epoch": 2.792436613665664, "grad_norm": 1.15625, "learning_rate": 0.0004521020865522742, "loss": 5.7511, "mean_token_accuracy": 0.1710444927215576, "num_tokens": 5911714.0, "step": 3250 }, { "entropy": 5.995623922348022, "epoch": 2.7967339922647185, "grad_norm": 1.0859375, "learning_rate": 0.00045189681810009827, "loss": 5.8176, "mean_token_accuracy": 0.17098681181669234, "num_tokens": 5920432.0, "step": 3255 }, { "entropy": 6.183896541595459, "epoch": 2.801031370863773, "grad_norm": 1.234375, "learning_rate": 0.00045169116337597653, "loss": 5.8195, "mean_token_accuracy": 0.1705167680978775, "num_tokens": 5929202.0, "step": 3260 }, { "entropy": 6.115947484970093, "epoch": 2.8053287494628276, "grad_norm": 1.1640625, "learning_rate": 0.000451485122828977, "loss": 5.8601, "mean_token_accuracy": 0.16596955806016922, "num_tokens": 5938034.0, "step": 3265 }, { "entropy": 5.939763784408569, "epoch": 2.809626128061882, "grad_norm": 1.015625, "learning_rate": 0.00045127869690900956, "loss": 5.7097, "mean_token_accuracy": 0.18104571253061294, "num_tokens": 5946944.0, "step": 3270 }, { "entropy": 6.011037588119507, "epoch": 2.8139235066609367, "grad_norm": 1.21875, "learning_rate": 0.00045107188606682613, "loss": 5.8219, "mean_token_accuracy": 0.17439836859703065, "num_tokens": 5956475.0, "step": 3275 }, { "entropy": 6.145265722274781, "epoch": 2.8182208852599913, "grad_norm": 1.125, "learning_rate": 0.0004508646907540188, "loss": 5.7788, "mean_token_accuracy": 0.1669231042265892, "num_tokens": 5965814.0, "step": 3280 }, { "entropy": 6.047088956832885, "epoch": 2.8225182638590463, "grad_norm": 1.1484375, "learning_rate": 0.0004506571114230195, "loss": 5.839, "mean_token_accuracy": 0.16392946541309356, "num_tokens": 5973850.0, "step": 3285 }, { "entropy": 5.966979217529297, "epoch": 2.826815642458101, "grad_norm": 1.0, "learning_rate": 0.00045044914852709824, "loss": 5.7718, "mean_token_accuracy": 0.17040073573589326, "num_tokens": 5982987.0, "step": 3290 }, { "entropy": 6.124370384216308, "epoch": 2.8311130210571553, "grad_norm": 1.1328125, "learning_rate": 0.0004502408025203631, "loss": 5.7567, "mean_token_accuracy": 0.18116641640663148, "num_tokens": 5992227.0, "step": 3295 }, { "entropy": 6.060417222976684, "epoch": 2.83541039965621, "grad_norm": 1.046875, "learning_rate": 0.0004500320738577584, "loss": 5.7373, "mean_token_accuracy": 0.18134199529886247, "num_tokens": 6000243.0, "step": 3300 }, { "entropy": 6.009725856781006, "epoch": 2.8397077782552644, "grad_norm": 1.1171875, "learning_rate": 0.00044982296299506407, "loss": 5.7396, "mean_token_accuracy": 0.1772996261715889, "num_tokens": 6009771.0, "step": 3305 }, { "entropy": 6.065958309173584, "epoch": 2.844005156854319, "grad_norm": 1.203125, "learning_rate": 0.0004496134703888948, "loss": 5.8227, "mean_token_accuracy": 0.17212264090776444, "num_tokens": 6018683.0, "step": 3310 }, { "entropy": 6.064363861083985, "epoch": 2.8483025354533735, "grad_norm": 1.0859375, "learning_rate": 0.00044940359649669846, "loss": 5.6744, "mean_token_accuracy": 0.18227704763412475, "num_tokens": 6027422.0, "step": 3315 }, { "entropy": 5.975715494155883, "epoch": 2.852599914052428, "grad_norm": 1.0859375, "learning_rate": 0.00044919334177675595, "loss": 5.7633, "mean_token_accuracy": 0.1744269087910652, "num_tokens": 6035670.0, "step": 3320 }, { "entropy": 6.036557149887085, "epoch": 2.8568972926514826, "grad_norm": 1.1015625, "learning_rate": 0.00044898270668817955, "loss": 5.6979, "mean_token_accuracy": 0.1815047174692154, "num_tokens": 6044092.0, "step": 3325 }, { "entropy": 6.020293951034546, "epoch": 2.861194671250537, "grad_norm": 0.99609375, "learning_rate": 0.000448771691690912, "loss": 5.7773, "mean_token_accuracy": 0.16777832806110382, "num_tokens": 6053970.0, "step": 3330 }, { "entropy": 6.0354420185089115, "epoch": 2.8654920498495917, "grad_norm": 1.0546875, "learning_rate": 0.0004485602972457257, "loss": 5.7383, "mean_token_accuracy": 0.1797216445207596, "num_tokens": 6062965.0, "step": 3335 }, { "entropy": 6.068978786468506, "epoch": 2.8697894284486463, "grad_norm": 1.0859375, "learning_rate": 0.00044834852381422165, "loss": 5.8049, "mean_token_accuracy": 0.17490418255329132, "num_tokens": 6072420.0, "step": 3340 }, { "entropy": 6.005989837646484, "epoch": 2.874086807047701, "grad_norm": 1.078125, "learning_rate": 0.00044813637185882836, "loss": 5.7175, "mean_token_accuracy": 0.17540892213582993, "num_tokens": 6080915.0, "step": 3345 }, { "entropy": 6.08394684791565, "epoch": 2.8783841856467554, "grad_norm": 1.1953125, "learning_rate": 0.00044792384184280106, "loss": 5.8546, "mean_token_accuracy": 0.16469819992780685, "num_tokens": 6090453.0, "step": 3350 }, { "entropy": 6.007422256469726, "epoch": 2.88268156424581, "grad_norm": 1.1171875, "learning_rate": 0.00044771093423022013, "loss": 5.8795, "mean_token_accuracy": 0.16327449679374695, "num_tokens": 6099390.0, "step": 3355 }, { "entropy": 6.039740133285522, "epoch": 2.8869789428448644, "grad_norm": 1.03125, "learning_rate": 0.0004474976494859909, "loss": 5.798, "mean_token_accuracy": 0.17494250684976578, "num_tokens": 6108677.0, "step": 3360 }, { "entropy": 6.058880233764649, "epoch": 2.891276321443919, "grad_norm": 0.98828125, "learning_rate": 0.0004472839880758419, "loss": 5.716, "mean_token_accuracy": 0.17659443318843843, "num_tokens": 6117151.0, "step": 3365 }, { "entropy": 6.116930532455444, "epoch": 2.8955737000429735, "grad_norm": 1.140625, "learning_rate": 0.0004470699504663242, "loss": 5.8387, "mean_token_accuracy": 0.16731317192316056, "num_tokens": 6127167.0, "step": 3370 }, { "entropy": 6.032348012924194, "epoch": 2.899871078642028, "grad_norm": 1.0703125, "learning_rate": 0.0004468555371248104, "loss": 5.7315, "mean_token_accuracy": 0.1812448814511299, "num_tokens": 6136487.0, "step": 3375 }, { "entropy": 6.042793130874633, "epoch": 2.904168457241083, "grad_norm": 1.0546875, "learning_rate": 0.0004466407485194937, "loss": 5.8432, "mean_token_accuracy": 0.16948612183332443, "num_tokens": 6145334.0, "step": 3380 }, { "entropy": 6.0380902767181395, "epoch": 2.9084658358401376, "grad_norm": 1.0546875, "learning_rate": 0.0004464255851193864, "loss": 5.7558, "mean_token_accuracy": 0.17524855434894562, "num_tokens": 6155062.0, "step": 3385 }, { "entropy": 6.0504677295684814, "epoch": 2.912763214439192, "grad_norm": 1.84375, "learning_rate": 0.0004462100473943194, "loss": 5.6948, "mean_token_accuracy": 0.1831045612692833, "num_tokens": 6164313.0, "step": 3390 }, { "entropy": 6.009590721130371, "epoch": 2.9170605930382467, "grad_norm": 1.0234375, "learning_rate": 0.000445994135814941, "loss": 5.7596, "mean_token_accuracy": 0.17282265722751616, "num_tokens": 6173513.0, "step": 3395 }, { "entropy": 6.019471836090088, "epoch": 2.9213579716373013, "grad_norm": 1.2578125, "learning_rate": 0.00044577785085271566, "loss": 5.7717, "mean_token_accuracy": 0.17321082055568696, "num_tokens": 6182000.0, "step": 3400 }, { "entropy": 6.081956481933593, "epoch": 2.925655350236356, "grad_norm": 1.0, "learning_rate": 0.0004455611929799235, "loss": 5.8084, "mean_token_accuracy": 0.16455612033605577, "num_tokens": 6191887.0, "step": 3405 }, { "entropy": 5.956260538101196, "epoch": 2.9299527288354104, "grad_norm": 1.0234375, "learning_rate": 0.0004453441626696585, "loss": 5.8554, "mean_token_accuracy": 0.16420858800411225, "num_tokens": 6202897.0, "step": 3410 }, { "entropy": 6.078850793838501, "epoch": 2.934250107434465, "grad_norm": 1.0625, "learning_rate": 0.00044512676039582823, "loss": 5.7438, "mean_token_accuracy": 0.18065994828939438, "num_tokens": 6211811.0, "step": 3415 }, { "entropy": 6.1270510196685795, "epoch": 2.9385474860335195, "grad_norm": 1.125, "learning_rate": 0.0004449089866331524, "loss": 5.7475, "mean_token_accuracy": 0.1779635578393936, "num_tokens": 6219896.0, "step": 3420 }, { "entropy": 5.887205791473389, "epoch": 2.942844864632574, "grad_norm": 1.109375, "learning_rate": 0.0004446908418571617, "loss": 5.737, "mean_token_accuracy": 0.17775188386440277, "num_tokens": 6228212.0, "step": 3425 }, { "entropy": 6.039950180053711, "epoch": 2.9471422432316285, "grad_norm": 1.046875, "learning_rate": 0.0004444723265441973, "loss": 5.896, "mean_token_accuracy": 0.16747722327709197, "num_tokens": 6238133.0, "step": 3430 }, { "entropy": 6.076528787612915, "epoch": 2.9514396218306835, "grad_norm": 0.9765625, "learning_rate": 0.0004442534411714092, "loss": 5.7945, "mean_token_accuracy": 0.16944347620010375, "num_tokens": 6247331.0, "step": 3435 }, { "entropy": 6.101355123519897, "epoch": 2.955737000429738, "grad_norm": 1.125, "learning_rate": 0.00044403418621675555, "loss": 5.7926, "mean_token_accuracy": 0.17052113264799118, "num_tokens": 6255280.0, "step": 3440 }, { "entropy": 6.050349187850952, "epoch": 2.9600343790287926, "grad_norm": 1.0625, "learning_rate": 0.0004438145621590017, "loss": 5.7555, "mean_token_accuracy": 0.17491218596696853, "num_tokens": 6264752.0, "step": 3445 }, { "entropy": 5.9742930889129635, "epoch": 2.964331757627847, "grad_norm": 1.140625, "learning_rate": 0.00044359456947771857, "loss": 5.7023, "mean_token_accuracy": 0.17401786297559738, "num_tokens": 6273258.0, "step": 3450 }, { "entropy": 5.869597768783569, "epoch": 2.9686291362269017, "grad_norm": 1.21875, "learning_rate": 0.0004433742086532824, "loss": 5.6228, "mean_token_accuracy": 0.19265587478876114, "num_tokens": 6281584.0, "step": 3455 }, { "entropy": 6.010894346237182, "epoch": 2.9729265148259563, "grad_norm": 1.203125, "learning_rate": 0.00044315348016687317, "loss": 5.7472, "mean_token_accuracy": 0.17217940390110015, "num_tokens": 6290016.0, "step": 3460 }, { "entropy": 5.982230138778687, "epoch": 2.977223893425011, "grad_norm": 1.0703125, "learning_rate": 0.0004429323845004736, "loss": 5.6523, "mean_token_accuracy": 0.18324829190969466, "num_tokens": 6298569.0, "step": 3465 }, { "entropy": 6.025563192367554, "epoch": 2.9815212720240654, "grad_norm": 1.015625, "learning_rate": 0.00044271092213686824, "loss": 5.6855, "mean_token_accuracy": 0.18166320472955705, "num_tokens": 6307684.0, "step": 3470 }, { "entropy": 6.1143230438232425, "epoch": 2.98581865062312, "grad_norm": 0.98046875, "learning_rate": 0.00044248909355964247, "loss": 5.8192, "mean_token_accuracy": 0.17195742726325988, "num_tokens": 6317767.0, "step": 3475 }, { "entropy": 6.101650047302246, "epoch": 2.9901160292221745, "grad_norm": 1.140625, "learning_rate": 0.00044226689925318117, "loss": 5.8454, "mean_token_accuracy": 0.16614989936351776, "num_tokens": 6327457.0, "step": 3480 }, { "entropy": 5.9672809600830075, "epoch": 2.994413407821229, "grad_norm": 1.0703125, "learning_rate": 0.00044204433970266785, "loss": 5.6491, "mean_token_accuracy": 0.1888262301683426, "num_tokens": 6335747.0, "step": 3485 }, { "entropy": 5.983137941360473, "epoch": 2.9987107864202835, "grad_norm": 1.078125, "learning_rate": 0.0004418214153940837, "loss": 5.7429, "mean_token_accuracy": 0.18020158410072326, "num_tokens": 6344750.0, "step": 3490 }, { "entropy": 6.069730917612712, "epoch": 3.002578427159433, "grad_norm": 0.9140625, "learning_rate": 0.00044159812681420624, "loss": 5.6774, "mean_token_accuracy": 0.18010619613859388, "num_tokens": 6354779.0, "step": 3495 }, { "entropy": 6.04861307144165, "epoch": 3.0068758057584875, "grad_norm": 1.109375, "learning_rate": 0.0004413744744506086, "loss": 5.4671, "mean_token_accuracy": 0.18785551339387893, "num_tokens": 6363809.0, "step": 3500 }, { "epoch": 3.0068758057584875, "eval_entropy": 5.758008357640859, "eval_loss": 6.010996341705322, "eval_mean_token_accuracy": 0.17076294478196818, "eval_num_tokens": 6363809.0, "eval_runtime": 2.0461, "eval_samples_per_second": 1734.535, "eval_steps_per_second": 217.0, "step": 3500 }, { "entropy": 5.996388673782349, "epoch": 3.011173184357542, "grad_norm": 1.078125, "learning_rate": 0.00044115045879165806, "loss": 5.5232, "mean_token_accuracy": 0.18338106274604798, "num_tokens": 6373082.0, "step": 3505 }, { "entropy": 6.007624244689941, "epoch": 3.0154705629565965, "grad_norm": 1.1328125, "learning_rate": 0.00044092608032651515, "loss": 5.4884, "mean_token_accuracy": 0.18493928611278534, "num_tokens": 6381286.0, "step": 3510 }, { "entropy": 6.025672864913941, "epoch": 3.019767941555651, "grad_norm": 1.0, "learning_rate": 0.00044070133954513305, "loss": 5.4331, "mean_token_accuracy": 0.19577560871839522, "num_tokens": 6390217.0, "step": 3515 }, { "entropy": 6.000850200653076, "epoch": 3.0240653201547056, "grad_norm": 1.2421875, "learning_rate": 0.0004404762369382555, "loss": 5.4683, "mean_token_accuracy": 0.18827376067638396, "num_tokens": 6399276.0, "step": 3520 }, { "entropy": 5.93954758644104, "epoch": 3.02836269875376, "grad_norm": 1.1640625, "learning_rate": 0.00044025077299741683, "loss": 5.4445, "mean_token_accuracy": 0.1970704421401024, "num_tokens": 6407981.0, "step": 3525 }, { "entropy": 5.957830190658569, "epoch": 3.0326600773528147, "grad_norm": 1.1484375, "learning_rate": 0.00044002494821494007, "loss": 5.4438, "mean_token_accuracy": 0.1922784000635147, "num_tokens": 6416159.0, "step": 3530 }, { "entropy": 5.9005261898040775, "epoch": 3.0369574559518693, "grad_norm": 1.109375, "learning_rate": 0.00043979876308393635, "loss": 5.4964, "mean_token_accuracy": 0.19178588539361954, "num_tokens": 6424564.0, "step": 3535 }, { "entropy": 6.050303220748901, "epoch": 3.041254834550924, "grad_norm": 1.0625, "learning_rate": 0.0004395722180983036, "loss": 5.5298, "mean_token_accuracy": 0.1850397542119026, "num_tokens": 6434163.0, "step": 3540 }, { "entropy": 5.894874811172485, "epoch": 3.0455522131499784, "grad_norm": 1.1171875, "learning_rate": 0.00043934531375272535, "loss": 5.3505, "mean_token_accuracy": 0.20217433124780654, "num_tokens": 6443372.0, "step": 3545 }, { "entropy": 5.910711765289307, "epoch": 3.049849591749033, "grad_norm": 1.0, "learning_rate": 0.00043911805054267015, "loss": 5.4569, "mean_token_accuracy": 0.19326651990413665, "num_tokens": 6452638.0, "step": 3550 }, { "entropy": 6.0943183422088625, "epoch": 3.0541469703480875, "grad_norm": 1.0703125, "learning_rate": 0.00043889042896439004, "loss": 5.4504, "mean_token_accuracy": 0.19366029798984527, "num_tokens": 6461319.0, "step": 3555 }, { "entropy": 5.933924388885498, "epoch": 3.0584443489471425, "grad_norm": 1.2890625, "learning_rate": 0.00043866244951491946, "loss": 5.3807, "mean_token_accuracy": 0.20453428626060485, "num_tokens": 6469506.0, "step": 3560 }, { "entropy": 5.953407573699951, "epoch": 3.062741727546197, "grad_norm": 1.1171875, "learning_rate": 0.00043843411269207445, "loss": 5.437, "mean_token_accuracy": 0.1967134654521942, "num_tokens": 6478404.0, "step": 3565 }, { "entropy": 5.93691873550415, "epoch": 3.0670391061452515, "grad_norm": 1.09375, "learning_rate": 0.0004382054189944514, "loss": 5.3816, "mean_token_accuracy": 0.19278321117162706, "num_tokens": 6487447.0, "step": 3570 }, { "entropy": 5.878192138671875, "epoch": 3.071336484744306, "grad_norm": 1.046875, "learning_rate": 0.0004379763689214259, "loss": 5.4196, "mean_token_accuracy": 0.18803493976593016, "num_tokens": 6496738.0, "step": 3575 }, { "entropy": 5.99528021812439, "epoch": 3.0756338633433606, "grad_norm": 0.984375, "learning_rate": 0.0004377469629731518, "loss": 5.4317, "mean_token_accuracy": 0.193548683822155, "num_tokens": 6505848.0, "step": 3580 }, { "entropy": 5.963389873504639, "epoch": 3.079931241942415, "grad_norm": 1.0390625, "learning_rate": 0.0004375172016505599, "loss": 5.4138, "mean_token_accuracy": 0.19534891694784165, "num_tokens": 6515731.0, "step": 3585 }, { "entropy": 5.967396306991577, "epoch": 3.0842286205414697, "grad_norm": 1.0390625, "learning_rate": 0.0004372870854553572, "loss": 5.4706, "mean_token_accuracy": 0.194082772731781, "num_tokens": 6524914.0, "step": 3590 }, { "entropy": 5.949575996398925, "epoch": 3.0885259991405243, "grad_norm": 1.078125, "learning_rate": 0.0004370566148900255, "loss": 5.4527, "mean_token_accuracy": 0.19753452241420746, "num_tokens": 6533712.0, "step": 3595 }, { "entropy": 5.982216501235962, "epoch": 3.092823377739579, "grad_norm": 1.0859375, "learning_rate": 0.00043682579045782024, "loss": 5.5375, "mean_token_accuracy": 0.18995364159345626, "num_tokens": 6543313.0, "step": 3600 }, { "entropy": 5.923231220245361, "epoch": 3.0971207563386334, "grad_norm": 1.203125, "learning_rate": 0.0004365946126627699, "loss": 5.4189, "mean_token_accuracy": 0.2017338365316391, "num_tokens": 6551634.0, "step": 3605 }, { "entropy": 5.981328535079956, "epoch": 3.101418134937688, "grad_norm": 1.0859375, "learning_rate": 0.00043636308200967433, "loss": 5.4241, "mean_token_accuracy": 0.2000526711344719, "num_tokens": 6560695.0, "step": 3610 }, { "entropy": 5.813827800750732, "epoch": 3.1057155135367425, "grad_norm": 1.0390625, "learning_rate": 0.0004361311990041039, "loss": 5.3344, "mean_token_accuracy": 0.1969393327832222, "num_tokens": 6569086.0, "step": 3615 }, { "entropy": 5.89613938331604, "epoch": 3.110012892135797, "grad_norm": 1.09375, "learning_rate": 0.00043589896415239843, "loss": 5.4161, "mean_token_accuracy": 0.19979367852211, "num_tokens": 6578287.0, "step": 3620 }, { "entropy": 5.952451086044311, "epoch": 3.1143102707348516, "grad_norm": 1.015625, "learning_rate": 0.00043566637796166595, "loss": 5.4753, "mean_token_accuracy": 0.19049297720193864, "num_tokens": 6587015.0, "step": 3625 }, { "entropy": 5.962173509597778, "epoch": 3.118607649333906, "grad_norm": 1.1796875, "learning_rate": 0.00043543344093978186, "loss": 5.5175, "mean_token_accuracy": 0.18538623303174973, "num_tokens": 6596187.0, "step": 3630 }, { "entropy": 5.912763595581055, "epoch": 3.122905027932961, "grad_norm": 1.109375, "learning_rate": 0.00043520015359538745, "loss": 5.3898, "mean_token_accuracy": 0.19703881144523622, "num_tokens": 6605226.0, "step": 3635 }, { "entropy": 5.858266019821167, "epoch": 3.1272024065320156, "grad_norm": 1.1328125, "learning_rate": 0.0004349665164378891, "loss": 5.4371, "mean_token_accuracy": 0.1935065433382988, "num_tokens": 6613232.0, "step": 3640 }, { "entropy": 5.901705503463745, "epoch": 3.13149978513107, "grad_norm": 1.140625, "learning_rate": 0.00043473252997745684, "loss": 5.4392, "mean_token_accuracy": 0.19095546007156372, "num_tokens": 6622247.0, "step": 3645 }, { "entropy": 5.985095024108887, "epoch": 3.1357971637301247, "grad_norm": 1.6171875, "learning_rate": 0.00043449819472502366, "loss": 5.3871, "mean_token_accuracy": 0.19558012783527373, "num_tokens": 6630883.0, "step": 3650 }, { "entropy": 5.880083703994751, "epoch": 3.1400945423291793, "grad_norm": 1.1328125, "learning_rate": 0.0004342635111922841, "loss": 5.5374, "mean_token_accuracy": 0.19031234830617905, "num_tokens": 6639399.0, "step": 3655 }, { "entropy": 5.943640947341919, "epoch": 3.144391920928234, "grad_norm": 1.140625, "learning_rate": 0.0004340284798916931, "loss": 5.433, "mean_token_accuracy": 0.19181231111288072, "num_tokens": 6649288.0, "step": 3660 }, { "entropy": 5.898565721511841, "epoch": 3.1486892995272884, "grad_norm": 1.03125, "learning_rate": 0.0004337931013364653, "loss": 5.3804, "mean_token_accuracy": 0.19686342626810074, "num_tokens": 6658670.0, "step": 3665 }, { "entropy": 5.893218088150024, "epoch": 3.152986678126343, "grad_norm": 1.125, "learning_rate": 0.000433557376040573, "loss": 5.4538, "mean_token_accuracy": 0.196715846657753, "num_tokens": 6667302.0, "step": 3670 }, { "entropy": 5.956412696838379, "epoch": 3.1572840567253975, "grad_norm": 1.09375, "learning_rate": 0.00043332130451874645, "loss": 5.4965, "mean_token_accuracy": 0.1952889919281006, "num_tokens": 6677393.0, "step": 3675 }, { "entropy": 5.933012199401856, "epoch": 3.161581435324452, "grad_norm": 0.96875, "learning_rate": 0.00043308488728647127, "loss": 5.4744, "mean_token_accuracy": 0.1893087148666382, "num_tokens": 6686727.0, "step": 3680 }, { "entropy": 5.877901268005371, "epoch": 3.1658788139235066, "grad_norm": 1.1875, "learning_rate": 0.0004328481248599882, "loss": 5.3869, "mean_token_accuracy": 0.19530683755874634, "num_tokens": 6696116.0, "step": 3685 }, { "entropy": 5.934775876998901, "epoch": 3.170176192522561, "grad_norm": 1.078125, "learning_rate": 0.0004326110177562918, "loss": 5.4945, "mean_token_accuracy": 0.18531183749437333, "num_tokens": 6704640.0, "step": 3690 }, { "entropy": 5.860188579559326, "epoch": 3.1744735711216157, "grad_norm": 1.2265625, "learning_rate": 0.00043237356649312926, "loss": 5.3497, "mean_token_accuracy": 0.20377567410469055, "num_tokens": 6713663.0, "step": 3695 }, { "entropy": 5.891902303695678, "epoch": 3.17877094972067, "grad_norm": 1.0703125, "learning_rate": 0.0004321357715889991, "loss": 5.4868, "mean_token_accuracy": 0.18891167342662812, "num_tokens": 6722965.0, "step": 3700 }, { "entropy": 5.922307395935059, "epoch": 3.1830683283197247, "grad_norm": 1.140625, "learning_rate": 0.0004318976335631505, "loss": 5.4553, "mean_token_accuracy": 0.19657856673002244, "num_tokens": 6732776.0, "step": 3705 }, { "entropy": 5.93743257522583, "epoch": 3.1873657069187797, "grad_norm": 1.0703125, "learning_rate": 0.00043165915293558155, "loss": 5.4328, "mean_token_accuracy": 0.19283491969108582, "num_tokens": 6741309.0, "step": 3710 }, { "entropy": 5.882253980636596, "epoch": 3.1916630855178343, "grad_norm": 1.0703125, "learning_rate": 0.0004314203302270388, "loss": 5.4972, "mean_token_accuracy": 0.19080058485269547, "num_tokens": 6750584.0, "step": 3715 }, { "entropy": 5.9671601295471195, "epoch": 3.195960464116889, "grad_norm": 1.1953125, "learning_rate": 0.0004311811659590154, "loss": 5.4717, "mean_token_accuracy": 0.19037737101316451, "num_tokens": 6759344.0, "step": 3720 }, { "entropy": 5.958385229110718, "epoch": 3.2002578427159434, "grad_norm": 0.90234375, "learning_rate": 0.0004309416606537507, "loss": 5.6122, "mean_token_accuracy": 0.18305473029613495, "num_tokens": 6770345.0, "step": 3725 }, { "entropy": 5.974073982238769, "epoch": 3.204555221314998, "grad_norm": 1.1328125, "learning_rate": 0.00043070181483422843, "loss": 5.5015, "mean_token_accuracy": 0.18963303416967392, "num_tokens": 6779991.0, "step": 3730 }, { "entropy": 5.910816812515259, "epoch": 3.2088525999140525, "grad_norm": 1.1875, "learning_rate": 0.000430461629024176, "loss": 5.4509, "mean_token_accuracy": 0.19442620873451233, "num_tokens": 6788972.0, "step": 3735 }, { "entropy": 5.827464151382446, "epoch": 3.213149978513107, "grad_norm": 1.203125, "learning_rate": 0.0004302211037480634, "loss": 5.3772, "mean_token_accuracy": 0.19249555021524428, "num_tokens": 6796967.0, "step": 3740 }, { "entropy": 5.8557556629180905, "epoch": 3.2174473571121616, "grad_norm": 1.2421875, "learning_rate": 0.0004299802395311015, "loss": 5.4743, "mean_token_accuracy": 0.19575096070766448, "num_tokens": 6805961.0, "step": 3745 }, { "entropy": 5.86885895729065, "epoch": 3.221744735711216, "grad_norm": 1.265625, "learning_rate": 0.0004297390368992414, "loss": 5.3787, "mean_token_accuracy": 0.19657269567251207, "num_tokens": 6814657.0, "step": 3750 }, { "entropy": 5.923396444320678, "epoch": 3.2260421143102707, "grad_norm": 1.2109375, "learning_rate": 0.00042949749637917353, "loss": 5.4168, "mean_token_accuracy": 0.1941995695233345, "num_tokens": 6823095.0, "step": 3755 }, { "entropy": 5.889871215820312, "epoch": 3.230339492909325, "grad_norm": 1.0, "learning_rate": 0.0004292556184983256, "loss": 5.4421, "mean_token_accuracy": 0.1958567351102829, "num_tokens": 6832195.0, "step": 3760 }, { "entropy": 5.969770717620849, "epoch": 3.2346368715083798, "grad_norm": 1.203125, "learning_rate": 0.0004290134037848623, "loss": 5.575, "mean_token_accuracy": 0.1806161344051361, "num_tokens": 6840922.0, "step": 3765 }, { "entropy": 5.95328722000122, "epoch": 3.2389342501074343, "grad_norm": 1.1953125, "learning_rate": 0.00042877085276768386, "loss": 5.4178, "mean_token_accuracy": 0.20026799887418748, "num_tokens": 6849182.0, "step": 3770 }, { "entropy": 5.878392887115479, "epoch": 3.243231628706489, "grad_norm": 1.09375, "learning_rate": 0.00042852796597642455, "loss": 5.408, "mean_token_accuracy": 0.19837529808282853, "num_tokens": 6857932.0, "step": 3775 }, { "entropy": 5.953036594390869, "epoch": 3.247529007305544, "grad_norm": 1.125, "learning_rate": 0.0004282847439414522, "loss": 5.5606, "mean_token_accuracy": 0.18480827659368515, "num_tokens": 6867283.0, "step": 3780 }, { "entropy": 5.960994386672974, "epoch": 3.2518263859045984, "grad_norm": 1.1015625, "learning_rate": 0.0004280411871938664, "loss": 5.5237, "mean_token_accuracy": 0.18728075176477432, "num_tokens": 6876123.0, "step": 3785 }, { "entropy": 5.969729375839234, "epoch": 3.256123764503653, "grad_norm": 1.203125, "learning_rate": 0.0004277972962654979, "loss": 5.4539, "mean_token_accuracy": 0.19224015027284622, "num_tokens": 6885239.0, "step": 3790 }, { "entropy": 5.8932037353515625, "epoch": 3.2604211431027075, "grad_norm": 1.0859375, "learning_rate": 0.0004275530716889069, "loss": 5.5146, "mean_token_accuracy": 0.18382496386766434, "num_tokens": 6895061.0, "step": 3795 }, { "entropy": 5.947560358047485, "epoch": 3.264718521701762, "grad_norm": 1.234375, "learning_rate": 0.0004273085139973822, "loss": 5.5657, "mean_token_accuracy": 0.1781401515007019, "num_tokens": 6903828.0, "step": 3800 }, { "entropy": 5.989862442016602, "epoch": 3.2690159003008166, "grad_norm": 1.1484375, "learning_rate": 0.0004270636237249401, "loss": 5.4777, "mean_token_accuracy": 0.18864577561616896, "num_tokens": 6912805.0, "step": 3805 }, { "entropy": 5.920726633071899, "epoch": 3.273313278899871, "grad_norm": 1.1328125, "learning_rate": 0.00042681840140632314, "loss": 5.5243, "mean_token_accuracy": 0.18295771330595018, "num_tokens": 6922165.0, "step": 3810 }, { "entropy": 5.956135368347168, "epoch": 3.2776106574989257, "grad_norm": 1.0859375, "learning_rate": 0.0004265728475769989, "loss": 5.4939, "mean_token_accuracy": 0.1879052475094795, "num_tokens": 6931677.0, "step": 3815 }, { "entropy": 5.918879604339599, "epoch": 3.28190803609798, "grad_norm": 1.0078125, "learning_rate": 0.0004263269627731586, "loss": 5.452, "mean_token_accuracy": 0.192815600335598, "num_tokens": 6940486.0, "step": 3820 }, { "entropy": 5.825883960723877, "epoch": 3.2862054146970348, "grad_norm": 1.125, "learning_rate": 0.0004260807475317164, "loss": 5.4745, "mean_token_accuracy": 0.18577916026115418, "num_tokens": 6948990.0, "step": 3825 }, { "entropy": 5.979638195037841, "epoch": 3.2905027932960893, "grad_norm": 1.0234375, "learning_rate": 0.0004258342023903081, "loss": 5.5953, "mean_token_accuracy": 0.18115273416042327, "num_tokens": 6959311.0, "step": 3830 }, { "entropy": 5.957726049423218, "epoch": 3.294800171895144, "grad_norm": 1.125, "learning_rate": 0.00042558732788728975, "loss": 5.3649, "mean_token_accuracy": 0.20235307216644288, "num_tokens": 6968619.0, "step": 3835 }, { "entropy": 5.867683029174804, "epoch": 3.2990975504941984, "grad_norm": 1.109375, "learning_rate": 0.00042534012456173643, "loss": 5.4398, "mean_token_accuracy": 0.1914222314953804, "num_tokens": 6977469.0, "step": 3840 }, { "entropy": 5.841017484664917, "epoch": 3.303394929093253, "grad_norm": 1.2578125, "learning_rate": 0.00042509259295344157, "loss": 5.4285, "mean_token_accuracy": 0.18821925073862075, "num_tokens": 6986772.0, "step": 3845 }, { "entropy": 5.910496807098388, "epoch": 3.3076923076923075, "grad_norm": 1.265625, "learning_rate": 0.00042484473360291514, "loss": 5.4393, "mean_token_accuracy": 0.19060401618480682, "num_tokens": 6993937.0, "step": 3850 }, { "entropy": 5.86693377494812, "epoch": 3.311989686291362, "grad_norm": 1.15625, "learning_rate": 0.00042459654705138294, "loss": 5.497, "mean_token_accuracy": 0.19289185404777526, "num_tokens": 7003222.0, "step": 3855 }, { "entropy": 5.9119508266448975, "epoch": 3.316287064890417, "grad_norm": 1.125, "learning_rate": 0.0004243480338407853, "loss": 5.4532, "mean_token_accuracy": 0.19899186342954636, "num_tokens": 7012055.0, "step": 3860 }, { "entropy": 5.892843008041382, "epoch": 3.3205844434894716, "grad_norm": 1.109375, "learning_rate": 0.0004240991945137755, "loss": 5.4592, "mean_token_accuracy": 0.19213219434022905, "num_tokens": 7021036.0, "step": 3865 }, { "entropy": 5.882801103591919, "epoch": 3.324881822088526, "grad_norm": 1.15625, "learning_rate": 0.00042385002961371944, "loss": 5.4441, "mean_token_accuracy": 0.19504359364509583, "num_tokens": 7030450.0, "step": 3870 }, { "entropy": 5.978818750381469, "epoch": 3.3291792006875807, "grad_norm": 1.1875, "learning_rate": 0.0004236005396846935, "loss": 5.5439, "mean_token_accuracy": 0.1879236653447151, "num_tokens": 7039740.0, "step": 3875 }, { "entropy": 5.946993112564087, "epoch": 3.333476579286635, "grad_norm": 1.15625, "learning_rate": 0.00042335072527148406, "loss": 5.5256, "mean_token_accuracy": 0.19050987511873246, "num_tokens": 7050430.0, "step": 3880 }, { "entropy": 5.85164065361023, "epoch": 3.3377739578856898, "grad_norm": 1.28125, "learning_rate": 0.0004231005869195859, "loss": 5.5069, "mean_token_accuracy": 0.18632889091968535, "num_tokens": 7059477.0, "step": 3885 }, { "entropy": 5.922405767440796, "epoch": 3.3420713364847443, "grad_norm": 1.4921875, "learning_rate": 0.0004228501251752011, "loss": 5.4601, "mean_token_accuracy": 0.1952619045972824, "num_tokens": 7067805.0, "step": 3890 }, { "entropy": 5.881201887130738, "epoch": 3.346368715083799, "grad_norm": 1.1171875, "learning_rate": 0.00042259934058523814, "loss": 5.46, "mean_token_accuracy": 0.18905477821826935, "num_tokens": 7077606.0, "step": 3895 }, { "entropy": 5.918180227279663, "epoch": 3.3506660936828534, "grad_norm": 1.171875, "learning_rate": 0.00042234823369731027, "loss": 5.4043, "mean_token_accuracy": 0.1953267216682434, "num_tokens": 7085647.0, "step": 3900 }, { "entropy": 5.835781908035278, "epoch": 3.354963472281908, "grad_norm": 1.140625, "learning_rate": 0.00042209680505973465, "loss": 5.449, "mean_token_accuracy": 0.1939581647515297, "num_tokens": 7095298.0, "step": 3905 }, { "entropy": 5.841428470611572, "epoch": 3.3592608508809625, "grad_norm": 1.0625, "learning_rate": 0.0004218450552215308, "loss": 5.5157, "mean_token_accuracy": 0.194383442401886, "num_tokens": 7105207.0, "step": 3910 }, { "entropy": 5.941448974609375, "epoch": 3.363558229480017, "grad_norm": 1.0546875, "learning_rate": 0.0004215929847324199, "loss": 5.5708, "mean_token_accuracy": 0.18115754574537277, "num_tokens": 7114833.0, "step": 3915 }, { "entropy": 5.963439035415649, "epoch": 3.3678556080790716, "grad_norm": 1.171875, "learning_rate": 0.000421340594142823, "loss": 5.3787, "mean_token_accuracy": 0.19882705360651015, "num_tokens": 7123608.0, "step": 3920 }, { "entropy": 5.8740808963775635, "epoch": 3.3721529866781266, "grad_norm": 1.21875, "learning_rate": 0.00042108788400386035, "loss": 5.4499, "mean_token_accuracy": 0.19346580952405928, "num_tokens": 7132250.0, "step": 3925 }, { "entropy": 5.880605030059814, "epoch": 3.376450365277181, "grad_norm": 0.98046875, "learning_rate": 0.0004208348548673498, "loss": 5.5399, "mean_token_accuracy": 0.19135694503784179, "num_tokens": 7142086.0, "step": 3930 }, { "entropy": 5.951083946228027, "epoch": 3.3807477438762357, "grad_norm": 1.1484375, "learning_rate": 0.000420581507285806, "loss": 5.4858, "mean_token_accuracy": 0.1825383946299553, "num_tokens": 7152434.0, "step": 3935 }, { "entropy": 5.82304835319519, "epoch": 3.38504512247529, "grad_norm": 1.0703125, "learning_rate": 0.0004203278418124386, "loss": 5.419, "mean_token_accuracy": 0.19977713227272034, "num_tokens": 7163041.0, "step": 3940 }, { "entropy": 5.829355192184448, "epoch": 3.3893425010743448, "grad_norm": 1.046875, "learning_rate": 0.0004200738590011518, "loss": 5.4173, "mean_token_accuracy": 0.19818853884935378, "num_tokens": 7171875.0, "step": 3945 }, { "entropy": 5.888874340057373, "epoch": 3.3936398796733993, "grad_norm": 1.1796875, "learning_rate": 0.00041981955940654245, "loss": 5.5242, "mean_token_accuracy": 0.1952082931995392, "num_tokens": 7180803.0, "step": 3950 }, { "entropy": 5.918650579452515, "epoch": 3.397937258272454, "grad_norm": 1.140625, "learning_rate": 0.0004195649435838992, "loss": 5.5527, "mean_token_accuracy": 0.17839486598968507, "num_tokens": 7190661.0, "step": 3955 }, { "entropy": 5.824749708175659, "epoch": 3.4022346368715084, "grad_norm": 1.109375, "learning_rate": 0.0004193100120892013, "loss": 5.3825, "mean_token_accuracy": 0.20471955984830856, "num_tokens": 7199357.0, "step": 3960 }, { "entropy": 5.87763524055481, "epoch": 3.406532015470563, "grad_norm": 1.015625, "learning_rate": 0.0004190547654791172, "loss": 5.5507, "mean_token_accuracy": 0.18516142815351486, "num_tokens": 7209856.0, "step": 3965 }, { "entropy": 5.960424184799194, "epoch": 3.4108293940696175, "grad_norm": 1.2578125, "learning_rate": 0.00041879920431100347, "loss": 5.5182, "mean_token_accuracy": 0.18072724491357803, "num_tokens": 7218778.0, "step": 3970 }, { "entropy": 5.929759693145752, "epoch": 3.415126772668672, "grad_norm": 1.1328125, "learning_rate": 0.0004185433291429036, "loss": 5.5383, "mean_token_accuracy": 0.19149455428123474, "num_tokens": 7228442.0, "step": 3975 }, { "entropy": 5.9323328018188475, "epoch": 3.4194241512677266, "grad_norm": 1.171875, "learning_rate": 0.00041828714053354665, "loss": 5.5232, "mean_token_accuracy": 0.18421948850154876, "num_tokens": 7238724.0, "step": 3980 }, { "entropy": 5.825714445114135, "epoch": 3.423721529866781, "grad_norm": 1.09375, "learning_rate": 0.0004180306390423462, "loss": 5.48, "mean_token_accuracy": 0.19667282402515412, "num_tokens": 7247844.0, "step": 3985 }, { "entropy": 5.873914575576782, "epoch": 3.4280189084658357, "grad_norm": 1.078125, "learning_rate": 0.00041777382522939884, "loss": 5.5471, "mean_token_accuracy": 0.18860624134540557, "num_tokens": 7257260.0, "step": 3990 }, { "entropy": 5.9357811450958256, "epoch": 3.4323162870648902, "grad_norm": 1.0078125, "learning_rate": 0.00041751669965548344, "loss": 5.5448, "mean_token_accuracy": 0.18522292822599412, "num_tokens": 7266890.0, "step": 3995 }, { "entropy": 5.9294140338897705, "epoch": 3.4366136656639448, "grad_norm": 1.171875, "learning_rate": 0.00041725926288205945, "loss": 5.5664, "mean_token_accuracy": 0.18018038868904113, "num_tokens": 7276114.0, "step": 4000 }, { "epoch": 3.4366136656639448, "eval_entropy": 5.70672602589066, "eval_loss": 5.984828472137451, "eval_mean_token_accuracy": 0.17211216785483532, "eval_num_tokens": 7276114.0, "eval_runtime": 2.0584, "eval_samples_per_second": 1724.118, "eval_steps_per_second": 215.697, "step": 4000 }, { "entropy": 5.962442874908447, "epoch": 3.4409110442629998, "grad_norm": 1.0859375, "learning_rate": 0.0004170015154712658, "loss": 5.5069, "mean_token_accuracy": 0.1916913628578186, "num_tokens": 7284426.0, "step": 4005 }, { "entropy": 5.903675699234009, "epoch": 3.4452084228620543, "grad_norm": 1.03125, "learning_rate": 0.00041674345798591993, "loss": 5.5421, "mean_token_accuracy": 0.18398448526859285, "num_tokens": 7294813.0, "step": 4010 }, { "entropy": 5.9148882865905765, "epoch": 3.449505801461109, "grad_norm": 1.0703125, "learning_rate": 0.0004164850909895161, "loss": 5.5258, "mean_token_accuracy": 0.19086309522390366, "num_tokens": 7304655.0, "step": 4015 }, { "entropy": 5.865190315246582, "epoch": 3.4538031800601634, "grad_norm": 0.9921875, "learning_rate": 0.0004162264150462247, "loss": 5.4786, "mean_token_accuracy": 0.19230384528636932, "num_tokens": 7313610.0, "step": 4020 }, { "entropy": 5.979327869415283, "epoch": 3.458100558659218, "grad_norm": 1.109375, "learning_rate": 0.00041596743072089065, "loss": 5.5052, "mean_token_accuracy": 0.1926753893494606, "num_tokens": 7322243.0, "step": 4025 }, { "entropy": 6.007712459564209, "epoch": 3.4623979372582725, "grad_norm": 1.21875, "learning_rate": 0.000415708138579032, "loss": 5.4902, "mean_token_accuracy": 0.18262085914611817, "num_tokens": 7331040.0, "step": 4030 }, { "entropy": 5.855608510971069, "epoch": 3.466695315857327, "grad_norm": 1.1015625, "learning_rate": 0.00041544853918683923, "loss": 5.5494, "mean_token_accuracy": 0.18672456443309784, "num_tokens": 7340771.0, "step": 4035 }, { "entropy": 5.890000200271606, "epoch": 3.4709926944563816, "grad_norm": 1.09375, "learning_rate": 0.0004151886331111737, "loss": 5.6055, "mean_token_accuracy": 0.18518777936697006, "num_tokens": 7349960.0, "step": 4040 }, { "entropy": 5.848879432678222, "epoch": 3.475290073055436, "grad_norm": 1.203125, "learning_rate": 0.00041492842091956646, "loss": 5.4391, "mean_token_accuracy": 0.19307579845190048, "num_tokens": 7357983.0, "step": 4045 }, { "entropy": 5.976620149612427, "epoch": 3.4795874516544907, "grad_norm": 1.109375, "learning_rate": 0.0004146679031802167, "loss": 5.5438, "mean_token_accuracy": 0.19301331490278245, "num_tokens": 7366814.0, "step": 4050 }, { "entropy": 5.923441648483276, "epoch": 3.4838848302535452, "grad_norm": 1.203125, "learning_rate": 0.00041440708046199123, "loss": 5.4161, "mean_token_accuracy": 0.19995936155319213, "num_tokens": 7374773.0, "step": 4055 }, { "entropy": 5.836741638183594, "epoch": 3.4881822088525998, "grad_norm": 1.0625, "learning_rate": 0.0004141459533344226, "loss": 5.5224, "mean_token_accuracy": 0.1920263037085533, "num_tokens": 7383937.0, "step": 4060 }, { "entropy": 5.9110997200012205, "epoch": 3.4924795874516543, "grad_norm": 1.0859375, "learning_rate": 0.00041388452236770795, "loss": 5.4846, "mean_token_accuracy": 0.1872958168387413, "num_tokens": 7392577.0, "step": 4065 }, { "entropy": 5.911495304107666, "epoch": 3.4967769660507093, "grad_norm": 1.1171875, "learning_rate": 0.00041362278813270823, "loss": 5.375, "mean_token_accuracy": 0.2082198366522789, "num_tokens": 7401473.0, "step": 4070 }, { "entropy": 5.924019622802734, "epoch": 3.501074344649764, "grad_norm": 1.046875, "learning_rate": 0.00041336075120094616, "loss": 5.5843, "mean_token_accuracy": 0.17396451085805892, "num_tokens": 7410831.0, "step": 4075 }, { "entropy": 5.966419315338134, "epoch": 3.5053717232488184, "grad_norm": 1.0546875, "learning_rate": 0.00041309841214460586, "loss": 5.564, "mean_token_accuracy": 0.18492306172847747, "num_tokens": 7421563.0, "step": 4080 }, { "entropy": 5.853989505767823, "epoch": 3.509669101847873, "grad_norm": 1.140625, "learning_rate": 0.0004128357715365309, "loss": 5.4952, "mean_token_accuracy": 0.1896917328238487, "num_tokens": 7430174.0, "step": 4085 }, { "entropy": 5.8695268630981445, "epoch": 3.5139664804469275, "grad_norm": 1.0859375, "learning_rate": 0.00041257282995022345, "loss": 5.4708, "mean_token_accuracy": 0.19776251018047333, "num_tokens": 7439034.0, "step": 4090 }, { "entropy": 5.86503415107727, "epoch": 3.518263859045982, "grad_norm": 1.3359375, "learning_rate": 0.0004123095879598426, "loss": 5.4624, "mean_token_accuracy": 0.18698047548532487, "num_tokens": 7447663.0, "step": 4095 }, { "entropy": 5.907058811187744, "epoch": 3.5225612376450366, "grad_norm": 1.09375, "learning_rate": 0.00041204604614020397, "loss": 5.5695, "mean_token_accuracy": 0.1786721721291542, "num_tokens": 7456615.0, "step": 4100 }, { "entropy": 5.957714653015136, "epoch": 3.526858616244091, "grad_norm": 1.09375, "learning_rate": 0.0004117822050667773, "loss": 5.5844, "mean_token_accuracy": 0.18617163747549056, "num_tokens": 7466203.0, "step": 4105 }, { "entropy": 5.950328683853149, "epoch": 3.5311559948431457, "grad_norm": 1.1171875, "learning_rate": 0.00041151806531568617, "loss": 5.5417, "mean_token_accuracy": 0.18721332550048828, "num_tokens": 7475411.0, "step": 4110 }, { "entropy": 5.8657652854919435, "epoch": 3.5354533734422002, "grad_norm": 1.0625, "learning_rate": 0.00041125362746370625, "loss": 5.5615, "mean_token_accuracy": 0.1827959179878235, "num_tokens": 7484965.0, "step": 4115 }, { "entropy": 5.972699880599976, "epoch": 3.5397507520412548, "grad_norm": 1.109375, "learning_rate": 0.0004109888920882639, "loss": 5.4911, "mean_token_accuracy": 0.1917601242661476, "num_tokens": 7494240.0, "step": 4120 }, { "entropy": 5.875241994857788, "epoch": 3.5440481306403093, "grad_norm": 1.078125, "learning_rate": 0.0004107238597674356, "loss": 5.5254, "mean_token_accuracy": 0.18883996307849885, "num_tokens": 7503560.0, "step": 4125 }, { "entropy": 5.829753732681274, "epoch": 3.548345509239364, "grad_norm": 1.0078125, "learning_rate": 0.000410458531079946, "loss": 5.4368, "mean_token_accuracy": 0.19843829721212386, "num_tokens": 7512650.0, "step": 4130 }, { "entropy": 5.936978960037232, "epoch": 3.5526428878384184, "grad_norm": 1.1640625, "learning_rate": 0.0004101929066051668, "loss": 5.5485, "mean_token_accuracy": 0.18788963854312896, "num_tokens": 7521864.0, "step": 4135 }, { "entropy": 5.8465251445770265, "epoch": 3.556940266437473, "grad_norm": 1.09375, "learning_rate": 0.0004099269869231157, "loss": 5.4556, "mean_token_accuracy": 0.19057717472314833, "num_tokens": 7531013.0, "step": 4140 }, { "entropy": 5.880660581588745, "epoch": 3.5612376450365275, "grad_norm": 1.046875, "learning_rate": 0.00040966077261445495, "loss": 5.4713, "mean_token_accuracy": 0.18294108659029007, "num_tokens": 7539959.0, "step": 4145 }, { "entropy": 5.955602645874023, "epoch": 3.565535023635582, "grad_norm": 1.2890625, "learning_rate": 0.0004093942642604904, "loss": 5.4437, "mean_token_accuracy": 0.19040033966302872, "num_tokens": 7548354.0, "step": 4150 }, { "entropy": 5.910123491287232, "epoch": 3.5698324022346366, "grad_norm": 1.0625, "learning_rate": 0.00040912746244316944, "loss": 5.5755, "mean_token_accuracy": 0.18532428592443467, "num_tokens": 7558321.0, "step": 4155 }, { "entropy": 5.8719439029693605, "epoch": 3.5741297808336916, "grad_norm": 1.0546875, "learning_rate": 0.00040886036774508095, "loss": 5.4582, "mean_token_accuracy": 0.1960417792201042, "num_tokens": 7567889.0, "step": 4160 }, { "entropy": 5.915447568893432, "epoch": 3.578427159432746, "grad_norm": 1.0859375, "learning_rate": 0.0004085929807494527, "loss": 5.5055, "mean_token_accuracy": 0.18908539414405823, "num_tokens": 7576752.0, "step": 4165 }, { "entropy": 5.8521651268005375, "epoch": 3.5827245380318007, "grad_norm": 1.0390625, "learning_rate": 0.0004083253020401512, "loss": 5.4099, "mean_token_accuracy": 0.19865455478429794, "num_tokens": 7585413.0, "step": 4170 }, { "entropy": 5.876428270339966, "epoch": 3.5870219166308552, "grad_norm": 1.21875, "learning_rate": 0.0004080573322016797, "loss": 5.3779, "mean_token_accuracy": 0.19813554286956786, "num_tokens": 7593966.0, "step": 4175 }, { "entropy": 5.881918239593506, "epoch": 3.59131929522991, "grad_norm": 1.1015625, "learning_rate": 0.0004077890718191773, "loss": 5.3964, "mean_token_accuracy": 0.19273053705692292, "num_tokens": 7602746.0, "step": 4180 }, { "entropy": 5.8412984848022464, "epoch": 3.5956166738289643, "grad_norm": 1.1953125, "learning_rate": 0.00040752052147841733, "loss": 5.4458, "mean_token_accuracy": 0.19012261629104615, "num_tokens": 7611245.0, "step": 4185 }, { "entropy": 5.8679516315460205, "epoch": 3.599914052428019, "grad_norm": 1.03125, "learning_rate": 0.0004072516817658065, "loss": 5.4644, "mean_token_accuracy": 0.19320045709609984, "num_tokens": 7620234.0, "step": 4190 }, { "entropy": 5.896762943267822, "epoch": 3.6042114310270734, "grad_norm": 1.140625, "learning_rate": 0.0004069825532683831, "loss": 5.4977, "mean_token_accuracy": 0.1906106486916542, "num_tokens": 7629794.0, "step": 4195 }, { "entropy": 5.841946125030518, "epoch": 3.608508809626128, "grad_norm": 1.0625, "learning_rate": 0.00040671313657381645, "loss": 5.4439, "mean_token_accuracy": 0.1994246259331703, "num_tokens": 7639497.0, "step": 4200 }, { "entropy": 5.814431810379029, "epoch": 3.6128061882251825, "grad_norm": 1.0859375, "learning_rate": 0.00040644343227040473, "loss": 5.3911, "mean_token_accuracy": 0.19411042034626008, "num_tokens": 7647647.0, "step": 4205 }, { "entropy": 5.854738044738769, "epoch": 3.617103566824237, "grad_norm": 1.0390625, "learning_rate": 0.0004061734409470745, "loss": 5.5712, "mean_token_accuracy": 0.1941414475440979, "num_tokens": 7657988.0, "step": 4210 }, { "entropy": 5.920310211181641, "epoch": 3.621400945423292, "grad_norm": 1.09375, "learning_rate": 0.0004059031631933788, "loss": 5.4762, "mean_token_accuracy": 0.1912338137626648, "num_tokens": 7667498.0, "step": 4215 }, { "entropy": 5.906491374969482, "epoch": 3.6256983240223466, "grad_norm": 1.0703125, "learning_rate": 0.00040563259959949615, "loss": 5.6174, "mean_token_accuracy": 0.18079061657190323, "num_tokens": 7677386.0, "step": 4220 }, { "entropy": 5.96941728591919, "epoch": 3.629995702621401, "grad_norm": 1.0703125, "learning_rate": 0.0004053617507562295, "loss": 5.4659, "mean_token_accuracy": 0.18902975618839263, "num_tokens": 7686643.0, "step": 4225 }, { "entropy": 5.910523223876953, "epoch": 3.6342930812204557, "grad_norm": 1.25, "learning_rate": 0.00040509061725500426, "loss": 5.4904, "mean_token_accuracy": 0.18905052542686462, "num_tokens": 7695089.0, "step": 4230 }, { "entropy": 5.792887115478516, "epoch": 3.6385904598195102, "grad_norm": 1.1015625, "learning_rate": 0.0004048191996878677, "loss": 5.4783, "mean_token_accuracy": 0.19112255573272705, "num_tokens": 7703854.0, "step": 4235 }, { "entropy": 5.852434682846069, "epoch": 3.642887838418565, "grad_norm": 1.1015625, "learning_rate": 0.00040454749864748734, "loss": 5.4171, "mean_token_accuracy": 0.19455593526363374, "num_tokens": 7712903.0, "step": 4240 }, { "entropy": 5.880266714096069, "epoch": 3.6471852170176193, "grad_norm": 0.9921875, "learning_rate": 0.0004042755147271496, "loss": 5.3654, "mean_token_accuracy": 0.1971506655216217, "num_tokens": 7721701.0, "step": 4245 }, { "entropy": 5.74291033744812, "epoch": 3.651482595616674, "grad_norm": 1.0, "learning_rate": 0.0004040032485207587, "loss": 5.4819, "mean_token_accuracy": 0.19151000380516053, "num_tokens": 7731318.0, "step": 4250 }, { "entropy": 5.879048299789429, "epoch": 3.6557799742157284, "grad_norm": 1.03125, "learning_rate": 0.0004037307006228352, "loss": 5.4148, "mean_token_accuracy": 0.1946114867925644, "num_tokens": 7740413.0, "step": 4255 }, { "entropy": 5.858242034912109, "epoch": 3.660077352814783, "grad_norm": 1.09375, "learning_rate": 0.0004034578716285147, "loss": 5.3943, "mean_token_accuracy": 0.1999268651008606, "num_tokens": 7749054.0, "step": 4260 }, { "entropy": 5.81954345703125, "epoch": 3.6643747314138375, "grad_norm": 1.2265625, "learning_rate": 0.0004031847621335467, "loss": 5.4168, "mean_token_accuracy": 0.19938697069883346, "num_tokens": 7757366.0, "step": 4265 }, { "entropy": 5.823007678985595, "epoch": 3.668672110012892, "grad_norm": 1.265625, "learning_rate": 0.0004029113727342933, "loss": 5.4638, "mean_token_accuracy": 0.19210753887891768, "num_tokens": 7766471.0, "step": 4270 }, { "entropy": 5.830737972259522, "epoch": 3.6729694886119466, "grad_norm": 1.125, "learning_rate": 0.00040263770402772746, "loss": 5.4476, "mean_token_accuracy": 0.19209140986204148, "num_tokens": 7775920.0, "step": 4275 }, { "entropy": 5.9193565368652346, "epoch": 3.677266867211001, "grad_norm": 1.2421875, "learning_rate": 0.0004023637566114325, "loss": 5.4879, "mean_token_accuracy": 0.19063451737165452, "num_tokens": 7784530.0, "step": 4280 }, { "entropy": 5.880253267288208, "epoch": 3.6815642458100557, "grad_norm": 1.125, "learning_rate": 0.0004020895310835999, "loss": 5.4425, "mean_token_accuracy": 0.19116731137037277, "num_tokens": 7793656.0, "step": 4285 }, { "entropy": 5.858336305618286, "epoch": 3.6858616244091102, "grad_norm": 1.046875, "learning_rate": 0.00040181502804302865, "loss": 5.4694, "mean_token_accuracy": 0.18451076596975327, "num_tokens": 7802185.0, "step": 4290 }, { "entropy": 5.848211097717285, "epoch": 3.690159003008165, "grad_norm": 1.171875, "learning_rate": 0.00040154024808912377, "loss": 5.4419, "mean_token_accuracy": 0.19738092869520188, "num_tokens": 7810345.0, "step": 4295 }, { "entropy": 5.858597469329834, "epoch": 3.6944563816072193, "grad_norm": 1.171875, "learning_rate": 0.0004012651918218947, "loss": 5.5014, "mean_token_accuracy": 0.1863890826702118, "num_tokens": 7818998.0, "step": 4300 }, { "entropy": 5.930095386505127, "epoch": 3.6987537602062743, "grad_norm": 1.078125, "learning_rate": 0.0004009898598419544, "loss": 5.6082, "mean_token_accuracy": 0.17587608397006987, "num_tokens": 7828638.0, "step": 4305 }, { "entropy": 5.931398916244507, "epoch": 3.703051138805329, "grad_norm": 1.1796875, "learning_rate": 0.000400714252750518, "loss": 5.5731, "mean_token_accuracy": 0.1824898198246956, "num_tokens": 7838812.0, "step": 4310 }, { "entropy": 5.920556163787841, "epoch": 3.7073485174043834, "grad_norm": 1.125, "learning_rate": 0.0004004383711494011, "loss": 5.4985, "mean_token_accuracy": 0.19454227834939958, "num_tokens": 7847458.0, "step": 4315 }, { "entropy": 5.90432448387146, "epoch": 3.711645896003438, "grad_norm": 1.0546875, "learning_rate": 0.0004001622156410189, "loss": 5.5137, "mean_token_accuracy": 0.1859744668006897, "num_tokens": 7856553.0, "step": 4320 }, { "entropy": 5.838715839385986, "epoch": 3.7159432746024925, "grad_norm": 1.0859375, "learning_rate": 0.00039988578682838467, "loss": 5.4568, "mean_token_accuracy": 0.19485563039779663, "num_tokens": 7864788.0, "step": 4325 }, { "entropy": 5.868589687347412, "epoch": 3.720240653201547, "grad_norm": 1.0546875, "learning_rate": 0.00039960908531510843, "loss": 5.4424, "mean_token_accuracy": 0.1948522835969925, "num_tokens": 7873850.0, "step": 4330 }, { "entropy": 5.923116254806518, "epoch": 3.7245380318006016, "grad_norm": 1.1484375, "learning_rate": 0.0003993321117053956, "loss": 5.5762, "mean_token_accuracy": 0.18173539787530898, "num_tokens": 7882775.0, "step": 4335 }, { "entropy": 5.944538116455078, "epoch": 3.728835410399656, "grad_norm": 1.140625, "learning_rate": 0.00039905486660404604, "loss": 5.4952, "mean_token_accuracy": 0.18034075796604157, "num_tokens": 7890570.0, "step": 4340 }, { "entropy": 5.816492652893066, "epoch": 3.7331327889987107, "grad_norm": 1.015625, "learning_rate": 0.00039877735061645206, "loss": 5.4642, "mean_token_accuracy": 0.20065016895532609, "num_tokens": 7900090.0, "step": 4345 }, { "entropy": 5.899166393280029, "epoch": 3.7374301675977653, "grad_norm": 1.1875, "learning_rate": 0.0003984995643485977, "loss": 5.5033, "mean_token_accuracy": 0.18698994666337967, "num_tokens": 7908077.0, "step": 4350 }, { "entropy": 5.927290630340576, "epoch": 3.74172754619682, "grad_norm": 1.4609375, "learning_rate": 0.00039822150840705716, "loss": 5.507, "mean_token_accuracy": 0.190550497174263, "num_tokens": 7916290.0, "step": 4355 }, { "entropy": 5.962394952774048, "epoch": 3.746024924795875, "grad_norm": 1.15625, "learning_rate": 0.00039794318339899347, "loss": 5.5781, "mean_token_accuracy": 0.1866394594311714, "num_tokens": 7925835.0, "step": 4360 }, { "entropy": 5.942964267730713, "epoch": 3.7503223033949293, "grad_norm": 1.0625, "learning_rate": 0.00039766458993215726, "loss": 5.5424, "mean_token_accuracy": 0.18485690355300904, "num_tokens": 7935076.0, "step": 4365 }, { "entropy": 5.8386486053466795, "epoch": 3.754619681993984, "grad_norm": 1.0703125, "learning_rate": 0.00039738572861488527, "loss": 5.4601, "mean_token_accuracy": 0.1967233255505562, "num_tokens": 7943958.0, "step": 4370 }, { "entropy": 5.8420368194580075, "epoch": 3.7589170605930384, "grad_norm": 1.0390625, "learning_rate": 0.000397106600056099, "loss": 5.4924, "mean_token_accuracy": 0.1886660173535347, "num_tokens": 7953189.0, "step": 4375 }, { "entropy": 5.824497079849243, "epoch": 3.763214439192093, "grad_norm": 0.99609375, "learning_rate": 0.0003968272048653039, "loss": 5.3941, "mean_token_accuracy": 0.20391580015420913, "num_tokens": 7962927.0, "step": 4380 }, { "entropy": 5.778334474563598, "epoch": 3.7675118177911475, "grad_norm": 1.0390625, "learning_rate": 0.0003965475436525873, "loss": 5.4282, "mean_token_accuracy": 0.19804799407720566, "num_tokens": 7973087.0, "step": 4385 }, { "entropy": 5.8292381286621096, "epoch": 3.771809196390202, "grad_norm": 1.0234375, "learning_rate": 0.0003962676170286174, "loss": 5.4011, "mean_token_accuracy": 0.19533102959394455, "num_tokens": 7982535.0, "step": 4390 }, { "entropy": 5.918207216262817, "epoch": 3.7761065749892566, "grad_norm": 1.171875, "learning_rate": 0.00039598742560464223, "loss": 5.465, "mean_token_accuracy": 0.19849387407302857, "num_tokens": 7990740.0, "step": 4395 }, { "entropy": 5.974913454055786, "epoch": 3.780403953588311, "grad_norm": 1.2421875, "learning_rate": 0.0003957069699924877, "loss": 5.4715, "mean_token_accuracy": 0.18468343317508698, "num_tokens": 7999349.0, "step": 4400 }, { "entropy": 5.8590610980987545, "epoch": 3.7847013321873657, "grad_norm": 1.1015625, "learning_rate": 0.000395426250804557, "loss": 5.4678, "mean_token_accuracy": 0.20066145956516265, "num_tokens": 8007615.0, "step": 4405 }, { "entropy": 5.851322650909424, "epoch": 3.7889987107864203, "grad_norm": 1.0390625, "learning_rate": 0.00039514526865382847, "loss": 5.4643, "mean_token_accuracy": 0.19499933123588561, "num_tokens": 8017545.0, "step": 4410 }, { "entropy": 5.874475526809692, "epoch": 3.793296089385475, "grad_norm": 1.1875, "learning_rate": 0.0003948640241538548, "loss": 5.3969, "mean_token_accuracy": 0.1957339271903038, "num_tokens": 8026381.0, "step": 4415 }, { "entropy": 5.874440097808838, "epoch": 3.7975934679845293, "grad_norm": 1.390625, "learning_rate": 0.0003945825179187617, "loss": 5.5045, "mean_token_accuracy": 0.18969615548849106, "num_tokens": 8034745.0, "step": 4420 }, { "entropy": 5.89657678604126, "epoch": 3.801890846583584, "grad_norm": 1.234375, "learning_rate": 0.00039430075056324604, "loss": 5.4536, "mean_token_accuracy": 0.1952233463525772, "num_tokens": 8043995.0, "step": 4425 }, { "entropy": 5.882454442977905, "epoch": 3.8061882251826384, "grad_norm": 1.140625, "learning_rate": 0.00039401872270257546, "loss": 5.5335, "mean_token_accuracy": 0.18912735730409622, "num_tokens": 8053059.0, "step": 4430 }, { "entropy": 5.857541561126709, "epoch": 3.810485603781693, "grad_norm": 1.078125, "learning_rate": 0.00039373643495258567, "loss": 5.5521, "mean_token_accuracy": 0.1929113283753395, "num_tokens": 8062160.0, "step": 4435 }, { "entropy": 5.836390352249145, "epoch": 3.8147829823807475, "grad_norm": 1.1484375, "learning_rate": 0.00039345388792968056, "loss": 5.4535, "mean_token_accuracy": 0.19928884506225586, "num_tokens": 8071260.0, "step": 4440 }, { "entropy": 5.92797999382019, "epoch": 3.819080360979802, "grad_norm": 1.2578125, "learning_rate": 0.00039317108225082984, "loss": 5.584, "mean_token_accuracy": 0.17884425669908524, "num_tokens": 8081540.0, "step": 4445 }, { "entropy": 5.852789735794067, "epoch": 3.8233777395788566, "grad_norm": 1.1796875, "learning_rate": 0.00039288801853356806, "loss": 5.5403, "mean_token_accuracy": 0.19031485319137573, "num_tokens": 8089785.0, "step": 4450 }, { "entropy": 5.903105306625366, "epoch": 3.8276751181779116, "grad_norm": 1.203125, "learning_rate": 0.0003926046973959932, "loss": 5.3854, "mean_token_accuracy": 0.2016897514462471, "num_tokens": 8098097.0, "step": 4455 }, { "entropy": 5.839665746688842, "epoch": 3.831972496776966, "grad_norm": 1.0234375, "learning_rate": 0.0003923211194567654, "loss": 5.6069, "mean_token_accuracy": 0.18874900341033934, "num_tokens": 8108693.0, "step": 4460 }, { "entropy": 5.916633415222168, "epoch": 3.8362698753760207, "grad_norm": 1.1484375, "learning_rate": 0.00039203728533510556, "loss": 5.4581, "mean_token_accuracy": 0.19540373086929322, "num_tokens": 8117181.0, "step": 4465 }, { "entropy": 5.865298652648926, "epoch": 3.8405672539750753, "grad_norm": 1.1640625, "learning_rate": 0.000391753195650794, "loss": 5.4702, "mean_token_accuracy": 0.18679834604263307, "num_tokens": 8125398.0, "step": 4470 }, { "entropy": 5.871571397781372, "epoch": 3.84486463257413, "grad_norm": 1.0859375, "learning_rate": 0.00039146885102416895, "loss": 5.4902, "mean_token_accuracy": 0.18905259370803834, "num_tokens": 8135320.0, "step": 4475 }, { "entropy": 5.89444932937622, "epoch": 3.8491620111731844, "grad_norm": 1.1484375, "learning_rate": 0.00039118425207612553, "loss": 5.568, "mean_token_accuracy": 0.18921478390693663, "num_tokens": 8144320.0, "step": 4480 }, { "entropy": 5.779955959320068, "epoch": 3.853459389772239, "grad_norm": 1.1640625, "learning_rate": 0.00039089939942811396, "loss": 5.4373, "mean_token_accuracy": 0.1988345354795456, "num_tokens": 8153653.0, "step": 4485 }, { "entropy": 5.915558528900147, "epoch": 3.8577567683712934, "grad_norm": 1.0703125, "learning_rate": 0.00039061429370213863, "loss": 5.4759, "mean_token_accuracy": 0.1880367398262024, "num_tokens": 8162741.0, "step": 4490 }, { "entropy": 5.789914560317993, "epoch": 3.862054146970348, "grad_norm": 1.140625, "learning_rate": 0.00039032893552075646, "loss": 5.3866, "mean_token_accuracy": 0.20460292547941208, "num_tokens": 8171078.0, "step": 4495 }, { "entropy": 5.812246417999267, "epoch": 3.8663515255694025, "grad_norm": 1.21875, "learning_rate": 0.0003900433255070758, "loss": 5.4441, "mean_token_accuracy": 0.19726246744394302, "num_tokens": 8179968.0, "step": 4500 }, { "epoch": 3.8663515255694025, "eval_entropy": 5.669238594201234, "eval_loss": 5.937331676483154, "eval_mean_token_accuracy": 0.17546728171139686, "eval_num_tokens": 8179968.0, "eval_runtime": 2.0525, "eval_samples_per_second": 1729.081, "eval_steps_per_second": 216.318, "step": 4500 }, { "entropy": 5.868862056732178, "epoch": 3.870648904168457, "grad_norm": 1.109375, "learning_rate": 0.00038975746428475454, "loss": 5.4345, "mean_token_accuracy": 0.19067300260066986, "num_tokens": 8189261.0, "step": 4505 }, { "entropy": 5.934845113754273, "epoch": 3.874946282767512, "grad_norm": 1.1484375, "learning_rate": 0.00038947135247799955, "loss": 5.4365, "mean_token_accuracy": 0.1983747035264969, "num_tokens": 8198302.0, "step": 4510 }, { "entropy": 5.853604698181153, "epoch": 3.8792436613665666, "grad_norm": 1.0703125, "learning_rate": 0.00038918499071156443, "loss": 5.4386, "mean_token_accuracy": 0.19632905274629592, "num_tokens": 8207098.0, "step": 4515 }, { "entropy": 5.838044023513794, "epoch": 3.883541039965621, "grad_norm": 1.2265625, "learning_rate": 0.000388898379610749, "loss": 5.4741, "mean_token_accuracy": 0.18905991911888123, "num_tokens": 8216831.0, "step": 4520 }, { "entropy": 5.869770908355713, "epoch": 3.8878384185646757, "grad_norm": 1.15625, "learning_rate": 0.0003886115198013973, "loss": 5.4715, "mean_token_accuracy": 0.19619868695735931, "num_tokens": 8225369.0, "step": 4525 }, { "entropy": 5.923373460769653, "epoch": 3.8921357971637303, "grad_norm": 1.1875, "learning_rate": 0.0003883244119098965, "loss": 5.6014, "mean_token_accuracy": 0.1788467511534691, "num_tokens": 8234440.0, "step": 4530 }, { "entropy": 5.8807614803314205, "epoch": 3.896433175762785, "grad_norm": 1.09375, "learning_rate": 0.0003880370565631754, "loss": 5.4068, "mean_token_accuracy": 0.1984736517071724, "num_tokens": 8243707.0, "step": 4535 }, { "entropy": 5.894764137268067, "epoch": 3.9007305543618394, "grad_norm": 1.09375, "learning_rate": 0.00038774945438870337, "loss": 5.5811, "mean_token_accuracy": 0.18669764697551727, "num_tokens": 8254223.0, "step": 4540 }, { "entropy": 5.834207344055176, "epoch": 3.905027932960894, "grad_norm": 1.1640625, "learning_rate": 0.00038746160601448845, "loss": 5.429, "mean_token_accuracy": 0.1912005826830864, "num_tokens": 8263105.0, "step": 4545 }, { "entropy": 5.793808317184448, "epoch": 3.9093253115599484, "grad_norm": 1.046875, "learning_rate": 0.0003871735120690766, "loss": 5.4899, "mean_token_accuracy": 0.19511680603027343, "num_tokens": 8271478.0, "step": 4550 }, { "entropy": 5.896292304992675, "epoch": 3.913622690159003, "grad_norm": 1.1171875, "learning_rate": 0.0003868851731815497, "loss": 5.522, "mean_token_accuracy": 0.18419113904237747, "num_tokens": 8280396.0, "step": 4555 }, { "entropy": 5.898507070541382, "epoch": 3.9179200687580575, "grad_norm": 1.1796875, "learning_rate": 0.0003865965899815247, "loss": 5.522, "mean_token_accuracy": 0.18741509020328523, "num_tokens": 8290371.0, "step": 4560 }, { "entropy": 5.856956624984742, "epoch": 3.922217447357112, "grad_norm": 1.1328125, "learning_rate": 0.0003863077630991518, "loss": 5.416, "mean_token_accuracy": 0.2038359597325325, "num_tokens": 8298976.0, "step": 4565 }, { "entropy": 5.8094282150268555, "epoch": 3.9265148259561666, "grad_norm": 1.0859375, "learning_rate": 0.0003860186931651139, "loss": 5.4775, "mean_token_accuracy": 0.19014360755681992, "num_tokens": 8308752.0, "step": 4570 }, { "entropy": 5.868230819702148, "epoch": 3.930812204555221, "grad_norm": 1.09375, "learning_rate": 0.0003857293808106238, "loss": 5.5311, "mean_token_accuracy": 0.1895223081111908, "num_tokens": 8317343.0, "step": 4575 }, { "entropy": 5.894303274154663, "epoch": 3.9351095831542757, "grad_norm": 1.0546875, "learning_rate": 0.0003854398266674241, "loss": 5.382, "mean_token_accuracy": 0.1990468829870224, "num_tokens": 8326956.0, "step": 4580 }, { "entropy": 5.8280493259429935, "epoch": 3.9394069617533303, "grad_norm": 1.1875, "learning_rate": 0.00038515003136778544, "loss": 5.497, "mean_token_accuracy": 0.1899331420660019, "num_tokens": 8335589.0, "step": 4585 }, { "entropy": 5.8244860649108885, "epoch": 3.943704340352385, "grad_norm": 1.125, "learning_rate": 0.00038485999554450483, "loss": 5.4746, "mean_token_accuracy": 0.18865241706371308, "num_tokens": 8345517.0, "step": 4590 }, { "entropy": 5.774182653427124, "epoch": 3.9480017189514394, "grad_norm": 1.1875, "learning_rate": 0.00038456971983090454, "loss": 5.3962, "mean_token_accuracy": 0.2035086452960968, "num_tokens": 8354702.0, "step": 4595 }, { "entropy": 5.860455703735352, "epoch": 3.9522990975504944, "grad_norm": 1.1171875, "learning_rate": 0.0003842792048608309, "loss": 5.4462, "mean_token_accuracy": 0.19748363494873047, "num_tokens": 8362940.0, "step": 4600 }, { "entropy": 5.890370893478393, "epoch": 3.956596476149549, "grad_norm": 1.0234375, "learning_rate": 0.0003839884512686523, "loss": 5.4862, "mean_token_accuracy": 0.1941131427884102, "num_tokens": 8372034.0, "step": 4605 }, { "entropy": 5.86606798171997, "epoch": 3.9608938547486034, "grad_norm": 1.1171875, "learning_rate": 0.00038369745968925846, "loss": 5.5098, "mean_token_accuracy": 0.18545786291360855, "num_tokens": 8381673.0, "step": 4610 }, { "entropy": 5.86917929649353, "epoch": 3.965191233347658, "grad_norm": 1.03125, "learning_rate": 0.00038340623075805875, "loss": 5.4495, "mean_token_accuracy": 0.1948995217680931, "num_tokens": 8390804.0, "step": 4615 }, { "entropy": 5.923179006576538, "epoch": 3.9694886119467125, "grad_norm": 1.1796875, "learning_rate": 0.00038311476511098053, "loss": 5.4934, "mean_token_accuracy": 0.19242242276668547, "num_tokens": 8399644.0, "step": 4620 }, { "entropy": 5.829574918746948, "epoch": 3.973785990545767, "grad_norm": 1.1640625, "learning_rate": 0.0003828230633844685, "loss": 5.5087, "mean_token_accuracy": 0.1927846297621727, "num_tokens": 8409264.0, "step": 4625 }, { "entropy": 5.8476622104644775, "epoch": 3.9780833691448216, "grad_norm": 1.2109375, "learning_rate": 0.00038253112621548243, "loss": 5.4526, "mean_token_accuracy": 0.18896006494760514, "num_tokens": 8418383.0, "step": 4630 }, { "entropy": 5.907848215103149, "epoch": 3.982380747743876, "grad_norm": 1.0859375, "learning_rate": 0.0003822389542414966, "loss": 5.4784, "mean_token_accuracy": 0.1900715708732605, "num_tokens": 8427411.0, "step": 4635 }, { "entropy": 5.852678823471069, "epoch": 3.9866781263429307, "grad_norm": 1.125, "learning_rate": 0.00038194654810049775, "loss": 5.4311, "mean_token_accuracy": 0.1907512903213501, "num_tokens": 8435537.0, "step": 4640 }, { "entropy": 5.844833660125732, "epoch": 3.9909755049419853, "grad_norm": 1.0703125, "learning_rate": 0.000381653908430984, "loss": 5.5191, "mean_token_accuracy": 0.18742516934871672, "num_tokens": 8444400.0, "step": 4645 }, { "entropy": 5.909849452972412, "epoch": 3.99527288354104, "grad_norm": 1.0703125, "learning_rate": 0.0003813610358719634, "loss": 5.4883, "mean_token_accuracy": 0.18923701345920563, "num_tokens": 8453830.0, "step": 4650 }, { "entropy": 5.875612211227417, "epoch": 3.999570262140095, "grad_norm": 1.0546875, "learning_rate": 0.00038106793106295266, "loss": 5.4533, "mean_token_accuracy": 0.19856019467115402, "num_tokens": 8463033.0, "step": 4655 }, { "entropy": 5.82416311899821, "epoch": 4.003437902879243, "grad_norm": 1.0234375, "learning_rate": 0.0003807745946439754, "loss": 5.238, "mean_token_accuracy": 0.2109938876496421, "num_tokens": 8470740.0, "step": 4660 }, { "entropy": 5.827999782562256, "epoch": 4.007735281478298, "grad_norm": 1.015625, "learning_rate": 0.0003804810272555612, "loss": 5.2188, "mean_token_accuracy": 0.20497858077287673, "num_tokens": 8480480.0, "step": 4665 }, { "entropy": 5.813249492645264, "epoch": 4.012032660077352, "grad_norm": 1.1640625, "learning_rate": 0.0003801872295387439, "loss": 5.1719, "mean_token_accuracy": 0.2139571100473404, "num_tokens": 8489047.0, "step": 4670 }, { "entropy": 5.870213556289673, "epoch": 4.016330038676408, "grad_norm": 1.1015625, "learning_rate": 0.0003798932021350603, "loss": 5.2304, "mean_token_accuracy": 0.2088460475206375, "num_tokens": 8497763.0, "step": 4675 }, { "entropy": 5.844989204406739, "epoch": 4.020627417275462, "grad_norm": 1.109375, "learning_rate": 0.00037959894568654864, "loss": 5.2276, "mean_token_accuracy": 0.2120213195681572, "num_tokens": 8506814.0, "step": 4680 }, { "entropy": 5.945601987838745, "epoch": 4.024924795874517, "grad_norm": 1.1171875, "learning_rate": 0.0003793044608357474, "loss": 5.3337, "mean_token_accuracy": 0.19796358346939086, "num_tokens": 8516384.0, "step": 4685 }, { "entropy": 5.8777913570404055, "epoch": 4.0292221744735714, "grad_norm": 1.1796875, "learning_rate": 0.0003790097482256939, "loss": 5.1704, "mean_token_accuracy": 0.20637003779411317, "num_tokens": 8524822.0, "step": 4690 }, { "entropy": 5.831185436248779, "epoch": 4.033519553072626, "grad_norm": 1.03125, "learning_rate": 0.0003787148084999225, "loss": 5.2003, "mean_token_accuracy": 0.21544599533081055, "num_tokens": 8534129.0, "step": 4695 }, { "entropy": 5.817060756683349, "epoch": 4.0378169316716805, "grad_norm": 1.1875, "learning_rate": 0.00037841964230246394, "loss": 5.2683, "mean_token_accuracy": 0.20154480040073394, "num_tokens": 8543235.0, "step": 4700 }, { "entropy": 5.798751497268677, "epoch": 4.042114310270735, "grad_norm": 1.1796875, "learning_rate": 0.0003781242502778429, "loss": 5.1701, "mean_token_accuracy": 0.22188325822353364, "num_tokens": 8551903.0, "step": 4705 }, { "entropy": 5.810360908508301, "epoch": 4.04641168886979, "grad_norm": 1.234375, "learning_rate": 0.00037782863307107785, "loss": 5.2542, "mean_token_accuracy": 0.2083105593919754, "num_tokens": 8561173.0, "step": 4710 }, { "entropy": 5.864426326751709, "epoch": 4.050709067468844, "grad_norm": 1.15625, "learning_rate": 0.00037753279132767833, "loss": 5.1615, "mean_token_accuracy": 0.22426997274160385, "num_tokens": 8569789.0, "step": 4715 }, { "entropy": 5.76513557434082, "epoch": 4.055006446067899, "grad_norm": 1.265625, "learning_rate": 0.00037723672569364453, "loss": 5.1543, "mean_token_accuracy": 0.21237667500972748, "num_tokens": 8577971.0, "step": 4720 }, { "entropy": 5.831951045989991, "epoch": 4.059303824666953, "grad_norm": 1.125, "learning_rate": 0.00037694043681546545, "loss": 5.2463, "mean_token_accuracy": 0.20461665093898773, "num_tokens": 8587299.0, "step": 4725 }, { "entropy": 5.790086889266968, "epoch": 4.063601203266008, "grad_norm": 1.078125, "learning_rate": 0.0003766439253401177, "loss": 5.1975, "mean_token_accuracy": 0.20828050523996353, "num_tokens": 8595813.0, "step": 4730 }, { "entropy": 5.796638345718383, "epoch": 4.067898581865062, "grad_norm": 1.1875, "learning_rate": 0.00037634719191506367, "loss": 5.2105, "mean_token_accuracy": 0.2160770669579506, "num_tokens": 8604552.0, "step": 4735 }, { "entropy": 5.735085487365723, "epoch": 4.072195960464117, "grad_norm": 1.328125, "learning_rate": 0.00037605023718825065, "loss": 5.1459, "mean_token_accuracy": 0.21400606483221055, "num_tokens": 8612701.0, "step": 4740 }, { "entropy": 5.806445741653443, "epoch": 4.0764933390631715, "grad_norm": 1.0234375, "learning_rate": 0.000375753061808109, "loss": 5.2262, "mean_token_accuracy": 0.20582615286111833, "num_tokens": 8622699.0, "step": 4745 }, { "entropy": 5.826416683197022, "epoch": 4.080790717662226, "grad_norm": 1.109375, "learning_rate": 0.00037545566642355107, "loss": 5.1988, "mean_token_accuracy": 0.20449893027544022, "num_tokens": 8631821.0, "step": 4750 }, { "entropy": 5.820923519134522, "epoch": 4.0850880962612806, "grad_norm": 1.078125, "learning_rate": 0.0003751580516839695, "loss": 5.1632, "mean_token_accuracy": 0.21159304827451705, "num_tokens": 8641814.0, "step": 4755 }, { "entropy": 5.8700745582580565, "epoch": 4.089385474860335, "grad_norm": 1.1796875, "learning_rate": 0.00037486021823923574, "loss": 5.2402, "mean_token_accuracy": 0.2071731209754944, "num_tokens": 8649649.0, "step": 4760 }, { "entropy": 5.737698268890381, "epoch": 4.09368285345939, "grad_norm": 1.15625, "learning_rate": 0.00037456216673969925, "loss": 5.1651, "mean_token_accuracy": 0.21470018476247787, "num_tokens": 8658216.0, "step": 4765 }, { "entropy": 5.819456481933594, "epoch": 4.097980232058444, "grad_norm": 1.015625, "learning_rate": 0.0003742638978361851, "loss": 5.263, "mean_token_accuracy": 0.20773692131042482, "num_tokens": 8667725.0, "step": 4770 }, { "entropy": 5.748415946960449, "epoch": 4.102277610657499, "grad_norm": 1.1796875, "learning_rate": 0.00037396541217999367, "loss": 5.1166, "mean_token_accuracy": 0.21343214809894562, "num_tokens": 8675739.0, "step": 4775 }, { "entropy": 5.802015209197998, "epoch": 4.106574989256553, "grad_norm": 1.140625, "learning_rate": 0.0003736667104228981, "loss": 5.1982, "mean_token_accuracy": 0.21212756633758545, "num_tokens": 8685764.0, "step": 4780 }, { "entropy": 5.813879203796387, "epoch": 4.110872367855608, "grad_norm": 1.3515625, "learning_rate": 0.00037336779321714376, "loss": 5.1685, "mean_token_accuracy": 0.2184282124042511, "num_tokens": 8695476.0, "step": 4785 }, { "entropy": 5.770734739303589, "epoch": 4.115169746454662, "grad_norm": 1.1796875, "learning_rate": 0.00037306866121544633, "loss": 5.2349, "mean_token_accuracy": 0.2108105957508087, "num_tokens": 8705544.0, "step": 4790 }, { "entropy": 5.867660713195801, "epoch": 4.119467125053717, "grad_norm": 1.15625, "learning_rate": 0.0003727693150709904, "loss": 5.229, "mean_token_accuracy": 0.21013996750116348, "num_tokens": 8714883.0, "step": 4795 }, { "entropy": 5.878910875320434, "epoch": 4.1237645036527715, "grad_norm": 1.109375, "learning_rate": 0.00037246975543742843, "loss": 5.2647, "mean_token_accuracy": 0.20463906675577165, "num_tokens": 8724589.0, "step": 4800 }, { "entropy": 5.705543088912964, "epoch": 4.128061882251826, "grad_norm": 1.125, "learning_rate": 0.000372169982968879, "loss": 5.1432, "mean_token_accuracy": 0.2145526185631752, "num_tokens": 8733771.0, "step": 4805 }, { "entropy": 5.80822606086731, "epoch": 4.132359260850881, "grad_norm": 1.2578125, "learning_rate": 0.0003718699983199252, "loss": 5.23, "mean_token_accuracy": 0.21176477819681166, "num_tokens": 8742348.0, "step": 4810 }, { "entropy": 5.754483032226562, "epoch": 4.136656639449935, "grad_norm": 1.1171875, "learning_rate": 0.0003715698021456137, "loss": 5.1666, "mean_token_accuracy": 0.21949181854724883, "num_tokens": 8751357.0, "step": 4815 }, { "entropy": 5.754259347915649, "epoch": 4.1409540180489905, "grad_norm": 1.09375, "learning_rate": 0.00037126939510145294, "loss": 5.2289, "mean_token_accuracy": 0.21563220173120498, "num_tokens": 8760813.0, "step": 4820 }, { "entropy": 5.887066793441773, "epoch": 4.145251396648045, "grad_norm": 1.2265625, "learning_rate": 0.0003709687778434118, "loss": 5.267, "mean_token_accuracy": 0.2031414046883583, "num_tokens": 8770228.0, "step": 4825 }, { "entropy": 5.764794158935547, "epoch": 4.1495487752471, "grad_norm": 1.28125, "learning_rate": 0.0003706679510279183, "loss": 5.104, "mean_token_accuracy": 0.21419611275196077, "num_tokens": 8779351.0, "step": 4830 }, { "entropy": 5.787221384048462, "epoch": 4.153846153846154, "grad_norm": 1.21875, "learning_rate": 0.0003703669153118578, "loss": 5.265, "mean_token_accuracy": 0.2048552840948105, "num_tokens": 8789116.0, "step": 4835 }, { "entropy": 5.73529543876648, "epoch": 4.158143532445209, "grad_norm": 1.15625, "learning_rate": 0.00037006567135257216, "loss": 5.2226, "mean_token_accuracy": 0.2051789328455925, "num_tokens": 8797790.0, "step": 4840 }, { "entropy": 5.840820932388306, "epoch": 4.162440911044263, "grad_norm": 1.09375, "learning_rate": 0.00036976421980785764, "loss": 5.2712, "mean_token_accuracy": 0.19828639030456544, "num_tokens": 8808067.0, "step": 4845 }, { "entropy": 5.780494928359985, "epoch": 4.166738289643318, "grad_norm": 1.1875, "learning_rate": 0.0003694625613359641, "loss": 5.1702, "mean_token_accuracy": 0.2194593980908394, "num_tokens": 8816587.0, "step": 4850 }, { "entropy": 5.8219578742980955, "epoch": 4.171035668242372, "grad_norm": 1.21875, "learning_rate": 0.0003691606965955929, "loss": 5.237, "mean_token_accuracy": 0.20742220878601075, "num_tokens": 8826045.0, "step": 4855 }, { "entropy": 5.729091739654541, "epoch": 4.175333046841427, "grad_norm": 1.09375, "learning_rate": 0.000368858626245896, "loss": 5.2299, "mean_token_accuracy": 0.21220277696847917, "num_tokens": 8835427.0, "step": 4860 }, { "entropy": 5.753469085693359, "epoch": 4.1796304254404815, "grad_norm": 0.984375, "learning_rate": 0.0003685563509464744, "loss": 5.1641, "mean_token_accuracy": 0.21428546756505967, "num_tokens": 8845167.0, "step": 4865 }, { "entropy": 5.825128555297852, "epoch": 4.183927804039536, "grad_norm": 1.25, "learning_rate": 0.00036825387135737647, "loss": 5.1645, "mean_token_accuracy": 0.21480036079883574, "num_tokens": 8853591.0, "step": 4870 }, { "entropy": 5.752241849899292, "epoch": 4.188225182638591, "grad_norm": 1.21875, "learning_rate": 0.00036795118813909674, "loss": 5.2826, "mean_token_accuracy": 0.2008580043911934, "num_tokens": 8863647.0, "step": 4875 }, { "entropy": 5.848931741714478, "epoch": 4.192522561237645, "grad_norm": 1.1796875, "learning_rate": 0.00036764830195257437, "loss": 5.2041, "mean_token_accuracy": 0.20963532328605652, "num_tokens": 8872911.0, "step": 4880 }, { "entropy": 5.850318908691406, "epoch": 4.1968199398367, "grad_norm": 1.2578125, "learning_rate": 0.0003673452134591918, "loss": 5.2565, "mean_token_accuracy": 0.20480175465345382, "num_tokens": 8881001.0, "step": 4885 }, { "entropy": 5.747745132446289, "epoch": 4.201117318435754, "grad_norm": 1.1953125, "learning_rate": 0.000367041923320773, "loss": 5.1625, "mean_token_accuracy": 0.2151420921087265, "num_tokens": 8890323.0, "step": 4890 }, { "entropy": 5.7503491878509525, "epoch": 4.205414697034809, "grad_norm": 1.15625, "learning_rate": 0.00036673843219958257, "loss": 5.1986, "mean_token_accuracy": 0.21217773407697677, "num_tokens": 8900471.0, "step": 4895 }, { "entropy": 5.833252668380737, "epoch": 4.209712075633863, "grad_norm": 1.2109375, "learning_rate": 0.0003664347407583238, "loss": 5.2539, "mean_token_accuracy": 0.20236335545778275, "num_tokens": 8909320.0, "step": 4900 }, { "entropy": 5.810373783111572, "epoch": 4.214009454232918, "grad_norm": 1.234375, "learning_rate": 0.0003661308496601373, "loss": 5.1684, "mean_token_accuracy": 0.21240386217832566, "num_tokens": 8917453.0, "step": 4905 }, { "entropy": 5.728451442718506, "epoch": 4.218306832831972, "grad_norm": 1.3125, "learning_rate": 0.00036582675956859983, "loss": 5.2383, "mean_token_accuracy": 0.21226001232862474, "num_tokens": 8925737.0, "step": 4910 }, { "entropy": 5.681561899185181, "epoch": 4.222604211431027, "grad_norm": 1.21875, "learning_rate": 0.00036552247114772263, "loss": 5.1789, "mean_token_accuracy": 0.21163196116685867, "num_tokens": 8935475.0, "step": 4915 }, { "entropy": 5.826884984970093, "epoch": 4.2269015900300815, "grad_norm": 1.0859375, "learning_rate": 0.00036521798506194996, "loss": 5.2031, "mean_token_accuracy": 0.21508658528327942, "num_tokens": 8944683.0, "step": 4920 }, { "entropy": 5.844992017745971, "epoch": 4.231198968629136, "grad_norm": 1.28125, "learning_rate": 0.00036491330197615775, "loss": 5.2401, "mean_token_accuracy": 0.1984725683927536, "num_tokens": 8953837.0, "step": 4925 }, { "entropy": 5.795260763168335, "epoch": 4.235496347228191, "grad_norm": 1.015625, "learning_rate": 0.00036460842255565197, "loss": 5.2776, "mean_token_accuracy": 0.20311659723520278, "num_tokens": 8964822.0, "step": 4930 }, { "entropy": 5.831965970993042, "epoch": 4.239793725827245, "grad_norm": 1.3671875, "learning_rate": 0.0003643033474661676, "loss": 5.253, "mean_token_accuracy": 0.2069919228553772, "num_tokens": 8974363.0, "step": 4935 }, { "entropy": 5.818027830123901, "epoch": 4.2440911044263, "grad_norm": 1.25, "learning_rate": 0.00036399807737386657, "loss": 5.1565, "mean_token_accuracy": 0.2174941658973694, "num_tokens": 8983122.0, "step": 4940 }, { "entropy": 5.833076524734497, "epoch": 4.248388483025354, "grad_norm": 1.2734375, "learning_rate": 0.0003636926129453368, "loss": 5.2785, "mean_token_accuracy": 0.2052626445889473, "num_tokens": 8991618.0, "step": 4945 }, { "entropy": 5.812604331970215, "epoch": 4.252685861624409, "grad_norm": 1.203125, "learning_rate": 0.0003633869548475904, "loss": 5.189, "mean_token_accuracy": 0.2113402158021927, "num_tokens": 9000128.0, "step": 4950 }, { "entropy": 5.738666582107544, "epoch": 4.256983240223463, "grad_norm": 1.1640625, "learning_rate": 0.0003630811037480627, "loss": 5.1852, "mean_token_accuracy": 0.20911359339952468, "num_tokens": 9008951.0, "step": 4955 }, { "entropy": 5.798120307922363, "epoch": 4.261280618822518, "grad_norm": 1.0859375, "learning_rate": 0.0003627750603146101, "loss": 5.2361, "mean_token_accuracy": 0.2046567305922508, "num_tokens": 9018949.0, "step": 4960 }, { "entropy": 5.852856063842774, "epoch": 4.265577997421573, "grad_norm": 1.03125, "learning_rate": 0.0003624688252155091, "loss": 5.2348, "mean_token_accuracy": 0.20631033331155776, "num_tokens": 9028910.0, "step": 4965 }, { "entropy": 5.752488708496093, "epoch": 4.269875376020628, "grad_norm": 1.15625, "learning_rate": 0.0003621623991194549, "loss": 5.2837, "mean_token_accuracy": 0.20204642564058303, "num_tokens": 9039012.0, "step": 4970 }, { "entropy": 5.821020841598511, "epoch": 4.274172754619682, "grad_norm": 1.1015625, "learning_rate": 0.0003618557826955594, "loss": 5.2515, "mean_token_accuracy": 0.20976072698831558, "num_tokens": 9048639.0, "step": 4975 }, { "entropy": 5.811152553558349, "epoch": 4.278470133218737, "grad_norm": 1.1328125, "learning_rate": 0.00036154897661335063, "loss": 5.209, "mean_token_accuracy": 0.21008352786302567, "num_tokens": 9057453.0, "step": 4980 }, { "entropy": 5.742544078826905, "epoch": 4.2827675118177915, "grad_norm": 1.1875, "learning_rate": 0.0003612419815427702, "loss": 5.2414, "mean_token_accuracy": 0.20367368310689926, "num_tokens": 9066761.0, "step": 4985 }, { "entropy": 5.800252914428711, "epoch": 4.287064890416846, "grad_norm": 1.390625, "learning_rate": 0.0003609347981541726, "loss": 5.3227, "mean_token_accuracy": 0.1967179670929909, "num_tokens": 9075535.0, "step": 4990 }, { "entropy": 5.852711486816406, "epoch": 4.291362269015901, "grad_norm": 1.2265625, "learning_rate": 0.00036062742711832376, "loss": 5.2097, "mean_token_accuracy": 0.2098710834980011, "num_tokens": 9084559.0, "step": 4995 }, { "entropy": 5.792783451080322, "epoch": 4.295659647614955, "grad_norm": 1.2421875, "learning_rate": 0.0003603198691063991, "loss": 5.1864, "mean_token_accuracy": 0.21033176481723787, "num_tokens": 9093069.0, "step": 5000 }, { "epoch": 4.295659647614955, "eval_entropy": 5.5126907895277215, "eval_loss": 5.941234588623047, "eval_mean_token_accuracy": 0.17705249861889594, "eval_num_tokens": 9093069.0, "eval_runtime": 2.0573, "eval_samples_per_second": 1725.077, "eval_steps_per_second": 215.817, "step": 5000 }, { "entropy": 5.730698966979981, "epoch": 4.29995702621401, "grad_norm": 1.34375, "learning_rate": 0.0003600121247899824, "loss": 5.1854, "mean_token_accuracy": 0.21148194521665573, "num_tokens": 9101914.0, "step": 5005 }, { "entropy": 5.787820625305176, "epoch": 4.304254404813064, "grad_norm": 1.1328125, "learning_rate": 0.00035970419484106404, "loss": 5.2651, "mean_token_accuracy": 0.2031300500035286, "num_tokens": 9110967.0, "step": 5010 }, { "entropy": 5.860268306732178, "epoch": 4.308551783412119, "grad_norm": 1.2734375, "learning_rate": 0.0003593960799320402, "loss": 5.3436, "mean_token_accuracy": 0.20152566432952881, "num_tokens": 9119774.0, "step": 5015 }, { "entropy": 5.8534019947052, "epoch": 4.312849162011173, "grad_norm": 1.2890625, "learning_rate": 0.0003590877807357107, "loss": 5.2621, "mean_token_accuracy": 0.20448311269283295, "num_tokens": 9127738.0, "step": 5020 }, { "entropy": 5.747374820709228, "epoch": 4.317146540610228, "grad_norm": 1.234375, "learning_rate": 0.0003587792979252776, "loss": 5.2284, "mean_token_accuracy": 0.21259768903255463, "num_tokens": 9137060.0, "step": 5025 }, { "entropy": 5.738905239105224, "epoch": 4.321443919209282, "grad_norm": 1.2734375, "learning_rate": 0.0003584706321743442, "loss": 5.1536, "mean_token_accuracy": 0.20924367606639863, "num_tokens": 9145169.0, "step": 5030 }, { "entropy": 5.76141357421875, "epoch": 4.325741297808337, "grad_norm": 1.140625, "learning_rate": 0.000358161784156913, "loss": 5.1942, "mean_token_accuracy": 0.2173262655735016, "num_tokens": 9154092.0, "step": 5035 }, { "entropy": 5.825730895996093, "epoch": 4.3300386764073915, "grad_norm": 1.1171875, "learning_rate": 0.00035785275454738456, "loss": 5.2386, "mean_token_accuracy": 0.2012816220521927, "num_tokens": 9162824.0, "step": 5040 }, { "entropy": 5.742588758468628, "epoch": 4.334336055006446, "grad_norm": 1.3984375, "learning_rate": 0.00035754354402055635, "loss": 5.1559, "mean_token_accuracy": 0.2160980686545372, "num_tokens": 9170977.0, "step": 5045 }, { "entropy": 5.702868556976318, "epoch": 4.338633433605501, "grad_norm": 1.203125, "learning_rate": 0.0003572341532516202, "loss": 5.1992, "mean_token_accuracy": 0.20814448446035386, "num_tokens": 9179539.0, "step": 5050 }, { "entropy": 5.7406261444091795, "epoch": 4.342930812204555, "grad_norm": 1.140625, "learning_rate": 0.0003569245829161622, "loss": 5.2807, "mean_token_accuracy": 0.20756541788578034, "num_tokens": 9188861.0, "step": 5055 }, { "entropy": 5.784680891036987, "epoch": 4.34722819080361, "grad_norm": 1.15625, "learning_rate": 0.00035661483369016004, "loss": 5.2286, "mean_token_accuracy": 0.1989392876625061, "num_tokens": 9197724.0, "step": 5060 }, { "entropy": 5.787300729751587, "epoch": 4.351525569402664, "grad_norm": 1.171875, "learning_rate": 0.0003563049062499822, "loss": 5.2315, "mean_token_accuracy": 0.20997477173805237, "num_tokens": 9206375.0, "step": 5065 }, { "entropy": 5.733764886856079, "epoch": 4.355822948001719, "grad_norm": 1.2890625, "learning_rate": 0.0003559948012723865, "loss": 5.1919, "mean_token_accuracy": 0.21158115565776825, "num_tokens": 9214675.0, "step": 5070 }, { "entropy": 5.733879041671753, "epoch": 4.360120326600773, "grad_norm": 1.2265625, "learning_rate": 0.0003556845194345181, "loss": 5.2254, "mean_token_accuracy": 0.20888206362724304, "num_tokens": 9224128.0, "step": 5075 }, { "entropy": 5.7081492900848385, "epoch": 4.364417705199828, "grad_norm": 1.3671875, "learning_rate": 0.0003553740614139086, "loss": 5.1392, "mean_token_accuracy": 0.21299163699150087, "num_tokens": 9232568.0, "step": 5080 }, { "entropy": 5.800437164306641, "epoch": 4.368715083798882, "grad_norm": 1.3125, "learning_rate": 0.0003550634278884742, "loss": 5.2324, "mean_token_accuracy": 0.21197118163108825, "num_tokens": 9241809.0, "step": 5085 }, { "entropy": 5.77132625579834, "epoch": 4.373012462397937, "grad_norm": 1.1484375, "learning_rate": 0.00035475261953651433, "loss": 5.2321, "mean_token_accuracy": 0.21335066556930543, "num_tokens": 9250845.0, "step": 5090 }, { "entropy": 5.693653917312622, "epoch": 4.3773098409969915, "grad_norm": 1.2578125, "learning_rate": 0.00035444163703671026, "loss": 5.1943, "mean_token_accuracy": 0.21285624653100968, "num_tokens": 9259465.0, "step": 5095 }, { "entropy": 5.744434118270874, "epoch": 4.381607219596046, "grad_norm": 1.09375, "learning_rate": 0.00035413048106812357, "loss": 5.1779, "mean_token_accuracy": 0.22047749757766724, "num_tokens": 9267853.0, "step": 5100 }, { "entropy": 5.905093669891357, "epoch": 4.385904598195101, "grad_norm": 1.140625, "learning_rate": 0.00035381915231019425, "loss": 5.3809, "mean_token_accuracy": 0.19569829255342483, "num_tokens": 9276664.0, "step": 5105 }, { "entropy": 5.811098051071167, "epoch": 4.390201976794156, "grad_norm": 1.2109375, "learning_rate": 0.0003535076514427401, "loss": 5.1895, "mean_token_accuracy": 0.203962604701519, "num_tokens": 9285482.0, "step": 5110 }, { "entropy": 5.80591082572937, "epoch": 4.39449935539321, "grad_norm": 1.109375, "learning_rate": 0.00035319597914595436, "loss": 5.2852, "mean_token_accuracy": 0.195915387570858, "num_tokens": 9293936.0, "step": 5115 }, { "entropy": 5.787075090408325, "epoch": 4.398796733992265, "grad_norm": 1.171875, "learning_rate": 0.0003528841361004049, "loss": 5.3233, "mean_token_accuracy": 0.19306655526161193, "num_tokens": 9303998.0, "step": 5120 }, { "entropy": 5.752510738372803, "epoch": 4.40309411259132, "grad_norm": 1.2265625, "learning_rate": 0.0003525721229870323, "loss": 5.251, "mean_token_accuracy": 0.2090088054537773, "num_tokens": 9313117.0, "step": 5125 }, { "entropy": 5.783660840988159, "epoch": 4.407391491190374, "grad_norm": 1.1875, "learning_rate": 0.00035225994048714823, "loss": 5.2526, "mean_token_accuracy": 0.2077660158276558, "num_tokens": 9321446.0, "step": 5130 }, { "entropy": 5.77846508026123, "epoch": 4.411688869789429, "grad_norm": 1.1875, "learning_rate": 0.0003519475892824348, "loss": 5.2384, "mean_token_accuracy": 0.20596326589584352, "num_tokens": 9330752.0, "step": 5135 }, { "entropy": 5.770553827285767, "epoch": 4.415986248388483, "grad_norm": 1.1875, "learning_rate": 0.0003516350700549419, "loss": 5.2732, "mean_token_accuracy": 0.20229104906320572, "num_tokens": 9339322.0, "step": 5140 }, { "entropy": 5.816301393508911, "epoch": 4.420283626987538, "grad_norm": 1.21875, "learning_rate": 0.00035132238348708697, "loss": 5.2839, "mean_token_accuracy": 0.20409662872552872, "num_tokens": 9349024.0, "step": 5145 }, { "entropy": 5.914752197265625, "epoch": 4.424581005586592, "grad_norm": 1.2890625, "learning_rate": 0.00035100953026165224, "loss": 5.3867, "mean_token_accuracy": 0.19790366888046265, "num_tokens": 9358833.0, "step": 5150 }, { "entropy": 5.826274108886719, "epoch": 4.428878384185647, "grad_norm": 1.1328125, "learning_rate": 0.0003506965110617841, "loss": 5.2289, "mean_token_accuracy": 0.214010326564312, "num_tokens": 9368276.0, "step": 5155 }, { "entropy": 5.816830682754516, "epoch": 4.4331757627847015, "grad_norm": 1.0390625, "learning_rate": 0.0003503833265709915, "loss": 5.3071, "mean_token_accuracy": 0.20325663387775422, "num_tokens": 9378501.0, "step": 5160 }, { "entropy": 5.826269149780273, "epoch": 4.437473141383756, "grad_norm": 1.25, "learning_rate": 0.00035006997747314404, "loss": 5.2774, "mean_token_accuracy": 0.19983620047569275, "num_tokens": 9387789.0, "step": 5165 }, { "entropy": 5.813027286529541, "epoch": 4.441770519982811, "grad_norm": 1.1484375, "learning_rate": 0.00034975646445247106, "loss": 5.3265, "mean_token_accuracy": 0.20185133814811707, "num_tokens": 9397041.0, "step": 5170 }, { "entropy": 5.773282194137574, "epoch": 4.446067898581865, "grad_norm": 1.2109375, "learning_rate": 0.0003494427881935596, "loss": 5.2723, "mean_token_accuracy": 0.2093829855322838, "num_tokens": 9405393.0, "step": 5175 }, { "entropy": 5.777915334701538, "epoch": 4.45036527718092, "grad_norm": 1.1484375, "learning_rate": 0.00034912894938135325, "loss": 5.2298, "mean_token_accuracy": 0.2085086777806282, "num_tokens": 9415127.0, "step": 5180 }, { "entropy": 5.829266881942749, "epoch": 4.454662655779974, "grad_norm": 1.234375, "learning_rate": 0.0003488149487011506, "loss": 5.3247, "mean_token_accuracy": 0.20795295685529708, "num_tokens": 9424416.0, "step": 5185 }, { "entropy": 5.84766411781311, "epoch": 4.458960034379029, "grad_norm": 1.1484375, "learning_rate": 0.00034850078683860346, "loss": 5.2875, "mean_token_accuracy": 0.1986890748143196, "num_tokens": 9434523.0, "step": 5190 }, { "entropy": 5.80431432723999, "epoch": 4.463257412978083, "grad_norm": 1.1484375, "learning_rate": 0.0003481864644797159, "loss": 5.2997, "mean_token_accuracy": 0.21254103779792785, "num_tokens": 9443605.0, "step": 5195 }, { "entropy": 5.781310033798218, "epoch": 4.467554791577138, "grad_norm": 1.1796875, "learning_rate": 0.0003478719823108424, "loss": 5.2947, "mean_token_accuracy": 0.19746852219104766, "num_tokens": 9453268.0, "step": 5200 }, { "entropy": 5.7848005294799805, "epoch": 4.471852170176192, "grad_norm": 1.203125, "learning_rate": 0.00034755734101868613, "loss": 5.1721, "mean_token_accuracy": 0.21285812854766845, "num_tokens": 9461578.0, "step": 5205 }, { "entropy": 5.753478622436523, "epoch": 4.476149548775247, "grad_norm": 1.1484375, "learning_rate": 0.00034724254129029795, "loss": 5.2001, "mean_token_accuracy": 0.21043628752231597, "num_tokens": 9470722.0, "step": 5210 }, { "entropy": 5.809699296951294, "epoch": 4.4804469273743015, "grad_norm": 1.296875, "learning_rate": 0.0003469275838130748, "loss": 5.3227, "mean_token_accuracy": 0.20192081183195115, "num_tokens": 9479695.0, "step": 5215 }, { "entropy": 5.808649206161499, "epoch": 4.484744305973356, "grad_norm": 1.1953125, "learning_rate": 0.0003466124692747577, "loss": 5.232, "mean_token_accuracy": 0.20722401291131973, "num_tokens": 9488444.0, "step": 5220 }, { "entropy": 5.7202616214752195, "epoch": 4.489041684572411, "grad_norm": 1.2265625, "learning_rate": 0.00034629719836343106, "loss": 5.1774, "mean_token_accuracy": 0.2171816810965538, "num_tokens": 9497413.0, "step": 5225 }, { "entropy": 5.747977256774902, "epoch": 4.493339063171465, "grad_norm": 1.296875, "learning_rate": 0.0003459817717675203, "loss": 5.2276, "mean_token_accuracy": 0.21789820790290831, "num_tokens": 9506135.0, "step": 5230 }, { "entropy": 5.8027201175689695, "epoch": 4.49763644177052, "grad_norm": 1.0546875, "learning_rate": 0.0003456661901757913, "loss": 5.2937, "mean_token_accuracy": 0.19964347779750824, "num_tokens": 9516918.0, "step": 5235 }, { "entropy": 5.848086404800415, "epoch": 4.501933820369574, "grad_norm": 1.2734375, "learning_rate": 0.00034535045427734796, "loss": 5.2538, "mean_token_accuracy": 0.21298788338899613, "num_tokens": 9526052.0, "step": 5240 }, { "entropy": 5.732174396514893, "epoch": 4.506231198968629, "grad_norm": 1.25, "learning_rate": 0.0003450345647616313, "loss": 5.3185, "mean_token_accuracy": 0.20533958673477173, "num_tokens": 9535200.0, "step": 5245 }, { "entropy": 5.726828384399414, "epoch": 4.510528577567683, "grad_norm": 1.171875, "learning_rate": 0.0003447185223184177, "loss": 5.281, "mean_token_accuracy": 0.2057891994714737, "num_tokens": 9544786.0, "step": 5250 }, { "entropy": 5.807714033126831, "epoch": 4.514825956166739, "grad_norm": 1.1484375, "learning_rate": 0.00034440232763781765, "loss": 5.2182, "mean_token_accuracy": 0.21241068094968796, "num_tokens": 9553694.0, "step": 5255 }, { "entropy": 5.750864171981812, "epoch": 4.519123334765792, "grad_norm": 1.203125, "learning_rate": 0.000344085981410274, "loss": 5.2862, "mean_token_accuracy": 0.20592611283063889, "num_tokens": 9563332.0, "step": 5260 }, { "entropy": 5.697850608825684, "epoch": 4.523420713364848, "grad_norm": 1.140625, "learning_rate": 0.00034376948432656036, "loss": 5.1916, "mean_token_accuracy": 0.2156210646033287, "num_tokens": 9572367.0, "step": 5265 }, { "entropy": 5.792697620391846, "epoch": 4.527718091963902, "grad_norm": 1.0546875, "learning_rate": 0.0003434528370777798, "loss": 5.29, "mean_token_accuracy": 0.20060945600271224, "num_tokens": 9582535.0, "step": 5270 }, { "entropy": 5.776414012908935, "epoch": 4.532015470562957, "grad_norm": 1.2109375, "learning_rate": 0.00034313604035536344, "loss": 5.2398, "mean_token_accuracy": 0.2140058010816574, "num_tokens": 9590688.0, "step": 5275 }, { "entropy": 5.744070100784302, "epoch": 4.5363128491620115, "grad_norm": 1.1875, "learning_rate": 0.0003428190948510687, "loss": 5.2788, "mean_token_accuracy": 0.2073332354426384, "num_tokens": 9599209.0, "step": 5280 }, { "entropy": 5.789183044433594, "epoch": 4.540610227761066, "grad_norm": 1.2109375, "learning_rate": 0.0003425020012569778, "loss": 5.3154, "mean_token_accuracy": 0.2052520453929901, "num_tokens": 9608575.0, "step": 5285 }, { "entropy": 5.872993040084839, "epoch": 4.544907606360121, "grad_norm": 1.2265625, "learning_rate": 0.00034218476026549665, "loss": 5.2731, "mean_token_accuracy": 0.205942103266716, "num_tokens": 9617312.0, "step": 5290 }, { "entropy": 5.782846736907959, "epoch": 4.549204984959175, "grad_norm": 1.2578125, "learning_rate": 0.0003418673725693524, "loss": 5.26, "mean_token_accuracy": 0.21283673942089082, "num_tokens": 9626398.0, "step": 5295 }, { "entropy": 5.746619319915771, "epoch": 4.55350236355823, "grad_norm": 1.1875, "learning_rate": 0.0003415498388615932, "loss": 5.2346, "mean_token_accuracy": 0.20255037099123002, "num_tokens": 9635470.0, "step": 5300 }, { "entropy": 5.792229700088501, "epoch": 4.557799742157284, "grad_norm": 1.1640625, "learning_rate": 0.0003412321598355857, "loss": 5.1799, "mean_token_accuracy": 0.21325116753578185, "num_tokens": 9644728.0, "step": 5305 }, { "entropy": 5.7457763195037845, "epoch": 4.562097120756339, "grad_norm": 1.09375, "learning_rate": 0.0003409143361850139, "loss": 5.2343, "mean_token_accuracy": 0.2107704222202301, "num_tokens": 9654129.0, "step": 5310 }, { "entropy": 5.777891540527344, "epoch": 4.566394499355393, "grad_norm": 1.25, "learning_rate": 0.0003405963686038775, "loss": 5.3331, "mean_token_accuracy": 0.20437313318252565, "num_tokens": 9662648.0, "step": 5315 }, { "entropy": 5.8037111282348635, "epoch": 4.570691877954448, "grad_norm": 1.171875, "learning_rate": 0.0003402782577864908, "loss": 5.2993, "mean_token_accuracy": 0.20600511133670807, "num_tokens": 9672082.0, "step": 5320 }, { "entropy": 5.8148528099060055, "epoch": 4.574989256553502, "grad_norm": 1.25, "learning_rate": 0.00033996000442748056, "loss": 5.2167, "mean_token_accuracy": 0.21099451184272766, "num_tokens": 9681422.0, "step": 5325 }, { "entropy": 5.799546527862549, "epoch": 4.579286635152557, "grad_norm": 1.28125, "learning_rate": 0.00033964160922178495, "loss": 5.26, "mean_token_accuracy": 0.20364574044942857, "num_tokens": 9690675.0, "step": 5330 }, { "entropy": 5.7803350448608395, "epoch": 4.5835840137516115, "grad_norm": 1.140625, "learning_rate": 0.0003393230728646518, "loss": 5.2486, "mean_token_accuracy": 0.2094307616353035, "num_tokens": 9700200.0, "step": 5335 }, { "entropy": 5.7491453170776365, "epoch": 4.587881392350666, "grad_norm": 1.234375, "learning_rate": 0.00033900439605163724, "loss": 5.2404, "mean_token_accuracy": 0.20623662322759628, "num_tokens": 9709533.0, "step": 5340 }, { "entropy": 5.702243328094482, "epoch": 4.592178770949721, "grad_norm": 1.1015625, "learning_rate": 0.00033868557947860407, "loss": 5.2712, "mean_token_accuracy": 0.21160826086997986, "num_tokens": 9719250.0, "step": 5345 }, { "entropy": 5.789318418502807, "epoch": 4.596476149548775, "grad_norm": 1.1796875, "learning_rate": 0.00033836662384172014, "loss": 5.204, "mean_token_accuracy": 0.21837505847215652, "num_tokens": 9727837.0, "step": 5350 }, { "entropy": 5.77027473449707, "epoch": 4.60077352814783, "grad_norm": 1.1640625, "learning_rate": 0.0003380475298374573, "loss": 5.2919, "mean_token_accuracy": 0.20920580327510835, "num_tokens": 9737125.0, "step": 5355 }, { "entropy": 5.786106395721435, "epoch": 4.605070906746884, "grad_norm": 1.1328125, "learning_rate": 0.000337728298162589, "loss": 5.3048, "mean_token_accuracy": 0.20057657063007356, "num_tokens": 9746309.0, "step": 5360 }, { "entropy": 5.766927814483642, "epoch": 4.609368285345939, "grad_norm": 1.171875, "learning_rate": 0.00033740892951418993, "loss": 5.1898, "mean_token_accuracy": 0.2109677717089653, "num_tokens": 9755633.0, "step": 5365 }, { "entropy": 5.8201977729797365, "epoch": 4.613665663944993, "grad_norm": 1.2890625, "learning_rate": 0.0003370894245896333, "loss": 5.241, "mean_token_accuracy": 0.20121043920516968, "num_tokens": 9765179.0, "step": 5370 }, { "entropy": 5.838722229003906, "epoch": 4.617963042544048, "grad_norm": 1.359375, "learning_rate": 0.00033676978408659047, "loss": 5.2648, "mean_token_accuracy": 0.20447903871536255, "num_tokens": 9774085.0, "step": 5375 }, { "entropy": 5.819502544403076, "epoch": 4.622260421143102, "grad_norm": 1.1015625, "learning_rate": 0.0003364500087030283, "loss": 5.3734, "mean_token_accuracy": 0.19501604586839677, "num_tokens": 9784650.0, "step": 5380 }, { "entropy": 5.8356828689575195, "epoch": 4.626557799742157, "grad_norm": 1.109375, "learning_rate": 0.00033613009913720845, "loss": 5.2185, "mean_token_accuracy": 0.2058563932776451, "num_tokens": 9793947.0, "step": 5385 }, { "entropy": 5.723565244674683, "epoch": 4.6308551783412115, "grad_norm": 1.2265625, "learning_rate": 0.00033581005608768563, "loss": 5.206, "mean_token_accuracy": 0.21180496960878373, "num_tokens": 9803593.0, "step": 5390 }, { "entropy": 5.743806266784668, "epoch": 4.635152556940266, "grad_norm": 1.2109375, "learning_rate": 0.0003354898802533058, "loss": 5.2527, "mean_token_accuracy": 0.2048086941242218, "num_tokens": 9812295.0, "step": 5395 }, { "entropy": 5.786093854904175, "epoch": 4.6394499355393215, "grad_norm": 1.109375, "learning_rate": 0.0003351695723332051, "loss": 5.2582, "mean_token_accuracy": 0.20891703963279723, "num_tokens": 9820586.0, "step": 5400 }, { "entropy": 5.785850095748901, "epoch": 4.643747314138375, "grad_norm": 1.15625, "learning_rate": 0.00033484913302680807, "loss": 5.1887, "mean_token_accuracy": 0.20835017114877702, "num_tokens": 9829080.0, "step": 5405 }, { "entropy": 5.780792331695556, "epoch": 4.648044692737431, "grad_norm": 1.125, "learning_rate": 0.00033452856303382595, "loss": 5.2112, "mean_token_accuracy": 0.20481612533330917, "num_tokens": 9838421.0, "step": 5410 }, { "entropy": 5.704366683959961, "epoch": 4.652342071336484, "grad_norm": 1.3515625, "learning_rate": 0.0003342078630542555, "loss": 5.2078, "mean_token_accuracy": 0.2090738072991371, "num_tokens": 9847151.0, "step": 5415 }, { "entropy": 5.789543390274048, "epoch": 4.65663944993554, "grad_norm": 1.171875, "learning_rate": 0.00033388703378837737, "loss": 5.2403, "mean_token_accuracy": 0.2085887834429741, "num_tokens": 9856803.0, "step": 5420 }, { "entropy": 5.778923892974854, "epoch": 4.660936828534594, "grad_norm": 1.1875, "learning_rate": 0.0003335660759367544, "loss": 5.1568, "mean_token_accuracy": 0.22275909930467605, "num_tokens": 9865617.0, "step": 5425 }, { "entropy": 5.741341590881348, "epoch": 4.665234207133649, "grad_norm": 1.1171875, "learning_rate": 0.00033324499020023025, "loss": 5.227, "mean_token_accuracy": 0.2128440573811531, "num_tokens": 9875454.0, "step": 5430 }, { "entropy": 5.792864894866943, "epoch": 4.669531585732703, "grad_norm": 1.2109375, "learning_rate": 0.0003329237772799277, "loss": 5.2145, "mean_token_accuracy": 0.21179936379194259, "num_tokens": 9884770.0, "step": 5435 }, { "entropy": 5.753189039230347, "epoch": 4.673828964331758, "grad_norm": 1.2421875, "learning_rate": 0.0003326024378772477, "loss": 5.2166, "mean_token_accuracy": 0.20966077744960784, "num_tokens": 9893594.0, "step": 5440 }, { "entropy": 5.75540075302124, "epoch": 4.678126342930812, "grad_norm": 1.1328125, "learning_rate": 0.0003322809726938667, "loss": 5.3273, "mean_token_accuracy": 0.19715422689914702, "num_tokens": 9902260.0, "step": 5445 }, { "entropy": 5.785859155654907, "epoch": 4.682423721529867, "grad_norm": 1.2421875, "learning_rate": 0.00033195938243173645, "loss": 5.2191, "mean_token_accuracy": 0.2127591535449028, "num_tokens": 9911020.0, "step": 5450 }, { "entropy": 5.830314302444458, "epoch": 4.6867211001289215, "grad_norm": 1.359375, "learning_rate": 0.0003316376677930814, "loss": 5.2353, "mean_token_accuracy": 0.2036338374018669, "num_tokens": 9918696.0, "step": 5455 }, { "entropy": 5.73724684715271, "epoch": 4.691018478727976, "grad_norm": 1.21875, "learning_rate": 0.0003313158294803977, "loss": 5.2727, "mean_token_accuracy": 0.19963512420654297, "num_tokens": 9927638.0, "step": 5460 }, { "entropy": 5.745275354385376, "epoch": 4.695315857327031, "grad_norm": 1.21875, "learning_rate": 0.00033099386819645176, "loss": 5.2512, "mean_token_accuracy": 0.20709264129400254, "num_tokens": 9936969.0, "step": 5465 }, { "entropy": 5.7822166919708256, "epoch": 4.699613235926085, "grad_norm": 1.0546875, "learning_rate": 0.0003306717846442782, "loss": 5.1656, "mean_token_accuracy": 0.20781334042549132, "num_tokens": 9945229.0, "step": 5470 }, { "entropy": 5.7839744091033936, "epoch": 4.70391061452514, "grad_norm": 1.1640625, "learning_rate": 0.0003303495795271788, "loss": 5.1618, "mean_token_accuracy": 0.2114815816283226, "num_tokens": 9953759.0, "step": 5475 }, { "entropy": 5.725874710083008, "epoch": 4.708207993124194, "grad_norm": 1.1640625, "learning_rate": 0.00033002725354872075, "loss": 5.2698, "mean_token_accuracy": 0.2067277103662491, "num_tokens": 9962771.0, "step": 5480 }, { "entropy": 5.762161922454834, "epoch": 4.712505371723249, "grad_norm": 1.25, "learning_rate": 0.00032970480741273514, "loss": 5.2727, "mean_token_accuracy": 0.19539556801319122, "num_tokens": 9972481.0, "step": 5485 }, { "entropy": 5.821972513198853, "epoch": 4.716802750322303, "grad_norm": 1.4140625, "learning_rate": 0.0003293822418233155, "loss": 5.2272, "mean_token_accuracy": 0.2026599943637848, "num_tokens": 9980773.0, "step": 5490 }, { "entropy": 5.833219766616821, "epoch": 4.721100128921358, "grad_norm": 1.21875, "learning_rate": 0.0003290595574848161, "loss": 5.2993, "mean_token_accuracy": 0.19704944640398026, "num_tokens": 9989830.0, "step": 5495 }, { "entropy": 5.733156061172485, "epoch": 4.725397507520412, "grad_norm": 1.1796875, "learning_rate": 0.0003287367551018505, "loss": 5.2339, "mean_token_accuracy": 0.2133151039481163, "num_tokens": 9999234.0, "step": 5500 }, { "epoch": 4.725397507520412, "eval_entropy": 5.560198612041302, "eval_loss": 5.90402889251709, "eval_mean_token_accuracy": 0.17905197752354382, "eval_num_tokens": 9999234.0, "eval_runtime": 2.2434, "eval_samples_per_second": 1581.999, "eval_steps_per_second": 197.917, "step": 5500 }, { "entropy": 5.795751619338989, "epoch": 4.729694886119467, "grad_norm": 1.09375, "learning_rate": 0.0003284138353792903, "loss": 5.2886, "mean_token_accuracy": 0.2074010655283928, "num_tokens": 10008671.0, "step": 5505 }, { "entropy": 5.721130895614624, "epoch": 4.7339922647185215, "grad_norm": 1.3203125, "learning_rate": 0.0003280907990222628, "loss": 5.2707, "mean_token_accuracy": 0.2059103801846504, "num_tokens": 10017170.0, "step": 5510 }, { "entropy": 5.723065376281738, "epoch": 4.738289643317576, "grad_norm": 1.3359375, "learning_rate": 0.00032776764673615055, "loss": 5.2724, "mean_token_accuracy": 0.20495502203702926, "num_tokens": 10025712.0, "step": 5515 }, { "entropy": 5.788556861877441, "epoch": 4.742587021916631, "grad_norm": 1.125, "learning_rate": 0.0003274443792265888, "loss": 5.2631, "mean_token_accuracy": 0.21783117204904556, "num_tokens": 10035297.0, "step": 5520 }, { "entropy": 5.810042524337769, "epoch": 4.746884400515685, "grad_norm": 1.2109375, "learning_rate": 0.00032712099719946474, "loss": 5.2512, "mean_token_accuracy": 0.21279625296592714, "num_tokens": 10043903.0, "step": 5525 }, { "entropy": 5.732596445083618, "epoch": 4.75118177911474, "grad_norm": 1.21875, "learning_rate": 0.00032679750136091533, "loss": 5.2747, "mean_token_accuracy": 0.2008904069662094, "num_tokens": 10053035.0, "step": 5530 }, { "entropy": 5.677971029281617, "epoch": 4.755479157713794, "grad_norm": 1.25, "learning_rate": 0.0003264738924173262, "loss": 5.2332, "mean_token_accuracy": 0.21256235390901565, "num_tokens": 10061911.0, "step": 5535 }, { "entropy": 5.743843460083008, "epoch": 4.759776536312849, "grad_norm": 1.203125, "learning_rate": 0.00032615017107533, "loss": 5.2502, "mean_token_accuracy": 0.20980682075023652, "num_tokens": 10070738.0, "step": 5540 }, { "entropy": 5.779987239837647, "epoch": 4.764073914911903, "grad_norm": 1.25, "learning_rate": 0.0003258263380418047, "loss": 5.2579, "mean_token_accuracy": 0.21013425886631013, "num_tokens": 10080638.0, "step": 5545 }, { "entropy": 5.874662113189697, "epoch": 4.768371293510958, "grad_norm": 1.265625, "learning_rate": 0.00032550239402387226, "loss": 5.2999, "mean_token_accuracy": 0.20025704056024551, "num_tokens": 10089429.0, "step": 5550 }, { "entropy": 5.714057350158692, "epoch": 4.772668672110013, "grad_norm": 1.203125, "learning_rate": 0.00032517833972889695, "loss": 5.1752, "mean_token_accuracy": 0.21451522409915924, "num_tokens": 10098109.0, "step": 5555 }, { "entropy": 5.8082190990448, "epoch": 4.776966050709067, "grad_norm": 1.265625, "learning_rate": 0.00032485417586448375, "loss": 5.2778, "mean_token_accuracy": 0.207219435274601, "num_tokens": 10106808.0, "step": 5560 }, { "entropy": 5.796800899505615, "epoch": 4.781263429308122, "grad_norm": 1.3203125, "learning_rate": 0.000324529903138477, "loss": 5.2739, "mean_token_accuracy": 0.20491307973861694, "num_tokens": 10116372.0, "step": 5565 }, { "entropy": 5.746866130828858, "epoch": 4.785560807907177, "grad_norm": 1.1796875, "learning_rate": 0.0003242055222589587, "loss": 5.1874, "mean_token_accuracy": 0.21595880538225173, "num_tokens": 10125256.0, "step": 5570 }, { "entropy": 5.781742668151855, "epoch": 4.7898581865062315, "grad_norm": 1.1796875, "learning_rate": 0.000323881033934247, "loss": 5.3216, "mean_token_accuracy": 0.19934289157390594, "num_tokens": 10134784.0, "step": 5575 }, { "entropy": 5.8426214218139645, "epoch": 4.794155565105286, "grad_norm": 1.2578125, "learning_rate": 0.00032355643887289486, "loss": 5.2638, "mean_token_accuracy": 0.20878719687461852, "num_tokens": 10144324.0, "step": 5580 }, { "entropy": 5.8031574249267575, "epoch": 4.798452943704341, "grad_norm": 1.1953125, "learning_rate": 0.0003232317377836881, "loss": 5.291, "mean_token_accuracy": 0.20265911519527435, "num_tokens": 10152866.0, "step": 5585 }, { "entropy": 5.698230791091919, "epoch": 4.802750322303395, "grad_norm": 1.15625, "learning_rate": 0.000322906931375644, "loss": 5.206, "mean_token_accuracy": 0.20940036773681642, "num_tokens": 10162457.0, "step": 5590 }, { "entropy": 5.738978576660156, "epoch": 4.80704770090245, "grad_norm": 1.1953125, "learning_rate": 0.00032258202035801, "loss": 5.2816, "mean_token_accuracy": 0.2019842505455017, "num_tokens": 10171604.0, "step": 5595 }, { "entropy": 5.848924446105957, "epoch": 4.811345079501504, "grad_norm": 1.140625, "learning_rate": 0.000322257005440262, "loss": 5.2371, "mean_token_accuracy": 0.2144896060228348, "num_tokens": 10180762.0, "step": 5600 }, { "entropy": 5.717645740509033, "epoch": 4.815642458100559, "grad_norm": 1.171875, "learning_rate": 0.0003219318873321025, "loss": 5.1608, "mean_token_accuracy": 0.22832754403352737, "num_tokens": 10189122.0, "step": 5605 }, { "entropy": 5.69571099281311, "epoch": 4.819939836699613, "grad_norm": 1.15625, "learning_rate": 0.00032160666674345954, "loss": 5.2763, "mean_token_accuracy": 0.19781464338302612, "num_tokens": 10197280.0, "step": 5610 }, { "entropy": 5.781878519058227, "epoch": 4.824237215298668, "grad_norm": 1.1171875, "learning_rate": 0.00032128134438448504, "loss": 5.3146, "mean_token_accuracy": 0.20028045326471328, "num_tokens": 10207507.0, "step": 5615 }, { "entropy": 5.825538921356201, "epoch": 4.828534593897722, "grad_norm": 1.2421875, "learning_rate": 0.00032095592096555284, "loss": 5.2795, "mean_token_accuracy": 0.2031347781419754, "num_tokens": 10217584.0, "step": 5620 }, { "entropy": 5.776251411437988, "epoch": 4.832831972496777, "grad_norm": 1.140625, "learning_rate": 0.0003206303971972577, "loss": 5.216, "mean_token_accuracy": 0.2127404496073723, "num_tokens": 10226388.0, "step": 5625 }, { "entropy": 5.74369969367981, "epoch": 4.8371293510958315, "grad_norm": 1.234375, "learning_rate": 0.0003203047737904134, "loss": 5.2346, "mean_token_accuracy": 0.21056682765483856, "num_tokens": 10235333.0, "step": 5630 }, { "entropy": 5.7578212261199955, "epoch": 4.841426729694886, "grad_norm": 1.2421875, "learning_rate": 0.00031997905145605135, "loss": 5.285, "mean_token_accuracy": 0.2008419454097748, "num_tokens": 10243985.0, "step": 5635 }, { "entropy": 5.800904607772827, "epoch": 4.845724108293941, "grad_norm": 1.2578125, "learning_rate": 0.00031965323090541874, "loss": 5.2762, "mean_token_accuracy": 0.20055779963731765, "num_tokens": 10252968.0, "step": 5640 }, { "entropy": 5.7787864208221436, "epoch": 4.850021486892995, "grad_norm": 1.234375, "learning_rate": 0.0003193273128499777, "loss": 5.1509, "mean_token_accuracy": 0.20814335197210312, "num_tokens": 10261890.0, "step": 5645 }, { "entropy": 5.755404710769653, "epoch": 4.85431886549205, "grad_norm": 1.234375, "learning_rate": 0.00031900129800140287, "loss": 5.264, "mean_token_accuracy": 0.2088309958577156, "num_tokens": 10271363.0, "step": 5650 }, { "entropy": 5.778644323348999, "epoch": 4.858616244091104, "grad_norm": 1.3671875, "learning_rate": 0.00031867518707158027, "loss": 5.3037, "mean_token_accuracy": 0.20097779482603073, "num_tokens": 10280608.0, "step": 5655 }, { "entropy": 5.763189840316772, "epoch": 4.862913622690159, "grad_norm": 1.21875, "learning_rate": 0.000318348980772606, "loss": 5.2226, "mean_token_accuracy": 0.2084927350282669, "num_tokens": 10289972.0, "step": 5660 }, { "entropy": 5.852762603759766, "epoch": 4.867211001289213, "grad_norm": 1.328125, "learning_rate": 0.00031802267981678414, "loss": 5.2859, "mean_token_accuracy": 0.2053852140903473, "num_tokens": 10298740.0, "step": 5665 }, { "entropy": 5.785357904434204, "epoch": 4.871508379888268, "grad_norm": 1.09375, "learning_rate": 0.00031769628491662563, "loss": 5.2421, "mean_token_accuracy": 0.2079989120364189, "num_tokens": 10307706.0, "step": 5670 }, { "entropy": 5.791097354888916, "epoch": 4.8758057584873224, "grad_norm": 1.09375, "learning_rate": 0.00031736979678484634, "loss": 5.2811, "mean_token_accuracy": 0.20809438675642014, "num_tokens": 10317549.0, "step": 5675 }, { "entropy": 5.737194919586182, "epoch": 4.880103137086377, "grad_norm": 1.1796875, "learning_rate": 0.00031704321613436597, "loss": 5.3278, "mean_token_accuracy": 0.2005993828177452, "num_tokens": 10327681.0, "step": 5680 }, { "entropy": 5.695877456665039, "epoch": 4.8844005156854315, "grad_norm": 1.1015625, "learning_rate": 0.0003167165436783061, "loss": 5.2592, "mean_token_accuracy": 0.20781303942203522, "num_tokens": 10336261.0, "step": 5685 }, { "entropy": 5.710936450958252, "epoch": 4.888697894284486, "grad_norm": 1.2421875, "learning_rate": 0.00031638978012998875, "loss": 5.167, "mean_token_accuracy": 0.21940137445926666, "num_tokens": 10344770.0, "step": 5690 }, { "entropy": 5.783964347839356, "epoch": 4.892995272883541, "grad_norm": 1.171875, "learning_rate": 0.000316062926202935, "loss": 5.3284, "mean_token_accuracy": 0.19456012099981307, "num_tokens": 10354246.0, "step": 5695 }, { "entropy": 5.779203987121582, "epoch": 4.897292651482596, "grad_norm": 1.3203125, "learning_rate": 0.0003157359826108632, "loss": 5.2356, "mean_token_accuracy": 0.2105637699365616, "num_tokens": 10362693.0, "step": 5700 }, { "entropy": 5.798118543624878, "epoch": 4.90159003008165, "grad_norm": 1.203125, "learning_rate": 0.00031540895006768727, "loss": 5.247, "mean_token_accuracy": 0.20408895760774612, "num_tokens": 10371639.0, "step": 5705 }, { "entropy": 5.757362174987793, "epoch": 4.905887408680705, "grad_norm": 1.1875, "learning_rate": 0.0003150818292875158, "loss": 5.2643, "mean_token_accuracy": 0.20622867792844773, "num_tokens": 10381237.0, "step": 5710 }, { "entropy": 5.789800691604614, "epoch": 4.91018478727976, "grad_norm": 1.234375, "learning_rate": 0.0003147546209846497, "loss": 5.2454, "mean_token_accuracy": 0.2081000551581383, "num_tokens": 10389932.0, "step": 5715 }, { "entropy": 5.681432151794434, "epoch": 4.914482165878814, "grad_norm": 1.21875, "learning_rate": 0.0003144273258735812, "loss": 5.1358, "mean_token_accuracy": 0.2108309730887413, "num_tokens": 10398938.0, "step": 5720 }, { "entropy": 5.736886119842529, "epoch": 4.918779544477869, "grad_norm": 1.59375, "learning_rate": 0.0003140999446689919, "loss": 5.2497, "mean_token_accuracy": 0.20110664814710616, "num_tokens": 10407980.0, "step": 5725 }, { "entropy": 5.738250875473023, "epoch": 4.923076923076923, "grad_norm": 1.421875, "learning_rate": 0.0003137724780857516, "loss": 5.3108, "mean_token_accuracy": 0.20317183136940004, "num_tokens": 10416990.0, "step": 5730 }, { "entropy": 5.827092552185059, "epoch": 4.927374301675978, "grad_norm": 1.078125, "learning_rate": 0.00031344492683891634, "loss": 5.3121, "mean_token_accuracy": 0.20660584568977355, "num_tokens": 10426573.0, "step": 5735 }, { "entropy": 5.835918712615967, "epoch": 4.931671680275032, "grad_norm": 1.265625, "learning_rate": 0.0003131172916437272, "loss": 5.2782, "mean_token_accuracy": 0.20372102856636048, "num_tokens": 10435162.0, "step": 5740 }, { "entropy": 5.760685634613037, "epoch": 4.935969058874087, "grad_norm": 1.2890625, "learning_rate": 0.00031278957321560845, "loss": 5.2918, "mean_token_accuracy": 0.20650595277547837, "num_tokens": 10444374.0, "step": 5745 }, { "entropy": 5.827428340911865, "epoch": 4.9402664374731415, "grad_norm": 1.4140625, "learning_rate": 0.00031246177227016615, "loss": 5.2964, "mean_token_accuracy": 0.19701409637928008, "num_tokens": 10452679.0, "step": 5750 }, { "entropy": 5.766043090820313, "epoch": 4.944563816072196, "grad_norm": 1.1640625, "learning_rate": 0.00031213388952318653, "loss": 5.2579, "mean_token_accuracy": 0.21266984939575195, "num_tokens": 10461801.0, "step": 5755 }, { "entropy": 5.774032258987427, "epoch": 4.948861194671251, "grad_norm": 1.171875, "learning_rate": 0.0003118059256906345, "loss": 5.2484, "mean_token_accuracy": 0.20315881073474884, "num_tokens": 10471176.0, "step": 5760 }, { "entropy": 5.80142560005188, "epoch": 4.953158573270305, "grad_norm": 1.28125, "learning_rate": 0.00031147788148865204, "loss": 5.2972, "mean_token_accuracy": 0.1972917288541794, "num_tokens": 10480403.0, "step": 5765 }, { "entropy": 5.776571130752563, "epoch": 4.95745595186936, "grad_norm": 1.1953125, "learning_rate": 0.0003111497576335564, "loss": 5.2465, "mean_token_accuracy": 0.20880960077047347, "num_tokens": 10489574.0, "step": 5770 }, { "entropy": 5.754300117492676, "epoch": 4.961753330468414, "grad_norm": 1.0078125, "learning_rate": 0.0003108215548418391, "loss": 5.2601, "mean_token_accuracy": 0.2078449085354805, "num_tokens": 10499631.0, "step": 5775 }, { "entropy": 5.760015392303467, "epoch": 4.966050709067469, "grad_norm": 1.46875, "learning_rate": 0.0003104932738301637, "loss": 5.2392, "mean_token_accuracy": 0.20401757657527925, "num_tokens": 10508128.0, "step": 5780 }, { "entropy": 5.7455097198486325, "epoch": 4.970348087666523, "grad_norm": 1.1484375, "learning_rate": 0.00031016491531536477, "loss": 5.209, "mean_token_accuracy": 0.2097795397043228, "num_tokens": 10517544.0, "step": 5785 }, { "entropy": 5.734928035736084, "epoch": 4.974645466265578, "grad_norm": 1.1953125, "learning_rate": 0.0003098364800144462, "loss": 5.2651, "mean_token_accuracy": 0.21147863417863846, "num_tokens": 10526244.0, "step": 5790 }, { "entropy": 5.814940166473389, "epoch": 4.9789428448646325, "grad_norm": 1.234375, "learning_rate": 0.0003095079686445792, "loss": 5.3519, "mean_token_accuracy": 0.20624426007270813, "num_tokens": 10535887.0, "step": 5795 }, { "entropy": 5.786068773269653, "epoch": 4.983240223463687, "grad_norm": 1.1484375, "learning_rate": 0.00030917938192310146, "loss": 5.1931, "mean_token_accuracy": 0.21397653371095657, "num_tokens": 10544420.0, "step": 5800 }, { "entropy": 5.779589319229126, "epoch": 4.9875376020627415, "grad_norm": 1.1640625, "learning_rate": 0.00030885072056751494, "loss": 5.2765, "mean_token_accuracy": 0.20363772213459014, "num_tokens": 10553114.0, "step": 5805 }, { "entropy": 5.727208232879638, "epoch": 4.991834980661796, "grad_norm": 1.1328125, "learning_rate": 0.00030852198529548476, "loss": 5.3007, "mean_token_accuracy": 0.20537486374378205, "num_tokens": 10562272.0, "step": 5810 }, { "entropy": 5.76721887588501, "epoch": 4.996132359260851, "grad_norm": 1.2109375, "learning_rate": 0.0003081931768248373, "loss": 5.2452, "mean_token_accuracy": 0.20829617381095886, "num_tokens": 10571757.0, "step": 5815 }, { "entropy": 5.730508645375569, "epoch": 5.0, "grad_norm": 1.8125, "learning_rate": 0.0003078642958735588, "loss": 5.2074, "mean_token_accuracy": 0.21393800609641606, "num_tokens": 10579660.0, "step": 5820 }, { "entropy": 5.796049404144287, "epoch": 5.0042973785990545, "grad_norm": 1.2265625, "learning_rate": 0.00030753534315979393, "loss": 5.0885, "mean_token_accuracy": 0.21603988260030746, "num_tokens": 10589139.0, "step": 5825 }, { "entropy": 5.747597026824951, "epoch": 5.008594757198109, "grad_norm": 1.28125, "learning_rate": 0.0003072063194018438, "loss": 4.8885, "mean_token_accuracy": 0.23525934666395187, "num_tokens": 10597915.0, "step": 5830 }, { "entropy": 5.772410535812378, "epoch": 5.012892135797164, "grad_norm": 1.203125, "learning_rate": 0.0003068772253181648, "loss": 5.0912, "mean_token_accuracy": 0.21257753819227218, "num_tokens": 10606491.0, "step": 5835 }, { "entropy": 5.788251781463623, "epoch": 5.017189514396218, "grad_norm": 1.0859375, "learning_rate": 0.0003065480616273671, "loss": 5.0793, "mean_token_accuracy": 0.21656766831874846, "num_tokens": 10615852.0, "step": 5840 }, { "entropy": 5.813474178314209, "epoch": 5.021486892995273, "grad_norm": 1.28125, "learning_rate": 0.0003062188290482123, "loss": 5.0668, "mean_token_accuracy": 0.21779369860887526, "num_tokens": 10625442.0, "step": 5845 }, { "entropy": 5.729433870315551, "epoch": 5.025784271594327, "grad_norm": 1.0859375, "learning_rate": 0.00030588952829961304, "loss": 4.9891, "mean_token_accuracy": 0.23263396918773652, "num_tokens": 10634972.0, "step": 5850 }, { "entropy": 5.756176233291626, "epoch": 5.030081650193382, "grad_norm": 1.265625, "learning_rate": 0.0003055601601006303, "loss": 4.9781, "mean_token_accuracy": 0.22352562844753265, "num_tokens": 10644487.0, "step": 5855 }, { "entropy": 5.775532484054565, "epoch": 5.034379028792436, "grad_norm": 1.2734375, "learning_rate": 0.0003052307251704728, "loss": 5.0044, "mean_token_accuracy": 0.24283600449562073, "num_tokens": 10654144.0, "step": 5860 }, { "entropy": 5.713040828704834, "epoch": 5.038676407391491, "grad_norm": 1.4453125, "learning_rate": 0.0003049012242284946, "loss": 5.0475, "mean_token_accuracy": 0.21910769790410994, "num_tokens": 10663023.0, "step": 5865 }, { "entropy": 5.778925609588623, "epoch": 5.0429737859905455, "grad_norm": 1.2109375, "learning_rate": 0.0003045716579941941, "loss": 5.0938, "mean_token_accuracy": 0.21345814913511277, "num_tokens": 10672001.0, "step": 5870 }, { "entropy": 5.738844108581543, "epoch": 5.0472711645896, "grad_norm": 1.1015625, "learning_rate": 0.00030424202718721215, "loss": 5.0557, "mean_token_accuracy": 0.22208653390407562, "num_tokens": 10682654.0, "step": 5875 }, { "entropy": 5.726908826828003, "epoch": 5.051568543188655, "grad_norm": 1.296875, "learning_rate": 0.00030391233252733085, "loss": 5.0485, "mean_token_accuracy": 0.22415625005960466, "num_tokens": 10691429.0, "step": 5880 }, { "entropy": 5.722217750549317, "epoch": 5.055865921787709, "grad_norm": 1.171875, "learning_rate": 0.00030358257473447144, "loss": 5.0328, "mean_token_accuracy": 0.22570680379867553, "num_tokens": 10701130.0, "step": 5885 }, { "entropy": 5.732234096527099, "epoch": 5.060163300386764, "grad_norm": 1.234375, "learning_rate": 0.00030325275452869316, "loss": 4.9798, "mean_token_accuracy": 0.23107673078775406, "num_tokens": 10709779.0, "step": 5890 }, { "entropy": 5.676132869720459, "epoch": 5.064460678985819, "grad_norm": 1.21875, "learning_rate": 0.00030292287263019153, "loss": 5.0472, "mean_token_accuracy": 0.2280441015958786, "num_tokens": 10718795.0, "step": 5895 }, { "entropy": 5.61620192527771, "epoch": 5.068758057584874, "grad_norm": 1.25, "learning_rate": 0.00030259292975929675, "loss": 4.9615, "mean_token_accuracy": 0.23248845636844634, "num_tokens": 10728202.0, "step": 5900 }, { "entropy": 5.742444896697998, "epoch": 5.073055436183928, "grad_norm": 1.1171875, "learning_rate": 0.0003022629266364723, "loss": 4.9959, "mean_token_accuracy": 0.22648665606975554, "num_tokens": 10737050.0, "step": 5905 }, { "entropy": 5.765759754180908, "epoch": 5.077352814782983, "grad_norm": 1.328125, "learning_rate": 0.00030193286398231276, "loss": 5.0042, "mean_token_accuracy": 0.23227498382329942, "num_tokens": 10745261.0, "step": 5910 }, { "entropy": 5.716326808929443, "epoch": 5.081650193382037, "grad_norm": 1.21875, "learning_rate": 0.00030160274251754337, "loss": 5.0737, "mean_token_accuracy": 0.22183098196983336, "num_tokens": 10755008.0, "step": 5915 }, { "entropy": 5.730919981002808, "epoch": 5.085947571981092, "grad_norm": 1.2421875, "learning_rate": 0.00030127256296301724, "loss": 5.0783, "mean_token_accuracy": 0.2172458812594414, "num_tokens": 10763951.0, "step": 5920 }, { "entropy": 5.725339317321778, "epoch": 5.090244950580146, "grad_norm": 1.1484375, "learning_rate": 0.0003009423260397148, "loss": 5.0167, "mean_token_accuracy": 0.22735593020915984, "num_tokens": 10772770.0, "step": 5925 }, { "entropy": 5.728106307983398, "epoch": 5.094542329179201, "grad_norm": 1.1953125, "learning_rate": 0.00030061203246874125, "loss": 5.0864, "mean_token_accuracy": 0.21524910628795624, "num_tokens": 10781827.0, "step": 5930 }, { "entropy": 5.795179605484009, "epoch": 5.0988397077782555, "grad_norm": 1.3125, "learning_rate": 0.00030028168297132593, "loss": 5.1495, "mean_token_accuracy": 0.2189791604876518, "num_tokens": 10792321.0, "step": 5935 }, { "entropy": 5.710354375839233, "epoch": 5.10313708637731, "grad_norm": 1.328125, "learning_rate": 0.0002999512782688199, "loss": 5.1125, "mean_token_accuracy": 0.22242486327886582, "num_tokens": 10801689.0, "step": 5940 }, { "entropy": 5.7583554744720455, "epoch": 5.1074344649763646, "grad_norm": 1.1953125, "learning_rate": 0.0002996208190826951, "loss": 5.0227, "mean_token_accuracy": 0.2247595891356468, "num_tokens": 10810513.0, "step": 5945 }, { "entropy": 5.741311025619507, "epoch": 5.111731843575419, "grad_norm": 1.171875, "learning_rate": 0.00029929030613454227, "loss": 4.992, "mean_token_accuracy": 0.2273296982049942, "num_tokens": 10819581.0, "step": 5950 }, { "entropy": 5.697500371932984, "epoch": 5.116029222174474, "grad_norm": 1.2109375, "learning_rate": 0.0002989597401460697, "loss": 4.9943, "mean_token_accuracy": 0.2291257008910179, "num_tokens": 10828139.0, "step": 5955 }, { "entropy": 5.706188678741455, "epoch": 5.120326600773528, "grad_norm": 1.34375, "learning_rate": 0.00029862912183910105, "loss": 5.0124, "mean_token_accuracy": 0.22806197851896287, "num_tokens": 10836256.0, "step": 5960 }, { "entropy": 5.734335613250733, "epoch": 5.124623979372583, "grad_norm": 1.2109375, "learning_rate": 0.00029829845193557496, "loss": 5.0376, "mean_token_accuracy": 0.22361504435539245, "num_tokens": 10846255.0, "step": 5965 }, { "entropy": 5.714362859725952, "epoch": 5.128921357971637, "grad_norm": 1.2578125, "learning_rate": 0.0002979677311575421, "loss": 5.0333, "mean_token_accuracy": 0.23065778315067292, "num_tokens": 10855546.0, "step": 5970 }, { "entropy": 5.773637056350708, "epoch": 5.133218736570692, "grad_norm": 1.3125, "learning_rate": 0.0002976369602271646, "loss": 5.1018, "mean_token_accuracy": 0.22361014485359193, "num_tokens": 10864417.0, "step": 5975 }, { "entropy": 5.773756456375122, "epoch": 5.137516115169746, "grad_norm": 1.15625, "learning_rate": 0.0002973061398667138, "loss": 5.0901, "mean_token_accuracy": 0.22015978991985322, "num_tokens": 10874527.0, "step": 5980 }, { "entropy": 5.735579776763916, "epoch": 5.141813493768801, "grad_norm": 1.296875, "learning_rate": 0.00029697527079856916, "loss": 5.0971, "mean_token_accuracy": 0.2176682710647583, "num_tokens": 10883486.0, "step": 5985 }, { "entropy": 5.7296717166900635, "epoch": 5.1461108723678555, "grad_norm": 1.296875, "learning_rate": 0.00029664435374521665, "loss": 4.9762, "mean_token_accuracy": 0.22494551688432693, "num_tokens": 10891972.0, "step": 5990 }, { "entropy": 5.744566440582275, "epoch": 5.15040825096691, "grad_norm": 1.21875, "learning_rate": 0.00029631338942924664, "loss": 5.0027, "mean_token_accuracy": 0.23141432255506517, "num_tokens": 10901350.0, "step": 5995 }, { "entropy": 5.698413944244384, "epoch": 5.154705629565965, "grad_norm": 1.3671875, "learning_rate": 0.0002959823785733531, "loss": 4.9972, "mean_token_accuracy": 0.22052146643400192, "num_tokens": 10910114.0, "step": 6000 }, { "epoch": 5.154705629565965, "eval_entropy": 5.505302819582793, "eval_loss": 5.905168533325195, "eval_mean_token_accuracy": 0.1798398159444332, "eval_num_tokens": 10910114.0, "eval_runtime": 2.0504, "eval_samples_per_second": 1730.888, "eval_steps_per_second": 216.544, "step": 6000 }, { "entropy": 5.703116512298584, "epoch": 5.159003008165019, "grad_norm": 1.328125, "learning_rate": 0.0002956513219003312, "loss": 5.0708, "mean_token_accuracy": 0.22419020235538484, "num_tokens": 10919781.0, "step": 6005 }, { "entropy": 5.713309383392334, "epoch": 5.163300386764074, "grad_norm": 1.21875, "learning_rate": 0.00029532022013307666, "loss": 5.0656, "mean_token_accuracy": 0.2243061915040016, "num_tokens": 10929561.0, "step": 6010 }, { "entropy": 5.7261634349823, "epoch": 5.167597765363128, "grad_norm": 1.171875, "learning_rate": 0.00029498907399458325, "loss": 5.0387, "mean_token_accuracy": 0.22178855687379836, "num_tokens": 10939123.0, "step": 6015 }, { "entropy": 5.791707181930542, "epoch": 5.171895143962183, "grad_norm": 1.265625, "learning_rate": 0.0002946578842079418, "loss": 5.0776, "mean_token_accuracy": 0.22488960474729539, "num_tokens": 10947990.0, "step": 6020 }, { "entropy": 5.73942985534668, "epoch": 5.176192522561237, "grad_norm": 1.3515625, "learning_rate": 0.0002943266514963384, "loss": 4.9908, "mean_token_accuracy": 0.2356729969382286, "num_tokens": 10956569.0, "step": 6025 }, { "entropy": 5.6919941902160645, "epoch": 5.180489901160292, "grad_norm": 1.3515625, "learning_rate": 0.0002939953765830529, "loss": 5.0481, "mean_token_accuracy": 0.2131089001893997, "num_tokens": 10965466.0, "step": 6030 }, { "entropy": 5.7069274425506595, "epoch": 5.184787279759346, "grad_norm": 1.1796875, "learning_rate": 0.00029366406019145735, "loss": 5.0639, "mean_token_accuracy": 0.21831514239311217, "num_tokens": 10975051.0, "step": 6035 }, { "entropy": 5.679154253005981, "epoch": 5.189084658358402, "grad_norm": 1.3046875, "learning_rate": 0.0002933327030450143, "loss": 4.9649, "mean_token_accuracy": 0.22860484719276428, "num_tokens": 10983940.0, "step": 6040 }, { "entropy": 5.743462324142456, "epoch": 5.193382036957456, "grad_norm": 1.2734375, "learning_rate": 0.00029300130586727545, "loss": 5.0794, "mean_token_accuracy": 0.2147662729024887, "num_tokens": 10994029.0, "step": 6045 }, { "entropy": 5.774721336364746, "epoch": 5.197679415556511, "grad_norm": 1.1640625, "learning_rate": 0.00029266986938187943, "loss": 5.1232, "mean_token_accuracy": 0.2085681289434433, "num_tokens": 11003616.0, "step": 6050 }, { "entropy": 5.690842819213867, "epoch": 5.2019767941555655, "grad_norm": 1.2265625, "learning_rate": 0.0002923383943125514, "loss": 5.0045, "mean_token_accuracy": 0.22171231657266616, "num_tokens": 11012068.0, "step": 6055 }, { "entropy": 5.7006793975830075, "epoch": 5.20627417275462, "grad_norm": 1.3671875, "learning_rate": 0.0002920068813831002, "loss": 5.0307, "mean_token_accuracy": 0.22483575940132142, "num_tokens": 11020510.0, "step": 6060 }, { "entropy": 5.684882783889771, "epoch": 5.210571551353675, "grad_norm": 1.21875, "learning_rate": 0.0002916753313174178, "loss": 5.0806, "mean_token_accuracy": 0.21965396404266357, "num_tokens": 11029804.0, "step": 6065 }, { "entropy": 5.7044366836547855, "epoch": 5.214868929952729, "grad_norm": 1.296875, "learning_rate": 0.0002913437448394768, "loss": 5.0667, "mean_token_accuracy": 0.22526153475046157, "num_tokens": 11038586.0, "step": 6070 }, { "entropy": 5.8208294868469235, "epoch": 5.219166308551784, "grad_norm": 1.1953125, "learning_rate": 0.00029101212267332955, "loss": 5.1221, "mean_token_accuracy": 0.22288703322410583, "num_tokens": 11048240.0, "step": 6075 }, { "entropy": 5.76058988571167, "epoch": 5.223463687150838, "grad_norm": 1.1484375, "learning_rate": 0.00029068046554310637, "loss": 5.0108, "mean_token_accuracy": 0.21949543207883834, "num_tokens": 11056703.0, "step": 6080 }, { "entropy": 5.705720090866089, "epoch": 5.227761065749893, "grad_norm": 1.109375, "learning_rate": 0.0002903487741730139, "loss": 5.048, "mean_token_accuracy": 0.21768798530101777, "num_tokens": 11066246.0, "step": 6085 }, { "entropy": 5.734841060638428, "epoch": 5.232058444348947, "grad_norm": 1.296875, "learning_rate": 0.00029001704928733354, "loss": 5.0935, "mean_token_accuracy": 0.22335942834615707, "num_tokens": 11075277.0, "step": 6090 }, { "entropy": 5.6772912502288815, "epoch": 5.236355822948002, "grad_norm": 1.234375, "learning_rate": 0.0002896852916104198, "loss": 4.9675, "mean_token_accuracy": 0.23530965745449067, "num_tokens": 11083759.0, "step": 6095 }, { "entropy": 5.768802165985107, "epoch": 5.240653201547056, "grad_norm": 1.28125, "learning_rate": 0.0002893535018666988, "loss": 4.9656, "mean_token_accuracy": 0.22874611467123032, "num_tokens": 11091960.0, "step": 6100 }, { "entropy": 5.67240538597107, "epoch": 5.244950580146111, "grad_norm": 1.4453125, "learning_rate": 0.00028902168078066674, "loss": 5.0937, "mean_token_accuracy": 0.2179962605237961, "num_tokens": 11101134.0, "step": 6105 }, { "entropy": 5.707141494750976, "epoch": 5.2492479587451655, "grad_norm": 1.296875, "learning_rate": 0.0002886898290768883, "loss": 5.0423, "mean_token_accuracy": 0.22455137670040132, "num_tokens": 11110282.0, "step": 6110 }, { "entropy": 5.73919997215271, "epoch": 5.25354533734422, "grad_norm": 1.25, "learning_rate": 0.000288357947479995, "loss": 5.1048, "mean_token_accuracy": 0.22395750284194946, "num_tokens": 11119591.0, "step": 6115 }, { "entropy": 5.717151689529419, "epoch": 5.257842715943275, "grad_norm": 1.125, "learning_rate": 0.00028802603671468347, "loss": 5.093, "mean_token_accuracy": 0.21745581328868865, "num_tokens": 11129164.0, "step": 6120 }, { "entropy": 5.73230209350586, "epoch": 5.262140094542329, "grad_norm": 1.234375, "learning_rate": 0.00028769409750571413, "loss": 5.051, "mean_token_accuracy": 0.22113343924283982, "num_tokens": 11137973.0, "step": 6125 }, { "entropy": 5.745645236968994, "epoch": 5.266437473141384, "grad_norm": 1.1796875, "learning_rate": 0.00028736213057790975, "loss": 5.1262, "mean_token_accuracy": 0.21628147065639497, "num_tokens": 11147285.0, "step": 6130 }, { "entropy": 5.7566581726074215, "epoch": 5.270734851740438, "grad_norm": 1.3359375, "learning_rate": 0.0002870301366561533, "loss": 5.0881, "mean_token_accuracy": 0.2212749555706978, "num_tokens": 11155303.0, "step": 6135 }, { "entropy": 5.701379346847534, "epoch": 5.275032230339493, "grad_norm": 1.40625, "learning_rate": 0.0002866981164653867, "loss": 4.9684, "mean_token_accuracy": 0.2331068202853203, "num_tokens": 11163553.0, "step": 6140 }, { "entropy": 5.659363603591919, "epoch": 5.279329608938547, "grad_norm": 1.4921875, "learning_rate": 0.0002863660707306095, "loss": 4.9041, "mean_token_accuracy": 0.23561924546957017, "num_tokens": 11171865.0, "step": 6145 }, { "entropy": 5.717046880722046, "epoch": 5.283626987537602, "grad_norm": 1.15625, "learning_rate": 0.00028603400017687675, "loss": 5.0903, "mean_token_accuracy": 0.21897276788949965, "num_tokens": 11181137.0, "step": 6150 }, { "entropy": 5.599358224868775, "epoch": 5.287924366136656, "grad_norm": 1.2578125, "learning_rate": 0.00028570190552929794, "loss": 4.9649, "mean_token_accuracy": 0.23953897058963775, "num_tokens": 11190174.0, "step": 6155 }, { "entropy": 5.644610738754272, "epoch": 5.292221744735711, "grad_norm": 1.3515625, "learning_rate": 0.000285369787513035, "loss": 4.987, "mean_token_accuracy": 0.22932439893484116, "num_tokens": 11197964.0, "step": 6160 }, { "entropy": 5.768923711776734, "epoch": 5.2965191233347655, "grad_norm": 1.09375, "learning_rate": 0.00028503764685330077, "loss": 5.1084, "mean_token_accuracy": 0.2178024098277092, "num_tokens": 11207974.0, "step": 6165 }, { "entropy": 5.669016408920288, "epoch": 5.30081650193382, "grad_norm": 1.3515625, "learning_rate": 0.00028470548427535794, "loss": 5.0318, "mean_token_accuracy": 0.22539808452129365, "num_tokens": 11216430.0, "step": 6170 }, { "entropy": 5.72750072479248, "epoch": 5.305113880532875, "grad_norm": 1.2109375, "learning_rate": 0.00028437330050451654, "loss": 5.1328, "mean_token_accuracy": 0.21367713510990144, "num_tokens": 11226189.0, "step": 6175 }, { "entropy": 5.705896663665771, "epoch": 5.309411259131929, "grad_norm": 1.34375, "learning_rate": 0.0002840410962661334, "loss": 5.088, "mean_token_accuracy": 0.2210001528263092, "num_tokens": 11234691.0, "step": 6180 }, { "entropy": 5.660514640808105, "epoch": 5.313708637730985, "grad_norm": 1.328125, "learning_rate": 0.0002837088722856098, "loss": 4.9318, "mean_token_accuracy": 0.23419011533260345, "num_tokens": 11243852.0, "step": 6185 }, { "entropy": 5.72763729095459, "epoch": 5.318006016330038, "grad_norm": 1.234375, "learning_rate": 0.00028337662928838996, "loss": 5.1053, "mean_token_accuracy": 0.21972943395376204, "num_tokens": 11253416.0, "step": 6190 }, { "entropy": 5.662899398803711, "epoch": 5.322303394929094, "grad_norm": 1.1015625, "learning_rate": 0.00028304436799995986, "loss": 4.997, "mean_token_accuracy": 0.2329850599169731, "num_tokens": 11262869.0, "step": 6195 }, { "entropy": 5.765104818344116, "epoch": 5.326600773528148, "grad_norm": 1.1875, "learning_rate": 0.00028271208914584534, "loss": 5.1133, "mean_token_accuracy": 0.21480259597301482, "num_tokens": 11272386.0, "step": 6200 }, { "entropy": 5.706294012069702, "epoch": 5.330898152127203, "grad_norm": 1.2578125, "learning_rate": 0.00028237979345161065, "loss": 4.9803, "mean_token_accuracy": 0.22452399581670762, "num_tokens": 11281590.0, "step": 6205 }, { "entropy": 5.694635534286499, "epoch": 5.335195530726257, "grad_norm": 1.3515625, "learning_rate": 0.0002820474816428568, "loss": 5.0398, "mean_token_accuracy": 0.22545266300439834, "num_tokens": 11290873.0, "step": 6210 }, { "entropy": 5.7399656772613525, "epoch": 5.339492909325312, "grad_norm": 1.328125, "learning_rate": 0.0002817151544452198, "loss": 4.9911, "mean_token_accuracy": 0.22447988986968995, "num_tokens": 11299064.0, "step": 6215 }, { "entropy": 5.719579458236694, "epoch": 5.343790287924366, "grad_norm": 1.234375, "learning_rate": 0.00028138281258436947, "loss": 5.0504, "mean_token_accuracy": 0.21890538781881333, "num_tokens": 11307390.0, "step": 6220 }, { "entropy": 5.641092538833618, "epoch": 5.348087666523421, "grad_norm": 1.4375, "learning_rate": 0.0002810504567860078, "loss": 4.9957, "mean_token_accuracy": 0.23052176386117934, "num_tokens": 11315606.0, "step": 6225 }, { "entropy": 5.78738374710083, "epoch": 5.3523850451224755, "grad_norm": 1.3359375, "learning_rate": 0.0002807180877758667, "loss": 5.1199, "mean_token_accuracy": 0.21379440873861313, "num_tokens": 11323821.0, "step": 6230 }, { "entropy": 5.668607139587403, "epoch": 5.35668242372153, "grad_norm": 1.4453125, "learning_rate": 0.00028038570627970754, "loss": 5.0706, "mean_token_accuracy": 0.22117573767900467, "num_tokens": 11331850.0, "step": 6235 }, { "entropy": 5.657337856292725, "epoch": 5.360979802320585, "grad_norm": 1.4296875, "learning_rate": 0.0002800533130233184, "loss": 5.0238, "mean_token_accuracy": 0.2220548003911972, "num_tokens": 11340125.0, "step": 6240 }, { "entropy": 5.743132734298706, "epoch": 5.365277180919639, "grad_norm": 1.3203125, "learning_rate": 0.0002797209087325135, "loss": 5.0623, "mean_token_accuracy": 0.21825114637613297, "num_tokens": 11349184.0, "step": 6245 }, { "entropy": 5.771504831314087, "epoch": 5.369574559518694, "grad_norm": 1.2578125, "learning_rate": 0.00027938849413313083, "loss": 5.1051, "mean_token_accuracy": 0.21638326942920685, "num_tokens": 11357536.0, "step": 6250 }, { "entropy": 5.719394874572754, "epoch": 5.373871938117748, "grad_norm": 1.1953125, "learning_rate": 0.000279056069951031, "loss": 5.0973, "mean_token_accuracy": 0.2208126589655876, "num_tokens": 11367242.0, "step": 6255 }, { "entropy": 5.701067686080933, "epoch": 5.378169316716803, "grad_norm": 1.3046875, "learning_rate": 0.00027872363691209564, "loss": 5.0424, "mean_token_accuracy": 0.2230653151869774, "num_tokens": 11374932.0, "step": 6260 }, { "entropy": 5.692240571975708, "epoch": 5.382466695315857, "grad_norm": 1.328125, "learning_rate": 0.0002783911957422256, "loss": 5.0384, "mean_token_accuracy": 0.22070774734020232, "num_tokens": 11383575.0, "step": 6265 }, { "entropy": 5.7348823070526125, "epoch": 5.386764073914912, "grad_norm": 1.3046875, "learning_rate": 0.0002780587471673394, "loss": 5.0771, "mean_token_accuracy": 0.21955856680870056, "num_tokens": 11392285.0, "step": 6270 }, { "entropy": 5.7097772598266605, "epoch": 5.391061452513966, "grad_norm": 1.25, "learning_rate": 0.00027772629191337206, "loss": 5.0487, "mean_token_accuracy": 0.22507010102272035, "num_tokens": 11401054.0, "step": 6275 }, { "entropy": 5.684504842758178, "epoch": 5.395358831113021, "grad_norm": 1.3671875, "learning_rate": 0.00027739383070627283, "loss": 5.0698, "mean_token_accuracy": 0.2200905501842499, "num_tokens": 11410529.0, "step": 6280 }, { "entropy": 5.719993925094604, "epoch": 5.3996562097120755, "grad_norm": 1.234375, "learning_rate": 0.0002770613642720041, "loss": 5.0371, "mean_token_accuracy": 0.2246679574251175, "num_tokens": 11419961.0, "step": 6285 }, { "entropy": 5.782123231887818, "epoch": 5.40395358831113, "grad_norm": 1.125, "learning_rate": 0.00027672889333653984, "loss": 5.1644, "mean_token_accuracy": 0.20422402620315552, "num_tokens": 11429529.0, "step": 6290 }, { "entropy": 5.693040895462036, "epoch": 5.408250966910185, "grad_norm": 1.1875, "learning_rate": 0.0002763964186258635, "loss": 5.0301, "mean_token_accuracy": 0.22416374385356902, "num_tokens": 11438254.0, "step": 6295 }, { "entropy": 5.702614498138428, "epoch": 5.412548345509239, "grad_norm": 1.203125, "learning_rate": 0.0002760639408659671, "loss": 5.0978, "mean_token_accuracy": 0.2164264589548111, "num_tokens": 11447587.0, "step": 6300 }, { "entropy": 5.698922777175904, "epoch": 5.416845724108294, "grad_norm": 1.375, "learning_rate": 0.0002757314607828489, "loss": 5.0377, "mean_token_accuracy": 0.22288380414247513, "num_tokens": 11455493.0, "step": 6305 }, { "entropy": 5.725058221817017, "epoch": 5.421143102707348, "grad_norm": 1.328125, "learning_rate": 0.00027539897910251293, "loss": 5.0063, "mean_token_accuracy": 0.23147807270288467, "num_tokens": 11464143.0, "step": 6310 }, { "entropy": 5.668506097793579, "epoch": 5.425440481306403, "grad_norm": 1.3671875, "learning_rate": 0.00027506649655096595, "loss": 4.9814, "mean_token_accuracy": 0.2202653095126152, "num_tokens": 11471813.0, "step": 6315 }, { "entropy": 5.662840414047241, "epoch": 5.429737859905457, "grad_norm": 1.3125, "learning_rate": 0.0002747340138542171, "loss": 5.0057, "mean_token_accuracy": 0.2203777849674225, "num_tokens": 11481374.0, "step": 6320 }, { "entropy": 5.767701101303101, "epoch": 5.434035238504512, "grad_norm": 1.1953125, "learning_rate": 0.0002744015317382757, "loss": 5.1746, "mean_token_accuracy": 0.2164507880806923, "num_tokens": 11490575.0, "step": 6325 }, { "entropy": 5.757967758178711, "epoch": 5.438332617103566, "grad_norm": 1.2265625, "learning_rate": 0.0002740690509291498, "loss": 5.0806, "mean_token_accuracy": 0.22688030898571016, "num_tokens": 11499898.0, "step": 6330 }, { "entropy": 5.733929872512817, "epoch": 5.442629995702621, "grad_norm": 1.3828125, "learning_rate": 0.0002737365721528445, "loss": 5.0643, "mean_token_accuracy": 0.21820032298564912, "num_tokens": 11508544.0, "step": 6335 }, { "entropy": 5.6999755859375, "epoch": 5.446927374301676, "grad_norm": 1.2421875, "learning_rate": 0.0002734040961353607, "loss": 5.0513, "mean_token_accuracy": 0.22286413460969925, "num_tokens": 11519239.0, "step": 6340 }, { "entropy": 5.707623910903931, "epoch": 5.451224752900731, "grad_norm": 1.2734375, "learning_rate": 0.000273071623602693, "loss": 5.0382, "mean_token_accuracy": 0.22144442051649094, "num_tokens": 11529014.0, "step": 6345 }, { "entropy": 5.712731885910034, "epoch": 5.4555221314997855, "grad_norm": 1.25, "learning_rate": 0.00027273915528082865, "loss": 5.0207, "mean_token_accuracy": 0.22821328788995743, "num_tokens": 11538367.0, "step": 6350 }, { "entropy": 5.784208154678344, "epoch": 5.45981951009884, "grad_norm": 1.2109375, "learning_rate": 0.0002724066918957455, "loss": 5.1945, "mean_token_accuracy": 0.20518641024827958, "num_tokens": 11548166.0, "step": 6355 }, { "entropy": 5.650224399566651, "epoch": 5.464116888697895, "grad_norm": 1.2265625, "learning_rate": 0.0002720742341734107, "loss": 5.0393, "mean_token_accuracy": 0.22485454976558686, "num_tokens": 11557187.0, "step": 6360 }, { "entropy": 5.770067548751831, "epoch": 5.468414267296949, "grad_norm": 1.34375, "learning_rate": 0.00027174178283977904, "loss": 5.0648, "mean_token_accuracy": 0.21925112903118132, "num_tokens": 11566181.0, "step": 6365 }, { "entropy": 5.679496145248413, "epoch": 5.472711645896004, "grad_norm": 1.171875, "learning_rate": 0.00027140933862079136, "loss": 5.0389, "mean_token_accuracy": 0.22760843932628633, "num_tokens": 11576157.0, "step": 6370 }, { "entropy": 5.662256336212158, "epoch": 5.477009024495058, "grad_norm": 1.3125, "learning_rate": 0.000271076902242373, "loss": 5.0069, "mean_token_accuracy": 0.22589283734560012, "num_tokens": 11585325.0, "step": 6375 }, { "entropy": 5.733037662506104, "epoch": 5.481306403094113, "grad_norm": 1.2109375, "learning_rate": 0.000270744474430432, "loss": 4.9985, "mean_token_accuracy": 0.22967931479215623, "num_tokens": 11594623.0, "step": 6380 }, { "entropy": 5.782998085021973, "epoch": 5.485603781693167, "grad_norm": 1.265625, "learning_rate": 0.000270412055910858, "loss": 5.1686, "mean_token_accuracy": 0.21681290566921235, "num_tokens": 11604370.0, "step": 6385 }, { "entropy": 5.643405389785767, "epoch": 5.489901160292222, "grad_norm": 1.1953125, "learning_rate": 0.0002700796474095201, "loss": 4.9855, "mean_token_accuracy": 0.23277317434549333, "num_tokens": 11613779.0, "step": 6390 }, { "entropy": 5.7119005680084225, "epoch": 5.494198538891276, "grad_norm": 1.34375, "learning_rate": 0.0002697472496522656, "loss": 5.0583, "mean_token_accuracy": 0.22339898496866226, "num_tokens": 11623037.0, "step": 6395 }, { "entropy": 5.762480592727661, "epoch": 5.498495917490331, "grad_norm": 1.25, "learning_rate": 0.0002694148633649184, "loss": 5.0846, "mean_token_accuracy": 0.21870031207799911, "num_tokens": 11631640.0, "step": 6400 }, { "entropy": 5.7132039070129395, "epoch": 5.5027932960893855, "grad_norm": 1.359375, "learning_rate": 0.0002690824892732772, "loss": 5.0635, "mean_token_accuracy": 0.22984230667352676, "num_tokens": 11640500.0, "step": 6405 }, { "entropy": 5.727060127258301, "epoch": 5.50709067468844, "grad_norm": 1.328125, "learning_rate": 0.0002687501281031142, "loss": 5.0927, "mean_token_accuracy": 0.21840092092752456, "num_tokens": 11649173.0, "step": 6410 }, { "entropy": 5.5954478740692135, "epoch": 5.511388053287495, "grad_norm": 1.3203125, "learning_rate": 0.0002684177805801734, "loss": 4.9503, "mean_token_accuracy": 0.23530040234327315, "num_tokens": 11658808.0, "step": 6415 }, { "entropy": 5.721831846237182, "epoch": 5.515685431886549, "grad_norm": 1.1875, "learning_rate": 0.00026808544743016886, "loss": 5.027, "mean_token_accuracy": 0.2200983464717865, "num_tokens": 11667600.0, "step": 6420 }, { "entropy": 5.680030441284179, "epoch": 5.519982810485604, "grad_norm": 1.359375, "learning_rate": 0.0002677531293787835, "loss": 5.0687, "mean_token_accuracy": 0.21921883374452591, "num_tokens": 11675597.0, "step": 6425 }, { "entropy": 5.611881399154663, "epoch": 5.524280189084658, "grad_norm": 1.265625, "learning_rate": 0.000267420827151667, "loss": 4.9903, "mean_token_accuracy": 0.22676096260547637, "num_tokens": 11684549.0, "step": 6430 }, { "entropy": 5.70304045677185, "epoch": 5.528577567683713, "grad_norm": 1.3125, "learning_rate": 0.0002670885414744347, "loss": 5.0797, "mean_token_accuracy": 0.22514398992061616, "num_tokens": 11693043.0, "step": 6435 }, { "entropy": 5.821166849136352, "epoch": 5.532874946282767, "grad_norm": 1.2265625, "learning_rate": 0.0002667562730726655, "loss": 5.172, "mean_token_accuracy": 0.21643173396587373, "num_tokens": 11702982.0, "step": 6440 }, { "entropy": 5.741958427429199, "epoch": 5.537172324881822, "grad_norm": 1.1953125, "learning_rate": 0.00026642402267190095, "loss": 5.1484, "mean_token_accuracy": 0.21787002980709075, "num_tokens": 11711994.0, "step": 6445 }, { "entropy": 5.768466472625732, "epoch": 5.541469703480876, "grad_norm": 1.3046875, "learning_rate": 0.00026609179099764313, "loss": 5.1038, "mean_token_accuracy": 0.22068035900592803, "num_tokens": 11722165.0, "step": 6450 }, { "entropy": 5.724889326095581, "epoch": 5.545767082079931, "grad_norm": 1.21875, "learning_rate": 0.00026575957877535323, "loss": 5.0716, "mean_token_accuracy": 0.21890437453985215, "num_tokens": 11731265.0, "step": 6455 }, { "entropy": 5.697373151779175, "epoch": 5.5500644606789855, "grad_norm": 1.21875, "learning_rate": 0.00026542738673044985, "loss": 5.0947, "mean_token_accuracy": 0.2167985513806343, "num_tokens": 11741779.0, "step": 6460 }, { "entropy": 5.6448475360870365, "epoch": 5.55436183927804, "grad_norm": 1.359375, "learning_rate": 0.0002650952155883077, "loss": 5.0737, "mean_token_accuracy": 0.221512633562088, "num_tokens": 11749976.0, "step": 6465 }, { "entropy": 5.717658185958863, "epoch": 5.558659217877095, "grad_norm": 1.2109375, "learning_rate": 0.0002647630660742559, "loss": 5.0631, "mean_token_accuracy": 0.21613216102123262, "num_tokens": 11759781.0, "step": 6470 }, { "entropy": 5.73361644744873, "epoch": 5.56295659647615, "grad_norm": 1.109375, "learning_rate": 0.000264430938913576, "loss": 5.0306, "mean_token_accuracy": 0.22538082003593446, "num_tokens": 11769544.0, "step": 6475 }, { "entropy": 5.743956756591797, "epoch": 5.567253975075204, "grad_norm": 1.4375, "learning_rate": 0.00026409883483150123, "loss": 5.0235, "mean_token_accuracy": 0.223893466591835, "num_tokens": 11778831.0, "step": 6480 }, { "entropy": 5.692667055130005, "epoch": 5.571551353674259, "grad_norm": 1.2421875, "learning_rate": 0.000263766754553214, "loss": 5.1075, "mean_token_accuracy": 0.21305216550827027, "num_tokens": 11788813.0, "step": 6485 }, { "entropy": 5.721110439300537, "epoch": 5.575848732273313, "grad_norm": 1.234375, "learning_rate": 0.0002634346988038448, "loss": 5.0485, "mean_token_accuracy": 0.22178650200366973, "num_tokens": 11797335.0, "step": 6490 }, { "entropy": 5.712099361419678, "epoch": 5.580146110872368, "grad_norm": 1.3203125, "learning_rate": 0.00026310266830847093, "loss": 5.0608, "mean_token_accuracy": 0.22487997114658356, "num_tokens": 11806741.0, "step": 6495 }, { "entropy": 5.78537073135376, "epoch": 5.584443489471423, "grad_norm": 1.265625, "learning_rate": 0.00026277066379211406, "loss": 5.1082, "mean_token_accuracy": 0.2191992685198784, "num_tokens": 11815551.0, "step": 6500 }, { "epoch": 5.584443489471423, "eval_entropy": 5.526053285813546, "eval_loss": 5.88471794128418, "eval_mean_token_accuracy": 0.18178187918152894, "eval_num_tokens": 11815551.0, "eval_runtime": 2.2546, "eval_samples_per_second": 1574.098, "eval_steps_per_second": 196.929, "step": 6500 }, { "entropy": 5.77349157333374, "epoch": 5.588740868070477, "grad_norm": 1.3046875, "learning_rate": 0.0002624386859797396, "loss": 5.1239, "mean_token_accuracy": 0.21715308576822281, "num_tokens": 11824483.0, "step": 6505 }, { "entropy": 5.620351982116699, "epoch": 5.593038246669532, "grad_norm": 1.3125, "learning_rate": 0.00026210673559625406, "loss": 4.9251, "mean_token_accuracy": 0.2342335268855095, "num_tokens": 11832383.0, "step": 6510 }, { "entropy": 5.736187314987182, "epoch": 5.597335625268586, "grad_norm": 1.453125, "learning_rate": 0.0002617748133665047, "loss": 5.1511, "mean_token_accuracy": 0.2210177078843117, "num_tokens": 11841430.0, "step": 6515 }, { "entropy": 5.713363361358643, "epoch": 5.601633003867641, "grad_norm": 1.2578125, "learning_rate": 0.0002614429200152768, "loss": 5.1042, "mean_token_accuracy": 0.21721142828464507, "num_tokens": 11850863.0, "step": 6520 }, { "entropy": 5.686949777603149, "epoch": 5.6059303824666955, "grad_norm": 1.359375, "learning_rate": 0.000261111056267293, "loss": 4.9908, "mean_token_accuracy": 0.223213329911232, "num_tokens": 11859392.0, "step": 6525 }, { "entropy": 5.678573751449585, "epoch": 5.61022776106575, "grad_norm": 1.28125, "learning_rate": 0.00026077922284721084, "loss": 5.0374, "mean_token_accuracy": 0.22779326885938644, "num_tokens": 11868762.0, "step": 6530 }, { "entropy": 5.663127517700195, "epoch": 5.614525139664805, "grad_norm": 1.390625, "learning_rate": 0.00026044742047962206, "loss": 5.0065, "mean_token_accuracy": 0.22907865047454834, "num_tokens": 11876722.0, "step": 6535 }, { "entropy": 5.705111980438232, "epoch": 5.618822518263859, "grad_norm": 1.265625, "learning_rate": 0.00026011564988905023, "loss": 5.1385, "mean_token_accuracy": 0.2226913809776306, "num_tokens": 11885614.0, "step": 6540 }, { "entropy": 5.803359842300415, "epoch": 5.623119896862914, "grad_norm": 1.2890625, "learning_rate": 0.0002597839117999499, "loss": 5.1434, "mean_token_accuracy": 0.21973469406366347, "num_tokens": 11894702.0, "step": 6545 }, { "entropy": 5.694731378555298, "epoch": 5.627417275461968, "grad_norm": 1.3828125, "learning_rate": 0.0002594522069367044, "loss": 5.0213, "mean_token_accuracy": 0.2334420010447502, "num_tokens": 11902829.0, "step": 6550 }, { "entropy": 5.662298774719238, "epoch": 5.631714654061023, "grad_norm": 1.1953125, "learning_rate": 0.0002591205360236245, "loss": 5.074, "mean_token_accuracy": 0.22429724633693696, "num_tokens": 11912377.0, "step": 6555 }, { "entropy": 5.692343282699585, "epoch": 5.636012032660077, "grad_norm": 1.390625, "learning_rate": 0.000258788899784947, "loss": 5.0625, "mean_token_accuracy": 0.21530526280403137, "num_tokens": 11920563.0, "step": 6560 }, { "entropy": 5.775351047515869, "epoch": 5.640309411259132, "grad_norm": 1.1640625, "learning_rate": 0.00025845729894483283, "loss": 5.097, "mean_token_accuracy": 0.21399070620536803, "num_tokens": 11930190.0, "step": 6565 }, { "entropy": 5.762508678436279, "epoch": 5.644606789858186, "grad_norm": 1.21875, "learning_rate": 0.0002581257342273657, "loss": 5.1624, "mean_token_accuracy": 0.2138037234544754, "num_tokens": 11939840.0, "step": 6570 }, { "entropy": 5.710089302062988, "epoch": 5.648904168457241, "grad_norm": 1.34375, "learning_rate": 0.0002577942063565504, "loss": 5.0784, "mean_token_accuracy": 0.2212178960442543, "num_tokens": 11948260.0, "step": 6575 }, { "entropy": 5.701170206069946, "epoch": 5.6532015470562955, "grad_norm": 1.390625, "learning_rate": 0.0002574627160563114, "loss": 5.1242, "mean_token_accuracy": 0.2222455382347107, "num_tokens": 11956776.0, "step": 6580 }, { "entropy": 5.808792400360107, "epoch": 5.65749892565535, "grad_norm": 1.3203125, "learning_rate": 0.0002571312640504909, "loss": 5.1515, "mean_token_accuracy": 0.21467148512601852, "num_tokens": 11966375.0, "step": 6585 }, { "entropy": 5.825958395004273, "epoch": 5.661796304254405, "grad_norm": 1.2265625, "learning_rate": 0.0002567998510628476, "loss": 5.1745, "mean_token_accuracy": 0.21654749512672425, "num_tokens": 11975835.0, "step": 6590 }, { "entropy": 5.697682046890259, "epoch": 5.666093682853459, "grad_norm": 1.375, "learning_rate": 0.00025646847781705506, "loss": 5.062, "mean_token_accuracy": 0.22404171526432037, "num_tokens": 11984672.0, "step": 6595 }, { "entropy": 5.7217936515808105, "epoch": 5.670391061452514, "grad_norm": 1.390625, "learning_rate": 0.0002561371450367, "loss": 5.058, "mean_token_accuracy": 0.2239342138171196, "num_tokens": 11993954.0, "step": 6600 }, { "entropy": 5.765909624099732, "epoch": 5.674688440051568, "grad_norm": 1.375, "learning_rate": 0.00025580585344528076, "loss": 5.1266, "mean_token_accuracy": 0.21507280468940734, "num_tokens": 12002523.0, "step": 6605 }, { "entropy": 5.7512987613677975, "epoch": 5.678985818650623, "grad_norm": 1.34375, "learning_rate": 0.0002554746037662058, "loss": 5.1522, "mean_token_accuracy": 0.2076337844133377, "num_tokens": 12011638.0, "step": 6610 }, { "entropy": 5.767333936691284, "epoch": 5.683283197249677, "grad_norm": 1.15625, "learning_rate": 0.0002551433967227919, "loss": 5.1074, "mean_token_accuracy": 0.2166377604007721, "num_tokens": 12021319.0, "step": 6615 }, { "entropy": 5.730424690246582, "epoch": 5.687580575848732, "grad_norm": 1.2890625, "learning_rate": 0.000254812233038263, "loss": 5.0593, "mean_token_accuracy": 0.22508615553379058, "num_tokens": 12030255.0, "step": 6620 }, { "entropy": 5.750051546096802, "epoch": 5.691877954447786, "grad_norm": 1.3828125, "learning_rate": 0.00025448111343574813, "loss": 5.0446, "mean_token_accuracy": 0.22844503819942474, "num_tokens": 12038884.0, "step": 6625 }, { "entropy": 5.6743001461029055, "epoch": 5.696175333046842, "grad_norm": 1.109375, "learning_rate": 0.0002541500386382802, "loss": 5.0402, "mean_token_accuracy": 0.22093237787485123, "num_tokens": 12047477.0, "step": 6630 }, { "entropy": 5.70745210647583, "epoch": 5.7004727116458955, "grad_norm": 1.25, "learning_rate": 0.00025381900936879433, "loss": 5.1182, "mean_token_accuracy": 0.21816065311431884, "num_tokens": 12056902.0, "step": 6635 }, { "entropy": 5.719636726379394, "epoch": 5.704770090244951, "grad_norm": 1.2265625, "learning_rate": 0.0002534880263501259, "loss": 5.0763, "mean_token_accuracy": 0.22078678905963897, "num_tokens": 12065721.0, "step": 6640 }, { "entropy": 5.708343172073365, "epoch": 5.7090674688440055, "grad_norm": 1.1796875, "learning_rate": 0.0002531570903050097, "loss": 5.0515, "mean_token_accuracy": 0.22799196988344192, "num_tokens": 12074870.0, "step": 6645 }, { "entropy": 5.7802948474884035, "epoch": 5.71336484744306, "grad_norm": 1.203125, "learning_rate": 0.0002528262019560776, "loss": 5.1006, "mean_token_accuracy": 0.21951047629117965, "num_tokens": 12084557.0, "step": 6650 }, { "entropy": 5.676051235198974, "epoch": 5.717662226042115, "grad_norm": 1.140625, "learning_rate": 0.0002524953620258579, "loss": 4.9682, "mean_token_accuracy": 0.23413270562887192, "num_tokens": 12093074.0, "step": 6655 }, { "entropy": 5.689513444900513, "epoch": 5.721959604641169, "grad_norm": 1.296875, "learning_rate": 0.0002521645712367724, "loss": 4.9918, "mean_token_accuracy": 0.2312155470252037, "num_tokens": 12102785.0, "step": 6660 }, { "entropy": 5.680976343154907, "epoch": 5.726256983240224, "grad_norm": 1.3671875, "learning_rate": 0.00025183383031113606, "loss": 5.0198, "mean_token_accuracy": 0.23386708199977874, "num_tokens": 12112535.0, "step": 6665 }, { "entropy": 5.595748949050903, "epoch": 5.730554361839278, "grad_norm": 1.2421875, "learning_rate": 0.00025150313997115476, "loss": 4.973, "mean_token_accuracy": 0.2262031227350235, "num_tokens": 12121604.0, "step": 6670 }, { "entropy": 5.729144287109375, "epoch": 5.734851740438333, "grad_norm": 1.2109375, "learning_rate": 0.0002511725009389244, "loss": 5.1364, "mean_token_accuracy": 0.2118260830640793, "num_tokens": 12131276.0, "step": 6675 }, { "entropy": 5.677047777175903, "epoch": 5.739149119037387, "grad_norm": 1.40625, "learning_rate": 0.000250841913936428, "loss": 5.0989, "mean_token_accuracy": 0.22394427061080932, "num_tokens": 12140180.0, "step": 6680 }, { "entropy": 5.6755109310150145, "epoch": 5.743446497636442, "grad_norm": 1.296875, "learning_rate": 0.0002505113796855357, "loss": 5.0724, "mean_token_accuracy": 0.21869651228189468, "num_tokens": 12149635.0, "step": 6685 }, { "entropy": 5.731184768676758, "epoch": 5.747743876235496, "grad_norm": 1.328125, "learning_rate": 0.00025018089890800225, "loss": 5.0106, "mean_token_accuracy": 0.23130760043859483, "num_tokens": 12157565.0, "step": 6690 }, { "entropy": 5.726122760772705, "epoch": 5.752041254834551, "grad_norm": 1.3046875, "learning_rate": 0.00024985047232546544, "loss": 5.1313, "mean_token_accuracy": 0.21073731929063796, "num_tokens": 12166647.0, "step": 6695 }, { "entropy": 5.7177893161773685, "epoch": 5.7563386334336055, "grad_norm": 1.2265625, "learning_rate": 0.00024952010065944485, "loss": 5.103, "mean_token_accuracy": 0.22112697809934617, "num_tokens": 12175554.0, "step": 6700 }, { "entropy": 5.720852613449097, "epoch": 5.76063601203266, "grad_norm": 1.3984375, "learning_rate": 0.0002491897846313402, "loss": 5.0861, "mean_token_accuracy": 0.22505464851856233, "num_tokens": 12184756.0, "step": 6705 }, { "entropy": 5.787768459320068, "epoch": 5.764933390631715, "grad_norm": 1.21875, "learning_rate": 0.0002488595249624297, "loss": 5.1657, "mean_token_accuracy": 0.2111471638083458, "num_tokens": 12194724.0, "step": 6710 }, { "entropy": 5.751486301422119, "epoch": 5.769230769230769, "grad_norm": 1.1875, "learning_rate": 0.00024852932237386837, "loss": 5.1101, "mean_token_accuracy": 0.21754648834466933, "num_tokens": 12203804.0, "step": 6715 }, { "entropy": 5.71734390258789, "epoch": 5.773528147829824, "grad_norm": 1.2109375, "learning_rate": 0.00024819917758668673, "loss": 5.0583, "mean_token_accuracy": 0.21450338363647461, "num_tokens": 12212868.0, "step": 6720 }, { "entropy": 5.749476957321167, "epoch": 5.777825526428878, "grad_norm": 1.2578125, "learning_rate": 0.00024786909132178906, "loss": 5.1339, "mean_token_accuracy": 0.2220611587166786, "num_tokens": 12221650.0, "step": 6725 }, { "entropy": 5.7352417469024655, "epoch": 5.782122905027933, "grad_norm": 1.328125, "learning_rate": 0.00024753906429995194, "loss": 5.0208, "mean_token_accuracy": 0.23245062083005905, "num_tokens": 12231541.0, "step": 6730 }, { "entropy": 5.726803779602051, "epoch": 5.786420283626987, "grad_norm": 1.3359375, "learning_rate": 0.0002472090972418222, "loss": 5.1206, "mean_token_accuracy": 0.2154802456498146, "num_tokens": 12240899.0, "step": 6735 }, { "entropy": 5.747926139831543, "epoch": 5.790717662226042, "grad_norm": 1.2421875, "learning_rate": 0.0002468791908679163, "loss": 5.0888, "mean_token_accuracy": 0.21271725445985795, "num_tokens": 12250352.0, "step": 6740 }, { "entropy": 5.777256822586059, "epoch": 5.795015040825096, "grad_norm": 1.2734375, "learning_rate": 0.0002465493458986175, "loss": 5.1526, "mean_token_accuracy": 0.21231271475553512, "num_tokens": 12259975.0, "step": 6745 }, { "entropy": 5.662602233886719, "epoch": 5.799312419424151, "grad_norm": 1.40625, "learning_rate": 0.00024621956305417587, "loss": 4.9995, "mean_token_accuracy": 0.22537871152162553, "num_tokens": 12269203.0, "step": 6750 }, { "entropy": 5.759423017501831, "epoch": 5.8036097980232055, "grad_norm": 1.3359375, "learning_rate": 0.000245889843054705, "loss": 5.1008, "mean_token_accuracy": 0.22535506039857864, "num_tokens": 12279481.0, "step": 6755 }, { "entropy": 5.677676963806152, "epoch": 5.80790717662226, "grad_norm": 1.2421875, "learning_rate": 0.00024556018662018163, "loss": 5.0841, "mean_token_accuracy": 0.22594149112701417, "num_tokens": 12288848.0, "step": 6760 }, { "entropy": 5.761795091629028, "epoch": 5.812204555221315, "grad_norm": 1.2109375, "learning_rate": 0.00024523059447044377, "loss": 5.0773, "mean_token_accuracy": 0.21773602664470673, "num_tokens": 12297346.0, "step": 6765 }, { "entropy": 5.74983320236206, "epoch": 5.816501933820369, "grad_norm": 1.28125, "learning_rate": 0.0002449010673251887, "loss": 5.0937, "mean_token_accuracy": 0.2133167415857315, "num_tokens": 12306233.0, "step": 6770 }, { "entropy": 5.7833335399627686, "epoch": 5.820799312419425, "grad_norm": 1.2734375, "learning_rate": 0.0002445716059039723, "loss": 5.2003, "mean_token_accuracy": 0.20486359298229218, "num_tokens": 12315609.0, "step": 6775 }, { "entropy": 5.767945957183838, "epoch": 5.825096691018478, "grad_norm": 1.484375, "learning_rate": 0.00024424221092620644, "loss": 5.1211, "mean_token_accuracy": 0.22181842029094695, "num_tokens": 12323915.0, "step": 6780 }, { "entropy": 5.654868030548096, "epoch": 5.829394069617534, "grad_norm": 1.2265625, "learning_rate": 0.00024391288311115822, "loss": 5.0409, "mean_token_accuracy": 0.21649088859558105, "num_tokens": 12334077.0, "step": 6785 }, { "entropy": 5.701321029663086, "epoch": 5.833691448216588, "grad_norm": 1.3203125, "learning_rate": 0.0002435836231779478, "loss": 5.0891, "mean_token_accuracy": 0.21890870183706285, "num_tokens": 12342411.0, "step": 6790 }, { "entropy": 5.682297992706299, "epoch": 5.837988826815643, "grad_norm": 1.2109375, "learning_rate": 0.00024325443184554724, "loss": 4.9759, "mean_token_accuracy": 0.23374876230955124, "num_tokens": 12351308.0, "step": 6795 }, { "entropy": 5.6986854553222654, "epoch": 5.842286205414697, "grad_norm": 1.25, "learning_rate": 0.00024292530983277904, "loss": 5.1638, "mean_token_accuracy": 0.2165835380554199, "num_tokens": 12359673.0, "step": 6800 }, { "entropy": 5.688496255874634, "epoch": 5.846583584013752, "grad_norm": 1.2890625, "learning_rate": 0.00024259625785831408, "loss": 4.9993, "mean_token_accuracy": 0.2236035004258156, "num_tokens": 12367876.0, "step": 6805 }, { "entropy": 5.767372131347656, "epoch": 5.850880962612806, "grad_norm": 1.3046875, "learning_rate": 0.00024226727664067023, "loss": 5.1523, "mean_token_accuracy": 0.2160484492778778, "num_tokens": 12377040.0, "step": 6810 }, { "entropy": 5.827092409133911, "epoch": 5.855178341211861, "grad_norm": 1.2578125, "learning_rate": 0.00024193836689821109, "loss": 5.2168, "mean_token_accuracy": 0.21186406761407853, "num_tokens": 12387622.0, "step": 6815 }, { "entropy": 5.657399129867554, "epoch": 5.8594757198109155, "grad_norm": 1.3046875, "learning_rate": 0.0002416095293491439, "loss": 4.9935, "mean_token_accuracy": 0.22828370928764344, "num_tokens": 12396447.0, "step": 6820 }, { "entropy": 5.7454558372497555, "epoch": 5.86377309840997, "grad_norm": 1.3515625, "learning_rate": 0.0002412807647115186, "loss": 5.0195, "mean_token_accuracy": 0.23341405838727952, "num_tokens": 12405887.0, "step": 6825 }, { "entropy": 5.6852339744567875, "epoch": 5.868070477009025, "grad_norm": 1.28125, "learning_rate": 0.00024095207370322574, "loss": 5.0519, "mean_token_accuracy": 0.22003408372402192, "num_tokens": 12414543.0, "step": 6830 }, { "entropy": 5.7159740924835205, "epoch": 5.872367855608079, "grad_norm": 1.3515625, "learning_rate": 0.00024062345704199507, "loss": 5.0443, "mean_token_accuracy": 0.22227698117494582, "num_tokens": 12423370.0, "step": 6835 }, { "entropy": 5.764145946502685, "epoch": 5.876665234207134, "grad_norm": 1.046875, "learning_rate": 0.00024029491544539405, "loss": 5.1412, "mean_token_accuracy": 0.21509134620428086, "num_tokens": 12433980.0, "step": 6840 }, { "entropy": 5.701482248306275, "epoch": 5.880962612806188, "grad_norm": 1.359375, "learning_rate": 0.00023996644963082616, "loss": 5.0814, "mean_token_accuracy": 0.22744909077882766, "num_tokens": 12443300.0, "step": 6845 }, { "entropy": 5.749948024749756, "epoch": 5.885259991405243, "grad_norm": 1.1875, "learning_rate": 0.00023963806031552948, "loss": 5.1236, "mean_token_accuracy": 0.21074166893959045, "num_tokens": 12452462.0, "step": 6850 }, { "entropy": 5.6513162612915036, "epoch": 5.889557370004297, "grad_norm": 1.265625, "learning_rate": 0.00023930974821657504, "loss": 5.0546, "mean_token_accuracy": 0.2217905506491661, "num_tokens": 12461605.0, "step": 6855 }, { "entropy": 5.785699319839478, "epoch": 5.893854748603352, "grad_norm": 1.25, "learning_rate": 0.00023898151405086533, "loss": 5.1274, "mean_token_accuracy": 0.21363674402236937, "num_tokens": 12470905.0, "step": 6860 }, { "entropy": 5.7869494438171385, "epoch": 5.8981521272024064, "grad_norm": 1.3671875, "learning_rate": 0.00023865335853513232, "loss": 5.0999, "mean_token_accuracy": 0.2226301461458206, "num_tokens": 12478913.0, "step": 6865 }, { "entropy": 5.734767198562622, "epoch": 5.902449505801461, "grad_norm": 1.25, "learning_rate": 0.00023832528238593677, "loss": 5.1908, "mean_token_accuracy": 0.21548587679862977, "num_tokens": 12487561.0, "step": 6870 }, { "entropy": 5.668437480926514, "epoch": 5.9067468844005155, "grad_norm": 1.28125, "learning_rate": 0.00023799728631966556, "loss": 5.0735, "mean_token_accuracy": 0.22422571927309037, "num_tokens": 12496781.0, "step": 6875 }, { "entropy": 5.763645887374878, "epoch": 5.91104426299957, "grad_norm": 1.515625, "learning_rate": 0.0002376693710525313, "loss": 5.1662, "mean_token_accuracy": 0.21098217368125916, "num_tokens": 12505716.0, "step": 6880 }, { "entropy": 5.814133214950561, "epoch": 5.915341641598625, "grad_norm": 1.15625, "learning_rate": 0.00023734153730056967, "loss": 5.1122, "mean_token_accuracy": 0.21441538035869598, "num_tokens": 12515594.0, "step": 6885 }, { "entropy": 5.671756172180176, "epoch": 5.919639020197679, "grad_norm": 1.5078125, "learning_rate": 0.00023701378577963873, "loss": 4.9555, "mean_token_accuracy": 0.2352278098464012, "num_tokens": 12523439.0, "step": 6890 }, { "entropy": 5.661434507369995, "epoch": 5.923936398796734, "grad_norm": 1.34375, "learning_rate": 0.0002366861172054166, "loss": 5.0439, "mean_token_accuracy": 0.23203356862068175, "num_tokens": 12532100.0, "step": 6895 }, { "entropy": 5.69484052658081, "epoch": 5.928233777395788, "grad_norm": 1.3515625, "learning_rate": 0.00023635853229340054, "loss": 5.0674, "mean_token_accuracy": 0.21839096695184707, "num_tokens": 12539689.0, "step": 6900 }, { "entropy": 5.740998411178589, "epoch": 5.932531155994843, "grad_norm": 1.265625, "learning_rate": 0.00023603103175890512, "loss": 5.1108, "mean_token_accuracy": 0.21340399384498596, "num_tokens": 12548486.0, "step": 6905 }, { "entropy": 5.708246421813965, "epoch": 5.936828534593897, "grad_norm": 1.265625, "learning_rate": 0.00023570361631706062, "loss": 4.9806, "mean_token_accuracy": 0.23616116344928742, "num_tokens": 12557423.0, "step": 6910 }, { "entropy": 5.676116800308227, "epoch": 5.941125913192952, "grad_norm": 1.3046875, "learning_rate": 0.00023537628668281142, "loss": 5.1282, "mean_token_accuracy": 0.22083746641874313, "num_tokens": 12566086.0, "step": 6915 }, { "entropy": 5.675523042678833, "epoch": 5.945423291792007, "grad_norm": 1.25, "learning_rate": 0.00023504904357091468, "loss": 5.0403, "mean_token_accuracy": 0.22071400731801988, "num_tokens": 12575827.0, "step": 6920 }, { "entropy": 5.724242877960205, "epoch": 5.949720670391061, "grad_norm": 1.3984375, "learning_rate": 0.0002347218876959384, "loss": 5.0177, "mean_token_accuracy": 0.23093894720077515, "num_tokens": 12585044.0, "step": 6925 }, { "entropy": 5.677963542938232, "epoch": 5.954018048990116, "grad_norm": 1.2265625, "learning_rate": 0.0002343948197722604, "loss": 5.0663, "mean_token_accuracy": 0.2196858137845993, "num_tokens": 12594677.0, "step": 6930 }, { "entropy": 5.6957056522369385, "epoch": 5.958315427589171, "grad_norm": 1.3125, "learning_rate": 0.00023406784051406638, "loss": 5.0934, "mean_token_accuracy": 0.21734366714954376, "num_tokens": 12604829.0, "step": 6935 }, { "entropy": 5.814334344863892, "epoch": 5.9626128061882255, "grad_norm": 1.2421875, "learning_rate": 0.00023374095063534816, "loss": 5.072, "mean_token_accuracy": 0.23150041699409485, "num_tokens": 12613869.0, "step": 6940 }, { "entropy": 5.7182660579681395, "epoch": 5.96691018478728, "grad_norm": 1.265625, "learning_rate": 0.00023341415084990276, "loss": 5.0531, "mean_token_accuracy": 0.22266816049814225, "num_tokens": 12623248.0, "step": 6945 }, { "entropy": 5.623208475112915, "epoch": 5.971207563386335, "grad_norm": 1.2265625, "learning_rate": 0.00023308744187132996, "loss": 4.9786, "mean_token_accuracy": 0.2341190755367279, "num_tokens": 12631973.0, "step": 6950 }, { "entropy": 5.716631126403809, "epoch": 5.975504941985389, "grad_norm": 1.3046875, "learning_rate": 0.00023276082441303197, "loss": 5.0999, "mean_token_accuracy": 0.22288594245910645, "num_tokens": 12641435.0, "step": 6955 }, { "entropy": 5.753938913345337, "epoch": 5.979802320584444, "grad_norm": 1.2421875, "learning_rate": 0.00023243429918821056, "loss": 5.0884, "mean_token_accuracy": 0.21952137500047683, "num_tokens": 12651077.0, "step": 6960 }, { "entropy": 5.798230123519898, "epoch": 5.984099699183498, "grad_norm": 1.3671875, "learning_rate": 0.00023210786690986646, "loss": 5.1737, "mean_token_accuracy": 0.21541121006011962, "num_tokens": 12659929.0, "step": 6965 }, { "entropy": 5.718689918518066, "epoch": 5.988397077782553, "grad_norm": 1.3125, "learning_rate": 0.00023178152829079712, "loss": 5.0248, "mean_token_accuracy": 0.2251025393605232, "num_tokens": 12670725.0, "step": 6970 }, { "entropy": 5.701576375961304, "epoch": 5.992694456381607, "grad_norm": 1.40625, "learning_rate": 0.00023145528404359562, "loss": 5.0708, "mean_token_accuracy": 0.22593270093202591, "num_tokens": 12680820.0, "step": 6975 }, { "entropy": 5.746781206130981, "epoch": 5.996991834980662, "grad_norm": 1.3046875, "learning_rate": 0.0002311291348806492, "loss": 5.1318, "mean_token_accuracy": 0.2134876236319542, "num_tokens": 12689785.0, "step": 6980 }, { "entropy": 5.668917391035292, "epoch": 6.000859475719811, "grad_norm": 1.3046875, "learning_rate": 0.0002308030815141372, "loss": 5.0926, "mean_token_accuracy": 0.2199319683843189, "num_tokens": 12697221.0, "step": 6985 }, { "entropy": 5.642835378646851, "epoch": 6.005156854318866, "grad_norm": 1.234375, "learning_rate": 0.00023047712465602976, "loss": 4.8828, "mean_token_accuracy": 0.2438489243388176, "num_tokens": 12707127.0, "step": 6990 }, { "entropy": 5.647864055633545, "epoch": 6.00945423291792, "grad_norm": 1.390625, "learning_rate": 0.00023015126501808641, "loss": 4.8829, "mean_token_accuracy": 0.22817471027374267, "num_tokens": 12715364.0, "step": 6995 }, { "entropy": 5.700256299972534, "epoch": 6.013751611516975, "grad_norm": 1.2109375, "learning_rate": 0.00022982550331185437, "loss": 4.8942, "mean_token_accuracy": 0.24342687278985978, "num_tokens": 12724914.0, "step": 7000 }, { "epoch": 6.013751611516975, "eval_entropy": 5.508724927365243, "eval_loss": 5.8795576095581055, "eval_mean_token_accuracy": 0.18236776303734864, "eval_num_tokens": 12724914.0, "eval_runtime": 2.0479, "eval_samples_per_second": 1733.032, "eval_steps_per_second": 216.812, "step": 7000 }, { "entropy": 5.670391464233399, "epoch": 6.0180489901160295, "grad_norm": 1.1875, "learning_rate": 0.00022949984024866704, "loss": 4.9099, "mean_token_accuracy": 0.23701934069395064, "num_tokens": 12735193.0, "step": 7005 }, { "entropy": 5.732305097579956, "epoch": 6.022346368715084, "grad_norm": 1.3515625, "learning_rate": 0.0002291742765396424, "loss": 4.9422, "mean_token_accuracy": 0.24183074086904527, "num_tokens": 12743945.0, "step": 7010 }, { "entropy": 5.727162218093872, "epoch": 6.0266437473141385, "grad_norm": 1.203125, "learning_rate": 0.00022884881289568133, "loss": 4.95, "mean_token_accuracy": 0.2304249882698059, "num_tokens": 12753130.0, "step": 7015 }, { "entropy": 5.668257331848144, "epoch": 6.030941125913193, "grad_norm": 1.2734375, "learning_rate": 0.0002285234500274665, "loss": 4.9429, "mean_token_accuracy": 0.22958544343709947, "num_tokens": 12762108.0, "step": 7020 }, { "entropy": 5.778354358673096, "epoch": 6.035238504512248, "grad_norm": 1.34375, "learning_rate": 0.00022819818864546016, "loss": 4.9808, "mean_token_accuracy": 0.23066942244768143, "num_tokens": 12772102.0, "step": 7025 }, { "entropy": 5.6958764553070065, "epoch": 6.039535883111302, "grad_norm": 1.3359375, "learning_rate": 0.00022787302945990345, "loss": 4.9414, "mean_token_accuracy": 0.24072416126728058, "num_tokens": 12781225.0, "step": 7030 }, { "entropy": 5.725193023681641, "epoch": 6.043833261710357, "grad_norm": 1.4140625, "learning_rate": 0.00022754797318081383, "loss": 4.9144, "mean_token_accuracy": 0.23167165368795395, "num_tokens": 12789896.0, "step": 7035 }, { "entropy": 5.701824522018432, "epoch": 6.048130640309411, "grad_norm": 1.28125, "learning_rate": 0.00022722302051798442, "loss": 4.8494, "mean_token_accuracy": 0.2424164205789566, "num_tokens": 12798596.0, "step": 7040 }, { "entropy": 5.605796670913696, "epoch": 6.052428018908466, "grad_norm": 1.3046875, "learning_rate": 0.0002268981721809819, "loss": 4.8698, "mean_token_accuracy": 0.24207407981157303, "num_tokens": 12807285.0, "step": 7045 }, { "entropy": 5.690208387374878, "epoch": 6.05672539750752, "grad_norm": 1.3125, "learning_rate": 0.0002265734288791451, "loss": 4.9288, "mean_token_accuracy": 0.2328885316848755, "num_tokens": 12816668.0, "step": 7050 }, { "entropy": 5.733233499526977, "epoch": 6.061022776106575, "grad_norm": 1.3515625, "learning_rate": 0.00022624879132158377, "loss": 5.0102, "mean_token_accuracy": 0.22736618667840958, "num_tokens": 12825943.0, "step": 7055 }, { "entropy": 5.666690349578857, "epoch": 6.0653201547056295, "grad_norm": 1.171875, "learning_rate": 0.00022592426021717654, "loss": 4.8429, "mean_token_accuracy": 0.24100734442472457, "num_tokens": 12835693.0, "step": 7060 }, { "entropy": 5.673206329345703, "epoch": 6.069617533304684, "grad_norm": 1.34375, "learning_rate": 0.0002255998362745696, "loss": 4.7591, "mean_token_accuracy": 0.2515610337257385, "num_tokens": 12844201.0, "step": 7065 }, { "entropy": 5.621398305892944, "epoch": 6.073914911903739, "grad_norm": 1.3515625, "learning_rate": 0.00022527552020217513, "loss": 4.8947, "mean_token_accuracy": 0.238974666595459, "num_tokens": 12853220.0, "step": 7070 }, { "entropy": 5.6449134349823, "epoch": 6.078212290502793, "grad_norm": 1.2734375, "learning_rate": 0.0002249513127081697, "loss": 4.9839, "mean_token_accuracy": 0.23246047645807266, "num_tokens": 12862486.0, "step": 7075 }, { "entropy": 5.768129444122314, "epoch": 6.082509669101848, "grad_norm": 1.28125, "learning_rate": 0.00022462721450049316, "loss": 4.9689, "mean_token_accuracy": 0.2276782661676407, "num_tokens": 12871717.0, "step": 7080 }, { "entropy": 5.729448127746582, "epoch": 6.086807047700902, "grad_norm": 1.4453125, "learning_rate": 0.0002243032262868464, "loss": 4.9213, "mean_token_accuracy": 0.22925499081611633, "num_tokens": 12881278.0, "step": 7085 }, { "entropy": 5.68177433013916, "epoch": 6.091104426299957, "grad_norm": 1.234375, "learning_rate": 0.00022397934877469, "loss": 4.9532, "mean_token_accuracy": 0.23347581177949905, "num_tokens": 12890720.0, "step": 7090 }, { "entropy": 5.7116344451904295, "epoch": 6.095401804899011, "grad_norm": 1.2734375, "learning_rate": 0.0002236555826712432, "loss": 4.9578, "mean_token_accuracy": 0.23043094277381898, "num_tokens": 12900428.0, "step": 7095 }, { "entropy": 5.766893911361694, "epoch": 6.099699183498066, "grad_norm": 1.2890625, "learning_rate": 0.00022333192868348152, "loss": 4.9373, "mean_token_accuracy": 0.23340655714273453, "num_tokens": 12910177.0, "step": 7100 }, { "entropy": 5.710207319259643, "epoch": 6.10399656209712, "grad_norm": 1.25, "learning_rate": 0.00022300838751813606, "loss": 4.9861, "mean_token_accuracy": 0.2358441546559334, "num_tokens": 12920734.0, "step": 7105 }, { "entropy": 5.700603389739991, "epoch": 6.108293940696175, "grad_norm": 1.3671875, "learning_rate": 0.00022268495988169145, "loss": 4.8642, "mean_token_accuracy": 0.23978381156921386, "num_tokens": 12929585.0, "step": 7110 }, { "entropy": 5.5586847305297855, "epoch": 6.1125913192952295, "grad_norm": 1.359375, "learning_rate": 0.00022236164648038433, "loss": 4.8715, "mean_token_accuracy": 0.2399493783712387, "num_tokens": 12938933.0, "step": 7115 }, { "entropy": 5.64050784111023, "epoch": 6.116888697894285, "grad_norm": 1.4140625, "learning_rate": 0.0002220384480202019, "loss": 4.8359, "mean_token_accuracy": 0.24464279115200044, "num_tokens": 12947461.0, "step": 7120 }, { "entropy": 5.698766994476318, "epoch": 6.1211860764933395, "grad_norm": 1.34375, "learning_rate": 0.00022171536520688046, "loss": 4.8981, "mean_token_accuracy": 0.2352096125483513, "num_tokens": 12956507.0, "step": 7125 }, { "entropy": 5.72119255065918, "epoch": 6.125483455092394, "grad_norm": 1.3125, "learning_rate": 0.00022139239874590362, "loss": 5.0051, "mean_token_accuracy": 0.23063586056232452, "num_tokens": 12965740.0, "step": 7130 }, { "entropy": 5.698656225204468, "epoch": 6.1297808336914485, "grad_norm": 1.2890625, "learning_rate": 0.0002210695493425013, "loss": 4.9312, "mean_token_accuracy": 0.23300146460533142, "num_tokens": 12975057.0, "step": 7135 }, { "entropy": 5.629667854309082, "epoch": 6.134078212290503, "grad_norm": 1.5390625, "learning_rate": 0.00022074681770164735, "loss": 4.8588, "mean_token_accuracy": 0.23449651151895523, "num_tokens": 12984087.0, "step": 7140 }, { "entropy": 5.644195985794068, "epoch": 6.138375590889558, "grad_norm": 1.234375, "learning_rate": 0.00022042420452805868, "loss": 4.8837, "mean_token_accuracy": 0.24718453586101533, "num_tokens": 12992793.0, "step": 7145 }, { "entropy": 5.70198245048523, "epoch": 6.142672969488612, "grad_norm": 1.3671875, "learning_rate": 0.00022010171052619365, "loss": 4.9435, "mean_token_accuracy": 0.2373737797141075, "num_tokens": 13000769.0, "step": 7150 }, { "entropy": 5.695436716079712, "epoch": 6.146970348087667, "grad_norm": 1.3046875, "learning_rate": 0.00021977933640025, "loss": 4.9391, "mean_token_accuracy": 0.23648517280817033, "num_tokens": 13010677.0, "step": 7155 }, { "entropy": 5.574930858612061, "epoch": 6.151267726686721, "grad_norm": 1.28125, "learning_rate": 0.00021945708285416434, "loss": 4.7856, "mean_token_accuracy": 0.2466425433754921, "num_tokens": 13019791.0, "step": 7160 }, { "entropy": 5.678705787658691, "epoch": 6.155565105285776, "grad_norm": 1.3359375, "learning_rate": 0.0002191349505916093, "loss": 4.9675, "mean_token_accuracy": 0.23267736732959748, "num_tokens": 13029223.0, "step": 7165 }, { "entropy": 5.708924913406372, "epoch": 6.15986248388483, "grad_norm": 1.2421875, "learning_rate": 0.00021881294031599318, "loss": 4.9167, "mean_token_accuracy": 0.23287765979766845, "num_tokens": 13038716.0, "step": 7170 }, { "entropy": 5.715561962127685, "epoch": 6.164159862483885, "grad_norm": 1.3984375, "learning_rate": 0.0002184910527304576, "loss": 4.9329, "mean_token_accuracy": 0.23829473853111266, "num_tokens": 13047915.0, "step": 7175 }, { "entropy": 5.647942113876343, "epoch": 6.1684572410829395, "grad_norm": 1.328125, "learning_rate": 0.00021816928853787636, "loss": 4.8862, "mean_token_accuracy": 0.24437055140733718, "num_tokens": 13056613.0, "step": 7180 }, { "entropy": 5.718434190750122, "epoch": 6.172754619681994, "grad_norm": 1.28125, "learning_rate": 0.00021784764844085398, "loss": 4.9568, "mean_token_accuracy": 0.23609795272350312, "num_tokens": 13066658.0, "step": 7185 }, { "entropy": 5.776039171218872, "epoch": 6.177051998281049, "grad_norm": 1.4609375, "learning_rate": 0.0002175261331417238, "loss": 4.9589, "mean_token_accuracy": 0.23155548572540283, "num_tokens": 13074798.0, "step": 7190 }, { "entropy": 5.655656957626343, "epoch": 6.181349376880103, "grad_norm": 1.375, "learning_rate": 0.00021720474334254675, "loss": 4.877, "mean_token_accuracy": 0.23738462030887603, "num_tokens": 13084173.0, "step": 7195 }, { "entropy": 5.679304552078247, "epoch": 6.185646755479158, "grad_norm": 1.265625, "learning_rate": 0.00021688347974510962, "loss": 4.9038, "mean_token_accuracy": 0.23641733229160308, "num_tokens": 13093096.0, "step": 7200 }, { "entropy": 5.678824186325073, "epoch": 6.189944134078212, "grad_norm": 1.53125, "learning_rate": 0.00021656234305092377, "loss": 4.904, "mean_token_accuracy": 0.23884446620941163, "num_tokens": 13101191.0, "step": 7205 }, { "entropy": 5.676383590698242, "epoch": 6.194241512677267, "grad_norm": 1.4921875, "learning_rate": 0.0002162413339612234, "loss": 4.9377, "mean_token_accuracy": 0.2336137905716896, "num_tokens": 13109829.0, "step": 7210 }, { "entropy": 5.654401731491089, "epoch": 6.198538891276321, "grad_norm": 1.421875, "learning_rate": 0.00021592045317696406, "loss": 4.8817, "mean_token_accuracy": 0.24019764363765717, "num_tokens": 13119314.0, "step": 7215 }, { "entropy": 5.652538299560547, "epoch": 6.202836269875376, "grad_norm": 1.390625, "learning_rate": 0.00021559970139882102, "loss": 4.8503, "mean_token_accuracy": 0.24541642367839814, "num_tokens": 13128113.0, "step": 7220 }, { "entropy": 5.747891569137574, "epoch": 6.20713364847443, "grad_norm": 1.3828125, "learning_rate": 0.0002152790793271881, "loss": 5.0009, "mean_token_accuracy": 0.22410304695367814, "num_tokens": 13136892.0, "step": 7225 }, { "entropy": 5.722711181640625, "epoch": 6.211431027073485, "grad_norm": 1.1640625, "learning_rate": 0.00021495858766217558, "loss": 4.9665, "mean_token_accuracy": 0.22984595149755477, "num_tokens": 13146960.0, "step": 7230 }, { "entropy": 5.677499055862427, "epoch": 6.2157284056725395, "grad_norm": 1.3984375, "learning_rate": 0.00021463822710360932, "loss": 4.8447, "mean_token_accuracy": 0.25095522999763487, "num_tokens": 13156147.0, "step": 7235 }, { "entropy": 5.732334947586059, "epoch": 6.220025784271594, "grad_norm": 1.265625, "learning_rate": 0.00021431799835102867, "loss": 4.9396, "mean_token_accuracy": 0.22682653665542601, "num_tokens": 13164588.0, "step": 7240 }, { "entropy": 5.62511248588562, "epoch": 6.224323162870649, "grad_norm": 1.28125, "learning_rate": 0.00021399790210368524, "loss": 4.88, "mean_token_accuracy": 0.23860864341259003, "num_tokens": 13174361.0, "step": 7245 }, { "entropy": 5.7138762950897215, "epoch": 6.228620541469703, "grad_norm": 1.1796875, "learning_rate": 0.00021367793906054133, "loss": 5.0856, "mean_token_accuracy": 0.2224991261959076, "num_tokens": 13185266.0, "step": 7250 }, { "entropy": 5.6888185977935795, "epoch": 6.232917920068758, "grad_norm": 1.3671875, "learning_rate": 0.00021335810992026823, "loss": 4.9142, "mean_token_accuracy": 0.24534845352172852, "num_tokens": 13194227.0, "step": 7255 }, { "entropy": 5.725516080856323, "epoch": 6.237215298667812, "grad_norm": 1.4375, "learning_rate": 0.00021303841538124497, "loss": 4.9591, "mean_token_accuracy": 0.23196382820606232, "num_tokens": 13202569.0, "step": 7260 }, { "entropy": 5.673784589767456, "epoch": 6.241512677266867, "grad_norm": 1.359375, "learning_rate": 0.00021271885614155685, "loss": 4.9169, "mean_token_accuracy": 0.2391177996993065, "num_tokens": 13212201.0, "step": 7265 }, { "entropy": 5.58170371055603, "epoch": 6.245810055865922, "grad_norm": 1.4375, "learning_rate": 0.0002123994328989932, "loss": 4.8384, "mean_token_accuracy": 0.2410106509923935, "num_tokens": 13220802.0, "step": 7270 }, { "entropy": 5.6795917510986325, "epoch": 6.250107434464977, "grad_norm": 1.3203125, "learning_rate": 0.00021208014635104688, "loss": 4.9909, "mean_token_accuracy": 0.22773831188678742, "num_tokens": 13229519.0, "step": 7275 }, { "entropy": 5.679076099395752, "epoch": 6.254404813064031, "grad_norm": 1.3515625, "learning_rate": 0.00021176099719491209, "loss": 4.9371, "mean_token_accuracy": 0.23299687653779982, "num_tokens": 13238865.0, "step": 7280 }, { "entropy": 5.627301931381226, "epoch": 6.258702191663086, "grad_norm": 1.4453125, "learning_rate": 0.00021144198612748312, "loss": 4.8621, "mean_token_accuracy": 0.24295336604118348, "num_tokens": 13247259.0, "step": 7285 }, { "entropy": 5.730584716796875, "epoch": 6.26299957026214, "grad_norm": 1.28125, "learning_rate": 0.00021112311384535243, "loss": 4.9719, "mean_token_accuracy": 0.2346948578953743, "num_tokens": 13256692.0, "step": 7290 }, { "entropy": 5.713074779510498, "epoch": 6.267296948861195, "grad_norm": 1.2109375, "learning_rate": 0.00021080438104480976, "loss": 4.8997, "mean_token_accuracy": 0.23539288341999054, "num_tokens": 13266109.0, "step": 7295 }, { "entropy": 5.738828372955322, "epoch": 6.2715943274602495, "grad_norm": 1.234375, "learning_rate": 0.00021048578842184019, "loss": 4.9833, "mean_token_accuracy": 0.22857994139194487, "num_tokens": 13275484.0, "step": 7300 }, { "entropy": 5.661355638504029, "epoch": 6.275891706059304, "grad_norm": 1.34375, "learning_rate": 0.00021016733667212245, "loss": 4.8854, "mean_token_accuracy": 0.23928772211074828, "num_tokens": 13284755.0, "step": 7305 }, { "entropy": 5.65988245010376, "epoch": 6.280189084658359, "grad_norm": 1.1328125, "learning_rate": 0.00020984902649102806, "loss": 4.881, "mean_token_accuracy": 0.23558754920959474, "num_tokens": 13294386.0, "step": 7310 }, { "entropy": 5.669145393371582, "epoch": 6.284486463257413, "grad_norm": 1.28125, "learning_rate": 0.00020953085857361924, "loss": 4.8806, "mean_token_accuracy": 0.23975540697574615, "num_tokens": 13303926.0, "step": 7315 }, { "entropy": 5.664641571044922, "epoch": 6.288783841856468, "grad_norm": 1.4140625, "learning_rate": 0.00020921283361464754, "loss": 4.984, "mean_token_accuracy": 0.22923536598682404, "num_tokens": 13312727.0, "step": 7320 }, { "entropy": 5.672109365463257, "epoch": 6.293081220455522, "grad_norm": 1.3515625, "learning_rate": 0.00020889495230855232, "loss": 4.9516, "mean_token_accuracy": 0.22691491544246672, "num_tokens": 13321706.0, "step": 7325 }, { "entropy": 5.634451389312744, "epoch": 6.297378599054577, "grad_norm": 1.2734375, "learning_rate": 0.00020857721534945923, "loss": 4.8854, "mean_token_accuracy": 0.24334731549024582, "num_tokens": 13330436.0, "step": 7330 }, { "entropy": 5.706661605834961, "epoch": 6.301675977653631, "grad_norm": 1.3515625, "learning_rate": 0.0002082596234311789, "loss": 4.9799, "mean_token_accuracy": 0.22508741170167923, "num_tokens": 13339334.0, "step": 7335 }, { "entropy": 5.732862854003907, "epoch": 6.305973356252686, "grad_norm": 1.34375, "learning_rate": 0.0002079421772472051, "loss": 5.0224, "mean_token_accuracy": 0.22750386744737625, "num_tokens": 13348969.0, "step": 7340 }, { "entropy": 5.760523128509521, "epoch": 6.31027073485174, "grad_norm": 1.1328125, "learning_rate": 0.0002076248774907134, "loss": 4.9944, "mean_token_accuracy": 0.22979640066623688, "num_tokens": 13358467.0, "step": 7345 }, { "entropy": 5.640356731414795, "epoch": 6.314568113450795, "grad_norm": 1.4921875, "learning_rate": 0.00020730772485455962, "loss": 4.8248, "mean_token_accuracy": 0.24465546160936355, "num_tokens": 13366413.0, "step": 7350 }, { "entropy": 5.697748041152954, "epoch": 6.3188654920498495, "grad_norm": 1.2109375, "learning_rate": 0.0002069907200312785, "loss": 4.998, "mean_token_accuracy": 0.2297839805483818, "num_tokens": 13376620.0, "step": 7355 }, { "entropy": 5.674102115631103, "epoch": 6.323162870648904, "grad_norm": 1.3515625, "learning_rate": 0.00020667386371308162, "loss": 4.9836, "mean_token_accuracy": 0.22940946370363235, "num_tokens": 13385492.0, "step": 7360 }, { "entropy": 5.650719451904297, "epoch": 6.327460249247959, "grad_norm": 1.25, "learning_rate": 0.00020635715659185673, "loss": 4.9198, "mean_token_accuracy": 0.23462713062763213, "num_tokens": 13395562.0, "step": 7365 }, { "entropy": 5.600523614883423, "epoch": 6.331757627847013, "grad_norm": 1.4296875, "learning_rate": 0.00020604059935916551, "loss": 4.8568, "mean_token_accuracy": 0.24552038162946702, "num_tokens": 13403357.0, "step": 7370 }, { "entropy": 5.685958814620972, "epoch": 6.336055006446068, "grad_norm": 1.3984375, "learning_rate": 0.00020572419270624255, "loss": 4.9465, "mean_token_accuracy": 0.23870250284671785, "num_tokens": 13412527.0, "step": 7375 }, { "entropy": 5.696662282943725, "epoch": 6.340352385045122, "grad_norm": 1.3203125, "learning_rate": 0.00020540793732399339, "loss": 5.0053, "mean_token_accuracy": 0.22636662870645524, "num_tokens": 13422455.0, "step": 7380 }, { "entropy": 5.7542284488677975, "epoch": 6.344649763644177, "grad_norm": 1.390625, "learning_rate": 0.00020509183390299325, "loss": 5.092, "mean_token_accuracy": 0.2187615990638733, "num_tokens": 13431677.0, "step": 7385 }, { "entropy": 5.588942003250122, "epoch": 6.348947142243231, "grad_norm": 1.3203125, "learning_rate": 0.00020477588313348594, "loss": 4.8304, "mean_token_accuracy": 0.2510680601000786, "num_tokens": 13440522.0, "step": 7390 }, { "entropy": 5.7484352588653564, "epoch": 6.353244520842286, "grad_norm": 1.515625, "learning_rate": 0.00020446008570538154, "loss": 4.9445, "mean_token_accuracy": 0.22700524926185608, "num_tokens": 13450021.0, "step": 7395 }, { "entropy": 5.64814863204956, "epoch": 6.35754189944134, "grad_norm": 1.3828125, "learning_rate": 0.0002041444423082554, "loss": 4.8793, "mean_token_accuracy": 0.23593862354755402, "num_tokens": 13458115.0, "step": 7400 }, { "entropy": 5.707351446151733, "epoch": 6.361839278040395, "grad_norm": 1.359375, "learning_rate": 0.00020382895363134652, "loss": 4.954, "mean_token_accuracy": 0.22770389467477797, "num_tokens": 13466798.0, "step": 7405 }, { "entropy": 5.667495393753052, "epoch": 6.3661366566394495, "grad_norm": 1.3671875, "learning_rate": 0.00020351362036355602, "loss": 4.9795, "mean_token_accuracy": 0.23159152567386626, "num_tokens": 13476096.0, "step": 7410 }, { "entropy": 5.661939573287964, "epoch": 6.370434035238505, "grad_norm": 1.3125, "learning_rate": 0.0002031984431934459, "loss": 4.9256, "mean_token_accuracy": 0.23647017031908035, "num_tokens": 13484601.0, "step": 7415 }, { "entropy": 5.68424654006958, "epoch": 6.3747314138375595, "grad_norm": 1.2734375, "learning_rate": 0.00020288342280923695, "loss": 4.9062, "mean_token_accuracy": 0.2412991166114807, "num_tokens": 13493994.0, "step": 7420 }, { "entropy": 5.657544279098511, "epoch": 6.379028792436614, "grad_norm": 1.3125, "learning_rate": 0.00020256855989880785, "loss": 4.943, "mean_token_accuracy": 0.22665072232484818, "num_tokens": 13502890.0, "step": 7425 }, { "entropy": 5.624227142333984, "epoch": 6.383326171035669, "grad_norm": 1.3515625, "learning_rate": 0.00020225385514969336, "loss": 4.8962, "mean_token_accuracy": 0.24369400441646577, "num_tokens": 13512980.0, "step": 7430 }, { "entropy": 5.700878524780274, "epoch": 6.387623549634723, "grad_norm": 1.3125, "learning_rate": 0.00020193930924908277, "loss": 4.9713, "mean_token_accuracy": 0.23523945212364197, "num_tokens": 13521558.0, "step": 7435 }, { "entropy": 5.657977867126465, "epoch": 6.391920928233778, "grad_norm": 1.1953125, "learning_rate": 0.00020162492288381867, "loss": 4.8956, "mean_token_accuracy": 0.23665117025375365, "num_tokens": 13531506.0, "step": 7440 }, { "entropy": 5.6123686790466305, "epoch": 6.396218306832832, "grad_norm": 1.2265625, "learning_rate": 0.0002013106967403953, "loss": 4.8547, "mean_token_accuracy": 0.24135282933712005, "num_tokens": 13540559.0, "step": 7445 }, { "entropy": 5.714596271514893, "epoch": 6.400515685431887, "grad_norm": 1.3984375, "learning_rate": 0.0002009966315049569, "loss": 4.965, "mean_token_accuracy": 0.23248774856328963, "num_tokens": 13550654.0, "step": 7450 }, { "entropy": 5.723209571838379, "epoch": 6.404813064030941, "grad_norm": 1.3515625, "learning_rate": 0.0002006827278632964, "loss": 4.9848, "mean_token_accuracy": 0.23032907545566558, "num_tokens": 13560708.0, "step": 7455 }, { "entropy": 5.694536209106445, "epoch": 6.409110442629996, "grad_norm": 1.3203125, "learning_rate": 0.00020036898650085377, "loss": 4.8691, "mean_token_accuracy": 0.23195305913686753, "num_tokens": 13569330.0, "step": 7460 }, { "entropy": 5.723634719848633, "epoch": 6.41340782122905, "grad_norm": 1.453125, "learning_rate": 0.00020005540810271493, "loss": 5.051, "mean_token_accuracy": 0.21840845048427582, "num_tokens": 13577500.0, "step": 7465 }, { "entropy": 5.662380409240723, "epoch": 6.417705199828105, "grad_norm": 1.328125, "learning_rate": 0.00019974199335360976, "loss": 4.9421, "mean_token_accuracy": 0.23581697791814804, "num_tokens": 13586087.0, "step": 7470 }, { "entropy": 5.6357903480529785, "epoch": 6.4220025784271595, "grad_norm": 1.3828125, "learning_rate": 0.00019942874293791068, "loss": 4.8988, "mean_token_accuracy": 0.23675889521837234, "num_tokens": 13595346.0, "step": 7475 }, { "entropy": 5.736030340194702, "epoch": 6.426299957026214, "grad_norm": 1.3515625, "learning_rate": 0.00019911565753963145, "loss": 5.0373, "mean_token_accuracy": 0.22954674065113068, "num_tokens": 13604755.0, "step": 7480 }, { "entropy": 5.741143465042114, "epoch": 6.430597335625269, "grad_norm": 1.25, "learning_rate": 0.0001988027378424254, "loss": 4.9452, "mean_token_accuracy": 0.24101046323776246, "num_tokens": 13613860.0, "step": 7485 }, { "entropy": 5.619118022918701, "epoch": 6.434894714224323, "grad_norm": 1.4609375, "learning_rate": 0.00019848998452958429, "loss": 4.8294, "mean_token_accuracy": 0.24386470913887023, "num_tokens": 13622574.0, "step": 7490 }, { "entropy": 5.661189985275269, "epoch": 6.439192092823378, "grad_norm": 1.2421875, "learning_rate": 0.00019817739828403602, "loss": 4.9752, "mean_token_accuracy": 0.22600160390138627, "num_tokens": 13632366.0, "step": 7495 }, { "entropy": 5.716126537322998, "epoch": 6.443489471422432, "grad_norm": 1.2578125, "learning_rate": 0.00019786497978834422, "loss": 4.8696, "mean_token_accuracy": 0.24941558837890626, "num_tokens": 13640682.0, "step": 7500 }, { "epoch": 6.443489471422432, "eval_entropy": 5.460215036933486, "eval_loss": 5.889508247375488, "eval_mean_token_accuracy": 0.1832222778954216, "eval_num_tokens": 13640682.0, "eval_runtime": 2.2243, "eval_samples_per_second": 1595.592, "eval_steps_per_second": 199.618, "step": 7500 }, { "entropy": 5.671139860153199, "epoch": 6.447786850021487, "grad_norm": 1.375, "learning_rate": 0.00019755272972470602, "loss": 4.9579, "mean_token_accuracy": 0.23592828810214997, "num_tokens": 13649675.0, "step": 7505 }, { "entropy": 5.6834453582763675, "epoch": 6.452084228620541, "grad_norm": 1.4296875, "learning_rate": 0.00019724064877495057, "loss": 4.9211, "mean_token_accuracy": 0.22713227868080138, "num_tokens": 13658260.0, "step": 7510 }, { "entropy": 5.629180526733398, "epoch": 6.456381607219596, "grad_norm": 1.4453125, "learning_rate": 0.00019692873762053808, "loss": 4.8758, "mean_token_accuracy": 0.24112626761198044, "num_tokens": 13666571.0, "step": 7515 }, { "entropy": 5.717655801773072, "epoch": 6.46067898581865, "grad_norm": 1.1875, "learning_rate": 0.00019661699694255785, "loss": 4.9529, "mean_token_accuracy": 0.23426357060670852, "num_tokens": 13675707.0, "step": 7520 }, { "entropy": 5.730238914489746, "epoch": 6.464976364417705, "grad_norm": 1.4609375, "learning_rate": 0.00019630542742172692, "loss": 4.9157, "mean_token_accuracy": 0.24274927675724028, "num_tokens": 13684796.0, "step": 7525 }, { "entropy": 5.697819185256958, "epoch": 6.4692737430167595, "grad_norm": 1.3359375, "learning_rate": 0.00019599402973838854, "loss": 5.0123, "mean_token_accuracy": 0.22690223902463913, "num_tokens": 13693158.0, "step": 7530 }, { "entropy": 5.679841184616089, "epoch": 6.473571121615814, "grad_norm": 1.4140625, "learning_rate": 0.0001956828045725107, "loss": 4.9862, "mean_token_accuracy": 0.22812242060899734, "num_tokens": 13703521.0, "step": 7535 }, { "entropy": 5.655819082260132, "epoch": 6.477868500214869, "grad_norm": 1.34375, "learning_rate": 0.0001953717526036849, "loss": 4.8919, "mean_token_accuracy": 0.2375039428472519, "num_tokens": 13712337.0, "step": 7540 }, { "entropy": 5.673346281051636, "epoch": 6.482165878813923, "grad_norm": 1.4140625, "learning_rate": 0.00019506087451112437, "loss": 4.9142, "mean_token_accuracy": 0.23074083924293518, "num_tokens": 13721605.0, "step": 7545 }, { "entropy": 5.60535888671875, "epoch": 6.486463257412978, "grad_norm": 1.4609375, "learning_rate": 0.00019475017097366244, "loss": 4.8196, "mean_token_accuracy": 0.24068238884210585, "num_tokens": 13730827.0, "step": 7550 }, { "entropy": 5.727794599533081, "epoch": 6.490760636012032, "grad_norm": 1.390625, "learning_rate": 0.00019443964266975156, "loss": 4.983, "mean_token_accuracy": 0.2254954144358635, "num_tokens": 13740128.0, "step": 7555 }, { "entropy": 5.665542125701904, "epoch": 6.495058014611088, "grad_norm": 1.3671875, "learning_rate": 0.0001941292902774614, "loss": 4.9594, "mean_token_accuracy": 0.22979557663202285, "num_tokens": 13748428.0, "step": 7560 }, { "entropy": 5.651149702072144, "epoch": 6.499355393210142, "grad_norm": 1.2578125, "learning_rate": 0.00019381911447447742, "loss": 4.9317, "mean_token_accuracy": 0.23943807184696198, "num_tokens": 13757109.0, "step": 7565 }, { "entropy": 5.731936073303222, "epoch": 6.503652771809197, "grad_norm": 1.453125, "learning_rate": 0.00019350911593809977, "loss": 4.9893, "mean_token_accuracy": 0.22604165077209473, "num_tokens": 13766281.0, "step": 7570 }, { "entropy": 5.691824197769165, "epoch": 6.507950150408251, "grad_norm": 1.2578125, "learning_rate": 0.00019319929534524128, "loss": 4.9652, "mean_token_accuracy": 0.2340141773223877, "num_tokens": 13775535.0, "step": 7575 }, { "entropy": 5.713566875457763, "epoch": 6.512247529007306, "grad_norm": 1.3359375, "learning_rate": 0.00019288965337242636, "loss": 4.9498, "mean_token_accuracy": 0.22838044166564941, "num_tokens": 13784099.0, "step": 7580 }, { "entropy": 5.790343189239502, "epoch": 6.51654490760636, "grad_norm": 1.3125, "learning_rate": 0.00019258019069578924, "loss": 5.0787, "mean_token_accuracy": 0.22436539083719254, "num_tokens": 13793098.0, "step": 7585 }, { "entropy": 5.607115888595581, "epoch": 6.520842286205415, "grad_norm": 1.3671875, "learning_rate": 0.00019227090799107266, "loss": 4.9188, "mean_token_accuracy": 0.23831139653921127, "num_tokens": 13801847.0, "step": 7590 }, { "entropy": 5.62153811454773, "epoch": 6.5251396648044695, "grad_norm": 1.28125, "learning_rate": 0.0001919618059336265, "loss": 4.8026, "mean_token_accuracy": 0.23950466215610505, "num_tokens": 13810599.0, "step": 7595 }, { "entropy": 5.7144591331481935, "epoch": 6.529437043403524, "grad_norm": 1.453125, "learning_rate": 0.00019165288519840617, "loss": 4.9558, "mean_token_accuracy": 0.2338729202747345, "num_tokens": 13819602.0, "step": 7600 }, { "entropy": 5.640046072006226, "epoch": 6.533734422002579, "grad_norm": 1.4375, "learning_rate": 0.000191344146459971, "loss": 4.917, "mean_token_accuracy": 0.23910915553569795, "num_tokens": 13828254.0, "step": 7605 }, { "entropy": 5.665197372436523, "epoch": 6.538031800601633, "grad_norm": 1.3125, "learning_rate": 0.00019103559039248302, "loss": 4.962, "mean_token_accuracy": 0.2355642482638359, "num_tokens": 13837163.0, "step": 7610 }, { "entropy": 5.72585711479187, "epoch": 6.542329179200688, "grad_norm": 1.3828125, "learning_rate": 0.0001907272176697052, "loss": 4.9707, "mean_token_accuracy": 0.2294859230518341, "num_tokens": 13846373.0, "step": 7615 }, { "entropy": 5.626835584640503, "epoch": 6.546626557799742, "grad_norm": 1.3046875, "learning_rate": 0.00019041902896500059, "loss": 4.9055, "mean_token_accuracy": 0.23354367166757584, "num_tokens": 13855846.0, "step": 7620 }, { "entropy": 5.670918416976929, "epoch": 6.550923936398797, "grad_norm": 1.3515625, "learning_rate": 0.00019011102495132993, "loss": 4.9116, "mean_token_accuracy": 0.23671811074018478, "num_tokens": 13864723.0, "step": 7625 }, { "entropy": 5.755877733230591, "epoch": 6.555221314997851, "grad_norm": 1.1796875, "learning_rate": 0.00018980320630125104, "loss": 5.0723, "mean_token_accuracy": 0.2211229085922241, "num_tokens": 13873418.0, "step": 7630 }, { "entropy": 5.742153120040894, "epoch": 6.559518693596906, "grad_norm": 1.578125, "learning_rate": 0.00018949557368691666, "loss": 4.9595, "mean_token_accuracy": 0.23428151607513428, "num_tokens": 13881890.0, "step": 7635 }, { "entropy": 5.713848876953125, "epoch": 6.56381607219596, "grad_norm": 1.2421875, "learning_rate": 0.00018918812778007343, "loss": 4.9983, "mean_token_accuracy": 0.230608469247818, "num_tokens": 13891289.0, "step": 7640 }, { "entropy": 5.672276353836059, "epoch": 6.568113450795015, "grad_norm": 1.328125, "learning_rate": 0.00018888086925206054, "loss": 4.9145, "mean_token_accuracy": 0.2302643895149231, "num_tokens": 13900344.0, "step": 7645 }, { "entropy": 5.718430042266846, "epoch": 6.5724108293940695, "grad_norm": 1.4453125, "learning_rate": 0.00018857379877380763, "loss": 4.9505, "mean_token_accuracy": 0.23823903053998946, "num_tokens": 13909108.0, "step": 7650 }, { "entropy": 5.667624759674072, "epoch": 6.576708207993124, "grad_norm": 1.28125, "learning_rate": 0.00018826691701583404, "loss": 4.982, "mean_token_accuracy": 0.2340136095881462, "num_tokens": 13918458.0, "step": 7655 }, { "entropy": 5.681752157211304, "epoch": 6.581005586592179, "grad_norm": 1.390625, "learning_rate": 0.00018796022464824663, "loss": 4.9239, "mean_token_accuracy": 0.2388586074113846, "num_tokens": 13927186.0, "step": 7660 }, { "entropy": 5.710815858840943, "epoch": 6.585302965191233, "grad_norm": 1.359375, "learning_rate": 0.00018765372234073912, "loss": 5.0222, "mean_token_accuracy": 0.22159896939992904, "num_tokens": 13936701.0, "step": 7665 }, { "entropy": 5.685254335403442, "epoch": 6.589600343790288, "grad_norm": 1.484375, "learning_rate": 0.00018734741076259005, "loss": 5.0356, "mean_token_accuracy": 0.2193260222673416, "num_tokens": 13945842.0, "step": 7670 }, { "entropy": 5.625968742370605, "epoch": 6.593897722389342, "grad_norm": 1.234375, "learning_rate": 0.00018704129058266152, "loss": 4.9129, "mean_token_accuracy": 0.23576617240905762, "num_tokens": 13955675.0, "step": 7675 }, { "entropy": 5.746795892715454, "epoch": 6.598195100988397, "grad_norm": 1.3125, "learning_rate": 0.00018673536246939743, "loss": 5.0022, "mean_token_accuracy": 0.23313231617212296, "num_tokens": 13964153.0, "step": 7680 }, { "entropy": 5.680375480651856, "epoch": 6.602492479587451, "grad_norm": 1.421875, "learning_rate": 0.00018642962709082274, "loss": 4.9636, "mean_token_accuracy": 0.23241930603981018, "num_tokens": 13972141.0, "step": 7685 }, { "entropy": 5.625921630859375, "epoch": 6.606789858186506, "grad_norm": 1.4453125, "learning_rate": 0.00018612408511454103, "loss": 4.856, "mean_token_accuracy": 0.23955829739570617, "num_tokens": 13981369.0, "step": 7690 }, { "entropy": 5.666980743408203, "epoch": 6.61108723678556, "grad_norm": 1.3828125, "learning_rate": 0.00018581873720773423, "loss": 4.9019, "mean_token_accuracy": 0.23360517770051956, "num_tokens": 13990626.0, "step": 7695 }, { "entropy": 5.673977088928223, "epoch": 6.615384615384615, "grad_norm": 1.34375, "learning_rate": 0.00018551358403715989, "loss": 5.0046, "mean_token_accuracy": 0.22494971752166748, "num_tokens": 13999399.0, "step": 7700 }, { "entropy": 5.697866153717041, "epoch": 6.61968199398367, "grad_norm": 1.4140625, "learning_rate": 0.00018520862626915052, "loss": 4.8992, "mean_token_accuracy": 0.2335071235895157, "num_tokens": 14007487.0, "step": 7705 }, { "entropy": 5.673949384689331, "epoch": 6.623979372582724, "grad_norm": 1.3046875, "learning_rate": 0.00018490386456961223, "loss": 4.8899, "mean_token_accuracy": 0.23473947495222092, "num_tokens": 14016779.0, "step": 7710 }, { "entropy": 5.728198099136352, "epoch": 6.6282767511817795, "grad_norm": 1.3515625, "learning_rate": 0.0001845992996040224, "loss": 4.9727, "mean_token_accuracy": 0.23272737711668015, "num_tokens": 14025770.0, "step": 7715 }, { "entropy": 5.64153265953064, "epoch": 6.632574129780834, "grad_norm": 1.34375, "learning_rate": 0.00018429493203742946, "loss": 4.8628, "mean_token_accuracy": 0.2445816546678543, "num_tokens": 14035304.0, "step": 7720 }, { "entropy": 5.660477972030639, "epoch": 6.636871508379889, "grad_norm": 1.515625, "learning_rate": 0.00018399076253445052, "loss": 4.8205, "mean_token_accuracy": 0.2434508979320526, "num_tokens": 14044058.0, "step": 7725 }, { "entropy": 5.669250535964966, "epoch": 6.641168886978943, "grad_norm": 1.3359375, "learning_rate": 0.00018368679175927012, "loss": 4.8682, "mean_token_accuracy": 0.24331035017967223, "num_tokens": 14052709.0, "step": 7730 }, { "entropy": 5.617627811431885, "epoch": 6.645466265577998, "grad_norm": 1.390625, "learning_rate": 0.00018338302037563885, "loss": 4.8264, "mean_token_accuracy": 0.2451078027486801, "num_tokens": 14061346.0, "step": 7735 }, { "entropy": 5.628537797927857, "epoch": 6.649763644177052, "grad_norm": 1.4375, "learning_rate": 0.00018307944904687211, "loss": 4.9163, "mean_token_accuracy": 0.2382143720984459, "num_tokens": 14070409.0, "step": 7740 }, { "entropy": 5.7070694923400875, "epoch": 6.654061022776107, "grad_norm": 1.5625, "learning_rate": 0.0001827760784358483, "loss": 4.9999, "mean_token_accuracy": 0.22305570393800736, "num_tokens": 14079448.0, "step": 7745 }, { "entropy": 5.732889890670776, "epoch": 6.658358401375161, "grad_norm": 1.3203125, "learning_rate": 0.00018247290920500776, "loss": 4.9949, "mean_token_accuracy": 0.23120992481708527, "num_tokens": 14088452.0, "step": 7750 }, { "entropy": 5.68429913520813, "epoch": 6.662655779974216, "grad_norm": 1.421875, "learning_rate": 0.00018216994201635062, "loss": 4.9634, "mean_token_accuracy": 0.23368675857782364, "num_tokens": 14097956.0, "step": 7755 }, { "entropy": 5.722675609588623, "epoch": 6.66695315857327, "grad_norm": 1.40625, "learning_rate": 0.00018186717753143633, "loss": 4.9065, "mean_token_accuracy": 0.23614079207181932, "num_tokens": 14106771.0, "step": 7760 }, { "entropy": 5.715720987319946, "epoch": 6.671250537172325, "grad_norm": 1.2578125, "learning_rate": 0.00018156461641138133, "loss": 4.9715, "mean_token_accuracy": 0.227101469039917, "num_tokens": 14115871.0, "step": 7765 }, { "entropy": 5.6637770652771, "epoch": 6.6755479157713795, "grad_norm": 1.2890625, "learning_rate": 0.00018126225931685836, "loss": 4.9167, "mean_token_accuracy": 0.23606135249137877, "num_tokens": 14125316.0, "step": 7770 }, { "entropy": 5.600428295135498, "epoch": 6.679845294370434, "grad_norm": 1.453125, "learning_rate": 0.00018096010690809444, "loss": 4.8643, "mean_token_accuracy": 0.24022420942783357, "num_tokens": 14134244.0, "step": 7775 }, { "entropy": 5.712977886199951, "epoch": 6.684142672969489, "grad_norm": 1.3203125, "learning_rate": 0.00018065815984486962, "loss": 4.9966, "mean_token_accuracy": 0.22435731440782547, "num_tokens": 14143600.0, "step": 7780 }, { "entropy": 5.677579307556153, "epoch": 6.688440051568543, "grad_norm": 1.3671875, "learning_rate": 0.00018035641878651548, "loss": 4.8938, "mean_token_accuracy": 0.2324790060520172, "num_tokens": 14152112.0, "step": 7785 }, { "entropy": 5.643826341629028, "epoch": 6.692737430167598, "grad_norm": 1.4921875, "learning_rate": 0.00018005488439191408, "loss": 4.8629, "mean_token_accuracy": 0.23934897482395173, "num_tokens": 14161044.0, "step": 7790 }, { "entropy": 5.67745008468628, "epoch": 6.697034808766652, "grad_norm": 1.46875, "learning_rate": 0.0001797535573194959, "loss": 5.0141, "mean_token_accuracy": 0.23301784992218016, "num_tokens": 14169629.0, "step": 7795 }, { "entropy": 5.669470548629761, "epoch": 6.701332187365707, "grad_norm": 1.21875, "learning_rate": 0.0001794524382272389, "loss": 4.9116, "mean_token_accuracy": 0.23372034281492232, "num_tokens": 14179234.0, "step": 7800 }, { "entropy": 5.674129343032837, "epoch": 6.705629565964761, "grad_norm": 1.1953125, "learning_rate": 0.0001791515277726667, "loss": 4.9562, "mean_token_accuracy": 0.24207875579595567, "num_tokens": 14188887.0, "step": 7805 }, { "entropy": 5.629077529907226, "epoch": 6.709926944563816, "grad_norm": 1.3203125, "learning_rate": 0.00017885082661284763, "loss": 4.8625, "mean_token_accuracy": 0.2409089684486389, "num_tokens": 14197731.0, "step": 7810 }, { "entropy": 5.685734653472901, "epoch": 6.71422432316287, "grad_norm": 1.2265625, "learning_rate": 0.00017855033540439274, "loss": 4.9971, "mean_token_accuracy": 0.22851601541042327, "num_tokens": 14206851.0, "step": 7815 }, { "entropy": 5.666434955596924, "epoch": 6.718521701761925, "grad_norm": 1.5078125, "learning_rate": 0.00017825005480345463, "loss": 4.9914, "mean_token_accuracy": 0.23279282301664353, "num_tokens": 14215879.0, "step": 7820 }, { "entropy": 5.664918899536133, "epoch": 6.7228190803609795, "grad_norm": 1.4296875, "learning_rate": 0.00017794998546572627, "loss": 4.8397, "mean_token_accuracy": 0.244611656665802, "num_tokens": 14223798.0, "step": 7825 }, { "entropy": 5.720886945724487, "epoch": 6.727116458960034, "grad_norm": 1.203125, "learning_rate": 0.0001776501280464391, "loss": 4.9608, "mean_token_accuracy": 0.2350848749279976, "num_tokens": 14233234.0, "step": 7830 }, { "entropy": 5.684752702713013, "epoch": 6.731413837559089, "grad_norm": 1.46875, "learning_rate": 0.00017735048320036197, "loss": 4.904, "mean_token_accuracy": 0.228993958234787, "num_tokens": 14241851.0, "step": 7835 }, { "entropy": 5.690377855300904, "epoch": 6.735711216158143, "grad_norm": 1.46875, "learning_rate": 0.00017705105158179917, "loss": 5.0018, "mean_token_accuracy": 0.22215719074010848, "num_tokens": 14251578.0, "step": 7840 }, { "entropy": 5.726417493820191, "epoch": 6.740008594757198, "grad_norm": 1.25, "learning_rate": 0.00017675183384458987, "loss": 5.0221, "mean_token_accuracy": 0.22361525744199753, "num_tokens": 14261122.0, "step": 7845 }, { "entropy": 5.738840818405151, "epoch": 6.744305973356253, "grad_norm": 1.5, "learning_rate": 0.00017645283064210616, "loss": 4.9072, "mean_token_accuracy": 0.2399264469742775, "num_tokens": 14270594.0, "step": 7850 }, { "entropy": 5.689656162261963, "epoch": 6.748603351955307, "grad_norm": 1.390625, "learning_rate": 0.00017615404262725132, "loss": 4.9534, "mean_token_accuracy": 0.23406831324100494, "num_tokens": 14279646.0, "step": 7855 }, { "entropy": 5.686764335632324, "epoch": 6.752900730554362, "grad_norm": 1.4296875, "learning_rate": 0.00017585547045245885, "loss": 4.8961, "mean_token_accuracy": 0.241571407020092, "num_tokens": 14288555.0, "step": 7860 }, { "entropy": 5.686032819747925, "epoch": 6.757198109153417, "grad_norm": 1.4296875, "learning_rate": 0.00017555711476969138, "loss": 4.9301, "mean_token_accuracy": 0.23303429484367372, "num_tokens": 14297813.0, "step": 7865 }, { "entropy": 5.6986595630645756, "epoch": 6.761495487752471, "grad_norm": 1.453125, "learning_rate": 0.00017525897623043806, "loss": 4.9084, "mean_token_accuracy": 0.2414133533835411, "num_tokens": 14305779.0, "step": 7870 }, { "entropy": 5.703876495361328, "epoch": 6.765792866351526, "grad_norm": 1.46875, "learning_rate": 0.00017496105548571472, "loss": 5.0062, "mean_token_accuracy": 0.220834356546402, "num_tokens": 14314419.0, "step": 7875 }, { "entropy": 5.679584407806397, "epoch": 6.77009024495058, "grad_norm": 1.4609375, "learning_rate": 0.00017466335318606086, "loss": 4.9296, "mean_token_accuracy": 0.23031468242406844, "num_tokens": 14322959.0, "step": 7880 }, { "entropy": 5.710860538482666, "epoch": 6.774387623549635, "grad_norm": 1.21875, "learning_rate": 0.00017436586998153947, "loss": 4.9121, "mean_token_accuracy": 0.237626251578331, "num_tokens": 14332270.0, "step": 7885 }, { "entropy": 5.667410135269165, "epoch": 6.7786850021486895, "grad_norm": 1.2421875, "learning_rate": 0.00017406860652173495, "loss": 4.9146, "mean_token_accuracy": 0.2343543827533722, "num_tokens": 14341701.0, "step": 7890 }, { "entropy": 5.712936305999756, "epoch": 6.782982380747744, "grad_norm": 1.5078125, "learning_rate": 0.00017377156345575176, "loss": 4.9341, "mean_token_accuracy": 0.2295023277401924, "num_tokens": 14349551.0, "step": 7895 }, { "entropy": 5.582612323760986, "epoch": 6.787279759346799, "grad_norm": 1.375, "learning_rate": 0.00017347474143221338, "loss": 4.8532, "mean_token_accuracy": 0.2416646361351013, "num_tokens": 14358577.0, "step": 7900 }, { "entropy": 5.747670841217041, "epoch": 6.791577137945853, "grad_norm": 1.3203125, "learning_rate": 0.00017317814109926044, "loss": 4.98, "mean_token_accuracy": 0.2275027111172676, "num_tokens": 14367862.0, "step": 7905 }, { "entropy": 5.605372095108033, "epoch": 6.795874516544908, "grad_norm": 1.21875, "learning_rate": 0.0001728817631045495, "loss": 4.9242, "mean_token_accuracy": 0.23678470700979232, "num_tokens": 14377414.0, "step": 7910 }, { "entropy": 5.592658567428589, "epoch": 6.800171895143962, "grad_norm": 1.28125, "learning_rate": 0.0001725856080952516, "loss": 4.8519, "mean_token_accuracy": 0.250497567653656, "num_tokens": 14387239.0, "step": 7915 }, { "entropy": 5.6137172222137455, "epoch": 6.804469273743017, "grad_norm": 1.375, "learning_rate": 0.0001722896767180509, "loss": 4.8798, "mean_token_accuracy": 0.24150066673755646, "num_tokens": 14396076.0, "step": 7920 }, { "entropy": 5.676685476303101, "epoch": 6.808766652342071, "grad_norm": 1.2734375, "learning_rate": 0.00017199396961914334, "loss": 4.9487, "mean_token_accuracy": 0.2327686607837677, "num_tokens": 14404982.0, "step": 7925 }, { "entropy": 5.641691255569458, "epoch": 6.813064030941126, "grad_norm": 1.3984375, "learning_rate": 0.00017169848744423506, "loss": 4.8727, "mean_token_accuracy": 0.2303582951426506, "num_tokens": 14413364.0, "step": 7930 }, { "entropy": 5.735635137557983, "epoch": 6.81736140954018, "grad_norm": 1.2109375, "learning_rate": 0.00017140323083854076, "loss": 5.0197, "mean_token_accuracy": 0.23055720925331116, "num_tokens": 14424279.0, "step": 7935 }, { "entropy": 5.668535566329956, "epoch": 6.821658788139235, "grad_norm": 1.3828125, "learning_rate": 0.00017110820044678317, "loss": 4.994, "mean_token_accuracy": 0.22530962973833085, "num_tokens": 14432931.0, "step": 7940 }, { "entropy": 5.685610580444336, "epoch": 6.8259561667382895, "grad_norm": 1.3203125, "learning_rate": 0.00017081339691319054, "loss": 4.9347, "mean_token_accuracy": 0.2323356956243515, "num_tokens": 14442652.0, "step": 7945 }, { "entropy": 5.704575395584106, "epoch": 6.830253545337344, "grad_norm": 1.296875, "learning_rate": 0.00017051882088149612, "loss": 4.9181, "mean_token_accuracy": 0.2219302698969841, "num_tokens": 14452061.0, "step": 7950 }, { "entropy": 5.685388898849487, "epoch": 6.834550923936399, "grad_norm": 1.3046875, "learning_rate": 0.00017022447299493599, "loss": 4.8898, "mean_token_accuracy": 0.23219588547945022, "num_tokens": 14460771.0, "step": 7955 }, { "entropy": 5.70946888923645, "epoch": 6.838848302535453, "grad_norm": 1.28125, "learning_rate": 0.00016993035389624854, "loss": 5.0197, "mean_token_accuracy": 0.22667427957057953, "num_tokens": 14469983.0, "step": 7960 }, { "entropy": 5.669454717636109, "epoch": 6.843145681134508, "grad_norm": 1.3984375, "learning_rate": 0.0001696364642276722, "loss": 4.9164, "mean_token_accuracy": 0.23280643969774245, "num_tokens": 14478641.0, "step": 7965 }, { "entropy": 5.699934196472168, "epoch": 6.847443059733562, "grad_norm": 1.265625, "learning_rate": 0.00016934280463094448, "loss": 4.9913, "mean_token_accuracy": 0.2324465572834015, "num_tokens": 14487922.0, "step": 7970 }, { "entropy": 5.636373853683471, "epoch": 6.851740438332617, "grad_norm": 1.5625, "learning_rate": 0.00016904937574730062, "loss": 4.8422, "mean_token_accuracy": 0.24668496400117873, "num_tokens": 14496259.0, "step": 7975 }, { "entropy": 5.7063679695129395, "epoch": 6.856037816931671, "grad_norm": 1.5625, "learning_rate": 0.00016875617821747208, "loss": 4.9453, "mean_token_accuracy": 0.229776830971241, "num_tokens": 14504366.0, "step": 7980 }, { "entropy": 5.63564167022705, "epoch": 6.860335195530726, "grad_norm": 1.203125, "learning_rate": 0.00016846321268168508, "loss": 4.9484, "mean_token_accuracy": 0.23471348434686662, "num_tokens": 14513561.0, "step": 7985 }, { "entropy": 5.688099098205567, "epoch": 6.8646325741297805, "grad_norm": 1.34375, "learning_rate": 0.00016817047977965905, "loss": 4.8929, "mean_token_accuracy": 0.23645369410514833, "num_tokens": 14522531.0, "step": 7990 }, { "entropy": 5.707848453521729, "epoch": 6.868929952728836, "grad_norm": 1.4296875, "learning_rate": 0.0001678779801506058, "loss": 4.9288, "mean_token_accuracy": 0.2295157477259636, "num_tokens": 14531375.0, "step": 7995 }, { "entropy": 5.671221542358398, "epoch": 6.8732273313278895, "grad_norm": 1.328125, "learning_rate": 0.00016758571443322774, "loss": 4.9612, "mean_token_accuracy": 0.2283916264772415, "num_tokens": 14541081.0, "step": 8000 }, { "epoch": 6.8732273313278895, "eval_entropy": 5.498058859292452, "eval_loss": 5.869622707366943, "eval_mean_token_accuracy": 0.18391755706555135, "eval_num_tokens": 14541081.0, "eval_runtime": 2.047, "eval_samples_per_second": 1733.741, "eval_steps_per_second": 216.901, "step": 8000 }, { "entropy": 5.61028299331665, "epoch": 6.877524709926945, "grad_norm": 1.3046875, "learning_rate": 0.0001672936832657162, "loss": 4.8721, "mean_token_accuracy": 0.24735442847013472, "num_tokens": 14550129.0, "step": 8005 }, { "entropy": 5.746318817138672, "epoch": 6.8818220885259995, "grad_norm": 1.53125, "learning_rate": 0.00016700188728575047, "loss": 5.0547, "mean_token_accuracy": 0.2224505975842476, "num_tokens": 14559051.0, "step": 8010 }, { "entropy": 5.652448987960815, "epoch": 6.886119467125054, "grad_norm": 1.3046875, "learning_rate": 0.00016671032713049655, "loss": 4.9551, "mean_token_accuracy": 0.2368941769003868, "num_tokens": 14567719.0, "step": 8015 }, { "entropy": 5.7099028587341305, "epoch": 6.890416845724109, "grad_norm": 1.40625, "learning_rate": 0.00016641900343660515, "loss": 4.9827, "mean_token_accuracy": 0.23374326676130294, "num_tokens": 14576256.0, "step": 8020 }, { "entropy": 5.70998477935791, "epoch": 6.894714224323163, "grad_norm": 1.1328125, "learning_rate": 0.0001661279168402107, "loss": 4.9877, "mean_token_accuracy": 0.22852143943309783, "num_tokens": 14586392.0, "step": 8025 }, { "entropy": 5.655804634094238, "epoch": 6.899011602922218, "grad_norm": 1.4453125, "learning_rate": 0.00016583706797693008, "loss": 4.9559, "mean_token_accuracy": 0.23013717234134673, "num_tokens": 14595448.0, "step": 8030 }, { "entropy": 5.681487083435059, "epoch": 6.903308981521272, "grad_norm": 1.328125, "learning_rate": 0.00016554645748186105, "loss": 4.9868, "mean_token_accuracy": 0.23257408142089844, "num_tokens": 14604242.0, "step": 8035 }, { "entropy": 5.766960859298706, "epoch": 6.907606360120327, "grad_norm": 1.328125, "learning_rate": 0.00016525608598958063, "loss": 5.0582, "mean_token_accuracy": 0.2256121054291725, "num_tokens": 14614983.0, "step": 8040 }, { "entropy": 5.7032266616821286, "epoch": 6.911903738719381, "grad_norm": 1.359375, "learning_rate": 0.00016496595413414421, "loss": 4.9188, "mean_token_accuracy": 0.23416159003973008, "num_tokens": 14624748.0, "step": 8045 }, { "entropy": 5.662182235717774, "epoch": 6.916201117318436, "grad_norm": 1.1875, "learning_rate": 0.00016467606254908355, "loss": 4.8949, "mean_token_accuracy": 0.23815073519945146, "num_tokens": 14633642.0, "step": 8050 }, { "entropy": 5.694502353668213, "epoch": 6.9204984959174904, "grad_norm": 1.2890625, "learning_rate": 0.00016438641186740632, "loss": 4.9805, "mean_token_accuracy": 0.22760697454214096, "num_tokens": 14642549.0, "step": 8055 }, { "entropy": 5.6843287467956545, "epoch": 6.924795874516545, "grad_norm": 1.3359375, "learning_rate": 0.00016409700272159371, "loss": 4.9796, "mean_token_accuracy": 0.2324282094836235, "num_tokens": 14651642.0, "step": 8060 }, { "entropy": 5.724811697006226, "epoch": 6.9290932531155995, "grad_norm": 1.25, "learning_rate": 0.00016380783574359957, "loss": 4.9666, "mean_token_accuracy": 0.23225385397672654, "num_tokens": 14661052.0, "step": 8065 }, { "entropy": 5.673737716674805, "epoch": 6.933390631714654, "grad_norm": 1.4453125, "learning_rate": 0.0001635189115648491, "loss": 4.9513, "mean_token_accuracy": 0.2355537384748459, "num_tokens": 14670292.0, "step": 8070 }, { "entropy": 5.640899229049682, "epoch": 6.937688010313709, "grad_norm": 1.3828125, "learning_rate": 0.00016323023081623705, "loss": 4.872, "mean_token_accuracy": 0.24542313069105148, "num_tokens": 14679735.0, "step": 8075 }, { "entropy": 5.714216613769532, "epoch": 6.941985388912763, "grad_norm": 1.3046875, "learning_rate": 0.00016294179412812702, "loss": 4.9885, "mean_token_accuracy": 0.22407402098178864, "num_tokens": 14688710.0, "step": 8080 }, { "entropy": 5.703889846801758, "epoch": 6.946282767511818, "grad_norm": 1.3203125, "learning_rate": 0.00016265360213034923, "loss": 5.0529, "mean_token_accuracy": 0.21957808136940002, "num_tokens": 14698523.0, "step": 8085 }, { "entropy": 5.630609178543091, "epoch": 6.950580146110872, "grad_norm": 1.1953125, "learning_rate": 0.00016236565545220007, "loss": 4.9523, "mean_token_accuracy": 0.2298360839486122, "num_tokens": 14707674.0, "step": 8090 }, { "entropy": 5.645981884002685, "epoch": 6.954877524709927, "grad_norm": 1.25, "learning_rate": 0.00016207795472243975, "loss": 4.868, "mean_token_accuracy": 0.2401219502091408, "num_tokens": 14716600.0, "step": 8095 }, { "entropy": 5.747059631347656, "epoch": 6.959174903308981, "grad_norm": 1.4921875, "learning_rate": 0.00016179050056929173, "loss": 5.0359, "mean_token_accuracy": 0.22199208289384842, "num_tokens": 14726112.0, "step": 8100 }, { "entropy": 5.6999822616577145, "epoch": 6.963472281908036, "grad_norm": 1.3828125, "learning_rate": 0.00016150329362044102, "loss": 4.9225, "mean_token_accuracy": 0.23735381215810775, "num_tokens": 14735126.0, "step": 8105 }, { "entropy": 5.706126689910889, "epoch": 6.9677696605070905, "grad_norm": 1.25, "learning_rate": 0.00016121633450303285, "loss": 4.9392, "mean_token_accuracy": 0.22967226356267928, "num_tokens": 14744346.0, "step": 8110 }, { "entropy": 5.784051847457886, "epoch": 6.972067039106145, "grad_norm": 1.2734375, "learning_rate": 0.00016092962384367122, "loss": 5.0323, "mean_token_accuracy": 0.22371545135974885, "num_tokens": 14753322.0, "step": 8115 }, { "entropy": 5.647177982330322, "epoch": 6.9763644177051995, "grad_norm": 1.3125, "learning_rate": 0.0001606431622684176, "loss": 4.9358, "mean_token_accuracy": 0.23805618584156035, "num_tokens": 14762384.0, "step": 8120 }, { "entropy": 5.661736106872558, "epoch": 6.980661796304254, "grad_norm": 1.3046875, "learning_rate": 0.00016035695040278935, "loss": 4.9693, "mean_token_accuracy": 0.23804220259189607, "num_tokens": 14771451.0, "step": 8125 }, { "entropy": 5.720511436462402, "epoch": 6.984959174903309, "grad_norm": 1.2578125, "learning_rate": 0.00016007098887175914, "loss": 4.9862, "mean_token_accuracy": 0.2279539957642555, "num_tokens": 14780662.0, "step": 8130 }, { "entropy": 5.706746292114258, "epoch": 6.989256553502363, "grad_norm": 1.4765625, "learning_rate": 0.00015978527829975254, "loss": 4.9077, "mean_token_accuracy": 0.2372632071375847, "num_tokens": 14789201.0, "step": 8135 }, { "entropy": 5.678561067581176, "epoch": 6.993553932101419, "grad_norm": 1.40625, "learning_rate": 0.00015949981931064714, "loss": 4.9418, "mean_token_accuracy": 0.23557900339365007, "num_tokens": 14797857.0, "step": 8140 }, { "entropy": 5.709754991531372, "epoch": 6.997851310700472, "grad_norm": 1.5234375, "learning_rate": 0.0001592146125277714, "loss": 4.9405, "mean_token_accuracy": 0.23237190246582032, "num_tokens": 14806271.0, "step": 8145 }, { "entropy": 5.701049168904622, "epoch": 7.001718951439622, "grad_norm": 1.203125, "learning_rate": 0.00015892965857390278, "loss": 5.0242, "mean_token_accuracy": 0.22801572581132254, "num_tokens": 14815568.0, "step": 8150 }, { "entropy": 5.726880693435669, "epoch": 7.006016330038676, "grad_norm": 1.3125, "learning_rate": 0.00015864495807126704, "loss": 4.9097, "mean_token_accuracy": 0.24267207831144333, "num_tokens": 14825140.0, "step": 8155 }, { "entropy": 5.735155391693115, "epoch": 7.010313708637731, "grad_norm": 1.25, "learning_rate": 0.00015836051164153602, "loss": 4.9041, "mean_token_accuracy": 0.24150658994913102, "num_tokens": 14834459.0, "step": 8160 }, { "entropy": 5.716802549362183, "epoch": 7.014611087236785, "grad_norm": 1.4296875, "learning_rate": 0.00015807631990582733, "loss": 4.8455, "mean_token_accuracy": 0.24123812317848206, "num_tokens": 14843632.0, "step": 8165 }, { "entropy": 5.745954322814941, "epoch": 7.01890846583584, "grad_norm": 1.296875, "learning_rate": 0.00015779238348470192, "loss": 4.8292, "mean_token_accuracy": 0.24553413093090057, "num_tokens": 14852626.0, "step": 8170 }, { "entropy": 5.627586793899536, "epoch": 7.023205844434894, "grad_norm": 1.453125, "learning_rate": 0.00015750870299816345, "loss": 4.7871, "mean_token_accuracy": 0.2367524728178978, "num_tokens": 14861571.0, "step": 8175 }, { "entropy": 5.648326206207275, "epoch": 7.027503223033949, "grad_norm": 1.34375, "learning_rate": 0.00015722527906565672, "loss": 4.772, "mean_token_accuracy": 0.25098019242286684, "num_tokens": 14870383.0, "step": 8180 }, { "entropy": 5.6729450702667235, "epoch": 7.0318006016330035, "grad_norm": 1.4921875, "learning_rate": 0.00015694211230606647, "loss": 4.8441, "mean_token_accuracy": 0.2389305979013443, "num_tokens": 14880212.0, "step": 8185 }, { "entropy": 5.6791423797607425, "epoch": 7.036097980232058, "grad_norm": 1.3359375, "learning_rate": 0.00015665920333771564, "loss": 4.8316, "mean_token_accuracy": 0.24294765591621398, "num_tokens": 14889347.0, "step": 8190 }, { "entropy": 5.663656330108642, "epoch": 7.0403953588311134, "grad_norm": 1.2890625, "learning_rate": 0.00015637655277836427, "loss": 4.8571, "mean_token_accuracy": 0.23541874438524246, "num_tokens": 14898553.0, "step": 8195 }, { "entropy": 5.682969284057617, "epoch": 7.044692737430168, "grad_norm": 1.3359375, "learning_rate": 0.0001560941612452081, "loss": 4.8544, "mean_token_accuracy": 0.24519650787115096, "num_tokens": 14907275.0, "step": 8200 }, { "entropy": 5.677180910110474, "epoch": 7.0489901160292225, "grad_norm": 1.4140625, "learning_rate": 0.0001558120293548777, "loss": 4.8605, "mean_token_accuracy": 0.2467809095978737, "num_tokens": 14916409.0, "step": 8205 }, { "entropy": 5.740977430343628, "epoch": 7.053287494628277, "grad_norm": 1.4296875, "learning_rate": 0.00015553015772343614, "loss": 4.9298, "mean_token_accuracy": 0.22942662239074707, "num_tokens": 14927144.0, "step": 8210 }, { "entropy": 5.629847240447998, "epoch": 7.057584873227332, "grad_norm": 1.5234375, "learning_rate": 0.00015524854696637847, "loss": 4.771, "mean_token_accuracy": 0.24928097873926164, "num_tokens": 14936310.0, "step": 8215 }, { "entropy": 5.629301357269287, "epoch": 7.061882251826386, "grad_norm": 1.25, "learning_rate": 0.00015496719769862981, "loss": 4.7825, "mean_token_accuracy": 0.2499506175518036, "num_tokens": 14945571.0, "step": 8220 }, { "entropy": 5.636851263046265, "epoch": 7.066179630425441, "grad_norm": 1.4453125, "learning_rate": 0.00015468611053454478, "loss": 4.7934, "mean_token_accuracy": 0.248419189453125, "num_tokens": 14954586.0, "step": 8225 }, { "entropy": 5.662562465667724, "epoch": 7.070477009024495, "grad_norm": 1.5625, "learning_rate": 0.00015440528608790533, "loss": 4.8692, "mean_token_accuracy": 0.24275394678115844, "num_tokens": 14963048.0, "step": 8230 }, { "entropy": 5.684344291687012, "epoch": 7.07477438762355, "grad_norm": 1.3671875, "learning_rate": 0.0001541247249719197, "loss": 4.8617, "mean_token_accuracy": 0.2369182214140892, "num_tokens": 14972158.0, "step": 8235 }, { "entropy": 5.675915765762329, "epoch": 7.079071766222604, "grad_norm": 1.3359375, "learning_rate": 0.00015384442779922135, "loss": 4.8169, "mean_token_accuracy": 0.24358518421649933, "num_tokens": 14980869.0, "step": 8240 }, { "entropy": 5.678097248077393, "epoch": 7.083369144821659, "grad_norm": 1.5625, "learning_rate": 0.00015356439518186726, "loss": 4.8092, "mean_token_accuracy": 0.2419195994734764, "num_tokens": 14989103.0, "step": 8245 }, { "entropy": 5.622421312332153, "epoch": 7.0876665234207135, "grad_norm": 1.15625, "learning_rate": 0.00015328462773133672, "loss": 4.8322, "mean_token_accuracy": 0.25122032314538956, "num_tokens": 14999378.0, "step": 8250 }, { "entropy": 5.662282085418701, "epoch": 7.091963902019768, "grad_norm": 1.3984375, "learning_rate": 0.00015300512605852977, "loss": 4.8865, "mean_token_accuracy": 0.2370702803134918, "num_tokens": 15007971.0, "step": 8255 }, { "entropy": 5.734650087356568, "epoch": 7.0962612806188226, "grad_norm": 1.265625, "learning_rate": 0.0001527258907737668, "loss": 4.9063, "mean_token_accuracy": 0.23673705607652665, "num_tokens": 15017537.0, "step": 8260 }, { "entropy": 5.62396354675293, "epoch": 7.100558659217877, "grad_norm": 1.5234375, "learning_rate": 0.00015244692248678586, "loss": 4.7927, "mean_token_accuracy": 0.25119365751743317, "num_tokens": 15025684.0, "step": 8265 }, { "entropy": 5.6647412300109865, "epoch": 7.104856037816932, "grad_norm": 1.484375, "learning_rate": 0.0001521682218067421, "loss": 4.826, "mean_token_accuracy": 0.24565030932426452, "num_tokens": 15034753.0, "step": 8270 }, { "entropy": 5.6979693412780765, "epoch": 7.109153416415986, "grad_norm": 1.3359375, "learning_rate": 0.00015188978934220642, "loss": 4.8555, "mean_token_accuracy": 0.24527978003025055, "num_tokens": 15044685.0, "step": 8275 }, { "entropy": 5.721617269515991, "epoch": 7.113450795015041, "grad_norm": 1.2421875, "learning_rate": 0.0001516116257011641, "loss": 4.9024, "mean_token_accuracy": 0.2421650305390358, "num_tokens": 15054853.0, "step": 8280 }, { "entropy": 5.668715095520019, "epoch": 7.117748173614095, "grad_norm": 1.1953125, "learning_rate": 0.0001513337314910134, "loss": 4.8429, "mean_token_accuracy": 0.2332398548722267, "num_tokens": 15065244.0, "step": 8285 }, { "entropy": 5.626372385025024, "epoch": 7.12204555221315, "grad_norm": 1.40625, "learning_rate": 0.00015105610731856416, "loss": 4.7841, "mean_token_accuracy": 0.2513157859444618, "num_tokens": 15074046.0, "step": 8290 }, { "entropy": 5.66493592262268, "epoch": 7.126342930812204, "grad_norm": 1.3984375, "learning_rate": 0.00015077875379003653, "loss": 4.8261, "mean_token_accuracy": 0.245232991874218, "num_tokens": 15083518.0, "step": 8295 }, { "entropy": 5.6777307987213135, "epoch": 7.130640309411259, "grad_norm": 1.265625, "learning_rate": 0.00015050167151105988, "loss": 4.8787, "mean_token_accuracy": 0.2450040340423584, "num_tokens": 15092512.0, "step": 8300 }, { "entropy": 5.720175552368164, "epoch": 7.1349376880103135, "grad_norm": 1.5625, "learning_rate": 0.000150224861086671, "loss": 4.8636, "mean_token_accuracy": 0.24103742837905884, "num_tokens": 15101722.0, "step": 8305 }, { "entropy": 5.65927529335022, "epoch": 7.139235066609368, "grad_norm": 1.3515625, "learning_rate": 0.00014994832312131332, "loss": 4.8115, "mean_token_accuracy": 0.2433381572365761, "num_tokens": 15110114.0, "step": 8310 }, { "entropy": 5.647584772109985, "epoch": 7.143532445208423, "grad_norm": 1.40625, "learning_rate": 0.00014967205821883532, "loss": 4.8898, "mean_token_accuracy": 0.24128275364637375, "num_tokens": 15119461.0, "step": 8315 }, { "entropy": 5.7003998279571535, "epoch": 7.147829823807477, "grad_norm": 1.5078125, "learning_rate": 0.000149396066982489, "loss": 4.8247, "mean_token_accuracy": 0.24309603422880172, "num_tokens": 15127518.0, "step": 8320 }, { "entropy": 5.69748592376709, "epoch": 7.152127202406532, "grad_norm": 1.3203125, "learning_rate": 0.00014912035001492897, "loss": 4.9025, "mean_token_accuracy": 0.23496294468641282, "num_tokens": 15136741.0, "step": 8325 }, { "entropy": 5.679088973999024, "epoch": 7.156424581005586, "grad_norm": 1.4609375, "learning_rate": 0.00014884490791821058, "loss": 4.8403, "mean_token_accuracy": 0.24242965281009674, "num_tokens": 15145193.0, "step": 8330 }, { "entropy": 5.643162488937378, "epoch": 7.160721959604641, "grad_norm": 1.3515625, "learning_rate": 0.00014856974129378981, "loss": 4.837, "mean_token_accuracy": 0.24121178388595582, "num_tokens": 15154117.0, "step": 8335 }, { "entropy": 5.69333701133728, "epoch": 7.165019338203695, "grad_norm": 1.296875, "learning_rate": 0.0001482948507425203, "loss": 4.859, "mean_token_accuracy": 0.24258929044008254, "num_tokens": 15163221.0, "step": 8340 }, { "entropy": 5.709475135803222, "epoch": 7.169316716802751, "grad_norm": 1.28125, "learning_rate": 0.00014802023686465314, "loss": 4.9285, "mean_token_accuracy": 0.22908981293439865, "num_tokens": 15173234.0, "step": 8345 }, { "entropy": 5.639783477783203, "epoch": 7.173614095401805, "grad_norm": 1.390625, "learning_rate": 0.00014774590025983523, "loss": 4.8194, "mean_token_accuracy": 0.24694596081972123, "num_tokens": 15181436.0, "step": 8350 }, { "entropy": 5.632948637008667, "epoch": 7.17791147400086, "grad_norm": 1.5546875, "learning_rate": 0.00014747184152710807, "loss": 4.8521, "mean_token_accuracy": 0.24572525024414063, "num_tokens": 15191697.0, "step": 8355 }, { "entropy": 5.6552135944366455, "epoch": 7.182208852599914, "grad_norm": 1.359375, "learning_rate": 0.00014719806126490658, "loss": 4.8327, "mean_token_accuracy": 0.2454862505197525, "num_tokens": 15201563.0, "step": 8360 }, { "entropy": 5.664471912384033, "epoch": 7.186506231198969, "grad_norm": 1.4609375, "learning_rate": 0.0001469245600710573, "loss": 4.822, "mean_token_accuracy": 0.24350446462631226, "num_tokens": 15210886.0, "step": 8365 }, { "entropy": 5.667477607727051, "epoch": 7.1908036097980235, "grad_norm": 1.4765625, "learning_rate": 0.00014665133854277742, "loss": 4.8688, "mean_token_accuracy": 0.23880793303251266, "num_tokens": 15219254.0, "step": 8370 }, { "entropy": 5.644754886627197, "epoch": 7.195100988397078, "grad_norm": 1.359375, "learning_rate": 0.0001463783972766737, "loss": 4.8438, "mean_token_accuracy": 0.24330057799816132, "num_tokens": 15228117.0, "step": 8375 }, { "entropy": 5.608816766738892, "epoch": 7.199398366996133, "grad_norm": 1.4765625, "learning_rate": 0.0001461057368687407, "loss": 4.8249, "mean_token_accuracy": 0.2522394210100174, "num_tokens": 15236621.0, "step": 8380 }, { "entropy": 5.619053030014038, "epoch": 7.203695745595187, "grad_norm": 1.375, "learning_rate": 0.00014583335791435971, "loss": 4.7818, "mean_token_accuracy": 0.24403955191373825, "num_tokens": 15245487.0, "step": 8385 }, { "entropy": 5.628172588348389, "epoch": 7.207993124194242, "grad_norm": 1.3125, "learning_rate": 0.00014556126100829774, "loss": 4.8254, "mean_token_accuracy": 0.24101845026016236, "num_tokens": 15255321.0, "step": 8390 }, { "entropy": 5.609516525268555, "epoch": 7.212290502793296, "grad_norm": 1.34375, "learning_rate": 0.00014528944674470546, "loss": 4.754, "mean_token_accuracy": 0.252737557888031, "num_tokens": 15264788.0, "step": 8395 }, { "entropy": 5.669772291183472, "epoch": 7.216587881392351, "grad_norm": 1.421875, "learning_rate": 0.0001450179157171166, "loss": 4.8532, "mean_token_accuracy": 0.23921219259500504, "num_tokens": 15273448.0, "step": 8400 }, { "entropy": 5.704756450653076, "epoch": 7.220885259991405, "grad_norm": 1.453125, "learning_rate": 0.00014474666851844632, "loss": 4.9219, "mean_token_accuracy": 0.23073083758354188, "num_tokens": 15283071.0, "step": 8405 }, { "entropy": 5.641713428497314, "epoch": 7.22518263859046, "grad_norm": 1.4375, "learning_rate": 0.00014447570574099028, "loss": 4.7482, "mean_token_accuracy": 0.25668143630027773, "num_tokens": 15291537.0, "step": 8410 }, { "entropy": 5.706275081634521, "epoch": 7.229480017189514, "grad_norm": 1.28125, "learning_rate": 0.00014420502797642283, "loss": 4.8257, "mean_token_accuracy": 0.2518530562520027, "num_tokens": 15300531.0, "step": 8415 }, { "entropy": 5.643521165847778, "epoch": 7.233777395788569, "grad_norm": 1.46875, "learning_rate": 0.000143934635815796, "loss": 4.899, "mean_token_accuracy": 0.23624274879693985, "num_tokens": 15309820.0, "step": 8420 }, { "entropy": 5.626271390914917, "epoch": 7.2380747743876235, "grad_norm": 1.5234375, "learning_rate": 0.0001436645298495381, "loss": 4.8423, "mean_token_accuracy": 0.24444258213043213, "num_tokens": 15318604.0, "step": 8425 }, { "entropy": 5.643628692626953, "epoch": 7.242372152986678, "grad_norm": 1.34375, "learning_rate": 0.00014339471066745262, "loss": 4.8571, "mean_token_accuracy": 0.2405830442905426, "num_tokens": 15327737.0, "step": 8430 }, { "entropy": 5.664806270599366, "epoch": 7.246669531585733, "grad_norm": 1.46875, "learning_rate": 0.000143125178858717, "loss": 4.8992, "mean_token_accuracy": 0.24197297692298889, "num_tokens": 15336663.0, "step": 8435 }, { "entropy": 5.715575551986694, "epoch": 7.250966910184787, "grad_norm": 1.3359375, "learning_rate": 0.00014285593501188083, "loss": 4.9137, "mean_token_accuracy": 0.23828656673431398, "num_tokens": 15345278.0, "step": 8440 }, { "entropy": 5.6954974174499515, "epoch": 7.255264288783842, "grad_norm": 1.3359375, "learning_rate": 0.00014258697971486492, "loss": 4.8743, "mean_token_accuracy": 0.2434326246380806, "num_tokens": 15354230.0, "step": 8445 }, { "entropy": 5.684997081756592, "epoch": 7.259561667382896, "grad_norm": 1.40625, "learning_rate": 0.00014231831355496045, "loss": 4.8884, "mean_token_accuracy": 0.24170466512441635, "num_tokens": 15362838.0, "step": 8450 }, { "entropy": 5.619481086730957, "epoch": 7.263859045981951, "grad_norm": 1.2421875, "learning_rate": 0.00014204993711882662, "loss": 4.8517, "mean_token_accuracy": 0.24466632157564164, "num_tokens": 15372593.0, "step": 8455 }, { "entropy": 5.716213417053223, "epoch": 7.268156424581005, "grad_norm": 1.3671875, "learning_rate": 0.0001417818509924906, "loss": 4.9144, "mean_token_accuracy": 0.24032662957906722, "num_tokens": 15381945.0, "step": 8460 }, { "entropy": 5.632898616790771, "epoch": 7.27245380318006, "grad_norm": 1.2890625, "learning_rate": 0.000141514055761345, "loss": 4.8387, "mean_token_accuracy": 0.24242759346961976, "num_tokens": 15391487.0, "step": 8465 }, { "entropy": 5.6765156269073485, "epoch": 7.276751181779114, "grad_norm": 1.4453125, "learning_rate": 0.00014124655201014786, "loss": 4.7872, "mean_token_accuracy": 0.24917674511671067, "num_tokens": 15399891.0, "step": 8470 }, { "entropy": 5.660211992263794, "epoch": 7.281048560378169, "grad_norm": 1.3359375, "learning_rate": 0.00014097934032302037, "loss": 4.8029, "mean_token_accuracy": 0.24828134030103682, "num_tokens": 15408693.0, "step": 8475 }, { "entropy": 5.642815494537354, "epoch": 7.2853459389772235, "grad_norm": 1.390625, "learning_rate": 0.00014071242128344593, "loss": 4.8855, "mean_token_accuracy": 0.2412761390209198, "num_tokens": 15417779.0, "step": 8480 }, { "entropy": 5.628547477722168, "epoch": 7.289643317576278, "grad_norm": 1.4921875, "learning_rate": 0.0001404457954742691, "loss": 4.7916, "mean_token_accuracy": 0.25722613334655764, "num_tokens": 15425826.0, "step": 8485 }, { "entropy": 5.645930242538452, "epoch": 7.2939406961753335, "grad_norm": 1.609375, "learning_rate": 0.00014017946347769423, "loss": 4.8787, "mean_token_accuracy": 0.24464091658592224, "num_tokens": 15435811.0, "step": 8490 }, { "entropy": 5.608988952636719, "epoch": 7.298238074774388, "grad_norm": 1.4609375, "learning_rate": 0.00013991342587528377, "loss": 4.7581, "mean_token_accuracy": 0.25116355568170545, "num_tokens": 15444949.0, "step": 8495 }, { "entropy": 5.619772720336914, "epoch": 7.302535453373443, "grad_norm": 1.3828125, "learning_rate": 0.00013964768324795752, "loss": 4.7767, "mean_token_accuracy": 0.2535047471523285, "num_tokens": 15453398.0, "step": 8500 }, { "epoch": 7.302535453373443, "eval_entropy": 5.460987055623853, "eval_loss": 5.883355140686035, "eval_mean_token_accuracy": 0.18381839069361622, "eval_num_tokens": 15453398.0, "eval_runtime": 2.0476, "eval_samples_per_second": 1733.217, "eval_steps_per_second": 216.835, "step": 8500 }, { "entropy": 5.6177013397216795, "epoch": 7.306832831972497, "grad_norm": 1.2734375, "learning_rate": 0.00013938223617599124, "loss": 4.8873, "mean_token_accuracy": 0.23690903782844544, "num_tokens": 15462785.0, "step": 8505 }, { "entropy": 5.6555908679962155, "epoch": 7.311130210571552, "grad_norm": 1.5, "learning_rate": 0.00013911708523901514, "loss": 4.8803, "mean_token_accuracy": 0.2451162204146385, "num_tokens": 15471718.0, "step": 8510 }, { "entropy": 5.664057064056396, "epoch": 7.315427589170606, "grad_norm": 1.3828125, "learning_rate": 0.00013885223101601303, "loss": 4.8366, "mean_token_accuracy": 0.24271080642938614, "num_tokens": 15480204.0, "step": 8515 }, { "entropy": 5.639170408248901, "epoch": 7.319724967769661, "grad_norm": 1.375, "learning_rate": 0.00013858767408532051, "loss": 4.7954, "mean_token_accuracy": 0.24772258400917052, "num_tokens": 15489388.0, "step": 8520 }, { "entropy": 5.640066719055175, "epoch": 7.324022346368715, "grad_norm": 1.3984375, "learning_rate": 0.00013832341502462432, "loss": 4.8193, "mean_token_accuracy": 0.24168650656938553, "num_tokens": 15498028.0, "step": 8525 }, { "entropy": 5.679781198501587, "epoch": 7.32831972496777, "grad_norm": 1.578125, "learning_rate": 0.00013805945441096057, "loss": 4.8481, "mean_token_accuracy": 0.2431727021932602, "num_tokens": 15506382.0, "step": 8530 }, { "entropy": 5.678671789169312, "epoch": 7.332617103566824, "grad_norm": 1.4765625, "learning_rate": 0.00013779579282071364, "loss": 4.8543, "mean_token_accuracy": 0.2428709551692009, "num_tokens": 15515271.0, "step": 8535 }, { "entropy": 5.62637357711792, "epoch": 7.336914482165879, "grad_norm": 1.359375, "learning_rate": 0.00013753243082961512, "loss": 4.7953, "mean_token_accuracy": 0.25061759501695635, "num_tokens": 15524396.0, "step": 8540 }, { "entropy": 5.611609268188476, "epoch": 7.3412118607649335, "grad_norm": 1.453125, "learning_rate": 0.00013726936901274246, "loss": 4.7813, "mean_token_accuracy": 0.24898689538240432, "num_tokens": 15532829.0, "step": 8545 }, { "entropy": 5.680109977722168, "epoch": 7.345509239363988, "grad_norm": 1.2890625, "learning_rate": 0.0001370066079445174, "loss": 4.8978, "mean_token_accuracy": 0.23813850730657576, "num_tokens": 15541726.0, "step": 8550 }, { "entropy": 5.71290807723999, "epoch": 7.349806617963043, "grad_norm": 1.3359375, "learning_rate": 0.00013674414819870502, "loss": 4.9929, "mean_token_accuracy": 0.22865635603666307, "num_tokens": 15551539.0, "step": 8555 }, { "entropy": 5.651469898223877, "epoch": 7.354103996562097, "grad_norm": 1.5390625, "learning_rate": 0.00013648199034841264, "loss": 4.8606, "mean_token_accuracy": 0.23857547491788864, "num_tokens": 15560147.0, "step": 8560 }, { "entropy": 5.6347579002380375, "epoch": 7.358401375161152, "grad_norm": 1.46875, "learning_rate": 0.0001362201349660882, "loss": 4.8195, "mean_token_accuracy": 0.2504441112279892, "num_tokens": 15568983.0, "step": 8565 }, { "entropy": 5.603701829910278, "epoch": 7.362698753760206, "grad_norm": 1.421875, "learning_rate": 0.0001359585826235192, "loss": 4.8184, "mean_token_accuracy": 0.2471785604953766, "num_tokens": 15578065.0, "step": 8570 }, { "entropy": 5.699456787109375, "epoch": 7.366996132359261, "grad_norm": 1.53125, "learning_rate": 0.00013569733389183126, "loss": 4.9112, "mean_token_accuracy": 0.24275968074798585, "num_tokens": 15587181.0, "step": 8575 }, { "entropy": 5.650845050811768, "epoch": 7.371293510958315, "grad_norm": 1.3359375, "learning_rate": 0.00013543638934148736, "loss": 4.8473, "mean_token_accuracy": 0.24374731332063676, "num_tokens": 15596602.0, "step": 8580 }, { "entropy": 5.729697942733765, "epoch": 7.37559088955737, "grad_norm": 1.484375, "learning_rate": 0.000135175749542286, "loss": 4.8999, "mean_token_accuracy": 0.22945987284183503, "num_tokens": 15605857.0, "step": 8585 }, { "entropy": 5.59364161491394, "epoch": 7.379888268156424, "grad_norm": 1.3046875, "learning_rate": 0.0001349154150633604, "loss": 4.7965, "mean_token_accuracy": 0.2551902085542679, "num_tokens": 15615320.0, "step": 8590 }, { "entropy": 5.695501613616943, "epoch": 7.384185646755479, "grad_norm": 1.3359375, "learning_rate": 0.000134655386473177, "loss": 4.9361, "mean_token_accuracy": 0.22892682403326034, "num_tokens": 15624193.0, "step": 8595 }, { "entropy": 5.583432912826538, "epoch": 7.3884830253545335, "grad_norm": 1.5390625, "learning_rate": 0.00013439566433953427, "loss": 4.795, "mean_token_accuracy": 0.25091494172811507, "num_tokens": 15632924.0, "step": 8600 }, { "entropy": 5.629459381103516, "epoch": 7.392780403953588, "grad_norm": 1.421875, "learning_rate": 0.0001341362492295616, "loss": 4.8, "mean_token_accuracy": 0.25384778827428817, "num_tokens": 15642201.0, "step": 8605 }, { "entropy": 5.688088846206665, "epoch": 7.397077782552643, "grad_norm": 1.578125, "learning_rate": 0.00013387714170971776, "loss": 4.8435, "mean_token_accuracy": 0.2389387384057045, "num_tokens": 15651608.0, "step": 8610 }, { "entropy": 5.7352265357971195, "epoch": 7.401375161151697, "grad_norm": 1.3203125, "learning_rate": 0.00013361834234579012, "loss": 4.9476, "mean_token_accuracy": 0.23288207799196242, "num_tokens": 15661768.0, "step": 8615 }, { "entropy": 5.595257806777954, "epoch": 7.405672539750752, "grad_norm": 1.453125, "learning_rate": 0.0001333598517028931, "loss": 4.8252, "mean_token_accuracy": 0.2436438739299774, "num_tokens": 15670270.0, "step": 8620 }, { "entropy": 5.629942989349365, "epoch": 7.409969918349806, "grad_norm": 1.3671875, "learning_rate": 0.00013310167034546688, "loss": 4.8258, "mean_token_accuracy": 0.24356841742992402, "num_tokens": 15679587.0, "step": 8625 }, { "entropy": 5.671183729171753, "epoch": 7.414267296948861, "grad_norm": 1.46875, "learning_rate": 0.0001328437988372763, "loss": 4.8821, "mean_token_accuracy": 0.23919072449207307, "num_tokens": 15688838.0, "step": 8630 }, { "entropy": 5.709632778167725, "epoch": 7.418564675547916, "grad_norm": 1.4375, "learning_rate": 0.00013258623774140967, "loss": 4.8513, "mean_token_accuracy": 0.24157456457614898, "num_tokens": 15697744.0, "step": 8635 }, { "entropy": 5.636171436309814, "epoch": 7.422862054146971, "grad_norm": 1.5, "learning_rate": 0.00013232898762027766, "loss": 4.8212, "mean_token_accuracy": 0.24941018223762512, "num_tokens": 15707643.0, "step": 8640 }, { "entropy": 5.681140041351318, "epoch": 7.427159432746025, "grad_norm": 1.3671875, "learning_rate": 0.00013207204903561154, "loss": 4.9013, "mean_token_accuracy": 0.24012650102376937, "num_tokens": 15717568.0, "step": 8645 }, { "entropy": 5.616391515731811, "epoch": 7.43145681134508, "grad_norm": 1.296875, "learning_rate": 0.00013181542254846247, "loss": 4.7643, "mean_token_accuracy": 0.2518541321158409, "num_tokens": 15726467.0, "step": 8650 }, { "entropy": 5.694960689544677, "epoch": 7.435754189944134, "grad_norm": 1.4609375, "learning_rate": 0.0001315591087192002, "loss": 4.8946, "mean_token_accuracy": 0.23947711288928986, "num_tokens": 15736533.0, "step": 8655 }, { "entropy": 5.596284627914429, "epoch": 7.440051568543189, "grad_norm": 1.3203125, "learning_rate": 0.00013130310810751162, "loss": 4.8082, "mean_token_accuracy": 0.24334124028682708, "num_tokens": 15745853.0, "step": 8660 }, { "entropy": 5.697447776794434, "epoch": 7.4443489471422435, "grad_norm": 1.453125, "learning_rate": 0.00013104742127239983, "loss": 4.9422, "mean_token_accuracy": 0.23501614928245546, "num_tokens": 15755534.0, "step": 8665 }, { "entropy": 5.714992809295654, "epoch": 7.448646325741298, "grad_norm": 1.2109375, "learning_rate": 0.0001307920487721826, "loss": 4.9252, "mean_token_accuracy": 0.232830011844635, "num_tokens": 15766182.0, "step": 8670 }, { "entropy": 5.657570457458496, "epoch": 7.452943704340353, "grad_norm": 1.4609375, "learning_rate": 0.00013053699116449144, "loss": 4.8346, "mean_token_accuracy": 0.2376754343509674, "num_tokens": 15775454.0, "step": 8675 }, { "entropy": 5.817200994491577, "epoch": 7.457241082939407, "grad_norm": 1.296875, "learning_rate": 0.00013028224900627026, "loss": 4.947, "mean_token_accuracy": 0.22718140333890915, "num_tokens": 15784768.0, "step": 8680 }, { "entropy": 5.735804557800293, "epoch": 7.461538461538462, "grad_norm": 1.1015625, "learning_rate": 0.00013002782285377395, "loss": 4.8814, "mean_token_accuracy": 0.24075218588113784, "num_tokens": 15794255.0, "step": 8685 }, { "entropy": 5.6803868293762205, "epoch": 7.465835840137516, "grad_norm": 1.5234375, "learning_rate": 0.0001297737132625677, "loss": 4.9114, "mean_token_accuracy": 0.23759287297725679, "num_tokens": 15803722.0, "step": 8690 }, { "entropy": 5.709731101989746, "epoch": 7.470133218736571, "grad_norm": 1.6171875, "learning_rate": 0.00012951992078752528, "loss": 4.8673, "mean_token_accuracy": 0.2400722473859787, "num_tokens": 15811819.0, "step": 8695 }, { "entropy": 5.6642265796661375, "epoch": 7.474430597335625, "grad_norm": 1.3203125, "learning_rate": 0.00012926644598282798, "loss": 4.883, "mean_token_accuracy": 0.23496807366609573, "num_tokens": 15821446.0, "step": 8700 }, { "entropy": 5.6967566967010494, "epoch": 7.47872797593468, "grad_norm": 1.3046875, "learning_rate": 0.0001290132894019634, "loss": 4.9032, "mean_token_accuracy": 0.24297993183135985, "num_tokens": 15830585.0, "step": 8705 }, { "entropy": 5.698636674880982, "epoch": 7.483025354533734, "grad_norm": 1.375, "learning_rate": 0.00012876045159772442, "loss": 4.8693, "mean_token_accuracy": 0.24373389929533004, "num_tokens": 15838872.0, "step": 8710 }, { "entropy": 5.6971441268920895, "epoch": 7.487322733132789, "grad_norm": 1.4765625, "learning_rate": 0.00012850793312220766, "loss": 4.8712, "mean_token_accuracy": 0.23879421204328538, "num_tokens": 15847561.0, "step": 8715 }, { "entropy": 5.61161732673645, "epoch": 7.4916201117318435, "grad_norm": 1.2734375, "learning_rate": 0.00012825573452681266, "loss": 4.8183, "mean_token_accuracy": 0.25333032459020616, "num_tokens": 15856405.0, "step": 8720 }, { "entropy": 5.675054311752319, "epoch": 7.495917490330898, "grad_norm": 1.4921875, "learning_rate": 0.00012800385636224017, "loss": 4.8783, "mean_token_accuracy": 0.2412225589156151, "num_tokens": 15865856.0, "step": 8725 }, { "entropy": 5.697736120223999, "epoch": 7.500214868929953, "grad_norm": 1.484375, "learning_rate": 0.00012775229917849162, "loss": 4.8061, "mean_token_accuracy": 0.24927690327167512, "num_tokens": 15873605.0, "step": 8730 }, { "entropy": 5.670104217529297, "epoch": 7.504512247529007, "grad_norm": 1.34375, "learning_rate": 0.00012750106352486728, "loss": 4.8144, "mean_token_accuracy": 0.251702156662941, "num_tokens": 15883123.0, "step": 8735 }, { "entropy": 5.622718381881714, "epoch": 7.508809626128062, "grad_norm": 1.25, "learning_rate": 0.00012725014994996534, "loss": 4.8654, "mean_token_accuracy": 0.24962699562311172, "num_tokens": 15892713.0, "step": 8740 }, { "entropy": 5.716191387176513, "epoch": 7.513107004727116, "grad_norm": 1.328125, "learning_rate": 0.00012699955900168075, "loss": 4.8486, "mean_token_accuracy": 0.2432439148426056, "num_tokens": 15902913.0, "step": 8745 }, { "entropy": 5.672324466705322, "epoch": 7.517404383326171, "grad_norm": 1.375, "learning_rate": 0.00012674929122720414, "loss": 4.9054, "mean_token_accuracy": 0.23004304766654968, "num_tokens": 15912721.0, "step": 8750 }, { "entropy": 5.594343137741089, "epoch": 7.521701761925225, "grad_norm": 1.3125, "learning_rate": 0.0001264993471730202, "loss": 4.7899, "mean_token_accuracy": 0.2503419265151024, "num_tokens": 15921520.0, "step": 8755 }, { "entropy": 5.690563106536866, "epoch": 7.52599914052428, "grad_norm": 1.34375, "learning_rate": 0.00012624972738490675, "loss": 4.9229, "mean_token_accuracy": 0.23388323038816453, "num_tokens": 15930753.0, "step": 8760 }, { "entropy": 5.751315879821777, "epoch": 7.530296519123334, "grad_norm": 1.328125, "learning_rate": 0.00012600043240793368, "loss": 4.8837, "mean_token_accuracy": 0.23834782242774963, "num_tokens": 15939957.0, "step": 8765 }, { "entropy": 5.6837897300720215, "epoch": 7.534593897722389, "grad_norm": 1.421875, "learning_rate": 0.00012575146278646175, "loss": 4.815, "mean_token_accuracy": 0.2443731501698494, "num_tokens": 15949555.0, "step": 8770 }, { "entropy": 5.626264572143555, "epoch": 7.5388912763214435, "grad_norm": 1.3359375, "learning_rate": 0.00012550281906414097, "loss": 4.8277, "mean_token_accuracy": 0.240455362200737, "num_tokens": 15958395.0, "step": 8775 }, { "entropy": 5.663618469238282, "epoch": 7.543188654920499, "grad_norm": 1.328125, "learning_rate": 0.00012525450178390972, "loss": 4.8681, "mean_token_accuracy": 0.2480662539601326, "num_tokens": 15967522.0, "step": 8780 }, { "entropy": 5.694502210617065, "epoch": 7.547486033519553, "grad_norm": 1.4375, "learning_rate": 0.0001250065114879939, "loss": 4.878, "mean_token_accuracy": 0.23790884763002396, "num_tokens": 15976311.0, "step": 8785 }, { "entropy": 5.618346405029297, "epoch": 7.551783412118608, "grad_norm": 1.375, "learning_rate": 0.00012475884871790505, "loss": 4.7873, "mean_token_accuracy": 0.25124809592962266, "num_tokens": 15985202.0, "step": 8790 }, { "entropy": 5.733155822753906, "epoch": 7.556080790717663, "grad_norm": 1.390625, "learning_rate": 0.00012451151401443982, "loss": 4.9326, "mean_token_accuracy": 0.2288578063249588, "num_tokens": 15995043.0, "step": 8795 }, { "entropy": 5.590882110595703, "epoch": 7.560378169316717, "grad_norm": 1.3984375, "learning_rate": 0.00012426450791767815, "loss": 4.773, "mean_token_accuracy": 0.2554294764995575, "num_tokens": 16004355.0, "step": 8800 }, { "entropy": 5.66195330619812, "epoch": 7.564675547915772, "grad_norm": 1.46875, "learning_rate": 0.00012401783096698283, "loss": 4.7098, "mean_token_accuracy": 0.2548373222351074, "num_tokens": 16013069.0, "step": 8805 }, { "entropy": 5.664208126068115, "epoch": 7.568972926514826, "grad_norm": 1.2890625, "learning_rate": 0.00012377148370099764, "loss": 4.882, "mean_token_accuracy": 0.23631102591753006, "num_tokens": 16023757.0, "step": 8810 }, { "entropy": 5.6740035057067875, "epoch": 7.573270305113881, "grad_norm": 1.390625, "learning_rate": 0.00012352546665764642, "loss": 4.8746, "mean_token_accuracy": 0.23626330494880676, "num_tokens": 16032550.0, "step": 8815 }, { "entropy": 5.582717180252075, "epoch": 7.577567683712935, "grad_norm": 1.3125, "learning_rate": 0.00012327978037413219, "loss": 4.7658, "mean_token_accuracy": 0.25544307976961134, "num_tokens": 16041580.0, "step": 8820 }, { "entropy": 5.654942321777344, "epoch": 7.58186506231199, "grad_norm": 1.5078125, "learning_rate": 0.00012303442538693564, "loss": 4.8642, "mean_token_accuracy": 0.245210263133049, "num_tokens": 16049845.0, "step": 8825 }, { "entropy": 5.606918573379517, "epoch": 7.586162440911044, "grad_norm": 1.4765625, "learning_rate": 0.00012278940223181393, "loss": 4.7706, "mean_token_accuracy": 0.239949569106102, "num_tokens": 16059703.0, "step": 8830 }, { "entropy": 5.623773241043091, "epoch": 7.590459819510099, "grad_norm": 1.3671875, "learning_rate": 0.00012254471144379964, "loss": 4.7411, "mean_token_accuracy": 0.2577579036355019, "num_tokens": 16068416.0, "step": 8835 }, { "entropy": 5.654675531387329, "epoch": 7.5947571981091535, "grad_norm": 1.3359375, "learning_rate": 0.00012230035355719968, "loss": 4.9084, "mean_token_accuracy": 0.23492099046707154, "num_tokens": 16078067.0, "step": 8840 }, { "entropy": 5.67131872177124, "epoch": 7.599054576708208, "grad_norm": 1.375, "learning_rate": 0.0001220563291055941, "loss": 4.8605, "mean_token_accuracy": 0.24588163793087006, "num_tokens": 16086591.0, "step": 8845 }, { "entropy": 5.68556776046753, "epoch": 7.603351955307263, "grad_norm": 1.390625, "learning_rate": 0.0001218126386218347, "loss": 4.8508, "mean_token_accuracy": 0.245563143491745, "num_tokens": 16096138.0, "step": 8850 }, { "entropy": 5.607348871231079, "epoch": 7.607649333906317, "grad_norm": 1.484375, "learning_rate": 0.00012156928263804403, "loss": 4.7926, "mean_token_accuracy": 0.24644255489110947, "num_tokens": 16105182.0, "step": 8855 }, { "entropy": 5.667599391937256, "epoch": 7.611946712505372, "grad_norm": 1.546875, "learning_rate": 0.0001213262616856144, "loss": 4.9127, "mean_token_accuracy": 0.23956708014011383, "num_tokens": 16113940.0, "step": 8860 }, { "entropy": 5.672130346298218, "epoch": 7.616244091104426, "grad_norm": 1.546875, "learning_rate": 0.00012108357629520635, "loss": 4.8201, "mean_token_accuracy": 0.24776309728622437, "num_tokens": 16123036.0, "step": 8865 }, { "entropy": 5.661095905303955, "epoch": 7.620541469703481, "grad_norm": 1.4296875, "learning_rate": 0.00012084122699674785, "loss": 4.8167, "mean_token_accuracy": 0.24714952260255812, "num_tokens": 16131057.0, "step": 8870 }, { "entropy": 5.651075649261474, "epoch": 7.624838848302535, "grad_norm": 1.515625, "learning_rate": 0.00012059921431943278, "loss": 4.8634, "mean_token_accuracy": 0.23851459324359894, "num_tokens": 16140259.0, "step": 8875 }, { "entropy": 5.7019164085388185, "epoch": 7.62913622690159, "grad_norm": 1.515625, "learning_rate": 0.00012035753879172026, "loss": 5.0453, "mean_token_accuracy": 0.22252508252859116, "num_tokens": 16149585.0, "step": 8880 }, { "entropy": 5.608267688751221, "epoch": 7.633433605500644, "grad_norm": 1.5078125, "learning_rate": 0.00012011620094133296, "loss": 4.7009, "mean_token_accuracy": 0.25520451068878175, "num_tokens": 16157656.0, "step": 8885 }, { "entropy": 5.595222854614258, "epoch": 7.637730984099699, "grad_norm": 1.3515625, "learning_rate": 0.00011987520129525622, "loss": 4.8398, "mean_token_accuracy": 0.24301209151744843, "num_tokens": 16166900.0, "step": 8890 }, { "entropy": 5.651483631134033, "epoch": 7.6420283626987535, "grad_norm": 1.5546875, "learning_rate": 0.000119634540379737, "loss": 4.8716, "mean_token_accuracy": 0.24231579452753066, "num_tokens": 16174859.0, "step": 8895 }, { "entropy": 5.677278900146485, "epoch": 7.646325741297808, "grad_norm": 1.4375, "learning_rate": 0.00011939421872028262, "loss": 4.8536, "mean_token_accuracy": 0.24620164632797242, "num_tokens": 16183660.0, "step": 8900 }, { "entropy": 5.58764533996582, "epoch": 7.650623119896863, "grad_norm": 1.3984375, "learning_rate": 0.00011915423684165948, "loss": 4.8054, "mean_token_accuracy": 0.24301510602235793, "num_tokens": 16192344.0, "step": 8905 }, { "entropy": 5.685575675964356, "epoch": 7.654920498495917, "grad_norm": 1.2890625, "learning_rate": 0.00011891459526789198, "loss": 4.8872, "mean_token_accuracy": 0.2380649194121361, "num_tokens": 16202060.0, "step": 8910 }, { "entropy": 5.705985689163208, "epoch": 7.659217877094972, "grad_norm": 1.328125, "learning_rate": 0.0001186752945222616, "loss": 4.8688, "mean_token_accuracy": 0.23804006576538086, "num_tokens": 16211297.0, "step": 8915 }, { "entropy": 5.679658222198486, "epoch": 7.663515255694026, "grad_norm": 1.2890625, "learning_rate": 0.00011843633512730562, "loss": 4.815, "mean_token_accuracy": 0.2446437507867813, "num_tokens": 16219812.0, "step": 8920 }, { "entropy": 5.651452541351318, "epoch": 7.667812634293082, "grad_norm": 1.375, "learning_rate": 0.00011819771760481576, "loss": 4.8388, "mean_token_accuracy": 0.25003154426813123, "num_tokens": 16229197.0, "step": 8925 }, { "entropy": 5.590824747085572, "epoch": 7.672110012892135, "grad_norm": 1.4375, "learning_rate": 0.00011795944247583725, "loss": 4.7635, "mean_token_accuracy": 0.2490478202700615, "num_tokens": 16238154.0, "step": 8930 }, { "entropy": 5.615209436416626, "epoch": 7.676407391491191, "grad_norm": 1.296875, "learning_rate": 0.00011772151026066789, "loss": 4.8197, "mean_token_accuracy": 0.2377667546272278, "num_tokens": 16247206.0, "step": 8935 }, { "entropy": 5.686316299438476, "epoch": 7.680704770090245, "grad_norm": 1.34375, "learning_rate": 0.00011748392147885642, "loss": 4.9245, "mean_token_accuracy": 0.23257190883159637, "num_tokens": 16256571.0, "step": 8940 }, { "entropy": 5.669029426574707, "epoch": 7.6850021486893, "grad_norm": 1.6328125, "learning_rate": 0.00011724667664920177, "loss": 4.8675, "mean_token_accuracy": 0.23874716609716415, "num_tokens": 16265429.0, "step": 8945 }, { "entropy": 5.712026929855346, "epoch": 7.689299527288354, "grad_norm": 1.46875, "learning_rate": 0.00011700977628975183, "loss": 4.9595, "mean_token_accuracy": 0.22859783619642257, "num_tokens": 16273804.0, "step": 8950 }, { "entropy": 5.69069471359253, "epoch": 7.693596905887409, "grad_norm": 1.4609375, "learning_rate": 0.00011677322091780243, "loss": 4.8767, "mean_token_accuracy": 0.2396107569336891, "num_tokens": 16282894.0, "step": 8955 }, { "entropy": 5.6962803363800045, "epoch": 7.6978942844864635, "grad_norm": 1.40625, "learning_rate": 0.0001165370110498958, "loss": 4.911, "mean_token_accuracy": 0.23751324117183686, "num_tokens": 16291568.0, "step": 8960 }, { "entropy": 5.713715887069702, "epoch": 7.702191663085518, "grad_norm": 1.3671875, "learning_rate": 0.00011630114720181989, "loss": 4.9006, "mean_token_accuracy": 0.23961835503578185, "num_tokens": 16300650.0, "step": 8965 }, { "entropy": 5.633689308166504, "epoch": 7.706489041684573, "grad_norm": 1.421875, "learning_rate": 0.00011606562988860711, "loss": 4.8228, "mean_token_accuracy": 0.24292599856853486, "num_tokens": 16309712.0, "step": 8970 }, { "entropy": 5.640320634841919, "epoch": 7.710786420283627, "grad_norm": 1.28125, "learning_rate": 0.0001158304596245332, "loss": 4.8093, "mean_token_accuracy": 0.24374091476202012, "num_tokens": 16319440.0, "step": 8975 }, { "entropy": 5.608017110824585, "epoch": 7.715083798882682, "grad_norm": 1.34375, "learning_rate": 0.00011559563692311595, "loss": 4.8151, "mean_token_accuracy": 0.24847222715616227, "num_tokens": 16328752.0, "step": 8980 }, { "entropy": 5.644386625289917, "epoch": 7.719381177481736, "grad_norm": 1.34375, "learning_rate": 0.00011536116229711422, "loss": 4.7971, "mean_token_accuracy": 0.25065541118383405, "num_tokens": 16338045.0, "step": 8985 }, { "entropy": 5.696937465667725, "epoch": 7.723678556080791, "grad_norm": 1.3984375, "learning_rate": 0.000115127036258527, "loss": 4.874, "mean_token_accuracy": 0.23981755524873732, "num_tokens": 16347174.0, "step": 8990 }, { "entropy": 5.6324869155883786, "epoch": 7.727975934679845, "grad_norm": 1.4921875, "learning_rate": 0.00011489325931859185, "loss": 4.7414, "mean_token_accuracy": 0.24950257539749146, "num_tokens": 16355371.0, "step": 8995 }, { "entropy": 5.614347124099732, "epoch": 7.7322733132789, "grad_norm": 1.34375, "learning_rate": 0.0001146598319877843, "loss": 4.8049, "mean_token_accuracy": 0.24828497618436812, "num_tokens": 16363938.0, "step": 9000 }, { "epoch": 7.7322733132789, "eval_entropy": 5.454324596637004, "eval_loss": 5.879878997802734, "eval_mean_token_accuracy": 0.1840149760078471, "eval_num_tokens": 16363938.0, "eval_runtime": 2.0495, "eval_samples_per_second": 1731.616, "eval_steps_per_second": 216.635, "step": 9000 }, { "entropy": 5.668048620223999, "epoch": 7.736570691877954, "grad_norm": 1.5390625, "learning_rate": 0.00011442675477581621, "loss": 4.8712, "mean_token_accuracy": 0.24084932655096053, "num_tokens": 16373110.0, "step": 9005 }, { "entropy": 5.663178205490112, "epoch": 7.740868070477009, "grad_norm": 1.3515625, "learning_rate": 0.0001141940281916352, "loss": 4.7987, "mean_token_accuracy": 0.24338341653347015, "num_tokens": 16381521.0, "step": 9010 }, { "entropy": 5.71226396560669, "epoch": 7.7451654490760635, "grad_norm": 1.359375, "learning_rate": 0.00011396165274342304, "loss": 4.9155, "mean_token_accuracy": 0.23528602570295334, "num_tokens": 16391322.0, "step": 9015 }, { "entropy": 5.630045700073242, "epoch": 7.749462827675118, "grad_norm": 1.3125, "learning_rate": 0.00011372962893859471, "loss": 4.8487, "mean_token_accuracy": 0.2383380651473999, "num_tokens": 16400653.0, "step": 9020 }, { "entropy": 5.633638191223144, "epoch": 7.753760206274173, "grad_norm": 1.40625, "learning_rate": 0.00011349795728379759, "loss": 4.8587, "mean_token_accuracy": 0.23745760172605515, "num_tokens": 16410133.0, "step": 9025 }, { "entropy": 5.684892416000366, "epoch": 7.758057584873227, "grad_norm": 1.4140625, "learning_rate": 0.00011326663828491, "loss": 4.9227, "mean_token_accuracy": 0.23583627492189407, "num_tokens": 16419302.0, "step": 9030 }, { "entropy": 5.65294246673584, "epoch": 7.762354963472282, "grad_norm": 1.546875, "learning_rate": 0.00011303567244704015, "loss": 4.8942, "mean_token_accuracy": 0.24354798793792726, "num_tokens": 16428020.0, "step": 9035 }, { "entropy": 5.688437175750733, "epoch": 7.766652342071336, "grad_norm": 1.359375, "learning_rate": 0.00011280506027452502, "loss": 4.9648, "mean_token_accuracy": 0.22782256603240966, "num_tokens": 16438033.0, "step": 9040 }, { "entropy": 5.707497072219849, "epoch": 7.770949720670391, "grad_norm": 1.5, "learning_rate": 0.0001125748022709295, "loss": 4.9044, "mean_token_accuracy": 0.24439404755830765, "num_tokens": 16447067.0, "step": 9045 }, { "entropy": 5.715335369110107, "epoch": 7.775247099269445, "grad_norm": 1.3515625, "learning_rate": 0.00011234489893904509, "loss": 4.9154, "mean_token_accuracy": 0.23768466860055923, "num_tokens": 16457146.0, "step": 9050 }, { "entropy": 5.631183815002442, "epoch": 7.7795444778685, "grad_norm": 1.359375, "learning_rate": 0.00011211535078088869, "loss": 4.7698, "mean_token_accuracy": 0.25073523819446564, "num_tokens": 16466428.0, "step": 9055 }, { "entropy": 5.619486904144287, "epoch": 7.783841856467554, "grad_norm": 1.4375, "learning_rate": 0.00011188615829770171, "loss": 4.8294, "mean_token_accuracy": 0.2441105604171753, "num_tokens": 16474198.0, "step": 9060 }, { "entropy": 5.650256633758545, "epoch": 7.788139235066609, "grad_norm": 1.484375, "learning_rate": 0.00011165732198994905, "loss": 4.8596, "mean_token_accuracy": 0.245253886282444, "num_tokens": 16483464.0, "step": 9065 }, { "entropy": 5.655068445205688, "epoch": 7.792436613665664, "grad_norm": 1.4609375, "learning_rate": 0.00011142884235731756, "loss": 4.8247, "mean_token_accuracy": 0.24826207160949706, "num_tokens": 16492619.0, "step": 9070 }, { "entropy": 5.704363393783569, "epoch": 7.796733992264718, "grad_norm": 1.5234375, "learning_rate": 0.00011120071989871564, "loss": 4.928, "mean_token_accuracy": 0.24253664612770082, "num_tokens": 16501690.0, "step": 9075 }, { "entropy": 5.587621402740479, "epoch": 7.8010313708637735, "grad_norm": 1.28125, "learning_rate": 0.00011097295511227134, "loss": 4.807, "mean_token_accuracy": 0.24702800661325455, "num_tokens": 16510158.0, "step": 9080 }, { "entropy": 5.640108871459961, "epoch": 7.805328749462827, "grad_norm": 1.40625, "learning_rate": 0.0001107455484953321, "loss": 4.8187, "mean_token_accuracy": 0.24920191913843154, "num_tokens": 16518722.0, "step": 9085 }, { "entropy": 5.6231927394866945, "epoch": 7.809626128061883, "grad_norm": 1.453125, "learning_rate": 0.00011051850054446306, "loss": 4.8428, "mean_token_accuracy": 0.24272346943616868, "num_tokens": 16527404.0, "step": 9090 }, { "entropy": 5.571829319000244, "epoch": 7.813923506660937, "grad_norm": 1.53125, "learning_rate": 0.00011029181175544603, "loss": 4.7711, "mean_token_accuracy": 0.24813230186700821, "num_tokens": 16536210.0, "step": 9095 }, { "entropy": 5.717549419403076, "epoch": 7.818220885259992, "grad_norm": 1.5390625, "learning_rate": 0.00011006548262327884, "loss": 4.9294, "mean_token_accuracy": 0.24030217677354812, "num_tokens": 16544707.0, "step": 9100 }, { "entropy": 5.707336902618408, "epoch": 7.822518263859046, "grad_norm": 1.34375, "learning_rate": 0.0001098395136421739, "loss": 4.9008, "mean_token_accuracy": 0.23155685216188432, "num_tokens": 16553883.0, "step": 9105 }, { "entropy": 5.673837184906006, "epoch": 7.826815642458101, "grad_norm": 1.3046875, "learning_rate": 0.00010961390530555712, "loss": 4.8925, "mean_token_accuracy": 0.23413754403591155, "num_tokens": 16562537.0, "step": 9110 }, { "entropy": 5.66425347328186, "epoch": 7.831113021057155, "grad_norm": 1.3515625, "learning_rate": 0.00010938865810606682, "loss": 4.8643, "mean_token_accuracy": 0.23997241556644439, "num_tokens": 16571665.0, "step": 9115 }, { "entropy": 5.6818047046661375, "epoch": 7.83541039965621, "grad_norm": 1.4921875, "learning_rate": 0.00010916377253555293, "loss": 4.8505, "mean_token_accuracy": 0.23443864583969115, "num_tokens": 16581102.0, "step": 9120 }, { "entropy": 5.636024236679077, "epoch": 7.839707778255264, "grad_norm": 1.40625, "learning_rate": 0.00010893924908507573, "loss": 4.8179, "mean_token_accuracy": 0.2463734805583954, "num_tokens": 16589958.0, "step": 9125 }, { "entropy": 5.7293510913848875, "epoch": 7.844005156854319, "grad_norm": 1.5, "learning_rate": 0.0001087150882449046, "loss": 4.8842, "mean_token_accuracy": 0.24326430410146713, "num_tokens": 16598800.0, "step": 9130 }, { "entropy": 5.675615882873535, "epoch": 7.8483025354533735, "grad_norm": 1.3828125, "learning_rate": 0.00010849129050451717, "loss": 4.8569, "mean_token_accuracy": 0.23552822470664977, "num_tokens": 16607751.0, "step": 9135 }, { "entropy": 5.607501125335693, "epoch": 7.852599914052428, "grad_norm": 1.34375, "learning_rate": 0.00010826785635259842, "loss": 4.7993, "mean_token_accuracy": 0.2433355689048767, "num_tokens": 16616041.0, "step": 9140 }, { "entropy": 5.618802070617676, "epoch": 7.856897292651483, "grad_norm": 1.5625, "learning_rate": 0.00010804478627703903, "loss": 4.7761, "mean_token_accuracy": 0.25758384317159655, "num_tokens": 16624800.0, "step": 9145 }, { "entropy": 5.70573935508728, "epoch": 7.861194671250537, "grad_norm": 1.765625, "learning_rate": 0.00010782208076493508, "loss": 4.8799, "mean_token_accuracy": 0.23971739262342454, "num_tokens": 16632808.0, "step": 9150 }, { "entropy": 5.700254964828491, "epoch": 7.865492049849592, "grad_norm": 1.4296875, "learning_rate": 0.00010759974030258621, "loss": 4.8532, "mean_token_accuracy": 0.24205577969551087, "num_tokens": 16641179.0, "step": 9155 }, { "entropy": 5.651480436325073, "epoch": 7.869789428448646, "grad_norm": 1.2421875, "learning_rate": 0.00010737776537549531, "loss": 4.8599, "mean_token_accuracy": 0.24276550859212875, "num_tokens": 16650402.0, "step": 9160 }, { "entropy": 5.615023612976074, "epoch": 7.874086807047701, "grad_norm": 1.453125, "learning_rate": 0.00010715615646836679, "loss": 4.8228, "mean_token_accuracy": 0.244487963616848, "num_tokens": 16659661.0, "step": 9165 }, { "entropy": 5.592911720275879, "epoch": 7.878384185646755, "grad_norm": 1.328125, "learning_rate": 0.00010693491406510585, "loss": 4.8163, "mean_token_accuracy": 0.2512414947152138, "num_tokens": 16668630.0, "step": 9170 }, { "entropy": 5.685381031036377, "epoch": 7.88268156424581, "grad_norm": 1.484375, "learning_rate": 0.00010671403864881757, "loss": 4.8931, "mean_token_accuracy": 0.2415841579437256, "num_tokens": 16678023.0, "step": 9175 }, { "entropy": 5.69584379196167, "epoch": 7.8869789428448644, "grad_norm": 1.328125, "learning_rate": 0.00010649353070180562, "loss": 4.886, "mean_token_accuracy": 0.23775400519371032, "num_tokens": 16686751.0, "step": 9180 }, { "entropy": 5.675034427642823, "epoch": 7.891276321443919, "grad_norm": 1.390625, "learning_rate": 0.00010627339070557118, "loss": 4.8746, "mean_token_accuracy": 0.2419977530837059, "num_tokens": 16696672.0, "step": 9185 }, { "entropy": 5.638709783554077, "epoch": 7.8955737000429735, "grad_norm": 1.3828125, "learning_rate": 0.00010605361914081194, "loss": 4.7686, "mean_token_accuracy": 0.24883348643779754, "num_tokens": 16706018.0, "step": 9190 }, { "entropy": 5.6906637191772464, "epoch": 7.899871078642028, "grad_norm": 1.234375, "learning_rate": 0.00010583421648742125, "loss": 4.8111, "mean_token_accuracy": 0.2465365782380104, "num_tokens": 16715206.0, "step": 9195 }, { "entropy": 5.666348171234131, "epoch": 7.904168457241083, "grad_norm": 1.4609375, "learning_rate": 0.00010561518322448673, "loss": 4.9193, "mean_token_accuracy": 0.2381324961781502, "num_tokens": 16724479.0, "step": 9200 }, { "entropy": 5.628651762008667, "epoch": 7.908465835840137, "grad_norm": 1.53125, "learning_rate": 0.00010539651983028955, "loss": 4.7786, "mean_token_accuracy": 0.2523382410407066, "num_tokens": 16733304.0, "step": 9205 }, { "entropy": 5.616871786117554, "epoch": 7.912763214439192, "grad_norm": 1.5234375, "learning_rate": 0.0001051782267823031, "loss": 4.7756, "mean_token_accuracy": 0.24963734596967696, "num_tokens": 16741447.0, "step": 9210 }, { "entropy": 5.635164642333985, "epoch": 7.917060593038247, "grad_norm": 1.5078125, "learning_rate": 0.00010496030455719225, "loss": 4.8521, "mean_token_accuracy": 0.23965709507465363, "num_tokens": 16751487.0, "step": 9215 }, { "entropy": 5.644070672988891, "epoch": 7.921357971637301, "grad_norm": 1.4453125, "learning_rate": 0.00010474275363081193, "loss": 4.8551, "mean_token_accuracy": 0.2444910541176796, "num_tokens": 16760795.0, "step": 9220 }, { "entropy": 5.646812820434571, "epoch": 7.925655350236356, "grad_norm": 1.4765625, "learning_rate": 0.0001045255744782064, "loss": 4.8303, "mean_token_accuracy": 0.24445725977420807, "num_tokens": 16769639.0, "step": 9225 }, { "entropy": 5.709587478637696, "epoch": 7.92995272883541, "grad_norm": 1.34375, "learning_rate": 0.00010430876757360817, "loss": 4.9103, "mean_token_accuracy": 0.23549216985702515, "num_tokens": 16779195.0, "step": 9230 }, { "entropy": 5.562986993789673, "epoch": 7.934250107434465, "grad_norm": 1.4609375, "learning_rate": 0.00010409233339043694, "loss": 4.7358, "mean_token_accuracy": 0.2566070005297661, "num_tokens": 16787531.0, "step": 9235 }, { "entropy": 5.603441858291626, "epoch": 7.93854748603352, "grad_norm": 1.4765625, "learning_rate": 0.00010387627240129838, "loss": 4.7895, "mean_token_accuracy": 0.25151281505823136, "num_tokens": 16796392.0, "step": 9240 }, { "entropy": 5.647985029220581, "epoch": 7.942844864632574, "grad_norm": 1.2734375, "learning_rate": 0.00010366058507798326, "loss": 4.8395, "mean_token_accuracy": 0.24179578721523284, "num_tokens": 16804942.0, "step": 9245 }, { "entropy": 5.657173204421997, "epoch": 7.947142243231629, "grad_norm": 1.40625, "learning_rate": 0.00010344527189146655, "loss": 4.8896, "mean_token_accuracy": 0.23595151156187058, "num_tokens": 16813754.0, "step": 9250 }, { "entropy": 5.682489442825317, "epoch": 7.9514396218306835, "grad_norm": 1.28125, "learning_rate": 0.00010323033331190626, "loss": 4.8726, "mean_token_accuracy": 0.239631487429142, "num_tokens": 16823010.0, "step": 9255 }, { "entropy": 5.666946983337402, "epoch": 7.955737000429738, "grad_norm": 1.390625, "learning_rate": 0.00010301576980864228, "loss": 4.7882, "mean_token_accuracy": 0.2553831934928894, "num_tokens": 16831909.0, "step": 9260 }, { "entropy": 5.5937840938568115, "epoch": 7.960034379028793, "grad_norm": 1.46875, "learning_rate": 0.00010280158185019547, "loss": 4.8387, "mean_token_accuracy": 0.2543884262442589, "num_tokens": 16841460.0, "step": 9265 }, { "entropy": 5.6272660255432125, "epoch": 7.964331757627847, "grad_norm": 1.234375, "learning_rate": 0.00010258776990426686, "loss": 4.8492, "mean_token_accuracy": 0.2500534489750862, "num_tokens": 16850592.0, "step": 9270 }, { "entropy": 5.643726873397827, "epoch": 7.968629136226902, "grad_norm": 1.515625, "learning_rate": 0.00010237433443773612, "loss": 4.8581, "mean_token_accuracy": 0.24305154681205748, "num_tokens": 16859736.0, "step": 9275 }, { "entropy": 5.612250900268554, "epoch": 7.972926514825956, "grad_norm": 1.25, "learning_rate": 0.00010216127591666115, "loss": 4.8619, "mean_token_accuracy": 0.25060978829860686, "num_tokens": 16870084.0, "step": 9280 }, { "entropy": 5.636719512939453, "epoch": 7.977223893425011, "grad_norm": 1.6640625, "learning_rate": 0.00010194859480627648, "loss": 4.8777, "mean_token_accuracy": 0.24135367721319198, "num_tokens": 16877771.0, "step": 9285 }, { "entropy": 5.627331495285034, "epoch": 7.981521272024065, "grad_norm": 1.2890625, "learning_rate": 0.00010173629157099279, "loss": 4.8882, "mean_token_accuracy": 0.2392001986503601, "num_tokens": 16887487.0, "step": 9290 }, { "entropy": 5.626888370513916, "epoch": 7.98581865062312, "grad_norm": 1.234375, "learning_rate": 0.00010152436667439537, "loss": 4.8877, "mean_token_accuracy": 0.2454797610640526, "num_tokens": 16897286.0, "step": 9295 }, { "entropy": 5.670809698104859, "epoch": 7.9901160292221745, "grad_norm": 1.421875, "learning_rate": 0.00010131282057924345, "loss": 4.821, "mean_token_accuracy": 0.24797312319278716, "num_tokens": 16905968.0, "step": 9300 }, { "entropy": 5.704876232147217, "epoch": 7.994413407821229, "grad_norm": 1.34375, "learning_rate": 0.00010110165374746924, "loss": 4.8232, "mean_token_accuracy": 0.24678525626659392, "num_tokens": 16914604.0, "step": 9305 }, { "entropy": 5.683019065856934, "epoch": 7.9987107864202835, "grad_norm": 1.359375, "learning_rate": 0.00010089086664017674, "loss": 4.9812, "mean_token_accuracy": 0.23446963727474213, "num_tokens": 16925085.0, "step": 9310 }, { "entropy": 5.646041446261936, "epoch": 8.002578427159433, "grad_norm": 1.390625, "learning_rate": 0.00010068045971764067, "loss": 4.7677, "mean_token_accuracy": 0.25311458938651615, "num_tokens": 16932717.0, "step": 9315 }, { "entropy": 5.687277030944824, "epoch": 8.006875805758487, "grad_norm": 1.4140625, "learning_rate": 0.00010047043343930561, "loss": 4.8798, "mean_token_accuracy": 0.24468370229005815, "num_tokens": 16941332.0, "step": 9320 }, { "entropy": 5.604476070404052, "epoch": 8.011173184357542, "grad_norm": 1.4375, "learning_rate": 0.00010026078826378502, "loss": 4.7217, "mean_token_accuracy": 0.2548048093914986, "num_tokens": 16949732.0, "step": 9325 }, { "entropy": 5.661039781570435, "epoch": 8.015470562956596, "grad_norm": 1.3359375, "learning_rate": 0.00010005152464886031, "loss": 4.7478, "mean_token_accuracy": 0.24966603070497512, "num_tokens": 16958013.0, "step": 9330 }, { "entropy": 5.6239735126495365, "epoch": 8.019767941555651, "grad_norm": 1.3125, "learning_rate": 9.984264305147941e-05, "loss": 4.7115, "mean_token_accuracy": 0.2564238518476486, "num_tokens": 16966050.0, "step": 9335 }, { "entropy": 5.616132545471191, "epoch": 8.024065320154705, "grad_norm": 1.328125, "learning_rate": 9.963414392775627e-05, "loss": 4.7125, "mean_token_accuracy": 0.25857759863138197, "num_tokens": 16975178.0, "step": 9340 }, { "entropy": 5.536005544662475, "epoch": 8.02836269875376, "grad_norm": 1.3125, "learning_rate": 9.942602773296971e-05, "loss": 4.701, "mean_token_accuracy": 0.26308748573064805, "num_tokens": 16984247.0, "step": 9345 }, { "entropy": 5.716782569885254, "epoch": 8.032660077352816, "grad_norm": 1.3515625, "learning_rate": 9.921829492156223e-05, "loss": 4.8897, "mean_token_accuracy": 0.2355976179242134, "num_tokens": 16995048.0, "step": 9350 }, { "entropy": 5.6854979515075685, "epoch": 8.03695745595187, "grad_norm": 1.625, "learning_rate": 9.901094594713933e-05, "loss": 4.8501, "mean_token_accuracy": 0.2387949600815773, "num_tokens": 17003748.0, "step": 9355 }, { "entropy": 5.691632413864136, "epoch": 8.041254834550925, "grad_norm": 1.4140625, "learning_rate": 9.88039812624682e-05, "loss": 4.8154, "mean_token_accuracy": 0.24805701822042464, "num_tokens": 17011639.0, "step": 9360 }, { "entropy": 5.735028791427612, "epoch": 8.045552213149978, "grad_norm": 1.2109375, "learning_rate": 9.859740131947715e-05, "loss": 4.8078, "mean_token_accuracy": 0.2420959696173668, "num_tokens": 17021056.0, "step": 9365 }, { "entropy": 5.678194046020508, "epoch": 8.049849591749034, "grad_norm": 1.3671875, "learning_rate": 9.839120656925407e-05, "loss": 4.8574, "mean_token_accuracy": 0.23739131838083266, "num_tokens": 17030944.0, "step": 9370 }, { "entropy": 5.6369085788726805, "epoch": 8.054146970348087, "grad_norm": 1.421875, "learning_rate": 9.818539746204588e-05, "loss": 4.8013, "mean_token_accuracy": 0.24849113821983337, "num_tokens": 17040127.0, "step": 9375 }, { "entropy": 5.710569906234741, "epoch": 8.058444348947143, "grad_norm": 1.6171875, "learning_rate": 9.797997444725745e-05, "loss": 4.7846, "mean_token_accuracy": 0.24967314451932907, "num_tokens": 17049418.0, "step": 9380 }, { "entropy": 5.609195852279663, "epoch": 8.062741727546197, "grad_norm": 1.34375, "learning_rate": 9.77749379734506e-05, "loss": 4.7923, "mean_token_accuracy": 0.2569655641913414, "num_tokens": 17058270.0, "step": 9385 }, { "entropy": 5.674309730529785, "epoch": 8.067039106145252, "grad_norm": 1.4609375, "learning_rate": 9.757028848834293e-05, "loss": 4.8089, "mean_token_accuracy": 0.2483993098139763, "num_tokens": 17068011.0, "step": 9390 }, { "entropy": 5.636227035522461, "epoch": 8.071336484744306, "grad_norm": 1.3828125, "learning_rate": 9.736602643880712e-05, "loss": 4.8026, "mean_token_accuracy": 0.24484998136758804, "num_tokens": 17077356.0, "step": 9395 }, { "entropy": 5.644352102279663, "epoch": 8.075633863343361, "grad_norm": 1.359375, "learning_rate": 9.716215227086997e-05, "loss": 4.7616, "mean_token_accuracy": 0.24986155480146408, "num_tokens": 17085679.0, "step": 9400 }, { "entropy": 5.6834478855133055, "epoch": 8.079931241942415, "grad_norm": 1.3203125, "learning_rate": 9.695866642971098e-05, "loss": 4.7877, "mean_token_accuracy": 0.24179317951202392, "num_tokens": 17094925.0, "step": 9405 }, { "entropy": 5.635635614395142, "epoch": 8.08422862054147, "grad_norm": 1.40625, "learning_rate": 9.67555693596621e-05, "loss": 4.8421, "mean_token_accuracy": 0.24287659376859666, "num_tokens": 17105278.0, "step": 9410 }, { "entropy": 5.594030666351318, "epoch": 8.088525999140524, "grad_norm": 1.40625, "learning_rate": 9.655286150420595e-05, "loss": 4.7669, "mean_token_accuracy": 0.25346663445234296, "num_tokens": 17114070.0, "step": 9415 }, { "entropy": 5.569116306304932, "epoch": 8.09282337773958, "grad_norm": 1.2578125, "learning_rate": 9.635054330597565e-05, "loss": 4.694, "mean_token_accuracy": 0.2626067653298378, "num_tokens": 17122862.0, "step": 9420 }, { "entropy": 5.6683930397033695, "epoch": 8.097120756338633, "grad_norm": 1.4765625, "learning_rate": 9.614861520675322e-05, "loss": 4.7552, "mean_token_accuracy": 0.2559780165553093, "num_tokens": 17131555.0, "step": 9425 }, { "entropy": 5.626582384109497, "epoch": 8.101418134937688, "grad_norm": 1.390625, "learning_rate": 9.594707764746881e-05, "loss": 4.7252, "mean_token_accuracy": 0.25357394516468046, "num_tokens": 17140841.0, "step": 9430 }, { "entropy": 5.595681619644165, "epoch": 8.105715513536742, "grad_norm": 1.328125, "learning_rate": 9.57459310682e-05, "loss": 4.7846, "mean_token_accuracy": 0.24935179948806763, "num_tokens": 17150027.0, "step": 9435 }, { "entropy": 5.689383602142334, "epoch": 8.110012892135797, "grad_norm": 1.3203125, "learning_rate": 9.554517590817055e-05, "loss": 4.8344, "mean_token_accuracy": 0.2364582061767578, "num_tokens": 17159589.0, "step": 9440 }, { "entropy": 5.6651389598846436, "epoch": 8.114310270734851, "grad_norm": 1.296875, "learning_rate": 9.534481260574944e-05, "loss": 4.8245, "mean_token_accuracy": 0.24162878692150117, "num_tokens": 17168219.0, "step": 9445 }, { "entropy": 5.655685997009277, "epoch": 8.118607649333907, "grad_norm": 1.328125, "learning_rate": 9.514484159844997e-05, "loss": 4.8044, "mean_token_accuracy": 0.24854951202869416, "num_tokens": 17177364.0, "step": 9450 }, { "entropy": 5.6925647258758545, "epoch": 8.12290502793296, "grad_norm": 1.4765625, "learning_rate": 9.494526332292899e-05, "loss": 4.8423, "mean_token_accuracy": 0.2397886872291565, "num_tokens": 17186572.0, "step": 9455 }, { "entropy": 5.654357242584228, "epoch": 8.127202406532016, "grad_norm": 1.3671875, "learning_rate": 9.47460782149857e-05, "loss": 4.8388, "mean_token_accuracy": 0.2454209119081497, "num_tokens": 17195645.0, "step": 9460 }, { "entropy": 5.684907484054565, "epoch": 8.13149978513107, "grad_norm": 1.34375, "learning_rate": 9.454728670956073e-05, "loss": 4.8143, "mean_token_accuracy": 0.23839772045612334, "num_tokens": 17205279.0, "step": 9465 }, { "entropy": 5.620777225494384, "epoch": 8.135797163730125, "grad_norm": 1.40625, "learning_rate": 9.43488892407352e-05, "loss": 4.7512, "mean_token_accuracy": 0.25200601369142533, "num_tokens": 17214536.0, "step": 9470 }, { "entropy": 5.682265663146973, "epoch": 8.140094542329178, "grad_norm": 1.34375, "learning_rate": 9.415088624172997e-05, "loss": 4.846, "mean_token_accuracy": 0.24233781099319457, "num_tokens": 17223336.0, "step": 9475 }, { "entropy": 5.688704490661621, "epoch": 8.144391920928234, "grad_norm": 1.3828125, "learning_rate": 9.395327814490439e-05, "loss": 4.8558, "mean_token_accuracy": 0.23712000399827957, "num_tokens": 17232991.0, "step": 9480 }, { "entropy": 5.685535001754761, "epoch": 8.148689299527287, "grad_norm": 1.5, "learning_rate": 9.375606538175566e-05, "loss": 4.7689, "mean_token_accuracy": 0.24820202291011811, "num_tokens": 17241760.0, "step": 9485 }, { "entropy": 5.710322761535645, "epoch": 8.152986678126343, "grad_norm": 1.3359375, "learning_rate": 9.35592483829175e-05, "loss": 4.8582, "mean_token_accuracy": 0.23691945970058442, "num_tokens": 17251514.0, "step": 9490 }, { "entropy": 5.667467212677002, "epoch": 8.157284056725398, "grad_norm": 1.421875, "learning_rate": 9.336282757815964e-05, "loss": 4.8301, "mean_token_accuracy": 0.24544458836317062, "num_tokens": 17260876.0, "step": 9495 }, { "entropy": 5.620681381225586, "epoch": 8.161581435324452, "grad_norm": 1.4375, "learning_rate": 9.316680339638664e-05, "loss": 4.8495, "mean_token_accuracy": 0.24359913915395737, "num_tokens": 17270051.0, "step": 9500 }, { "epoch": 8.161581435324452, "eval_entropy": 5.470761820539698, "eval_loss": 5.884027481079102, "eval_mean_token_accuracy": 0.18388339954319302, "eval_num_tokens": 17270051.0, "eval_runtime": 2.0482, "eval_samples_per_second": 1732.734, "eval_steps_per_second": 216.775, "step": 9500 }, { "entropy": 5.707862997055054, "epoch": 8.165878813923507, "grad_norm": 1.5, "learning_rate": 9.297117626563687e-05, "loss": 4.881, "mean_token_accuracy": 0.24201221615076066, "num_tokens": 17279744.0, "step": 9505 }, { "entropy": 5.6367637634277346, "epoch": 8.170176192522561, "grad_norm": 1.359375, "learning_rate": 9.27759466130818e-05, "loss": 4.8041, "mean_token_accuracy": 0.24783320277929305, "num_tokens": 17289107.0, "step": 9510 }, { "entropy": 5.614573669433594, "epoch": 8.174473571121617, "grad_norm": 1.2734375, "learning_rate": 9.25811148650251e-05, "loss": 4.7014, "mean_token_accuracy": 0.25507569015026094, "num_tokens": 17298385.0, "step": 9515 }, { "entropy": 5.674180555343628, "epoch": 8.17877094972067, "grad_norm": 1.3671875, "learning_rate": 9.238668144690133e-05, "loss": 4.7892, "mean_token_accuracy": 0.2445373848080635, "num_tokens": 17307759.0, "step": 9520 }, { "entropy": 5.619296503067017, "epoch": 8.183068328319726, "grad_norm": 1.3125, "learning_rate": 9.219264678327527e-05, "loss": 4.7775, "mean_token_accuracy": 0.25314409881830213, "num_tokens": 17317135.0, "step": 9525 }, { "entropy": 5.620535802841187, "epoch": 8.18736570691878, "grad_norm": 1.390625, "learning_rate": 9.199901129784121e-05, "loss": 4.7845, "mean_token_accuracy": 0.24970700591802597, "num_tokens": 17326950.0, "step": 9530 }, { "entropy": 5.7192747592926025, "epoch": 8.191663085517835, "grad_norm": 1.28125, "learning_rate": 9.180577541342164e-05, "loss": 4.8815, "mean_token_accuracy": 0.24280520528554916, "num_tokens": 17335873.0, "step": 9535 }, { "entropy": 5.642719173431397, "epoch": 8.195960464116888, "grad_norm": 1.4375, "learning_rate": 9.161293955196648e-05, "loss": 4.7757, "mean_token_accuracy": 0.25748146921396253, "num_tokens": 17344659.0, "step": 9540 }, { "entropy": 5.707903146743774, "epoch": 8.200257842715944, "grad_norm": 1.390625, "learning_rate": 9.142050413455214e-05, "loss": 4.8345, "mean_token_accuracy": 0.23902178555727005, "num_tokens": 17353274.0, "step": 9545 }, { "entropy": 5.672132682800293, "epoch": 8.204555221314997, "grad_norm": 1.3046875, "learning_rate": 9.12284695813807e-05, "loss": 4.8439, "mean_token_accuracy": 0.25198720693588256, "num_tokens": 17362212.0, "step": 9550 }, { "entropy": 5.658090209960937, "epoch": 8.208852599914053, "grad_norm": 1.4609375, "learning_rate": 9.103683631177878e-05, "loss": 4.778, "mean_token_accuracy": 0.2557189479470253, "num_tokens": 17370567.0, "step": 9555 }, { "entropy": 5.621751546859741, "epoch": 8.213149978513107, "grad_norm": 1.6796875, "learning_rate": 9.084560474419701e-05, "loss": 4.7835, "mean_token_accuracy": 0.25118696242570876, "num_tokens": 17378521.0, "step": 9560 }, { "entropy": 5.696748685836792, "epoch": 8.217447357112162, "grad_norm": 1.484375, "learning_rate": 9.065477529620852e-05, "loss": 4.8066, "mean_token_accuracy": 0.2443075641989708, "num_tokens": 17386988.0, "step": 9565 }, { "entropy": 5.665190410614014, "epoch": 8.221744735711216, "grad_norm": 1.515625, "learning_rate": 9.046434838450868e-05, "loss": 4.8143, "mean_token_accuracy": 0.2463520273566246, "num_tokens": 17396317.0, "step": 9570 }, { "entropy": 5.7112932205200195, "epoch": 8.226042114310271, "grad_norm": 1.484375, "learning_rate": 9.027432442491369e-05, "loss": 4.9087, "mean_token_accuracy": 0.2336752951145172, "num_tokens": 17405768.0, "step": 9575 }, { "entropy": 5.691789293289185, "epoch": 8.230339492909325, "grad_norm": 1.484375, "learning_rate": 9.008470383235991e-05, "loss": 4.9032, "mean_token_accuracy": 0.24061028361320497, "num_tokens": 17416098.0, "step": 9580 }, { "entropy": 5.657401275634766, "epoch": 8.23463687150838, "grad_norm": 1.5078125, "learning_rate": 8.989548702090295e-05, "loss": 4.8131, "mean_token_accuracy": 0.2478136107325554, "num_tokens": 17426091.0, "step": 9585 }, { "entropy": 5.613147830963134, "epoch": 8.238934250107434, "grad_norm": 1.4609375, "learning_rate": 8.970667440371676e-05, "loss": 4.7774, "mean_token_accuracy": 0.2528875932097435, "num_tokens": 17434956.0, "step": 9590 }, { "entropy": 5.673092412948608, "epoch": 8.24323162870649, "grad_norm": 1.34375, "learning_rate": 8.951826639309257e-05, "loss": 4.8428, "mean_token_accuracy": 0.2408193603157997, "num_tokens": 17443980.0, "step": 9595 }, { "entropy": 5.613753366470337, "epoch": 8.247529007305543, "grad_norm": 1.484375, "learning_rate": 8.933026340043811e-05, "loss": 4.7847, "mean_token_accuracy": 0.24801580756902694, "num_tokens": 17452982.0, "step": 9600 }, { "entropy": 5.626958703994751, "epoch": 8.251826385904598, "grad_norm": 1.4765625, "learning_rate": 8.914266583627684e-05, "loss": 4.7602, "mean_token_accuracy": 0.25679432451725004, "num_tokens": 17461946.0, "step": 9605 }, { "entropy": 5.705102348327637, "epoch": 8.256123764503652, "grad_norm": 1.5859375, "learning_rate": 8.89554741102469e-05, "loss": 4.8275, "mean_token_accuracy": 0.2392795503139496, "num_tokens": 17470125.0, "step": 9610 }, { "entropy": 5.625812101364136, "epoch": 8.260421143102707, "grad_norm": 1.4140625, "learning_rate": 8.876868863110013e-05, "loss": 4.8448, "mean_token_accuracy": 0.24406588524580003, "num_tokens": 17479536.0, "step": 9615 }, { "entropy": 5.704039239883423, "epoch": 8.264718521701761, "grad_norm": 1.359375, "learning_rate": 8.858230980670134e-05, "loss": 4.7876, "mean_token_accuracy": 0.2442236140370369, "num_tokens": 17488709.0, "step": 9620 }, { "entropy": 5.685574102401733, "epoch": 8.269015900300817, "grad_norm": 1.3203125, "learning_rate": 8.839633804402747e-05, "loss": 4.8484, "mean_token_accuracy": 0.24362473785877228, "num_tokens": 17498196.0, "step": 9625 }, { "entropy": 5.6750462532043455, "epoch": 8.27331327889987, "grad_norm": 1.5234375, "learning_rate": 8.821077374916647e-05, "loss": 4.7961, "mean_token_accuracy": 0.25050569921731947, "num_tokens": 17507374.0, "step": 9630 }, { "entropy": 5.600301933288574, "epoch": 8.277610657498926, "grad_norm": 1.46875, "learning_rate": 8.802561732731654e-05, "loss": 4.7754, "mean_token_accuracy": 0.2526980236172676, "num_tokens": 17516163.0, "step": 9635 }, { "entropy": 5.719686460494995, "epoch": 8.281908036097981, "grad_norm": 1.3125, "learning_rate": 8.784086918278534e-05, "loss": 4.8649, "mean_token_accuracy": 0.2339397519826889, "num_tokens": 17525567.0, "step": 9640 }, { "entropy": 5.607286500930786, "epoch": 8.286205414697035, "grad_norm": 1.359375, "learning_rate": 8.765652971898908e-05, "loss": 4.7321, "mean_token_accuracy": 0.25611190050840377, "num_tokens": 17534433.0, "step": 9645 }, { "entropy": 5.677676486968994, "epoch": 8.29050279329609, "grad_norm": 1.5390625, "learning_rate": 8.747259933845134e-05, "loss": 4.8617, "mean_token_accuracy": 0.25352744907140734, "num_tokens": 17542981.0, "step": 9650 }, { "entropy": 5.707070398330688, "epoch": 8.294800171895144, "grad_norm": 1.328125, "learning_rate": 8.728907844280254e-05, "loss": 4.8851, "mean_token_accuracy": 0.23950794637203215, "num_tokens": 17552425.0, "step": 9655 }, { "entropy": 5.6442076683044435, "epoch": 8.2990975504942, "grad_norm": 1.515625, "learning_rate": 8.710596743277901e-05, "loss": 4.8163, "mean_token_accuracy": 0.24814968705177307, "num_tokens": 17560954.0, "step": 9660 }, { "entropy": 5.645054149627685, "epoch": 8.303394929093253, "grad_norm": 1.4609375, "learning_rate": 8.692326670822207e-05, "loss": 4.7305, "mean_token_accuracy": 0.2519303172826767, "num_tokens": 17569769.0, "step": 9665 }, { "entropy": 5.663473224639892, "epoch": 8.307692307692308, "grad_norm": 1.3984375, "learning_rate": 8.6740976668077e-05, "loss": 4.7768, "mean_token_accuracy": 0.25253538936376574, "num_tokens": 17578875.0, "step": 9670 }, { "entropy": 5.601699924468994, "epoch": 8.311989686291362, "grad_norm": 1.6015625, "learning_rate": 8.655909771039239e-05, "loss": 4.7722, "mean_token_accuracy": 0.24630842208862305, "num_tokens": 17587556.0, "step": 9675 }, { "entropy": 5.668028783798218, "epoch": 8.316287064890417, "grad_norm": 1.6640625, "learning_rate": 8.637763023231922e-05, "loss": 4.8186, "mean_token_accuracy": 0.2444746106863022, "num_tokens": 17595077.0, "step": 9680 }, { "entropy": 5.5484637260437015, "epoch": 8.320584443489471, "grad_norm": 1.46875, "learning_rate": 8.619657463010987e-05, "loss": 4.6988, "mean_token_accuracy": 0.260918502509594, "num_tokens": 17603864.0, "step": 9685 }, { "entropy": 5.722867631912232, "epoch": 8.324881822088527, "grad_norm": 1.4921875, "learning_rate": 8.60159312991175e-05, "loss": 4.8931, "mean_token_accuracy": 0.241627100110054, "num_tokens": 17613570.0, "step": 9690 }, { "entropy": 5.686518335342408, "epoch": 8.32917920068758, "grad_norm": 1.34375, "learning_rate": 8.583570063379487e-05, "loss": 4.8181, "mean_token_accuracy": 0.24606943130493164, "num_tokens": 17622352.0, "step": 9695 }, { "entropy": 5.644981908798218, "epoch": 8.333476579286636, "grad_norm": 1.484375, "learning_rate": 8.565588302769374e-05, "loss": 4.7561, "mean_token_accuracy": 0.24853309392929077, "num_tokens": 17631556.0, "step": 9700 }, { "entropy": 5.598904609680176, "epoch": 8.33777395788569, "grad_norm": 1.421875, "learning_rate": 8.54764788734639e-05, "loss": 4.785, "mean_token_accuracy": 0.2558070823550224, "num_tokens": 17641182.0, "step": 9705 }, { "entropy": 5.706678056716919, "epoch": 8.342071336484745, "grad_norm": 1.390625, "learning_rate": 8.529748856285228e-05, "loss": 4.7858, "mean_token_accuracy": 0.24797595292329788, "num_tokens": 17650842.0, "step": 9710 }, { "entropy": 5.658598756790161, "epoch": 8.346368715083798, "grad_norm": 1.4375, "learning_rate": 8.511891248670217e-05, "loss": 4.782, "mean_token_accuracy": 0.2595686510205269, "num_tokens": 17659963.0, "step": 9715 }, { "entropy": 5.688131809234619, "epoch": 8.350666093682854, "grad_norm": 1.5625, "learning_rate": 8.494075103495245e-05, "loss": 4.8469, "mean_token_accuracy": 0.24513275921344757, "num_tokens": 17668637.0, "step": 9720 }, { "entropy": 5.676475381851196, "epoch": 8.354963472281907, "grad_norm": 1.390625, "learning_rate": 8.476300459663643e-05, "loss": 4.8216, "mean_token_accuracy": 0.24088364839553833, "num_tokens": 17678212.0, "step": 9725 }, { "entropy": 5.665163183212281, "epoch": 8.359260850880963, "grad_norm": 1.3515625, "learning_rate": 8.458567355988123e-05, "loss": 4.891, "mean_token_accuracy": 0.23809681534767152, "num_tokens": 17686766.0, "step": 9730 }, { "entropy": 5.628652429580688, "epoch": 8.363558229480017, "grad_norm": 1.234375, "learning_rate": 8.440875831190704e-05, "loss": 4.8108, "mean_token_accuracy": 0.243441666662693, "num_tokens": 17696846.0, "step": 9735 }, { "entropy": 5.656926536560059, "epoch": 8.367855608079072, "grad_norm": 1.3359375, "learning_rate": 8.423225923902608e-05, "loss": 4.7911, "mean_token_accuracy": 0.25119892954826356, "num_tokens": 17705846.0, "step": 9740 }, { "entropy": 5.626911640167236, "epoch": 8.372152986678126, "grad_norm": 1.4765625, "learning_rate": 8.405617672664175e-05, "loss": 4.7294, "mean_token_accuracy": 0.26259505450725557, "num_tokens": 17714255.0, "step": 9745 }, { "entropy": 5.725198030471802, "epoch": 8.376450365277181, "grad_norm": 1.3359375, "learning_rate": 8.388051115924781e-05, "loss": 4.8935, "mean_token_accuracy": 0.2378841072320938, "num_tokens": 17724246.0, "step": 9750 }, { "entropy": 5.666871118545532, "epoch": 8.380747743876235, "grad_norm": 1.4296875, "learning_rate": 8.370526292042774e-05, "loss": 4.7505, "mean_token_accuracy": 0.2538667440414429, "num_tokens": 17733081.0, "step": 9755 }, { "entropy": 5.587338781356811, "epoch": 8.38504512247529, "grad_norm": 1.671875, "learning_rate": 8.353043239285357e-05, "loss": 4.7737, "mean_token_accuracy": 0.2544986054301262, "num_tokens": 17741220.0, "step": 9760 }, { "entropy": 5.640384817123413, "epoch": 8.389342501074344, "grad_norm": 1.5546875, "learning_rate": 8.335601995828534e-05, "loss": 4.8104, "mean_token_accuracy": 0.242693130671978, "num_tokens": 17750057.0, "step": 9765 }, { "entropy": 5.6606762409210205, "epoch": 8.3936398796734, "grad_norm": 1.3828125, "learning_rate": 8.318202599757008e-05, "loss": 4.8345, "mean_token_accuracy": 0.24492816925048827, "num_tokens": 17759016.0, "step": 9770 }, { "entropy": 5.606284189224243, "epoch": 8.397937258272453, "grad_norm": 1.5078125, "learning_rate": 8.30084508906411e-05, "loss": 4.7323, "mean_token_accuracy": 0.2531019449234009, "num_tokens": 17767170.0, "step": 9775 }, { "entropy": 5.618020248413086, "epoch": 8.402234636871508, "grad_norm": 1.5078125, "learning_rate": 8.283529501651698e-05, "loss": 4.8268, "mean_token_accuracy": 0.24008245319128035, "num_tokens": 17776508.0, "step": 9780 }, { "entropy": 5.642845964431762, "epoch": 8.406532015470564, "grad_norm": 1.3671875, "learning_rate": 8.266255875330095e-05, "loss": 4.7733, "mean_token_accuracy": 0.25374836176633836, "num_tokens": 17785494.0, "step": 9785 }, { "entropy": 5.627595281600952, "epoch": 8.410829394069617, "grad_norm": 1.4921875, "learning_rate": 8.249024247817998e-05, "loss": 4.8114, "mean_token_accuracy": 0.24691130816936493, "num_tokens": 17795158.0, "step": 9790 }, { "entropy": 5.655184364318847, "epoch": 8.415126772668673, "grad_norm": 1.2734375, "learning_rate": 8.231834656742402e-05, "loss": 4.8303, "mean_token_accuracy": 0.24660495668649673, "num_tokens": 17805130.0, "step": 9795 }, { "entropy": 5.6911468505859375, "epoch": 8.419424151267727, "grad_norm": 1.4921875, "learning_rate": 8.214687139638494e-05, "loss": 4.833, "mean_token_accuracy": 0.25070433020591737, "num_tokens": 17813775.0, "step": 9800 }, { "entropy": 5.6668381690979, "epoch": 8.423721529866782, "grad_norm": 1.5625, "learning_rate": 8.197581733949597e-05, "loss": 4.7642, "mean_token_accuracy": 0.2571685716509819, "num_tokens": 17822599.0, "step": 9805 }, { "entropy": 5.67210898399353, "epoch": 8.428018908465836, "grad_norm": 1.5703125, "learning_rate": 8.180518477027083e-05, "loss": 4.7963, "mean_token_accuracy": 0.2531482130289078, "num_tokens": 17830742.0, "step": 9810 }, { "entropy": 5.551906299591065, "epoch": 8.432316287064891, "grad_norm": 1.3828125, "learning_rate": 8.163497406130287e-05, "loss": 4.7097, "mean_token_accuracy": 0.2613152891397476, "num_tokens": 17840101.0, "step": 9815 }, { "entropy": 5.731686592102051, "epoch": 8.436613665663945, "grad_norm": 1.3828125, "learning_rate": 8.146518558426426e-05, "loss": 4.9128, "mean_token_accuracy": 0.23491355329751967, "num_tokens": 17850464.0, "step": 9820 }, { "entropy": 5.651397562026977, "epoch": 8.440911044263, "grad_norm": 1.484375, "learning_rate": 8.129581970990507e-05, "loss": 4.7678, "mean_token_accuracy": 0.2589532405138016, "num_tokens": 17858792.0, "step": 9825 }, { "entropy": 5.655478382110596, "epoch": 8.445208422862054, "grad_norm": 1.546875, "learning_rate": 8.11268768080528e-05, "loss": 4.7926, "mean_token_accuracy": 0.24893169701099396, "num_tokens": 17867225.0, "step": 9830 }, { "entropy": 5.679694795608521, "epoch": 8.44950580146111, "grad_norm": 1.5859375, "learning_rate": 8.09583572476111e-05, "loss": 4.8898, "mean_token_accuracy": 0.24183138757944106, "num_tokens": 17876423.0, "step": 9835 }, { "entropy": 5.670652627944946, "epoch": 8.453803180060163, "grad_norm": 1.5859375, "learning_rate": 8.079026139655946e-05, "loss": 4.8412, "mean_token_accuracy": 0.24176028072834016, "num_tokens": 17886847.0, "step": 9840 }, { "entropy": 5.590162706375122, "epoch": 8.458100558659218, "grad_norm": 1.4296875, "learning_rate": 8.062258962195192e-05, "loss": 4.7299, "mean_token_accuracy": 0.2560922592878342, "num_tokens": 17897395.0, "step": 9845 }, { "entropy": 5.63255124092102, "epoch": 8.462397937258272, "grad_norm": 1.5859375, "learning_rate": 8.04553422899167e-05, "loss": 4.826, "mean_token_accuracy": 0.24348948746919633, "num_tokens": 17906190.0, "step": 9850 }, { "entropy": 5.6421679019927975, "epoch": 8.466695315857327, "grad_norm": 1.3515625, "learning_rate": 8.028851976565508e-05, "loss": 4.872, "mean_token_accuracy": 0.24232618063688277, "num_tokens": 17916056.0, "step": 9855 }, { "entropy": 5.640596628189087, "epoch": 8.470992694456381, "grad_norm": 1.4140625, "learning_rate": 8.01221224134408e-05, "loss": 4.767, "mean_token_accuracy": 0.2555935025215149, "num_tokens": 17925790.0, "step": 9860 }, { "entropy": 5.6605424880981445, "epoch": 8.475290073055437, "grad_norm": 1.4765625, "learning_rate": 7.995615059661907e-05, "loss": 4.8383, "mean_token_accuracy": 0.24309351146221161, "num_tokens": 17935007.0, "step": 9865 }, { "entropy": 5.669471883773804, "epoch": 8.47958745165449, "grad_norm": 1.4609375, "learning_rate": 7.979060467760616e-05, "loss": 4.824, "mean_token_accuracy": 0.24673386365175248, "num_tokens": 17943363.0, "step": 9870 }, { "entropy": 5.629602289199829, "epoch": 8.483884830253546, "grad_norm": 1.5390625, "learning_rate": 7.962548501788811e-05, "loss": 4.7397, "mean_token_accuracy": 0.2527197152376175, "num_tokens": 17951667.0, "step": 9875 }, { "entropy": 5.647467613220215, "epoch": 8.4881822088526, "grad_norm": 1.484375, "learning_rate": 7.946079197802028e-05, "loss": 4.827, "mean_token_accuracy": 0.23749534040689468, "num_tokens": 17960920.0, "step": 9880 }, { "entropy": 5.648807525634766, "epoch": 8.492479587451655, "grad_norm": 1.609375, "learning_rate": 7.929652591762636e-05, "loss": 4.8127, "mean_token_accuracy": 0.24475187957286834, "num_tokens": 17969462.0, "step": 9885 }, { "entropy": 5.630274534225464, "epoch": 8.496776966050708, "grad_norm": 1.5703125, "learning_rate": 7.913268719539785e-05, "loss": 4.8389, "mean_token_accuracy": 0.24791176468133927, "num_tokens": 17978608.0, "step": 9890 }, { "entropy": 5.617410373687744, "epoch": 8.501074344649764, "grad_norm": 1.40625, "learning_rate": 7.896927616909304e-05, "loss": 4.7157, "mean_token_accuracy": 0.2596577987074852, "num_tokens": 17987563.0, "step": 9895 }, { "entropy": 5.6985677719116214, "epoch": 8.505371723248818, "grad_norm": 1.3359375, "learning_rate": 7.880629319553623e-05, "loss": 4.8151, "mean_token_accuracy": 0.2542118713259697, "num_tokens": 17996493.0, "step": 9900 }, { "entropy": 5.660664224624634, "epoch": 8.509669101847873, "grad_norm": 1.3984375, "learning_rate": 7.864373863061717e-05, "loss": 4.819, "mean_token_accuracy": 0.2512780398130417, "num_tokens": 18004578.0, "step": 9905 }, { "entropy": 5.619579887390136, "epoch": 8.513966480446927, "grad_norm": 1.5546875, "learning_rate": 7.848161282929006e-05, "loss": 4.7944, "mean_token_accuracy": 0.24880642294883729, "num_tokens": 18013366.0, "step": 9910 }, { "entropy": 5.679474544525147, "epoch": 8.518263859045982, "grad_norm": 1.5, "learning_rate": 7.831991614557274e-05, "loss": 4.8348, "mean_token_accuracy": 0.23917439728975295, "num_tokens": 18021718.0, "step": 9915 }, { "entropy": 5.6579587936401365, "epoch": 8.522561237645036, "grad_norm": 1.515625, "learning_rate": 7.815864893254619e-05, "loss": 4.8091, "mean_token_accuracy": 0.2455276608467102, "num_tokens": 18030770.0, "step": 9920 }, { "entropy": 5.663690233230591, "epoch": 8.526858616244091, "grad_norm": 1.4140625, "learning_rate": 7.799781154235361e-05, "loss": 4.8306, "mean_token_accuracy": 0.24498046338558196, "num_tokens": 18039009.0, "step": 9925 }, { "entropy": 5.722172498703003, "epoch": 8.531155994843147, "grad_norm": 1.6015625, "learning_rate": 7.783740432619954e-05, "loss": 4.8863, "mean_token_accuracy": 0.2334560737013817, "num_tokens": 18046677.0, "step": 9930 }, { "entropy": 5.644006681442261, "epoch": 8.5354533734422, "grad_norm": 1.4921875, "learning_rate": 7.767742763434922e-05, "loss": 4.8411, "mean_token_accuracy": 0.2409740775823593, "num_tokens": 18056772.0, "step": 9935 }, { "entropy": 5.589591312408447, "epoch": 8.539750752041256, "grad_norm": 1.4140625, "learning_rate": 7.75178818161277e-05, "loss": 4.7222, "mean_token_accuracy": 0.25535061210393906, "num_tokens": 18065387.0, "step": 9940 }, { "entropy": 5.667737579345703, "epoch": 8.54404813064031, "grad_norm": 1.5, "learning_rate": 7.735876721991945e-05, "loss": 4.8786, "mean_token_accuracy": 0.24178892970085145, "num_tokens": 18074495.0, "step": 9945 }, { "entropy": 5.685311031341553, "epoch": 8.548345509239365, "grad_norm": 1.3203125, "learning_rate": 7.720008419316708e-05, "loss": 4.8577, "mean_token_accuracy": 0.24099551886320114, "num_tokens": 18083351.0, "step": 9950 }, { "entropy": 5.613406562805176, "epoch": 8.552642887838418, "grad_norm": 1.375, "learning_rate": 7.704183308237089e-05, "loss": 4.7478, "mean_token_accuracy": 0.256206014752388, "num_tokens": 18092632.0, "step": 9955 }, { "entropy": 5.592525625228882, "epoch": 8.556940266437474, "grad_norm": 1.3515625, "learning_rate": 7.688401423308799e-05, "loss": 4.758, "mean_token_accuracy": 0.25166425555944444, "num_tokens": 18102380.0, "step": 9960 }, { "entropy": 5.575402688980103, "epoch": 8.561237645036528, "grad_norm": 1.5703125, "learning_rate": 7.672662798993174e-05, "loss": 4.7522, "mean_token_accuracy": 0.25995253920555117, "num_tokens": 18111254.0, "step": 9965 }, { "entropy": 5.668489742279053, "epoch": 8.565535023635583, "grad_norm": 1.4296875, "learning_rate": 7.656967469657083e-05, "loss": 4.8124, "mean_token_accuracy": 0.24452023208141327, "num_tokens": 18119998.0, "step": 9970 }, { "entropy": 5.618843078613281, "epoch": 8.569832402234637, "grad_norm": 1.4140625, "learning_rate": 7.641315469572841e-05, "loss": 4.7452, "mean_token_accuracy": 0.2510646566748619, "num_tokens": 18129237.0, "step": 9975 }, { "entropy": 5.674667453765869, "epoch": 8.574129780833692, "grad_norm": 1.3125, "learning_rate": 7.625706832918172e-05, "loss": 4.8343, "mean_token_accuracy": 0.24694945514202118, "num_tokens": 18138440.0, "step": 9980 }, { "entropy": 5.677858638763428, "epoch": 8.578427159432746, "grad_norm": 1.4375, "learning_rate": 7.610141593776091e-05, "loss": 4.9126, "mean_token_accuracy": 0.23654921650886535, "num_tokens": 18147712.0, "step": 9985 }, { "entropy": 5.656866931915284, "epoch": 8.582724538031801, "grad_norm": 1.4609375, "learning_rate": 7.59461978613486e-05, "loss": 4.8304, "mean_token_accuracy": 0.23874644041061402, "num_tokens": 18156710.0, "step": 9990 }, { "entropy": 5.644109725952148, "epoch": 8.587021916630855, "grad_norm": 1.515625, "learning_rate": 7.579141443887901e-05, "loss": 4.7803, "mean_token_accuracy": 0.2478152796626091, "num_tokens": 18165412.0, "step": 9995 }, { "entropy": 5.652941131591797, "epoch": 8.59131929522991, "grad_norm": 1.578125, "learning_rate": 7.563706600833737e-05, "loss": 4.8194, "mean_token_accuracy": 0.24661646783351898, "num_tokens": 18174415.0, "step": 10000 }, { "epoch": 8.59131929522991, "eval_entropy": 5.45993368367891, "eval_loss": 5.879827976226807, "eval_mean_token_accuracy": 0.1841710262034122, "eval_num_tokens": 18174415.0, "eval_runtime": 2.0477, "eval_samples_per_second": 1733.145, "eval_steps_per_second": 216.826, "step": 10000 }, { "entropy": 5.582108640670777, "epoch": 8.595616673828964, "grad_norm": 1.6484375, "learning_rate": 7.548315290675886e-05, "loss": 4.7544, "mean_token_accuracy": 0.2536217987537384, "num_tokens": 18183468.0, "step": 10005 }, { "entropy": 5.594784164428711, "epoch": 8.59991405242802, "grad_norm": 1.375, "learning_rate": 7.532967547022825e-05, "loss": 4.7644, "mean_token_accuracy": 0.25198703110218046, "num_tokens": 18192552.0, "step": 10010 }, { "entropy": 5.632088756561279, "epoch": 8.604211431027073, "grad_norm": 1.515625, "learning_rate": 7.517663403387874e-05, "loss": 4.8489, "mean_token_accuracy": 0.24825350791215897, "num_tokens": 18202056.0, "step": 10015 }, { "entropy": 5.578985023498535, "epoch": 8.608508809626128, "grad_norm": 1.4765625, "learning_rate": 7.502402893189191e-05, "loss": 4.7289, "mean_token_accuracy": 0.25674757063388826, "num_tokens": 18210329.0, "step": 10020 }, { "entropy": 5.71737813949585, "epoch": 8.612806188225182, "grad_norm": 1.328125, "learning_rate": 7.487186049749618e-05, "loss": 4.8561, "mean_token_accuracy": 0.2366224303841591, "num_tokens": 18219143.0, "step": 10025 }, { "entropy": 5.631614637374878, "epoch": 8.617103566824238, "grad_norm": 1.40625, "learning_rate": 7.472012906296658e-05, "loss": 4.7979, "mean_token_accuracy": 0.24905427694320678, "num_tokens": 18227679.0, "step": 10030 }, { "entropy": 5.615096664428711, "epoch": 8.621400945423291, "grad_norm": 1.2265625, "learning_rate": 7.45688349596239e-05, "loss": 4.7738, "mean_token_accuracy": 0.2522251158952713, "num_tokens": 18237522.0, "step": 10035 }, { "entropy": 5.646911573410034, "epoch": 8.625698324022347, "grad_norm": 1.4609375, "learning_rate": 7.441797851783402e-05, "loss": 4.8238, "mean_token_accuracy": 0.2428828716278076, "num_tokens": 18245969.0, "step": 10040 }, { "entropy": 5.627325582504272, "epoch": 8.6299957026214, "grad_norm": 1.515625, "learning_rate": 7.426756006700716e-05, "loss": 4.7666, "mean_token_accuracy": 0.25277192890644073, "num_tokens": 18254886.0, "step": 10045 }, { "entropy": 5.686093473434449, "epoch": 8.634293081220456, "grad_norm": 1.5703125, "learning_rate": 7.411757993559702e-05, "loss": 4.889, "mean_token_accuracy": 0.24558345079421998, "num_tokens": 18264222.0, "step": 10050 }, { "entropy": 5.702205467224121, "epoch": 8.63859045981951, "grad_norm": 1.4765625, "learning_rate": 7.396803845110032e-05, "loss": 4.8488, "mean_token_accuracy": 0.25144137591123583, "num_tokens": 18273548.0, "step": 10055 }, { "entropy": 5.66311354637146, "epoch": 8.642887838418565, "grad_norm": 1.4453125, "learning_rate": 7.381893594005585e-05, "loss": 4.8137, "mean_token_accuracy": 0.24637157917022706, "num_tokens": 18282842.0, "step": 10060 }, { "entropy": 5.6392535209655765, "epoch": 8.647185217017618, "grad_norm": 1.453125, "learning_rate": 7.367027272804387e-05, "loss": 4.7912, "mean_token_accuracy": 0.24758926182985305, "num_tokens": 18291133.0, "step": 10065 }, { "entropy": 5.656706380844116, "epoch": 8.651482595616674, "grad_norm": 1.3984375, "learning_rate": 7.352204913968546e-05, "loss": 4.8252, "mean_token_accuracy": 0.24841432571411132, "num_tokens": 18300936.0, "step": 10070 }, { "entropy": 5.6700053215026855, "epoch": 8.65577997421573, "grad_norm": 1.2265625, "learning_rate": 7.337426549864175e-05, "loss": 4.8512, "mean_token_accuracy": 0.24023033082485198, "num_tokens": 18311372.0, "step": 10075 }, { "entropy": 5.655622768402099, "epoch": 8.660077352814783, "grad_norm": 1.2890625, "learning_rate": 7.322692212761306e-05, "loss": 4.8588, "mean_token_accuracy": 0.24819429963827133, "num_tokens": 18321166.0, "step": 10080 }, { "entropy": 5.672299194335937, "epoch": 8.664374731413837, "grad_norm": 1.5234375, "learning_rate": 7.308001934833844e-05, "loss": 4.8068, "mean_token_accuracy": 0.24570150524377823, "num_tokens": 18330399.0, "step": 10085 }, { "entropy": 5.6919238567352295, "epoch": 8.668672110012892, "grad_norm": 1.46875, "learning_rate": 7.29335574815948e-05, "loss": 4.7831, "mean_token_accuracy": 0.24971666783094407, "num_tokens": 18338920.0, "step": 10090 }, { "entropy": 5.647150468826294, "epoch": 8.672969488611947, "grad_norm": 1.3046875, "learning_rate": 7.278753684719636e-05, "loss": 4.8084, "mean_token_accuracy": 0.2445070579648018, "num_tokens": 18348515.0, "step": 10095 }, { "entropy": 5.6693848133087155, "epoch": 8.677266867211001, "grad_norm": 1.4765625, "learning_rate": 7.264195776399386e-05, "loss": 4.8275, "mean_token_accuracy": 0.2455286830663681, "num_tokens": 18357353.0, "step": 10100 }, { "entropy": 5.631603240966797, "epoch": 8.681564245810057, "grad_norm": 1.390625, "learning_rate": 7.249682054987381e-05, "loss": 4.7488, "mean_token_accuracy": 0.25079986453056335, "num_tokens": 18366434.0, "step": 10105 }, { "entropy": 5.709641027450561, "epoch": 8.68586162440911, "grad_norm": 1.34375, "learning_rate": 7.23521255217578e-05, "loss": 4.8827, "mean_token_accuracy": 0.23537713289260864, "num_tokens": 18375589.0, "step": 10110 }, { "entropy": 5.6438311576843265, "epoch": 8.690159003008166, "grad_norm": 1.3984375, "learning_rate": 7.220787299560205e-05, "loss": 4.8076, "mean_token_accuracy": 0.2504834607243538, "num_tokens": 18384904.0, "step": 10115 }, { "entropy": 5.618740224838257, "epoch": 8.69445638160722, "grad_norm": 1.5234375, "learning_rate": 7.20640632863963e-05, "loss": 4.7024, "mean_token_accuracy": 0.25854243636131286, "num_tokens": 18393833.0, "step": 10120 }, { "entropy": 5.633188009262085, "epoch": 8.698753760206275, "grad_norm": 1.4296875, "learning_rate": 7.192069670816359e-05, "loss": 4.8576, "mean_token_accuracy": 0.241273033618927, "num_tokens": 18403444.0, "step": 10125 }, { "entropy": 5.624459218978882, "epoch": 8.703051138805328, "grad_norm": 1.59375, "learning_rate": 7.177777357395912e-05, "loss": 4.7869, "mean_token_accuracy": 0.2525374382734299, "num_tokens": 18412091.0, "step": 10130 }, { "entropy": 5.629094934463501, "epoch": 8.707348517404384, "grad_norm": 1.4765625, "learning_rate": 7.163529419587002e-05, "loss": 4.7907, "mean_token_accuracy": 0.24723465293645858, "num_tokens": 18421655.0, "step": 10135 }, { "entropy": 5.711287641525269, "epoch": 8.711645896003438, "grad_norm": 1.421875, "learning_rate": 7.149325888501418e-05, "loss": 4.879, "mean_token_accuracy": 0.24357055127620697, "num_tokens": 18431239.0, "step": 10140 }, { "entropy": 5.641200685501099, "epoch": 8.715943274602493, "grad_norm": 1.3515625, "learning_rate": 7.135166795153992e-05, "loss": 4.763, "mean_token_accuracy": 0.2550801619887352, "num_tokens": 18440572.0, "step": 10145 }, { "entropy": 5.671554136276245, "epoch": 8.720240653201547, "grad_norm": 1.453125, "learning_rate": 7.121052170462541e-05, "loss": 4.8463, "mean_token_accuracy": 0.24792980551719665, "num_tokens": 18449245.0, "step": 10150 }, { "entropy": 5.579294300079345, "epoch": 8.724538031800602, "grad_norm": 1.5390625, "learning_rate": 7.106982045247754e-05, "loss": 4.6967, "mean_token_accuracy": 0.26570699363946915, "num_tokens": 18457961.0, "step": 10155 }, { "entropy": 5.6380534172058105, "epoch": 8.728835410399656, "grad_norm": 1.3984375, "learning_rate": 7.092956450233162e-05, "loss": 4.8613, "mean_token_accuracy": 0.2413666471838951, "num_tokens": 18467956.0, "step": 10160 }, { "entropy": 5.6795158863067625, "epoch": 8.733132788998711, "grad_norm": 1.3203125, "learning_rate": 7.078975416045055e-05, "loss": 4.7933, "mean_token_accuracy": 0.2507988974452019, "num_tokens": 18477313.0, "step": 10165 }, { "entropy": 5.592861557006836, "epoch": 8.737430167597765, "grad_norm": 1.6484375, "learning_rate": 7.065038973212424e-05, "loss": 4.6759, "mean_token_accuracy": 0.260982346534729, "num_tokens": 18485690.0, "step": 10170 }, { "entropy": 5.639511489868164, "epoch": 8.74172754619682, "grad_norm": 1.5703125, "learning_rate": 7.051147152166896e-05, "loss": 4.8044, "mean_token_accuracy": 0.252939784526825, "num_tokens": 18494150.0, "step": 10175 }, { "entropy": 5.585833978652954, "epoch": 8.746024924795874, "grad_norm": 1.2265625, "learning_rate": 7.037299983242652e-05, "loss": 4.7762, "mean_token_accuracy": 0.25288844257593157, "num_tokens": 18504091.0, "step": 10180 }, { "entropy": 5.6570563316345215, "epoch": 8.75032230339493, "grad_norm": 1.4140625, "learning_rate": 7.023497496676371e-05, "loss": 4.8169, "mean_token_accuracy": 0.24400742948055268, "num_tokens": 18513695.0, "step": 10185 }, { "entropy": 5.645201253890991, "epoch": 8.754619681993983, "grad_norm": 1.296875, "learning_rate": 7.009739722607173e-05, "loss": 4.7747, "mean_token_accuracy": 0.25545826405286787, "num_tokens": 18522551.0, "step": 10190 }, { "entropy": 5.678719329833984, "epoch": 8.758917060593038, "grad_norm": 1.40625, "learning_rate": 6.996026691076531e-05, "loss": 4.9283, "mean_token_accuracy": 0.2372728779911995, "num_tokens": 18533290.0, "step": 10195 }, { "entropy": 5.646335554122925, "epoch": 8.763214439192092, "grad_norm": 1.390625, "learning_rate": 6.982358432028234e-05, "loss": 4.7396, "mean_token_accuracy": 0.2493108794093132, "num_tokens": 18542990.0, "step": 10200 }, { "entropy": 5.754535675048828, "epoch": 8.767511817791148, "grad_norm": 1.453125, "learning_rate": 6.968734975308283e-05, "loss": 4.9163, "mean_token_accuracy": 0.23316902965307235, "num_tokens": 18552525.0, "step": 10205 }, { "entropy": 5.613467741012573, "epoch": 8.771809196390201, "grad_norm": 1.4375, "learning_rate": 6.955156350664876e-05, "loss": 4.7408, "mean_token_accuracy": 0.2547129586338997, "num_tokens": 18561741.0, "step": 10210 }, { "entropy": 5.715860271453858, "epoch": 8.776106574989257, "grad_norm": 1.4609375, "learning_rate": 6.941622587748298e-05, "loss": 4.8939, "mean_token_accuracy": 0.2358478605747223, "num_tokens": 18570130.0, "step": 10215 }, { "entropy": 5.6053954601287845, "epoch": 8.780403953588312, "grad_norm": 1.484375, "learning_rate": 6.928133716110863e-05, "loss": 4.7973, "mean_token_accuracy": 0.24966762959957123, "num_tokens": 18579149.0, "step": 10220 }, { "entropy": 5.650966930389404, "epoch": 8.784701332187366, "grad_norm": 1.4921875, "learning_rate": 6.914689765206899e-05, "loss": 4.722, "mean_token_accuracy": 0.2569688901305199, "num_tokens": 18587604.0, "step": 10225 }, { "entropy": 5.668413829803467, "epoch": 8.78899871078642, "grad_norm": 1.5546875, "learning_rate": 6.901290764392609e-05, "loss": 4.7793, "mean_token_accuracy": 0.25286874175071716, "num_tokens": 18596056.0, "step": 10230 }, { "entropy": 5.673072862625122, "epoch": 8.793296089385475, "grad_norm": 1.2265625, "learning_rate": 6.887936742926058e-05, "loss": 4.8267, "mean_token_accuracy": 0.24095726311206817, "num_tokens": 18605575.0, "step": 10235 }, { "entropy": 5.602101993560791, "epoch": 8.79759346798453, "grad_norm": 1.3984375, "learning_rate": 6.874627729967086e-05, "loss": 4.7614, "mean_token_accuracy": 0.25367412269115447, "num_tokens": 18614206.0, "step": 10240 }, { "entropy": 5.634996032714843, "epoch": 8.801890846583584, "grad_norm": 1.3125, "learning_rate": 6.861363754577262e-05, "loss": 4.7681, "mean_token_accuracy": 0.25228522568941114, "num_tokens": 18623067.0, "step": 10245 }, { "entropy": 5.583075428009034, "epoch": 8.80618822518264, "grad_norm": 1.53125, "learning_rate": 6.848144845719808e-05, "loss": 4.762, "mean_token_accuracy": 0.25409252345561983, "num_tokens": 18632266.0, "step": 10250 }, { "entropy": 5.593589639663696, "epoch": 8.810485603781693, "grad_norm": 1.4765625, "learning_rate": 6.834971032259537e-05, "loss": 4.7551, "mean_token_accuracy": 0.24724589586257933, "num_tokens": 18640516.0, "step": 10255 }, { "entropy": 5.59733476638794, "epoch": 8.814782982380748, "grad_norm": 1.5234375, "learning_rate": 6.821842342962786e-05, "loss": 4.7732, "mean_token_accuracy": 0.2560932472348213, "num_tokens": 18649368.0, "step": 10260 }, { "entropy": 5.605298137664795, "epoch": 8.819080360979802, "grad_norm": 1.453125, "learning_rate": 6.808758806497375e-05, "loss": 4.7547, "mean_token_accuracy": 0.25770940631628036, "num_tokens": 18658306.0, "step": 10265 }, { "entropy": 5.668881273269653, "epoch": 8.823377739578858, "grad_norm": 1.4921875, "learning_rate": 6.795720451432509e-05, "loss": 4.763, "mean_token_accuracy": 0.2539924278855324, "num_tokens": 18668676.0, "step": 10270 }, { "entropy": 5.653170919418335, "epoch": 8.827675118177911, "grad_norm": 1.3359375, "learning_rate": 6.782727306238749e-05, "loss": 4.8899, "mean_token_accuracy": 0.23995666801929474, "num_tokens": 18677840.0, "step": 10275 }, { "entropy": 5.571465110778808, "epoch": 8.831972496776967, "grad_norm": 1.578125, "learning_rate": 6.769779399287928e-05, "loss": 4.7125, "mean_token_accuracy": 0.2465637966990471, "num_tokens": 18685733.0, "step": 10280 }, { "entropy": 5.609991598129272, "epoch": 8.83626987537602, "grad_norm": 1.40625, "learning_rate": 6.756876758853104e-05, "loss": 4.7832, "mean_token_accuracy": 0.24858400970697403, "num_tokens": 18694648.0, "step": 10285 }, { "entropy": 5.614251518249512, "epoch": 8.840567253975076, "grad_norm": 1.671875, "learning_rate": 6.744019413108486e-05, "loss": 4.7793, "mean_token_accuracy": 0.25418430119752883, "num_tokens": 18702758.0, "step": 10290 }, { "entropy": 5.661041927337647, "epoch": 8.84486463257413, "grad_norm": 1.453125, "learning_rate": 6.731207390129366e-05, "loss": 4.8443, "mean_token_accuracy": 0.23748955130577087, "num_tokens": 18712257.0, "step": 10295 }, { "entropy": 5.648515462875366, "epoch": 8.849162011173185, "grad_norm": 1.265625, "learning_rate": 6.7184407178921e-05, "loss": 4.8226, "mean_token_accuracy": 0.25025610327720643, "num_tokens": 18722337.0, "step": 10300 }, { "entropy": 5.668762063980102, "epoch": 8.853459389772238, "grad_norm": 1.5390625, "learning_rate": 6.70571942427399e-05, "loss": 4.8706, "mean_token_accuracy": 0.24509814977645875, "num_tokens": 18731260.0, "step": 10305 }, { "entropy": 5.696825885772705, "epoch": 8.857756768371294, "grad_norm": 1.484375, "learning_rate": 6.693043537053254e-05, "loss": 4.9, "mean_token_accuracy": 0.23831938505172728, "num_tokens": 18741357.0, "step": 10310 }, { "entropy": 5.636625862121582, "epoch": 8.862054146970348, "grad_norm": 1.4453125, "learning_rate": 6.68041308390896e-05, "loss": 4.8411, "mean_token_accuracy": 0.2473379299044609, "num_tokens": 18751694.0, "step": 10315 }, { "entropy": 5.678902578353882, "epoch": 8.866351525569403, "grad_norm": 1.34375, "learning_rate": 6.667828092420977e-05, "loss": 4.8471, "mean_token_accuracy": 0.24866860657930373, "num_tokens": 18760679.0, "step": 10320 }, { "entropy": 5.607882404327393, "epoch": 8.870648904168457, "grad_norm": 1.375, "learning_rate": 6.655288590069889e-05, "loss": 4.7921, "mean_token_accuracy": 0.254789824783802, "num_tokens": 18769650.0, "step": 10325 }, { "entropy": 5.594926452636718, "epoch": 8.874946282767512, "grad_norm": 1.375, "learning_rate": 6.642794604236965e-05, "loss": 4.8221, "mean_token_accuracy": 0.24797703474760055, "num_tokens": 18780076.0, "step": 10330 }, { "entropy": 5.618593168258667, "epoch": 8.879243661366566, "grad_norm": 1.3984375, "learning_rate": 6.630346162204069e-05, "loss": 4.7636, "mean_token_accuracy": 0.2526925429701805, "num_tokens": 18789034.0, "step": 10335 }, { "entropy": 5.6028295993804935, "epoch": 8.883541039965621, "grad_norm": 1.6484375, "learning_rate": 6.617943291153631e-05, "loss": 4.6939, "mean_token_accuracy": 0.25986738353967664, "num_tokens": 18797916.0, "step": 10340 }, { "entropy": 5.710940647125244, "epoch": 8.887838418564675, "grad_norm": 1.6171875, "learning_rate": 6.605586018168558e-05, "loss": 4.81, "mean_token_accuracy": 0.24979257434606553, "num_tokens": 18806895.0, "step": 10345 }, { "entropy": 5.683839225769043, "epoch": 8.89213579716373, "grad_norm": 1.34375, "learning_rate": 6.593274370232191e-05, "loss": 4.8327, "mean_token_accuracy": 0.2464034602046013, "num_tokens": 18816049.0, "step": 10350 }, { "entropy": 5.628410482406617, "epoch": 8.896433175762784, "grad_norm": 1.3359375, "learning_rate": 6.581008374228255e-05, "loss": 4.7908, "mean_token_accuracy": 0.2461797282099724, "num_tokens": 18825491.0, "step": 10355 }, { "entropy": 5.622152996063233, "epoch": 8.90073055436184, "grad_norm": 1.4296875, "learning_rate": 6.568788056940785e-05, "loss": 4.8165, "mean_token_accuracy": 0.2465150237083435, "num_tokens": 18834677.0, "step": 10360 }, { "entropy": 5.6243490219116214, "epoch": 8.905027932960895, "grad_norm": 1.359375, "learning_rate": 6.556613445054063e-05, "loss": 4.7228, "mean_token_accuracy": 0.25819993317127227, "num_tokens": 18844283.0, "step": 10365 }, { "entropy": 5.675375461578369, "epoch": 8.909325311559948, "grad_norm": 1.3515625, "learning_rate": 6.544484565152577e-05, "loss": 4.7937, "mean_token_accuracy": 0.2524940297007561, "num_tokens": 18853403.0, "step": 10370 }, { "entropy": 5.635634279251098, "epoch": 8.913622690159002, "grad_norm": 1.65625, "learning_rate": 6.532401443720951e-05, "loss": 4.8251, "mean_token_accuracy": 0.24945158213377, "num_tokens": 18861530.0, "step": 10375 }, { "entropy": 5.646540689468384, "epoch": 8.917920068758058, "grad_norm": 1.265625, "learning_rate": 6.520364107143898e-05, "loss": 4.8436, "mean_token_accuracy": 0.24798599183559417, "num_tokens": 18871495.0, "step": 10380 }, { "entropy": 5.6593023300170895, "epoch": 8.922217447357113, "grad_norm": 1.375, "learning_rate": 6.50837258170615e-05, "loss": 4.8269, "mean_token_accuracy": 0.24874897450208663, "num_tokens": 18881099.0, "step": 10385 }, { "entropy": 5.668875980377197, "epoch": 8.926514825956167, "grad_norm": 1.3984375, "learning_rate": 6.496426893592397e-05, "loss": 4.8391, "mean_token_accuracy": 0.2391132190823555, "num_tokens": 18890368.0, "step": 10390 }, { "entropy": 5.667122745513916, "epoch": 8.930812204555222, "grad_norm": 1.5234375, "learning_rate": 6.484527068887258e-05, "loss": 4.8371, "mean_token_accuracy": 0.24273379147052765, "num_tokens": 18898576.0, "step": 10395 }, { "entropy": 5.638441038131714, "epoch": 8.935109583154276, "grad_norm": 1.546875, "learning_rate": 6.472673133575181e-05, "loss": 4.7597, "mean_token_accuracy": 0.24820929765701294, "num_tokens": 18907276.0, "step": 10400 }, { "entropy": 5.664800548553467, "epoch": 8.939406961753331, "grad_norm": 1.5, "learning_rate": 6.460865113540437e-05, "loss": 4.7591, "mean_token_accuracy": 0.25227137207984923, "num_tokens": 18915920.0, "step": 10405 }, { "entropy": 5.680027961730957, "epoch": 8.943704340352385, "grad_norm": 1.40625, "learning_rate": 6.449103034567011e-05, "loss": 4.7592, "mean_token_accuracy": 0.2561339348554611, "num_tokens": 18925529.0, "step": 10410 }, { "entropy": 5.664671468734741, "epoch": 8.94800171895144, "grad_norm": 1.484375, "learning_rate": 6.437386922338591e-05, "loss": 4.861, "mean_token_accuracy": 0.240652696788311, "num_tokens": 18934859.0, "step": 10415 }, { "entropy": 5.666413927078247, "epoch": 8.952299097550494, "grad_norm": 1.5078125, "learning_rate": 6.425716802438479e-05, "loss": 4.8403, "mean_token_accuracy": 0.24916145354509353, "num_tokens": 18943804.0, "step": 10420 }, { "entropy": 5.626944637298584, "epoch": 8.95659647614955, "grad_norm": 1.53125, "learning_rate": 6.414092700349548e-05, "loss": 4.804, "mean_token_accuracy": 0.25158223062753676, "num_tokens": 18954290.0, "step": 10425 }, { "entropy": 5.677120351791382, "epoch": 8.960893854748603, "grad_norm": 1.453125, "learning_rate": 6.402514641454192e-05, "loss": 4.91, "mean_token_accuracy": 0.23440527617931367, "num_tokens": 18963448.0, "step": 10430 }, { "entropy": 5.629398536682129, "epoch": 8.965191233347658, "grad_norm": 1.3515625, "learning_rate": 6.390982651034274e-05, "loss": 4.8127, "mean_token_accuracy": 0.24613251239061357, "num_tokens": 18972314.0, "step": 10435 }, { "entropy": 5.608459091186523, "epoch": 8.969488611946712, "grad_norm": 1.59375, "learning_rate": 6.379496754271044e-05, "loss": 4.6478, "mean_token_accuracy": 0.2624844193458557, "num_tokens": 18980720.0, "step": 10440 }, { "entropy": 5.566482067108154, "epoch": 8.973785990545768, "grad_norm": 1.5703125, "learning_rate": 6.368056976245107e-05, "loss": 4.6832, "mean_token_accuracy": 0.27008168697357177, "num_tokens": 18988267.0, "step": 10445 }, { "entropy": 5.61891360282898, "epoch": 8.978083369144821, "grad_norm": 1.546875, "learning_rate": 6.356663341936368e-05, "loss": 4.7809, "mean_token_accuracy": 0.24728458374738693, "num_tokens": 18997036.0, "step": 10450 }, { "entropy": 5.63971700668335, "epoch": 8.982380747743877, "grad_norm": 1.515625, "learning_rate": 6.345315876223977e-05, "loss": 4.7413, "mean_token_accuracy": 0.25158697068691255, "num_tokens": 19005440.0, "step": 10455 }, { "entropy": 5.617656993865967, "epoch": 8.98667812634293, "grad_norm": 1.4765625, "learning_rate": 6.334014603886256e-05, "loss": 4.8081, "mean_token_accuracy": 0.25301000475883484, "num_tokens": 19014449.0, "step": 10460 }, { "entropy": 5.6047585010528564, "epoch": 8.990975504941986, "grad_norm": 1.5546875, "learning_rate": 6.322759549600665e-05, "loss": 4.7627, "mean_token_accuracy": 0.24849656075239182, "num_tokens": 19024061.0, "step": 10465 }, { "entropy": 5.575642251968384, "epoch": 8.99527288354104, "grad_norm": 1.4375, "learning_rate": 6.311550737943753e-05, "loss": 4.7483, "mean_token_accuracy": 0.257401143014431, "num_tokens": 19033400.0, "step": 10470 }, { "entropy": 5.665790462493897, "epoch": 8.999570262140095, "grad_norm": 1.4296875, "learning_rate": 6.300388193391075e-05, "loss": 4.8158, "mean_token_accuracy": 0.24791148900985718, "num_tokens": 19042633.0, "step": 10475 }, { "entropy": 5.64085324605306, "epoch": 9.003437902879243, "grad_norm": 1.390625, "learning_rate": 6.289271940317174e-05, "loss": 4.8769, "mean_token_accuracy": 0.2419930464691586, "num_tokens": 19051120.0, "step": 10480 }, { "entropy": 5.6130658149719235, "epoch": 9.007735281478299, "grad_norm": 1.4375, "learning_rate": 6.278202002995497e-05, "loss": 4.7878, "mean_token_accuracy": 0.25171366333961487, "num_tokens": 19060755.0, "step": 10485 }, { "entropy": 5.610973453521728, "epoch": 9.012032660077352, "grad_norm": 1.5078125, "learning_rate": 6.267178405598371e-05, "loss": 4.7215, "mean_token_accuracy": 0.2603663995862007, "num_tokens": 19069197.0, "step": 10490 }, { "entropy": 5.675764322280884, "epoch": 9.016330038676408, "grad_norm": 1.453125, "learning_rate": 6.256201172196921e-05, "loss": 4.7783, "mean_token_accuracy": 0.25133358389139177, "num_tokens": 19078452.0, "step": 10495 }, { "entropy": 5.640312767028808, "epoch": 9.020627417275461, "grad_norm": 1.390625, "learning_rate": 6.245270326761034e-05, "loss": 4.7935, "mean_token_accuracy": 0.2465323805809021, "num_tokens": 19087127.0, "step": 10500 }, { "epoch": 9.020627417275461, "eval_entropy": 5.4709870262188955, "eval_loss": 5.88071870803833, "eval_mean_token_accuracy": 0.18410248618136654, "eval_num_tokens": 19087127.0, "eval_runtime": 2.0494, "eval_samples_per_second": 1731.732, "eval_steps_per_second": 216.649, "step": 10500 }, { "entropy": 5.669859838485718, "epoch": 9.024924795874517, "grad_norm": 1.53125, "learning_rate": 6.234385893159311e-05, "loss": 4.7838, "mean_token_accuracy": 0.2515318378806114, "num_tokens": 19095610.0, "step": 10505 }, { "entropy": 5.657605171203613, "epoch": 9.02922217447357, "grad_norm": 1.5234375, "learning_rate": 6.223547895159009e-05, "loss": 4.7404, "mean_token_accuracy": 0.25628202855587007, "num_tokens": 19103649.0, "step": 10510 }, { "entropy": 5.5712086200714115, "epoch": 9.033519553072626, "grad_norm": 1.4375, "learning_rate": 6.212756356425978e-05, "loss": 4.6999, "mean_token_accuracy": 0.25828822106122973, "num_tokens": 19112426.0, "step": 10515 }, { "entropy": 5.645226001739502, "epoch": 9.03781693167168, "grad_norm": 1.4140625, "learning_rate": 6.202011300524623e-05, "loss": 4.8116, "mean_token_accuracy": 0.24831810742616653, "num_tokens": 19121307.0, "step": 10520 }, { "entropy": 5.679613542556763, "epoch": 9.042114310270735, "grad_norm": 1.3671875, "learning_rate": 6.191312750917855e-05, "loss": 4.8263, "mean_token_accuracy": 0.2535302057862282, "num_tokens": 19129728.0, "step": 10525 }, { "entropy": 5.622981262207031, "epoch": 9.046411688869789, "grad_norm": 1.4765625, "learning_rate": 6.180660730967036e-05, "loss": 4.7466, "mean_token_accuracy": 0.2522904068231583, "num_tokens": 19139931.0, "step": 10530 }, { "entropy": 5.683325624465942, "epoch": 9.050709067468844, "grad_norm": 1.4609375, "learning_rate": 6.170055263931912e-05, "loss": 4.7445, "mean_token_accuracy": 0.24987991899251938, "num_tokens": 19148711.0, "step": 10535 }, { "entropy": 5.64924807548523, "epoch": 9.055006446067898, "grad_norm": 1.515625, "learning_rate": 6.159496372970584e-05, "loss": 4.7187, "mean_token_accuracy": 0.24842094928026198, "num_tokens": 19156937.0, "step": 10540 }, { "entropy": 5.594881010055542, "epoch": 9.059303824666953, "grad_norm": 1.3125, "learning_rate": 6.148984081139454e-05, "loss": 4.7702, "mean_token_accuracy": 0.250566403567791, "num_tokens": 19166449.0, "step": 10545 }, { "entropy": 5.676750993728637, "epoch": 9.063601203266007, "grad_norm": 1.2109375, "learning_rate": 6.138518411393163e-05, "loss": 4.8824, "mean_token_accuracy": 0.23818726986646652, "num_tokens": 19175757.0, "step": 10550 }, { "entropy": 5.6487244129180905, "epoch": 9.067898581865062, "grad_norm": 1.46875, "learning_rate": 6.128099386584548e-05, "loss": 4.7894, "mean_token_accuracy": 0.25352845937013624, "num_tokens": 19184638.0, "step": 10555 }, { "entropy": 5.652242422103882, "epoch": 9.072195960464116, "grad_norm": 1.46875, "learning_rate": 6.1177270294646e-05, "loss": 4.8136, "mean_token_accuracy": 0.24836055785417557, "num_tokens": 19193791.0, "step": 10560 }, { "entropy": 5.592644214630127, "epoch": 9.076493339063171, "grad_norm": 1.46875, "learning_rate": 6.107401362682401e-05, "loss": 4.8297, "mean_token_accuracy": 0.24406633228063584, "num_tokens": 19202595.0, "step": 10565 }, { "entropy": 5.617107391357422, "epoch": 9.080790717662227, "grad_norm": 1.328125, "learning_rate": 6.097122408785076e-05, "loss": 4.7995, "mean_token_accuracy": 0.24826500862836837, "num_tokens": 19211994.0, "step": 10570 }, { "entropy": 5.664296865463257, "epoch": 9.08508809626128, "grad_norm": 1.421875, "learning_rate": 6.086890190217752e-05, "loss": 4.775, "mean_token_accuracy": 0.25444269776344297, "num_tokens": 19220588.0, "step": 10575 }, { "entropy": 5.598661231994629, "epoch": 9.089385474860336, "grad_norm": 1.546875, "learning_rate": 6.076704729323507e-05, "loss": 4.7082, "mean_token_accuracy": 0.2616791486740112, "num_tokens": 19229679.0, "step": 10580 }, { "entropy": 5.6352826118469235, "epoch": 9.09368285345939, "grad_norm": 1.4140625, "learning_rate": 6.0665660483433173e-05, "loss": 4.8391, "mean_token_accuracy": 0.24676432013511657, "num_tokens": 19239269.0, "step": 10585 }, { "entropy": 5.6875403881072994, "epoch": 9.097980232058445, "grad_norm": 1.4921875, "learning_rate": 6.05647416941601e-05, "loss": 4.8437, "mean_token_accuracy": 0.2471165433526039, "num_tokens": 19249046.0, "step": 10590 }, { "entropy": 5.616286373138427, "epoch": 9.102277610657499, "grad_norm": 1.4375, "learning_rate": 6.046429114578212e-05, "loss": 4.6969, "mean_token_accuracy": 0.25456131249666214, "num_tokens": 19257398.0, "step": 10595 }, { "entropy": 5.664317655563354, "epoch": 9.106574989256554, "grad_norm": 1.4609375, "learning_rate": 6.0364309057643084e-05, "loss": 4.8135, "mean_token_accuracy": 0.24456136524677277, "num_tokens": 19266619.0, "step": 10600 }, { "entropy": 5.697394466400146, "epoch": 9.110872367855608, "grad_norm": 1.484375, "learning_rate": 6.0264795648063904e-05, "loss": 4.8151, "mean_token_accuracy": 0.24979521930217743, "num_tokens": 19275837.0, "step": 10605 }, { "entropy": 5.682855892181396, "epoch": 9.115169746454663, "grad_norm": 1.296875, "learning_rate": 6.0165751134342155e-05, "loss": 4.7746, "mean_token_accuracy": 0.242179574072361, "num_tokens": 19284745.0, "step": 10610 }, { "entropy": 5.6265136241912845, "epoch": 9.119467125053717, "grad_norm": 1.3046875, "learning_rate": 6.006717573275138e-05, "loss": 4.7321, "mean_token_accuracy": 0.2574576482176781, "num_tokens": 19293956.0, "step": 10615 }, { "entropy": 5.661381340026855, "epoch": 9.123764503652772, "grad_norm": 1.1953125, "learning_rate": 5.996906965854093e-05, "loss": 4.8359, "mean_token_accuracy": 0.2464934766292572, "num_tokens": 19303629.0, "step": 10620 }, { "entropy": 5.669901990890503, "epoch": 9.128061882251826, "grad_norm": 1.5546875, "learning_rate": 5.987143312593522e-05, "loss": 4.8455, "mean_token_accuracy": 0.24552787095308304, "num_tokens": 19312458.0, "step": 10625 }, { "entropy": 5.583899021148682, "epoch": 9.132359260850881, "grad_norm": 1.484375, "learning_rate": 5.977426634813338e-05, "loss": 4.7378, "mean_token_accuracy": 0.2557566374540329, "num_tokens": 19321428.0, "step": 10630 }, { "entropy": 5.6959641456604, "epoch": 9.136656639449935, "grad_norm": 1.5, "learning_rate": 5.9677569537308866e-05, "loss": 4.9001, "mean_token_accuracy": 0.23603728413581848, "num_tokens": 19331176.0, "step": 10635 }, { "entropy": 5.678333759307861, "epoch": 9.14095401804899, "grad_norm": 1.4296875, "learning_rate": 5.958134290460888e-05, "loss": 4.8093, "mean_token_accuracy": 0.24847684800624847, "num_tokens": 19340401.0, "step": 10640 }, { "entropy": 5.594663047790528, "epoch": 9.145251396648044, "grad_norm": 1.21875, "learning_rate": 5.94855866601539e-05, "loss": 4.7344, "mean_token_accuracy": 0.25321966260671613, "num_tokens": 19350700.0, "step": 10645 }, { "entropy": 5.658229923248291, "epoch": 9.1495487752471, "grad_norm": 1.4609375, "learning_rate": 5.939030101303724e-05, "loss": 4.7892, "mean_token_accuracy": 0.24839598089456558, "num_tokens": 19360040.0, "step": 10650 }, { "entropy": 5.663689517974854, "epoch": 9.153846153846153, "grad_norm": 1.515625, "learning_rate": 5.929548617132472e-05, "loss": 4.7355, "mean_token_accuracy": 0.25522763580083846, "num_tokens": 19368546.0, "step": 10655 }, { "entropy": 5.598207283020019, "epoch": 9.158143532445209, "grad_norm": 1.5703125, "learning_rate": 5.920114234205407e-05, "loss": 4.7166, "mean_token_accuracy": 0.25508318692445753, "num_tokens": 19377137.0, "step": 10660 }, { "entropy": 5.693424034118652, "epoch": 9.162440911044262, "grad_norm": 1.4296875, "learning_rate": 5.910726973123451e-05, "loss": 4.8644, "mean_token_accuracy": 0.23823002427816392, "num_tokens": 19385781.0, "step": 10665 }, { "entropy": 5.642845821380615, "epoch": 9.166738289643318, "grad_norm": 1.453125, "learning_rate": 5.901386854384622e-05, "loss": 4.8062, "mean_token_accuracy": 0.25551503002643583, "num_tokens": 19395186.0, "step": 10670 }, { "entropy": 5.684504127502441, "epoch": 9.171035668242371, "grad_norm": 1.515625, "learning_rate": 5.892093898384017e-05, "loss": 4.7858, "mean_token_accuracy": 0.25086683183908465, "num_tokens": 19404140.0, "step": 10675 }, { "entropy": 5.649334859848023, "epoch": 9.175333046841427, "grad_norm": 1.484375, "learning_rate": 5.8828481254137276e-05, "loss": 4.8012, "mean_token_accuracy": 0.2486447736620903, "num_tokens": 19414481.0, "step": 10680 }, { "entropy": 5.533756256103516, "epoch": 9.17963042544048, "grad_norm": 1.6015625, "learning_rate": 5.873649555662836e-05, "loss": 4.6407, "mean_token_accuracy": 0.2578012332320213, "num_tokens": 19423259.0, "step": 10685 }, { "entropy": 5.624078559875488, "epoch": 9.183927804039536, "grad_norm": 1.4765625, "learning_rate": 5.8644982092173335e-05, "loss": 4.7787, "mean_token_accuracy": 0.2532876804471016, "num_tokens": 19432011.0, "step": 10690 }, { "entropy": 5.625823307037353, "epoch": 9.18822518263859, "grad_norm": 1.4296875, "learning_rate": 5.85539410606011e-05, "loss": 4.7681, "mean_token_accuracy": 0.2507260128855705, "num_tokens": 19441221.0, "step": 10695 }, { "entropy": 5.620725297927857, "epoch": 9.192522561237645, "grad_norm": 1.46875, "learning_rate": 5.8463372660708836e-05, "loss": 4.7071, "mean_token_accuracy": 0.25606116056442263, "num_tokens": 19450014.0, "step": 10700 }, { "entropy": 5.626107978820801, "epoch": 9.196819939836699, "grad_norm": 1.4375, "learning_rate": 5.837327709026171e-05, "loss": 4.8369, "mean_token_accuracy": 0.24323922097682954, "num_tokens": 19459480.0, "step": 10705 }, { "entropy": 5.652804756164551, "epoch": 9.201117318435754, "grad_norm": 1.3828125, "learning_rate": 5.8283654545992464e-05, "loss": 4.8259, "mean_token_accuracy": 0.25096631944179537, "num_tokens": 19469111.0, "step": 10710 }, { "entropy": 5.642743730545044, "epoch": 9.20541469703481, "grad_norm": 1.5, "learning_rate": 5.819450522360096e-05, "loss": 4.7641, "mean_token_accuracy": 0.2504918292164803, "num_tokens": 19477531.0, "step": 10715 }, { "entropy": 5.663795518875122, "epoch": 9.209712075633863, "grad_norm": 1.5546875, "learning_rate": 5.810582931775362e-05, "loss": 4.7953, "mean_token_accuracy": 0.2489721119403839, "num_tokens": 19486041.0, "step": 10720 }, { "entropy": 5.652674436569214, "epoch": 9.214009454232919, "grad_norm": 1.453125, "learning_rate": 5.801762702208317e-05, "loss": 4.7974, "mean_token_accuracy": 0.24531738311052323, "num_tokens": 19495514.0, "step": 10725 }, { "entropy": 5.5886533737182615, "epoch": 9.218306832831972, "grad_norm": 1.4765625, "learning_rate": 5.7929898529188215e-05, "loss": 4.7238, "mean_token_accuracy": 0.25462878197431565, "num_tokens": 19503879.0, "step": 10730 }, { "entropy": 5.619577884674072, "epoch": 9.222604211431028, "grad_norm": 1.265625, "learning_rate": 5.784264403063272e-05, "loss": 4.7174, "mean_token_accuracy": 0.25512780994176865, "num_tokens": 19513922.0, "step": 10735 }, { "entropy": 5.601106023788452, "epoch": 9.226901590030081, "grad_norm": 1.5625, "learning_rate": 5.775586371694561e-05, "loss": 4.7573, "mean_token_accuracy": 0.2493024155497551, "num_tokens": 19522555.0, "step": 10740 }, { "entropy": 5.605829763412475, "epoch": 9.231198968629137, "grad_norm": 1.3046875, "learning_rate": 5.7669557777620376e-05, "loss": 4.7487, "mean_token_accuracy": 0.25207943469285965, "num_tokens": 19531245.0, "step": 10745 }, { "entropy": 5.604989385604858, "epoch": 9.23549634722819, "grad_norm": 1.46875, "learning_rate": 5.7583726401114756e-05, "loss": 4.713, "mean_token_accuracy": 0.25506144762039185, "num_tokens": 19540258.0, "step": 10750 }, { "entropy": 5.64974570274353, "epoch": 9.239793725827246, "grad_norm": 1.453125, "learning_rate": 5.749836977485013e-05, "loss": 4.7854, "mean_token_accuracy": 0.24881967455148696, "num_tokens": 19550071.0, "step": 10755 }, { "entropy": 5.593701410293579, "epoch": 9.2440911044263, "grad_norm": 1.3515625, "learning_rate": 5.741348808521128e-05, "loss": 4.7131, "mean_token_accuracy": 0.2589977756142616, "num_tokens": 19558972.0, "step": 10760 }, { "entropy": 5.649643993377685, "epoch": 9.248388483025355, "grad_norm": 1.4140625, "learning_rate": 5.7329081517545846e-05, "loss": 4.8102, "mean_token_accuracy": 0.25246872156858446, "num_tokens": 19568270.0, "step": 10765 }, { "entropy": 5.668751621246338, "epoch": 9.252685861624409, "grad_norm": 1.71875, "learning_rate": 5.724515025616409e-05, "loss": 4.7713, "mean_token_accuracy": 0.2551438122987747, "num_tokens": 19576726.0, "step": 10770 }, { "entropy": 5.6279136657714846, "epoch": 9.256983240223464, "grad_norm": 1.3828125, "learning_rate": 5.716169448433832e-05, "loss": 4.7687, "mean_token_accuracy": 0.24954331517219544, "num_tokens": 19586706.0, "step": 10775 }, { "entropy": 5.627971220016479, "epoch": 9.261280618822518, "grad_norm": 1.484375, "learning_rate": 5.707871438430255e-05, "loss": 4.7989, "mean_token_accuracy": 0.2460992693901062, "num_tokens": 19595370.0, "step": 10780 }, { "entropy": 5.626669549942017, "epoch": 9.265577997421573, "grad_norm": 1.5859375, "learning_rate": 5.699621013725218e-05, "loss": 4.8064, "mean_token_accuracy": 0.25317208766937255, "num_tokens": 19604134.0, "step": 10785 }, { "entropy": 5.5840527534484865, "epoch": 9.269875376020627, "grad_norm": 1.4296875, "learning_rate": 5.691418192334352e-05, "loss": 4.7339, "mean_token_accuracy": 0.25829588919878005, "num_tokens": 19612716.0, "step": 10790 }, { "entropy": 5.624117422103882, "epoch": 9.274172754619682, "grad_norm": 1.421875, "learning_rate": 5.683262992169341e-05, "loss": 4.8399, "mean_token_accuracy": 0.24457038044929505, "num_tokens": 19621850.0, "step": 10795 }, { "entropy": 5.592050504684448, "epoch": 9.278470133218736, "grad_norm": 1.390625, "learning_rate": 5.675155431037876e-05, "loss": 4.7706, "mean_token_accuracy": 0.24704843759536743, "num_tokens": 19630706.0, "step": 10800 }, { "entropy": 5.602861404418945, "epoch": 9.282767511817791, "grad_norm": 1.4609375, "learning_rate": 5.6670955266436365e-05, "loss": 4.7443, "mean_token_accuracy": 0.2627954468131065, "num_tokens": 19639364.0, "step": 10805 }, { "entropy": 5.675701236724853, "epoch": 9.287064890416845, "grad_norm": 1.4453125, "learning_rate": 5.659083296586229e-05, "loss": 4.7119, "mean_token_accuracy": 0.254921193420887, "num_tokens": 19647816.0, "step": 10810 }, { "entropy": 5.656910228729248, "epoch": 9.2913622690159, "grad_norm": 1.3125, "learning_rate": 5.6511187583611663e-05, "loss": 4.831, "mean_token_accuracy": 0.25122607201337815, "num_tokens": 19657136.0, "step": 10815 }, { "entropy": 5.676870679855346, "epoch": 9.295659647614954, "grad_norm": 1.5546875, "learning_rate": 5.643201929359809e-05, "loss": 4.7776, "mean_token_accuracy": 0.25071688145399096, "num_tokens": 19665676.0, "step": 10820 }, { "entropy": 5.65682954788208, "epoch": 9.29995702621401, "grad_norm": 1.625, "learning_rate": 5.635332826869353e-05, "loss": 4.7466, "mean_token_accuracy": 0.2555884286761284, "num_tokens": 19674415.0, "step": 10825 }, { "entropy": 5.601188468933105, "epoch": 9.304254404813063, "grad_norm": 1.375, "learning_rate": 5.6275114680727716e-05, "loss": 4.7557, "mean_token_accuracy": 0.2584034129977226, "num_tokens": 19684411.0, "step": 10830 }, { "entropy": 5.618824768066406, "epoch": 9.308551783412119, "grad_norm": 1.578125, "learning_rate": 5.619737870048783e-05, "loss": 4.7413, "mean_token_accuracy": 0.2524507701396942, "num_tokens": 19693192.0, "step": 10835 }, { "entropy": 5.558388423919678, "epoch": 9.312849162011172, "grad_norm": 1.46875, "learning_rate": 5.612012049771823e-05, "loss": 4.6273, "mean_token_accuracy": 0.27150997519493103, "num_tokens": 19701710.0, "step": 10840 }, { "entropy": 5.683209705352783, "epoch": 9.317146540610228, "grad_norm": 1.296875, "learning_rate": 5.6043340241119924e-05, "loss": 4.8429, "mean_token_accuracy": 0.24387009292840958, "num_tokens": 19710909.0, "step": 10845 }, { "entropy": 5.645463323593139, "epoch": 9.321443919209282, "grad_norm": 1.3671875, "learning_rate": 5.596703809835033e-05, "loss": 4.7923, "mean_token_accuracy": 0.2493225336074829, "num_tokens": 19720634.0, "step": 10850 }, { "entropy": 5.633423852920532, "epoch": 9.325741297808337, "grad_norm": 1.6953125, "learning_rate": 5.589121423602277e-05, "loss": 4.6914, "mean_token_accuracy": 0.2628333792090416, "num_tokens": 19729763.0, "step": 10855 }, { "entropy": 5.702880573272705, "epoch": 9.33003867640739, "grad_norm": 1.46875, "learning_rate": 5.581586881970631e-05, "loss": 4.8575, "mean_token_accuracy": 0.24085617810487747, "num_tokens": 19739065.0, "step": 10860 }, { "entropy": 5.560692501068115, "epoch": 9.334336055006446, "grad_norm": 1.390625, "learning_rate": 5.574100201392522e-05, "loss": 4.6758, "mean_token_accuracy": 0.26179105788469315, "num_tokens": 19748519.0, "step": 10865 }, { "entropy": 5.623036813735962, "epoch": 9.338633433605501, "grad_norm": 1.40625, "learning_rate": 5.5666613982158665e-05, "loss": 4.7412, "mean_token_accuracy": 0.2588634729385376, "num_tokens": 19757573.0, "step": 10870 }, { "entropy": 5.657954597473145, "epoch": 9.342930812204555, "grad_norm": 1.4375, "learning_rate": 5.559270488684036e-05, "loss": 4.8895, "mean_token_accuracy": 0.23635072410106658, "num_tokens": 19767534.0, "step": 10875 }, { "entropy": 5.691379356384277, "epoch": 9.34722819080361, "grad_norm": 1.3203125, "learning_rate": 5.551927488935826e-05, "loss": 4.8455, "mean_token_accuracy": 0.2456997498869896, "num_tokens": 19776929.0, "step": 10880 }, { "entropy": 5.656087350845337, "epoch": 9.351525569402664, "grad_norm": 1.484375, "learning_rate": 5.5446324150054086e-05, "loss": 4.8121, "mean_token_accuracy": 0.25415152609348296, "num_tokens": 19785772.0, "step": 10885 }, { "entropy": 5.654774618148804, "epoch": 9.35582294800172, "grad_norm": 1.2265625, "learning_rate": 5.537385282822315e-05, "loss": 4.7974, "mean_token_accuracy": 0.25373937338590624, "num_tokens": 19796073.0, "step": 10890 }, { "entropy": 5.641226482391358, "epoch": 9.360120326600773, "grad_norm": 1.671875, "learning_rate": 5.53018610821138e-05, "loss": 4.7928, "mean_token_accuracy": 0.25097259134054184, "num_tokens": 19804511.0, "step": 10895 }, { "entropy": 5.687262010574341, "epoch": 9.364417705199829, "grad_norm": 1.4765625, "learning_rate": 5.523034906892728e-05, "loss": 4.8408, "mean_token_accuracy": 0.2439215749502182, "num_tokens": 19813348.0, "step": 10900 }, { "entropy": 5.699068069458008, "epoch": 9.368715083798882, "grad_norm": 1.4140625, "learning_rate": 5.515931694481722e-05, "loss": 4.8403, "mean_token_accuracy": 0.2473944455385208, "num_tokens": 19822390.0, "step": 10905 }, { "entropy": 5.716992425918579, "epoch": 9.373012462397938, "grad_norm": 1.421875, "learning_rate": 5.508876486488936e-05, "loss": 4.8561, "mean_token_accuracy": 0.24376602172851564, "num_tokens": 19832423.0, "step": 10910 }, { "entropy": 5.643907117843628, "epoch": 9.377309840996991, "grad_norm": 1.515625, "learning_rate": 5.501869298320128e-05, "loss": 4.7983, "mean_token_accuracy": 0.24763839840888976, "num_tokens": 19841271.0, "step": 10915 }, { "entropy": 5.6545130729675295, "epoch": 9.381607219596047, "grad_norm": 1.546875, "learning_rate": 5.4949101452761995e-05, "loss": 4.7958, "mean_token_accuracy": 0.2506653919816017, "num_tokens": 19850648.0, "step": 10920 }, { "entropy": 5.657913112640381, "epoch": 9.3859045981951, "grad_norm": 1.3203125, "learning_rate": 5.4879990425531534e-05, "loss": 4.7589, "mean_token_accuracy": 0.2580998450517654, "num_tokens": 19860102.0, "step": 10925 }, { "entropy": 5.674734878540039, "epoch": 9.390201976794156, "grad_norm": 1.3125, "learning_rate": 5.4811360052420754e-05, "loss": 4.8262, "mean_token_accuracy": 0.24551165699958802, "num_tokens": 19870008.0, "step": 10930 }, { "entropy": 5.629330062866211, "epoch": 9.39449935539321, "grad_norm": 1.234375, "learning_rate": 5.4743210483290974e-05, "loss": 4.7496, "mean_token_accuracy": 0.25246907472610475, "num_tokens": 19879452.0, "step": 10935 }, { "entropy": 5.713487863540649, "epoch": 9.398796733992265, "grad_norm": 1.3671875, "learning_rate": 5.467554186695364e-05, "loss": 4.8508, "mean_token_accuracy": 0.2398130863904953, "num_tokens": 19888956.0, "step": 10940 }, { "entropy": 5.65391993522644, "epoch": 9.403094112591319, "grad_norm": 1.375, "learning_rate": 5.4608354351169944e-05, "loss": 4.7862, "mean_token_accuracy": 0.24949571192264558, "num_tokens": 19897760.0, "step": 10945 }, { "entropy": 5.5948474407196045, "epoch": 9.407391491190374, "grad_norm": 1.5, "learning_rate": 5.454164808265057e-05, "loss": 4.7756, "mean_token_accuracy": 0.25042637884616853, "num_tokens": 19905692.0, "step": 10950 }, { "entropy": 5.596786975860596, "epoch": 9.411688869789428, "grad_norm": 1.453125, "learning_rate": 5.447542320705532e-05, "loss": 4.6944, "mean_token_accuracy": 0.26209963411092757, "num_tokens": 19914633.0, "step": 10955 }, { "entropy": 5.683409118652344, "epoch": 9.415986248388483, "grad_norm": 1.4921875, "learning_rate": 5.440967986899289e-05, "loss": 4.8065, "mean_token_accuracy": 0.24011071175336837, "num_tokens": 19922749.0, "step": 10960 }, { "entropy": 5.660010576248169, "epoch": 9.420283626987537, "grad_norm": 1.4453125, "learning_rate": 5.434441821202042e-05, "loss": 4.7322, "mean_token_accuracy": 0.25771835893392564, "num_tokens": 19931274.0, "step": 10965 }, { "entropy": 5.6490149974823, "epoch": 9.424581005586592, "grad_norm": 1.4765625, "learning_rate": 5.42796383786433e-05, "loss": 4.7765, "mean_token_accuracy": 0.24860167354345322, "num_tokens": 19940052.0, "step": 10970 }, { "entropy": 5.646113681793213, "epoch": 9.428878384185646, "grad_norm": 1.3359375, "learning_rate": 5.4215340510314805e-05, "loss": 4.7791, "mean_token_accuracy": 0.247910475730896, "num_tokens": 19948890.0, "step": 10975 }, { "entropy": 5.649462270736694, "epoch": 9.433175762784701, "grad_norm": 1.328125, "learning_rate": 5.41515247474358e-05, "loss": 4.8252, "mean_token_accuracy": 0.2510902941226959, "num_tokens": 19958158.0, "step": 10980 }, { "entropy": 5.700368213653564, "epoch": 9.437473141383755, "grad_norm": 1.3203125, "learning_rate": 5.4088191229354306e-05, "loss": 4.8708, "mean_token_accuracy": 0.2423779547214508, "num_tokens": 19967432.0, "step": 10985 }, { "entropy": 5.615607881546021, "epoch": 9.44177051998281, "grad_norm": 1.4140625, "learning_rate": 5.402534009436552e-05, "loss": 4.7574, "mean_token_accuracy": 0.2512508645653725, "num_tokens": 19977661.0, "step": 10990 }, { "entropy": 5.6874782085418705, "epoch": 9.446067898581864, "grad_norm": 1.4609375, "learning_rate": 5.396297147971116e-05, "loss": 4.8146, "mean_token_accuracy": 0.24550059139728547, "num_tokens": 19986359.0, "step": 10995 }, { "entropy": 5.6440185546875, "epoch": 9.45036527718092, "grad_norm": 1.3984375, "learning_rate": 5.390108552157935e-05, "loss": 4.734, "mean_token_accuracy": 0.25735636949539187, "num_tokens": 19994618.0, "step": 11000 }, { "epoch": 9.45036527718092, "eval_entropy": 5.4661739087319585, "eval_loss": 5.882833480834961, "eval_mean_token_accuracy": 0.18411659976249342, "eval_num_tokens": 19994618.0, "eval_runtime": 2.0463, "eval_samples_per_second": 1734.317, "eval_steps_per_second": 216.973, "step": 11000 }, { "entropy": 5.659858560562133, "epoch": 9.454662655779973, "grad_norm": 1.3984375, "learning_rate": 5.383968235510427e-05, "loss": 4.8322, "mean_token_accuracy": 0.2516154408454895, "num_tokens": 20004050.0, "step": 11005 }, { "entropy": 5.622915983200073, "epoch": 9.458960034379029, "grad_norm": 1.4296875, "learning_rate": 5.377876211436592e-05, "loss": 4.7684, "mean_token_accuracy": 0.2524562880396843, "num_tokens": 20013480.0, "step": 11010 }, { "entropy": 5.733402347564697, "epoch": 9.463257412978084, "grad_norm": 1.59375, "learning_rate": 5.371832493238973e-05, "loss": 4.8484, "mean_token_accuracy": 0.24426155537366867, "num_tokens": 20022133.0, "step": 11015 }, { "entropy": 5.587588977813721, "epoch": 9.467554791577138, "grad_norm": 1.4375, "learning_rate": 5.365837094114639e-05, "loss": 4.737, "mean_token_accuracy": 0.255278055369854, "num_tokens": 20031462.0, "step": 11020 }, { "entropy": 5.685597229003906, "epoch": 9.471852170176193, "grad_norm": 1.5, "learning_rate": 5.3598900271551396e-05, "loss": 4.8577, "mean_token_accuracy": 0.24156038761138915, "num_tokens": 20042055.0, "step": 11025 }, { "entropy": 5.629064464569092, "epoch": 9.476149548775247, "grad_norm": 1.2578125, "learning_rate": 5.353991305346499e-05, "loss": 4.7797, "mean_token_accuracy": 0.24861287474632263, "num_tokens": 20051379.0, "step": 11030 }, { "entropy": 5.596753263473511, "epoch": 9.480446927374302, "grad_norm": 1.4609375, "learning_rate": 5.348140941569165e-05, "loss": 4.7122, "mean_token_accuracy": 0.26016455739736555, "num_tokens": 20060289.0, "step": 11035 }, { "entropy": 5.602110958099365, "epoch": 9.484744305973356, "grad_norm": 1.40625, "learning_rate": 5.342338948597989e-05, "loss": 4.7386, "mean_token_accuracy": 0.25520058423280717, "num_tokens": 20070133.0, "step": 11040 }, { "entropy": 5.652761363983155, "epoch": 9.489041684572411, "grad_norm": 1.4609375, "learning_rate": 5.336585339102209e-05, "loss": 4.7796, "mean_token_accuracy": 0.25094655752182005, "num_tokens": 20078384.0, "step": 11045 }, { "entropy": 5.54692063331604, "epoch": 9.493339063171465, "grad_norm": 1.453125, "learning_rate": 5.33088012564541e-05, "loss": 4.7064, "mean_token_accuracy": 0.2593804582953453, "num_tokens": 20087116.0, "step": 11050 }, { "entropy": 5.525764131546021, "epoch": 9.49763644177052, "grad_norm": 1.4375, "learning_rate": 5.3252233206854955e-05, "loss": 4.6844, "mean_token_accuracy": 0.2625924780964851, "num_tokens": 20096918.0, "step": 11055 }, { "entropy": 5.657556390762329, "epoch": 9.501933820369574, "grad_norm": 1.421875, "learning_rate": 5.3196149365746656e-05, "loss": 4.8704, "mean_token_accuracy": 0.2475408360362053, "num_tokens": 20107602.0, "step": 11060 }, { "entropy": 5.579683017730713, "epoch": 9.50623119896863, "grad_norm": 1.3046875, "learning_rate": 5.31405498555939e-05, "loss": 4.7345, "mean_token_accuracy": 0.25721548944711686, "num_tokens": 20117732.0, "step": 11065 }, { "entropy": 5.711687660217285, "epoch": 9.510528577567683, "grad_norm": 1.328125, "learning_rate": 5.308543479780384e-05, "loss": 4.8706, "mean_token_accuracy": 0.23259128481149674, "num_tokens": 20127765.0, "step": 11070 }, { "entropy": 5.6711039543151855, "epoch": 9.514825956166739, "grad_norm": 1.4453125, "learning_rate": 5.303080431272567e-05, "loss": 4.8563, "mean_token_accuracy": 0.24667593389749526, "num_tokens": 20137135.0, "step": 11075 }, { "entropy": 5.610913515090942, "epoch": 9.519123334765792, "grad_norm": 1.5390625, "learning_rate": 5.297665851965055e-05, "loss": 4.7876, "mean_token_accuracy": 0.24854191541671752, "num_tokens": 20146156.0, "step": 11080 }, { "entropy": 5.655600214004517, "epoch": 9.523420713364848, "grad_norm": 1.6171875, "learning_rate": 5.292299753681129e-05, "loss": 4.7903, "mean_token_accuracy": 0.25051039457321167, "num_tokens": 20154433.0, "step": 11085 }, { "entropy": 5.645228624343872, "epoch": 9.527718091963902, "grad_norm": 1.3828125, "learning_rate": 5.286982148138196e-05, "loss": 4.7636, "mean_token_accuracy": 0.253564678132534, "num_tokens": 20164209.0, "step": 11090 }, { "entropy": 5.649822092056274, "epoch": 9.532015470562957, "grad_norm": 1.4375, "learning_rate": 5.281713046947787e-05, "loss": 4.7729, "mean_token_accuracy": 0.24183453619480133, "num_tokens": 20173116.0, "step": 11095 }, { "entropy": 5.709951972961425, "epoch": 9.53631284916201, "grad_norm": 1.5390625, "learning_rate": 5.2764924616155116e-05, "loss": 4.8596, "mean_token_accuracy": 0.24584992378950118, "num_tokens": 20182933.0, "step": 11100 }, { "entropy": 5.630789375305175, "epoch": 9.540610227761066, "grad_norm": 1.6796875, "learning_rate": 5.271320403541038e-05, "loss": 4.74, "mean_token_accuracy": 0.25479953438043595, "num_tokens": 20191899.0, "step": 11105 }, { "entropy": 5.650143051147461, "epoch": 9.54490760636012, "grad_norm": 1.4453125, "learning_rate": 5.266196884018081e-05, "loss": 4.7935, "mean_token_accuracy": 0.25023754984140395, "num_tokens": 20201108.0, "step": 11110 }, { "entropy": 5.612443971633911, "epoch": 9.549204984959175, "grad_norm": 1.4609375, "learning_rate": 5.2611219142343494e-05, "loss": 4.7849, "mean_token_accuracy": 0.2484399139881134, "num_tokens": 20209137.0, "step": 11115 }, { "entropy": 5.575851345062256, "epoch": 9.553502363558229, "grad_norm": 1.3515625, "learning_rate": 5.2560955052715574e-05, "loss": 4.684, "mean_token_accuracy": 0.25690483301877975, "num_tokens": 20217879.0, "step": 11120 }, { "entropy": 5.636340522766114, "epoch": 9.557799742157284, "grad_norm": 1.4609375, "learning_rate": 5.2511176681053704e-05, "loss": 4.7703, "mean_token_accuracy": 0.25021136105060576, "num_tokens": 20226783.0, "step": 11125 }, { "entropy": 5.640555143356323, "epoch": 9.562097120756338, "grad_norm": 1.4375, "learning_rate": 5.246188413605393e-05, "loss": 4.7858, "mean_token_accuracy": 0.2479167178273201, "num_tokens": 20235008.0, "step": 11130 }, { "entropy": 5.624492883682251, "epoch": 9.566394499355393, "grad_norm": 1.4765625, "learning_rate": 5.241307752535149e-05, "loss": 4.6869, "mean_token_accuracy": 0.2625169694423676, "num_tokens": 20243157.0, "step": 11135 }, { "entropy": 5.616971254348755, "epoch": 9.570691877954447, "grad_norm": 1.375, "learning_rate": 5.236475695552052e-05, "loss": 4.7482, "mean_token_accuracy": 0.25295512080192567, "num_tokens": 20251548.0, "step": 11140 }, { "entropy": 5.573141384124756, "epoch": 9.574989256553502, "grad_norm": 1.546875, "learning_rate": 5.2316922532073796e-05, "loss": 4.659, "mean_token_accuracy": 0.26318345218896866, "num_tokens": 20260954.0, "step": 11145 }, { "entropy": 5.707438850402832, "epoch": 9.579286635152556, "grad_norm": 1.4765625, "learning_rate": 5.226957435946265e-05, "loss": 4.8741, "mean_token_accuracy": 0.23904298692941667, "num_tokens": 20269976.0, "step": 11150 }, { "entropy": 5.651593017578125, "epoch": 9.583584013751612, "grad_norm": 1.4375, "learning_rate": 5.2222712541076464e-05, "loss": 4.8362, "mean_token_accuracy": 0.2450185090303421, "num_tokens": 20279824.0, "step": 11155 }, { "entropy": 5.653690195083618, "epoch": 9.587881392350667, "grad_norm": 1.4609375, "learning_rate": 5.217633717924282e-05, "loss": 4.7696, "mean_token_accuracy": 0.26532009840011594, "num_tokens": 20289284.0, "step": 11160 }, { "entropy": 5.65428581237793, "epoch": 9.59217877094972, "grad_norm": 1.2734375, "learning_rate": 5.213044837522689e-05, "loss": 4.721, "mean_token_accuracy": 0.262369267642498, "num_tokens": 20297905.0, "step": 11165 }, { "entropy": 5.611086177825928, "epoch": 9.596476149548776, "grad_norm": 1.4609375, "learning_rate": 5.208504622923154e-05, "loss": 4.7362, "mean_token_accuracy": 0.2587092310190201, "num_tokens": 20307219.0, "step": 11170 }, { "entropy": 5.623517179489136, "epoch": 9.60077352814783, "grad_norm": 1.4921875, "learning_rate": 5.204013084039687e-05, "loss": 4.7823, "mean_token_accuracy": 0.2503692626953125, "num_tokens": 20316369.0, "step": 11175 }, { "entropy": 5.622481441497802, "epoch": 9.605070906746885, "grad_norm": 1.5390625, "learning_rate": 5.199570230680017e-05, "loss": 4.7574, "mean_token_accuracy": 0.2528963088989258, "num_tokens": 20325764.0, "step": 11180 }, { "entropy": 5.697538805007935, "epoch": 9.609368285345939, "grad_norm": 1.421875, "learning_rate": 5.19517607254556e-05, "loss": 4.8389, "mean_token_accuracy": 0.24330864548683168, "num_tokens": 20334671.0, "step": 11185 }, { "entropy": 5.629094791412354, "epoch": 9.613665663944994, "grad_norm": 1.46875, "learning_rate": 5.190830619231397e-05, "loss": 4.8041, "mean_token_accuracy": 0.24563638269901275, "num_tokens": 20343585.0, "step": 11190 }, { "entropy": 5.619398355484009, "epoch": 9.617963042544048, "grad_norm": 1.453125, "learning_rate": 5.186533880226263e-05, "loss": 4.7431, "mean_token_accuracy": 0.2571689531207085, "num_tokens": 20352569.0, "step": 11195 }, { "entropy": 5.643133020401001, "epoch": 9.622260421143103, "grad_norm": 1.5546875, "learning_rate": 5.1822858649125197e-05, "loss": 4.7987, "mean_token_accuracy": 0.24488139748573304, "num_tokens": 20361479.0, "step": 11200 }, { "entropy": 5.702621603012085, "epoch": 9.626557799742157, "grad_norm": 1.546875, "learning_rate": 5.178086582566134e-05, "loss": 4.8355, "mean_token_accuracy": 0.24582064300775527, "num_tokens": 20370286.0, "step": 11205 }, { "entropy": 5.618520736694336, "epoch": 9.630855178341212, "grad_norm": 1.4453125, "learning_rate": 5.1739360423566596e-05, "loss": 4.8148, "mean_token_accuracy": 0.24465181976556777, "num_tokens": 20379253.0, "step": 11210 }, { "entropy": 5.622318935394287, "epoch": 9.635152556940266, "grad_norm": 1.453125, "learning_rate": 5.16983425334722e-05, "loss": 4.7522, "mean_token_accuracy": 0.24978899955749512, "num_tokens": 20387796.0, "step": 11215 }, { "entropy": 5.629397249221801, "epoch": 9.639449935539322, "grad_norm": 1.453125, "learning_rate": 5.1657812244944796e-05, "loss": 4.7111, "mean_token_accuracy": 0.2518982455134392, "num_tokens": 20396209.0, "step": 11220 }, { "entropy": 5.693726253509522, "epoch": 9.643747314138375, "grad_norm": 1.4765625, "learning_rate": 5.1617769646486344e-05, "loss": 4.8492, "mean_token_accuracy": 0.24990911781787872, "num_tokens": 20405102.0, "step": 11225 }, { "entropy": 5.692820119857788, "epoch": 9.64804469273743, "grad_norm": 1.3046875, "learning_rate": 5.157821482553389e-05, "loss": 4.9092, "mean_token_accuracy": 0.2370424747467041, "num_tokens": 20415126.0, "step": 11230 }, { "entropy": 5.661749172210693, "epoch": 9.652342071336484, "grad_norm": 1.4921875, "learning_rate": 5.153914786845932e-05, "loss": 4.818, "mean_token_accuracy": 0.2480069264769554, "num_tokens": 20424166.0, "step": 11235 }, { "entropy": 5.575618267059326, "epoch": 9.65663944993554, "grad_norm": 1.6875, "learning_rate": 5.1500568860569285e-05, "loss": 4.6445, "mean_token_accuracy": 0.26594655215740204, "num_tokens": 20431984.0, "step": 11240 }, { "entropy": 5.574884748458862, "epoch": 9.660936828534593, "grad_norm": 1.53125, "learning_rate": 5.1462477886104904e-05, "loss": 4.7064, "mean_token_accuracy": 0.2591560423374176, "num_tokens": 20440923.0, "step": 11245 }, { "entropy": 5.624132966995239, "epoch": 9.665234207133649, "grad_norm": 1.5859375, "learning_rate": 5.1424875028241625e-05, "loss": 4.7367, "mean_token_accuracy": 0.24782251417636872, "num_tokens": 20449525.0, "step": 11250 }, { "entropy": 5.660547685623169, "epoch": 9.669531585732702, "grad_norm": 1.3046875, "learning_rate": 5.138776036908911e-05, "loss": 4.8112, "mean_token_accuracy": 0.25246207118034364, "num_tokens": 20459193.0, "step": 11255 }, { "entropy": 5.639756917953491, "epoch": 9.673828964331758, "grad_norm": 1.6328125, "learning_rate": 5.135113398969091e-05, "loss": 4.7594, "mean_token_accuracy": 0.24905567318201066, "num_tokens": 20467564.0, "step": 11260 }, { "entropy": 5.703488731384278, "epoch": 9.678126342930812, "grad_norm": 1.421875, "learning_rate": 5.131499597002437e-05, "loss": 4.8746, "mean_token_accuracy": 0.2473714679479599, "num_tokens": 20476573.0, "step": 11265 }, { "entropy": 5.6513889789581295, "epoch": 9.682423721529867, "grad_norm": 1.4296875, "learning_rate": 5.12793463890005e-05, "loss": 4.8113, "mean_token_accuracy": 0.2503967732191086, "num_tokens": 20486788.0, "step": 11270 }, { "entropy": 5.6483711242675785, "epoch": 9.68672110012892, "grad_norm": 1.3046875, "learning_rate": 5.124418532446376e-05, "loss": 4.8455, "mean_token_accuracy": 0.23826995640993118, "num_tokens": 20495982.0, "step": 11275 }, { "entropy": 5.685800075531006, "epoch": 9.691018478727976, "grad_norm": 1.390625, "learning_rate": 5.120951285319187e-05, "loss": 4.8508, "mean_token_accuracy": 0.24039490073919295, "num_tokens": 20504678.0, "step": 11280 }, { "entropy": 5.6086170196533205, "epoch": 9.69531585732703, "grad_norm": 1.53125, "learning_rate": 5.1175329050895584e-05, "loss": 4.6919, "mean_token_accuracy": 0.2570186793804169, "num_tokens": 20513362.0, "step": 11285 }, { "entropy": 5.623786163330078, "epoch": 9.699613235926085, "grad_norm": 1.4296875, "learning_rate": 5.114163399221871e-05, "loss": 4.8276, "mean_token_accuracy": 0.24475472122430803, "num_tokens": 20522280.0, "step": 11290 }, { "entropy": 5.6047193050384525, "epoch": 9.703910614525139, "grad_norm": 1.5546875, "learning_rate": 5.110842775073778e-05, "loss": 4.7305, "mean_token_accuracy": 0.2570575699210167, "num_tokens": 20531175.0, "step": 11295 }, { "entropy": 5.611165571212768, "epoch": 9.708207993124194, "grad_norm": 1.375, "learning_rate": 5.107571039896196e-05, "loss": 4.8011, "mean_token_accuracy": 0.24260985106229782, "num_tokens": 20540592.0, "step": 11300 }, { "entropy": 5.717087841033935, "epoch": 9.71250537172325, "grad_norm": 1.5390625, "learning_rate": 5.1043482008332864e-05, "loss": 4.8867, "mean_token_accuracy": 0.2418668359518051, "num_tokens": 20549963.0, "step": 11305 }, { "entropy": 5.621548080444336, "epoch": 9.716802750322303, "grad_norm": 1.34375, "learning_rate": 5.1011742649224394e-05, "loss": 4.8006, "mean_token_accuracy": 0.25661737024784087, "num_tokens": 20558928.0, "step": 11310 }, { "entropy": 5.607663822174072, "epoch": 9.721100128921359, "grad_norm": 1.625, "learning_rate": 5.098049239094267e-05, "loss": 4.682, "mean_token_accuracy": 0.25437158197164533, "num_tokens": 20566483.0, "step": 11315 }, { "entropy": 5.666921663284302, "epoch": 9.725397507520412, "grad_norm": 1.4765625, "learning_rate": 5.094973130172573e-05, "loss": 4.7839, "mean_token_accuracy": 0.24927501380443573, "num_tokens": 20575349.0, "step": 11320 }, { "entropy": 5.682584810256958, "epoch": 9.729694886119468, "grad_norm": 1.5234375, "learning_rate": 5.09194594487435e-05, "loss": 4.8006, "mean_token_accuracy": 0.2473174586892128, "num_tokens": 20584235.0, "step": 11325 }, { "entropy": 5.583393573760986, "epoch": 9.733992264718522, "grad_norm": 1.5546875, "learning_rate": 5.088967689809763e-05, "loss": 4.7248, "mean_token_accuracy": 0.25419015288352964, "num_tokens": 20593705.0, "step": 11330 }, { "entropy": 5.6596780776977536, "epoch": 9.738289643317577, "grad_norm": 1.234375, "learning_rate": 5.086038371482128e-05, "loss": 4.825, "mean_token_accuracy": 0.25288843363523483, "num_tokens": 20603886.0, "step": 11335 }, { "entropy": 5.491651773452759, "epoch": 9.74258702191663, "grad_norm": 1.390625, "learning_rate": 5.0831579962879074e-05, "loss": 4.6572, "mean_token_accuracy": 0.26912386566400526, "num_tokens": 20613181.0, "step": 11340 }, { "entropy": 5.649355173110962, "epoch": 9.746884400515686, "grad_norm": 1.5, "learning_rate": 5.080326570516686e-05, "loss": 4.7669, "mean_token_accuracy": 0.2511239215731621, "num_tokens": 20620938.0, "step": 11345 }, { "entropy": 5.694200468063355, "epoch": 9.75118177911474, "grad_norm": 1.359375, "learning_rate": 5.077544100351172e-05, "loss": 4.8779, "mean_token_accuracy": 0.23636120855808257, "num_tokens": 20630992.0, "step": 11350 }, { "entropy": 5.713483667373657, "epoch": 9.755479157713795, "grad_norm": 1.53125, "learning_rate": 5.0748105918671616e-05, "loss": 4.8927, "mean_token_accuracy": 0.2489745020866394, "num_tokens": 20640543.0, "step": 11355 }, { "entropy": 5.648331212997436, "epoch": 9.759776536312849, "grad_norm": 1.3984375, "learning_rate": 5.072126051033551e-05, "loss": 4.7998, "mean_token_accuracy": 0.24586018174886703, "num_tokens": 20649814.0, "step": 11360 }, { "entropy": 5.663990640640259, "epoch": 9.764073914911904, "grad_norm": 1.4296875, "learning_rate": 5.069490483712298e-05, "loss": 4.8085, "mean_token_accuracy": 0.2565313339233398, "num_tokens": 20658215.0, "step": 11365 }, { "entropy": 5.66822190284729, "epoch": 9.768371293510958, "grad_norm": 1.3828125, "learning_rate": 5.066903895658433e-05, "loss": 4.7866, "mean_token_accuracy": 0.2542292311787605, "num_tokens": 20667412.0, "step": 11370 }, { "entropy": 5.650898599624634, "epoch": 9.772668672110013, "grad_norm": 1.5078125, "learning_rate": 5.064366292520028e-05, "loss": 4.8, "mean_token_accuracy": 0.24360747188329696, "num_tokens": 20676366.0, "step": 11375 }, { "entropy": 5.709493207931518, "epoch": 9.776966050709067, "grad_norm": 1.2890625, "learning_rate": 5.061877679838192e-05, "loss": 4.9026, "mean_token_accuracy": 0.23420629501342774, "num_tokens": 20686872.0, "step": 11380 }, { "entropy": 5.6564898014068605, "epoch": 9.781263429308122, "grad_norm": 1.515625, "learning_rate": 5.059438063047066e-05, "loss": 4.7342, "mean_token_accuracy": 0.2545543238520622, "num_tokens": 20695162.0, "step": 11385 }, { "entropy": 5.647177982330322, "epoch": 9.785560807907176, "grad_norm": 1.359375, "learning_rate": 5.057047447473796e-05, "loss": 4.7741, "mean_token_accuracy": 0.24988951534032822, "num_tokens": 20703897.0, "step": 11390 }, { "entropy": 5.573124599456787, "epoch": 9.789858186506232, "grad_norm": 1.390625, "learning_rate": 5.054705838338529e-05, "loss": 4.7305, "mean_token_accuracy": 0.25603497624397276, "num_tokens": 20712115.0, "step": 11395 }, { "entropy": 5.58337926864624, "epoch": 9.794155565105285, "grad_norm": 1.3984375, "learning_rate": 5.052413240754404e-05, "loss": 4.733, "mean_token_accuracy": 0.25425832718610764, "num_tokens": 20721073.0, "step": 11400 }, { "entropy": 5.646140146255493, "epoch": 9.79845294370434, "grad_norm": 1.2890625, "learning_rate": 5.0501696597275376e-05, "loss": 4.7743, "mean_token_accuracy": 0.24980906248092652, "num_tokens": 20731370.0, "step": 11405 }, { "entropy": 5.649660873413086, "epoch": 9.802750322303394, "grad_norm": 1.4765625, "learning_rate": 5.047975100157018e-05, "loss": 4.8259, "mean_token_accuracy": 0.24879364669322968, "num_tokens": 20740744.0, "step": 11410 }, { "entropy": 5.630423450469971, "epoch": 9.80704770090245, "grad_norm": 1.375, "learning_rate": 5.045829566834879e-05, "loss": 4.7828, "mean_token_accuracy": 0.2567125052213669, "num_tokens": 20749838.0, "step": 11415 }, { "entropy": 5.642637586593628, "epoch": 9.811345079501503, "grad_norm": 1.3046875, "learning_rate": 5.043733064446113e-05, "loss": 4.7148, "mean_token_accuracy": 0.26176913231611254, "num_tokens": 20759689.0, "step": 11420 }, { "entropy": 5.637742042541504, "epoch": 9.815642458100559, "grad_norm": 1.546875, "learning_rate": 5.041685597568641e-05, "loss": 4.7516, "mean_token_accuracy": 0.2548555374145508, "num_tokens": 20769234.0, "step": 11425 }, { "entropy": 5.6450474739074705, "epoch": 9.819939836699612, "grad_norm": 1.390625, "learning_rate": 5.039687170673315e-05, "loss": 4.8278, "mean_token_accuracy": 0.25093297064304354, "num_tokens": 20778794.0, "step": 11430 }, { "entropy": 5.656610298156738, "epoch": 9.824237215298668, "grad_norm": 1.4296875, "learning_rate": 5.037737788123895e-05, "loss": 4.7697, "mean_token_accuracy": 0.24846816807985306, "num_tokens": 20788327.0, "step": 11435 }, { "entropy": 5.673328161239624, "epoch": 9.828534593897722, "grad_norm": 1.375, "learning_rate": 5.03583745417706e-05, "loss": 4.8091, "mean_token_accuracy": 0.2541452869772911, "num_tokens": 20797161.0, "step": 11440 }, { "entropy": 5.717051362991333, "epoch": 9.832831972496777, "grad_norm": 1.375, "learning_rate": 5.033986172982375e-05, "loss": 4.8706, "mean_token_accuracy": 0.24515803754329682, "num_tokens": 20806264.0, "step": 11445 }, { "entropy": 5.6528173923492435, "epoch": 9.837129351095832, "grad_norm": 1.4375, "learning_rate": 5.0321839485823014e-05, "loss": 4.829, "mean_token_accuracy": 0.2445735216140747, "num_tokens": 20815569.0, "step": 11450 }, { "entropy": 5.713804626464844, "epoch": 9.841426729694886, "grad_norm": 1.46875, "learning_rate": 5.030430784912177e-05, "loss": 4.8996, "mean_token_accuracy": 0.23994259387254716, "num_tokens": 20825234.0, "step": 11455 }, { "entropy": 5.5915292263031, "epoch": 9.845724108293942, "grad_norm": 1.4609375, "learning_rate": 5.0287266858002054e-05, "loss": 4.7611, "mean_token_accuracy": 0.2517869219183922, "num_tokens": 20834601.0, "step": 11460 }, { "entropy": 5.666462707519531, "epoch": 9.850021486892995, "grad_norm": 1.265625, "learning_rate": 5.027071654967465e-05, "loss": 4.7808, "mean_token_accuracy": 0.25927551090717316, "num_tokens": 20843686.0, "step": 11465 }, { "entropy": 5.651755857467651, "epoch": 9.85431886549205, "grad_norm": 1.4453125, "learning_rate": 5.025465696027875e-05, "loss": 4.7879, "mean_token_accuracy": 0.25643584728240965, "num_tokens": 20852009.0, "step": 11470 }, { "entropy": 5.645906925201416, "epoch": 9.858616244091104, "grad_norm": 1.4296875, "learning_rate": 5.023908812488211e-05, "loss": 4.7539, "mean_token_accuracy": 0.25380629152059553, "num_tokens": 20861241.0, "step": 11475 }, { "entropy": 5.654606008529663, "epoch": 9.86291362269016, "grad_norm": 1.3046875, "learning_rate": 5.022401007748087e-05, "loss": 4.8146, "mean_token_accuracy": 0.24114521145820617, "num_tokens": 20869774.0, "step": 11480 }, { "entropy": 5.640413856506347, "epoch": 9.867211001289213, "grad_norm": 1.4765625, "learning_rate": 5.0209422850999414e-05, "loss": 4.8155, "mean_token_accuracy": 0.24895236790180206, "num_tokens": 20878844.0, "step": 11485 }, { "entropy": 5.6264293670654295, "epoch": 9.871508379888269, "grad_norm": 1.3671875, "learning_rate": 5.019532647729046e-05, "loss": 4.8131, "mean_token_accuracy": 0.24811711311340331, "num_tokens": 20888054.0, "step": 11490 }, { "entropy": 5.655476331710815, "epoch": 9.875805758487322, "grad_norm": 1.234375, "learning_rate": 5.0181720987134815e-05, "loss": 4.7865, "mean_token_accuracy": 0.24855260103940963, "num_tokens": 20897752.0, "step": 11495 }, { "entropy": 5.549111795425415, "epoch": 9.880103137086378, "grad_norm": 1.5625, "learning_rate": 5.016860641024143e-05, "loss": 4.6743, "mean_token_accuracy": 0.26256232261657714, "num_tokens": 20906186.0, "step": 11500 }, { "epoch": 9.880103137086378, "eval_entropy": 5.467424088233226, "eval_loss": 5.883642196655273, "eval_mean_token_accuracy": 0.18433019540666998, "eval_num_tokens": 20906186.0, "eval_runtime": 2.0592, "eval_samples_per_second": 1723.464, "eval_steps_per_second": 215.615, "step": 11500 } ], "logging_steps": 5, "max_steps": 11630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4703050292981760.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }