{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.4366136656639448, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 7.6312949657440186, "epoch": 0.004297378599054577, "grad_norm": 0.94921875, "learning_rate": 2e-06, "loss": 7.384, "mean_token_accuracy": 0.09047168418765068, "num_tokens": 10107.0, "step": 5 }, { "entropy": 7.674387979507446, "epoch": 0.008594757198109154, "grad_norm": 1.1484375, "learning_rate": 4.5e-06, "loss": 7.3814, "mean_token_accuracy": 0.09915048182010651, "num_tokens": 18391.0, "step": 10 }, { "entropy": 7.658490705490112, "epoch": 0.01289213579716373, "grad_norm": 1.015625, "learning_rate": 7e-06, "loss": 7.4194, "mean_token_accuracy": 0.09372682273387908, "num_tokens": 27061.0, "step": 15 }, { "entropy": 7.6485553741455075, "epoch": 0.017189514396218308, "grad_norm": 1.09375, "learning_rate": 9.5e-06, "loss": 7.4387, "mean_token_accuracy": 0.09950413554906845, "num_tokens": 36339.0, "step": 20 }, { "entropy": 7.655299663543701, "epoch": 0.021486892995272882, "grad_norm": 0.95703125, "learning_rate": 1.2e-05, "loss": 7.4336, "mean_token_accuracy": 0.09199422970414162, "num_tokens": 45770.0, "step": 25 }, { "entropy": 7.707321071624756, "epoch": 0.02578427159432746, "grad_norm": 0.96875, "learning_rate": 1.4500000000000002e-05, "loss": 7.4406, "mean_token_accuracy": 0.09267855286598206, "num_tokens": 54575.0, "step": 30 }, { "entropy": 7.718957376480103, "epoch": 0.030081650193382038, "grad_norm": 0.97265625, "learning_rate": 1.7000000000000003e-05, "loss": 7.5222, "mean_token_accuracy": 0.08976790606975556, "num_tokens": 66403.0, "step": 35 }, { "entropy": 7.742082262039185, "epoch": 0.034379028792436615, "grad_norm": 0.87890625, "learning_rate": 1.95e-05, "loss": 7.4377, "mean_token_accuracy": 0.09164252653717994, "num_tokens": 76510.0, "step": 40 }, { "entropy": 7.745701646804809, "epoch": 0.03867640739149119, "grad_norm": 0.99609375, "learning_rate": 2.2e-05, "loss": 7.358, "mean_token_accuracy": 0.0955798089504242, "num_tokens": 84836.0, "step": 45 }, { "entropy": 7.780595874786377, "epoch": 0.042973785990545764, "grad_norm": 0.984375, "learning_rate": 2.4500000000000003e-05, "loss": 7.3289, "mean_token_accuracy": 0.10552914068102837, "num_tokens": 93197.0, "step": 50 }, { "entropy": 7.764179325103759, "epoch": 0.047271164589600345, "grad_norm": 0.98828125, "learning_rate": 2.7e-05, "loss": 7.3234, "mean_token_accuracy": 0.09917277097702026, "num_tokens": 101546.0, "step": 55 }, { "entropy": 7.719727945327759, "epoch": 0.05156854318865492, "grad_norm": 0.8515625, "learning_rate": 2.95e-05, "loss": 7.4172, "mean_token_accuracy": 0.0928034670650959, "num_tokens": 111703.0, "step": 60 }, { "entropy": 7.748228645324707, "epoch": 0.055865921787709494, "grad_norm": 0.95703125, "learning_rate": 3.2e-05, "loss": 7.3403, "mean_token_accuracy": 0.10037123262882233, "num_tokens": 119894.0, "step": 65 }, { "entropy": 7.714352416992187, "epoch": 0.060163300386764075, "grad_norm": 0.89453125, "learning_rate": 3.4500000000000005e-05, "loss": 7.2915, "mean_token_accuracy": 0.1022428810596466, "num_tokens": 128885.0, "step": 70 }, { "entropy": 7.679376173019409, "epoch": 0.06446067898581866, "grad_norm": 0.8984375, "learning_rate": 3.7e-05, "loss": 7.4226, "mean_token_accuracy": 0.0972097434103489, "num_tokens": 138106.0, "step": 75 }, { "entropy": 7.72790002822876, "epoch": 0.06875805758487323, "grad_norm": 1.140625, "learning_rate": 3.95e-05, "loss": 7.3294, "mean_token_accuracy": 0.1022751808166504, "num_tokens": 146691.0, "step": 80 }, { "entropy": 7.730126142501831, "epoch": 0.0730554361839278, "grad_norm": 0.99609375, "learning_rate": 4.2000000000000004e-05, "loss": 7.382, "mean_token_accuracy": 0.09973402544856072, "num_tokens": 155792.0, "step": 85 }, { "entropy": 7.727601718902588, "epoch": 0.07735281478298238, "grad_norm": 0.89453125, "learning_rate": 4.45e-05, "loss": 7.4474, "mean_token_accuracy": 0.08758748695254326, "num_tokens": 166944.0, "step": 90 }, { "entropy": 7.782265329360962, "epoch": 0.08165019338203695, "grad_norm": 0.98828125, "learning_rate": 4.7000000000000004e-05, "loss": 7.2886, "mean_token_accuracy": 0.1041356198489666, "num_tokens": 175303.0, "step": 95 }, { "entropy": 7.751953029632569, "epoch": 0.08594757198109153, "grad_norm": 1.0078125, "learning_rate": 4.9500000000000004e-05, "loss": 7.3403, "mean_token_accuracy": 0.09793160557746887, "num_tokens": 184708.0, "step": 100 }, { "entropy": 7.702822208404541, "epoch": 0.09024495058014612, "grad_norm": 0.921875, "learning_rate": 5.2e-05, "loss": 7.3117, "mean_token_accuracy": 0.09851032048463822, "num_tokens": 193835.0, "step": 105 }, { "entropy": 7.686660861968994, "epoch": 0.09454232917920069, "grad_norm": 1.1328125, "learning_rate": 5.45e-05, "loss": 7.3479, "mean_token_accuracy": 0.0979080393910408, "num_tokens": 203344.0, "step": 110 }, { "entropy": 7.698584461212159, "epoch": 0.09883970777825526, "grad_norm": 0.9296875, "learning_rate": 5.7e-05, "loss": 7.4586, "mean_token_accuracy": 0.09130895733833314, "num_tokens": 213048.0, "step": 115 }, { "entropy": 7.781258678436279, "epoch": 0.10313708637730984, "grad_norm": 1.109375, "learning_rate": 5.9499999999999996e-05, "loss": 7.3094, "mean_token_accuracy": 0.10353164449334144, "num_tokens": 221784.0, "step": 120 }, { "entropy": 7.650211572647095, "epoch": 0.10743446497636441, "grad_norm": 1.0078125, "learning_rate": 6.2e-05, "loss": 7.3189, "mean_token_accuracy": 0.09726176261901856, "num_tokens": 230971.0, "step": 125 }, { "entropy": 7.655170726776123, "epoch": 0.11173184357541899, "grad_norm": 0.96484375, "learning_rate": 6.450000000000001e-05, "loss": 7.2818, "mean_token_accuracy": 0.1042576052248478, "num_tokens": 240524.0, "step": 130 }, { "entropy": 7.7341550350189205, "epoch": 0.11602922217447358, "grad_norm": 0.88671875, "learning_rate": 6.7e-05, "loss": 7.2512, "mean_token_accuracy": 0.1007460281252861, "num_tokens": 249220.0, "step": 135 }, { "entropy": 7.745693302154541, "epoch": 0.12032660077352815, "grad_norm": 1.0234375, "learning_rate": 6.950000000000001e-05, "loss": 7.3688, "mean_token_accuracy": 0.10030856803059578, "num_tokens": 258934.0, "step": 140 }, { "entropy": 7.694993305206299, "epoch": 0.12462397937258272, "grad_norm": 1.0234375, "learning_rate": 7.2e-05, "loss": 7.2936, "mean_token_accuracy": 0.10321335718035698, "num_tokens": 267680.0, "step": 145 }, { "entropy": 7.719129991531372, "epoch": 0.1289213579716373, "grad_norm": 1.0078125, "learning_rate": 7.45e-05, "loss": 7.3236, "mean_token_accuracy": 0.10207543894648552, "num_tokens": 276227.0, "step": 150 }, { "entropy": 7.648375129699707, "epoch": 0.1332187365706919, "grad_norm": 0.94921875, "learning_rate": 7.7e-05, "loss": 7.2203, "mean_token_accuracy": 0.1059327855706215, "num_tokens": 286342.0, "step": 155 }, { "entropy": 7.674158382415771, "epoch": 0.13751611516974646, "grad_norm": 1.0625, "learning_rate": 7.950000000000001e-05, "loss": 7.2988, "mean_token_accuracy": 0.09665355160832405, "num_tokens": 294994.0, "step": 160 }, { "entropy": 7.717900514602661, "epoch": 0.14181349376880104, "grad_norm": 1.046875, "learning_rate": 8.2e-05, "loss": 7.2704, "mean_token_accuracy": 0.10349940955638885, "num_tokens": 303882.0, "step": 165 }, { "entropy": 7.6729988098144535, "epoch": 0.1461108723678556, "grad_norm": 0.9609375, "learning_rate": 8.450000000000001e-05, "loss": 7.3104, "mean_token_accuracy": 0.10128599181771278, "num_tokens": 312515.0, "step": 170 }, { "entropy": 7.739007139205933, "epoch": 0.15040825096691018, "grad_norm": 1.2109375, "learning_rate": 8.7e-05, "loss": 7.27, "mean_token_accuracy": 0.10081852003931999, "num_tokens": 320801.0, "step": 175 }, { "entropy": 7.720875406265259, "epoch": 0.15470562956596476, "grad_norm": 1.015625, "learning_rate": 8.95e-05, "loss": 7.2872, "mean_token_accuracy": 0.10100285485386848, "num_tokens": 329382.0, "step": 180 }, { "entropy": 7.66646089553833, "epoch": 0.15900300816501933, "grad_norm": 1.0390625, "learning_rate": 9.2e-05, "loss": 7.2814, "mean_token_accuracy": 0.1028428927063942, "num_tokens": 337894.0, "step": 185 }, { "entropy": 7.772510719299317, "epoch": 0.1633003867640739, "grad_norm": 1.125, "learning_rate": 9.45e-05, "loss": 7.2803, "mean_token_accuracy": 0.10378619506955147, "num_tokens": 346380.0, "step": 190 }, { "entropy": 7.690706968307495, "epoch": 0.16759776536312848, "grad_norm": 0.890625, "learning_rate": 9.7e-05, "loss": 7.3588, "mean_token_accuracy": 0.09733301475644111, "num_tokens": 356305.0, "step": 195 }, { "entropy": 7.79454927444458, "epoch": 0.17189514396218306, "grad_norm": 1.0078125, "learning_rate": 9.95e-05, "loss": 7.306, "mean_token_accuracy": 0.09683404862880707, "num_tokens": 364899.0, "step": 200 }, { "entropy": 7.694888687133789, "epoch": 0.17619252256123766, "grad_norm": 1.015625, "learning_rate": 0.000102, "loss": 7.2938, "mean_token_accuracy": 0.09810400977730752, "num_tokens": 373663.0, "step": 205 }, { "entropy": 7.748025798797608, "epoch": 0.18048990116029223, "grad_norm": 1.1640625, "learning_rate": 0.00010449999999999999, "loss": 7.2566, "mean_token_accuracy": 0.10043591782450675, "num_tokens": 382730.0, "step": 210 }, { "entropy": 7.706165361404419, "epoch": 0.1847872797593468, "grad_norm": 1.1328125, "learning_rate": 0.000107, "loss": 7.3157, "mean_token_accuracy": 0.09612104147672654, "num_tokens": 392676.0, "step": 215 }, { "entropy": 7.760982656478882, "epoch": 0.18908465835840138, "grad_norm": 1.2265625, "learning_rate": 0.0001095, "loss": 7.2955, "mean_token_accuracy": 0.10281639397144318, "num_tokens": 401050.0, "step": 220 }, { "entropy": 7.626513719558716, "epoch": 0.19338203695745596, "grad_norm": 1.078125, "learning_rate": 0.000112, "loss": 7.2692, "mean_token_accuracy": 0.10119878426194191, "num_tokens": 410009.0, "step": 225 }, { "entropy": 7.726489019393921, "epoch": 0.19767941555651053, "grad_norm": 0.98828125, "learning_rate": 0.0001145, "loss": 7.2683, "mean_token_accuracy": 0.10186234638094901, "num_tokens": 419302.0, "step": 230 }, { "entropy": 7.643717670440674, "epoch": 0.2019767941555651, "grad_norm": 1.109375, "learning_rate": 0.00011700000000000001, "loss": 7.1665, "mean_token_accuracy": 0.10647615045309067, "num_tokens": 427296.0, "step": 235 }, { "entropy": 7.666737127304077, "epoch": 0.20627417275461968, "grad_norm": 1.125, "learning_rate": 0.00011949999999999999, "loss": 7.3139, "mean_token_accuracy": 0.10131902173161507, "num_tokens": 436368.0, "step": 240 }, { "entropy": 7.772911167144775, "epoch": 0.21057155135367425, "grad_norm": 1.046875, "learning_rate": 0.000122, "loss": 7.2112, "mean_token_accuracy": 0.1055280588567257, "num_tokens": 445535.0, "step": 245 }, { "entropy": 7.602903366088867, "epoch": 0.21486892995272883, "grad_norm": 1.046875, "learning_rate": 0.0001245, "loss": 7.2153, "mean_token_accuracy": 0.10406075567007064, "num_tokens": 454769.0, "step": 250 }, { "entropy": 7.693030595779419, "epoch": 0.2191663085517834, "grad_norm": 1.125, "learning_rate": 0.000127, "loss": 7.2315, "mean_token_accuracy": 0.10270996242761612, "num_tokens": 463975.0, "step": 255 }, { "entropy": 7.637308835983276, "epoch": 0.22346368715083798, "grad_norm": 1.109375, "learning_rate": 0.0001295, "loss": 7.2542, "mean_token_accuracy": 0.10225536078214645, "num_tokens": 472899.0, "step": 260 }, { "entropy": 7.740519666671753, "epoch": 0.22776106574989258, "grad_norm": 1.09375, "learning_rate": 0.000132, "loss": 7.229, "mean_token_accuracy": 0.1005932256579399, "num_tokens": 481556.0, "step": 265 }, { "entropy": 7.654651689529419, "epoch": 0.23205844434894715, "grad_norm": 1.0625, "learning_rate": 0.00013450000000000002, "loss": 7.2258, "mean_token_accuracy": 0.10702893435955048, "num_tokens": 490253.0, "step": 270 }, { "entropy": 7.660864973068238, "epoch": 0.23635582294800173, "grad_norm": 1.2265625, "learning_rate": 0.00013700000000000002, "loss": 7.2451, "mean_token_accuracy": 0.10333684608340263, "num_tokens": 498444.0, "step": 275 }, { "entropy": 7.637535953521729, "epoch": 0.2406532015470563, "grad_norm": 0.98046875, "learning_rate": 0.0001395, "loss": 7.191, "mean_token_accuracy": 0.10794568434357643, "num_tokens": 508330.0, "step": 280 }, { "entropy": 7.6566917419433596, "epoch": 0.24495058014611087, "grad_norm": 1.234375, "learning_rate": 0.00014199999999999998, "loss": 7.3004, "mean_token_accuracy": 0.10417937636375427, "num_tokens": 517900.0, "step": 285 }, { "entropy": 7.670303010940552, "epoch": 0.24924795874516545, "grad_norm": 1.1484375, "learning_rate": 0.0001445, "loss": 7.2276, "mean_token_accuracy": 0.10308908969163895, "num_tokens": 527808.0, "step": 290 }, { "entropy": 7.719700765609741, "epoch": 0.25354533734422, "grad_norm": 1.1484375, "learning_rate": 0.000147, "loss": 7.2415, "mean_token_accuracy": 0.10010977610945701, "num_tokens": 536931.0, "step": 295 }, { "entropy": 7.668509387969971, "epoch": 0.2578427159432746, "grad_norm": 1.1796875, "learning_rate": 0.0001495, "loss": 7.279, "mean_token_accuracy": 0.10248880609869956, "num_tokens": 545758.0, "step": 300 }, { "entropy": 7.700217819213867, "epoch": 0.26214009454232917, "grad_norm": 1.0390625, "learning_rate": 0.000152, "loss": 7.2819, "mean_token_accuracy": 0.10198702886700631, "num_tokens": 555165.0, "step": 305 }, { "entropy": 7.6267822265625, "epoch": 0.2664374731413838, "grad_norm": 1.1171875, "learning_rate": 0.00015450000000000001, "loss": 7.2035, "mean_token_accuracy": 0.10117841735482216, "num_tokens": 564719.0, "step": 310 }, { "entropy": 7.646708202362061, "epoch": 0.2707348517404383, "grad_norm": 1.0859375, "learning_rate": 0.000157, "loss": 7.1638, "mean_token_accuracy": 0.10670615658164025, "num_tokens": 573572.0, "step": 315 }, { "entropy": 7.759027910232544, "epoch": 0.2750322303394929, "grad_norm": 1.3984375, "learning_rate": 0.0001595, "loss": 7.3476, "mean_token_accuracy": 0.10210367739200592, "num_tokens": 581497.0, "step": 320 }, { "entropy": 7.590592908859253, "epoch": 0.27932960893854747, "grad_norm": 1.125, "learning_rate": 0.000162, "loss": 7.2138, "mean_token_accuracy": 0.10664469674229622, "num_tokens": 591107.0, "step": 325 }, { "entropy": 7.70356388092041, "epoch": 0.28362698753760207, "grad_norm": 1.0546875, "learning_rate": 0.00016450000000000001, "loss": 7.2482, "mean_token_accuracy": 0.1050640620291233, "num_tokens": 600241.0, "step": 330 }, { "entropy": 7.639587259292602, "epoch": 0.2879243661366566, "grad_norm": 1.0703125, "learning_rate": 0.00016700000000000002, "loss": 7.161, "mean_token_accuracy": 0.1065776713192463, "num_tokens": 608697.0, "step": 335 }, { "entropy": 7.602131795883179, "epoch": 0.2922217447357112, "grad_norm": 1.1484375, "learning_rate": 0.00016950000000000003, "loss": 7.1698, "mean_token_accuracy": 0.1098954938352108, "num_tokens": 617275.0, "step": 340 }, { "entropy": 7.669042348861694, "epoch": 0.29651912333476577, "grad_norm": 1.0859375, "learning_rate": 0.00017199999999999998, "loss": 7.2602, "mean_token_accuracy": 0.1007254920899868, "num_tokens": 626644.0, "step": 345 }, { "entropy": 7.623440217971802, "epoch": 0.30081650193382037, "grad_norm": 1.1171875, "learning_rate": 0.00017449999999999999, "loss": 7.1639, "mean_token_accuracy": 0.1080157920718193, "num_tokens": 635110.0, "step": 350 }, { "entropy": 7.711002826690674, "epoch": 0.30511388053287497, "grad_norm": 0.97265625, "learning_rate": 0.000177, "loss": 7.3139, "mean_token_accuracy": 0.10216462090611458, "num_tokens": 644746.0, "step": 355 }, { "entropy": 7.708708238601685, "epoch": 0.3094112591319295, "grad_norm": 1.234375, "learning_rate": 0.0001795, "loss": 7.2216, "mean_token_accuracy": 0.1021303728222847, "num_tokens": 654281.0, "step": 360 }, { "entropy": 7.534019136428833, "epoch": 0.3137086377309841, "grad_norm": 1.234375, "learning_rate": 0.000182, "loss": 7.2333, "mean_token_accuracy": 0.10576817691326142, "num_tokens": 663174.0, "step": 365 }, { "entropy": 7.660452365875244, "epoch": 0.31800601633003867, "grad_norm": 1.0625, "learning_rate": 0.0001845, "loss": 7.1525, "mean_token_accuracy": 0.10541519671678543, "num_tokens": 672178.0, "step": 370 }, { "entropy": 7.651990938186645, "epoch": 0.32230339492909327, "grad_norm": 1.1484375, "learning_rate": 0.000187, "loss": 7.1748, "mean_token_accuracy": 0.10421534106135369, "num_tokens": 681323.0, "step": 375 }, { "entropy": 7.537337684631348, "epoch": 0.3266007735281478, "grad_norm": 0.98046875, "learning_rate": 0.0001895, "loss": 7.1001, "mean_token_accuracy": 0.11140918657183647, "num_tokens": 690461.0, "step": 380 }, { "entropy": 7.596573305130005, "epoch": 0.3308981521272024, "grad_norm": 1.2734375, "learning_rate": 0.000192, "loss": 7.1461, "mean_token_accuracy": 0.10594902262091636, "num_tokens": 699199.0, "step": 385 }, { "entropy": 7.566946506500244, "epoch": 0.33519553072625696, "grad_norm": 1.2265625, "learning_rate": 0.0001945, "loss": 7.109, "mean_token_accuracy": 0.11522968709468842, "num_tokens": 707949.0, "step": 390 }, { "entropy": 7.66830849647522, "epoch": 0.33949290932531156, "grad_norm": 1.15625, "learning_rate": 0.00019700000000000002, "loss": 7.1843, "mean_token_accuracy": 0.10416831225156784, "num_tokens": 715752.0, "step": 395 }, { "entropy": 7.619978666305542, "epoch": 0.3437902879243661, "grad_norm": 1.2734375, "learning_rate": 0.00019950000000000002, "loss": 7.1119, "mean_token_accuracy": 0.11198346018791198, "num_tokens": 724416.0, "step": 400 }, { "entropy": 7.594716548919678, "epoch": 0.3480876665234207, "grad_norm": 1.3203125, "learning_rate": 0.000202, "loss": 7.1774, "mean_token_accuracy": 0.10296614542603492, "num_tokens": 733116.0, "step": 405 }, { "entropy": 7.614369249343872, "epoch": 0.3523850451224753, "grad_norm": 1.265625, "learning_rate": 0.00020449999999999998, "loss": 7.1639, "mean_token_accuracy": 0.10737873241305351, "num_tokens": 742093.0, "step": 410 }, { "entropy": 7.532227945327759, "epoch": 0.35668242372152986, "grad_norm": 1.1640625, "learning_rate": 0.000207, "loss": 7.1385, "mean_token_accuracy": 0.11264142915606498, "num_tokens": 750402.0, "step": 415 }, { "entropy": 7.510246276855469, "epoch": 0.36097980232058446, "grad_norm": 1.0625, "learning_rate": 0.0002095, "loss": 7.1129, "mean_token_accuracy": 0.11108387559652329, "num_tokens": 760961.0, "step": 420 }, { "entropy": 7.720337963104248, "epoch": 0.365277180919639, "grad_norm": 1.171875, "learning_rate": 0.000212, "loss": 7.2042, "mean_token_accuracy": 0.10612902790307999, "num_tokens": 770554.0, "step": 425 }, { "entropy": 7.437310361862183, "epoch": 0.3695745595186936, "grad_norm": 1.328125, "learning_rate": 0.0002145, "loss": 7.1596, "mean_token_accuracy": 0.11299800872802734, "num_tokens": 779172.0, "step": 430 }, { "entropy": 7.663910818099976, "epoch": 0.37387193811774816, "grad_norm": 1.1953125, "learning_rate": 0.00021700000000000002, "loss": 7.2239, "mean_token_accuracy": 0.10290571823716163, "num_tokens": 788040.0, "step": 435 }, { "entropy": 7.589281415939331, "epoch": 0.37816931671680276, "grad_norm": 1.125, "learning_rate": 0.0002195, "loss": 7.1461, "mean_token_accuracy": 0.10722599253058433, "num_tokens": 796786.0, "step": 440 }, { "entropy": 7.543337059020996, "epoch": 0.3824666953158573, "grad_norm": 1.4296875, "learning_rate": 0.000222, "loss": 7.1192, "mean_token_accuracy": 0.10885161831974983, "num_tokens": 805520.0, "step": 445 }, { "entropy": 7.486078453063965, "epoch": 0.3867640739149119, "grad_norm": 1.3125, "learning_rate": 0.0002245, "loss": 7.074, "mean_token_accuracy": 0.10658745989203453, "num_tokens": 814939.0, "step": 450 }, { "entropy": 7.534557342529297, "epoch": 0.39106145251396646, "grad_norm": 1.2421875, "learning_rate": 0.00022700000000000002, "loss": 7.0766, "mean_token_accuracy": 0.11227057129144669, "num_tokens": 823862.0, "step": 455 }, { "entropy": 7.5476549625396725, "epoch": 0.39535883111302106, "grad_norm": 1.15625, "learning_rate": 0.00022950000000000002, "loss": 7.1124, "mean_token_accuracy": 0.10576009079813957, "num_tokens": 832820.0, "step": 460 }, { "entropy": 7.601094675064087, "epoch": 0.39965620971207566, "grad_norm": 1.234375, "learning_rate": 0.00023200000000000003, "loss": 7.0697, "mean_token_accuracy": 0.11121490225195885, "num_tokens": 841538.0, "step": 465 }, { "entropy": 7.544060945510864, "epoch": 0.4039535883111302, "grad_norm": 1.1953125, "learning_rate": 0.00023449999999999998, "loss": 7.2069, "mean_token_accuracy": 0.10181558132171631, "num_tokens": 851123.0, "step": 470 }, { "entropy": 7.549469089508056, "epoch": 0.4082509669101848, "grad_norm": 1.1875, "learning_rate": 0.000237, "loss": 7.1633, "mean_token_accuracy": 0.11091246008872986, "num_tokens": 860357.0, "step": 475 }, { "entropy": 7.547894096374511, "epoch": 0.41254834550923936, "grad_norm": 1.234375, "learning_rate": 0.0002395, "loss": 7.0874, "mean_token_accuracy": 0.10722309574484826, "num_tokens": 869980.0, "step": 480 }, { "entropy": 7.507503604888916, "epoch": 0.41684572410829396, "grad_norm": 1.2421875, "learning_rate": 0.000242, "loss": 7.0572, "mean_token_accuracy": 0.11242355704307556, "num_tokens": 878250.0, "step": 485 }, { "entropy": 7.5191121101379395, "epoch": 0.4211431027073485, "grad_norm": 1.125, "learning_rate": 0.0002445, "loss": 7.1411, "mean_token_accuracy": 0.11158529818058013, "num_tokens": 887624.0, "step": 490 }, { "entropy": 7.454204320907593, "epoch": 0.4254404813064031, "grad_norm": 1.1640625, "learning_rate": 0.000247, "loss": 7.1159, "mean_token_accuracy": 0.11260272860527039, "num_tokens": 897120.0, "step": 495 }, { "entropy": 7.495032835006714, "epoch": 0.42973785990545765, "grad_norm": 1.140625, "learning_rate": 0.0002495, "loss": 7.0795, "mean_token_accuracy": 0.11134620234370232, "num_tokens": 906215.0, "step": 500 }, { "epoch": 0.42973785990545765, "eval_entropy": 7.203803374960616, "eval_loss": 7.096514701843262, "eval_mean_token_accuracy": 0.11462040213649874, "eval_num_tokens": 906215.0, "eval_runtime": 2.0645, "eval_samples_per_second": 1719.022, "eval_steps_per_second": 215.059, "step": 500 }, { "entropy": 7.447824621200562, "epoch": 0.43403523850451226, "grad_norm": 1.15625, "learning_rate": 0.000252, "loss": 7.0811, "mean_token_accuracy": 0.1122453585267067, "num_tokens": 915181.0, "step": 505 }, { "entropy": 7.498021125793457, "epoch": 0.4383326171035668, "grad_norm": 1.328125, "learning_rate": 0.0002545, "loss": 7.1044, "mean_token_accuracy": 0.10958386138081551, "num_tokens": 924377.0, "step": 510 }, { "entropy": 7.607626008987427, "epoch": 0.4426299957026214, "grad_norm": 1.1796875, "learning_rate": 0.000257, "loss": 7.1944, "mean_token_accuracy": 0.10655399709939957, "num_tokens": 933114.0, "step": 515 }, { "entropy": 7.6139122486114506, "epoch": 0.44692737430167595, "grad_norm": 1.0625, "learning_rate": 0.0002595, "loss": 7.1453, "mean_token_accuracy": 0.11119715198874473, "num_tokens": 943306.0, "step": 520 }, { "entropy": 7.436026573181152, "epoch": 0.45122475290073055, "grad_norm": 1.2578125, "learning_rate": 0.000262, "loss": 7.0354, "mean_token_accuracy": 0.11904665902256965, "num_tokens": 951515.0, "step": 525 }, { "entropy": 7.494698238372803, "epoch": 0.45552213149978515, "grad_norm": 1.2578125, "learning_rate": 0.00026450000000000003, "loss": 7.1519, "mean_token_accuracy": 0.10504961535334587, "num_tokens": 962686.0, "step": 530 }, { "entropy": 7.572213172912598, "epoch": 0.4598195100988397, "grad_norm": 1.125, "learning_rate": 0.00026700000000000004, "loss": 7.1449, "mean_token_accuracy": 0.11348244249820709, "num_tokens": 972136.0, "step": 535 }, { "entropy": 7.405817127227783, "epoch": 0.4641168886978943, "grad_norm": 1.2734375, "learning_rate": 0.00026950000000000005, "loss": 7.0518, "mean_token_accuracy": 0.1100372053682804, "num_tokens": 981301.0, "step": 540 }, { "entropy": 7.484500360488892, "epoch": 0.46841426729694885, "grad_norm": 1.390625, "learning_rate": 0.00027200000000000005, "loss": 7.0823, "mean_token_accuracy": 0.1120329774916172, "num_tokens": 990360.0, "step": 545 }, { "entropy": 7.573296546936035, "epoch": 0.47271164589600345, "grad_norm": 1.21875, "learning_rate": 0.0002745, "loss": 7.1293, "mean_token_accuracy": 0.10760239511728287, "num_tokens": 999415.0, "step": 550 }, { "entropy": 7.419287919998169, "epoch": 0.477009024495058, "grad_norm": 1.0859375, "learning_rate": 0.000277, "loss": 7.057, "mean_token_accuracy": 0.10999582111835479, "num_tokens": 1008762.0, "step": 555 }, { "entropy": 7.44342451095581, "epoch": 0.4813064030941126, "grad_norm": 1.2890625, "learning_rate": 0.0002795, "loss": 7.0505, "mean_token_accuracy": 0.11702658385038375, "num_tokens": 1017704.0, "step": 560 }, { "entropy": 7.457871007919311, "epoch": 0.48560378169316715, "grad_norm": 1.234375, "learning_rate": 0.00028199999999999997, "loss": 7.018, "mean_token_accuracy": 0.11318592131137847, "num_tokens": 1026251.0, "step": 565 }, { "entropy": 7.356105470657349, "epoch": 0.48990116029222175, "grad_norm": 1.0859375, "learning_rate": 0.0002845, "loss": 7.0083, "mean_token_accuracy": 0.11355392187833786, "num_tokens": 1036191.0, "step": 570 }, { "entropy": 7.5119133472442625, "epoch": 0.4941985388912763, "grad_norm": 1.1953125, "learning_rate": 0.000287, "loss": 7.0501, "mean_token_accuracy": 0.11168754398822785, "num_tokens": 1044936.0, "step": 575 }, { "entropy": 7.406773805618286, "epoch": 0.4984959174903309, "grad_norm": 1.171875, "learning_rate": 0.0002895, "loss": 7.0476, "mean_token_accuracy": 0.1135815680027008, "num_tokens": 1053683.0, "step": 580 }, { "entropy": 7.3828895568847654, "epoch": 0.5027932960893855, "grad_norm": 1.15625, "learning_rate": 0.000292, "loss": 7.0283, "mean_token_accuracy": 0.11782724559307098, "num_tokens": 1062932.0, "step": 585 }, { "entropy": 7.4789910316467285, "epoch": 0.50709067468844, "grad_norm": 1.0859375, "learning_rate": 0.0002945, "loss": 7.0524, "mean_token_accuracy": 0.11150057762861251, "num_tokens": 1072313.0, "step": 590 }, { "entropy": 7.458136653900146, "epoch": 0.5113880532874946, "grad_norm": 1.078125, "learning_rate": 0.000297, "loss": 7.033, "mean_token_accuracy": 0.10738502442836761, "num_tokens": 1081675.0, "step": 595 }, { "entropy": 7.437460470199585, "epoch": 0.5156854318865493, "grad_norm": 1.1875, "learning_rate": 0.0002995, "loss": 7.0392, "mean_token_accuracy": 0.11078862249851226, "num_tokens": 1091541.0, "step": 600 }, { "entropy": 7.43347053527832, "epoch": 0.5199828104856038, "grad_norm": 1.1171875, "learning_rate": 0.000302, "loss": 7.0467, "mean_token_accuracy": 0.11545747444033623, "num_tokens": 1100724.0, "step": 605 }, { "entropy": 7.34070782661438, "epoch": 0.5242801890846583, "grad_norm": 1.265625, "learning_rate": 0.0003045, "loss": 7.0062, "mean_token_accuracy": 0.11681902781128883, "num_tokens": 1108869.0, "step": 610 }, { "entropy": 7.513333511352539, "epoch": 0.5285775676837129, "grad_norm": 1.2109375, "learning_rate": 0.000307, "loss": 7.0303, "mean_token_accuracy": 0.11391275599598885, "num_tokens": 1117314.0, "step": 615 }, { "entropy": 7.237616014480591, "epoch": 0.5328749462827675, "grad_norm": 1.1875, "learning_rate": 0.0003095, "loss": 6.969, "mean_token_accuracy": 0.11866867989301681, "num_tokens": 1126786.0, "step": 620 }, { "entropy": 7.403380393981934, "epoch": 0.5371723248818221, "grad_norm": 1.3515625, "learning_rate": 0.000312, "loss": 6.983, "mean_token_accuracy": 0.11322688534855843, "num_tokens": 1136013.0, "step": 625 }, { "entropy": 7.355997228622437, "epoch": 0.5414697034808766, "grad_norm": 1.15625, "learning_rate": 0.0003145, "loss": 7.0163, "mean_token_accuracy": 0.1159099243581295, "num_tokens": 1144970.0, "step": 630 }, { "entropy": 7.416441440582275, "epoch": 0.5457670820799312, "grad_norm": 1.3046875, "learning_rate": 0.000317, "loss": 6.9784, "mean_token_accuracy": 0.12343248203396798, "num_tokens": 1153810.0, "step": 635 }, { "entropy": 7.320913982391358, "epoch": 0.5500644606789858, "grad_norm": 1.234375, "learning_rate": 0.0003195, "loss": 6.96, "mean_token_accuracy": 0.11895549520850182, "num_tokens": 1162498.0, "step": 640 }, { "entropy": 7.383200359344483, "epoch": 0.5543618392780404, "grad_norm": 1.15625, "learning_rate": 0.000322, "loss": 7.0441, "mean_token_accuracy": 0.11171148270368576, "num_tokens": 1172091.0, "step": 645 }, { "entropy": 7.465569925308228, "epoch": 0.5586592178770949, "grad_norm": 1.1875, "learning_rate": 0.00032450000000000003, "loss": 7.0379, "mean_token_accuracy": 0.1126454509794712, "num_tokens": 1181400.0, "step": 650 }, { "entropy": 7.29718279838562, "epoch": 0.5629565964761496, "grad_norm": 1.3671875, "learning_rate": 0.00032700000000000003, "loss": 7.0066, "mean_token_accuracy": 0.11692977026104927, "num_tokens": 1189780.0, "step": 655 }, { "entropy": 7.376112461090088, "epoch": 0.5672539750752041, "grad_norm": 1.234375, "learning_rate": 0.00032950000000000004, "loss": 6.9708, "mean_token_accuracy": 0.11179102137684822, "num_tokens": 1198671.0, "step": 660 }, { "entropy": 7.406812715530395, "epoch": 0.5715513536742587, "grad_norm": 1.140625, "learning_rate": 0.00033200000000000005, "loss": 6.9887, "mean_token_accuracy": 0.11439693570137024, "num_tokens": 1207173.0, "step": 665 }, { "entropy": 7.267558336257935, "epoch": 0.5758487322733132, "grad_norm": 1.328125, "learning_rate": 0.00033450000000000005, "loss": 6.9252, "mean_token_accuracy": 0.11824023947119713, "num_tokens": 1216387.0, "step": 670 }, { "entropy": 7.466721105575561, "epoch": 0.5801461108723679, "grad_norm": 1.1640625, "learning_rate": 0.000337, "loss": 6.9093, "mean_token_accuracy": 0.11586858034133911, "num_tokens": 1224461.0, "step": 675 }, { "entropy": 7.260802936553955, "epoch": 0.5844434894714224, "grad_norm": 1.2265625, "learning_rate": 0.0003395, "loss": 6.9855, "mean_token_accuracy": 0.1176436722278595, "num_tokens": 1233774.0, "step": 680 }, { "entropy": 7.267514610290528, "epoch": 0.588740868070477, "grad_norm": 1.2109375, "learning_rate": 0.000342, "loss": 6.9319, "mean_token_accuracy": 0.12313097864389419, "num_tokens": 1242812.0, "step": 685 }, { "entropy": 7.451924133300781, "epoch": 0.5930382466695315, "grad_norm": 1.1640625, "learning_rate": 0.00034449999999999997, "loss": 7.0445, "mean_token_accuracy": 0.1125735655426979, "num_tokens": 1252872.0, "step": 690 }, { "entropy": 7.1216278076171875, "epoch": 0.5973356252685862, "grad_norm": 1.21875, "learning_rate": 0.000347, "loss": 6.8314, "mean_token_accuracy": 0.1210754469037056, "num_tokens": 1260852.0, "step": 695 }, { "entropy": 7.292500305175781, "epoch": 0.6016330038676407, "grad_norm": 1.21875, "learning_rate": 0.0003495, "loss": 6.9419, "mean_token_accuracy": 0.1167706459760666, "num_tokens": 1268925.0, "step": 700 }, { "entropy": 7.384844732284546, "epoch": 0.6059303824666953, "grad_norm": 1.1484375, "learning_rate": 0.000352, "loss": 6.9849, "mean_token_accuracy": 0.11300796419382095, "num_tokens": 1278994.0, "step": 705 }, { "entropy": 7.286926889419556, "epoch": 0.6102277610657499, "grad_norm": 1.1875, "learning_rate": 0.0003545, "loss": 6.9847, "mean_token_accuracy": 0.11259545534849166, "num_tokens": 1287698.0, "step": 710 }, { "entropy": 7.337662601470948, "epoch": 0.6145251396648045, "grad_norm": 1.125, "learning_rate": 0.000357, "loss": 6.9117, "mean_token_accuracy": 0.12028303518891334, "num_tokens": 1297475.0, "step": 715 }, { "entropy": 7.265739297866821, "epoch": 0.618822518263859, "grad_norm": 1.234375, "learning_rate": 0.0003595, "loss": 6.9558, "mean_token_accuracy": 0.11790136769413948, "num_tokens": 1306836.0, "step": 720 }, { "entropy": 7.3774675846099855, "epoch": 0.6231198968629136, "grad_norm": 1.140625, "learning_rate": 0.000362, "loss": 6.9932, "mean_token_accuracy": 0.11299360319972038, "num_tokens": 1315872.0, "step": 725 }, { "entropy": 7.3129335880279545, "epoch": 0.6274172754619682, "grad_norm": 1.28125, "learning_rate": 0.0003645, "loss": 6.9353, "mean_token_accuracy": 0.12453719973564148, "num_tokens": 1324624.0, "step": 730 }, { "entropy": 7.300215101242065, "epoch": 0.6317146540610228, "grad_norm": 1.34375, "learning_rate": 0.000367, "loss": 6.9246, "mean_token_accuracy": 0.12120431885123253, "num_tokens": 1333058.0, "step": 735 }, { "entropy": 7.065497016906738, "epoch": 0.6360120326600773, "grad_norm": 1.0703125, "learning_rate": 0.0003695, "loss": 6.8904, "mean_token_accuracy": 0.11625659838318825, "num_tokens": 1342376.0, "step": 740 }, { "entropy": 7.412401533126831, "epoch": 0.6403094112591319, "grad_norm": 1.2578125, "learning_rate": 0.000372, "loss": 6.9293, "mean_token_accuracy": 0.11268759667873382, "num_tokens": 1351386.0, "step": 745 }, { "entropy": 7.194233036041259, "epoch": 0.6446067898581865, "grad_norm": 1.3359375, "learning_rate": 0.0003745, "loss": 6.8338, "mean_token_accuracy": 0.12849506586790085, "num_tokens": 1358958.0, "step": 750 }, { "entropy": 7.3347986221313475, "epoch": 0.6489041684572411, "grad_norm": 1.2109375, "learning_rate": 0.000377, "loss": 6.988, "mean_token_accuracy": 0.11507417485117913, "num_tokens": 1368599.0, "step": 755 }, { "entropy": 7.380126667022705, "epoch": 0.6532015470562956, "grad_norm": 1.984375, "learning_rate": 0.0003795, "loss": 7.0127, "mean_token_accuracy": 0.111283528059721, "num_tokens": 1378529.0, "step": 760 }, { "entropy": 7.157611989974976, "epoch": 0.6574989256553503, "grad_norm": 1.3984375, "learning_rate": 0.000382, "loss": 6.8052, "mean_token_accuracy": 0.1265752285718918, "num_tokens": 1386993.0, "step": 765 }, { "entropy": 7.21686282157898, "epoch": 0.6617963042544048, "grad_norm": 1.4296875, "learning_rate": 0.0003845, "loss": 6.8936, "mean_token_accuracy": 0.12180712148547172, "num_tokens": 1395790.0, "step": 770 }, { "entropy": 7.166302919387817, "epoch": 0.6660936828534594, "grad_norm": 1.1875, "learning_rate": 0.00038700000000000003, "loss": 6.9063, "mean_token_accuracy": 0.11845313757658005, "num_tokens": 1405587.0, "step": 775 }, { "entropy": 7.20961365699768, "epoch": 0.6703910614525139, "grad_norm": 1.1875, "learning_rate": 0.00038950000000000003, "loss": 6.8702, "mean_token_accuracy": 0.12274195328354835, "num_tokens": 1414478.0, "step": 780 }, { "entropy": 7.319825458526611, "epoch": 0.6746884400515686, "grad_norm": 1.4296875, "learning_rate": 0.00039200000000000004, "loss": 6.9317, "mean_token_accuracy": 0.12083822339773179, "num_tokens": 1423791.0, "step": 785 }, { "entropy": 7.313541460037231, "epoch": 0.6789858186506231, "grad_norm": 1.328125, "learning_rate": 0.00039450000000000005, "loss": 6.975, "mean_token_accuracy": 0.11185284182429314, "num_tokens": 1432955.0, "step": 790 }, { "entropy": 7.242367315292358, "epoch": 0.6832831972496777, "grad_norm": 1.03125, "learning_rate": 0.00039700000000000005, "loss": 6.9394, "mean_token_accuracy": 0.11529579535126686, "num_tokens": 1441907.0, "step": 795 }, { "entropy": 7.173644304275513, "epoch": 0.6875805758487322, "grad_norm": 1.2734375, "learning_rate": 0.0003995, "loss": 6.8059, "mean_token_accuracy": 0.12198502644896507, "num_tokens": 1451062.0, "step": 800 }, { "entropy": 7.2840491771698, "epoch": 0.6918779544477869, "grad_norm": 1.109375, "learning_rate": 0.000402, "loss": 6.8894, "mean_token_accuracy": 0.11644295528531075, "num_tokens": 1460132.0, "step": 805 }, { "entropy": 7.085446500778199, "epoch": 0.6961753330468414, "grad_norm": 1.078125, "learning_rate": 0.0004045, "loss": 6.7896, "mean_token_accuracy": 0.12437586709856988, "num_tokens": 1469582.0, "step": 810 }, { "entropy": 7.180881690979004, "epoch": 0.700472711645896, "grad_norm": 1.4453125, "learning_rate": 0.00040699999999999997, "loss": 6.8844, "mean_token_accuracy": 0.11694586053490638, "num_tokens": 1479053.0, "step": 815 }, { "entropy": 7.176044559478759, "epoch": 0.7047700902449506, "grad_norm": 1.21875, "learning_rate": 0.0004095, "loss": 6.8874, "mean_token_accuracy": 0.11812442615628242, "num_tokens": 1488189.0, "step": 820 }, { "entropy": 7.071721315383911, "epoch": 0.7090674688440052, "grad_norm": 1.2578125, "learning_rate": 0.000412, "loss": 6.7495, "mean_token_accuracy": 0.12273769155144691, "num_tokens": 1497324.0, "step": 825 }, { "entropy": 7.243275499343872, "epoch": 0.7133648474430597, "grad_norm": 1.0546875, "learning_rate": 0.0004145, "loss": 6.8631, "mean_token_accuracy": 0.12297548577189446, "num_tokens": 1506543.0, "step": 830 }, { "entropy": 7.1102629661560055, "epoch": 0.7176622260421143, "grad_norm": 1.171875, "learning_rate": 0.000417, "loss": 6.8571, "mean_token_accuracy": 0.1257997862994671, "num_tokens": 1516737.0, "step": 835 }, { "entropy": 7.015081739425659, "epoch": 0.7219596046411689, "grad_norm": 1.1015625, "learning_rate": 0.0004195, "loss": 6.7311, "mean_token_accuracy": 0.12102818563580513, "num_tokens": 1525561.0, "step": 840 }, { "entropy": 7.17170901298523, "epoch": 0.7262569832402235, "grad_norm": 1.203125, "learning_rate": 0.000422, "loss": 6.757, "mean_token_accuracy": 0.12571127861738204, "num_tokens": 1533323.0, "step": 845 }, { "entropy": 7.173940944671631, "epoch": 0.730554361839278, "grad_norm": 1.2109375, "learning_rate": 0.0004245, "loss": 6.821, "mean_token_accuracy": 0.12750849053263663, "num_tokens": 1542632.0, "step": 850 }, { "entropy": 7.148316097259522, "epoch": 0.7348517404383326, "grad_norm": 1.296875, "learning_rate": 0.000427, "loss": 6.7649, "mean_token_accuracy": 0.12507490813732147, "num_tokens": 1551236.0, "step": 855 }, { "entropy": 6.981910467147827, "epoch": 0.7391491190373872, "grad_norm": 1.21875, "learning_rate": 0.0004295, "loss": 6.7641, "mean_token_accuracy": 0.12514904662966728, "num_tokens": 1559674.0, "step": 860 }, { "entropy": 7.186282157897949, "epoch": 0.7434464976364418, "grad_norm": 1.1484375, "learning_rate": 0.000432, "loss": 6.8498, "mean_token_accuracy": 0.1250532478094101, "num_tokens": 1569481.0, "step": 865 }, { "entropy": 7.118600702285766, "epoch": 0.7477438762354963, "grad_norm": 1.1796875, "learning_rate": 0.0004345, "loss": 6.8888, "mean_token_accuracy": 0.1209896370768547, "num_tokens": 1578488.0, "step": 870 }, { "entropy": 7.105226039886475, "epoch": 0.752041254834551, "grad_norm": 1.078125, "learning_rate": 0.000437, "loss": 6.7736, "mean_token_accuracy": 0.12527675032615662, "num_tokens": 1586675.0, "step": 875 }, { "entropy": 7.185068035125733, "epoch": 0.7563386334336055, "grad_norm": 1.1015625, "learning_rate": 0.0004395, "loss": 6.8782, "mean_token_accuracy": 0.1180253192782402, "num_tokens": 1595411.0, "step": 880 }, { "entropy": 7.179415893554688, "epoch": 0.7606360120326601, "grad_norm": 1.2734375, "learning_rate": 0.000442, "loss": 6.8619, "mean_token_accuracy": 0.12292847484350204, "num_tokens": 1604046.0, "step": 885 }, { "entropy": 7.130577564239502, "epoch": 0.7649333906317146, "grad_norm": 1.15625, "learning_rate": 0.0004445, "loss": 6.8566, "mean_token_accuracy": 0.11715829819440841, "num_tokens": 1613759.0, "step": 890 }, { "entropy": 7.111226511001587, "epoch": 0.7692307692307693, "grad_norm": 1.09375, "learning_rate": 0.000447, "loss": 6.8191, "mean_token_accuracy": 0.1252148814499378, "num_tokens": 1623323.0, "step": 895 }, { "entropy": 7.097943353652954, "epoch": 0.7735281478298238, "grad_norm": 1.21875, "learning_rate": 0.00044950000000000003, "loss": 6.7922, "mean_token_accuracy": 0.11943844705820084, "num_tokens": 1631727.0, "step": 900 }, { "entropy": 7.073408317565918, "epoch": 0.7778255264288784, "grad_norm": 1.21875, "learning_rate": 0.00045200000000000004, "loss": 6.7454, "mean_token_accuracy": 0.12582483813166617, "num_tokens": 1639544.0, "step": 905 }, { "entropy": 7.1905022144317625, "epoch": 0.7821229050279329, "grad_norm": 1.2421875, "learning_rate": 0.00045450000000000004, "loss": 6.8716, "mean_token_accuracy": 0.11673429310321808, "num_tokens": 1648931.0, "step": 910 }, { "entropy": 7.032827425003052, "epoch": 0.7864202836269876, "grad_norm": 1.140625, "learning_rate": 0.00045700000000000005, "loss": 6.7325, "mean_token_accuracy": 0.12737771049141883, "num_tokens": 1657688.0, "step": 915 }, { "entropy": 7.160619735717773, "epoch": 0.7907176622260421, "grad_norm": 1.0859375, "learning_rate": 0.00045950000000000006, "loss": 6.8191, "mean_token_accuracy": 0.11969996094703675, "num_tokens": 1666879.0, "step": 920 }, { "entropy": 7.016655492782593, "epoch": 0.7950150408250967, "grad_norm": 1.125, "learning_rate": 0.000462, "loss": 6.7912, "mean_token_accuracy": 0.12404834032058716, "num_tokens": 1676773.0, "step": 925 }, { "entropy": 7.205742454528808, "epoch": 0.7993124194241513, "grad_norm": 1.140625, "learning_rate": 0.0004645, "loss": 6.8942, "mean_token_accuracy": 0.11682869419455529, "num_tokens": 1686144.0, "step": 930 }, { "entropy": 7.093483018875122, "epoch": 0.8036097980232059, "grad_norm": 1.09375, "learning_rate": 0.000467, "loss": 6.8555, "mean_token_accuracy": 0.11735839322209358, "num_tokens": 1695476.0, "step": 935 }, { "entropy": 7.090408611297607, "epoch": 0.8079071766222604, "grad_norm": 1.1171875, "learning_rate": 0.0004695, "loss": 6.7525, "mean_token_accuracy": 0.12118161767721176, "num_tokens": 1704907.0, "step": 940 }, { "entropy": 7.016019344329834, "epoch": 0.812204555221315, "grad_norm": 1.0078125, "learning_rate": 0.000472, "loss": 6.7924, "mean_token_accuracy": 0.12617168575525284, "num_tokens": 1714564.0, "step": 945 }, { "entropy": 7.132166576385498, "epoch": 0.8165019338203696, "grad_norm": 1.1328125, "learning_rate": 0.0004745, "loss": 6.8135, "mean_token_accuracy": 0.12022659555077553, "num_tokens": 1725285.0, "step": 950 }, { "entropy": 7.00044469833374, "epoch": 0.8207993124194242, "grad_norm": 1.1015625, "learning_rate": 0.000477, "loss": 6.8177, "mean_token_accuracy": 0.12241263464093208, "num_tokens": 1734331.0, "step": 955 }, { "entropy": 7.126689529418945, "epoch": 0.8250966910184787, "grad_norm": 1.28125, "learning_rate": 0.0004795, "loss": 6.749, "mean_token_accuracy": 0.11530287116765976, "num_tokens": 1742340.0, "step": 960 }, { "entropy": 7.05500750541687, "epoch": 0.8293940696175333, "grad_norm": 1.15625, "learning_rate": 0.000482, "loss": 6.7383, "mean_token_accuracy": 0.12545244619250298, "num_tokens": 1751725.0, "step": 965 }, { "entropy": 6.894489717483521, "epoch": 0.8336914482165879, "grad_norm": 1.1796875, "learning_rate": 0.0004845, "loss": 6.6736, "mean_token_accuracy": 0.12856126353144645, "num_tokens": 1760294.0, "step": 970 }, { "entropy": 7.036704349517822, "epoch": 0.8379888268156425, "grad_norm": 1.0859375, "learning_rate": 0.000487, "loss": 6.7265, "mean_token_accuracy": 0.1231304183602333, "num_tokens": 1768912.0, "step": 975 }, { "entropy": 7.092654848098755, "epoch": 0.842286205414697, "grad_norm": 1.140625, "learning_rate": 0.0004895, "loss": 6.9187, "mean_token_accuracy": 0.12804483920335769, "num_tokens": 1778633.0, "step": 980 }, { "entropy": 7.090839195251465, "epoch": 0.8465835840137517, "grad_norm": 1.140625, "learning_rate": 0.000492, "loss": 6.7883, "mean_token_accuracy": 0.12408955544233322, "num_tokens": 1787275.0, "step": 985 }, { "entropy": 7.0695414543151855, "epoch": 0.8508809626128062, "grad_norm": 1.2734375, "learning_rate": 0.0004945, "loss": 6.7844, "mean_token_accuracy": 0.12348324134945869, "num_tokens": 1795994.0, "step": 990 }, { "entropy": 6.964667177200317, "epoch": 0.8551783412118608, "grad_norm": 0.94921875, "learning_rate": 0.000497, "loss": 6.7175, "mean_token_accuracy": 0.12602235972881318, "num_tokens": 1806379.0, "step": 995 }, { "entropy": 7.061655473709107, "epoch": 0.8594757198109153, "grad_norm": 1.09375, "learning_rate": 0.0004995, "loss": 6.7479, "mean_token_accuracy": 0.13024335727095604, "num_tokens": 1816135.0, "step": 1000 }, { "epoch": 0.8594757198109153, "eval_entropy": 6.75515693050247, "eval_loss": 6.752710819244385, "eval_mean_token_accuracy": 0.12811107195175445, "eval_num_tokens": 1816135.0, "eval_runtime": 2.0604, "eval_samples_per_second": 1722.442, "eval_steps_per_second": 215.487, "step": 1000 }, { "entropy": 6.9897054672241214, "epoch": 0.86377309840997, "grad_norm": 1.2890625, "learning_rate": 0.0004999998427807679, "loss": 6.7314, "mean_token_accuracy": 0.12282020673155784, "num_tokens": 1824777.0, "step": 1005 }, { "entropy": 6.925821113586426, "epoch": 0.8680704770090245, "grad_norm": 1.4296875, "learning_rate": 0.0004999992040780138, "loss": 6.8085, "mean_token_accuracy": 0.1247783549129963, "num_tokens": 1833807.0, "step": 1010 }, { "entropy": 7.123036670684814, "epoch": 0.8723678556080791, "grad_norm": 1.078125, "learning_rate": 0.0004999980740669294, "loss": 6.754, "mean_token_accuracy": 0.12499897480010987, "num_tokens": 1843375.0, "step": 1015 }, { "entropy": 7.027141857147217, "epoch": 0.8766652342071336, "grad_norm": 1.1796875, "learning_rate": 0.0004999964527499823, "loss": 6.8155, "mean_token_accuracy": 0.12067028507590294, "num_tokens": 1853036.0, "step": 1020 }, { "entropy": 7.018357038497925, "epoch": 0.8809626128061883, "grad_norm": 1.1328125, "learning_rate": 0.0004999943401307127, "loss": 6.7605, "mean_token_accuracy": 0.12497071847319603, "num_tokens": 1862041.0, "step": 1025 }, { "entropy": 6.984006929397583, "epoch": 0.8852599914052428, "grad_norm": 1.2421875, "learning_rate": 0.0004999917362137337, "loss": 6.6885, "mean_token_accuracy": 0.12735832259058952, "num_tokens": 1870707.0, "step": 1030 }, { "entropy": 6.964999151229859, "epoch": 0.8895573700042974, "grad_norm": 1.140625, "learning_rate": 0.0004999886410047312, "loss": 6.6849, "mean_token_accuracy": 0.12543184384703637, "num_tokens": 1879787.0, "step": 1035 }, { "entropy": 7.046022748947143, "epoch": 0.8938547486033519, "grad_norm": 1.1171875, "learning_rate": 0.0004999850545104638, "loss": 6.7336, "mean_token_accuracy": 0.12585699930787086, "num_tokens": 1889413.0, "step": 1040 }, { "entropy": 6.9450146675109865, "epoch": 0.8981521272024066, "grad_norm": 1.265625, "learning_rate": 0.0004999809767387633, "loss": 6.7291, "mean_token_accuracy": 0.12462790235877037, "num_tokens": 1898283.0, "step": 1045 }, { "entropy": 6.982704973220825, "epoch": 0.9024495058014611, "grad_norm": 1.109375, "learning_rate": 0.0004999764076985337, "loss": 6.7474, "mean_token_accuracy": 0.12953734770417213, "num_tokens": 1907175.0, "step": 1050 }, { "entropy": 6.947793340682983, "epoch": 0.9067468844005157, "grad_norm": 1.109375, "learning_rate": 0.0004999713473997519, "loss": 6.7933, "mean_token_accuracy": 0.12337937280535698, "num_tokens": 1918223.0, "step": 1055 }, { "entropy": 7.053569555282593, "epoch": 0.9110442629995703, "grad_norm": 1.109375, "learning_rate": 0.0004999657958534677, "loss": 6.7435, "mean_token_accuracy": 0.11936211958527565, "num_tokens": 1928801.0, "step": 1060 }, { "entropy": 6.874362564086914, "epoch": 0.9153416415986249, "grad_norm": 1.1171875, "learning_rate": 0.0004999597530718034, "loss": 6.7076, "mean_token_accuracy": 0.12535862401127815, "num_tokens": 1937406.0, "step": 1065 }, { "entropy": 6.924251508712769, "epoch": 0.9196390201976794, "grad_norm": 1.1171875, "learning_rate": 0.000499953219067954, "loss": 6.7025, "mean_token_accuracy": 0.12463184967637062, "num_tokens": 1947184.0, "step": 1070 }, { "entropy": 7.056308698654175, "epoch": 0.923936398796734, "grad_norm": 1.15625, "learning_rate": 0.0004999461938561873, "loss": 6.7241, "mean_token_accuracy": 0.12476856112480164, "num_tokens": 1956293.0, "step": 1075 }, { "entropy": 6.90220274925232, "epoch": 0.9282337773957886, "grad_norm": 1.1328125, "learning_rate": 0.0004999386774518432, "loss": 6.6968, "mean_token_accuracy": 0.12625648751854895, "num_tokens": 1964791.0, "step": 1080 }, { "entropy": 6.965981435775757, "epoch": 0.9325311559948432, "grad_norm": 1.0546875, "learning_rate": 0.0004999306698713349, "loss": 6.616, "mean_token_accuracy": 0.12837354317307473, "num_tokens": 1973754.0, "step": 1085 }, { "entropy": 6.929974555969238, "epoch": 0.9368285345938977, "grad_norm": 1.1015625, "learning_rate": 0.0004999221711321477, "loss": 6.6857, "mean_token_accuracy": 0.12695353776216506, "num_tokens": 1983035.0, "step": 1090 }, { "entropy": 6.804391956329345, "epoch": 0.9411259131929522, "grad_norm": 1.0859375, "learning_rate": 0.0004999131812528393, "loss": 6.7126, "mean_token_accuracy": 0.12742481231689454, "num_tokens": 1992584.0, "step": 1095 }, { "entropy": 7.0129533290863035, "epoch": 0.9454232917920069, "grad_norm": 0.94140625, "learning_rate": 0.00049990370025304, "loss": 6.745, "mean_token_accuracy": 0.1250165306031704, "num_tokens": 2001876.0, "step": 1100 }, { "entropy": 6.9361108303070065, "epoch": 0.9497206703910615, "grad_norm": 1.015625, "learning_rate": 0.0004998937281534526, "loss": 6.6354, "mean_token_accuracy": 0.1352070689201355, "num_tokens": 2011067.0, "step": 1105 }, { "entropy": 7.00281867980957, "epoch": 0.954018048990116, "grad_norm": 1.140625, "learning_rate": 0.0004998832649758521, "loss": 6.7191, "mean_token_accuracy": 0.12910578772425652, "num_tokens": 2020763.0, "step": 1110 }, { "entropy": 6.846075534820557, "epoch": 0.9583154275891707, "grad_norm": 1.2421875, "learning_rate": 0.0004998723107430862, "loss": 6.702, "mean_token_accuracy": 0.12597106099128724, "num_tokens": 2029534.0, "step": 1115 }, { "entropy": 6.979312801361084, "epoch": 0.9626128061882252, "grad_norm": 1.109375, "learning_rate": 0.0004998608654790741, "loss": 6.6576, "mean_token_accuracy": 0.12685178518295287, "num_tokens": 2039143.0, "step": 1120 }, { "entropy": 6.840395832061768, "epoch": 0.9669101847872797, "grad_norm": 1.1953125, "learning_rate": 0.000499848929208808, "loss": 6.619, "mean_token_accuracy": 0.13090287074446677, "num_tokens": 2048253.0, "step": 1125 }, { "entropy": 6.833210182189942, "epoch": 0.9712075633863343, "grad_norm": 1.234375, "learning_rate": 0.0004998365019583519, "loss": 6.6747, "mean_token_accuracy": 0.13630941957235337, "num_tokens": 2057234.0, "step": 1130 }, { "entropy": 7.008919525146484, "epoch": 0.975504941985389, "grad_norm": 1.203125, "learning_rate": 0.0004998235837548417, "loss": 6.7058, "mean_token_accuracy": 0.12927891165018082, "num_tokens": 2065431.0, "step": 1135 }, { "entropy": 6.887974071502685, "epoch": 0.9798023205844435, "grad_norm": 1.1015625, "learning_rate": 0.000499810174626486, "loss": 6.7146, "mean_token_accuracy": 0.1267981804907322, "num_tokens": 2074723.0, "step": 1140 }, { "entropy": 6.909135150909424, "epoch": 0.984099699183498, "grad_norm": 1.2265625, "learning_rate": 0.0004997962746025646, "loss": 6.5835, "mean_token_accuracy": 0.13582983165979384, "num_tokens": 2084509.0, "step": 1145 }, { "entropy": 6.8790112972259525, "epoch": 0.9883970777825526, "grad_norm": 1.1875, "learning_rate": 0.0004997818837134298, "loss": 6.7192, "mean_token_accuracy": 0.13046733066439628, "num_tokens": 2093110.0, "step": 1150 }, { "entropy": 6.820547676086425, "epoch": 0.9926944563816072, "grad_norm": 1.1484375, "learning_rate": 0.0004997670019905057, "loss": 6.5939, "mean_token_accuracy": 0.12773325443267822, "num_tokens": 2102355.0, "step": 1155 }, { "entropy": 6.849571800231933, "epoch": 0.9969918349806618, "grad_norm": 1.2109375, "learning_rate": 0.0004997516294662876, "loss": 6.6207, "mean_token_accuracy": 0.1278907351195812, "num_tokens": 2110418.0, "step": 1160 }, { "entropy": 6.932281441158718, "epoch": 1.0008594757198108, "grad_norm": 1.1796875, "learning_rate": 0.0004997357661743433, "loss": 6.6076, "mean_token_accuracy": 0.13429299659199184, "num_tokens": 2117866.0, "step": 1165 }, { "entropy": 6.776707983016967, "epoch": 1.0051568543188656, "grad_norm": 1.1171875, "learning_rate": 0.0004997194121493118, "loss": 6.4353, "mean_token_accuracy": 0.14019777849316598, "num_tokens": 2126082.0, "step": 1170 }, { "entropy": 6.887734413146973, "epoch": 1.0094542329179201, "grad_norm": 1.0859375, "learning_rate": 0.0004997025674269037, "loss": 6.4211, "mean_token_accuracy": 0.13955733701586723, "num_tokens": 2134042.0, "step": 1175 }, { "entropy": 6.774314117431641, "epoch": 1.0137516115169747, "grad_norm": 1.2109375, "learning_rate": 0.0004996852320439013, "loss": 6.4895, "mean_token_accuracy": 0.13937605321407318, "num_tokens": 2142570.0, "step": 1180 }, { "entropy": 6.8031017780303955, "epoch": 1.0180489901160292, "grad_norm": 1.015625, "learning_rate": 0.0004996674060381578, "loss": 6.4187, "mean_token_accuracy": 0.13786159604787826, "num_tokens": 2151310.0, "step": 1185 }, { "entropy": 6.884524583816528, "epoch": 1.0223463687150838, "grad_norm": 1.2109375, "learning_rate": 0.0004996490894485985, "loss": 6.4993, "mean_token_accuracy": 0.1331610009074211, "num_tokens": 2160662.0, "step": 1190 }, { "entropy": 6.801689147949219, "epoch": 1.0266437473141383, "grad_norm": 1.1484375, "learning_rate": 0.0004996302823152193, "loss": 6.445, "mean_token_accuracy": 0.13591438457369803, "num_tokens": 2170067.0, "step": 1195 }, { "entropy": 6.76284008026123, "epoch": 1.0309411259131929, "grad_norm": 1.15625, "learning_rate": 0.0004996109846790873, "loss": 6.4084, "mean_token_accuracy": 0.14033972024917601, "num_tokens": 2178850.0, "step": 1200 }, { "entropy": 6.71863865852356, "epoch": 1.0352385045122476, "grad_norm": 1.0, "learning_rate": 0.0004995911965823412, "loss": 6.4263, "mean_token_accuracy": 0.1453915849328041, "num_tokens": 2188307.0, "step": 1205 }, { "entropy": 6.847736549377442, "epoch": 1.0395358831113022, "grad_norm": 1.21875, "learning_rate": 0.0004995709180681899, "loss": 6.4144, "mean_token_accuracy": 0.1416982263326645, "num_tokens": 2197026.0, "step": 1210 }, { "entropy": 6.729686546325683, "epoch": 1.0438332617103567, "grad_norm": 1.125, "learning_rate": 0.000499550149180914, "loss": 6.4003, "mean_token_accuracy": 0.13990466818213462, "num_tokens": 2205537.0, "step": 1215 }, { "entropy": 6.780020618438721, "epoch": 1.0481306403094113, "grad_norm": 1.15625, "learning_rate": 0.0004995288899658641, "loss": 6.4298, "mean_token_accuracy": 0.1448238343000412, "num_tokens": 2214508.0, "step": 1220 }, { "entropy": 6.842759847640991, "epoch": 1.0524280189084658, "grad_norm": 1.171875, "learning_rate": 0.0004995071404694619, "loss": 6.5391, "mean_token_accuracy": 0.1354886084794998, "num_tokens": 2223084.0, "step": 1225 }, { "entropy": 6.7924669742584225, "epoch": 1.0567253975075204, "grad_norm": 1.078125, "learning_rate": 0.0004994849007391996, "loss": 6.4679, "mean_token_accuracy": 0.13138427063822747, "num_tokens": 2231406.0, "step": 1230 }, { "entropy": 6.731750345230102, "epoch": 1.061022776106575, "grad_norm": 1.1328125, "learning_rate": 0.0004994621708236401, "loss": 6.3805, "mean_token_accuracy": 0.14119497835636138, "num_tokens": 2239867.0, "step": 1235 }, { "entropy": 6.745153379440308, "epoch": 1.0653201547056295, "grad_norm": 1.2265625, "learning_rate": 0.000499438950772416, "loss": 6.4467, "mean_token_accuracy": 0.1372622825205326, "num_tokens": 2248844.0, "step": 1240 }, { "entropy": 6.710582876205445, "epoch": 1.0696175333046842, "grad_norm": 1.078125, "learning_rate": 0.0004994152406362311, "loss": 6.3633, "mean_token_accuracy": 0.14102791994810104, "num_tokens": 2257599.0, "step": 1245 }, { "entropy": 6.773756074905395, "epoch": 1.0739149119037388, "grad_norm": 1.296875, "learning_rate": 0.0004993910404668586, "loss": 6.418, "mean_token_accuracy": 0.13638516888022423, "num_tokens": 2266510.0, "step": 1250 }, { "entropy": 6.720381832122802, "epoch": 1.0782122905027933, "grad_norm": 1.03125, "learning_rate": 0.000499366350317142, "loss": 6.4145, "mean_token_accuracy": 0.1418795846402645, "num_tokens": 2275462.0, "step": 1255 }, { "entropy": 6.712311601638794, "epoch": 1.0825096691018479, "grad_norm": 1.15625, "learning_rate": 0.0004993411702409948, "loss": 6.3874, "mean_token_accuracy": 0.1354715533554554, "num_tokens": 2283826.0, "step": 1260 }, { "entropy": 6.76007399559021, "epoch": 1.0868070477009024, "grad_norm": 1.3203125, "learning_rate": 0.0004993155002934002, "loss": 6.3997, "mean_token_accuracy": 0.13856483697891236, "num_tokens": 2292967.0, "step": 1265 }, { "entropy": 6.8389280319213865, "epoch": 1.091104426299957, "grad_norm": 1.7109375, "learning_rate": 0.0004992893405304111, "loss": 6.5262, "mean_token_accuracy": 0.13781826868653296, "num_tokens": 2302336.0, "step": 1270 }, { "entropy": 6.64991979598999, "epoch": 1.0954018048990115, "grad_norm": 1.078125, "learning_rate": 0.00049926269100915, "loss": 6.4293, "mean_token_accuracy": 0.1432204395532608, "num_tokens": 2311465.0, "step": 1275 }, { "entropy": 6.792691707611084, "epoch": 1.0996991834980663, "grad_norm": 1.140625, "learning_rate": 0.0004992355517878087, "loss": 6.542, "mean_token_accuracy": 0.13071493357419967, "num_tokens": 2320281.0, "step": 1280 }, { "entropy": 6.689556837081909, "epoch": 1.1039965620971208, "grad_norm": 1.171875, "learning_rate": 0.0004992079229256484, "loss": 6.4431, "mean_token_accuracy": 0.1360026031732559, "num_tokens": 2329755.0, "step": 1285 }, { "entropy": 6.6757041931152346, "epoch": 1.1082939406961754, "grad_norm": 1.0546875, "learning_rate": 0.0004991798044829996, "loss": 6.3861, "mean_token_accuracy": 0.1369478650391102, "num_tokens": 2338807.0, "step": 1290 }, { "entropy": 6.7733612060546875, "epoch": 1.11259131929523, "grad_norm": 1.171875, "learning_rate": 0.0004991511965212618, "loss": 6.4719, "mean_token_accuracy": 0.13780709579586983, "num_tokens": 2348056.0, "step": 1295 }, { "entropy": 6.688971424102784, "epoch": 1.1168886978942845, "grad_norm": 1.1171875, "learning_rate": 0.0004991220991029032, "loss": 6.4868, "mean_token_accuracy": 0.13366840407252312, "num_tokens": 2357780.0, "step": 1300 }, { "entropy": 6.773650407791138, "epoch": 1.121186076493339, "grad_norm": 1.3046875, "learning_rate": 0.000499092512291461, "loss": 6.4446, "mean_token_accuracy": 0.13651487827301026, "num_tokens": 2367060.0, "step": 1305 }, { "entropy": 6.7718230247497555, "epoch": 1.1254834550923936, "grad_norm": 1.0703125, "learning_rate": 0.000499062436151541, "loss": 6.441, "mean_token_accuracy": 0.1382215812802315, "num_tokens": 2375751.0, "step": 1310 }, { "entropy": 6.800968360900879, "epoch": 1.129780833691448, "grad_norm": 1.1640625, "learning_rate": 0.0004990318707488173, "loss": 6.5069, "mean_token_accuracy": 0.13017478883266448, "num_tokens": 2385013.0, "step": 1315 }, { "entropy": 6.692961692810059, "epoch": 1.1340782122905029, "grad_norm": 1.1953125, "learning_rate": 0.0004990008161500327, "loss": 6.3937, "mean_token_accuracy": 0.14006393477320672, "num_tokens": 2392935.0, "step": 1320 }, { "entropy": 6.706206512451172, "epoch": 1.1383755908895574, "grad_norm": 1.2578125, "learning_rate": 0.000498969272422998, "loss": 6.4188, "mean_token_accuracy": 0.1468452200293541, "num_tokens": 2401560.0, "step": 1325 }, { "entropy": 6.711210012435913, "epoch": 1.142672969488612, "grad_norm": 1.1328125, "learning_rate": 0.0004989372396365921, "loss": 6.3447, "mean_token_accuracy": 0.1455326870083809, "num_tokens": 2410050.0, "step": 1330 }, { "entropy": 6.756243276596069, "epoch": 1.1469703480876665, "grad_norm": 1.1796875, "learning_rate": 0.0004989047178607618, "loss": 6.4505, "mean_token_accuracy": 0.13842038065195084, "num_tokens": 2418980.0, "step": 1335 }, { "entropy": 6.671654081344604, "epoch": 1.151267726686721, "grad_norm": 1.1328125, "learning_rate": 0.0004988717071665215, "loss": 6.4407, "mean_token_accuracy": 0.13684784546494483, "num_tokens": 2427992.0, "step": 1340 }, { "entropy": 6.762688112258911, "epoch": 1.1555651052857756, "grad_norm": 1.046875, "learning_rate": 0.0004988382076259537, "loss": 6.3572, "mean_token_accuracy": 0.14135119169950486, "num_tokens": 2436368.0, "step": 1345 }, { "entropy": 6.5892657279968265, "epoch": 1.1598624838848304, "grad_norm": 1.0546875, "learning_rate": 0.0004988042193122077, "loss": 6.3456, "mean_token_accuracy": 0.14492984861135483, "num_tokens": 2445499.0, "step": 1350 }, { "entropy": 6.752876138687133, "epoch": 1.164159862483885, "grad_norm": 1.2265625, "learning_rate": 0.0004987697422995005, "loss": 6.3818, "mean_token_accuracy": 0.13490121066570282, "num_tokens": 2454312.0, "step": 1355 }, { "entropy": 6.647862577438355, "epoch": 1.1684572410829395, "grad_norm": 1.109375, "learning_rate": 0.0004987347766631161, "loss": 6.4437, "mean_token_accuracy": 0.1407245770096779, "num_tokens": 2462922.0, "step": 1360 }, { "entropy": 6.755164289474488, "epoch": 1.172754619681994, "grad_norm": 1.0703125, "learning_rate": 0.0004986993224794055, "loss": 6.4781, "mean_token_accuracy": 0.13789629712700843, "num_tokens": 2472195.0, "step": 1365 }, { "entropy": 6.6456316947937015, "epoch": 1.1770519982810486, "grad_norm": 1.1953125, "learning_rate": 0.0004986633798257865, "loss": 6.3829, "mean_token_accuracy": 0.14376115351915358, "num_tokens": 2481021.0, "step": 1370 }, { "entropy": 6.657115125656128, "epoch": 1.181349376880103, "grad_norm": 1.15625, "learning_rate": 0.0004986269487807434, "loss": 6.405, "mean_token_accuracy": 0.13883866667747496, "num_tokens": 2490250.0, "step": 1375 }, { "entropy": 6.763047981262207, "epoch": 1.1856467554791577, "grad_norm": 1.0859375, "learning_rate": 0.000498590029423827, "loss": 6.4581, "mean_token_accuracy": 0.14272229447960855, "num_tokens": 2499122.0, "step": 1380 }, { "entropy": 6.686977815628052, "epoch": 1.1899441340782122, "grad_norm": 1.109375, "learning_rate": 0.0004985526218356546, "loss": 6.4227, "mean_token_accuracy": 0.13726608753204345, "num_tokens": 2508454.0, "step": 1385 }, { "entropy": 6.699887418746949, "epoch": 1.1942415126772667, "grad_norm": 1.1328125, "learning_rate": 0.0004985147260979093, "loss": 6.3632, "mean_token_accuracy": 0.1465839110314846, "num_tokens": 2517353.0, "step": 1390 }, { "entropy": 6.691904354095459, "epoch": 1.1985388912763215, "grad_norm": 1.1796875, "learning_rate": 0.0004984763422933402, "loss": 6.3821, "mean_token_accuracy": 0.14337702393531798, "num_tokens": 2526321.0, "step": 1395 }, { "entropy": 6.6859358787536625, "epoch": 1.202836269875376, "grad_norm": 1.0078125, "learning_rate": 0.0004984374705057623, "loss": 6.4144, "mean_token_accuracy": 0.14242582842707635, "num_tokens": 2535924.0, "step": 1400 }, { "entropy": 6.640392780303955, "epoch": 1.2071336484744306, "grad_norm": 1.171875, "learning_rate": 0.0004983981108200561, "loss": 6.3922, "mean_token_accuracy": 0.1401688925921917, "num_tokens": 2545606.0, "step": 1405 }, { "entropy": 6.649671459197998, "epoch": 1.2114310270734852, "grad_norm": 1.171875, "learning_rate": 0.0004983582633221672, "loss": 6.3859, "mean_token_accuracy": 0.1407300591468811, "num_tokens": 2554947.0, "step": 1410 }, { "entropy": 6.765527582168579, "epoch": 1.2157284056725397, "grad_norm": 1.0234375, "learning_rate": 0.0004983179280991068, "loss": 6.5354, "mean_token_accuracy": 0.13627680763602257, "num_tokens": 2564462.0, "step": 1415 }, { "entropy": 6.688222122192383, "epoch": 1.2200257842715942, "grad_norm": 1.1328125, "learning_rate": 0.0004982771052389508, "loss": 6.3743, "mean_token_accuracy": 0.1444454774260521, "num_tokens": 2573124.0, "step": 1420 }, { "entropy": 6.700618696212769, "epoch": 1.224323162870649, "grad_norm": 1.1484375, "learning_rate": 0.0004982357948308401, "loss": 6.4798, "mean_token_accuracy": 0.13040754944086075, "num_tokens": 2581829.0, "step": 1425 }, { "entropy": 6.7136975765228275, "epoch": 1.2286205414697036, "grad_norm": 1.1328125, "learning_rate": 0.0004981939969649799, "loss": 6.3405, "mean_token_accuracy": 0.1422662131488323, "num_tokens": 2590631.0, "step": 1430 }, { "entropy": 6.661464500427246, "epoch": 1.232917920068758, "grad_norm": 1.1796875, "learning_rate": 0.0004981517117326404, "loss": 6.4484, "mean_token_accuracy": 0.13987314701080322, "num_tokens": 2600684.0, "step": 1435 }, { "entropy": 6.6479767799377445, "epoch": 1.2372152986678127, "grad_norm": 1.0859375, "learning_rate": 0.0004981089392261553, "loss": 6.3605, "mean_token_accuracy": 0.14449947997927665, "num_tokens": 2609667.0, "step": 1440 }, { "entropy": 6.643135976791382, "epoch": 1.2415126772668672, "grad_norm": 1.0, "learning_rate": 0.000498065679538923, "loss": 6.4317, "mean_token_accuracy": 0.14703501164913177, "num_tokens": 2620025.0, "step": 1445 }, { "entropy": 6.672731685638428, "epoch": 1.2458100558659218, "grad_norm": 1.1484375, "learning_rate": 0.0004980219327654049, "loss": 6.351, "mean_token_accuracy": 0.14008775800466539, "num_tokens": 2629032.0, "step": 1450 }, { "entropy": 6.605780506134034, "epoch": 1.2501074344649763, "grad_norm": 1.15625, "learning_rate": 0.000497977699001127, "loss": 6.3357, "mean_token_accuracy": 0.1428795799612999, "num_tokens": 2638303.0, "step": 1455 }, { "entropy": 6.698618459701538, "epoch": 1.2544048130640308, "grad_norm": 1.1328125, "learning_rate": 0.0004979329783426778, "loss": 6.3527, "mean_token_accuracy": 0.14518981352448462, "num_tokens": 2647902.0, "step": 1460 }, { "entropy": 6.619544601440429, "epoch": 1.2587021916630854, "grad_norm": 1.1015625, "learning_rate": 0.0004978877708877094, "loss": 6.4046, "mean_token_accuracy": 0.1414396196603775, "num_tokens": 2657902.0, "step": 1465 }, { "entropy": 6.67303991317749, "epoch": 1.2629995702621402, "grad_norm": 1.09375, "learning_rate": 0.0004978420767349368, "loss": 6.3504, "mean_token_accuracy": 0.14340997561812402, "num_tokens": 2667082.0, "step": 1470 }, { "entropy": 6.647952270507813, "epoch": 1.2672969488611947, "grad_norm": 1.0546875, "learning_rate": 0.0004977958959841379, "loss": 6.4223, "mean_token_accuracy": 0.1364084042608738, "num_tokens": 2676855.0, "step": 1475 }, { "entropy": 6.6442427158355715, "epoch": 1.2715943274602493, "grad_norm": 1.1015625, "learning_rate": 0.000497749228736153, "loss": 6.3546, "mean_token_accuracy": 0.145116026699543, "num_tokens": 2685750.0, "step": 1480 }, { "entropy": 6.597840929031372, "epoch": 1.2758917060593038, "grad_norm": 1.1953125, "learning_rate": 0.0004977020750928845, "loss": 6.4075, "mean_token_accuracy": 0.14761355221271516, "num_tokens": 2695272.0, "step": 1485 }, { "entropy": 6.709882497787476, "epoch": 1.2801890846583583, "grad_norm": 1.0703125, "learning_rate": 0.0004976544351572973, "loss": 6.3504, "mean_token_accuracy": 0.1418570265173912, "num_tokens": 2704806.0, "step": 1490 }, { "entropy": 6.533363771438599, "epoch": 1.2844864632574131, "grad_norm": 1.09375, "learning_rate": 0.0004976063090334179, "loss": 6.4036, "mean_token_accuracy": 0.1452034071087837, "num_tokens": 2713521.0, "step": 1495 }, { "entropy": 6.7042053699493405, "epoch": 1.2887838418564677, "grad_norm": 1.171875, "learning_rate": 0.0004975576968263346, "loss": 6.3966, "mean_token_accuracy": 0.1381194919347763, "num_tokens": 2721848.0, "step": 1500 }, { "epoch": 1.2887838418564677, "eval_entropy": 6.494678375957249, "eval_loss": 6.482933044433594, "eval_mean_token_accuracy": 0.14236528785513328, "eval_num_tokens": 2721848.0, "eval_runtime": 2.0538, "eval_samples_per_second": 1728.039, "eval_steps_per_second": 216.187, "step": 1500 }, { "entropy": 6.592136430740356, "epoch": 1.2930812204555222, "grad_norm": 0.9921875, "learning_rate": 0.000497508598642197, "loss": 6.3613, "mean_token_accuracy": 0.14413030222058296, "num_tokens": 2731473.0, "step": 1505 }, { "entropy": 6.610020494461059, "epoch": 1.2973785990545768, "grad_norm": 1.09375, "learning_rate": 0.000497459014588216, "loss": 6.4326, "mean_token_accuracy": 0.141157578676939, "num_tokens": 2739867.0, "step": 1510 }, { "entropy": 6.684322929382324, "epoch": 1.3016759776536313, "grad_norm": 1.15625, "learning_rate": 0.000497408944772663, "loss": 6.3442, "mean_token_accuracy": 0.14187844544649125, "num_tokens": 2748903.0, "step": 1515 }, { "entropy": 6.512551116943359, "epoch": 1.3059733562526858, "grad_norm": 1.1015625, "learning_rate": 0.0004973583893048707, "loss": 6.3389, "mean_token_accuracy": 0.14152248129248618, "num_tokens": 2757711.0, "step": 1520 }, { "entropy": 6.74653639793396, "epoch": 1.3102707348517404, "grad_norm": 1.1328125, "learning_rate": 0.0004973073482952321, "loss": 6.358, "mean_token_accuracy": 0.140853676199913, "num_tokens": 2765633.0, "step": 1525 }, { "entropy": 6.572407674789429, "epoch": 1.314568113450795, "grad_norm": 1.3203125, "learning_rate": 0.0004972558218552004, "loss": 6.3982, "mean_token_accuracy": 0.14053191468119622, "num_tokens": 2774495.0, "step": 1530 }, { "entropy": 6.645643854141236, "epoch": 1.3188654920498495, "grad_norm": 1.1640625, "learning_rate": 0.0004972038100972885, "loss": 6.4066, "mean_token_accuracy": 0.1426756389439106, "num_tokens": 2782665.0, "step": 1535 }, { "entropy": 6.549836540222168, "epoch": 1.323162870648904, "grad_norm": 1.3671875, "learning_rate": 0.0004971513131350697, "loss": 6.356, "mean_token_accuracy": 0.13861292153596877, "num_tokens": 2791394.0, "step": 1540 }, { "entropy": 6.566079998016358, "epoch": 1.3274602492479588, "grad_norm": 1.2265625, "learning_rate": 0.0004970983310831759, "loss": 6.3437, "mean_token_accuracy": 0.1422226123511791, "num_tokens": 2800488.0, "step": 1545 }, { "entropy": 6.6656012535095215, "epoch": 1.3317576278470133, "grad_norm": 1.03125, "learning_rate": 0.0004970448640572989, "loss": 6.4644, "mean_token_accuracy": 0.14133307337760925, "num_tokens": 2810116.0, "step": 1550 }, { "entropy": 6.59561824798584, "epoch": 1.336055006446068, "grad_norm": 0.984375, "learning_rate": 0.0004969909121741895, "loss": 6.2592, "mean_token_accuracy": 0.14750397205352783, "num_tokens": 2819205.0, "step": 1555 }, { "entropy": 6.559555625915527, "epoch": 1.3403523850451224, "grad_norm": 1.140625, "learning_rate": 0.0004969364755516569, "loss": 6.3311, "mean_token_accuracy": 0.14398850798606871, "num_tokens": 2828017.0, "step": 1560 }, { "entropy": 6.688138008117676, "epoch": 1.344649763644177, "grad_norm": 1.1484375, "learning_rate": 0.0004968815543085689, "loss": 6.3815, "mean_token_accuracy": 0.145321074873209, "num_tokens": 2837125.0, "step": 1565 }, { "entropy": 6.569426822662353, "epoch": 1.3489471422432318, "grad_norm": 1.1015625, "learning_rate": 0.0004968261485648516, "loss": 6.3921, "mean_token_accuracy": 0.14212561994791031, "num_tokens": 2845438.0, "step": 1570 }, { "entropy": 6.608628225326538, "epoch": 1.3532445208422863, "grad_norm": 1.0546875, "learning_rate": 0.000496770258441489, "loss": 6.3689, "mean_token_accuracy": 0.1471138596534729, "num_tokens": 2854389.0, "step": 1575 }, { "entropy": 6.556783771514892, "epoch": 1.3575418994413408, "grad_norm": 1.0859375, "learning_rate": 0.0004967138840605228, "loss": 6.3281, "mean_token_accuracy": 0.14712274819612503, "num_tokens": 2863654.0, "step": 1580 }, { "entropy": 6.517911720275879, "epoch": 1.3618392780403954, "grad_norm": 1.1171875, "learning_rate": 0.000496657025545052, "loss": 6.2482, "mean_token_accuracy": 0.15075734853744507, "num_tokens": 2872871.0, "step": 1585 }, { "entropy": 6.5070977210998535, "epoch": 1.36613665663945, "grad_norm": 1.15625, "learning_rate": 0.000496599683019233, "loss": 6.3373, "mean_token_accuracy": 0.1449791297316551, "num_tokens": 2881140.0, "step": 1590 }, { "entropy": 6.6506085872650145, "epoch": 1.3704340352385045, "grad_norm": 1.09375, "learning_rate": 0.000496541856608279, "loss": 6.3251, "mean_token_accuracy": 0.14629032611846923, "num_tokens": 2889945.0, "step": 1595 }, { "entropy": 6.464802026748657, "epoch": 1.374731413837559, "grad_norm": 0.9921875, "learning_rate": 0.0004964835464384595, "loss": 6.254, "mean_token_accuracy": 0.14956037551164628, "num_tokens": 2898897.0, "step": 1600 }, { "entropy": 6.606829452514648, "epoch": 1.3790287924366136, "grad_norm": 1.1484375, "learning_rate": 0.000496424752637101, "loss": 6.2819, "mean_token_accuracy": 0.15412394404411317, "num_tokens": 2907717.0, "step": 1605 }, { "entropy": 6.513754224777221, "epoch": 1.3833261710356681, "grad_norm": 1.109375, "learning_rate": 0.0004963654753325853, "loss": 6.2693, "mean_token_accuracy": 0.1435668349266052, "num_tokens": 2916213.0, "step": 1610 }, { "entropy": 6.6343999862670895, "epoch": 1.387623549634723, "grad_norm": 1.03125, "learning_rate": 0.0004963057146543505, "loss": 6.4423, "mean_token_accuracy": 0.13986597284674646, "num_tokens": 2925706.0, "step": 1615 }, { "entropy": 6.570179843902588, "epoch": 1.3919209282337774, "grad_norm": 1.0546875, "learning_rate": 0.00049624547073289, "loss": 6.3511, "mean_token_accuracy": 0.13794696033000947, "num_tokens": 2934464.0, "step": 1620 }, { "entropy": 6.570999479293823, "epoch": 1.396218306832832, "grad_norm": 1.171875, "learning_rate": 0.0004961847436997526, "loss": 6.2482, "mean_token_accuracy": 0.1511821575462818, "num_tokens": 2944095.0, "step": 1625 }, { "entropy": 6.450803470611572, "epoch": 1.4005156854318865, "grad_norm": 1.1484375, "learning_rate": 0.0004961235336875416, "loss": 6.249, "mean_token_accuracy": 0.1513315513730049, "num_tokens": 2953357.0, "step": 1630 }, { "entropy": 6.5238546371459964, "epoch": 1.404813064030941, "grad_norm": 1.1484375, "learning_rate": 0.0004960618408299154, "loss": 6.4089, "mean_token_accuracy": 0.1346985176205635, "num_tokens": 2963020.0, "step": 1635 }, { "entropy": 6.61925859451294, "epoch": 1.4091104426299956, "grad_norm": 1.0859375, "learning_rate": 0.0004959996652615865, "loss": 6.2427, "mean_token_accuracy": 0.1468616619706154, "num_tokens": 2971955.0, "step": 1640 }, { "entropy": 6.584984397888183, "epoch": 1.4134078212290504, "grad_norm": 1.1015625, "learning_rate": 0.0004959370071183216, "loss": 6.3097, "mean_token_accuracy": 0.14391712918877603, "num_tokens": 2980662.0, "step": 1645 }, { "entropy": 6.6156212329864506, "epoch": 1.417705199828105, "grad_norm": 1.2734375, "learning_rate": 0.0004958738665369407, "loss": 6.439, "mean_token_accuracy": 0.12904247269034386, "num_tokens": 2990038.0, "step": 1650 }, { "entropy": 6.566392660140991, "epoch": 1.4220025784271595, "grad_norm": 1.1875, "learning_rate": 0.0004958102436553179, "loss": 6.3627, "mean_token_accuracy": 0.1401166081428528, "num_tokens": 2999835.0, "step": 1655 }, { "entropy": 6.622867441177368, "epoch": 1.426299957026214, "grad_norm": 1.0234375, "learning_rate": 0.00049574613861238, "loss": 6.3528, "mean_token_accuracy": 0.1401872843503952, "num_tokens": 3009593.0, "step": 1660 }, { "entropy": 6.564433908462524, "epoch": 1.4305973356252686, "grad_norm": 1.0546875, "learning_rate": 0.0004956815515481069, "loss": 6.3748, "mean_token_accuracy": 0.14576212018728257, "num_tokens": 3019187.0, "step": 1665 }, { "entropy": 6.528054189682007, "epoch": 1.4348947142243231, "grad_norm": 1.1171875, "learning_rate": 0.0004956164826035309, "loss": 6.2839, "mean_token_accuracy": 0.14402172416448594, "num_tokens": 3027875.0, "step": 1670 }, { "entropy": 6.481614637374878, "epoch": 1.4391920928233777, "grad_norm": 1.1484375, "learning_rate": 0.0004955509319207363, "loss": 6.3184, "mean_token_accuracy": 0.14420104324817656, "num_tokens": 3036902.0, "step": 1675 }, { "entropy": 6.46042537689209, "epoch": 1.4434894714224322, "grad_norm": 0.96875, "learning_rate": 0.0004954848996428601, "loss": 6.2969, "mean_token_accuracy": 0.1498032405972481, "num_tokens": 3046653.0, "step": 1680 }, { "entropy": 6.64046082496643, "epoch": 1.4477868500214868, "grad_norm": 1.3203125, "learning_rate": 0.00049541838591409, "loss": 6.3977, "mean_token_accuracy": 0.14052897915244103, "num_tokens": 3056273.0, "step": 1685 }, { "entropy": 6.529829502105713, "epoch": 1.4520842286205415, "grad_norm": 1.078125, "learning_rate": 0.0004953513908796657, "loss": 6.2999, "mean_token_accuracy": 0.13732842430472375, "num_tokens": 3065662.0, "step": 1690 }, { "entropy": 6.594562721252442, "epoch": 1.456381607219596, "grad_norm": 1.1953125, "learning_rate": 0.0004952839146858773, "loss": 6.3277, "mean_token_accuracy": 0.14757051467895507, "num_tokens": 3073970.0, "step": 1695 }, { "entropy": 6.531829500198365, "epoch": 1.4606789858186506, "grad_norm": 1.1875, "learning_rate": 0.0004952159574800658, "loss": 6.3209, "mean_token_accuracy": 0.14381522089242935, "num_tokens": 3082500.0, "step": 1700 }, { "entropy": 6.566446447372437, "epoch": 1.4649763644177052, "grad_norm": 1.1171875, "learning_rate": 0.0004951475194106229, "loss": 6.2777, "mean_token_accuracy": 0.14633866250514985, "num_tokens": 3091574.0, "step": 1705 }, { "entropy": 6.512380361557007, "epoch": 1.4692737430167597, "grad_norm": 1.046875, "learning_rate": 0.0004950786006269898, "loss": 6.3852, "mean_token_accuracy": 0.13938545510172845, "num_tokens": 3102402.0, "step": 1710 }, { "entropy": 6.59727463722229, "epoch": 1.4735711216158143, "grad_norm": 1.1640625, "learning_rate": 0.0004950092012796576, "loss": 6.2072, "mean_token_accuracy": 0.15373199433088303, "num_tokens": 3111347.0, "step": 1715 }, { "entropy": 6.486224889755249, "epoch": 1.477868500214869, "grad_norm": 1.1640625, "learning_rate": 0.0004949393215201666, "loss": 6.2833, "mean_token_accuracy": 0.14614666104316712, "num_tokens": 3120018.0, "step": 1720 }, { "entropy": 6.4936051845550535, "epoch": 1.4821658788139236, "grad_norm": 1.0, "learning_rate": 0.0004948689615011065, "loss": 6.3484, "mean_token_accuracy": 0.13831731379032136, "num_tokens": 3129669.0, "step": 1725 }, { "entropy": 6.6139086246490475, "epoch": 1.4864632574129781, "grad_norm": 0.98828125, "learning_rate": 0.0004947981213761154, "loss": 6.2794, "mean_token_accuracy": 0.15020036697387695, "num_tokens": 3139112.0, "step": 1730 }, { "entropy": 6.510036754608154, "epoch": 1.4907606360120327, "grad_norm": 1.09375, "learning_rate": 0.0004947268012998797, "loss": 6.2427, "mean_token_accuracy": 0.15479698032140732, "num_tokens": 3148437.0, "step": 1735 }, { "entropy": 6.490271472930909, "epoch": 1.4950580146110872, "grad_norm": 0.984375, "learning_rate": 0.000494655001428134, "loss": 6.2146, "mean_token_accuracy": 0.15289759933948516, "num_tokens": 3158165.0, "step": 1740 }, { "entropy": 6.521289396286011, "epoch": 1.4993553932101418, "grad_norm": 1.09375, "learning_rate": 0.0004945827219176604, "loss": 6.3026, "mean_token_accuracy": 0.1522263005375862, "num_tokens": 3167262.0, "step": 1745 }, { "entropy": 6.448360395431519, "epoch": 1.5036527718091963, "grad_norm": 1.03125, "learning_rate": 0.0004945099629262888, "loss": 6.2841, "mean_token_accuracy": 0.14779476150870324, "num_tokens": 3176696.0, "step": 1750 }, { "entropy": 6.6200721740722654, "epoch": 1.5079501504082509, "grad_norm": 1.109375, "learning_rate": 0.0004944367246128954, "loss": 6.3626, "mean_token_accuracy": 0.1411810874938965, "num_tokens": 3185857.0, "step": 1755 }, { "entropy": 6.497649145126343, "epoch": 1.5122475290073054, "grad_norm": 1.09375, "learning_rate": 0.0004943630071374036, "loss": 6.2129, "mean_token_accuracy": 0.15686369836330413, "num_tokens": 3194687.0, "step": 1760 }, { "entropy": 6.447890901565552, "epoch": 1.51654490760636, "grad_norm": 1.03125, "learning_rate": 0.0004942888106607828, "loss": 6.2715, "mean_token_accuracy": 0.14421172440052032, "num_tokens": 3204913.0, "step": 1765 }, { "entropy": 6.556134462356567, "epoch": 1.5208422862054147, "grad_norm": 1.0625, "learning_rate": 0.0004942141353450486, "loss": 6.2587, "mean_token_accuracy": 0.14712465703487396, "num_tokens": 3213312.0, "step": 1770 }, { "entropy": 6.4831544876098635, "epoch": 1.5251396648044693, "grad_norm": 0.9921875, "learning_rate": 0.0004941389813532619, "loss": 6.1822, "mean_token_accuracy": 0.1586100459098816, "num_tokens": 3222992.0, "step": 1775 }, { "entropy": 6.385056638717652, "epoch": 1.5294370434035238, "grad_norm": 1.0625, "learning_rate": 0.000494063348849529, "loss": 6.2213, "mean_token_accuracy": 0.15424711555242537, "num_tokens": 3232836.0, "step": 1780 }, { "entropy": 6.574507141113282, "epoch": 1.5337344220025786, "grad_norm": 0.98046875, "learning_rate": 0.0004939872379990011, "loss": 6.3769, "mean_token_accuracy": 0.14118290394544603, "num_tokens": 3243171.0, "step": 1785 }, { "entropy": 6.56547212600708, "epoch": 1.5380318006016331, "grad_norm": 1.203125, "learning_rate": 0.0004939106489678739, "loss": 6.2954, "mean_token_accuracy": 0.15190573930740356, "num_tokens": 3251995.0, "step": 1790 }, { "entropy": 6.440187692642212, "epoch": 1.5423291792006877, "grad_norm": 1.0390625, "learning_rate": 0.000493833581923387, "loss": 6.2474, "mean_token_accuracy": 0.14897289276123046, "num_tokens": 3260841.0, "step": 1795 }, { "entropy": 6.5475788593292235, "epoch": 1.5466265577997422, "grad_norm": 1.078125, "learning_rate": 0.0004937560370338244, "loss": 6.382, "mean_token_accuracy": 0.14083073958754538, "num_tokens": 3270979.0, "step": 1800 }, { "entropy": 6.536606645584106, "epoch": 1.5509239363987968, "grad_norm": 1.1015625, "learning_rate": 0.000493678014468513, "loss": 6.307, "mean_token_accuracy": 0.1528750814497471, "num_tokens": 3279848.0, "step": 1805 }, { "entropy": 6.46652889251709, "epoch": 1.5552213149978513, "grad_norm": 0.9921875, "learning_rate": 0.0004935995143978227, "loss": 6.311, "mean_token_accuracy": 0.14874453395605086, "num_tokens": 3289172.0, "step": 1810 }, { "entropy": 6.480955171585083, "epoch": 1.5595186935969059, "grad_norm": 1.1796875, "learning_rate": 0.0004935205369931664, "loss": 6.2107, "mean_token_accuracy": 0.15236888080835342, "num_tokens": 3297432.0, "step": 1815 }, { "entropy": 6.62280158996582, "epoch": 1.5638160721959604, "grad_norm": 0.96875, "learning_rate": 0.0004934410824269992, "loss": 6.2391, "mean_token_accuracy": 0.1460478588938713, "num_tokens": 3307486.0, "step": 1820 }, { "entropy": 6.396580219268799, "epoch": 1.568113450795015, "grad_norm": 1.0703125, "learning_rate": 0.0004933611508728182, "loss": 6.2234, "mean_token_accuracy": 0.15543457493185997, "num_tokens": 3316296.0, "step": 1825 }, { "entropy": 6.48117151260376, "epoch": 1.5724108293940695, "grad_norm": 1.0390625, "learning_rate": 0.000493280742505162, "loss": 6.2496, "mean_token_accuracy": 0.14565204530954362, "num_tokens": 3326080.0, "step": 1830 }, { "entropy": 6.399107646942139, "epoch": 1.576708207993124, "grad_norm": 1.1328125, "learning_rate": 0.0004931998574996102, "loss": 6.1637, "mean_token_accuracy": 0.1557439833879471, "num_tokens": 3334826.0, "step": 1835 }, { "entropy": 6.395985460281372, "epoch": 1.5810055865921788, "grad_norm": 1.1171875, "learning_rate": 0.0004931184960327832, "loss": 6.166, "mean_token_accuracy": 0.15891503393650055, "num_tokens": 3343261.0, "step": 1840 }, { "entropy": 6.439464569091797, "epoch": 1.5853029651912334, "grad_norm": 1.6953125, "learning_rate": 0.0004930366582823421, "loss": 6.2095, "mean_token_accuracy": 0.14784578159451484, "num_tokens": 3352513.0, "step": 1845 }, { "entropy": 6.446910238265991, "epoch": 1.589600343790288, "grad_norm": 1.203125, "learning_rate": 0.0004929543444269879, "loss": 6.2679, "mean_token_accuracy": 0.15295199751853944, "num_tokens": 3361577.0, "step": 1850 }, { "entropy": 6.4689103126525875, "epoch": 1.5938977223893425, "grad_norm": 1.171875, "learning_rate": 0.000492871554646461, "loss": 6.327, "mean_token_accuracy": 0.14370332658290863, "num_tokens": 3370591.0, "step": 1855 }, { "entropy": 6.443254470825195, "epoch": 1.5981951009883972, "grad_norm": 1.0859375, "learning_rate": 0.0004927882891215413, "loss": 6.2437, "mean_token_accuracy": 0.14615294709801674, "num_tokens": 3379761.0, "step": 1860 }, { "entropy": 6.549100685119629, "epoch": 1.6024924795874518, "grad_norm": 1.203125, "learning_rate": 0.0004927045480340475, "loss": 6.3212, "mean_token_accuracy": 0.1414845257997513, "num_tokens": 3388974.0, "step": 1865 }, { "entropy": 6.428477334976196, "epoch": 1.6067898581865063, "grad_norm": 1.015625, "learning_rate": 0.0004926203315668363, "loss": 6.2385, "mean_token_accuracy": 0.15081687197089194, "num_tokens": 3398339.0, "step": 1870 }, { "entropy": 6.499061059951782, "epoch": 1.6110872367855609, "grad_norm": 1.0625, "learning_rate": 0.0004925356399038032, "loss": 6.2121, "mean_token_accuracy": 0.15119217038154603, "num_tokens": 3408292.0, "step": 1875 }, { "entropy": 6.460348415374756, "epoch": 1.6153846153846154, "grad_norm": 1.09375, "learning_rate": 0.0004924504732298808, "loss": 6.1809, "mean_token_accuracy": 0.15673429295420646, "num_tokens": 3417057.0, "step": 1880 }, { "entropy": 6.498525190353393, "epoch": 1.61968199398367, "grad_norm": 1.1171875, "learning_rate": 0.0004923648317310391, "loss": 6.2886, "mean_token_accuracy": 0.15057691931724548, "num_tokens": 3425830.0, "step": 1885 }, { "entropy": 6.466361808776855, "epoch": 1.6239793725827245, "grad_norm": 1.015625, "learning_rate": 0.0004922787155942849, "loss": 6.3261, "mean_token_accuracy": 0.14087508171796798, "num_tokens": 3435513.0, "step": 1890 }, { "entropy": 6.480417251586914, "epoch": 1.628276751181779, "grad_norm": 1.046875, "learning_rate": 0.0004921921250076611, "loss": 6.2319, "mean_token_accuracy": 0.1488749422132969, "num_tokens": 3444684.0, "step": 1895 }, { "entropy": 6.398703765869141, "epoch": 1.6325741297808336, "grad_norm": 1.15625, "learning_rate": 0.0004921050601602475, "loss": 6.309, "mean_token_accuracy": 0.15032647401094437, "num_tokens": 3453454.0, "step": 1900 }, { "entropy": 6.512422227859497, "epoch": 1.6368715083798882, "grad_norm": 1.125, "learning_rate": 0.0004920175212421587, "loss": 6.2317, "mean_token_accuracy": 0.1462756022810936, "num_tokens": 3463228.0, "step": 1905 }, { "entropy": 6.298534250259399, "epoch": 1.6411688869789427, "grad_norm": 1.0546875, "learning_rate": 0.0004919295084445445, "loss": 6.1203, "mean_token_accuracy": 0.15622290521860122, "num_tokens": 3472131.0, "step": 1910 }, { "entropy": 6.46199779510498, "epoch": 1.6454662655779975, "grad_norm": 1.03125, "learning_rate": 0.0004918410219595899, "loss": 6.1947, "mean_token_accuracy": 0.15805622637271882, "num_tokens": 3480642.0, "step": 1915 }, { "entropy": 6.536061143875122, "epoch": 1.649763644177052, "grad_norm": 1.0078125, "learning_rate": 0.000491752061980514, "loss": 6.1748, "mean_token_accuracy": 0.15212914645671843, "num_tokens": 3489346.0, "step": 1920 }, { "entropy": 6.385542201995849, "epoch": 1.6540610227761066, "grad_norm": 1.125, "learning_rate": 0.0004916626287015697, "loss": 6.2236, "mean_token_accuracy": 0.1506744407117367, "num_tokens": 3498473.0, "step": 1925 }, { "entropy": 6.4339292526245115, "epoch": 1.658358401375161, "grad_norm": 1.03125, "learning_rate": 0.0004915727223180436, "loss": 6.2184, "mean_token_accuracy": 0.1503354400396347, "num_tokens": 3507415.0, "step": 1930 }, { "entropy": 6.472232723236084, "epoch": 1.6626557799742159, "grad_norm": 1.03125, "learning_rate": 0.0004914823430262554, "loss": 6.3466, "mean_token_accuracy": 0.13937689363956451, "num_tokens": 3516873.0, "step": 1935 }, { "entropy": 6.475211191177368, "epoch": 1.6669531585732704, "grad_norm": 1.1796875, "learning_rate": 0.0004913914910235573, "loss": 6.2023, "mean_token_accuracy": 0.15309734791517257, "num_tokens": 3525047.0, "step": 1940 }, { "entropy": 6.334531784057617, "epoch": 1.671250537172325, "grad_norm": 1.1015625, "learning_rate": 0.0004913001665083337, "loss": 6.2098, "mean_token_accuracy": 0.1510941930115223, "num_tokens": 3534354.0, "step": 1945 }, { "entropy": 6.499793291091919, "epoch": 1.6755479157713795, "grad_norm": 1.3203125, "learning_rate": 0.0004912083696800008, "loss": 6.2384, "mean_token_accuracy": 0.14515842348337174, "num_tokens": 3543830.0, "step": 1950 }, { "entropy": 6.334777593612671, "epoch": 1.679845294370434, "grad_norm": 1.125, "learning_rate": 0.0004911161007390063, "loss": 6.1344, "mean_token_accuracy": 0.1552545964717865, "num_tokens": 3552314.0, "step": 1955 }, { "entropy": 6.398986530303955, "epoch": 1.6841426729694886, "grad_norm": 1.203125, "learning_rate": 0.0004910233598868287, "loss": 6.2232, "mean_token_accuracy": 0.14675267040729523, "num_tokens": 3561656.0, "step": 1960 }, { "entropy": 6.426092958450317, "epoch": 1.6884400515685432, "grad_norm": 1.09375, "learning_rate": 0.0004909301473259769, "loss": 6.2232, "mean_token_accuracy": 0.14848204478621482, "num_tokens": 3571784.0, "step": 1965 }, { "entropy": 6.454012489318847, "epoch": 1.6927374301675977, "grad_norm": 1.0859375, "learning_rate": 0.0004908364632599899, "loss": 6.1775, "mean_token_accuracy": 0.15773458033800125, "num_tokens": 3580626.0, "step": 1970 }, { "entropy": 6.337477779388427, "epoch": 1.6970348087666522, "grad_norm": 1.0703125, "learning_rate": 0.0004907423078934362, "loss": 6.2001, "mean_token_accuracy": 0.14792972654104233, "num_tokens": 3589916.0, "step": 1975 }, { "entropy": 6.395978736877441, "epoch": 1.7013321873657068, "grad_norm": 1.0546875, "learning_rate": 0.0004906476814319134, "loss": 6.2045, "mean_token_accuracy": 0.15436216294765473, "num_tokens": 3599128.0, "step": 1980 }, { "entropy": 6.384798145294189, "epoch": 1.7056295659647613, "grad_norm": 0.890625, "learning_rate": 0.0004905525840820481, "loss": 6.2156, "mean_token_accuracy": 0.1487440824508667, "num_tokens": 3608764.0, "step": 1985 }, { "entropy": 6.519760847091675, "epoch": 1.709926944563816, "grad_norm": 0.984375, "learning_rate": 0.0004904570160514948, "loss": 6.2587, "mean_token_accuracy": 0.14064486026763917, "num_tokens": 3619082.0, "step": 1990 }, { "entropy": 6.396596527099609, "epoch": 1.7142243231628707, "grad_norm": 1.171875, "learning_rate": 0.0004903609775489358, "loss": 6.2232, "mean_token_accuracy": 0.14829822033643722, "num_tokens": 3628695.0, "step": 1995 }, { "entropy": 6.453386020660401, "epoch": 1.7185217017619252, "grad_norm": 1.1484375, "learning_rate": 0.0004902644687840809, "loss": 6.2106, "mean_token_accuracy": 0.14628567397594452, "num_tokens": 3637599.0, "step": 2000 }, { "epoch": 1.7185217017619252, "eval_entropy": 6.120181280213433, "eval_loss": 6.287801742553711, "eval_mean_token_accuracy": 0.15146609128931085, "eval_num_tokens": 3637599.0, "eval_runtime": 2.0623, "eval_samples_per_second": 1720.853, "eval_steps_per_second": 215.289, "step": 2000 }, { "entropy": 6.389330768585205, "epoch": 1.7228190803609797, "grad_norm": 1.1640625, "learning_rate": 0.0004901674899676667, "loss": 6.189, "mean_token_accuracy": 0.15087567865848542, "num_tokens": 3647406.0, "step": 2005 }, { "entropy": 6.32288761138916, "epoch": 1.7271164589600345, "grad_norm": 1.0546875, "learning_rate": 0.0004900700413114561, "loss": 6.0845, "mean_token_accuracy": 0.15229684859514236, "num_tokens": 3656531.0, "step": 2010 }, { "entropy": 6.2982823848724365, "epoch": 1.731413837559089, "grad_norm": 1.0078125, "learning_rate": 0.000489972123028238, "loss": 6.1711, "mean_token_accuracy": 0.14639344438910484, "num_tokens": 3664922.0, "step": 2015 }, { "entropy": 6.42927360534668, "epoch": 1.7357112161581436, "grad_norm": 1.0625, "learning_rate": 0.0004898737353318268, "loss": 6.114, "mean_token_accuracy": 0.15603691339492798, "num_tokens": 3673283.0, "step": 2020 }, { "entropy": 6.379903554916382, "epoch": 1.7400085947571982, "grad_norm": 1.1796875, "learning_rate": 0.000489774878437062, "loss": 6.2432, "mean_token_accuracy": 0.1512456476688385, "num_tokens": 3681760.0, "step": 2025 }, { "entropy": 6.362637662887574, "epoch": 1.7443059733562527, "grad_norm": 1.0859375, "learning_rate": 0.0004896755525598074, "loss": 6.0576, "mean_token_accuracy": 0.15525488257408143, "num_tokens": 3689408.0, "step": 2030 }, { "entropy": 6.350458097457886, "epoch": 1.7486033519553073, "grad_norm": 1.1484375, "learning_rate": 0.0004895757579169511, "loss": 6.1868, "mean_token_accuracy": 0.1519346058368683, "num_tokens": 3697904.0, "step": 2035 }, { "entropy": 6.549949407577515, "epoch": 1.7529007305543618, "grad_norm": 1.0234375, "learning_rate": 0.0004894754947264047, "loss": 6.2025, "mean_token_accuracy": 0.1540897861123085, "num_tokens": 3706704.0, "step": 2040 }, { "entropy": 6.33614501953125, "epoch": 1.7571981091534163, "grad_norm": 1.140625, "learning_rate": 0.000489374763207103, "loss": 6.2858, "mean_token_accuracy": 0.14759851694107057, "num_tokens": 3715690.0, "step": 2045 }, { "entropy": 6.4482136249542235, "epoch": 1.761495487752471, "grad_norm": 1.1484375, "learning_rate": 0.0004892735635790033, "loss": 6.0651, "mean_token_accuracy": 0.16219264268875122, "num_tokens": 3724835.0, "step": 2050 }, { "entropy": 6.303025627136231, "epoch": 1.7657928663515254, "grad_norm": 0.96484375, "learning_rate": 0.000489171896063085, "loss": 6.0978, "mean_token_accuracy": 0.1608540341258049, "num_tokens": 3733977.0, "step": 2055 }, { "entropy": 6.440810489654541, "epoch": 1.77009024495058, "grad_norm": 1.0859375, "learning_rate": 0.0004890697608813495, "loss": 6.2166, "mean_token_accuracy": 0.14737534075975417, "num_tokens": 3742665.0, "step": 2060 }, { "entropy": 6.50860743522644, "epoch": 1.7743876235496348, "grad_norm": 1.1171875, "learning_rate": 0.0004889671582568193, "loss": 6.2866, "mean_token_accuracy": 0.15046041160821916, "num_tokens": 3751647.0, "step": 2065 }, { "entropy": 6.323904037475586, "epoch": 1.7786850021486893, "grad_norm": 1.1640625, "learning_rate": 0.0004888640884135374, "loss": 6.1804, "mean_token_accuracy": 0.14905625879764556, "num_tokens": 3760852.0, "step": 2070 }, { "entropy": 6.3692279815673825, "epoch": 1.7829823807477438, "grad_norm": 1.2734375, "learning_rate": 0.0004887605515765671, "loss": 6.146, "mean_token_accuracy": 0.1545763321220875, "num_tokens": 3768640.0, "step": 2075 }, { "entropy": 6.432651662826538, "epoch": 1.7872797593467986, "grad_norm": 1.09375, "learning_rate": 0.0004886565479719914, "loss": 6.1701, "mean_token_accuracy": 0.1504896029829979, "num_tokens": 3776504.0, "step": 2080 }, { "entropy": 6.4639040470123295, "epoch": 1.7915771379458532, "grad_norm": 1.15625, "learning_rate": 0.0004885520778269128, "loss": 6.1968, "mean_token_accuracy": 0.15468488037586212, "num_tokens": 3786353.0, "step": 2085 }, { "entropy": 6.380429744720459, "epoch": 1.7958745165449077, "grad_norm": 1.125, "learning_rate": 0.0004884471413694523, "loss": 6.2326, "mean_token_accuracy": 0.14940588921308517, "num_tokens": 3795902.0, "step": 2090 }, { "entropy": 6.3466850280761715, "epoch": 1.8001718951439623, "grad_norm": 0.9453125, "learning_rate": 0.0004883417388287491, "loss": 6.1431, "mean_token_accuracy": 0.14718958735466003, "num_tokens": 3805986.0, "step": 2095 }, { "entropy": 6.3597740650177, "epoch": 1.8044692737430168, "grad_norm": 1.1796875, "learning_rate": 0.0004882358704349603, "loss": 6.2747, "mean_token_accuracy": 0.15220490992069244, "num_tokens": 3814915.0, "step": 2100 }, { "entropy": 6.366986703872681, "epoch": 1.8087666523420713, "grad_norm": 1.1640625, "learning_rate": 0.0004881295364192601, "loss": 6.1506, "mean_token_accuracy": 0.15957469791173934, "num_tokens": 3823966.0, "step": 2105 }, { "entropy": 6.475821685791016, "epoch": 1.813064030941126, "grad_norm": 1.03125, "learning_rate": 0.0004880227370138394, "loss": 6.212, "mean_token_accuracy": 0.14951324909925462, "num_tokens": 3832775.0, "step": 2110 }, { "entropy": 6.301672267913818, "epoch": 1.8173614095401804, "grad_norm": 0.93359375, "learning_rate": 0.0004879154724519057, "loss": 6.1316, "mean_token_accuracy": 0.15576981902122497, "num_tokens": 3842808.0, "step": 2115 }, { "entropy": 6.454287385940551, "epoch": 1.821658788139235, "grad_norm": 1.0703125, "learning_rate": 0.0004878077429676816, "loss": 6.2649, "mean_token_accuracy": 0.14898920953273773, "num_tokens": 3853303.0, "step": 2120 }, { "entropy": 6.3901642799377445, "epoch": 1.8259561667382895, "grad_norm": 1.0703125, "learning_rate": 0.0004876995487964054, "loss": 6.1853, "mean_token_accuracy": 0.14685731381177902, "num_tokens": 3862462.0, "step": 2125 }, { "entropy": 6.411391401290894, "epoch": 1.830253545337344, "grad_norm": 1.046875, "learning_rate": 0.00048759089017432996, "loss": 6.293, "mean_token_accuracy": 0.14782755076885223, "num_tokens": 3871596.0, "step": 2130 }, { "entropy": 6.432213401794433, "epoch": 1.8345509239363988, "grad_norm": 1.0390625, "learning_rate": 0.0004874817673387222, "loss": 6.1972, "mean_token_accuracy": 0.15025533735752106, "num_tokens": 3881276.0, "step": 2135 }, { "entropy": 6.357042551040649, "epoch": 1.8388483025354534, "grad_norm": 0.984375, "learning_rate": 0.00048737218052786275, "loss": 6.2863, "mean_token_accuracy": 0.14599718973040582, "num_tokens": 3891610.0, "step": 2140 }, { "entropy": 6.469332885742188, "epoch": 1.843145681134508, "grad_norm": 0.984375, "learning_rate": 0.00048726212998104554, "loss": 6.2036, "mean_token_accuracy": 0.1476315975189209, "num_tokens": 3900584.0, "step": 2145 }, { "entropy": 6.3720924854278564, "epoch": 1.8474430597335625, "grad_norm": 1.0390625, "learning_rate": 0.0004871516159385768, "loss": 6.1288, "mean_token_accuracy": 0.15351544842123985, "num_tokens": 3910208.0, "step": 2150 }, { "entropy": 6.2077491760253904, "epoch": 1.8517404383326173, "grad_norm": 1.140625, "learning_rate": 0.0004870406386417752, "loss": 6.0609, "mean_token_accuracy": 0.16224084943532943, "num_tokens": 3918424.0, "step": 2155 }, { "entropy": 6.278759956359863, "epoch": 1.8560378169316718, "grad_norm": 1.1328125, "learning_rate": 0.0004869291983329707, "loss": 5.9946, "mean_token_accuracy": 0.16720272302627565, "num_tokens": 3926206.0, "step": 2160 }, { "entropy": 6.399888753890991, "epoch": 1.8603351955307263, "grad_norm": 1.078125, "learning_rate": 0.0004868172952555044, "loss": 6.0991, "mean_token_accuracy": 0.1470145635306835, "num_tokens": 3935769.0, "step": 2165 }, { "entropy": 6.315070724487304, "epoch": 1.864632574129781, "grad_norm": 0.98046875, "learning_rate": 0.0004867049296537278, "loss": 6.0903, "mean_token_accuracy": 0.15458065569400786, "num_tokens": 3945118.0, "step": 2170 }, { "entropy": 6.344206809997559, "epoch": 1.8689299527288354, "grad_norm": 1.40625, "learning_rate": 0.0004865921017730027, "loss": 6.1791, "mean_token_accuracy": 0.15464479327201844, "num_tokens": 3954012.0, "step": 2175 }, { "entropy": 6.429828739166259, "epoch": 1.87322733132789, "grad_norm": 0.96484375, "learning_rate": 0.00048647881185969995, "loss": 6.1931, "mean_token_accuracy": 0.14908381700515747, "num_tokens": 3964239.0, "step": 2180 }, { "entropy": 6.33826584815979, "epoch": 1.8775247099269445, "grad_norm": 1.0390625, "learning_rate": 0.0004863650601611994, "loss": 6.0996, "mean_token_accuracy": 0.1615213319659233, "num_tokens": 3973694.0, "step": 2185 }, { "entropy": 6.327886295318604, "epoch": 1.881822088525999, "grad_norm": 1.078125, "learning_rate": 0.00048625084692588937, "loss": 6.1415, "mean_token_accuracy": 0.1605108693242073, "num_tokens": 3982706.0, "step": 2190 }, { "entropy": 6.265936231613159, "epoch": 1.8861194671250536, "grad_norm": 1.1171875, "learning_rate": 0.00048613617240316593, "loss": 6.0825, "mean_token_accuracy": 0.15816196352243422, "num_tokens": 3990934.0, "step": 2195 }, { "entropy": 6.391205978393555, "epoch": 1.8904168457241082, "grad_norm": 1.0859375, "learning_rate": 0.0004860210368434323, "loss": 6.1513, "mean_token_accuracy": 0.15758474171161652, "num_tokens": 3999864.0, "step": 2200 }, { "entropy": 6.334363603591919, "epoch": 1.8947142243231627, "grad_norm": 0.984375, "learning_rate": 0.00048590544049809857, "loss": 6.1514, "mean_token_accuracy": 0.15803639888763427, "num_tokens": 4008273.0, "step": 2205 }, { "entropy": 6.388755893707275, "epoch": 1.8990116029222175, "grad_norm": 1.03125, "learning_rate": 0.000485789383619581, "loss": 6.1719, "mean_token_accuracy": 0.15583823770284652, "num_tokens": 4017697.0, "step": 2210 }, { "entropy": 6.345981502532959, "epoch": 1.903308981521272, "grad_norm": 1.140625, "learning_rate": 0.0004856728664613015, "loss": 6.1881, "mean_token_accuracy": 0.14975374042987824, "num_tokens": 4026775.0, "step": 2215 }, { "entropy": 6.3457728862762455, "epoch": 1.9076063601203266, "grad_norm": 1.0859375, "learning_rate": 0.00048555588927768674, "loss": 6.1523, "mean_token_accuracy": 0.15700841099023818, "num_tokens": 4036476.0, "step": 2220 }, { "entropy": 6.4198205947875975, "epoch": 1.9119037387193811, "grad_norm": 1.1328125, "learning_rate": 0.0004854384523241683, "loss": 6.1336, "mean_token_accuracy": 0.1571663163602352, "num_tokens": 4045221.0, "step": 2225 }, { "entropy": 6.218144416809082, "epoch": 1.916201117318436, "grad_norm": 1.0390625, "learning_rate": 0.00048532055585718143, "loss": 6.0619, "mean_token_accuracy": 0.15748531818389894, "num_tokens": 4053754.0, "step": 2230 }, { "entropy": 6.358814668655396, "epoch": 1.9204984959174904, "grad_norm": 1.1015625, "learning_rate": 0.00048520220013416505, "loss": 6.103, "mean_token_accuracy": 0.1605447456240654, "num_tokens": 4061730.0, "step": 2235 }, { "entropy": 6.3438163757324215, "epoch": 1.924795874516545, "grad_norm": 1.0390625, "learning_rate": 0.0004850833854135607, "loss": 6.1491, "mean_token_accuracy": 0.15683530494570733, "num_tokens": 4070501.0, "step": 2240 }, { "entropy": 6.367244625091553, "epoch": 1.9290932531155995, "grad_norm": 0.95703125, "learning_rate": 0.0004849641119548122, "loss": 6.2334, "mean_token_accuracy": 0.14961420446634294, "num_tokens": 4079621.0, "step": 2245 }, { "entropy": 6.398000574111938, "epoch": 1.933390631714654, "grad_norm": 1.09375, "learning_rate": 0.000484844380018365, "loss": 6.2167, "mean_token_accuracy": 0.15164064317941667, "num_tokens": 4090106.0, "step": 2250 }, { "entropy": 6.375770425796508, "epoch": 1.9376880103137086, "grad_norm": 1.03125, "learning_rate": 0.000484724189865666, "loss": 6.1578, "mean_token_accuracy": 0.15246021896600723, "num_tokens": 4099269.0, "step": 2255 }, { "entropy": 6.217796373367309, "epoch": 1.9419853889127632, "grad_norm": 1.0625, "learning_rate": 0.0004846035417591624, "loss": 6.0917, "mean_token_accuracy": 0.15897612571716307, "num_tokens": 4108414.0, "step": 2260 }, { "entropy": 6.376744508743286, "epoch": 1.9462827675118177, "grad_norm": 1.109375, "learning_rate": 0.0004844824359623014, "loss": 6.2234, "mean_token_accuracy": 0.14845603406429292, "num_tokens": 4117731.0, "step": 2265 }, { "entropy": 6.413218784332275, "epoch": 1.9505801461108723, "grad_norm": 1.109375, "learning_rate": 0.00048436087273952966, "loss": 6.2001, "mean_token_accuracy": 0.14851808845996856, "num_tokens": 4127194.0, "step": 2270 }, { "entropy": 6.277564620971679, "epoch": 1.9548775247099268, "grad_norm": 1.1171875, "learning_rate": 0.00048423885235629265, "loss": 6.1488, "mean_token_accuracy": 0.15824481397867202, "num_tokens": 4135594.0, "step": 2275 }, { "entropy": 6.359572219848633, "epoch": 1.9591749033089814, "grad_norm": 1.0234375, "learning_rate": 0.0004841163750790342, "loss": 6.1804, "mean_token_accuracy": 0.15617654621601104, "num_tokens": 4145027.0, "step": 2280 }, { "entropy": 6.301140403747558, "epoch": 1.9634722819080361, "grad_norm": 1.015625, "learning_rate": 0.00048399344117519555, "loss": 6.0431, "mean_token_accuracy": 0.15682056695222854, "num_tokens": 4153754.0, "step": 2285 }, { "entropy": 6.266417598724365, "epoch": 1.9677696605070907, "grad_norm": 0.96875, "learning_rate": 0.00048387005091321544, "loss": 6.1066, "mean_token_accuracy": 0.16042741984128953, "num_tokens": 4162765.0, "step": 2290 }, { "entropy": 6.3823741436004635, "epoch": 1.9720670391061452, "grad_norm": 1.1328125, "learning_rate": 0.00048374620456252877, "loss": 6.1293, "mean_token_accuracy": 0.15764901116490365, "num_tokens": 4171589.0, "step": 2295 }, { "entropy": 6.2937760829925535, "epoch": 1.9763644177052, "grad_norm": 1.0625, "learning_rate": 0.00048362190239356644, "loss": 6.1393, "mean_token_accuracy": 0.15565742254257203, "num_tokens": 4181817.0, "step": 2300 }, { "entropy": 6.305324840545654, "epoch": 1.9806617963042545, "grad_norm": 0.9609375, "learning_rate": 0.00048349714467775474, "loss": 6.0995, "mean_token_accuracy": 0.14838732779026031, "num_tokens": 4191350.0, "step": 2305 }, { "entropy": 6.278328514099121, "epoch": 1.984959174903309, "grad_norm": 1.0625, "learning_rate": 0.00048337193168751464, "loss": 6.1486, "mean_token_accuracy": 0.15034544318914414, "num_tokens": 4199888.0, "step": 2310 }, { "entropy": 6.390921354293823, "epoch": 1.9892565535023636, "grad_norm": 1.171875, "learning_rate": 0.0004832462636962613, "loss": 6.1298, "mean_token_accuracy": 0.1492708593606949, "num_tokens": 4209509.0, "step": 2315 }, { "entropy": 6.316125965118408, "epoch": 1.9935539321014182, "grad_norm": 1.140625, "learning_rate": 0.0004831201409784034, "loss": 6.072, "mean_token_accuracy": 0.1605883792042732, "num_tokens": 4218496.0, "step": 2320 }, { "entropy": 6.255798053741455, "epoch": 1.9978513107004727, "grad_norm": 1.0078125, "learning_rate": 0.0004829935638093424, "loss": 6.1087, "mean_token_accuracy": 0.15721286535263063, "num_tokens": 4227504.0, "step": 2325 }, { "entropy": 6.355179150899251, "epoch": 2.0017189514396216, "grad_norm": 1.078125, "learning_rate": 0.0004828665324654724, "loss": 6.0277, "mean_token_accuracy": 0.15705766446060604, "num_tokens": 4235338.0, "step": 2330 }, { "entropy": 6.347105932235718, "epoch": 2.006016330038676, "grad_norm": 0.98828125, "learning_rate": 0.0004827390472241791, "loss": 5.7915, "mean_token_accuracy": 0.16444853693246841, "num_tokens": 4244905.0, "step": 2335 }, { "entropy": 6.278535604476929, "epoch": 2.010313708637731, "grad_norm": 0.95703125, "learning_rate": 0.0004826111083638392, "loss": 5.8696, "mean_token_accuracy": 0.16559881940484047, "num_tokens": 4254533.0, "step": 2340 }, { "entropy": 6.316350841522217, "epoch": 2.0146110872367857, "grad_norm": 0.984375, "learning_rate": 0.00048248271616382, "loss": 5.8463, "mean_token_accuracy": 0.16606585830450057, "num_tokens": 4264023.0, "step": 2345 }, { "entropy": 6.242398643493653, "epoch": 2.0189084658358403, "grad_norm": 1.046875, "learning_rate": 0.00048235387090447894, "loss": 5.876, "mean_token_accuracy": 0.16416406631469727, "num_tokens": 4273298.0, "step": 2350 }, { "entropy": 6.315939903259277, "epoch": 2.023205844434895, "grad_norm": 1.078125, "learning_rate": 0.00048222457286716235, "loss": 5.8197, "mean_token_accuracy": 0.16837385147809983, "num_tokens": 4283244.0, "step": 2355 }, { "entropy": 6.246707153320313, "epoch": 2.0275032230339494, "grad_norm": 1.1796875, "learning_rate": 0.00048209482233420564, "loss": 5.7698, "mean_token_accuracy": 0.17663903087377547, "num_tokens": 4291677.0, "step": 2360 }, { "entropy": 6.277777862548828, "epoch": 2.031800601633004, "grad_norm": 1.0625, "learning_rate": 0.000481964619588932, "loss": 5.8196, "mean_token_accuracy": 0.170748533308506, "num_tokens": 4300822.0, "step": 2365 }, { "entropy": 6.263119220733643, "epoch": 2.0360979802320585, "grad_norm": 1.0625, "learning_rate": 0.0004818339649156523, "loss": 5.8415, "mean_token_accuracy": 0.17079205960035324, "num_tokens": 4310149.0, "step": 2370 }, { "entropy": 6.12338604927063, "epoch": 2.040395358831113, "grad_norm": 1.03125, "learning_rate": 0.00048170285859966395, "loss": 5.7423, "mean_token_accuracy": 0.17794516831636428, "num_tokens": 4319109.0, "step": 2375 }, { "entropy": 6.281350469589233, "epoch": 2.0446927374301676, "grad_norm": 0.99609375, "learning_rate": 0.00048157130092725087, "loss": 5.7302, "mean_token_accuracy": 0.17352935224771499, "num_tokens": 4327921.0, "step": 2380 }, { "entropy": 6.251600027084351, "epoch": 2.048990116029222, "grad_norm": 1.046875, "learning_rate": 0.0004814392921856824, "loss": 5.8821, "mean_token_accuracy": 0.17077834606170655, "num_tokens": 4338026.0, "step": 2385 }, { "entropy": 6.208239459991455, "epoch": 2.0532874946282766, "grad_norm": 0.984375, "learning_rate": 0.0004813068326632128, "loss": 5.7272, "mean_token_accuracy": 0.17698098421096803, "num_tokens": 4347794.0, "step": 2390 }, { "entropy": 6.288045644760132, "epoch": 2.057584873227331, "grad_norm": 1.078125, "learning_rate": 0.0004811739226490809, "loss": 5.917, "mean_token_accuracy": 0.16798364371061325, "num_tokens": 4357249.0, "step": 2395 }, { "entropy": 6.22069525718689, "epoch": 2.0618822518263857, "grad_norm": 1.0546875, "learning_rate": 0.00048104056243350896, "loss": 5.8434, "mean_token_accuracy": 0.16641683727502823, "num_tokens": 4366053.0, "step": 2400 }, { "entropy": 6.237291955947876, "epoch": 2.0661796304254403, "grad_norm": 1.0234375, "learning_rate": 0.0004809067523077023, "loss": 5.8614, "mean_token_accuracy": 0.1700182244181633, "num_tokens": 4375543.0, "step": 2405 }, { "entropy": 6.230785751342774, "epoch": 2.0704770090244953, "grad_norm": 1.1015625, "learning_rate": 0.00048077249256384884, "loss": 5.7564, "mean_token_accuracy": 0.17438603639602662, "num_tokens": 4384332.0, "step": 2410 }, { "entropy": 6.19397988319397, "epoch": 2.07477438762355, "grad_norm": 1.2109375, "learning_rate": 0.0004806377834951182, "loss": 5.8466, "mean_token_accuracy": 0.1659790098667145, "num_tokens": 4393670.0, "step": 2415 }, { "entropy": 6.280642461776734, "epoch": 2.0790717662226044, "grad_norm": 1.125, "learning_rate": 0.00048050262539566104, "loss": 5.8543, "mean_token_accuracy": 0.17281297594308853, "num_tokens": 4402763.0, "step": 2420 }, { "entropy": 6.234371662139893, "epoch": 2.083369144821659, "grad_norm": 1.0234375, "learning_rate": 0.0004803670185606087, "loss": 5.766, "mean_token_accuracy": 0.17708782404661177, "num_tokens": 4411863.0, "step": 2425 }, { "entropy": 6.2064672946929935, "epoch": 2.0876665234207135, "grad_norm": 1.078125, "learning_rate": 0.0004802309632860724, "loss": 5.8476, "mean_token_accuracy": 0.17201682031154633, "num_tokens": 4421110.0, "step": 2430 }, { "entropy": 6.299275922775268, "epoch": 2.091963902019768, "grad_norm": 1.0625, "learning_rate": 0.00048009445986914236, "loss": 5.8416, "mean_token_accuracy": 0.165672005712986, "num_tokens": 4430249.0, "step": 2435 }, { "entropy": 6.197122812271118, "epoch": 2.0962612806188226, "grad_norm": 1.0703125, "learning_rate": 0.00047995750860788756, "loss": 5.8269, "mean_token_accuracy": 0.16112401485443115, "num_tokens": 4439686.0, "step": 2440 }, { "entropy": 6.239287519454956, "epoch": 2.100558659217877, "grad_norm": 1.1875, "learning_rate": 0.0004798201098013547, "loss": 5.8073, "mean_token_accuracy": 0.17286145985126494, "num_tokens": 4448645.0, "step": 2445 }, { "entropy": 6.17987093925476, "epoch": 2.1048560378169316, "grad_norm": 1.0078125, "learning_rate": 0.00047968226374956797, "loss": 5.785, "mean_token_accuracy": 0.1676485523581505, "num_tokens": 4456870.0, "step": 2450 }, { "entropy": 6.148839092254638, "epoch": 2.109153416415986, "grad_norm": 1.0390625, "learning_rate": 0.00047954397075352794, "loss": 5.804, "mean_token_accuracy": 0.1790194794535637, "num_tokens": 4466287.0, "step": 2455 }, { "entropy": 6.161268472671509, "epoch": 2.1134507950150407, "grad_norm": 1.09375, "learning_rate": 0.00047940523111521136, "loss": 5.7069, "mean_token_accuracy": 0.17733812779188157, "num_tokens": 4474461.0, "step": 2460 }, { "entropy": 6.201317834854126, "epoch": 2.1177481736140953, "grad_norm": 1.21875, "learning_rate": 0.0004792660451375701, "loss": 5.7722, "mean_token_accuracy": 0.17279447317123414, "num_tokens": 4483002.0, "step": 2465 }, { "entropy": 6.191226196289063, "epoch": 2.12204555221315, "grad_norm": 1.109375, "learning_rate": 0.00047912641312453064, "loss": 5.7874, "mean_token_accuracy": 0.1739989399909973, "num_tokens": 4492061.0, "step": 2470 }, { "entropy": 6.240914964675904, "epoch": 2.1263429308122044, "grad_norm": 0.96484375, "learning_rate": 0.00047898633538099363, "loss": 5.8403, "mean_token_accuracy": 0.16375242471694945, "num_tokens": 4501829.0, "step": 2475 }, { "entropy": 6.218794155120849, "epoch": 2.130640309411259, "grad_norm": 1.015625, "learning_rate": 0.0004788458122128327, "loss": 5.8683, "mean_token_accuracy": 0.16395576894283295, "num_tokens": 4511539.0, "step": 2480 }, { "entropy": 6.197416687011719, "epoch": 2.134937688010314, "grad_norm": 1.09375, "learning_rate": 0.00047870484392689434, "loss": 5.7256, "mean_token_accuracy": 0.1732256680727005, "num_tokens": 4520425.0, "step": 2485 }, { "entropy": 6.153204393386841, "epoch": 2.1392350666093685, "grad_norm": 1.0859375, "learning_rate": 0.000478563430830997, "loss": 5.8237, "mean_token_accuracy": 0.1660924568772316, "num_tokens": 4529474.0, "step": 2490 }, { "entropy": 6.238351488113404, "epoch": 2.143532445208423, "grad_norm": 1.0859375, "learning_rate": 0.00047842157323393035, "loss": 5.7621, "mean_token_accuracy": 0.17213856130838395, "num_tokens": 4538082.0, "step": 2495 }, { "entropy": 6.185346221923828, "epoch": 2.1478298238074776, "grad_norm": 1.046875, "learning_rate": 0.0004782792714454547, "loss": 5.9584, "mean_token_accuracy": 0.1624767854809761, "num_tokens": 4547340.0, "step": 2500 }, { "epoch": 2.1478298238074776, "eval_entropy": 5.975759233440365, "eval_loss": 6.17199182510376, "eval_mean_token_accuracy": 0.15897784858673542, "eval_num_tokens": 4547340.0, "eval_runtime": 2.0476, "eval_samples_per_second": 1733.228, "eval_steps_per_second": 216.837, "step": 2500 }, { "entropy": 6.180462265014649, "epoch": 2.152127202406532, "grad_norm": 1.15625, "learning_rate": 0.0004781365257763002, "loss": 5.787, "mean_token_accuracy": 0.17266686409711837, "num_tokens": 4556415.0, "step": 2505 }, { "entropy": 6.12634973526001, "epoch": 2.1564245810055866, "grad_norm": 1.359375, "learning_rate": 0.00047799333653816633, "loss": 5.6887, "mean_token_accuracy": 0.18215310871601104, "num_tokens": 4565156.0, "step": 2510 }, { "entropy": 6.176189327239991, "epoch": 2.160721959604641, "grad_norm": 1.0, "learning_rate": 0.00047784970404372124, "loss": 5.7844, "mean_token_accuracy": 0.1708789974451065, "num_tokens": 4574678.0, "step": 2515 }, { "entropy": 6.102172183990478, "epoch": 2.1650193382036957, "grad_norm": 1.1328125, "learning_rate": 0.00047770562860660083, "loss": 5.811, "mean_token_accuracy": 0.16656892001628876, "num_tokens": 4583253.0, "step": 2520 }, { "entropy": 6.217717409133911, "epoch": 2.1693167168027503, "grad_norm": 0.9609375, "learning_rate": 0.0004775611105414083, "loss": 5.88, "mean_token_accuracy": 0.16319236159324646, "num_tokens": 4594042.0, "step": 2525 }, { "entropy": 6.179096984863281, "epoch": 2.173614095401805, "grad_norm": 0.99609375, "learning_rate": 0.0004774161501637133, "loss": 5.8208, "mean_token_accuracy": 0.16849268227815628, "num_tokens": 4603128.0, "step": 2530 }, { "entropy": 6.120770788192749, "epoch": 2.1779114740008594, "grad_norm": 1.25, "learning_rate": 0.0004772707477900514, "loss": 5.805, "mean_token_accuracy": 0.17015553265810013, "num_tokens": 4611537.0, "step": 2535 }, { "entropy": 6.254914093017578, "epoch": 2.182208852599914, "grad_norm": 1.1171875, "learning_rate": 0.0004771249037379232, "loss": 5.8984, "mean_token_accuracy": 0.16680168211460114, "num_tokens": 4622481.0, "step": 2540 }, { "entropy": 6.1392961025238035, "epoch": 2.1865062311989685, "grad_norm": 1.109375, "learning_rate": 0.0004769786183257939, "loss": 5.7985, "mean_token_accuracy": 0.17476972788572312, "num_tokens": 4631259.0, "step": 2545 }, { "entropy": 6.153097438812256, "epoch": 2.190803609798023, "grad_norm": 1.1484375, "learning_rate": 0.0004768318918730924, "loss": 5.7586, "mean_token_accuracy": 0.17578112632036208, "num_tokens": 4640266.0, "step": 2550 }, { "entropy": 6.1481832504272464, "epoch": 2.195100988397078, "grad_norm": 1.0625, "learning_rate": 0.00047668472470021044, "loss": 5.8113, "mean_token_accuracy": 0.16784311085939407, "num_tokens": 4649520.0, "step": 2555 }, { "entropy": 6.236191320419311, "epoch": 2.1993983669961326, "grad_norm": 1.0625, "learning_rate": 0.0004765371171285025, "loss": 5.7573, "mean_token_accuracy": 0.17908186167478563, "num_tokens": 4658501.0, "step": 2560 }, { "entropy": 6.072221088409424, "epoch": 2.203695745595187, "grad_norm": 1.078125, "learning_rate": 0.00047638906948028445, "loss": 5.8192, "mean_token_accuracy": 0.16885218769311905, "num_tokens": 4667567.0, "step": 2565 }, { "entropy": 6.158999061584472, "epoch": 2.2079931241942417, "grad_norm": 1.1640625, "learning_rate": 0.00047624058207883317, "loss": 5.8154, "mean_token_accuracy": 0.16924346387386321, "num_tokens": 4676618.0, "step": 2570 }, { "entropy": 6.275905895233154, "epoch": 2.212290502793296, "grad_norm": 1.03125, "learning_rate": 0.00047609165524838576, "loss": 5.8748, "mean_token_accuracy": 0.16927455067634584, "num_tokens": 4685967.0, "step": 2575 }, { "entropy": 6.100191926956176, "epoch": 2.2165878813923507, "grad_norm": 1.265625, "learning_rate": 0.0004759422893141389, "loss": 5.766, "mean_token_accuracy": 0.17206043750047684, "num_tokens": 4694568.0, "step": 2580 }, { "entropy": 6.1515192031860355, "epoch": 2.2208852599914053, "grad_norm": 1.09375, "learning_rate": 0.0004757924846022482, "loss": 5.826, "mean_token_accuracy": 0.17090158760547638, "num_tokens": 4703648.0, "step": 2585 }, { "entropy": 6.158021259307861, "epoch": 2.22518263859046, "grad_norm": 1.1796875, "learning_rate": 0.00047564224143982714, "loss": 5.6863, "mean_token_accuracy": 0.1804699569940567, "num_tokens": 4712444.0, "step": 2590 }, { "entropy": 6.184865522384643, "epoch": 2.2294800171895144, "grad_norm": 1.171875, "learning_rate": 0.00047549156015494676, "loss": 5.8404, "mean_token_accuracy": 0.1678527906537056, "num_tokens": 4722034.0, "step": 2595 }, { "entropy": 6.146687793731689, "epoch": 2.233777395788569, "grad_norm": 1.1484375, "learning_rate": 0.00047534044107663484, "loss": 5.8616, "mean_token_accuracy": 0.1660257250070572, "num_tokens": 4731344.0, "step": 2600 }, { "entropy": 6.193604421615601, "epoch": 2.2380747743876235, "grad_norm": 1.1796875, "learning_rate": 0.00047518888453487496, "loss": 5.7742, "mean_token_accuracy": 0.18199435770511627, "num_tokens": 4739302.0, "step": 2605 }, { "entropy": 6.141710472106934, "epoch": 2.242372152986678, "grad_norm": 0.99609375, "learning_rate": 0.0004750368908606061, "loss": 5.8788, "mean_token_accuracy": 0.16633899062871932, "num_tokens": 4748848.0, "step": 2610 }, { "entropy": 6.243501758575439, "epoch": 2.2466695315857326, "grad_norm": 1.015625, "learning_rate": 0.00047488446038572164, "loss": 5.9385, "mean_token_accuracy": 0.16254912167787552, "num_tokens": 4758194.0, "step": 2615 }, { "entropy": 6.16309700012207, "epoch": 2.250966910184787, "grad_norm": 1.1640625, "learning_rate": 0.0004747315934430688, "loss": 5.8537, "mean_token_accuracy": 0.16617514342069625, "num_tokens": 4768081.0, "step": 2620 }, { "entropy": 6.1246990203857425, "epoch": 2.2552642887838417, "grad_norm": 1.1328125, "learning_rate": 0.000474578290366448, "loss": 5.7741, "mean_token_accuracy": 0.1719846934080124, "num_tokens": 4776471.0, "step": 2625 }, { "entropy": 6.14447546005249, "epoch": 2.259561667382896, "grad_norm": 1.1328125, "learning_rate": 0.0004744245514906117, "loss": 5.7832, "mean_token_accuracy": 0.17195819914340973, "num_tokens": 4784403.0, "step": 2630 }, { "entropy": 6.066154432296753, "epoch": 2.263859045981951, "grad_norm": 1.125, "learning_rate": 0.00047427037715126426, "loss": 5.7561, "mean_token_accuracy": 0.17391669005155563, "num_tokens": 4792779.0, "step": 2635 }, { "entropy": 6.112756013870239, "epoch": 2.2681564245810057, "grad_norm": 1.0390625, "learning_rate": 0.0004741157676850608, "loss": 5.7404, "mean_token_accuracy": 0.17708691954612732, "num_tokens": 4801426.0, "step": 2640 }, { "entropy": 6.151607942581177, "epoch": 2.2724538031800603, "grad_norm": 1.2890625, "learning_rate": 0.00047396072342960663, "loss": 5.7871, "mean_token_accuracy": 0.16531864404678345, "num_tokens": 4810329.0, "step": 2645 }, { "entropy": 6.167152023315429, "epoch": 2.276751181779115, "grad_norm": 1.0546875, "learning_rate": 0.00047380524472345645, "loss": 5.8465, "mean_token_accuracy": 0.1671397715806961, "num_tokens": 4819544.0, "step": 2650 }, { "entropy": 6.143870735168457, "epoch": 2.2810485603781694, "grad_norm": 1.1171875, "learning_rate": 0.0004736493319061134, "loss": 5.8444, "mean_token_accuracy": 0.16567971110343932, "num_tokens": 4828113.0, "step": 2655 }, { "entropy": 6.107415771484375, "epoch": 2.285345938977224, "grad_norm": 1.0, "learning_rate": 0.0004734929853180291, "loss": 5.8272, "mean_token_accuracy": 0.1672320321202278, "num_tokens": 4836989.0, "step": 2660 }, { "entropy": 6.207330274581909, "epoch": 2.2896433175762785, "grad_norm": 0.96875, "learning_rate": 0.00047333620530060175, "loss": 5.8623, "mean_token_accuracy": 0.16721852421760558, "num_tokens": 4847103.0, "step": 2665 }, { "entropy": 6.162304496765136, "epoch": 2.293940696175333, "grad_norm": 1.125, "learning_rate": 0.0004731789921961764, "loss": 5.8736, "mean_token_accuracy": 0.17004917562007904, "num_tokens": 4856238.0, "step": 2670 }, { "entropy": 6.193357849121094, "epoch": 2.2982380747743876, "grad_norm": 1.1484375, "learning_rate": 0.0004730213463480434, "loss": 5.7792, "mean_token_accuracy": 0.17748985737562178, "num_tokens": 4864608.0, "step": 2675 }, { "entropy": 6.143442440032959, "epoch": 2.302535453373442, "grad_norm": 1.0546875, "learning_rate": 0.00047286326810043857, "loss": 5.7374, "mean_token_accuracy": 0.17498592138290406, "num_tokens": 4873889.0, "step": 2680 }, { "entropy": 6.103658771514892, "epoch": 2.3068328319724967, "grad_norm": 1.15625, "learning_rate": 0.00047270475779854137, "loss": 5.7804, "mean_token_accuracy": 0.1758432075381279, "num_tokens": 4882902.0, "step": 2685 }, { "entropy": 6.238151502609253, "epoch": 2.311130210571551, "grad_norm": 1.125, "learning_rate": 0.00047254581578847507, "loss": 5.7985, "mean_token_accuracy": 0.16956950426101686, "num_tokens": 4892390.0, "step": 2690 }, { "entropy": 6.069392299652099, "epoch": 2.3154275891706058, "grad_norm": 1.109375, "learning_rate": 0.0004723864424173055, "loss": 5.9313, "mean_token_accuracy": 0.16988434195518493, "num_tokens": 4901625.0, "step": 2695 }, { "entropy": 6.1605274200439455, "epoch": 2.3197249677696608, "grad_norm": 1.078125, "learning_rate": 0.0004722266380330403, "loss": 5.7187, "mean_token_accuracy": 0.18315426409244537, "num_tokens": 4910804.0, "step": 2700 }, { "entropy": 6.102558517456055, "epoch": 2.3240223463687153, "grad_norm": 1.1015625, "learning_rate": 0.00047206640298462857, "loss": 5.8022, "mean_token_accuracy": 0.1743517056107521, "num_tokens": 4920441.0, "step": 2705 }, { "entropy": 6.1133277893066404, "epoch": 2.32831972496777, "grad_norm": 1.1171875, "learning_rate": 0.00047190573762195945, "loss": 5.8433, "mean_token_accuracy": 0.17087312042713165, "num_tokens": 4930204.0, "step": 2710 }, { "entropy": 6.14404034614563, "epoch": 2.3326171035668244, "grad_norm": 0.94140625, "learning_rate": 0.00047174464229586186, "loss": 5.9433, "mean_token_accuracy": 0.16308000683784485, "num_tokens": 4941191.0, "step": 2715 }, { "entropy": 6.279735326766968, "epoch": 2.336914482165879, "grad_norm": 1.265625, "learning_rate": 0.0004715831173581036, "loss": 5.9209, "mean_token_accuracy": 0.16662475615739822, "num_tokens": 4951825.0, "step": 2720 }, { "entropy": 6.089169502258301, "epoch": 2.3412118607649335, "grad_norm": 0.96484375, "learning_rate": 0.00047142116316139073, "loss": 5.8464, "mean_token_accuracy": 0.1715902417898178, "num_tokens": 4960632.0, "step": 2725 }, { "entropy": 6.170309162139892, "epoch": 2.345509239363988, "grad_norm": 0.9765625, "learning_rate": 0.0004712587800593663, "loss": 5.8798, "mean_token_accuracy": 0.16993365585803985, "num_tokens": 4969455.0, "step": 2730 }, { "entropy": 6.12351393699646, "epoch": 2.3498066179630426, "grad_norm": 1.2890625, "learning_rate": 0.0004710959684066102, "loss": 5.7859, "mean_token_accuracy": 0.1763747364282608, "num_tokens": 4978997.0, "step": 2735 }, { "entropy": 6.158613014221191, "epoch": 2.354103996562097, "grad_norm": 1.0546875, "learning_rate": 0.00047093272855863803, "loss": 5.8433, "mean_token_accuracy": 0.17122582197189332, "num_tokens": 4988305.0, "step": 2740 }, { "entropy": 6.098289728164673, "epoch": 2.3584013751611517, "grad_norm": 1.0546875, "learning_rate": 0.0004707690608719003, "loss": 5.7826, "mean_token_accuracy": 0.1741472825407982, "num_tokens": 4997022.0, "step": 2745 }, { "entropy": 6.148240280151367, "epoch": 2.362698753760206, "grad_norm": 1.1484375, "learning_rate": 0.0004706049657037818, "loss": 5.8468, "mean_token_accuracy": 0.1676038146018982, "num_tokens": 5005664.0, "step": 2750 }, { "entropy": 6.128693151473999, "epoch": 2.3669961323592608, "grad_norm": 1.03125, "learning_rate": 0.0004704404434126009, "loss": 5.7993, "mean_token_accuracy": 0.1630128264427185, "num_tokens": 5014769.0, "step": 2755 }, { "entropy": 6.173029994964599, "epoch": 2.3712935109583153, "grad_norm": 1.015625, "learning_rate": 0.00047027549435760843, "loss": 5.869, "mean_token_accuracy": 0.16782066822052003, "num_tokens": 5024060.0, "step": 2760 }, { "entropy": 6.2035542011260985, "epoch": 2.37559088955737, "grad_norm": 1.140625, "learning_rate": 0.0004701101188989872, "loss": 5.9029, "mean_token_accuracy": 0.16563379466533662, "num_tokens": 5033046.0, "step": 2765 }, { "entropy": 6.131078624725342, "epoch": 2.3798882681564244, "grad_norm": 1.2421875, "learning_rate": 0.00046994431739785114, "loss": 5.7511, "mean_token_accuracy": 0.18578273057937622, "num_tokens": 5040894.0, "step": 2770 }, { "entropy": 6.131838798522949, "epoch": 2.384185646755479, "grad_norm": 1.046875, "learning_rate": 0.00046977809021624454, "loss": 5.9175, "mean_token_accuracy": 0.16830566823482512, "num_tokens": 5050961.0, "step": 2775 }, { "entropy": 6.217153167724609, "epoch": 2.3884830253545335, "grad_norm": 1.140625, "learning_rate": 0.0004696114377171409, "loss": 5.8447, "mean_token_accuracy": 0.166962693631649, "num_tokens": 5060226.0, "step": 2780 }, { "entropy": 6.121598243713379, "epoch": 2.3927804039535885, "grad_norm": 1.1015625, "learning_rate": 0.0004694443602644429, "loss": 5.8063, "mean_token_accuracy": 0.16657978445291519, "num_tokens": 5069225.0, "step": 2785 }, { "entropy": 6.101100969314575, "epoch": 2.397077782552643, "grad_norm": 1.0703125, "learning_rate": 0.0004692768582229808, "loss": 5.7858, "mean_token_accuracy": 0.17338948845863342, "num_tokens": 5078386.0, "step": 2790 }, { "entropy": 6.113597059249878, "epoch": 2.4013751611516976, "grad_norm": 0.98046875, "learning_rate": 0.00046910893195851213, "loss": 5.7198, "mean_token_accuracy": 0.1726340189576149, "num_tokens": 5087161.0, "step": 2795 }, { "entropy": 6.0919578075408936, "epoch": 2.405672539750752, "grad_norm": 1.0546875, "learning_rate": 0.00046894058183772074, "loss": 5.892, "mean_token_accuracy": 0.16800693273544312, "num_tokens": 5096613.0, "step": 2800 }, { "entropy": 6.136939573287964, "epoch": 2.4099699183498067, "grad_norm": 1.1484375, "learning_rate": 0.000468771808228216, "loss": 5.8324, "mean_token_accuracy": 0.16829856783151625, "num_tokens": 5106534.0, "step": 2805 }, { "entropy": 6.107866430282593, "epoch": 2.414267296948861, "grad_norm": 1.1171875, "learning_rate": 0.00046860261149853197, "loss": 5.8647, "mean_token_accuracy": 0.16899872124195098, "num_tokens": 5115975.0, "step": 2810 }, { "entropy": 6.077492666244507, "epoch": 2.4185646755479158, "grad_norm": 1.1640625, "learning_rate": 0.0004684329920181268, "loss": 5.751, "mean_token_accuracy": 0.1743526503443718, "num_tokens": 5124635.0, "step": 2815 }, { "entropy": 6.131529140472412, "epoch": 2.4228620541469703, "grad_norm": 1.1875, "learning_rate": 0.00046826295015738154, "loss": 5.7277, "mean_token_accuracy": 0.18355693519115449, "num_tokens": 5133226.0, "step": 2820 }, { "entropy": 6.0389073371887205, "epoch": 2.427159432746025, "grad_norm": 1.0390625, "learning_rate": 0.0004680924862875996, "loss": 5.8249, "mean_token_accuracy": 0.17242621779441833, "num_tokens": 5142257.0, "step": 2825 }, { "entropy": 6.1347551345825195, "epoch": 2.4314568113450794, "grad_norm": 1.015625, "learning_rate": 0.00046792160078100605, "loss": 5.803, "mean_token_accuracy": 0.1744050070643425, "num_tokens": 5150752.0, "step": 2830 }, { "entropy": 6.121770191192627, "epoch": 2.435754189944134, "grad_norm": 1.0546875, "learning_rate": 0.00046775029401074653, "loss": 5.7301, "mean_token_accuracy": 0.18050158619880677, "num_tokens": 5160237.0, "step": 2835 }, { "entropy": 6.1292938709259035, "epoch": 2.4400515685431885, "grad_norm": 1.125, "learning_rate": 0.00046757856635088645, "loss": 5.8006, "mean_token_accuracy": 0.1770539328455925, "num_tokens": 5169752.0, "step": 2840 }, { "entropy": 6.102383279800415, "epoch": 2.444348947142243, "grad_norm": 1.0234375, "learning_rate": 0.0004674064181764105, "loss": 5.8474, "mean_token_accuracy": 0.17131757587194443, "num_tokens": 5178892.0, "step": 2845 }, { "entropy": 6.149413537979126, "epoch": 2.448646325741298, "grad_norm": 0.9765625, "learning_rate": 0.00046723384986322147, "loss": 5.8235, "mean_token_accuracy": 0.1708175078034401, "num_tokens": 5188468.0, "step": 2850 }, { "entropy": 6.077389192581177, "epoch": 2.4529437043403526, "grad_norm": 1.0625, "learning_rate": 0.0004670608617881395, "loss": 5.7566, "mean_token_accuracy": 0.17592367827892302, "num_tokens": 5197565.0, "step": 2855 }, { "entropy": 6.030880689620972, "epoch": 2.457241082939407, "grad_norm": 1.1171875, "learning_rate": 0.0004668874543289014, "loss": 5.7533, "mean_token_accuracy": 0.17984721809625626, "num_tokens": 5205791.0, "step": 2860 }, { "entropy": 6.102017164230347, "epoch": 2.4615384615384617, "grad_norm": 1.078125, "learning_rate": 0.00046671362786415986, "loss": 5.7546, "mean_token_accuracy": 0.18463018238544465, "num_tokens": 5214773.0, "step": 2865 }, { "entropy": 6.001052331924439, "epoch": 2.465835840137516, "grad_norm": 1.015625, "learning_rate": 0.00046653938277348237, "loss": 5.7784, "mean_token_accuracy": 0.178888800740242, "num_tokens": 5223734.0, "step": 2870 }, { "entropy": 6.224011754989624, "epoch": 2.4701332187365708, "grad_norm": 1.203125, "learning_rate": 0.0004663647194373505, "loss": 5.8493, "mean_token_accuracy": 0.1654559224843979, "num_tokens": 5231742.0, "step": 2875 }, { "entropy": 6.084821510314941, "epoch": 2.4744305973356253, "grad_norm": 1.046875, "learning_rate": 0.00046618963823715913, "loss": 5.8114, "mean_token_accuracy": 0.1739817038178444, "num_tokens": 5241673.0, "step": 2880 }, { "entropy": 6.109079217910766, "epoch": 2.47872797593468, "grad_norm": 1.15625, "learning_rate": 0.00046601413955521575, "loss": 5.7746, "mean_token_accuracy": 0.1694352611899376, "num_tokens": 5250082.0, "step": 2885 }, { "entropy": 6.092094850540161, "epoch": 2.4830253545337344, "grad_norm": 1.1640625, "learning_rate": 0.0004658382237747393, "loss": 5.8386, "mean_token_accuracy": 0.1698906749486923, "num_tokens": 5259680.0, "step": 2890 }, { "entropy": 6.122028970718384, "epoch": 2.487322733132789, "grad_norm": 0.98046875, "learning_rate": 0.00046566189127985946, "loss": 5.8246, "mean_token_accuracy": 0.17455327808856963, "num_tokens": 5269561.0, "step": 2895 }, { "entropy": 6.137590980529785, "epoch": 2.4916201117318435, "grad_norm": 0.98828125, "learning_rate": 0.000465485142455616, "loss": 5.7746, "mean_token_accuracy": 0.18081652075052262, "num_tokens": 5278659.0, "step": 2900 }, { "entropy": 6.009606838226318, "epoch": 2.495917490330898, "grad_norm": 1.046875, "learning_rate": 0.00046530797768795765, "loss": 5.7732, "mean_token_accuracy": 0.180580236017704, "num_tokens": 5287619.0, "step": 2905 }, { "entropy": 6.08758659362793, "epoch": 2.5002148689299526, "grad_norm": 1.03125, "learning_rate": 0.00046513039736374153, "loss": 5.8748, "mean_token_accuracy": 0.16648129373788834, "num_tokens": 5297334.0, "step": 2910 }, { "entropy": 6.145037269592285, "epoch": 2.504512247529007, "grad_norm": 1.1484375, "learning_rate": 0.0004649524018707319, "loss": 5.8028, "mean_token_accuracy": 0.17371988743543626, "num_tokens": 5306208.0, "step": 2915 }, { "entropy": 6.061826229095459, "epoch": 2.5088096261280617, "grad_norm": 1.1875, "learning_rate": 0.00046477399159759996, "loss": 5.7262, "mean_token_accuracy": 0.17741942554712295, "num_tokens": 5314754.0, "step": 2920 }, { "entropy": 5.948053169250488, "epoch": 2.5131070047271162, "grad_norm": 1.21875, "learning_rate": 0.00046459516693392246, "loss": 5.751, "mean_token_accuracy": 0.17220668345689774, "num_tokens": 5324000.0, "step": 2925 }, { "entropy": 6.165147161483764, "epoch": 2.517404383326171, "grad_norm": 1.09375, "learning_rate": 0.0004644159282701808, "loss": 5.7977, "mean_token_accuracy": 0.17256153672933577, "num_tokens": 5332478.0, "step": 2930 }, { "entropy": 6.164068746566772, "epoch": 2.5217017619252258, "grad_norm": 1.03125, "learning_rate": 0.00046423627599776076, "loss": 5.881, "mean_token_accuracy": 0.16446806192398072, "num_tokens": 5341635.0, "step": 2935 }, { "entropy": 6.083116579055786, "epoch": 2.5259991405242803, "grad_norm": 1.046875, "learning_rate": 0.000464056210508951, "loss": 5.8562, "mean_token_accuracy": 0.17011034190654756, "num_tokens": 5350144.0, "step": 2940 }, { "entropy": 6.115471887588501, "epoch": 2.530296519123335, "grad_norm": 1.1171875, "learning_rate": 0.0004638757321969426, "loss": 5.7867, "mean_token_accuracy": 0.17099311649799348, "num_tokens": 5358788.0, "step": 2945 }, { "entropy": 6.1144256591796875, "epoch": 2.5345938977223894, "grad_norm": 1.0859375, "learning_rate": 0.00046369484145582815, "loss": 5.8724, "mean_token_accuracy": 0.16514718383550644, "num_tokens": 5368057.0, "step": 2950 }, { "entropy": 6.012566184997558, "epoch": 2.538891276321444, "grad_norm": 1.0546875, "learning_rate": 0.00046351353868060054, "loss": 5.7114, "mean_token_accuracy": 0.18013515770435334, "num_tokens": 5376739.0, "step": 2955 }, { "entropy": 6.148663520812988, "epoch": 2.5431886549204985, "grad_norm": 1.078125, "learning_rate": 0.00046333182426715273, "loss": 5.8321, "mean_token_accuracy": 0.17200638502836227, "num_tokens": 5385967.0, "step": 2960 }, { "entropy": 6.130731725692749, "epoch": 2.547486033519553, "grad_norm": 1.046875, "learning_rate": 0.00046314969861227626, "loss": 5.8503, "mean_token_accuracy": 0.16146385669708252, "num_tokens": 5395192.0, "step": 2965 }, { "entropy": 6.0897301197052, "epoch": 2.5517834121186076, "grad_norm": 0.94921875, "learning_rate": 0.0004629671621136608, "loss": 5.8185, "mean_token_accuracy": 0.1707317277789116, "num_tokens": 5404694.0, "step": 2970 }, { "entropy": 6.1051109790802, "epoch": 2.556080790717662, "grad_norm": 1.1328125, "learning_rate": 0.0004627842151698931, "loss": 5.8208, "mean_token_accuracy": 0.1680385336279869, "num_tokens": 5413102.0, "step": 2975 }, { "entropy": 6.055868244171142, "epoch": 2.5603781693167167, "grad_norm": 1.0703125, "learning_rate": 0.00046260085818045625, "loss": 5.8548, "mean_token_accuracy": 0.17059293240308762, "num_tokens": 5423339.0, "step": 2980 }, { "entropy": 6.159391784667969, "epoch": 2.5646755479157712, "grad_norm": 1.078125, "learning_rate": 0.0004624170915457284, "loss": 5.8092, "mean_token_accuracy": 0.1737206295132637, "num_tokens": 5432377.0, "step": 2985 }, { "entropy": 6.093228054046631, "epoch": 2.5689729265148262, "grad_norm": 1.2109375, "learning_rate": 0.00046223291566698264, "loss": 5.7337, "mean_token_accuracy": 0.17590138092637062, "num_tokens": 5441038.0, "step": 2990 }, { "entropy": 6.048309326171875, "epoch": 2.5732703051138808, "grad_norm": 1.0234375, "learning_rate": 0.0004620483309463855, "loss": 5.752, "mean_token_accuracy": 0.18060761988162993, "num_tokens": 5449557.0, "step": 2995 }, { "entropy": 6.118342542648316, "epoch": 2.5775676837129353, "grad_norm": 1.078125, "learning_rate": 0.0004618633377869961, "loss": 5.8908, "mean_token_accuracy": 0.17027267962694168, "num_tokens": 5458931.0, "step": 3000 }, { "epoch": 2.5775676837129353, "eval_entropy": 5.922512502283664, "eval_loss": 6.089641094207764, "eval_mean_token_accuracy": 0.16509396373084537, "eval_num_tokens": 5458931.0, "eval_runtime": 2.0449, "eval_samples_per_second": 1735.564, "eval_steps_per_second": 217.129, "step": 3000 }, { "entropy": 6.080532598495483, "epoch": 2.58186506231199, "grad_norm": 1.0078125, "learning_rate": 0.0004616779365927656, "loss": 5.715, "mean_token_accuracy": 0.18329965472221374, "num_tokens": 5468539.0, "step": 3005 }, { "entropy": 5.97086615562439, "epoch": 2.5861624409110444, "grad_norm": 1.2734375, "learning_rate": 0.0004614921277685361, "loss": 5.6562, "mean_token_accuracy": 0.18940457850694656, "num_tokens": 5475710.0, "step": 3010 }, { "entropy": 6.014264154434204, "epoch": 2.590459819510099, "grad_norm": 1.0390625, "learning_rate": 0.00046130591172003976, "loss": 5.8105, "mean_token_accuracy": 0.17161315381526948, "num_tokens": 5484597.0, "step": 3015 }, { "entropy": 6.152267932891846, "epoch": 2.5947571981091535, "grad_norm": 1.0390625, "learning_rate": 0.0004611192888538981, "loss": 5.8783, "mean_token_accuracy": 0.16452959179878235, "num_tokens": 5493213.0, "step": 3020 }, { "entropy": 6.160617208480835, "epoch": 2.599054576708208, "grad_norm": 1.2890625, "learning_rate": 0.00046093225957762084, "loss": 5.8774, "mean_token_accuracy": 0.1677599936723709, "num_tokens": 5502556.0, "step": 3025 }, { "entropy": 6.121158790588379, "epoch": 2.6033519553072626, "grad_norm": 1.078125, "learning_rate": 0.0004607448242996051, "loss": 5.7783, "mean_token_accuracy": 0.17667657732963563, "num_tokens": 5511779.0, "step": 3030 }, { "entropy": 6.09659161567688, "epoch": 2.607649333906317, "grad_norm": 1.0859375, "learning_rate": 0.0004605569834291347, "loss": 5.7762, "mean_token_accuracy": 0.17719159871339799, "num_tokens": 5520836.0, "step": 3035 }, { "entropy": 6.045120048522949, "epoch": 2.6119467125053717, "grad_norm": 1.1953125, "learning_rate": 0.00046036873737637904, "loss": 5.7919, "mean_token_accuracy": 0.1743898034095764, "num_tokens": 5529285.0, "step": 3040 }, { "entropy": 6.032482624053955, "epoch": 2.6162440911044262, "grad_norm": 1.0859375, "learning_rate": 0.0004601800865523921, "loss": 5.795, "mean_token_accuracy": 0.171583154797554, "num_tokens": 5538160.0, "step": 3045 }, { "entropy": 6.106417560577393, "epoch": 2.620541469703481, "grad_norm": 1.0859375, "learning_rate": 0.00045999103136911204, "loss": 5.801, "mean_token_accuracy": 0.16901974827051164, "num_tokens": 5547355.0, "step": 3050 }, { "entropy": 6.082255268096924, "epoch": 2.6248388483025353, "grad_norm": 1.0390625, "learning_rate": 0.00045980157223935965, "loss": 5.8134, "mean_token_accuracy": 0.17005283683538436, "num_tokens": 5557299.0, "step": 3055 }, { "entropy": 6.022762966156006, "epoch": 2.62913622690159, "grad_norm": 1.015625, "learning_rate": 0.00045961170957683806, "loss": 5.7335, "mean_token_accuracy": 0.17649647146463393, "num_tokens": 5565469.0, "step": 3060 }, { "entropy": 6.093750953674316, "epoch": 2.6334336055006444, "grad_norm": 1.0390625, "learning_rate": 0.00045942144379613147, "loss": 5.8526, "mean_token_accuracy": 0.17166510373353958, "num_tokens": 5574740.0, "step": 3065 }, { "entropy": 6.13080358505249, "epoch": 2.637730984099699, "grad_norm": 1.0703125, "learning_rate": 0.00045923077531270426, "loss": 5.8407, "mean_token_accuracy": 0.1691308617591858, "num_tokens": 5583438.0, "step": 3070 }, { "entropy": 6.099384212493897, "epoch": 2.6420283626987535, "grad_norm": 1.078125, "learning_rate": 0.0004590397045429001, "loss": 5.8094, "mean_token_accuracy": 0.17814612835645677, "num_tokens": 5592389.0, "step": 3075 }, { "entropy": 6.034897947311402, "epoch": 2.646325741297808, "grad_norm": 0.984375, "learning_rate": 0.00045884823190394134, "loss": 5.7097, "mean_token_accuracy": 0.18135049045085908, "num_tokens": 5601598.0, "step": 3080 }, { "entropy": 6.026404762268067, "epoch": 2.650623119896863, "grad_norm": 1.1484375, "learning_rate": 0.0004586563578139275, "loss": 5.7991, "mean_token_accuracy": 0.16875589936971663, "num_tokens": 5610498.0, "step": 3085 }, { "entropy": 6.030868768692017, "epoch": 2.6549204984959176, "grad_norm": 1.1875, "learning_rate": 0.00045846408269183505, "loss": 5.7054, "mean_token_accuracy": 0.1822717770934105, "num_tokens": 5620082.0, "step": 3090 }, { "entropy": 6.124869680404663, "epoch": 2.659217877094972, "grad_norm": 1.0546875, "learning_rate": 0.00045827140695751603, "loss": 5.7925, "mean_token_accuracy": 0.17561250925064087, "num_tokens": 5630291.0, "step": 3095 }, { "entropy": 6.034296226501465, "epoch": 2.6635152556940267, "grad_norm": 1.1875, "learning_rate": 0.0004580783310316971, "loss": 5.7735, "mean_token_accuracy": 0.1745203271508217, "num_tokens": 5638784.0, "step": 3100 }, { "entropy": 5.955547714233399, "epoch": 2.6678126342930812, "grad_norm": 1.09375, "learning_rate": 0.00045788485533597895, "loss": 5.6462, "mean_token_accuracy": 0.1883938804268837, "num_tokens": 5647968.0, "step": 3105 }, { "entropy": 6.074917793273926, "epoch": 2.672110012892136, "grad_norm": 1.0625, "learning_rate": 0.00045769098029283526, "loss": 5.8675, "mean_token_accuracy": 0.16595781594514847, "num_tokens": 5657543.0, "step": 3110 }, { "entropy": 6.098982810974121, "epoch": 2.6764073914911903, "grad_norm": 1.1328125, "learning_rate": 0.0004574967063256115, "loss": 5.7801, "mean_token_accuracy": 0.17747018337249756, "num_tokens": 5666535.0, "step": 3115 }, { "entropy": 6.071979904174805, "epoch": 2.680704770090245, "grad_norm": 1.125, "learning_rate": 0.00045730203385852447, "loss": 5.8624, "mean_token_accuracy": 0.17044119387865067, "num_tokens": 5676273.0, "step": 3120 }, { "entropy": 5.99071159362793, "epoch": 2.6850021486892994, "grad_norm": 1.0546875, "learning_rate": 0.000457106963316661, "loss": 5.7612, "mean_token_accuracy": 0.177787147462368, "num_tokens": 5684888.0, "step": 3125 }, { "entropy": 6.075513315200806, "epoch": 2.689299527288354, "grad_norm": 1.0859375, "learning_rate": 0.00045691149512597717, "loss": 5.8228, "mean_token_accuracy": 0.17054860144853592, "num_tokens": 5693626.0, "step": 3130 }, { "entropy": 6.097319412231445, "epoch": 2.6935969058874085, "grad_norm": 1.3671875, "learning_rate": 0.00045671562971329736, "loss": 5.7252, "mean_token_accuracy": 0.18006587252020836, "num_tokens": 5702542.0, "step": 3135 }, { "entropy": 5.990765905380249, "epoch": 2.6978942844864635, "grad_norm": 1.1875, "learning_rate": 0.00045651936750631337, "loss": 5.7717, "mean_token_accuracy": 0.17453034669160844, "num_tokens": 5711440.0, "step": 3140 }, { "entropy": 6.161540746688843, "epoch": 2.702191663085518, "grad_norm": 1.0546875, "learning_rate": 0.00045632270893358333, "loss": 5.8408, "mean_token_accuracy": 0.17016429752111434, "num_tokens": 5721495.0, "step": 3145 }, { "entropy": 6.113433980941773, "epoch": 2.7064890416845726, "grad_norm": 1.1171875, "learning_rate": 0.0004561256544245312, "loss": 5.8702, "mean_token_accuracy": 0.16516700088977815, "num_tokens": 5730664.0, "step": 3150 }, { "entropy": 6.01002688407898, "epoch": 2.710786420283627, "grad_norm": 1.0859375, "learning_rate": 0.000455928204409445, "loss": 5.7398, "mean_token_accuracy": 0.18062053769826888, "num_tokens": 5740229.0, "step": 3155 }, { "entropy": 6.008193063735962, "epoch": 2.7150837988826817, "grad_norm": 1.1484375, "learning_rate": 0.00045573035931947684, "loss": 5.7378, "mean_token_accuracy": 0.1744979053735733, "num_tokens": 5748549.0, "step": 3160 }, { "entropy": 6.039789533615112, "epoch": 2.7193811774817362, "grad_norm": 1.125, "learning_rate": 0.0004555321195866411, "loss": 5.6806, "mean_token_accuracy": 0.17732828259468078, "num_tokens": 5757603.0, "step": 3165 }, { "entropy": 6.080458354949951, "epoch": 2.723678556080791, "grad_norm": 1.265625, "learning_rate": 0.0004553334856438143, "loss": 5.8618, "mean_token_accuracy": 0.16976358145475387, "num_tokens": 5767520.0, "step": 3170 }, { "entropy": 6.09289813041687, "epoch": 2.7279759346798453, "grad_norm": 1.0078125, "learning_rate": 0.00045513445792473356, "loss": 5.8583, "mean_token_accuracy": 0.1657076820731163, "num_tokens": 5776778.0, "step": 3175 }, { "entropy": 6.129758882522583, "epoch": 2.7322733132789, "grad_norm": 1.1328125, "learning_rate": 0.0004549350368639958, "loss": 5.8808, "mean_token_accuracy": 0.16651461273431778, "num_tokens": 5785652.0, "step": 3180 }, { "entropy": 6.144531726837158, "epoch": 2.7365706918779544, "grad_norm": 1.078125, "learning_rate": 0.00045473522289705693, "loss": 5.849, "mean_token_accuracy": 0.1734338730573654, "num_tokens": 5795766.0, "step": 3185 }, { "entropy": 5.992430114746094, "epoch": 2.740868070477009, "grad_norm": 1.140625, "learning_rate": 0.00045453501646023085, "loss": 5.8822, "mean_token_accuracy": 0.16669443398714065, "num_tokens": 5804504.0, "step": 3190 }, { "entropy": 6.04656753540039, "epoch": 2.7451654490760635, "grad_norm": 0.94921875, "learning_rate": 0.00045433441799068837, "loss": 5.7879, "mean_token_accuracy": 0.17372047603130342, "num_tokens": 5814161.0, "step": 3195 }, { "entropy": 6.094403076171875, "epoch": 2.749462827675118, "grad_norm": 1.0625, "learning_rate": 0.0004541334279264562, "loss": 5.6942, "mean_token_accuracy": 0.18637760877609252, "num_tokens": 5822235.0, "step": 3200 }, { "entropy": 6.033037233352661, "epoch": 2.7537602062741726, "grad_norm": 1.15625, "learning_rate": 0.00045393204670641656, "loss": 5.7009, "mean_token_accuracy": 0.17470744848251343, "num_tokens": 5831572.0, "step": 3205 }, { "entropy": 5.930374097824097, "epoch": 2.758057584873227, "grad_norm": 1.046875, "learning_rate": 0.0004537302747703055, "loss": 5.7328, "mean_token_accuracy": 0.18409457355737685, "num_tokens": 5839694.0, "step": 3210 }, { "entropy": 6.124579620361328, "epoch": 2.7623549634722817, "grad_norm": 1.1640625, "learning_rate": 0.00045352811255871216, "loss": 5.8448, "mean_token_accuracy": 0.17230847403407096, "num_tokens": 5849131.0, "step": 3215 }, { "entropy": 6.174058246612549, "epoch": 2.7666523420713363, "grad_norm": 0.93359375, "learning_rate": 0.00045332556051307804, "loss": 5.7711, "mean_token_accuracy": 0.1720232903957367, "num_tokens": 5858861.0, "step": 3220 }, { "entropy": 6.060689687728882, "epoch": 2.770949720670391, "grad_norm": 1.078125, "learning_rate": 0.00045312261907569585, "loss": 5.7833, "mean_token_accuracy": 0.17473076432943344, "num_tokens": 5867585.0, "step": 3225 }, { "entropy": 6.015534782409668, "epoch": 2.775247099269446, "grad_norm": 1.078125, "learning_rate": 0.00045291928868970867, "loss": 5.7865, "mean_token_accuracy": 0.16985856741666794, "num_tokens": 5876256.0, "step": 3230 }, { "entropy": 6.038353967666626, "epoch": 2.7795444778685003, "grad_norm": 1.0703125, "learning_rate": 0.0004527155697991087, "loss": 5.8471, "mean_token_accuracy": 0.16595111042261124, "num_tokens": 5885302.0, "step": 3235 }, { "entropy": 6.021387720108033, "epoch": 2.783841856467555, "grad_norm": 0.94921875, "learning_rate": 0.0004525114628487365, "loss": 5.8628, "mean_token_accuracy": 0.16938397139310837, "num_tokens": 5895066.0, "step": 3240 }, { "entropy": 6.151897144317627, "epoch": 2.7881392350666094, "grad_norm": 1.0546875, "learning_rate": 0.00045230696828428026, "loss": 5.8557, "mean_token_accuracy": 0.16848236918449402, "num_tokens": 5903258.0, "step": 3245 }, { "entropy": 6.039195203781128, "epoch": 2.792436613665664, "grad_norm": 1.15625, "learning_rate": 0.0004521020865522742, "loss": 5.7511, "mean_token_accuracy": 0.1710444927215576, "num_tokens": 5911714.0, "step": 3250 }, { "entropy": 5.995623922348022, "epoch": 2.7967339922647185, "grad_norm": 1.0859375, "learning_rate": 0.00045189681810009827, "loss": 5.8176, "mean_token_accuracy": 0.17098681181669234, "num_tokens": 5920432.0, "step": 3255 }, { "entropy": 6.183896541595459, "epoch": 2.801031370863773, "grad_norm": 1.234375, "learning_rate": 0.00045169116337597653, "loss": 5.8195, "mean_token_accuracy": 0.1705167680978775, "num_tokens": 5929202.0, "step": 3260 }, { "entropy": 6.115947484970093, "epoch": 2.8053287494628276, "grad_norm": 1.1640625, "learning_rate": 0.000451485122828977, "loss": 5.8601, "mean_token_accuracy": 0.16596955806016922, "num_tokens": 5938034.0, "step": 3265 }, { "entropy": 5.939763784408569, "epoch": 2.809626128061882, "grad_norm": 1.015625, "learning_rate": 0.00045127869690900956, "loss": 5.7097, "mean_token_accuracy": 0.18104571253061294, "num_tokens": 5946944.0, "step": 3270 }, { "entropy": 6.011037588119507, "epoch": 2.8139235066609367, "grad_norm": 1.21875, "learning_rate": 0.00045107188606682613, "loss": 5.8219, "mean_token_accuracy": 0.17439836859703065, "num_tokens": 5956475.0, "step": 3275 }, { "entropy": 6.145265722274781, "epoch": 2.8182208852599913, "grad_norm": 1.125, "learning_rate": 0.0004508646907540188, "loss": 5.7788, "mean_token_accuracy": 0.1669231042265892, "num_tokens": 5965814.0, "step": 3280 }, { "entropy": 6.047088956832885, "epoch": 2.8225182638590463, "grad_norm": 1.1484375, "learning_rate": 0.0004506571114230195, "loss": 5.839, "mean_token_accuracy": 0.16392946541309356, "num_tokens": 5973850.0, "step": 3285 }, { "entropy": 5.966979217529297, "epoch": 2.826815642458101, "grad_norm": 1.0, "learning_rate": 0.00045044914852709824, "loss": 5.7718, "mean_token_accuracy": 0.17040073573589326, "num_tokens": 5982987.0, "step": 3290 }, { "entropy": 6.124370384216308, "epoch": 2.8311130210571553, "grad_norm": 1.1328125, "learning_rate": 0.0004502408025203631, "loss": 5.7567, "mean_token_accuracy": 0.18116641640663148, "num_tokens": 5992227.0, "step": 3295 }, { "entropy": 6.060417222976684, "epoch": 2.83541039965621, "grad_norm": 1.046875, "learning_rate": 0.0004500320738577584, "loss": 5.7373, "mean_token_accuracy": 0.18134199529886247, "num_tokens": 6000243.0, "step": 3300 }, { "entropy": 6.009725856781006, "epoch": 2.8397077782552644, "grad_norm": 1.1171875, "learning_rate": 0.00044982296299506407, "loss": 5.7396, "mean_token_accuracy": 0.1772996261715889, "num_tokens": 6009771.0, "step": 3305 }, { "entropy": 6.065958309173584, "epoch": 2.844005156854319, "grad_norm": 1.203125, "learning_rate": 0.0004496134703888948, "loss": 5.8227, "mean_token_accuracy": 0.17212264090776444, "num_tokens": 6018683.0, "step": 3310 }, { "entropy": 6.064363861083985, "epoch": 2.8483025354533735, "grad_norm": 1.0859375, "learning_rate": 0.00044940359649669846, "loss": 5.6744, "mean_token_accuracy": 0.18227704763412475, "num_tokens": 6027422.0, "step": 3315 }, { "entropy": 5.975715494155883, "epoch": 2.852599914052428, "grad_norm": 1.0859375, "learning_rate": 0.00044919334177675595, "loss": 5.7633, "mean_token_accuracy": 0.1744269087910652, "num_tokens": 6035670.0, "step": 3320 }, { "entropy": 6.036557149887085, "epoch": 2.8568972926514826, "grad_norm": 1.1015625, "learning_rate": 0.00044898270668817955, "loss": 5.6979, "mean_token_accuracy": 0.1815047174692154, "num_tokens": 6044092.0, "step": 3325 }, { "entropy": 6.020293951034546, "epoch": 2.861194671250537, "grad_norm": 0.99609375, "learning_rate": 0.000448771691690912, "loss": 5.7773, "mean_token_accuracy": 0.16777832806110382, "num_tokens": 6053970.0, "step": 3330 }, { "entropy": 6.0354420185089115, "epoch": 2.8654920498495917, "grad_norm": 1.0546875, "learning_rate": 0.0004485602972457257, "loss": 5.7383, "mean_token_accuracy": 0.1797216445207596, "num_tokens": 6062965.0, "step": 3335 }, { "entropy": 6.068978786468506, "epoch": 2.8697894284486463, "grad_norm": 1.0859375, "learning_rate": 0.00044834852381422165, "loss": 5.8049, "mean_token_accuracy": 0.17490418255329132, "num_tokens": 6072420.0, "step": 3340 }, { "entropy": 6.005989837646484, "epoch": 2.874086807047701, "grad_norm": 1.078125, "learning_rate": 0.00044813637185882836, "loss": 5.7175, "mean_token_accuracy": 0.17540892213582993, "num_tokens": 6080915.0, "step": 3345 }, { "entropy": 6.08394684791565, "epoch": 2.8783841856467554, "grad_norm": 1.1953125, "learning_rate": 0.00044792384184280106, "loss": 5.8546, "mean_token_accuracy": 0.16469819992780685, "num_tokens": 6090453.0, "step": 3350 }, { "entropy": 6.007422256469726, "epoch": 2.88268156424581, "grad_norm": 1.1171875, "learning_rate": 0.00044771093423022013, "loss": 5.8795, "mean_token_accuracy": 0.16327449679374695, "num_tokens": 6099390.0, "step": 3355 }, { "entropy": 6.039740133285522, "epoch": 2.8869789428448644, "grad_norm": 1.03125, "learning_rate": 0.0004474976494859909, "loss": 5.798, "mean_token_accuracy": 0.17494250684976578, "num_tokens": 6108677.0, "step": 3360 }, { "entropy": 6.058880233764649, "epoch": 2.891276321443919, "grad_norm": 0.98828125, "learning_rate": 0.0004472839880758419, "loss": 5.716, "mean_token_accuracy": 0.17659443318843843, "num_tokens": 6117151.0, "step": 3365 }, { "entropy": 6.116930532455444, "epoch": 2.8955737000429735, "grad_norm": 1.140625, "learning_rate": 0.0004470699504663242, "loss": 5.8387, "mean_token_accuracy": 0.16731317192316056, "num_tokens": 6127167.0, "step": 3370 }, { "entropy": 6.032348012924194, "epoch": 2.899871078642028, "grad_norm": 1.0703125, "learning_rate": 0.0004468555371248104, "loss": 5.7315, "mean_token_accuracy": 0.1812448814511299, "num_tokens": 6136487.0, "step": 3375 }, { "entropy": 6.042793130874633, "epoch": 2.904168457241083, "grad_norm": 1.0546875, "learning_rate": 0.0004466407485194937, "loss": 5.8432, "mean_token_accuracy": 0.16948612183332443, "num_tokens": 6145334.0, "step": 3380 }, { "entropy": 6.0380902767181395, "epoch": 2.9084658358401376, "grad_norm": 1.0546875, "learning_rate": 0.0004464255851193864, "loss": 5.7558, "mean_token_accuracy": 0.17524855434894562, "num_tokens": 6155062.0, "step": 3385 }, { "entropy": 6.0504677295684814, "epoch": 2.912763214439192, "grad_norm": 1.84375, "learning_rate": 0.0004462100473943194, "loss": 5.6948, "mean_token_accuracy": 0.1831045612692833, "num_tokens": 6164313.0, "step": 3390 }, { "entropy": 6.009590721130371, "epoch": 2.9170605930382467, "grad_norm": 1.0234375, "learning_rate": 0.000445994135814941, "loss": 5.7596, "mean_token_accuracy": 0.17282265722751616, "num_tokens": 6173513.0, "step": 3395 }, { "entropy": 6.019471836090088, "epoch": 2.9213579716373013, "grad_norm": 1.2578125, "learning_rate": 0.00044577785085271566, "loss": 5.7717, "mean_token_accuracy": 0.17321082055568696, "num_tokens": 6182000.0, "step": 3400 }, { "entropy": 6.081956481933593, "epoch": 2.925655350236356, "grad_norm": 1.0, "learning_rate": 0.0004455611929799235, "loss": 5.8084, "mean_token_accuracy": 0.16455612033605577, "num_tokens": 6191887.0, "step": 3405 }, { "entropy": 5.956260538101196, "epoch": 2.9299527288354104, "grad_norm": 1.0234375, "learning_rate": 0.0004453441626696585, "loss": 5.8554, "mean_token_accuracy": 0.16420858800411225, "num_tokens": 6202897.0, "step": 3410 }, { "entropy": 6.078850793838501, "epoch": 2.934250107434465, "grad_norm": 1.0625, "learning_rate": 0.00044512676039582823, "loss": 5.7438, "mean_token_accuracy": 0.18065994828939438, "num_tokens": 6211811.0, "step": 3415 }, { "entropy": 6.1270510196685795, "epoch": 2.9385474860335195, "grad_norm": 1.125, "learning_rate": 0.0004449089866331524, "loss": 5.7475, "mean_token_accuracy": 0.1779635578393936, "num_tokens": 6219896.0, "step": 3420 }, { "entropy": 5.887205791473389, "epoch": 2.942844864632574, "grad_norm": 1.109375, "learning_rate": 0.0004446908418571617, "loss": 5.737, "mean_token_accuracy": 0.17775188386440277, "num_tokens": 6228212.0, "step": 3425 }, { "entropy": 6.039950180053711, "epoch": 2.9471422432316285, "grad_norm": 1.046875, "learning_rate": 0.0004444723265441973, "loss": 5.896, "mean_token_accuracy": 0.16747722327709197, "num_tokens": 6238133.0, "step": 3430 }, { "entropy": 6.076528787612915, "epoch": 2.9514396218306835, "grad_norm": 0.9765625, "learning_rate": 0.0004442534411714092, "loss": 5.7945, "mean_token_accuracy": 0.16944347620010375, "num_tokens": 6247331.0, "step": 3435 }, { "entropy": 6.101355123519897, "epoch": 2.955737000429738, "grad_norm": 1.125, "learning_rate": 0.00044403418621675555, "loss": 5.7926, "mean_token_accuracy": 0.17052113264799118, "num_tokens": 6255280.0, "step": 3440 }, { "entropy": 6.050349187850952, "epoch": 2.9600343790287926, "grad_norm": 1.0625, "learning_rate": 0.0004438145621590017, "loss": 5.7555, "mean_token_accuracy": 0.17491218596696853, "num_tokens": 6264752.0, "step": 3445 }, { "entropy": 5.9742930889129635, "epoch": 2.964331757627847, "grad_norm": 1.140625, "learning_rate": 0.00044359456947771857, "loss": 5.7023, "mean_token_accuracy": 0.17401786297559738, "num_tokens": 6273258.0, "step": 3450 }, { "entropy": 5.869597768783569, "epoch": 2.9686291362269017, "grad_norm": 1.21875, "learning_rate": 0.0004433742086532824, "loss": 5.6228, "mean_token_accuracy": 0.19265587478876114, "num_tokens": 6281584.0, "step": 3455 }, { "entropy": 6.010894346237182, "epoch": 2.9729265148259563, "grad_norm": 1.203125, "learning_rate": 0.00044315348016687317, "loss": 5.7472, "mean_token_accuracy": 0.17217940390110015, "num_tokens": 6290016.0, "step": 3460 }, { "entropy": 5.982230138778687, "epoch": 2.977223893425011, "grad_norm": 1.0703125, "learning_rate": 0.0004429323845004736, "loss": 5.6523, "mean_token_accuracy": 0.18324829190969466, "num_tokens": 6298569.0, "step": 3465 }, { "entropy": 6.025563192367554, "epoch": 2.9815212720240654, "grad_norm": 1.015625, "learning_rate": 0.00044271092213686824, "loss": 5.6855, "mean_token_accuracy": 0.18166320472955705, "num_tokens": 6307684.0, "step": 3470 }, { "entropy": 6.1143230438232425, "epoch": 2.98581865062312, "grad_norm": 0.98046875, "learning_rate": 0.00044248909355964247, "loss": 5.8192, "mean_token_accuracy": 0.17195742726325988, "num_tokens": 6317767.0, "step": 3475 }, { "entropy": 6.101650047302246, "epoch": 2.9901160292221745, "grad_norm": 1.140625, "learning_rate": 0.00044226689925318117, "loss": 5.8454, "mean_token_accuracy": 0.16614989936351776, "num_tokens": 6327457.0, "step": 3480 }, { "entropy": 5.9672809600830075, "epoch": 2.994413407821229, "grad_norm": 1.0703125, "learning_rate": 0.00044204433970266785, "loss": 5.6491, "mean_token_accuracy": 0.1888262301683426, "num_tokens": 6335747.0, "step": 3485 }, { "entropy": 5.983137941360473, "epoch": 2.9987107864202835, "grad_norm": 1.078125, "learning_rate": 0.0004418214153940837, "loss": 5.7429, "mean_token_accuracy": 0.18020158410072326, "num_tokens": 6344750.0, "step": 3490 }, { "entropy": 6.069730917612712, "epoch": 3.002578427159433, "grad_norm": 0.9140625, "learning_rate": 0.00044159812681420624, "loss": 5.6774, "mean_token_accuracy": 0.18010619613859388, "num_tokens": 6354779.0, "step": 3495 }, { "entropy": 6.04861307144165, "epoch": 3.0068758057584875, "grad_norm": 1.109375, "learning_rate": 0.0004413744744506086, "loss": 5.4671, "mean_token_accuracy": 0.18785551339387893, "num_tokens": 6363809.0, "step": 3500 }, { "epoch": 3.0068758057584875, "eval_entropy": 5.758008357640859, "eval_loss": 6.010996341705322, "eval_mean_token_accuracy": 0.17076294478196818, "eval_num_tokens": 6363809.0, "eval_runtime": 2.0461, "eval_samples_per_second": 1734.535, "eval_steps_per_second": 217.0, "step": 3500 }, { "entropy": 5.996388673782349, "epoch": 3.011173184357542, "grad_norm": 1.078125, "learning_rate": 0.00044115045879165806, "loss": 5.5232, "mean_token_accuracy": 0.18338106274604798, "num_tokens": 6373082.0, "step": 3505 }, { "entropy": 6.007624244689941, "epoch": 3.0154705629565965, "grad_norm": 1.1328125, "learning_rate": 0.00044092608032651515, "loss": 5.4884, "mean_token_accuracy": 0.18493928611278534, "num_tokens": 6381286.0, "step": 3510 }, { "entropy": 6.025672864913941, "epoch": 3.019767941555651, "grad_norm": 1.0, "learning_rate": 0.00044070133954513305, "loss": 5.4331, "mean_token_accuracy": 0.19577560871839522, "num_tokens": 6390217.0, "step": 3515 }, { "entropy": 6.000850200653076, "epoch": 3.0240653201547056, "grad_norm": 1.2421875, "learning_rate": 0.0004404762369382555, "loss": 5.4683, "mean_token_accuracy": 0.18827376067638396, "num_tokens": 6399276.0, "step": 3520 }, { "entropy": 5.93954758644104, "epoch": 3.02836269875376, "grad_norm": 1.1640625, "learning_rate": 0.00044025077299741683, "loss": 5.4445, "mean_token_accuracy": 0.1970704421401024, "num_tokens": 6407981.0, "step": 3525 }, { "entropy": 5.957830190658569, "epoch": 3.0326600773528147, "grad_norm": 1.1484375, "learning_rate": 0.00044002494821494007, "loss": 5.4438, "mean_token_accuracy": 0.1922784000635147, "num_tokens": 6416159.0, "step": 3530 }, { "entropy": 5.9005261898040775, "epoch": 3.0369574559518693, "grad_norm": 1.109375, "learning_rate": 0.00043979876308393635, "loss": 5.4964, "mean_token_accuracy": 0.19178588539361954, "num_tokens": 6424564.0, "step": 3535 }, { "entropy": 6.050303220748901, "epoch": 3.041254834550924, "grad_norm": 1.0625, "learning_rate": 0.0004395722180983036, "loss": 5.5298, "mean_token_accuracy": 0.1850397542119026, "num_tokens": 6434163.0, "step": 3540 }, { "entropy": 5.894874811172485, "epoch": 3.0455522131499784, "grad_norm": 1.1171875, "learning_rate": 0.00043934531375272535, "loss": 5.3505, "mean_token_accuracy": 0.20217433124780654, "num_tokens": 6443372.0, "step": 3545 }, { "entropy": 5.910711765289307, "epoch": 3.049849591749033, "grad_norm": 1.0, "learning_rate": 0.00043911805054267015, "loss": 5.4569, "mean_token_accuracy": 0.19326651990413665, "num_tokens": 6452638.0, "step": 3550 }, { "entropy": 6.0943183422088625, "epoch": 3.0541469703480875, "grad_norm": 1.0703125, "learning_rate": 0.00043889042896439004, "loss": 5.4504, "mean_token_accuracy": 0.19366029798984527, "num_tokens": 6461319.0, "step": 3555 }, { "entropy": 5.933924388885498, "epoch": 3.0584443489471425, "grad_norm": 1.2890625, "learning_rate": 0.00043866244951491946, "loss": 5.3807, "mean_token_accuracy": 0.20453428626060485, "num_tokens": 6469506.0, "step": 3560 }, { "entropy": 5.953407573699951, "epoch": 3.062741727546197, "grad_norm": 1.1171875, "learning_rate": 0.00043843411269207445, "loss": 5.437, "mean_token_accuracy": 0.1967134654521942, "num_tokens": 6478404.0, "step": 3565 }, { "entropy": 5.93691873550415, "epoch": 3.0670391061452515, "grad_norm": 1.09375, "learning_rate": 0.0004382054189944514, "loss": 5.3816, "mean_token_accuracy": 0.19278321117162706, "num_tokens": 6487447.0, "step": 3570 }, { "entropy": 5.878192138671875, "epoch": 3.071336484744306, "grad_norm": 1.046875, "learning_rate": 0.0004379763689214259, "loss": 5.4196, "mean_token_accuracy": 0.18803493976593016, "num_tokens": 6496738.0, "step": 3575 }, { "entropy": 5.99528021812439, "epoch": 3.0756338633433606, "grad_norm": 0.984375, "learning_rate": 0.0004377469629731518, "loss": 5.4317, "mean_token_accuracy": 0.193548683822155, "num_tokens": 6505848.0, "step": 3580 }, { "entropy": 5.963389873504639, "epoch": 3.079931241942415, "grad_norm": 1.0390625, "learning_rate": 0.0004375172016505599, "loss": 5.4138, "mean_token_accuracy": 0.19534891694784165, "num_tokens": 6515731.0, "step": 3585 }, { "entropy": 5.967396306991577, "epoch": 3.0842286205414697, "grad_norm": 1.0390625, "learning_rate": 0.0004372870854553572, "loss": 5.4706, "mean_token_accuracy": 0.194082772731781, "num_tokens": 6524914.0, "step": 3590 }, { "entropy": 5.949575996398925, "epoch": 3.0885259991405243, "grad_norm": 1.078125, "learning_rate": 0.0004370566148900255, "loss": 5.4527, "mean_token_accuracy": 0.19753452241420746, "num_tokens": 6533712.0, "step": 3595 }, { "entropy": 5.982216501235962, "epoch": 3.092823377739579, "grad_norm": 1.0859375, "learning_rate": 0.00043682579045782024, "loss": 5.5375, "mean_token_accuracy": 0.18995364159345626, "num_tokens": 6543313.0, "step": 3600 }, { "entropy": 5.923231220245361, "epoch": 3.0971207563386334, "grad_norm": 1.203125, "learning_rate": 0.0004365946126627699, "loss": 5.4189, "mean_token_accuracy": 0.2017338365316391, "num_tokens": 6551634.0, "step": 3605 }, { "entropy": 5.981328535079956, "epoch": 3.101418134937688, "grad_norm": 1.0859375, "learning_rate": 0.00043636308200967433, "loss": 5.4241, "mean_token_accuracy": 0.2000526711344719, "num_tokens": 6560695.0, "step": 3610 }, { "entropy": 5.813827800750732, "epoch": 3.1057155135367425, "grad_norm": 1.0390625, "learning_rate": 0.0004361311990041039, "loss": 5.3344, "mean_token_accuracy": 0.1969393327832222, "num_tokens": 6569086.0, "step": 3615 }, { "entropy": 5.89613938331604, "epoch": 3.110012892135797, "grad_norm": 1.09375, "learning_rate": 0.00043589896415239843, "loss": 5.4161, "mean_token_accuracy": 0.19979367852211, "num_tokens": 6578287.0, "step": 3620 }, { "entropy": 5.952451086044311, "epoch": 3.1143102707348516, "grad_norm": 1.015625, "learning_rate": 0.00043566637796166595, "loss": 5.4753, "mean_token_accuracy": 0.19049297720193864, "num_tokens": 6587015.0, "step": 3625 }, { "entropy": 5.962173509597778, "epoch": 3.118607649333906, "grad_norm": 1.1796875, "learning_rate": 0.00043543344093978186, "loss": 5.5175, "mean_token_accuracy": 0.18538623303174973, "num_tokens": 6596187.0, "step": 3630 }, { "entropy": 5.912763595581055, "epoch": 3.122905027932961, "grad_norm": 1.109375, "learning_rate": 0.00043520015359538745, "loss": 5.3898, "mean_token_accuracy": 0.19703881144523622, "num_tokens": 6605226.0, "step": 3635 }, { "entropy": 5.858266019821167, "epoch": 3.1272024065320156, "grad_norm": 1.1328125, "learning_rate": 0.0004349665164378891, "loss": 5.4371, "mean_token_accuracy": 0.1935065433382988, "num_tokens": 6613232.0, "step": 3640 }, { "entropy": 5.901705503463745, "epoch": 3.13149978513107, "grad_norm": 1.140625, "learning_rate": 0.00043473252997745684, "loss": 5.4392, "mean_token_accuracy": 0.19095546007156372, "num_tokens": 6622247.0, "step": 3645 }, { "entropy": 5.985095024108887, "epoch": 3.1357971637301247, "grad_norm": 1.6171875, "learning_rate": 0.00043449819472502366, "loss": 5.3871, "mean_token_accuracy": 0.19558012783527373, "num_tokens": 6630883.0, "step": 3650 }, { "entropy": 5.880083703994751, "epoch": 3.1400945423291793, "grad_norm": 1.1328125, "learning_rate": 0.0004342635111922841, "loss": 5.5374, "mean_token_accuracy": 0.19031234830617905, "num_tokens": 6639399.0, "step": 3655 }, { "entropy": 5.943640947341919, "epoch": 3.144391920928234, "grad_norm": 1.140625, "learning_rate": 0.0004340284798916931, "loss": 5.433, "mean_token_accuracy": 0.19181231111288072, "num_tokens": 6649288.0, "step": 3660 }, { "entropy": 5.898565721511841, "epoch": 3.1486892995272884, "grad_norm": 1.03125, "learning_rate": 0.0004337931013364653, "loss": 5.3804, "mean_token_accuracy": 0.19686342626810074, "num_tokens": 6658670.0, "step": 3665 }, { "entropy": 5.893218088150024, "epoch": 3.152986678126343, "grad_norm": 1.125, "learning_rate": 0.000433557376040573, "loss": 5.4538, "mean_token_accuracy": 0.196715846657753, "num_tokens": 6667302.0, "step": 3670 }, { "entropy": 5.956412696838379, "epoch": 3.1572840567253975, "grad_norm": 1.09375, "learning_rate": 0.00043332130451874645, "loss": 5.4965, "mean_token_accuracy": 0.1952889919281006, "num_tokens": 6677393.0, "step": 3675 }, { "entropy": 5.933012199401856, "epoch": 3.161581435324452, "grad_norm": 0.96875, "learning_rate": 0.00043308488728647127, "loss": 5.4744, "mean_token_accuracy": 0.1893087148666382, "num_tokens": 6686727.0, "step": 3680 }, { "entropy": 5.877901268005371, "epoch": 3.1658788139235066, "grad_norm": 1.1875, "learning_rate": 0.0004328481248599882, "loss": 5.3869, "mean_token_accuracy": 0.19530683755874634, "num_tokens": 6696116.0, "step": 3685 }, { "entropy": 5.934775876998901, "epoch": 3.170176192522561, "grad_norm": 1.078125, "learning_rate": 0.0004326110177562918, "loss": 5.4945, "mean_token_accuracy": 0.18531183749437333, "num_tokens": 6704640.0, "step": 3690 }, { "entropy": 5.860188579559326, "epoch": 3.1744735711216157, "grad_norm": 1.2265625, "learning_rate": 0.00043237356649312926, "loss": 5.3497, "mean_token_accuracy": 0.20377567410469055, "num_tokens": 6713663.0, "step": 3695 }, { "entropy": 5.891902303695678, "epoch": 3.17877094972067, "grad_norm": 1.0703125, "learning_rate": 0.0004321357715889991, "loss": 5.4868, "mean_token_accuracy": 0.18891167342662812, "num_tokens": 6722965.0, "step": 3700 }, { "entropy": 5.922307395935059, "epoch": 3.1830683283197247, "grad_norm": 1.140625, "learning_rate": 0.0004318976335631505, "loss": 5.4553, "mean_token_accuracy": 0.19657856673002244, "num_tokens": 6732776.0, "step": 3705 }, { "entropy": 5.93743257522583, "epoch": 3.1873657069187797, "grad_norm": 1.0703125, "learning_rate": 0.00043165915293558155, "loss": 5.4328, "mean_token_accuracy": 0.19283491969108582, "num_tokens": 6741309.0, "step": 3710 }, { "entropy": 5.882253980636596, "epoch": 3.1916630855178343, "grad_norm": 1.0703125, "learning_rate": 0.0004314203302270388, "loss": 5.4972, "mean_token_accuracy": 0.19080058485269547, "num_tokens": 6750584.0, "step": 3715 }, { "entropy": 5.9671601295471195, "epoch": 3.195960464116889, "grad_norm": 1.1953125, "learning_rate": 0.0004311811659590154, "loss": 5.4717, "mean_token_accuracy": 0.19037737101316451, "num_tokens": 6759344.0, "step": 3720 }, { "entropy": 5.958385229110718, "epoch": 3.2002578427159434, "grad_norm": 0.90234375, "learning_rate": 0.0004309416606537507, "loss": 5.6122, "mean_token_accuracy": 0.18305473029613495, "num_tokens": 6770345.0, "step": 3725 }, { "entropy": 5.974073982238769, "epoch": 3.204555221314998, "grad_norm": 1.1328125, "learning_rate": 0.00043070181483422843, "loss": 5.5015, "mean_token_accuracy": 0.18963303416967392, "num_tokens": 6779991.0, "step": 3730 }, { "entropy": 5.910816812515259, "epoch": 3.2088525999140525, "grad_norm": 1.1875, "learning_rate": 0.000430461629024176, "loss": 5.4509, "mean_token_accuracy": 0.19442620873451233, "num_tokens": 6788972.0, "step": 3735 }, { "entropy": 5.827464151382446, "epoch": 3.213149978513107, "grad_norm": 1.203125, "learning_rate": 0.0004302211037480634, "loss": 5.3772, "mean_token_accuracy": 0.19249555021524428, "num_tokens": 6796967.0, "step": 3740 }, { "entropy": 5.8557556629180905, "epoch": 3.2174473571121616, "grad_norm": 1.2421875, "learning_rate": 0.0004299802395311015, "loss": 5.4743, "mean_token_accuracy": 0.19575096070766448, "num_tokens": 6805961.0, "step": 3745 }, { "entropy": 5.86885895729065, "epoch": 3.221744735711216, "grad_norm": 1.265625, "learning_rate": 0.0004297390368992414, "loss": 5.3787, "mean_token_accuracy": 0.19657269567251207, "num_tokens": 6814657.0, "step": 3750 }, { "entropy": 5.923396444320678, "epoch": 3.2260421143102707, "grad_norm": 1.2109375, "learning_rate": 0.00042949749637917353, "loss": 5.4168, "mean_token_accuracy": 0.1941995695233345, "num_tokens": 6823095.0, "step": 3755 }, { "entropy": 5.889871215820312, "epoch": 3.230339492909325, "grad_norm": 1.0, "learning_rate": 0.0004292556184983256, "loss": 5.4421, "mean_token_accuracy": 0.1958567351102829, "num_tokens": 6832195.0, "step": 3760 }, { "entropy": 5.969770717620849, "epoch": 3.2346368715083798, "grad_norm": 1.203125, "learning_rate": 0.0004290134037848623, "loss": 5.575, "mean_token_accuracy": 0.1806161344051361, "num_tokens": 6840922.0, "step": 3765 }, { "entropy": 5.95328722000122, "epoch": 3.2389342501074343, "grad_norm": 1.1953125, "learning_rate": 0.00042877085276768386, "loss": 5.4178, "mean_token_accuracy": 0.20026799887418748, "num_tokens": 6849182.0, "step": 3770 }, { "entropy": 5.878392887115479, "epoch": 3.243231628706489, "grad_norm": 1.09375, "learning_rate": 0.00042852796597642455, "loss": 5.408, "mean_token_accuracy": 0.19837529808282853, "num_tokens": 6857932.0, "step": 3775 }, { "entropy": 5.953036594390869, "epoch": 3.247529007305544, "grad_norm": 1.125, "learning_rate": 0.0004282847439414522, "loss": 5.5606, "mean_token_accuracy": 0.18480827659368515, "num_tokens": 6867283.0, "step": 3780 }, { "entropy": 5.960994386672974, "epoch": 3.2518263859045984, "grad_norm": 1.1015625, "learning_rate": 0.0004280411871938664, "loss": 5.5237, "mean_token_accuracy": 0.18728075176477432, "num_tokens": 6876123.0, "step": 3785 }, { "entropy": 5.969729375839234, "epoch": 3.256123764503653, "grad_norm": 1.203125, "learning_rate": 0.0004277972962654979, "loss": 5.4539, "mean_token_accuracy": 0.19224015027284622, "num_tokens": 6885239.0, "step": 3790 }, { "entropy": 5.8932037353515625, "epoch": 3.2604211431027075, "grad_norm": 1.0859375, "learning_rate": 0.0004275530716889069, "loss": 5.5146, "mean_token_accuracy": 0.18382496386766434, "num_tokens": 6895061.0, "step": 3795 }, { "entropy": 5.947560358047485, "epoch": 3.264718521701762, "grad_norm": 1.234375, "learning_rate": 0.0004273085139973822, "loss": 5.5657, "mean_token_accuracy": 0.1781401515007019, "num_tokens": 6903828.0, "step": 3800 }, { "entropy": 5.989862442016602, "epoch": 3.2690159003008166, "grad_norm": 1.1484375, "learning_rate": 0.0004270636237249401, "loss": 5.4777, "mean_token_accuracy": 0.18864577561616896, "num_tokens": 6912805.0, "step": 3805 }, { "entropy": 5.920726633071899, "epoch": 3.273313278899871, "grad_norm": 1.1328125, "learning_rate": 0.00042681840140632314, "loss": 5.5243, "mean_token_accuracy": 0.18295771330595018, "num_tokens": 6922165.0, "step": 3810 }, { "entropy": 5.956135368347168, "epoch": 3.2776106574989257, "grad_norm": 1.0859375, "learning_rate": 0.0004265728475769989, "loss": 5.4939, "mean_token_accuracy": 0.1879052475094795, "num_tokens": 6931677.0, "step": 3815 }, { "entropy": 5.918879604339599, "epoch": 3.28190803609798, "grad_norm": 1.0078125, "learning_rate": 0.0004263269627731586, "loss": 5.452, "mean_token_accuracy": 0.192815600335598, "num_tokens": 6940486.0, "step": 3820 }, { "entropy": 5.825883960723877, "epoch": 3.2862054146970348, "grad_norm": 1.125, "learning_rate": 0.0004260807475317164, "loss": 5.4745, "mean_token_accuracy": 0.18577916026115418, "num_tokens": 6948990.0, "step": 3825 }, { "entropy": 5.979638195037841, "epoch": 3.2905027932960893, "grad_norm": 1.0234375, "learning_rate": 0.0004258342023903081, "loss": 5.5953, "mean_token_accuracy": 0.18115273416042327, "num_tokens": 6959311.0, "step": 3830 }, { "entropy": 5.957726049423218, "epoch": 3.294800171895144, "grad_norm": 1.125, "learning_rate": 0.00042558732788728975, "loss": 5.3649, "mean_token_accuracy": 0.20235307216644288, "num_tokens": 6968619.0, "step": 3835 }, { "entropy": 5.867683029174804, "epoch": 3.2990975504941984, "grad_norm": 1.109375, "learning_rate": 0.00042534012456173643, "loss": 5.4398, "mean_token_accuracy": 0.1914222314953804, "num_tokens": 6977469.0, "step": 3840 }, { "entropy": 5.841017484664917, "epoch": 3.303394929093253, "grad_norm": 1.2578125, "learning_rate": 0.00042509259295344157, "loss": 5.4285, "mean_token_accuracy": 0.18821925073862075, "num_tokens": 6986772.0, "step": 3845 }, { "entropy": 5.910496807098388, "epoch": 3.3076923076923075, "grad_norm": 1.265625, "learning_rate": 0.00042484473360291514, "loss": 5.4393, "mean_token_accuracy": 0.19060401618480682, "num_tokens": 6993937.0, "step": 3850 }, { "entropy": 5.86693377494812, "epoch": 3.311989686291362, "grad_norm": 1.15625, "learning_rate": 0.00042459654705138294, "loss": 5.497, "mean_token_accuracy": 0.19289185404777526, "num_tokens": 7003222.0, "step": 3855 }, { "entropy": 5.9119508266448975, "epoch": 3.316287064890417, "grad_norm": 1.125, "learning_rate": 0.0004243480338407853, "loss": 5.4532, "mean_token_accuracy": 0.19899186342954636, "num_tokens": 7012055.0, "step": 3860 }, { "entropy": 5.892843008041382, "epoch": 3.3205844434894716, "grad_norm": 1.109375, "learning_rate": 0.0004240991945137755, "loss": 5.4592, "mean_token_accuracy": 0.19213219434022905, "num_tokens": 7021036.0, "step": 3865 }, { "entropy": 5.882801103591919, "epoch": 3.324881822088526, "grad_norm": 1.15625, "learning_rate": 0.00042385002961371944, "loss": 5.4441, "mean_token_accuracy": 0.19504359364509583, "num_tokens": 7030450.0, "step": 3870 }, { "entropy": 5.978818750381469, "epoch": 3.3291792006875807, "grad_norm": 1.1875, "learning_rate": 0.0004236005396846935, "loss": 5.5439, "mean_token_accuracy": 0.1879236653447151, "num_tokens": 7039740.0, "step": 3875 }, { "entropy": 5.946993112564087, "epoch": 3.333476579286635, "grad_norm": 1.15625, "learning_rate": 0.00042335072527148406, "loss": 5.5256, "mean_token_accuracy": 0.19050987511873246, "num_tokens": 7050430.0, "step": 3880 }, { "entropy": 5.85164065361023, "epoch": 3.3377739578856898, "grad_norm": 1.28125, "learning_rate": 0.0004231005869195859, "loss": 5.5069, "mean_token_accuracy": 0.18632889091968535, "num_tokens": 7059477.0, "step": 3885 }, { "entropy": 5.922405767440796, "epoch": 3.3420713364847443, "grad_norm": 1.4921875, "learning_rate": 0.0004228501251752011, "loss": 5.4601, "mean_token_accuracy": 0.1952619045972824, "num_tokens": 7067805.0, "step": 3890 }, { "entropy": 5.881201887130738, "epoch": 3.346368715083799, "grad_norm": 1.1171875, "learning_rate": 0.00042259934058523814, "loss": 5.46, "mean_token_accuracy": 0.18905477821826935, "num_tokens": 7077606.0, "step": 3895 }, { "entropy": 5.918180227279663, "epoch": 3.3506660936828534, "grad_norm": 1.171875, "learning_rate": 0.00042234823369731027, "loss": 5.4043, "mean_token_accuracy": 0.1953267216682434, "num_tokens": 7085647.0, "step": 3900 }, { "entropy": 5.835781908035278, "epoch": 3.354963472281908, "grad_norm": 1.140625, "learning_rate": 0.00042209680505973465, "loss": 5.449, "mean_token_accuracy": 0.1939581647515297, "num_tokens": 7095298.0, "step": 3905 }, { "entropy": 5.841428470611572, "epoch": 3.3592608508809625, "grad_norm": 1.0625, "learning_rate": 0.0004218450552215308, "loss": 5.5157, "mean_token_accuracy": 0.194383442401886, "num_tokens": 7105207.0, "step": 3910 }, { "entropy": 5.941448974609375, "epoch": 3.363558229480017, "grad_norm": 1.0546875, "learning_rate": 0.0004215929847324199, "loss": 5.5708, "mean_token_accuracy": 0.18115754574537277, "num_tokens": 7114833.0, "step": 3915 }, { "entropy": 5.963439035415649, "epoch": 3.3678556080790716, "grad_norm": 1.171875, "learning_rate": 0.000421340594142823, "loss": 5.3787, "mean_token_accuracy": 0.19882705360651015, "num_tokens": 7123608.0, "step": 3920 }, { "entropy": 5.8740808963775635, "epoch": 3.3721529866781266, "grad_norm": 1.21875, "learning_rate": 0.00042108788400386035, "loss": 5.4499, "mean_token_accuracy": 0.19346580952405928, "num_tokens": 7132250.0, "step": 3925 }, { "entropy": 5.880605030059814, "epoch": 3.376450365277181, "grad_norm": 0.98046875, "learning_rate": 0.0004208348548673498, "loss": 5.5399, "mean_token_accuracy": 0.19135694503784179, "num_tokens": 7142086.0, "step": 3930 }, { "entropy": 5.951083946228027, "epoch": 3.3807477438762357, "grad_norm": 1.1484375, "learning_rate": 0.000420581507285806, "loss": 5.4858, "mean_token_accuracy": 0.1825383946299553, "num_tokens": 7152434.0, "step": 3935 }, { "entropy": 5.82304835319519, "epoch": 3.38504512247529, "grad_norm": 1.0703125, "learning_rate": 0.0004203278418124386, "loss": 5.419, "mean_token_accuracy": 0.19977713227272034, "num_tokens": 7163041.0, "step": 3940 }, { "entropy": 5.829355192184448, "epoch": 3.3893425010743448, "grad_norm": 1.046875, "learning_rate": 0.0004200738590011518, "loss": 5.4173, "mean_token_accuracy": 0.19818853884935378, "num_tokens": 7171875.0, "step": 3945 }, { "entropy": 5.888874340057373, "epoch": 3.3936398796733993, "grad_norm": 1.1796875, "learning_rate": 0.00041981955940654245, "loss": 5.5242, "mean_token_accuracy": 0.1952082931995392, "num_tokens": 7180803.0, "step": 3950 }, { "entropy": 5.918650579452515, "epoch": 3.397937258272454, "grad_norm": 1.140625, "learning_rate": 0.0004195649435838992, "loss": 5.5527, "mean_token_accuracy": 0.17839486598968507, "num_tokens": 7190661.0, "step": 3955 }, { "entropy": 5.824749708175659, "epoch": 3.4022346368715084, "grad_norm": 1.109375, "learning_rate": 0.0004193100120892013, "loss": 5.3825, "mean_token_accuracy": 0.20471955984830856, "num_tokens": 7199357.0, "step": 3960 }, { "entropy": 5.87763524055481, "epoch": 3.406532015470563, "grad_norm": 1.015625, "learning_rate": 0.0004190547654791172, "loss": 5.5507, "mean_token_accuracy": 0.18516142815351486, "num_tokens": 7209856.0, "step": 3965 }, { "entropy": 5.960424184799194, "epoch": 3.4108293940696175, "grad_norm": 1.2578125, "learning_rate": 0.00041879920431100347, "loss": 5.5182, "mean_token_accuracy": 0.18072724491357803, "num_tokens": 7218778.0, "step": 3970 }, { "entropy": 5.929759693145752, "epoch": 3.415126772668672, "grad_norm": 1.1328125, "learning_rate": 0.0004185433291429036, "loss": 5.5383, "mean_token_accuracy": 0.19149455428123474, "num_tokens": 7228442.0, "step": 3975 }, { "entropy": 5.9323328018188475, "epoch": 3.4194241512677266, "grad_norm": 1.171875, "learning_rate": 0.00041828714053354665, "loss": 5.5232, "mean_token_accuracy": 0.18421948850154876, "num_tokens": 7238724.0, "step": 3980 }, { "entropy": 5.825714445114135, "epoch": 3.423721529866781, "grad_norm": 1.09375, "learning_rate": 0.0004180306390423462, "loss": 5.48, "mean_token_accuracy": 0.19667282402515412, "num_tokens": 7247844.0, "step": 3985 }, { "entropy": 5.873914575576782, "epoch": 3.4280189084658357, "grad_norm": 1.078125, "learning_rate": 0.00041777382522939884, "loss": 5.5471, "mean_token_accuracy": 0.18860624134540557, "num_tokens": 7257260.0, "step": 3990 }, { "entropy": 5.9357811450958256, "epoch": 3.4323162870648902, "grad_norm": 1.0078125, "learning_rate": 0.00041751669965548344, "loss": 5.5448, "mean_token_accuracy": 0.18522292822599412, "num_tokens": 7266890.0, "step": 3995 }, { "entropy": 5.9294140338897705, "epoch": 3.4366136656639448, "grad_norm": 1.171875, "learning_rate": 0.00041725926288205945, "loss": 5.5664, "mean_token_accuracy": 0.18018038868904113, "num_tokens": 7276114.0, "step": 4000 }, { "epoch": 3.4366136656639448, "eval_entropy": 5.70672602589066, "eval_loss": 5.984828472137451, "eval_mean_token_accuracy": 0.17211216785483532, "eval_num_tokens": 7276114.0, "eval_runtime": 2.0584, "eval_samples_per_second": 1724.118, "eval_steps_per_second": 215.697, "step": 4000 } ], "logging_steps": 5, "max_steps": 11630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1640861880145920.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }