{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2887838418564677, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 7.6312949657440186, "epoch": 0.004297378599054577, "grad_norm": 0.94921875, "learning_rate": 2e-06, "loss": 7.384, "mean_token_accuracy": 0.09047168418765068, "num_tokens": 10107.0, "step": 5 }, { "entropy": 7.674387979507446, "epoch": 0.008594757198109154, "grad_norm": 1.1484375, "learning_rate": 4.5e-06, "loss": 7.3814, "mean_token_accuracy": 0.09915048182010651, "num_tokens": 18391.0, "step": 10 }, { "entropy": 7.658490705490112, "epoch": 0.01289213579716373, "grad_norm": 1.015625, "learning_rate": 7e-06, "loss": 7.4194, "mean_token_accuracy": 0.09372682273387908, "num_tokens": 27061.0, "step": 15 }, { "entropy": 7.6485553741455075, "epoch": 0.017189514396218308, "grad_norm": 1.09375, "learning_rate": 9.5e-06, "loss": 7.4387, "mean_token_accuracy": 0.09950413554906845, "num_tokens": 36339.0, "step": 20 }, { "entropy": 7.655299663543701, "epoch": 0.021486892995272882, "grad_norm": 0.95703125, "learning_rate": 1.2e-05, "loss": 7.4336, "mean_token_accuracy": 0.09199422970414162, "num_tokens": 45770.0, "step": 25 }, { "entropy": 7.707321071624756, "epoch": 0.02578427159432746, "grad_norm": 0.96875, "learning_rate": 1.4500000000000002e-05, "loss": 7.4406, "mean_token_accuracy": 0.09267855286598206, "num_tokens": 54575.0, "step": 30 }, { "entropy": 7.718957376480103, "epoch": 0.030081650193382038, "grad_norm": 0.97265625, "learning_rate": 1.7000000000000003e-05, "loss": 7.5222, "mean_token_accuracy": 0.08976790606975556, "num_tokens": 66403.0, "step": 35 }, { "entropy": 7.742082262039185, "epoch": 0.034379028792436615, "grad_norm": 0.87890625, "learning_rate": 1.95e-05, "loss": 7.4377, "mean_token_accuracy": 0.09164252653717994, "num_tokens": 76510.0, "step": 40 }, { "entropy": 7.745701646804809, "epoch": 0.03867640739149119, "grad_norm": 0.99609375, "learning_rate": 2.2e-05, "loss": 7.358, "mean_token_accuracy": 0.0955798089504242, "num_tokens": 84836.0, "step": 45 }, { "entropy": 7.780595874786377, "epoch": 0.042973785990545764, "grad_norm": 0.984375, "learning_rate": 2.4500000000000003e-05, "loss": 7.3289, "mean_token_accuracy": 0.10552914068102837, "num_tokens": 93197.0, "step": 50 }, { "entropy": 7.764179325103759, "epoch": 0.047271164589600345, "grad_norm": 0.98828125, "learning_rate": 2.7e-05, "loss": 7.3234, "mean_token_accuracy": 0.09917277097702026, "num_tokens": 101546.0, "step": 55 }, { "entropy": 7.719727945327759, "epoch": 0.05156854318865492, "grad_norm": 0.8515625, "learning_rate": 2.95e-05, "loss": 7.4172, "mean_token_accuracy": 0.0928034670650959, "num_tokens": 111703.0, "step": 60 }, { "entropy": 7.748228645324707, "epoch": 0.055865921787709494, "grad_norm": 0.95703125, "learning_rate": 3.2e-05, "loss": 7.3403, "mean_token_accuracy": 0.10037123262882233, "num_tokens": 119894.0, "step": 65 }, { "entropy": 7.714352416992187, "epoch": 0.060163300386764075, "grad_norm": 0.89453125, "learning_rate": 3.4500000000000005e-05, "loss": 7.2915, "mean_token_accuracy": 0.1022428810596466, "num_tokens": 128885.0, "step": 70 }, { "entropy": 7.679376173019409, "epoch": 0.06446067898581866, "grad_norm": 0.8984375, "learning_rate": 3.7e-05, "loss": 7.4226, "mean_token_accuracy": 0.0972097434103489, "num_tokens": 138106.0, "step": 75 }, { "entropy": 7.72790002822876, "epoch": 0.06875805758487323, "grad_norm": 1.140625, "learning_rate": 3.95e-05, "loss": 7.3294, "mean_token_accuracy": 0.1022751808166504, "num_tokens": 146691.0, "step": 80 }, { "entropy": 7.730126142501831, "epoch": 0.0730554361839278, "grad_norm": 0.99609375, "learning_rate": 4.2000000000000004e-05, "loss": 7.382, "mean_token_accuracy": 0.09973402544856072, "num_tokens": 155792.0, "step": 85 }, { "entropy": 7.727601718902588, "epoch": 0.07735281478298238, "grad_norm": 0.89453125, "learning_rate": 4.45e-05, "loss": 7.4474, "mean_token_accuracy": 0.08758748695254326, "num_tokens": 166944.0, "step": 90 }, { "entropy": 7.782265329360962, "epoch": 0.08165019338203695, "grad_norm": 0.98828125, "learning_rate": 4.7000000000000004e-05, "loss": 7.2886, "mean_token_accuracy": 0.1041356198489666, "num_tokens": 175303.0, "step": 95 }, { "entropy": 7.751953029632569, "epoch": 0.08594757198109153, "grad_norm": 1.0078125, "learning_rate": 4.9500000000000004e-05, "loss": 7.3403, "mean_token_accuracy": 0.09793160557746887, "num_tokens": 184708.0, "step": 100 }, { "entropy": 7.702822208404541, "epoch": 0.09024495058014612, "grad_norm": 0.921875, "learning_rate": 5.2e-05, "loss": 7.3117, "mean_token_accuracy": 0.09851032048463822, "num_tokens": 193835.0, "step": 105 }, { "entropy": 7.686660861968994, "epoch": 0.09454232917920069, "grad_norm": 1.1328125, "learning_rate": 5.45e-05, "loss": 7.3479, "mean_token_accuracy": 0.0979080393910408, "num_tokens": 203344.0, "step": 110 }, { "entropy": 7.698584461212159, "epoch": 0.09883970777825526, "grad_norm": 0.9296875, "learning_rate": 5.7e-05, "loss": 7.4586, "mean_token_accuracy": 0.09130895733833314, "num_tokens": 213048.0, "step": 115 }, { "entropy": 7.781258678436279, "epoch": 0.10313708637730984, "grad_norm": 1.109375, "learning_rate": 5.9499999999999996e-05, "loss": 7.3094, "mean_token_accuracy": 0.10353164449334144, "num_tokens": 221784.0, "step": 120 }, { "entropy": 7.650211572647095, "epoch": 0.10743446497636441, "grad_norm": 1.0078125, "learning_rate": 6.2e-05, "loss": 7.3189, "mean_token_accuracy": 0.09726176261901856, "num_tokens": 230971.0, "step": 125 }, { "entropy": 7.655170726776123, "epoch": 0.11173184357541899, "grad_norm": 0.96484375, "learning_rate": 6.450000000000001e-05, "loss": 7.2818, "mean_token_accuracy": 0.1042576052248478, "num_tokens": 240524.0, "step": 130 }, { "entropy": 7.7341550350189205, "epoch": 0.11602922217447358, "grad_norm": 0.88671875, "learning_rate": 6.7e-05, "loss": 7.2512, "mean_token_accuracy": 0.1007460281252861, "num_tokens": 249220.0, "step": 135 }, { "entropy": 7.745693302154541, "epoch": 0.12032660077352815, "grad_norm": 1.0234375, "learning_rate": 6.950000000000001e-05, "loss": 7.3688, "mean_token_accuracy": 0.10030856803059578, "num_tokens": 258934.0, "step": 140 }, { "entropy": 7.694993305206299, "epoch": 0.12462397937258272, "grad_norm": 1.0234375, "learning_rate": 7.2e-05, "loss": 7.2936, "mean_token_accuracy": 0.10321335718035698, "num_tokens": 267680.0, "step": 145 }, { "entropy": 7.719129991531372, "epoch": 0.1289213579716373, "grad_norm": 1.0078125, "learning_rate": 7.45e-05, "loss": 7.3236, "mean_token_accuracy": 0.10207543894648552, "num_tokens": 276227.0, "step": 150 }, { "entropy": 7.648375129699707, "epoch": 0.1332187365706919, "grad_norm": 0.94921875, "learning_rate": 7.7e-05, "loss": 7.2203, "mean_token_accuracy": 0.1059327855706215, "num_tokens": 286342.0, "step": 155 }, { "entropy": 7.674158382415771, "epoch": 0.13751611516974646, "grad_norm": 1.0625, "learning_rate": 7.950000000000001e-05, "loss": 7.2988, "mean_token_accuracy": 0.09665355160832405, "num_tokens": 294994.0, "step": 160 }, { "entropy": 7.717900514602661, "epoch": 0.14181349376880104, "grad_norm": 1.046875, "learning_rate": 8.2e-05, "loss": 7.2704, "mean_token_accuracy": 0.10349940955638885, "num_tokens": 303882.0, "step": 165 }, { "entropy": 7.6729988098144535, "epoch": 0.1461108723678556, "grad_norm": 0.9609375, "learning_rate": 8.450000000000001e-05, "loss": 7.3104, "mean_token_accuracy": 0.10128599181771278, "num_tokens": 312515.0, "step": 170 }, { "entropy": 7.739007139205933, "epoch": 0.15040825096691018, "grad_norm": 1.2109375, "learning_rate": 8.7e-05, "loss": 7.27, "mean_token_accuracy": 0.10081852003931999, "num_tokens": 320801.0, "step": 175 }, { "entropy": 7.720875406265259, "epoch": 0.15470562956596476, "grad_norm": 1.015625, "learning_rate": 8.95e-05, "loss": 7.2872, "mean_token_accuracy": 0.10100285485386848, "num_tokens": 329382.0, "step": 180 }, { "entropy": 7.66646089553833, "epoch": 0.15900300816501933, "grad_norm": 1.0390625, "learning_rate": 9.2e-05, "loss": 7.2814, "mean_token_accuracy": 0.1028428927063942, "num_tokens": 337894.0, "step": 185 }, { "entropy": 7.772510719299317, "epoch": 0.1633003867640739, "grad_norm": 1.125, "learning_rate": 9.45e-05, "loss": 7.2803, "mean_token_accuracy": 0.10378619506955147, "num_tokens": 346380.0, "step": 190 }, { "entropy": 7.690706968307495, "epoch": 0.16759776536312848, "grad_norm": 0.890625, "learning_rate": 9.7e-05, "loss": 7.3588, "mean_token_accuracy": 0.09733301475644111, "num_tokens": 356305.0, "step": 195 }, { "entropy": 7.79454927444458, "epoch": 0.17189514396218306, "grad_norm": 1.0078125, "learning_rate": 9.95e-05, "loss": 7.306, "mean_token_accuracy": 0.09683404862880707, "num_tokens": 364899.0, "step": 200 }, { "entropy": 7.694888687133789, "epoch": 0.17619252256123766, "grad_norm": 1.015625, "learning_rate": 0.000102, "loss": 7.2938, "mean_token_accuracy": 0.09810400977730752, "num_tokens": 373663.0, "step": 205 }, { "entropy": 7.748025798797608, "epoch": 0.18048990116029223, "grad_norm": 1.1640625, "learning_rate": 0.00010449999999999999, "loss": 7.2566, "mean_token_accuracy": 0.10043591782450675, "num_tokens": 382730.0, "step": 210 }, { "entropy": 7.706165361404419, "epoch": 0.1847872797593468, "grad_norm": 1.1328125, "learning_rate": 0.000107, "loss": 7.3157, "mean_token_accuracy": 0.09612104147672654, "num_tokens": 392676.0, "step": 215 }, { "entropy": 7.760982656478882, "epoch": 0.18908465835840138, "grad_norm": 1.2265625, "learning_rate": 0.0001095, "loss": 7.2955, "mean_token_accuracy": 0.10281639397144318, "num_tokens": 401050.0, "step": 220 }, { "entropy": 7.626513719558716, "epoch": 0.19338203695745596, "grad_norm": 1.078125, "learning_rate": 0.000112, "loss": 7.2692, "mean_token_accuracy": 0.10119878426194191, "num_tokens": 410009.0, "step": 225 }, { "entropy": 7.726489019393921, "epoch": 0.19767941555651053, "grad_norm": 0.98828125, "learning_rate": 0.0001145, "loss": 7.2683, "mean_token_accuracy": 0.10186234638094901, "num_tokens": 419302.0, "step": 230 }, { "entropy": 7.643717670440674, "epoch": 0.2019767941555651, "grad_norm": 1.109375, "learning_rate": 0.00011700000000000001, "loss": 7.1665, "mean_token_accuracy": 0.10647615045309067, "num_tokens": 427296.0, "step": 235 }, { "entropy": 7.666737127304077, "epoch": 0.20627417275461968, "grad_norm": 1.125, "learning_rate": 0.00011949999999999999, "loss": 7.3139, "mean_token_accuracy": 0.10131902173161507, "num_tokens": 436368.0, "step": 240 }, { "entropy": 7.772911167144775, "epoch": 0.21057155135367425, "grad_norm": 1.046875, "learning_rate": 0.000122, "loss": 7.2112, "mean_token_accuracy": 0.1055280588567257, "num_tokens": 445535.0, "step": 245 }, { "entropy": 7.602903366088867, "epoch": 0.21486892995272883, "grad_norm": 1.046875, "learning_rate": 0.0001245, "loss": 7.2153, "mean_token_accuracy": 0.10406075567007064, "num_tokens": 454769.0, "step": 250 }, { "entropy": 7.693030595779419, "epoch": 0.2191663085517834, "grad_norm": 1.125, "learning_rate": 0.000127, "loss": 7.2315, "mean_token_accuracy": 0.10270996242761612, "num_tokens": 463975.0, "step": 255 }, { "entropy": 7.637308835983276, "epoch": 0.22346368715083798, "grad_norm": 1.109375, "learning_rate": 0.0001295, "loss": 7.2542, "mean_token_accuracy": 0.10225536078214645, "num_tokens": 472899.0, "step": 260 }, { "entropy": 7.740519666671753, "epoch": 0.22776106574989258, "grad_norm": 1.09375, "learning_rate": 0.000132, "loss": 7.229, "mean_token_accuracy": 0.1005932256579399, "num_tokens": 481556.0, "step": 265 }, { "entropy": 7.654651689529419, "epoch": 0.23205844434894715, "grad_norm": 1.0625, "learning_rate": 0.00013450000000000002, "loss": 7.2258, "mean_token_accuracy": 0.10702893435955048, "num_tokens": 490253.0, "step": 270 }, { "entropy": 7.660864973068238, "epoch": 0.23635582294800173, "grad_norm": 1.2265625, "learning_rate": 0.00013700000000000002, "loss": 7.2451, "mean_token_accuracy": 0.10333684608340263, "num_tokens": 498444.0, "step": 275 }, { "entropy": 7.637535953521729, "epoch": 0.2406532015470563, "grad_norm": 0.98046875, "learning_rate": 0.0001395, "loss": 7.191, "mean_token_accuracy": 0.10794568434357643, "num_tokens": 508330.0, "step": 280 }, { "entropy": 7.6566917419433596, "epoch": 0.24495058014611087, "grad_norm": 1.234375, "learning_rate": 0.00014199999999999998, "loss": 7.3004, "mean_token_accuracy": 0.10417937636375427, "num_tokens": 517900.0, "step": 285 }, { "entropy": 7.670303010940552, "epoch": 0.24924795874516545, "grad_norm": 1.1484375, "learning_rate": 0.0001445, "loss": 7.2276, "mean_token_accuracy": 0.10308908969163895, "num_tokens": 527808.0, "step": 290 }, { "entropy": 7.719700765609741, "epoch": 0.25354533734422, "grad_norm": 1.1484375, "learning_rate": 0.000147, "loss": 7.2415, "mean_token_accuracy": 0.10010977610945701, "num_tokens": 536931.0, "step": 295 }, { "entropy": 7.668509387969971, "epoch": 0.2578427159432746, "grad_norm": 1.1796875, "learning_rate": 0.0001495, "loss": 7.279, "mean_token_accuracy": 0.10248880609869956, "num_tokens": 545758.0, "step": 300 }, { "entropy": 7.700217819213867, "epoch": 0.26214009454232917, "grad_norm": 1.0390625, "learning_rate": 0.000152, "loss": 7.2819, "mean_token_accuracy": 0.10198702886700631, "num_tokens": 555165.0, "step": 305 }, { "entropy": 7.6267822265625, "epoch": 0.2664374731413838, "grad_norm": 1.1171875, "learning_rate": 0.00015450000000000001, "loss": 7.2035, "mean_token_accuracy": 0.10117841735482216, "num_tokens": 564719.0, "step": 310 }, { "entropy": 7.646708202362061, "epoch": 0.2707348517404383, "grad_norm": 1.0859375, "learning_rate": 0.000157, "loss": 7.1638, "mean_token_accuracy": 0.10670615658164025, "num_tokens": 573572.0, "step": 315 }, { "entropy": 7.759027910232544, "epoch": 0.2750322303394929, "grad_norm": 1.3984375, "learning_rate": 0.0001595, "loss": 7.3476, "mean_token_accuracy": 0.10210367739200592, "num_tokens": 581497.0, "step": 320 }, { "entropy": 7.590592908859253, "epoch": 0.27932960893854747, "grad_norm": 1.125, "learning_rate": 0.000162, "loss": 7.2138, "mean_token_accuracy": 0.10664469674229622, "num_tokens": 591107.0, "step": 325 }, { "entropy": 7.70356388092041, "epoch": 0.28362698753760207, "grad_norm": 1.0546875, "learning_rate": 0.00016450000000000001, "loss": 7.2482, "mean_token_accuracy": 0.1050640620291233, "num_tokens": 600241.0, "step": 330 }, { "entropy": 7.639587259292602, "epoch": 0.2879243661366566, "grad_norm": 1.0703125, "learning_rate": 0.00016700000000000002, "loss": 7.161, "mean_token_accuracy": 0.1065776713192463, "num_tokens": 608697.0, "step": 335 }, { "entropy": 7.602131795883179, "epoch": 0.2922217447357112, "grad_norm": 1.1484375, "learning_rate": 0.00016950000000000003, "loss": 7.1698, "mean_token_accuracy": 0.1098954938352108, "num_tokens": 617275.0, "step": 340 }, { "entropy": 7.669042348861694, "epoch": 0.29651912333476577, "grad_norm": 1.0859375, "learning_rate": 0.00017199999999999998, "loss": 7.2602, "mean_token_accuracy": 0.1007254920899868, "num_tokens": 626644.0, "step": 345 }, { "entropy": 7.623440217971802, "epoch": 0.30081650193382037, "grad_norm": 1.1171875, "learning_rate": 0.00017449999999999999, "loss": 7.1639, "mean_token_accuracy": 0.1080157920718193, "num_tokens": 635110.0, "step": 350 }, { "entropy": 7.711002826690674, "epoch": 0.30511388053287497, "grad_norm": 0.97265625, "learning_rate": 0.000177, "loss": 7.3139, "mean_token_accuracy": 0.10216462090611458, "num_tokens": 644746.0, "step": 355 }, { "entropy": 7.708708238601685, "epoch": 0.3094112591319295, "grad_norm": 1.234375, "learning_rate": 0.0001795, "loss": 7.2216, "mean_token_accuracy": 0.1021303728222847, "num_tokens": 654281.0, "step": 360 }, { "entropy": 7.534019136428833, "epoch": 0.3137086377309841, "grad_norm": 1.234375, "learning_rate": 0.000182, "loss": 7.2333, "mean_token_accuracy": 0.10576817691326142, "num_tokens": 663174.0, "step": 365 }, { "entropy": 7.660452365875244, "epoch": 0.31800601633003867, "grad_norm": 1.0625, "learning_rate": 0.0001845, "loss": 7.1525, "mean_token_accuracy": 0.10541519671678543, "num_tokens": 672178.0, "step": 370 }, { "entropy": 7.651990938186645, "epoch": 0.32230339492909327, "grad_norm": 1.1484375, "learning_rate": 0.000187, "loss": 7.1748, "mean_token_accuracy": 0.10421534106135369, "num_tokens": 681323.0, "step": 375 }, { "entropy": 7.537337684631348, "epoch": 0.3266007735281478, "grad_norm": 0.98046875, "learning_rate": 0.0001895, "loss": 7.1001, "mean_token_accuracy": 0.11140918657183647, "num_tokens": 690461.0, "step": 380 }, { "entropy": 7.596573305130005, "epoch": 0.3308981521272024, "grad_norm": 1.2734375, "learning_rate": 0.000192, "loss": 7.1461, "mean_token_accuracy": 0.10594902262091636, "num_tokens": 699199.0, "step": 385 }, { "entropy": 7.566946506500244, "epoch": 0.33519553072625696, "grad_norm": 1.2265625, "learning_rate": 0.0001945, "loss": 7.109, "mean_token_accuracy": 0.11522968709468842, "num_tokens": 707949.0, "step": 390 }, { "entropy": 7.66830849647522, "epoch": 0.33949290932531156, "grad_norm": 1.15625, "learning_rate": 0.00019700000000000002, "loss": 7.1843, "mean_token_accuracy": 0.10416831225156784, "num_tokens": 715752.0, "step": 395 }, { "entropy": 7.619978666305542, "epoch": 0.3437902879243661, "grad_norm": 1.2734375, "learning_rate": 0.00019950000000000002, "loss": 7.1119, "mean_token_accuracy": 0.11198346018791198, "num_tokens": 724416.0, "step": 400 }, { "entropy": 7.594716548919678, "epoch": 0.3480876665234207, "grad_norm": 1.3203125, "learning_rate": 0.000202, "loss": 7.1774, "mean_token_accuracy": 0.10296614542603492, "num_tokens": 733116.0, "step": 405 }, { "entropy": 7.614369249343872, "epoch": 0.3523850451224753, "grad_norm": 1.265625, "learning_rate": 0.00020449999999999998, "loss": 7.1639, "mean_token_accuracy": 0.10737873241305351, "num_tokens": 742093.0, "step": 410 }, { "entropy": 7.532227945327759, "epoch": 0.35668242372152986, "grad_norm": 1.1640625, "learning_rate": 0.000207, "loss": 7.1385, "mean_token_accuracy": 0.11264142915606498, "num_tokens": 750402.0, "step": 415 }, { "entropy": 7.510246276855469, "epoch": 0.36097980232058446, "grad_norm": 1.0625, "learning_rate": 0.0002095, "loss": 7.1129, "mean_token_accuracy": 0.11108387559652329, "num_tokens": 760961.0, "step": 420 }, { "entropy": 7.720337963104248, "epoch": 0.365277180919639, "grad_norm": 1.171875, "learning_rate": 0.000212, "loss": 7.2042, "mean_token_accuracy": 0.10612902790307999, "num_tokens": 770554.0, "step": 425 }, { "entropy": 7.437310361862183, "epoch": 0.3695745595186936, "grad_norm": 1.328125, "learning_rate": 0.0002145, "loss": 7.1596, "mean_token_accuracy": 0.11299800872802734, "num_tokens": 779172.0, "step": 430 }, { "entropy": 7.663910818099976, "epoch": 0.37387193811774816, "grad_norm": 1.1953125, "learning_rate": 0.00021700000000000002, "loss": 7.2239, "mean_token_accuracy": 0.10290571823716163, "num_tokens": 788040.0, "step": 435 }, { "entropy": 7.589281415939331, "epoch": 0.37816931671680276, "grad_norm": 1.125, "learning_rate": 0.0002195, "loss": 7.1461, "mean_token_accuracy": 0.10722599253058433, "num_tokens": 796786.0, "step": 440 }, { "entropy": 7.543337059020996, "epoch": 0.3824666953158573, "grad_norm": 1.4296875, "learning_rate": 0.000222, "loss": 7.1192, "mean_token_accuracy": 0.10885161831974983, "num_tokens": 805520.0, "step": 445 }, { "entropy": 7.486078453063965, "epoch": 0.3867640739149119, "grad_norm": 1.3125, "learning_rate": 0.0002245, "loss": 7.074, "mean_token_accuracy": 0.10658745989203453, "num_tokens": 814939.0, "step": 450 }, { "entropy": 7.534557342529297, "epoch": 0.39106145251396646, "grad_norm": 1.2421875, "learning_rate": 0.00022700000000000002, "loss": 7.0766, "mean_token_accuracy": 0.11227057129144669, "num_tokens": 823862.0, "step": 455 }, { "entropy": 7.5476549625396725, "epoch": 0.39535883111302106, "grad_norm": 1.15625, "learning_rate": 0.00022950000000000002, "loss": 7.1124, "mean_token_accuracy": 0.10576009079813957, "num_tokens": 832820.0, "step": 460 }, { "entropy": 7.601094675064087, "epoch": 0.39965620971207566, "grad_norm": 1.234375, "learning_rate": 0.00023200000000000003, "loss": 7.0697, "mean_token_accuracy": 0.11121490225195885, "num_tokens": 841538.0, "step": 465 }, { "entropy": 7.544060945510864, "epoch": 0.4039535883111302, "grad_norm": 1.1953125, "learning_rate": 0.00023449999999999998, "loss": 7.2069, "mean_token_accuracy": 0.10181558132171631, "num_tokens": 851123.0, "step": 470 }, { "entropy": 7.549469089508056, "epoch": 0.4082509669101848, "grad_norm": 1.1875, "learning_rate": 0.000237, "loss": 7.1633, "mean_token_accuracy": 0.11091246008872986, "num_tokens": 860357.0, "step": 475 }, { "entropy": 7.547894096374511, "epoch": 0.41254834550923936, "grad_norm": 1.234375, "learning_rate": 0.0002395, "loss": 7.0874, "mean_token_accuracy": 0.10722309574484826, "num_tokens": 869980.0, "step": 480 }, { "entropy": 7.507503604888916, "epoch": 0.41684572410829396, "grad_norm": 1.2421875, "learning_rate": 0.000242, "loss": 7.0572, "mean_token_accuracy": 0.11242355704307556, "num_tokens": 878250.0, "step": 485 }, { "entropy": 7.5191121101379395, "epoch": 0.4211431027073485, "grad_norm": 1.125, "learning_rate": 0.0002445, "loss": 7.1411, "mean_token_accuracy": 0.11158529818058013, "num_tokens": 887624.0, "step": 490 }, { "entropy": 7.454204320907593, "epoch": 0.4254404813064031, "grad_norm": 1.1640625, "learning_rate": 0.000247, "loss": 7.1159, "mean_token_accuracy": 0.11260272860527039, "num_tokens": 897120.0, "step": 495 }, { "entropy": 7.495032835006714, "epoch": 0.42973785990545765, "grad_norm": 1.140625, "learning_rate": 0.0002495, "loss": 7.0795, "mean_token_accuracy": 0.11134620234370232, "num_tokens": 906215.0, "step": 500 }, { "epoch": 0.42973785990545765, "eval_entropy": 7.203803374960616, "eval_loss": 7.096514701843262, "eval_mean_token_accuracy": 0.11462040213649874, "eval_num_tokens": 906215.0, "eval_runtime": 2.0645, "eval_samples_per_second": 1719.022, "eval_steps_per_second": 215.059, "step": 500 }, { "entropy": 7.447824621200562, "epoch": 0.43403523850451226, "grad_norm": 1.15625, "learning_rate": 0.000252, "loss": 7.0811, "mean_token_accuracy": 0.1122453585267067, "num_tokens": 915181.0, "step": 505 }, { "entropy": 7.498021125793457, "epoch": 0.4383326171035668, "grad_norm": 1.328125, "learning_rate": 0.0002545, "loss": 7.1044, "mean_token_accuracy": 0.10958386138081551, "num_tokens": 924377.0, "step": 510 }, { "entropy": 7.607626008987427, "epoch": 0.4426299957026214, "grad_norm": 1.1796875, "learning_rate": 0.000257, "loss": 7.1944, "mean_token_accuracy": 0.10655399709939957, "num_tokens": 933114.0, "step": 515 }, { "entropy": 7.6139122486114506, "epoch": 0.44692737430167595, "grad_norm": 1.0625, "learning_rate": 0.0002595, "loss": 7.1453, "mean_token_accuracy": 0.11119715198874473, "num_tokens": 943306.0, "step": 520 }, { "entropy": 7.436026573181152, "epoch": 0.45122475290073055, "grad_norm": 1.2578125, "learning_rate": 0.000262, "loss": 7.0354, "mean_token_accuracy": 0.11904665902256965, "num_tokens": 951515.0, "step": 525 }, { "entropy": 7.494698238372803, "epoch": 0.45552213149978515, "grad_norm": 1.2578125, "learning_rate": 0.00026450000000000003, "loss": 7.1519, "mean_token_accuracy": 0.10504961535334587, "num_tokens": 962686.0, "step": 530 }, { "entropy": 7.572213172912598, "epoch": 0.4598195100988397, "grad_norm": 1.125, "learning_rate": 0.00026700000000000004, "loss": 7.1449, "mean_token_accuracy": 0.11348244249820709, "num_tokens": 972136.0, "step": 535 }, { "entropy": 7.405817127227783, "epoch": 0.4641168886978943, "grad_norm": 1.2734375, "learning_rate": 0.00026950000000000005, "loss": 7.0518, "mean_token_accuracy": 0.1100372053682804, "num_tokens": 981301.0, "step": 540 }, { "entropy": 7.484500360488892, "epoch": 0.46841426729694885, "grad_norm": 1.390625, "learning_rate": 0.00027200000000000005, "loss": 7.0823, "mean_token_accuracy": 0.1120329774916172, "num_tokens": 990360.0, "step": 545 }, { "entropy": 7.573296546936035, "epoch": 0.47271164589600345, "grad_norm": 1.21875, "learning_rate": 0.0002745, "loss": 7.1293, "mean_token_accuracy": 0.10760239511728287, "num_tokens": 999415.0, "step": 550 }, { "entropy": 7.419287919998169, "epoch": 0.477009024495058, "grad_norm": 1.0859375, "learning_rate": 0.000277, "loss": 7.057, "mean_token_accuracy": 0.10999582111835479, "num_tokens": 1008762.0, "step": 555 }, { "entropy": 7.44342451095581, "epoch": 0.4813064030941126, "grad_norm": 1.2890625, "learning_rate": 0.0002795, "loss": 7.0505, "mean_token_accuracy": 0.11702658385038375, "num_tokens": 1017704.0, "step": 560 }, { "entropy": 7.457871007919311, "epoch": 0.48560378169316715, "grad_norm": 1.234375, "learning_rate": 0.00028199999999999997, "loss": 7.018, "mean_token_accuracy": 0.11318592131137847, "num_tokens": 1026251.0, "step": 565 }, { "entropy": 7.356105470657349, "epoch": 0.48990116029222175, "grad_norm": 1.0859375, "learning_rate": 0.0002845, "loss": 7.0083, "mean_token_accuracy": 0.11355392187833786, "num_tokens": 1036191.0, "step": 570 }, { "entropy": 7.5119133472442625, "epoch": 0.4941985388912763, "grad_norm": 1.1953125, "learning_rate": 0.000287, "loss": 7.0501, "mean_token_accuracy": 0.11168754398822785, "num_tokens": 1044936.0, "step": 575 }, { "entropy": 7.406773805618286, "epoch": 0.4984959174903309, "grad_norm": 1.171875, "learning_rate": 0.0002895, "loss": 7.0476, "mean_token_accuracy": 0.1135815680027008, "num_tokens": 1053683.0, "step": 580 }, { "entropy": 7.3828895568847654, "epoch": 0.5027932960893855, "grad_norm": 1.15625, "learning_rate": 0.000292, "loss": 7.0283, "mean_token_accuracy": 0.11782724559307098, "num_tokens": 1062932.0, "step": 585 }, { "entropy": 7.4789910316467285, "epoch": 0.50709067468844, "grad_norm": 1.0859375, "learning_rate": 0.0002945, "loss": 7.0524, "mean_token_accuracy": 0.11150057762861251, "num_tokens": 1072313.0, "step": 590 }, { "entropy": 7.458136653900146, "epoch": 0.5113880532874946, "grad_norm": 1.078125, "learning_rate": 0.000297, "loss": 7.033, "mean_token_accuracy": 0.10738502442836761, "num_tokens": 1081675.0, "step": 595 }, { "entropy": 7.437460470199585, "epoch": 0.5156854318865493, "grad_norm": 1.1875, "learning_rate": 0.0002995, "loss": 7.0392, "mean_token_accuracy": 0.11078862249851226, "num_tokens": 1091541.0, "step": 600 }, { "entropy": 7.43347053527832, "epoch": 0.5199828104856038, "grad_norm": 1.1171875, "learning_rate": 0.000302, "loss": 7.0467, "mean_token_accuracy": 0.11545747444033623, "num_tokens": 1100724.0, "step": 605 }, { "entropy": 7.34070782661438, "epoch": 0.5242801890846583, "grad_norm": 1.265625, "learning_rate": 0.0003045, "loss": 7.0062, "mean_token_accuracy": 0.11681902781128883, "num_tokens": 1108869.0, "step": 610 }, { "entropy": 7.513333511352539, "epoch": 0.5285775676837129, "grad_norm": 1.2109375, "learning_rate": 0.000307, "loss": 7.0303, "mean_token_accuracy": 0.11391275599598885, "num_tokens": 1117314.0, "step": 615 }, { "entropy": 7.237616014480591, "epoch": 0.5328749462827675, "grad_norm": 1.1875, "learning_rate": 0.0003095, "loss": 6.969, "mean_token_accuracy": 0.11866867989301681, "num_tokens": 1126786.0, "step": 620 }, { "entropy": 7.403380393981934, "epoch": 0.5371723248818221, "grad_norm": 1.3515625, "learning_rate": 0.000312, "loss": 6.983, "mean_token_accuracy": 0.11322688534855843, "num_tokens": 1136013.0, "step": 625 }, { "entropy": 7.355997228622437, "epoch": 0.5414697034808766, "grad_norm": 1.15625, "learning_rate": 0.0003145, "loss": 7.0163, "mean_token_accuracy": 0.1159099243581295, "num_tokens": 1144970.0, "step": 630 }, { "entropy": 7.416441440582275, "epoch": 0.5457670820799312, "grad_norm": 1.3046875, "learning_rate": 0.000317, "loss": 6.9784, "mean_token_accuracy": 0.12343248203396798, "num_tokens": 1153810.0, "step": 635 }, { "entropy": 7.320913982391358, "epoch": 0.5500644606789858, "grad_norm": 1.234375, "learning_rate": 0.0003195, "loss": 6.96, "mean_token_accuracy": 0.11895549520850182, "num_tokens": 1162498.0, "step": 640 }, { "entropy": 7.383200359344483, "epoch": 0.5543618392780404, "grad_norm": 1.15625, "learning_rate": 0.000322, "loss": 7.0441, "mean_token_accuracy": 0.11171148270368576, "num_tokens": 1172091.0, "step": 645 }, { "entropy": 7.465569925308228, "epoch": 0.5586592178770949, "grad_norm": 1.1875, "learning_rate": 0.00032450000000000003, "loss": 7.0379, "mean_token_accuracy": 0.1126454509794712, "num_tokens": 1181400.0, "step": 650 }, { "entropy": 7.29718279838562, "epoch": 0.5629565964761496, "grad_norm": 1.3671875, "learning_rate": 0.00032700000000000003, "loss": 7.0066, "mean_token_accuracy": 0.11692977026104927, "num_tokens": 1189780.0, "step": 655 }, { "entropy": 7.376112461090088, "epoch": 0.5672539750752041, "grad_norm": 1.234375, "learning_rate": 0.00032950000000000004, "loss": 6.9708, "mean_token_accuracy": 0.11179102137684822, "num_tokens": 1198671.0, "step": 660 }, { "entropy": 7.406812715530395, "epoch": 0.5715513536742587, "grad_norm": 1.140625, "learning_rate": 0.00033200000000000005, "loss": 6.9887, "mean_token_accuracy": 0.11439693570137024, "num_tokens": 1207173.0, "step": 665 }, { "entropy": 7.267558336257935, "epoch": 0.5758487322733132, "grad_norm": 1.328125, "learning_rate": 0.00033450000000000005, "loss": 6.9252, "mean_token_accuracy": 0.11824023947119713, "num_tokens": 1216387.0, "step": 670 }, { "entropy": 7.466721105575561, "epoch": 0.5801461108723679, "grad_norm": 1.1640625, "learning_rate": 0.000337, "loss": 6.9093, "mean_token_accuracy": 0.11586858034133911, "num_tokens": 1224461.0, "step": 675 }, { "entropy": 7.260802936553955, "epoch": 0.5844434894714224, "grad_norm": 1.2265625, "learning_rate": 0.0003395, "loss": 6.9855, "mean_token_accuracy": 0.1176436722278595, "num_tokens": 1233774.0, "step": 680 }, { "entropy": 7.267514610290528, "epoch": 0.588740868070477, "grad_norm": 1.2109375, "learning_rate": 0.000342, "loss": 6.9319, "mean_token_accuracy": 0.12313097864389419, "num_tokens": 1242812.0, "step": 685 }, { "entropy": 7.451924133300781, "epoch": 0.5930382466695315, "grad_norm": 1.1640625, "learning_rate": 0.00034449999999999997, "loss": 7.0445, "mean_token_accuracy": 0.1125735655426979, "num_tokens": 1252872.0, "step": 690 }, { "entropy": 7.1216278076171875, "epoch": 0.5973356252685862, "grad_norm": 1.21875, "learning_rate": 0.000347, "loss": 6.8314, "mean_token_accuracy": 0.1210754469037056, "num_tokens": 1260852.0, "step": 695 }, { "entropy": 7.292500305175781, "epoch": 0.6016330038676407, "grad_norm": 1.21875, "learning_rate": 0.0003495, "loss": 6.9419, "mean_token_accuracy": 0.1167706459760666, "num_tokens": 1268925.0, "step": 700 }, { "entropy": 7.384844732284546, "epoch": 0.6059303824666953, "grad_norm": 1.1484375, "learning_rate": 0.000352, "loss": 6.9849, "mean_token_accuracy": 0.11300796419382095, "num_tokens": 1278994.0, "step": 705 }, { "entropy": 7.286926889419556, "epoch": 0.6102277610657499, "grad_norm": 1.1875, "learning_rate": 0.0003545, "loss": 6.9847, "mean_token_accuracy": 0.11259545534849166, "num_tokens": 1287698.0, "step": 710 }, { "entropy": 7.337662601470948, "epoch": 0.6145251396648045, "grad_norm": 1.125, "learning_rate": 0.000357, "loss": 6.9117, "mean_token_accuracy": 0.12028303518891334, "num_tokens": 1297475.0, "step": 715 }, { "entropy": 7.265739297866821, "epoch": 0.618822518263859, "grad_norm": 1.234375, "learning_rate": 0.0003595, "loss": 6.9558, "mean_token_accuracy": 0.11790136769413948, "num_tokens": 1306836.0, "step": 720 }, { "entropy": 7.3774675846099855, "epoch": 0.6231198968629136, "grad_norm": 1.140625, "learning_rate": 0.000362, "loss": 6.9932, "mean_token_accuracy": 0.11299360319972038, "num_tokens": 1315872.0, "step": 725 }, { "entropy": 7.3129335880279545, "epoch": 0.6274172754619682, "grad_norm": 1.28125, "learning_rate": 0.0003645, "loss": 6.9353, "mean_token_accuracy": 0.12453719973564148, "num_tokens": 1324624.0, "step": 730 }, { "entropy": 7.300215101242065, "epoch": 0.6317146540610228, "grad_norm": 1.34375, "learning_rate": 0.000367, "loss": 6.9246, "mean_token_accuracy": 0.12120431885123253, "num_tokens": 1333058.0, "step": 735 }, { "entropy": 7.065497016906738, "epoch": 0.6360120326600773, "grad_norm": 1.0703125, "learning_rate": 0.0003695, "loss": 6.8904, "mean_token_accuracy": 0.11625659838318825, "num_tokens": 1342376.0, "step": 740 }, { "entropy": 7.412401533126831, "epoch": 0.6403094112591319, "grad_norm": 1.2578125, "learning_rate": 0.000372, "loss": 6.9293, "mean_token_accuracy": 0.11268759667873382, "num_tokens": 1351386.0, "step": 745 }, { "entropy": 7.194233036041259, "epoch": 0.6446067898581865, "grad_norm": 1.3359375, "learning_rate": 0.0003745, "loss": 6.8338, "mean_token_accuracy": 0.12849506586790085, "num_tokens": 1358958.0, "step": 750 }, { "entropy": 7.3347986221313475, "epoch": 0.6489041684572411, "grad_norm": 1.2109375, "learning_rate": 0.000377, "loss": 6.988, "mean_token_accuracy": 0.11507417485117913, "num_tokens": 1368599.0, "step": 755 }, { "entropy": 7.380126667022705, "epoch": 0.6532015470562956, "grad_norm": 1.984375, "learning_rate": 0.0003795, "loss": 7.0127, "mean_token_accuracy": 0.111283528059721, "num_tokens": 1378529.0, "step": 760 }, { "entropy": 7.157611989974976, "epoch": 0.6574989256553503, "grad_norm": 1.3984375, "learning_rate": 0.000382, "loss": 6.8052, "mean_token_accuracy": 0.1265752285718918, "num_tokens": 1386993.0, "step": 765 }, { "entropy": 7.21686282157898, "epoch": 0.6617963042544048, "grad_norm": 1.4296875, "learning_rate": 0.0003845, "loss": 6.8936, "mean_token_accuracy": 0.12180712148547172, "num_tokens": 1395790.0, "step": 770 }, { "entropy": 7.166302919387817, "epoch": 0.6660936828534594, "grad_norm": 1.1875, "learning_rate": 0.00038700000000000003, "loss": 6.9063, "mean_token_accuracy": 0.11845313757658005, "num_tokens": 1405587.0, "step": 775 }, { "entropy": 7.20961365699768, "epoch": 0.6703910614525139, "grad_norm": 1.1875, "learning_rate": 0.00038950000000000003, "loss": 6.8702, "mean_token_accuracy": 0.12274195328354835, "num_tokens": 1414478.0, "step": 780 }, { "entropy": 7.319825458526611, "epoch": 0.6746884400515686, "grad_norm": 1.4296875, "learning_rate": 0.00039200000000000004, "loss": 6.9317, "mean_token_accuracy": 0.12083822339773179, "num_tokens": 1423791.0, "step": 785 }, { "entropy": 7.313541460037231, "epoch": 0.6789858186506231, "grad_norm": 1.328125, "learning_rate": 0.00039450000000000005, "loss": 6.975, "mean_token_accuracy": 0.11185284182429314, "num_tokens": 1432955.0, "step": 790 }, { "entropy": 7.242367315292358, "epoch": 0.6832831972496777, "grad_norm": 1.03125, "learning_rate": 0.00039700000000000005, "loss": 6.9394, "mean_token_accuracy": 0.11529579535126686, "num_tokens": 1441907.0, "step": 795 }, { "entropy": 7.173644304275513, "epoch": 0.6875805758487322, "grad_norm": 1.2734375, "learning_rate": 0.0003995, "loss": 6.8059, "mean_token_accuracy": 0.12198502644896507, "num_tokens": 1451062.0, "step": 800 }, { "entropy": 7.2840491771698, "epoch": 0.6918779544477869, "grad_norm": 1.109375, "learning_rate": 0.000402, "loss": 6.8894, "mean_token_accuracy": 0.11644295528531075, "num_tokens": 1460132.0, "step": 805 }, { "entropy": 7.085446500778199, "epoch": 0.6961753330468414, "grad_norm": 1.078125, "learning_rate": 0.0004045, "loss": 6.7896, "mean_token_accuracy": 0.12437586709856988, "num_tokens": 1469582.0, "step": 810 }, { "entropy": 7.180881690979004, "epoch": 0.700472711645896, "grad_norm": 1.4453125, "learning_rate": 0.00040699999999999997, "loss": 6.8844, "mean_token_accuracy": 0.11694586053490638, "num_tokens": 1479053.0, "step": 815 }, { "entropy": 7.176044559478759, "epoch": 0.7047700902449506, "grad_norm": 1.21875, "learning_rate": 0.0004095, "loss": 6.8874, "mean_token_accuracy": 0.11812442615628242, "num_tokens": 1488189.0, "step": 820 }, { "entropy": 7.071721315383911, "epoch": 0.7090674688440052, "grad_norm": 1.2578125, "learning_rate": 0.000412, "loss": 6.7495, "mean_token_accuracy": 0.12273769155144691, "num_tokens": 1497324.0, "step": 825 }, { "entropy": 7.243275499343872, "epoch": 0.7133648474430597, "grad_norm": 1.0546875, "learning_rate": 0.0004145, "loss": 6.8631, "mean_token_accuracy": 0.12297548577189446, "num_tokens": 1506543.0, "step": 830 }, { "entropy": 7.1102629661560055, "epoch": 0.7176622260421143, "grad_norm": 1.171875, "learning_rate": 0.000417, "loss": 6.8571, "mean_token_accuracy": 0.1257997862994671, "num_tokens": 1516737.0, "step": 835 }, { "entropy": 7.015081739425659, "epoch": 0.7219596046411689, "grad_norm": 1.1015625, "learning_rate": 0.0004195, "loss": 6.7311, "mean_token_accuracy": 0.12102818563580513, "num_tokens": 1525561.0, "step": 840 }, { "entropy": 7.17170901298523, "epoch": 0.7262569832402235, "grad_norm": 1.203125, "learning_rate": 0.000422, "loss": 6.757, "mean_token_accuracy": 0.12571127861738204, "num_tokens": 1533323.0, "step": 845 }, { "entropy": 7.173940944671631, "epoch": 0.730554361839278, "grad_norm": 1.2109375, "learning_rate": 0.0004245, "loss": 6.821, "mean_token_accuracy": 0.12750849053263663, "num_tokens": 1542632.0, "step": 850 }, { "entropy": 7.148316097259522, "epoch": 0.7348517404383326, "grad_norm": 1.296875, "learning_rate": 0.000427, "loss": 6.7649, "mean_token_accuracy": 0.12507490813732147, "num_tokens": 1551236.0, "step": 855 }, { "entropy": 6.981910467147827, "epoch": 0.7391491190373872, "grad_norm": 1.21875, "learning_rate": 0.0004295, "loss": 6.7641, "mean_token_accuracy": 0.12514904662966728, "num_tokens": 1559674.0, "step": 860 }, { "entropy": 7.186282157897949, "epoch": 0.7434464976364418, "grad_norm": 1.1484375, "learning_rate": 0.000432, "loss": 6.8498, "mean_token_accuracy": 0.1250532478094101, "num_tokens": 1569481.0, "step": 865 }, { "entropy": 7.118600702285766, "epoch": 0.7477438762354963, "grad_norm": 1.1796875, "learning_rate": 0.0004345, "loss": 6.8888, "mean_token_accuracy": 0.1209896370768547, "num_tokens": 1578488.0, "step": 870 }, { "entropy": 7.105226039886475, "epoch": 0.752041254834551, "grad_norm": 1.078125, "learning_rate": 0.000437, "loss": 6.7736, "mean_token_accuracy": 0.12527675032615662, "num_tokens": 1586675.0, "step": 875 }, { "entropy": 7.185068035125733, "epoch": 0.7563386334336055, "grad_norm": 1.1015625, "learning_rate": 0.0004395, "loss": 6.8782, "mean_token_accuracy": 0.1180253192782402, "num_tokens": 1595411.0, "step": 880 }, { "entropy": 7.179415893554688, "epoch": 0.7606360120326601, "grad_norm": 1.2734375, "learning_rate": 0.000442, "loss": 6.8619, "mean_token_accuracy": 0.12292847484350204, "num_tokens": 1604046.0, "step": 885 }, { "entropy": 7.130577564239502, "epoch": 0.7649333906317146, "grad_norm": 1.15625, "learning_rate": 0.0004445, "loss": 6.8566, "mean_token_accuracy": 0.11715829819440841, "num_tokens": 1613759.0, "step": 890 }, { "entropy": 7.111226511001587, "epoch": 0.7692307692307693, "grad_norm": 1.09375, "learning_rate": 0.000447, "loss": 6.8191, "mean_token_accuracy": 0.1252148814499378, "num_tokens": 1623323.0, "step": 895 }, { "entropy": 7.097943353652954, "epoch": 0.7735281478298238, "grad_norm": 1.21875, "learning_rate": 0.00044950000000000003, "loss": 6.7922, "mean_token_accuracy": 0.11943844705820084, "num_tokens": 1631727.0, "step": 900 }, { "entropy": 7.073408317565918, "epoch": 0.7778255264288784, "grad_norm": 1.21875, "learning_rate": 0.00045200000000000004, "loss": 6.7454, "mean_token_accuracy": 0.12582483813166617, "num_tokens": 1639544.0, "step": 905 }, { "entropy": 7.1905022144317625, "epoch": 0.7821229050279329, "grad_norm": 1.2421875, "learning_rate": 0.00045450000000000004, "loss": 6.8716, "mean_token_accuracy": 0.11673429310321808, "num_tokens": 1648931.0, "step": 910 }, { "entropy": 7.032827425003052, "epoch": 0.7864202836269876, "grad_norm": 1.140625, "learning_rate": 0.00045700000000000005, "loss": 6.7325, "mean_token_accuracy": 0.12737771049141883, "num_tokens": 1657688.0, "step": 915 }, { "entropy": 7.160619735717773, "epoch": 0.7907176622260421, "grad_norm": 1.0859375, "learning_rate": 0.00045950000000000006, "loss": 6.8191, "mean_token_accuracy": 0.11969996094703675, "num_tokens": 1666879.0, "step": 920 }, { "entropy": 7.016655492782593, "epoch": 0.7950150408250967, "grad_norm": 1.125, "learning_rate": 0.000462, "loss": 6.7912, "mean_token_accuracy": 0.12404834032058716, "num_tokens": 1676773.0, "step": 925 }, { "entropy": 7.205742454528808, "epoch": 0.7993124194241513, "grad_norm": 1.140625, "learning_rate": 0.0004645, "loss": 6.8942, "mean_token_accuracy": 0.11682869419455529, "num_tokens": 1686144.0, "step": 930 }, { "entropy": 7.093483018875122, "epoch": 0.8036097980232059, "grad_norm": 1.09375, "learning_rate": 0.000467, "loss": 6.8555, "mean_token_accuracy": 0.11735839322209358, "num_tokens": 1695476.0, "step": 935 }, { "entropy": 7.090408611297607, "epoch": 0.8079071766222604, "grad_norm": 1.1171875, "learning_rate": 0.0004695, "loss": 6.7525, "mean_token_accuracy": 0.12118161767721176, "num_tokens": 1704907.0, "step": 940 }, { "entropy": 7.016019344329834, "epoch": 0.812204555221315, "grad_norm": 1.0078125, "learning_rate": 0.000472, "loss": 6.7924, "mean_token_accuracy": 0.12617168575525284, "num_tokens": 1714564.0, "step": 945 }, { "entropy": 7.132166576385498, "epoch": 0.8165019338203696, "grad_norm": 1.1328125, "learning_rate": 0.0004745, "loss": 6.8135, "mean_token_accuracy": 0.12022659555077553, "num_tokens": 1725285.0, "step": 950 }, { "entropy": 7.00044469833374, "epoch": 0.8207993124194242, "grad_norm": 1.1015625, "learning_rate": 0.000477, "loss": 6.8177, "mean_token_accuracy": 0.12241263464093208, "num_tokens": 1734331.0, "step": 955 }, { "entropy": 7.126689529418945, "epoch": 0.8250966910184787, "grad_norm": 1.28125, "learning_rate": 0.0004795, "loss": 6.749, "mean_token_accuracy": 0.11530287116765976, "num_tokens": 1742340.0, "step": 960 }, { "entropy": 7.05500750541687, "epoch": 0.8293940696175333, "grad_norm": 1.15625, "learning_rate": 0.000482, "loss": 6.7383, "mean_token_accuracy": 0.12545244619250298, "num_tokens": 1751725.0, "step": 965 }, { "entropy": 6.894489717483521, "epoch": 0.8336914482165879, "grad_norm": 1.1796875, "learning_rate": 0.0004845, "loss": 6.6736, "mean_token_accuracy": 0.12856126353144645, "num_tokens": 1760294.0, "step": 970 }, { "entropy": 7.036704349517822, "epoch": 0.8379888268156425, "grad_norm": 1.0859375, "learning_rate": 0.000487, "loss": 6.7265, "mean_token_accuracy": 0.1231304183602333, "num_tokens": 1768912.0, "step": 975 }, { "entropy": 7.092654848098755, "epoch": 0.842286205414697, "grad_norm": 1.140625, "learning_rate": 0.0004895, "loss": 6.9187, "mean_token_accuracy": 0.12804483920335769, "num_tokens": 1778633.0, "step": 980 }, { "entropy": 7.090839195251465, "epoch": 0.8465835840137517, "grad_norm": 1.140625, "learning_rate": 0.000492, "loss": 6.7883, "mean_token_accuracy": 0.12408955544233322, "num_tokens": 1787275.0, "step": 985 }, { "entropy": 7.0695414543151855, "epoch": 0.8508809626128062, "grad_norm": 1.2734375, "learning_rate": 0.0004945, "loss": 6.7844, "mean_token_accuracy": 0.12348324134945869, "num_tokens": 1795994.0, "step": 990 }, { "entropy": 6.964667177200317, "epoch": 0.8551783412118608, "grad_norm": 0.94921875, "learning_rate": 0.000497, "loss": 6.7175, "mean_token_accuracy": 0.12602235972881318, "num_tokens": 1806379.0, "step": 995 }, { "entropy": 7.061655473709107, "epoch": 0.8594757198109153, "grad_norm": 1.09375, "learning_rate": 0.0004995, "loss": 6.7479, "mean_token_accuracy": 0.13024335727095604, "num_tokens": 1816135.0, "step": 1000 }, { "epoch": 0.8594757198109153, "eval_entropy": 6.75515693050247, "eval_loss": 6.752710819244385, "eval_mean_token_accuracy": 0.12811107195175445, "eval_num_tokens": 1816135.0, "eval_runtime": 2.0604, "eval_samples_per_second": 1722.442, "eval_steps_per_second": 215.487, "step": 1000 }, { "entropy": 6.9897054672241214, "epoch": 0.86377309840997, "grad_norm": 1.2890625, "learning_rate": 0.0004999998427807679, "loss": 6.7314, "mean_token_accuracy": 0.12282020673155784, "num_tokens": 1824777.0, "step": 1005 }, { "entropy": 6.925821113586426, "epoch": 0.8680704770090245, "grad_norm": 1.4296875, "learning_rate": 0.0004999992040780138, "loss": 6.8085, "mean_token_accuracy": 0.1247783549129963, "num_tokens": 1833807.0, "step": 1010 }, { "entropy": 7.123036670684814, "epoch": 0.8723678556080791, "grad_norm": 1.078125, "learning_rate": 0.0004999980740669294, "loss": 6.754, "mean_token_accuracy": 0.12499897480010987, "num_tokens": 1843375.0, "step": 1015 }, { "entropy": 7.027141857147217, "epoch": 0.8766652342071336, "grad_norm": 1.1796875, "learning_rate": 0.0004999964527499823, "loss": 6.8155, "mean_token_accuracy": 0.12067028507590294, "num_tokens": 1853036.0, "step": 1020 }, { "entropy": 7.018357038497925, "epoch": 0.8809626128061883, "grad_norm": 1.1328125, "learning_rate": 0.0004999943401307127, "loss": 6.7605, "mean_token_accuracy": 0.12497071847319603, "num_tokens": 1862041.0, "step": 1025 }, { "entropy": 6.984006929397583, "epoch": 0.8852599914052428, "grad_norm": 1.2421875, "learning_rate": 0.0004999917362137337, "loss": 6.6885, "mean_token_accuracy": 0.12735832259058952, "num_tokens": 1870707.0, "step": 1030 }, { "entropy": 6.964999151229859, "epoch": 0.8895573700042974, "grad_norm": 1.140625, "learning_rate": 0.0004999886410047312, "loss": 6.6849, "mean_token_accuracy": 0.12543184384703637, "num_tokens": 1879787.0, "step": 1035 }, { "entropy": 7.046022748947143, "epoch": 0.8938547486033519, "grad_norm": 1.1171875, "learning_rate": 0.0004999850545104638, "loss": 6.7336, "mean_token_accuracy": 0.12585699930787086, "num_tokens": 1889413.0, "step": 1040 }, { "entropy": 6.9450146675109865, "epoch": 0.8981521272024066, "grad_norm": 1.265625, "learning_rate": 0.0004999809767387633, "loss": 6.7291, "mean_token_accuracy": 0.12462790235877037, "num_tokens": 1898283.0, "step": 1045 }, { "entropy": 6.982704973220825, "epoch": 0.9024495058014611, "grad_norm": 1.109375, "learning_rate": 0.0004999764076985337, "loss": 6.7474, "mean_token_accuracy": 0.12953734770417213, "num_tokens": 1907175.0, "step": 1050 }, { "entropy": 6.947793340682983, "epoch": 0.9067468844005157, "grad_norm": 1.109375, "learning_rate": 0.0004999713473997519, "loss": 6.7933, "mean_token_accuracy": 0.12337937280535698, "num_tokens": 1918223.0, "step": 1055 }, { "entropy": 7.053569555282593, "epoch": 0.9110442629995703, "grad_norm": 1.109375, "learning_rate": 0.0004999657958534677, "loss": 6.7435, "mean_token_accuracy": 0.11936211958527565, "num_tokens": 1928801.0, "step": 1060 }, { "entropy": 6.874362564086914, "epoch": 0.9153416415986249, "grad_norm": 1.1171875, "learning_rate": 0.0004999597530718034, "loss": 6.7076, "mean_token_accuracy": 0.12535862401127815, "num_tokens": 1937406.0, "step": 1065 }, { "entropy": 6.924251508712769, "epoch": 0.9196390201976794, "grad_norm": 1.1171875, "learning_rate": 0.000499953219067954, "loss": 6.7025, "mean_token_accuracy": 0.12463184967637062, "num_tokens": 1947184.0, "step": 1070 }, { "entropy": 7.056308698654175, "epoch": 0.923936398796734, "grad_norm": 1.15625, "learning_rate": 0.0004999461938561873, "loss": 6.7241, "mean_token_accuracy": 0.12476856112480164, "num_tokens": 1956293.0, "step": 1075 }, { "entropy": 6.90220274925232, "epoch": 0.9282337773957886, "grad_norm": 1.1328125, "learning_rate": 0.0004999386774518432, "loss": 6.6968, "mean_token_accuracy": 0.12625648751854895, "num_tokens": 1964791.0, "step": 1080 }, { "entropy": 6.965981435775757, "epoch": 0.9325311559948432, "grad_norm": 1.0546875, "learning_rate": 0.0004999306698713349, "loss": 6.616, "mean_token_accuracy": 0.12837354317307473, "num_tokens": 1973754.0, "step": 1085 }, { "entropy": 6.929974555969238, "epoch": 0.9368285345938977, "grad_norm": 1.1015625, "learning_rate": 0.0004999221711321477, "loss": 6.6857, "mean_token_accuracy": 0.12695353776216506, "num_tokens": 1983035.0, "step": 1090 }, { "entropy": 6.804391956329345, "epoch": 0.9411259131929522, "grad_norm": 1.0859375, "learning_rate": 0.0004999131812528393, "loss": 6.7126, "mean_token_accuracy": 0.12742481231689454, "num_tokens": 1992584.0, "step": 1095 }, { "entropy": 7.0129533290863035, "epoch": 0.9454232917920069, "grad_norm": 0.94140625, "learning_rate": 0.00049990370025304, "loss": 6.745, "mean_token_accuracy": 0.1250165306031704, "num_tokens": 2001876.0, "step": 1100 }, { "entropy": 6.9361108303070065, "epoch": 0.9497206703910615, "grad_norm": 1.015625, "learning_rate": 0.0004998937281534526, "loss": 6.6354, "mean_token_accuracy": 0.1352070689201355, "num_tokens": 2011067.0, "step": 1105 }, { "entropy": 7.00281867980957, "epoch": 0.954018048990116, "grad_norm": 1.140625, "learning_rate": 0.0004998832649758521, "loss": 6.7191, "mean_token_accuracy": 0.12910578772425652, "num_tokens": 2020763.0, "step": 1110 }, { "entropy": 6.846075534820557, "epoch": 0.9583154275891707, "grad_norm": 1.2421875, "learning_rate": 0.0004998723107430862, "loss": 6.702, "mean_token_accuracy": 0.12597106099128724, "num_tokens": 2029534.0, "step": 1115 }, { "entropy": 6.979312801361084, "epoch": 0.9626128061882252, "grad_norm": 1.109375, "learning_rate": 0.0004998608654790741, "loss": 6.6576, "mean_token_accuracy": 0.12685178518295287, "num_tokens": 2039143.0, "step": 1120 }, { "entropy": 6.840395832061768, "epoch": 0.9669101847872797, "grad_norm": 1.1953125, "learning_rate": 0.000499848929208808, "loss": 6.619, "mean_token_accuracy": 0.13090287074446677, "num_tokens": 2048253.0, "step": 1125 }, { "entropy": 6.833210182189942, "epoch": 0.9712075633863343, "grad_norm": 1.234375, "learning_rate": 0.0004998365019583519, "loss": 6.6747, "mean_token_accuracy": 0.13630941957235337, "num_tokens": 2057234.0, "step": 1130 }, { "entropy": 7.008919525146484, "epoch": 0.975504941985389, "grad_norm": 1.203125, "learning_rate": 0.0004998235837548417, "loss": 6.7058, "mean_token_accuracy": 0.12927891165018082, "num_tokens": 2065431.0, "step": 1135 }, { "entropy": 6.887974071502685, "epoch": 0.9798023205844435, "grad_norm": 1.1015625, "learning_rate": 0.000499810174626486, "loss": 6.7146, "mean_token_accuracy": 0.1267981804907322, "num_tokens": 2074723.0, "step": 1140 }, { "entropy": 6.909135150909424, "epoch": 0.984099699183498, "grad_norm": 1.2265625, "learning_rate": 0.0004997962746025646, "loss": 6.5835, "mean_token_accuracy": 0.13582983165979384, "num_tokens": 2084509.0, "step": 1145 }, { "entropy": 6.8790112972259525, "epoch": 0.9883970777825526, "grad_norm": 1.1875, "learning_rate": 0.0004997818837134298, "loss": 6.7192, "mean_token_accuracy": 0.13046733066439628, "num_tokens": 2093110.0, "step": 1150 }, { "entropy": 6.820547676086425, "epoch": 0.9926944563816072, "grad_norm": 1.1484375, "learning_rate": 0.0004997670019905057, "loss": 6.5939, "mean_token_accuracy": 0.12773325443267822, "num_tokens": 2102355.0, "step": 1155 }, { "entropy": 6.849571800231933, "epoch": 0.9969918349806618, "grad_norm": 1.2109375, "learning_rate": 0.0004997516294662876, "loss": 6.6207, "mean_token_accuracy": 0.1278907351195812, "num_tokens": 2110418.0, "step": 1160 }, { "entropy": 6.932281441158718, "epoch": 1.0008594757198108, "grad_norm": 1.1796875, "learning_rate": 0.0004997357661743433, "loss": 6.6076, "mean_token_accuracy": 0.13429299659199184, "num_tokens": 2117866.0, "step": 1165 }, { "entropy": 6.776707983016967, "epoch": 1.0051568543188656, "grad_norm": 1.1171875, "learning_rate": 0.0004997194121493118, "loss": 6.4353, "mean_token_accuracy": 0.14019777849316598, "num_tokens": 2126082.0, "step": 1170 }, { "entropy": 6.887734413146973, "epoch": 1.0094542329179201, "grad_norm": 1.0859375, "learning_rate": 0.0004997025674269037, "loss": 6.4211, "mean_token_accuracy": 0.13955733701586723, "num_tokens": 2134042.0, "step": 1175 }, { "entropy": 6.774314117431641, "epoch": 1.0137516115169747, "grad_norm": 1.2109375, "learning_rate": 0.0004996852320439013, "loss": 6.4895, "mean_token_accuracy": 0.13937605321407318, "num_tokens": 2142570.0, "step": 1180 }, { "entropy": 6.8031017780303955, "epoch": 1.0180489901160292, "grad_norm": 1.015625, "learning_rate": 0.0004996674060381578, "loss": 6.4187, "mean_token_accuracy": 0.13786159604787826, "num_tokens": 2151310.0, "step": 1185 }, { "entropy": 6.884524583816528, "epoch": 1.0223463687150838, "grad_norm": 1.2109375, "learning_rate": 0.0004996490894485985, "loss": 6.4993, "mean_token_accuracy": 0.1331610009074211, "num_tokens": 2160662.0, "step": 1190 }, { "entropy": 6.801689147949219, "epoch": 1.0266437473141383, "grad_norm": 1.1484375, "learning_rate": 0.0004996302823152193, "loss": 6.445, "mean_token_accuracy": 0.13591438457369803, "num_tokens": 2170067.0, "step": 1195 }, { "entropy": 6.76284008026123, "epoch": 1.0309411259131929, "grad_norm": 1.15625, "learning_rate": 0.0004996109846790873, "loss": 6.4084, "mean_token_accuracy": 0.14033972024917601, "num_tokens": 2178850.0, "step": 1200 }, { "entropy": 6.71863865852356, "epoch": 1.0352385045122476, "grad_norm": 1.0, "learning_rate": 0.0004995911965823412, "loss": 6.4263, "mean_token_accuracy": 0.1453915849328041, "num_tokens": 2188307.0, "step": 1205 }, { "entropy": 6.847736549377442, "epoch": 1.0395358831113022, "grad_norm": 1.21875, "learning_rate": 0.0004995709180681899, "loss": 6.4144, "mean_token_accuracy": 0.1416982263326645, "num_tokens": 2197026.0, "step": 1210 }, { "entropy": 6.729686546325683, "epoch": 1.0438332617103567, "grad_norm": 1.125, "learning_rate": 0.000499550149180914, "loss": 6.4003, "mean_token_accuracy": 0.13990466818213462, "num_tokens": 2205537.0, "step": 1215 }, { "entropy": 6.780020618438721, "epoch": 1.0481306403094113, "grad_norm": 1.15625, "learning_rate": 0.0004995288899658641, "loss": 6.4298, "mean_token_accuracy": 0.1448238343000412, "num_tokens": 2214508.0, "step": 1220 }, { "entropy": 6.842759847640991, "epoch": 1.0524280189084658, "grad_norm": 1.171875, "learning_rate": 0.0004995071404694619, "loss": 6.5391, "mean_token_accuracy": 0.1354886084794998, "num_tokens": 2223084.0, "step": 1225 }, { "entropy": 6.7924669742584225, "epoch": 1.0567253975075204, "grad_norm": 1.078125, "learning_rate": 0.0004994849007391996, "loss": 6.4679, "mean_token_accuracy": 0.13138427063822747, "num_tokens": 2231406.0, "step": 1230 }, { "entropy": 6.731750345230102, "epoch": 1.061022776106575, "grad_norm": 1.1328125, "learning_rate": 0.0004994621708236401, "loss": 6.3805, "mean_token_accuracy": 0.14119497835636138, "num_tokens": 2239867.0, "step": 1235 }, { "entropy": 6.745153379440308, "epoch": 1.0653201547056295, "grad_norm": 1.2265625, "learning_rate": 0.000499438950772416, "loss": 6.4467, "mean_token_accuracy": 0.1372622825205326, "num_tokens": 2248844.0, "step": 1240 }, { "entropy": 6.710582876205445, "epoch": 1.0696175333046842, "grad_norm": 1.078125, "learning_rate": 0.0004994152406362311, "loss": 6.3633, "mean_token_accuracy": 0.14102791994810104, "num_tokens": 2257599.0, "step": 1245 }, { "entropy": 6.773756074905395, "epoch": 1.0739149119037388, "grad_norm": 1.296875, "learning_rate": 0.0004993910404668586, "loss": 6.418, "mean_token_accuracy": 0.13638516888022423, "num_tokens": 2266510.0, "step": 1250 }, { "entropy": 6.720381832122802, "epoch": 1.0782122905027933, "grad_norm": 1.03125, "learning_rate": 0.000499366350317142, "loss": 6.4145, "mean_token_accuracy": 0.1418795846402645, "num_tokens": 2275462.0, "step": 1255 }, { "entropy": 6.712311601638794, "epoch": 1.0825096691018479, "grad_norm": 1.15625, "learning_rate": 0.0004993411702409948, "loss": 6.3874, "mean_token_accuracy": 0.1354715533554554, "num_tokens": 2283826.0, "step": 1260 }, { "entropy": 6.76007399559021, "epoch": 1.0868070477009024, "grad_norm": 1.3203125, "learning_rate": 0.0004993155002934002, "loss": 6.3997, "mean_token_accuracy": 0.13856483697891236, "num_tokens": 2292967.0, "step": 1265 }, { "entropy": 6.8389280319213865, "epoch": 1.091104426299957, "grad_norm": 1.7109375, "learning_rate": 0.0004992893405304111, "loss": 6.5262, "mean_token_accuracy": 0.13781826868653296, "num_tokens": 2302336.0, "step": 1270 }, { "entropy": 6.64991979598999, "epoch": 1.0954018048990115, "grad_norm": 1.078125, "learning_rate": 0.00049926269100915, "loss": 6.4293, "mean_token_accuracy": 0.1432204395532608, "num_tokens": 2311465.0, "step": 1275 }, { "entropy": 6.792691707611084, "epoch": 1.0996991834980663, "grad_norm": 1.140625, "learning_rate": 0.0004992355517878087, "loss": 6.542, "mean_token_accuracy": 0.13071493357419967, "num_tokens": 2320281.0, "step": 1280 }, { "entropy": 6.689556837081909, "epoch": 1.1039965620971208, "grad_norm": 1.171875, "learning_rate": 0.0004992079229256484, "loss": 6.4431, "mean_token_accuracy": 0.1360026031732559, "num_tokens": 2329755.0, "step": 1285 }, { "entropy": 6.6757041931152346, "epoch": 1.1082939406961754, "grad_norm": 1.0546875, "learning_rate": 0.0004991798044829996, "loss": 6.3861, "mean_token_accuracy": 0.1369478650391102, "num_tokens": 2338807.0, "step": 1290 }, { "entropy": 6.7733612060546875, "epoch": 1.11259131929523, "grad_norm": 1.171875, "learning_rate": 0.0004991511965212618, "loss": 6.4719, "mean_token_accuracy": 0.13780709579586983, "num_tokens": 2348056.0, "step": 1295 }, { "entropy": 6.688971424102784, "epoch": 1.1168886978942845, "grad_norm": 1.1171875, "learning_rate": 0.0004991220991029032, "loss": 6.4868, "mean_token_accuracy": 0.13366840407252312, "num_tokens": 2357780.0, "step": 1300 }, { "entropy": 6.773650407791138, "epoch": 1.121186076493339, "grad_norm": 1.3046875, "learning_rate": 0.000499092512291461, "loss": 6.4446, "mean_token_accuracy": 0.13651487827301026, "num_tokens": 2367060.0, "step": 1305 }, { "entropy": 6.7718230247497555, "epoch": 1.1254834550923936, "grad_norm": 1.0703125, "learning_rate": 0.000499062436151541, "loss": 6.441, "mean_token_accuracy": 0.1382215812802315, "num_tokens": 2375751.0, "step": 1310 }, { "entropy": 6.800968360900879, "epoch": 1.129780833691448, "grad_norm": 1.1640625, "learning_rate": 0.0004990318707488173, "loss": 6.5069, "mean_token_accuracy": 0.13017478883266448, "num_tokens": 2385013.0, "step": 1315 }, { "entropy": 6.692961692810059, "epoch": 1.1340782122905029, "grad_norm": 1.1953125, "learning_rate": 0.0004990008161500327, "loss": 6.3937, "mean_token_accuracy": 0.14006393477320672, "num_tokens": 2392935.0, "step": 1320 }, { "entropy": 6.706206512451172, "epoch": 1.1383755908895574, "grad_norm": 1.2578125, "learning_rate": 0.000498969272422998, "loss": 6.4188, "mean_token_accuracy": 0.1468452200293541, "num_tokens": 2401560.0, "step": 1325 }, { "entropy": 6.711210012435913, "epoch": 1.142672969488612, "grad_norm": 1.1328125, "learning_rate": 0.0004989372396365921, "loss": 6.3447, "mean_token_accuracy": 0.1455326870083809, "num_tokens": 2410050.0, "step": 1330 }, { "entropy": 6.756243276596069, "epoch": 1.1469703480876665, "grad_norm": 1.1796875, "learning_rate": 0.0004989047178607618, "loss": 6.4505, "mean_token_accuracy": 0.13842038065195084, "num_tokens": 2418980.0, "step": 1335 }, { "entropy": 6.671654081344604, "epoch": 1.151267726686721, "grad_norm": 1.1328125, "learning_rate": 0.0004988717071665215, "loss": 6.4407, "mean_token_accuracy": 0.13684784546494483, "num_tokens": 2427992.0, "step": 1340 }, { "entropy": 6.762688112258911, "epoch": 1.1555651052857756, "grad_norm": 1.046875, "learning_rate": 0.0004988382076259537, "loss": 6.3572, "mean_token_accuracy": 0.14135119169950486, "num_tokens": 2436368.0, "step": 1345 }, { "entropy": 6.5892657279968265, "epoch": 1.1598624838848304, "grad_norm": 1.0546875, "learning_rate": 0.0004988042193122077, "loss": 6.3456, "mean_token_accuracy": 0.14492984861135483, "num_tokens": 2445499.0, "step": 1350 }, { "entropy": 6.752876138687133, "epoch": 1.164159862483885, "grad_norm": 1.2265625, "learning_rate": 0.0004987697422995005, "loss": 6.3818, "mean_token_accuracy": 0.13490121066570282, "num_tokens": 2454312.0, "step": 1355 }, { "entropy": 6.647862577438355, "epoch": 1.1684572410829395, "grad_norm": 1.109375, "learning_rate": 0.0004987347766631161, "loss": 6.4437, "mean_token_accuracy": 0.1407245770096779, "num_tokens": 2462922.0, "step": 1360 }, { "entropy": 6.755164289474488, "epoch": 1.172754619681994, "grad_norm": 1.0703125, "learning_rate": 0.0004986993224794055, "loss": 6.4781, "mean_token_accuracy": 0.13789629712700843, "num_tokens": 2472195.0, "step": 1365 }, { "entropy": 6.6456316947937015, "epoch": 1.1770519982810486, "grad_norm": 1.1953125, "learning_rate": 0.0004986633798257865, "loss": 6.3829, "mean_token_accuracy": 0.14376115351915358, "num_tokens": 2481021.0, "step": 1370 }, { "entropy": 6.657115125656128, "epoch": 1.181349376880103, "grad_norm": 1.15625, "learning_rate": 0.0004986269487807434, "loss": 6.405, "mean_token_accuracy": 0.13883866667747496, "num_tokens": 2490250.0, "step": 1375 }, { "entropy": 6.763047981262207, "epoch": 1.1856467554791577, "grad_norm": 1.0859375, "learning_rate": 0.000498590029423827, "loss": 6.4581, "mean_token_accuracy": 0.14272229447960855, "num_tokens": 2499122.0, "step": 1380 }, { "entropy": 6.686977815628052, "epoch": 1.1899441340782122, "grad_norm": 1.109375, "learning_rate": 0.0004985526218356546, "loss": 6.4227, "mean_token_accuracy": 0.13726608753204345, "num_tokens": 2508454.0, "step": 1385 }, { "entropy": 6.699887418746949, "epoch": 1.1942415126772667, "grad_norm": 1.1328125, "learning_rate": 0.0004985147260979093, "loss": 6.3632, "mean_token_accuracy": 0.1465839110314846, "num_tokens": 2517353.0, "step": 1390 }, { "entropy": 6.691904354095459, "epoch": 1.1985388912763215, "grad_norm": 1.1796875, "learning_rate": 0.0004984763422933402, "loss": 6.3821, "mean_token_accuracy": 0.14337702393531798, "num_tokens": 2526321.0, "step": 1395 }, { "entropy": 6.6859358787536625, "epoch": 1.202836269875376, "grad_norm": 1.0078125, "learning_rate": 0.0004984374705057623, "loss": 6.4144, "mean_token_accuracy": 0.14242582842707635, "num_tokens": 2535924.0, "step": 1400 }, { "entropy": 6.640392780303955, "epoch": 1.2071336484744306, "grad_norm": 1.171875, "learning_rate": 0.0004983981108200561, "loss": 6.3922, "mean_token_accuracy": 0.1401688925921917, "num_tokens": 2545606.0, "step": 1405 }, { "entropy": 6.649671459197998, "epoch": 1.2114310270734852, "grad_norm": 1.171875, "learning_rate": 0.0004983582633221672, "loss": 6.3859, "mean_token_accuracy": 0.1407300591468811, "num_tokens": 2554947.0, "step": 1410 }, { "entropy": 6.765527582168579, "epoch": 1.2157284056725397, "grad_norm": 1.0234375, "learning_rate": 0.0004983179280991068, "loss": 6.5354, "mean_token_accuracy": 0.13627680763602257, "num_tokens": 2564462.0, "step": 1415 }, { "entropy": 6.688222122192383, "epoch": 1.2200257842715942, "grad_norm": 1.1328125, "learning_rate": 0.0004982771052389508, "loss": 6.3743, "mean_token_accuracy": 0.1444454774260521, "num_tokens": 2573124.0, "step": 1420 }, { "entropy": 6.700618696212769, "epoch": 1.224323162870649, "grad_norm": 1.1484375, "learning_rate": 0.0004982357948308401, "loss": 6.4798, "mean_token_accuracy": 0.13040754944086075, "num_tokens": 2581829.0, "step": 1425 }, { "entropy": 6.7136975765228275, "epoch": 1.2286205414697036, "grad_norm": 1.1328125, "learning_rate": 0.0004981939969649799, "loss": 6.3405, "mean_token_accuracy": 0.1422662131488323, "num_tokens": 2590631.0, "step": 1430 }, { "entropy": 6.661464500427246, "epoch": 1.232917920068758, "grad_norm": 1.1796875, "learning_rate": 0.0004981517117326404, "loss": 6.4484, "mean_token_accuracy": 0.13987314701080322, "num_tokens": 2600684.0, "step": 1435 }, { "entropy": 6.6479767799377445, "epoch": 1.2372152986678127, "grad_norm": 1.0859375, "learning_rate": 0.0004981089392261553, "loss": 6.3605, "mean_token_accuracy": 0.14449947997927665, "num_tokens": 2609667.0, "step": 1440 }, { "entropy": 6.643135976791382, "epoch": 1.2415126772668672, "grad_norm": 1.0, "learning_rate": 0.000498065679538923, "loss": 6.4317, "mean_token_accuracy": 0.14703501164913177, "num_tokens": 2620025.0, "step": 1445 }, { "entropy": 6.672731685638428, "epoch": 1.2458100558659218, "grad_norm": 1.1484375, "learning_rate": 0.0004980219327654049, "loss": 6.351, "mean_token_accuracy": 0.14008775800466539, "num_tokens": 2629032.0, "step": 1450 }, { "entropy": 6.605780506134034, "epoch": 1.2501074344649763, "grad_norm": 1.15625, "learning_rate": 0.000497977699001127, "loss": 6.3357, "mean_token_accuracy": 0.1428795799612999, "num_tokens": 2638303.0, "step": 1455 }, { "entropy": 6.698618459701538, "epoch": 1.2544048130640308, "grad_norm": 1.1328125, "learning_rate": 0.0004979329783426778, "loss": 6.3527, "mean_token_accuracy": 0.14518981352448462, "num_tokens": 2647902.0, "step": 1460 }, { "entropy": 6.619544601440429, "epoch": 1.2587021916630854, "grad_norm": 1.1015625, "learning_rate": 0.0004978877708877094, "loss": 6.4046, "mean_token_accuracy": 0.1414396196603775, "num_tokens": 2657902.0, "step": 1465 }, { "entropy": 6.67303991317749, "epoch": 1.2629995702621402, "grad_norm": 1.09375, "learning_rate": 0.0004978420767349368, "loss": 6.3504, "mean_token_accuracy": 0.14340997561812402, "num_tokens": 2667082.0, "step": 1470 }, { "entropy": 6.647952270507813, "epoch": 1.2672969488611947, "grad_norm": 1.0546875, "learning_rate": 0.0004977958959841379, "loss": 6.4223, "mean_token_accuracy": 0.1364084042608738, "num_tokens": 2676855.0, "step": 1475 }, { "entropy": 6.6442427158355715, "epoch": 1.2715943274602493, "grad_norm": 1.1015625, "learning_rate": 0.000497749228736153, "loss": 6.3546, "mean_token_accuracy": 0.145116026699543, "num_tokens": 2685750.0, "step": 1480 }, { "entropy": 6.597840929031372, "epoch": 1.2758917060593038, "grad_norm": 1.1953125, "learning_rate": 0.0004977020750928845, "loss": 6.4075, "mean_token_accuracy": 0.14761355221271516, "num_tokens": 2695272.0, "step": 1485 }, { "entropy": 6.709882497787476, "epoch": 1.2801890846583583, "grad_norm": 1.0703125, "learning_rate": 0.0004976544351572973, "loss": 6.3504, "mean_token_accuracy": 0.1418570265173912, "num_tokens": 2704806.0, "step": 1490 }, { "entropy": 6.533363771438599, "epoch": 1.2844864632574131, "grad_norm": 1.09375, "learning_rate": 0.0004976063090334179, "loss": 6.4036, "mean_token_accuracy": 0.1452034071087837, "num_tokens": 2713521.0, "step": 1495 }, { "entropy": 6.7042053699493405, "epoch": 1.2887838418564677, "grad_norm": 1.171875, "learning_rate": 0.0004975576968263346, "loss": 6.3966, "mean_token_accuracy": 0.1381194919347763, "num_tokens": 2721848.0, "step": 1500 }, { "epoch": 1.2887838418564677, "eval_entropy": 6.494678375957249, "eval_loss": 6.482933044433594, "eval_mean_token_accuracy": 0.14236528785513328, "eval_num_tokens": 2721848.0, "eval_runtime": 2.0538, "eval_samples_per_second": 1728.039, "eval_steps_per_second": 216.187, "step": 1500 } ], "logging_steps": 5, "max_steps": 11630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 613354283642880.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }