{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.161581435324452, "eval_steps": 500, "global_step": 9500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742608070373535, "epoch": 0.004297378599054577, "grad_norm": 5.46875, "learning_rate": 2e-06, "loss": 10.7643, "mean_token_accuracy": 7.587253348901868e-05, "num_tokens": 10107.0, "step": 5 }, { "entropy": 10.742630290985108, "epoch": 0.008594757198109154, "grad_norm": 5.78125, "learning_rate": 4.5e-06, "loss": 10.7086, "mean_token_accuracy": 0.0, "num_tokens": 18391.0, "step": 10 }, { "entropy": 10.74263505935669, "epoch": 0.01289213579716373, "grad_norm": 5.3125, "learning_rate": 7e-06, "loss": 10.6888, "mean_token_accuracy": 7.022471982054412e-05, "num_tokens": 27061.0, "step": 15 }, { "entropy": 10.742604160308838, "epoch": 0.017189514396218308, "grad_norm": 6.0, "learning_rate": 9.5e-06, "loss": 10.6611, "mean_token_accuracy": 0.0008422504703048617, "num_tokens": 36339.0, "step": 20 }, { "entropy": 10.742517948150635, "epoch": 0.021486892995272882, "grad_norm": 4.75, "learning_rate": 1.2e-05, "loss": 10.5317, "mean_token_accuracy": 0.02025789166800678, "num_tokens": 45770.0, "step": 25 }, { "entropy": 10.741962242126466, "epoch": 0.02578427159432746, "grad_norm": 4.25, "learning_rate": 1.4500000000000002e-05, "loss": 10.399, "mean_token_accuracy": 0.04876907132565975, "num_tokens": 54575.0, "step": 30 }, { "entropy": 10.73945140838623, "epoch": 0.030081650193382038, "grad_norm": 3.15625, "learning_rate": 1.7000000000000003e-05, "loss": 10.3065, "mean_token_accuracy": 0.0514072135090828, "num_tokens": 66403.0, "step": 35 }, { "entropy": 10.730937385559082, "epoch": 0.034379028792436615, "grad_norm": 2.640625, "learning_rate": 1.95e-05, "loss": 10.0976, "mean_token_accuracy": 0.05973539762198925, "num_tokens": 76510.0, "step": 40 }, { "entropy": 10.715238952636719, "epoch": 0.03867640739149119, "grad_norm": 2.40625, "learning_rate": 2.2e-05, "loss": 9.9688, "mean_token_accuracy": 0.05614017099142075, "num_tokens": 84836.0, "step": 45 }, { "entropy": 10.702037715911866, "epoch": 0.042973785990545764, "grad_norm": 2.046875, "learning_rate": 2.4500000000000003e-05, "loss": 9.9015, "mean_token_accuracy": 0.053829558193683624, "num_tokens": 93197.0, "step": 50 }, { "entropy": 10.697910690307618, "epoch": 0.047271164589600345, "grad_norm": 2.40625, "learning_rate": 2.7e-05, "loss": 9.8366, "mean_token_accuracy": 0.05843428298830986, "num_tokens": 101546.0, "step": 55 }, { "entropy": 10.693470478057861, "epoch": 0.05156854318865492, "grad_norm": 1.9609375, "learning_rate": 2.95e-05, "loss": 9.8429, "mean_token_accuracy": 0.0558084711432457, "num_tokens": 111703.0, "step": 60 }, { "entropy": 10.680869865417481, "epoch": 0.055865921787709494, "grad_norm": 1.9453125, "learning_rate": 3.2e-05, "loss": 9.7131, "mean_token_accuracy": 0.0589165486395359, "num_tokens": 119894.0, "step": 65 }, { "entropy": 10.668927574157715, "epoch": 0.060163300386764075, "grad_norm": 1.9765625, "learning_rate": 3.4500000000000005e-05, "loss": 9.6682, "mean_token_accuracy": 0.06148771904408932, "num_tokens": 128885.0, "step": 70 }, { "entropy": 10.654484272003174, "epoch": 0.06446067898581866, "grad_norm": 1.953125, "learning_rate": 3.7e-05, "loss": 9.6297, "mean_token_accuracy": 0.057728851959109304, "num_tokens": 138106.0, "step": 75 }, { "entropy": 10.645826625823975, "epoch": 0.06875805758487323, "grad_norm": 1.9296875, "learning_rate": 3.95e-05, "loss": 9.5722, "mean_token_accuracy": 0.058954347297549246, "num_tokens": 146691.0, "step": 80 }, { "entropy": 10.637816619873046, "epoch": 0.0730554361839278, "grad_norm": 1.90625, "learning_rate": 4.2000000000000004e-05, "loss": 9.5126, "mean_token_accuracy": 0.059067190065979956, "num_tokens": 155792.0, "step": 85 }, { "entropy": 10.63103084564209, "epoch": 0.07735281478298238, "grad_norm": 1.7890625, "learning_rate": 4.45e-05, "loss": 9.5251, "mean_token_accuracy": 0.0552229531109333, "num_tokens": 166944.0, "step": 90 }, { "entropy": 10.616693305969239, "epoch": 0.08165019338203695, "grad_norm": 1.96875, "learning_rate": 4.7000000000000004e-05, "loss": 9.3423, "mean_token_accuracy": 0.060124922543764114, "num_tokens": 175303.0, "step": 95 }, { "entropy": 10.591300106048584, "epoch": 0.08594757198109153, "grad_norm": 1.8203125, "learning_rate": 4.9500000000000004e-05, "loss": 9.3133, "mean_token_accuracy": 0.06174388714134693, "num_tokens": 184708.0, "step": 100 }, { "entropy": 10.564336776733398, "epoch": 0.09024495058014612, "grad_norm": 1.7890625, "learning_rate": 5.2e-05, "loss": 9.2307, "mean_token_accuracy": 0.0674959484487772, "num_tokens": 193835.0, "step": 105 }, { "entropy": 10.52622423171997, "epoch": 0.09454232917920069, "grad_norm": 1.8828125, "learning_rate": 5.45e-05, "loss": 9.1379, "mean_token_accuracy": 0.07480009235441684, "num_tokens": 203344.0, "step": 110 }, { "entropy": 10.454349136352539, "epoch": 0.09883970777825526, "grad_norm": 1.6171875, "learning_rate": 5.7e-05, "loss": 9.1209, "mean_token_accuracy": 0.06218625903129578, "num_tokens": 213048.0, "step": 115 }, { "entropy": 10.415324211120605, "epoch": 0.10313708637730984, "grad_norm": 1.578125, "learning_rate": 5.9499999999999996e-05, "loss": 8.9306, "mean_token_accuracy": 0.07533645890653133, "num_tokens": 221784.0, "step": 120 }, { "entropy": 10.303644943237305, "epoch": 0.10743446497636441, "grad_norm": 1.4765625, "learning_rate": 6.2e-05, "loss": 8.8509, "mean_token_accuracy": 0.07504003196954727, "num_tokens": 230971.0, "step": 125 }, { "entropy": 10.209668159484863, "epoch": 0.11173184357541899, "grad_norm": 1.4296875, "learning_rate": 6.450000000000001e-05, "loss": 8.7412, "mean_token_accuracy": 0.07478504739701748, "num_tokens": 240524.0, "step": 130 }, { "entropy": 10.153745365142822, "epoch": 0.11602922217447358, "grad_norm": 1.3359375, "learning_rate": 6.7e-05, "loss": 8.6323, "mean_token_accuracy": 0.07354197278618813, "num_tokens": 249220.0, "step": 135 }, { "entropy": 10.068094253540039, "epoch": 0.12032660077352815, "grad_norm": 1.3125, "learning_rate": 6.950000000000001e-05, "loss": 8.61, "mean_token_accuracy": 0.07049238979816437, "num_tokens": 258934.0, "step": 140 }, { "entropy": 9.973960685729981, "epoch": 0.12462397937258272, "grad_norm": 1.2734375, "learning_rate": 7.2e-05, "loss": 8.4673, "mean_token_accuracy": 0.07534252405166626, "num_tokens": 267680.0, "step": 145 }, { "entropy": 9.815561103820801, "epoch": 0.1289213579716373, "grad_norm": 1.09375, "learning_rate": 7.45e-05, "loss": 8.3709, "mean_token_accuracy": 0.07952065020799637, "num_tokens": 276227.0, "step": 150 }, { "entropy": 9.66996259689331, "epoch": 0.1332187365706919, "grad_norm": 1.1875, "learning_rate": 7.7e-05, "loss": 8.2269, "mean_token_accuracy": 0.08225171342492103, "num_tokens": 286342.0, "step": 155 }, { "entropy": 9.510671615600586, "epoch": 0.13751611516974646, "grad_norm": 0.953125, "learning_rate": 7.950000000000001e-05, "loss": 8.1921, "mean_token_accuracy": 0.0742720566689968, "num_tokens": 294994.0, "step": 160 }, { "entropy": 9.346861934661865, "epoch": 0.14181349376880104, "grad_norm": 0.984375, "learning_rate": 8.2e-05, "loss": 8.113, "mean_token_accuracy": 0.08004417940974236, "num_tokens": 303882.0, "step": 165 }, { "entropy": 9.199288940429687, "epoch": 0.1461108723678556, "grad_norm": 0.9296875, "learning_rate": 8.450000000000001e-05, "loss": 8.0403, "mean_token_accuracy": 0.07799897268414498, "num_tokens": 312515.0, "step": 170 }, { "entropy": 8.978620052337646, "epoch": 0.15040825096691018, "grad_norm": 0.9375, "learning_rate": 8.7e-05, "loss": 7.9977, "mean_token_accuracy": 0.07381256259977817, "num_tokens": 320801.0, "step": 175 }, { "entropy": 8.861582374572754, "epoch": 0.15470562956596476, "grad_norm": 0.9765625, "learning_rate": 8.95e-05, "loss": 7.9642, "mean_token_accuracy": 0.08192512467503547, "num_tokens": 329382.0, "step": 180 }, { "entropy": 8.755144786834716, "epoch": 0.15900300816501933, "grad_norm": 0.9296875, "learning_rate": 9.2e-05, "loss": 7.9273, "mean_token_accuracy": 0.07583913430571557, "num_tokens": 337894.0, "step": 185 }, { "entropy": 8.582227611541748, "epoch": 0.1633003867640739, "grad_norm": 0.8984375, "learning_rate": 9.45e-05, "loss": 7.9012, "mean_token_accuracy": 0.07614588961005211, "num_tokens": 346380.0, "step": 190 }, { "entropy": 8.591823768615722, "epoch": 0.16759776536312848, "grad_norm": 0.9609375, "learning_rate": 9.7e-05, "loss": 7.9407, "mean_token_accuracy": 0.07390806600451469, "num_tokens": 356305.0, "step": 195 }, { "entropy": 8.515201950073243, "epoch": 0.17189514396218306, "grad_norm": 1.1328125, "learning_rate": 9.95e-05, "loss": 7.8901, "mean_token_accuracy": 0.07247771993279457, "num_tokens": 364899.0, "step": 200 }, { "entropy": 8.457213211059571, "epoch": 0.17619252256123766, "grad_norm": 0.93359375, "learning_rate": 0.000102, "loss": 7.8566, "mean_token_accuracy": 0.0781160645186901, "num_tokens": 373663.0, "step": 205 }, { "entropy": 8.381179523468017, "epoch": 0.18048990116029223, "grad_norm": 0.95703125, "learning_rate": 0.00010449999999999999, "loss": 7.8221, "mean_token_accuracy": 0.07758632972836495, "num_tokens": 382730.0, "step": 210 }, { "entropy": 8.390653896331788, "epoch": 0.1847872797593468, "grad_norm": 0.921875, "learning_rate": 0.000107, "loss": 7.8622, "mean_token_accuracy": 0.071787304058671, "num_tokens": 392676.0, "step": 215 }, { "entropy": 8.255177211761474, "epoch": 0.18908465835840138, "grad_norm": 1.1015625, "learning_rate": 0.0001095, "loss": 7.8473, "mean_token_accuracy": 0.08185218423604965, "num_tokens": 401050.0, "step": 220 }, { "entropy": 8.367721462249756, "epoch": 0.19338203695745596, "grad_norm": 0.796875, "learning_rate": 0.000112, "loss": 7.795, "mean_token_accuracy": 0.07991239950060844, "num_tokens": 410009.0, "step": 225 }, { "entropy": 8.268333339691162, "epoch": 0.19767941555651053, "grad_norm": 0.859375, "learning_rate": 0.0001145, "loss": 7.7757, "mean_token_accuracy": 0.08171008005738259, "num_tokens": 419302.0, "step": 230 }, { "entropy": 8.304029846191407, "epoch": 0.2019767941555651, "grad_norm": 0.984375, "learning_rate": 0.00011700000000000001, "loss": 7.6812, "mean_token_accuracy": 0.08820762410759926, "num_tokens": 427296.0, "step": 235 }, { "entropy": 8.16576337814331, "epoch": 0.20627417275461968, "grad_norm": 0.91796875, "learning_rate": 0.00011949999999999999, "loss": 7.8198, "mean_token_accuracy": 0.07870872803032399, "num_tokens": 436368.0, "step": 240 }, { "entropy": 8.189785575866699, "epoch": 0.21057155135367425, "grad_norm": 1.28125, "learning_rate": 0.000122, "loss": 7.7389, "mean_token_accuracy": 0.08551637679338456, "num_tokens": 445535.0, "step": 245 }, { "entropy": 8.265625381469727, "epoch": 0.21486892995272883, "grad_norm": 0.8671875, "learning_rate": 0.0001245, "loss": 7.7093, "mean_token_accuracy": 0.07919453792273998, "num_tokens": 454769.0, "step": 250 }, { "entropy": 8.1545090675354, "epoch": 0.2191663085517834, "grad_norm": 0.93359375, "learning_rate": 0.000127, "loss": 7.7315, "mean_token_accuracy": 0.0871740497648716, "num_tokens": 463975.0, "step": 255 }, { "entropy": 8.13952112197876, "epoch": 0.22346368715083798, "grad_norm": 0.88671875, "learning_rate": 0.0001295, "loss": 7.726, "mean_token_accuracy": 0.08799278363585472, "num_tokens": 472899.0, "step": 260 }, { "entropy": 8.196070003509522, "epoch": 0.22776106574989258, "grad_norm": 0.93359375, "learning_rate": 0.000132, "loss": 7.7354, "mean_token_accuracy": 0.08013860881328583, "num_tokens": 481556.0, "step": 265 }, { "entropy": 8.114658737182618, "epoch": 0.23205844434894715, "grad_norm": 0.91015625, "learning_rate": 0.00013450000000000002, "loss": 7.7023, "mean_token_accuracy": 0.0854449674487114, "num_tokens": 490253.0, "step": 270 }, { "entropy": 8.193334579467773, "epoch": 0.23635582294800173, "grad_norm": 1.09375, "learning_rate": 0.00013700000000000002, "loss": 7.7066, "mean_token_accuracy": 0.0806311085820198, "num_tokens": 498444.0, "step": 275 }, { "entropy": 8.104936504364014, "epoch": 0.2406532015470563, "grad_norm": 0.8046875, "learning_rate": 0.0001395, "loss": 7.6467, "mean_token_accuracy": 0.08675235286355018, "num_tokens": 508330.0, "step": 280 }, { "entropy": 8.113396596908569, "epoch": 0.24495058014611087, "grad_norm": 1.015625, "learning_rate": 0.00014199999999999998, "loss": 7.7405, "mean_token_accuracy": 0.08165572881698609, "num_tokens": 517900.0, "step": 285 }, { "entropy": 8.046846723556518, "epoch": 0.24924795874516545, "grad_norm": 0.93359375, "learning_rate": 0.0001445, "loss": 7.6901, "mean_token_accuracy": 0.08230286985635757, "num_tokens": 527808.0, "step": 290 }, { "entropy": 8.13338761329651, "epoch": 0.25354533734422, "grad_norm": 0.8984375, "learning_rate": 0.000147, "loss": 7.6711, "mean_token_accuracy": 0.08156475871801376, "num_tokens": 536931.0, "step": 295 }, { "entropy": 8.18837013244629, "epoch": 0.2578427159432746, "grad_norm": 1.1875, "learning_rate": 0.0001495, "loss": 7.7049, "mean_token_accuracy": 0.0835341140627861, "num_tokens": 545758.0, "step": 300 }, { "entropy": 8.025089168548584, "epoch": 0.26214009454232917, "grad_norm": 0.9921875, "learning_rate": 0.000152, "loss": 7.7131, "mean_token_accuracy": 0.08242038711905479, "num_tokens": 555165.0, "step": 305 }, { "entropy": 8.155539417266846, "epoch": 0.2664374731413838, "grad_norm": 0.86328125, "learning_rate": 0.00015450000000000001, "loss": 7.6144, "mean_token_accuracy": 0.08789716809988021, "num_tokens": 564719.0, "step": 310 }, { "entropy": 8.041153383255004, "epoch": 0.2707348517404383, "grad_norm": 1.0, "learning_rate": 0.000157, "loss": 7.594, "mean_token_accuracy": 0.09155945181846618, "num_tokens": 573572.0, "step": 315 }, { "entropy": 8.15259666442871, "epoch": 0.2750322303394929, "grad_norm": 1.0859375, "learning_rate": 0.0001595, "loss": 7.7634, "mean_token_accuracy": 0.08318910300731659, "num_tokens": 581497.0, "step": 320 }, { "entropy": 8.100253248214722, "epoch": 0.27932960893854747, "grad_norm": 1.125, "learning_rate": 0.000162, "loss": 7.6118, "mean_token_accuracy": 0.08767011985182763, "num_tokens": 591107.0, "step": 325 }, { "entropy": 7.984478855133057, "epoch": 0.28362698753760207, "grad_norm": 0.84765625, "learning_rate": 0.00016450000000000001, "loss": 7.6456, "mean_token_accuracy": 0.08353794142603874, "num_tokens": 600241.0, "step": 330 }, { "entropy": 8.057686376571656, "epoch": 0.2879243661366566, "grad_norm": 0.91796875, "learning_rate": 0.00016700000000000002, "loss": 7.5776, "mean_token_accuracy": 0.08751234114170074, "num_tokens": 608697.0, "step": 335 }, { "entropy": 8.016141748428344, "epoch": 0.2922217447357112, "grad_norm": 0.9453125, "learning_rate": 0.00016950000000000003, "loss": 7.568, "mean_token_accuracy": 0.09023259431123734, "num_tokens": 617275.0, "step": 340 }, { "entropy": 8.084819841384888, "epoch": 0.29651912333476577, "grad_norm": 0.8984375, "learning_rate": 0.00017199999999999998, "loss": 7.6405, "mean_token_accuracy": 0.08630914464592934, "num_tokens": 626644.0, "step": 345 }, { "entropy": 8.008595705032349, "epoch": 0.30081650193382037, "grad_norm": 0.98828125, "learning_rate": 0.00017449999999999999, "loss": 7.5665, "mean_token_accuracy": 0.08766811862587928, "num_tokens": 635110.0, "step": 350 }, { "entropy": 8.04712610244751, "epoch": 0.30511388053287497, "grad_norm": 0.87109375, "learning_rate": 0.000177, "loss": 7.7031, "mean_token_accuracy": 0.08570141717791557, "num_tokens": 644746.0, "step": 355 }, { "entropy": 8.179811954498291, "epoch": 0.3094112591319295, "grad_norm": 1.1015625, "learning_rate": 0.0001795, "loss": 7.5831, "mean_token_accuracy": 0.08595824986696243, "num_tokens": 654281.0, "step": 360 }, { "entropy": 7.987443113327027, "epoch": 0.3137086377309841, "grad_norm": 1.203125, "learning_rate": 0.000182, "loss": 7.585, "mean_token_accuracy": 0.09283285215497017, "num_tokens": 663174.0, "step": 365 }, { "entropy": 7.916810417175293, "epoch": 0.31800601633003867, "grad_norm": 0.90625, "learning_rate": 0.0001845, "loss": 7.511, "mean_token_accuracy": 0.08863886222243308, "num_tokens": 672178.0, "step": 370 }, { "entropy": 8.005489206314087, "epoch": 0.32230339492909327, "grad_norm": 0.96484375, "learning_rate": 0.000187, "loss": 7.5218, "mean_token_accuracy": 0.09131815880537034, "num_tokens": 681323.0, "step": 375 }, { "entropy": 7.9803643226623535, "epoch": 0.3266007735281478, "grad_norm": 0.890625, "learning_rate": 0.0001895, "loss": 7.4406, "mean_token_accuracy": 0.08985799476504326, "num_tokens": 690461.0, "step": 380 }, { "entropy": 7.829833698272705, "epoch": 0.3308981521272024, "grad_norm": 1.046875, "learning_rate": 0.000192, "loss": 7.5004, "mean_token_accuracy": 0.08490158319473266, "num_tokens": 699199.0, "step": 385 }, { "entropy": 8.038139152526856, "epoch": 0.33519553072625696, "grad_norm": 1.1484375, "learning_rate": 0.0001945, "loss": 7.4484, "mean_token_accuracy": 0.09670188426971435, "num_tokens": 707949.0, "step": 390 }, { "entropy": 7.9735198497772215, "epoch": 0.33949290932531156, "grad_norm": 1.203125, "learning_rate": 0.00019700000000000002, "loss": 7.5219, "mean_token_accuracy": 0.08999367579817771, "num_tokens": 715752.0, "step": 395 }, { "entropy": 7.93391604423523, "epoch": 0.3437902879243661, "grad_norm": 1.1171875, "learning_rate": 0.00019950000000000002, "loss": 7.4479, "mean_token_accuracy": 0.0979436494410038, "num_tokens": 724416.0, "step": 400 }, { "entropy": 7.925309085845948, "epoch": 0.3480876665234207, "grad_norm": 1.0546875, "learning_rate": 0.000202, "loss": 7.4953, "mean_token_accuracy": 0.09031900316476822, "num_tokens": 733116.0, "step": 405 }, { "entropy": 7.916099977493286, "epoch": 0.3523850451224753, "grad_norm": 1.0625, "learning_rate": 0.00020449999999999998, "loss": 7.4726, "mean_token_accuracy": 0.09227924942970275, "num_tokens": 742093.0, "step": 410 }, { "entropy": 7.918701934814453, "epoch": 0.35668242372152986, "grad_norm": 1.046875, "learning_rate": 0.000207, "loss": 7.4649, "mean_token_accuracy": 0.09618089124560356, "num_tokens": 750402.0, "step": 415 }, { "entropy": 7.816703271865845, "epoch": 0.36097980232058446, "grad_norm": 0.9140625, "learning_rate": 0.0002095, "loss": 7.4336, "mean_token_accuracy": 0.09461462944746017, "num_tokens": 760961.0, "step": 420 }, { "entropy": 7.944287586212158, "epoch": 0.365277180919639, "grad_norm": 1.0390625, "learning_rate": 0.000212, "loss": 7.4865, "mean_token_accuracy": 0.09455274268984795, "num_tokens": 770554.0, "step": 425 }, { "entropy": 7.750526332855225, "epoch": 0.3695745595186936, "grad_norm": 1.03125, "learning_rate": 0.0002145, "loss": 7.4618, "mean_token_accuracy": 0.09681151732802391, "num_tokens": 779172.0, "step": 430 }, { "entropy": 7.9787256717681885, "epoch": 0.37387193811774816, "grad_norm": 0.984375, "learning_rate": 0.00021700000000000002, "loss": 7.5123, "mean_token_accuracy": 0.08840151131153107, "num_tokens": 788040.0, "step": 435 }, { "entropy": 7.883750295639038, "epoch": 0.37816931671680276, "grad_norm": 1.109375, "learning_rate": 0.0002195, "loss": 7.4135, "mean_token_accuracy": 0.0939902700483799, "num_tokens": 796786.0, "step": 440 }, { "entropy": 7.851776885986328, "epoch": 0.3824666953158573, "grad_norm": 1.09375, "learning_rate": 0.000222, "loss": 7.4233, "mean_token_accuracy": 0.0923767201602459, "num_tokens": 805520.0, "step": 445 }, { "entropy": 7.805376100540161, "epoch": 0.3867640739149119, "grad_norm": 1.1484375, "learning_rate": 0.0002245, "loss": 7.3508, "mean_token_accuracy": 0.09647825658321381, "num_tokens": 814939.0, "step": 450 }, { "entropy": 7.874559307098389, "epoch": 0.39106145251396646, "grad_norm": 1.2265625, "learning_rate": 0.00022700000000000002, "loss": 7.3531, "mean_token_accuracy": 0.09795481041073799, "num_tokens": 823862.0, "step": 455 }, { "entropy": 7.7626677513122555, "epoch": 0.39535883111302106, "grad_norm": 1.1328125, "learning_rate": 0.00022950000000000002, "loss": 7.3918, "mean_token_accuracy": 0.09068166017532349, "num_tokens": 832820.0, "step": 460 }, { "entropy": 7.928297901153565, "epoch": 0.39965620971207566, "grad_norm": 1.1171875, "learning_rate": 0.00023200000000000003, "loss": 7.3494, "mean_token_accuracy": 0.09501236006617546, "num_tokens": 841538.0, "step": 465 }, { "entropy": 7.7496504306793215, "epoch": 0.4039535883111302, "grad_norm": 0.99609375, "learning_rate": 0.00023449999999999998, "loss": 7.4626, "mean_token_accuracy": 0.09104103595018387, "num_tokens": 851123.0, "step": 470 }, { "entropy": 7.8953351974487305, "epoch": 0.4082509669101848, "grad_norm": 1.125, "learning_rate": 0.000237, "loss": 7.4266, "mean_token_accuracy": 0.09596899375319481, "num_tokens": 860357.0, "step": 475 }, { "entropy": 7.76341495513916, "epoch": 0.41254834550923936, "grad_norm": 1.0703125, "learning_rate": 0.0002395, "loss": 7.3425, "mean_token_accuracy": 0.09861095696687698, "num_tokens": 869980.0, "step": 480 }, { "entropy": 7.82184157371521, "epoch": 0.41684572410829396, "grad_norm": 1.03125, "learning_rate": 0.000242, "loss": 7.2999, "mean_token_accuracy": 0.10065284445881843, "num_tokens": 878250.0, "step": 485 }, { "entropy": 7.76347074508667, "epoch": 0.4211431027073485, "grad_norm": 1.25, "learning_rate": 0.0002445, "loss": 7.4007, "mean_token_accuracy": 0.095355936139822, "num_tokens": 887624.0, "step": 490 }, { "entropy": 7.753844261169434, "epoch": 0.4254404813064031, "grad_norm": 1.1484375, "learning_rate": 0.000247, "loss": 7.3568, "mean_token_accuracy": 0.09853926301002502, "num_tokens": 897120.0, "step": 495 }, { "entropy": 7.802051830291748, "epoch": 0.42973785990545765, "grad_norm": 1.03125, "learning_rate": 0.0002495, "loss": 7.3179, "mean_token_accuracy": 0.10127250477671623, "num_tokens": 906215.0, "step": 500 }, { "epoch": 0.42973785990545765, "eval_entropy": 7.412716417699246, "eval_loss": 7.3790483474731445, "eval_mean_token_accuracy": 0.09986981684929347, "eval_num_tokens": 906215.0, "eval_runtime": 2.0966, "eval_samples_per_second": 1692.736, "eval_steps_per_second": 211.771, "step": 500 }, { "entropy": 7.651102495193482, "epoch": 0.43403523850451226, "grad_norm": 1.09375, "learning_rate": 0.000252, "loss": 7.3112, "mean_token_accuracy": 0.10008608102798462, "num_tokens": 915181.0, "step": 505 }, { "entropy": 7.728409194946289, "epoch": 0.4383326171035668, "grad_norm": 1.0703125, "learning_rate": 0.0002545, "loss": 7.3388, "mean_token_accuracy": 0.09651862978935241, "num_tokens": 924377.0, "step": 510 }, { "entropy": 7.770003318786621, "epoch": 0.4426299957026214, "grad_norm": 0.984375, "learning_rate": 0.000257, "loss": 7.4098, "mean_token_accuracy": 0.09438847750425339, "num_tokens": 933114.0, "step": 515 }, { "entropy": 7.86782751083374, "epoch": 0.44692737430167595, "grad_norm": 0.9375, "learning_rate": 0.0002595, "loss": 7.3692, "mean_token_accuracy": 0.09444344118237495, "num_tokens": 943306.0, "step": 520 }, { "entropy": 7.659075498580933, "epoch": 0.45122475290073055, "grad_norm": 1.1875, "learning_rate": 0.000262, "loss": 7.2626, "mean_token_accuracy": 0.10587219074368477, "num_tokens": 951515.0, "step": 525 }, { "entropy": 7.713227224349976, "epoch": 0.45552213149978515, "grad_norm": 1.015625, "learning_rate": 0.00026450000000000003, "loss": 7.3711, "mean_token_accuracy": 0.09387057200074196, "num_tokens": 962686.0, "step": 530 }, { "entropy": 7.780395078659057, "epoch": 0.4598195100988397, "grad_norm": 1.09375, "learning_rate": 0.00026700000000000004, "loss": 7.3777, "mean_token_accuracy": 0.10021266266703606, "num_tokens": 972136.0, "step": 535 }, { "entropy": 7.657458114624023, "epoch": 0.4641168886978943, "grad_norm": 1.09375, "learning_rate": 0.00026950000000000005, "loss": 7.2696, "mean_token_accuracy": 0.10345774069428444, "num_tokens": 981301.0, "step": 540 }, { "entropy": 7.700049114227295, "epoch": 0.46841426729694885, "grad_norm": 1.1484375, "learning_rate": 0.00027200000000000005, "loss": 7.2923, "mean_token_accuracy": 0.10189392492175102, "num_tokens": 990360.0, "step": 545 }, { "entropy": 7.770557546615601, "epoch": 0.47271164589600345, "grad_norm": 1.0859375, "learning_rate": 0.0002745, "loss": 7.3438, "mean_token_accuracy": 0.09953725263476372, "num_tokens": 999415.0, "step": 550 }, { "entropy": 7.656623125076294, "epoch": 0.477009024495058, "grad_norm": 1.0625, "learning_rate": 0.000277, "loss": 7.2635, "mean_token_accuracy": 0.10239741951227188, "num_tokens": 1008762.0, "step": 555 }, { "entropy": 7.690563821792603, "epoch": 0.4813064030941126, "grad_norm": 1.171875, "learning_rate": 0.0002795, "loss": 7.2652, "mean_token_accuracy": 0.10631422251462937, "num_tokens": 1017704.0, "step": 560 }, { "entropy": 7.641897583007813, "epoch": 0.48560378169316715, "grad_norm": 1.1640625, "learning_rate": 0.00028199999999999997, "loss": 7.2341, "mean_token_accuracy": 0.10428761765360832, "num_tokens": 1026251.0, "step": 565 }, { "entropy": 7.641419315338135, "epoch": 0.48990116029222175, "grad_norm": 1.03125, "learning_rate": 0.0002845, "loss": 7.2158, "mean_token_accuracy": 0.10731100514531136, "num_tokens": 1036191.0, "step": 570 }, { "entropy": 7.658735990524292, "epoch": 0.4941985388912763, "grad_norm": 1.0859375, "learning_rate": 0.000287, "loss": 7.2462, "mean_token_accuracy": 0.10594421103596688, "num_tokens": 1044936.0, "step": 575 }, { "entropy": 7.621677112579346, "epoch": 0.4984959174903309, "grad_norm": 1.1171875, "learning_rate": 0.0002895, "loss": 7.2472, "mean_token_accuracy": 0.10367096737027168, "num_tokens": 1053683.0, "step": 580 }, { "entropy": 7.570435047149658, "epoch": 0.5027932960893855, "grad_norm": 1.046875, "learning_rate": 0.000292, "loss": 7.2271, "mean_token_accuracy": 0.1076263040304184, "num_tokens": 1062932.0, "step": 585 }, { "entropy": 7.723283386230468, "epoch": 0.50709067468844, "grad_norm": 0.98828125, "learning_rate": 0.0002945, "loss": 7.2544, "mean_token_accuracy": 0.10264097228646278, "num_tokens": 1072313.0, "step": 590 }, { "entropy": 7.62511043548584, "epoch": 0.5113880532874946, "grad_norm": 1.171875, "learning_rate": 0.000297, "loss": 7.2228, "mean_token_accuracy": 0.09801378548145294, "num_tokens": 1081675.0, "step": 595 }, { "entropy": 7.608328151702881, "epoch": 0.5156854318865493, "grad_norm": 1.0703125, "learning_rate": 0.0002995, "loss": 7.2433, "mean_token_accuracy": 0.10141062065958976, "num_tokens": 1091541.0, "step": 600 }, { "entropy": 7.695394897460938, "epoch": 0.5199828104856038, "grad_norm": 1.015625, "learning_rate": 0.000302, "loss": 7.2462, "mean_token_accuracy": 0.10475782826542854, "num_tokens": 1100724.0, "step": 605 }, { "entropy": 7.50453405380249, "epoch": 0.5242801890846583, "grad_norm": 1.0546875, "learning_rate": 0.0003045, "loss": 7.1924, "mean_token_accuracy": 0.1077597513794899, "num_tokens": 1108869.0, "step": 610 }, { "entropy": 7.644835519790649, "epoch": 0.5285775676837129, "grad_norm": 1.1015625, "learning_rate": 0.000307, "loss": 7.2261, "mean_token_accuracy": 0.10431057810783387, "num_tokens": 1117314.0, "step": 615 }, { "entropy": 7.488267469406128, "epoch": 0.5328749462827675, "grad_norm": 1.109375, "learning_rate": 0.0003095, "loss": 7.148, "mean_token_accuracy": 0.10711429193615914, "num_tokens": 1126786.0, "step": 620 }, { "entropy": 7.577956056594848, "epoch": 0.5371723248818221, "grad_norm": 1.3046875, "learning_rate": 0.000312, "loss": 7.1645, "mean_token_accuracy": 0.10579404905438423, "num_tokens": 1136013.0, "step": 625 }, { "entropy": 7.527575206756592, "epoch": 0.5414697034808766, "grad_norm": 1.109375, "learning_rate": 0.0003145, "loss": 7.1969, "mean_token_accuracy": 0.10749110653996467, "num_tokens": 1144970.0, "step": 630 }, { "entropy": 7.613465976715088, "epoch": 0.5457670820799312, "grad_norm": 1.2578125, "learning_rate": 0.000317, "loss": 7.1614, "mean_token_accuracy": 0.11203600242733955, "num_tokens": 1153810.0, "step": 635 }, { "entropy": 7.521342611312866, "epoch": 0.5500644606789858, "grad_norm": 1.0546875, "learning_rate": 0.0003195, "loss": 7.1408, "mean_token_accuracy": 0.10991051346063614, "num_tokens": 1162498.0, "step": 640 }, { "entropy": 7.5313867092132565, "epoch": 0.5543618392780404, "grad_norm": 1.0546875, "learning_rate": 0.000322, "loss": 7.2164, "mean_token_accuracy": 0.1044546626508236, "num_tokens": 1172091.0, "step": 645 }, { "entropy": 7.653256607055664, "epoch": 0.5586592178770949, "grad_norm": 1.1015625, "learning_rate": 0.00032450000000000003, "loss": 7.1977, "mean_token_accuracy": 0.10631284043192864, "num_tokens": 1181400.0, "step": 650 }, { "entropy": 7.537307643890381, "epoch": 0.5629565964761496, "grad_norm": 1.2890625, "learning_rate": 0.00032700000000000003, "loss": 7.1721, "mean_token_accuracy": 0.11125476211309433, "num_tokens": 1189780.0, "step": 655 }, { "entropy": 7.477937269210815, "epoch": 0.5672539750752041, "grad_norm": 1.1875, "learning_rate": 0.00032950000000000004, "loss": 7.1315, "mean_token_accuracy": 0.1057468131184578, "num_tokens": 1198671.0, "step": 660 }, { "entropy": 7.589753818511963, "epoch": 0.5715513536742587, "grad_norm": 1.09375, "learning_rate": 0.00033200000000000005, "loss": 7.1652, "mean_token_accuracy": 0.1051194004714489, "num_tokens": 1207173.0, "step": 665 }, { "entropy": 7.461796855926513, "epoch": 0.5758487322733132, "grad_norm": 1.21875, "learning_rate": 0.00033450000000000005, "loss": 7.0998, "mean_token_accuracy": 0.11046240702271462, "num_tokens": 1216387.0, "step": 670 }, { "entropy": 7.622633552551269, "epoch": 0.5801461108723679, "grad_norm": 1.0234375, "learning_rate": 0.000337, "loss": 7.0722, "mean_token_accuracy": 0.11004948541522026, "num_tokens": 1224461.0, "step": 675 }, { "entropy": 7.451505851745606, "epoch": 0.5844434894714224, "grad_norm": 1.1796875, "learning_rate": 0.0003395, "loss": 7.1414, "mean_token_accuracy": 0.11011224165558815, "num_tokens": 1233774.0, "step": 680 }, { "entropy": 7.457524538040161, "epoch": 0.588740868070477, "grad_norm": 1.2109375, "learning_rate": 0.000342, "loss": 7.0938, "mean_token_accuracy": 0.1142980344593525, "num_tokens": 1242812.0, "step": 685 }, { "entropy": 7.605640840530396, "epoch": 0.5930382466695315, "grad_norm": 1.03125, "learning_rate": 0.00034449999999999997, "loss": 7.191, "mean_token_accuracy": 0.11035142987966537, "num_tokens": 1252872.0, "step": 690 }, { "entropy": 7.307473850250244, "epoch": 0.5973356252685862, "grad_norm": 1.1796875, "learning_rate": 0.000347, "loss": 6.983, "mean_token_accuracy": 0.11081922426819801, "num_tokens": 1260852.0, "step": 695 }, { "entropy": 7.438599157333374, "epoch": 0.6016330038676407, "grad_norm": 1.2578125, "learning_rate": 0.0003495, "loss": 7.0984, "mean_token_accuracy": 0.10763570070266723, "num_tokens": 1268925.0, "step": 700 }, { "entropy": 7.530004072189331, "epoch": 0.6059303824666953, "grad_norm": 1.109375, "learning_rate": 0.000352, "loss": 7.145, "mean_token_accuracy": 0.10653513446450233, "num_tokens": 1278994.0, "step": 705 }, { "entropy": 7.4260091304779055, "epoch": 0.6102277610657499, "grad_norm": 1.1640625, "learning_rate": 0.0003545, "loss": 7.1323, "mean_token_accuracy": 0.10368426591157913, "num_tokens": 1287698.0, "step": 710 }, { "entropy": 7.482218551635742, "epoch": 0.6145251396648045, "grad_norm": 1.0546875, "learning_rate": 0.000357, "loss": 7.0787, "mean_token_accuracy": 0.11120296269655228, "num_tokens": 1297475.0, "step": 715 }, { "entropy": 7.480340671539307, "epoch": 0.618822518263859, "grad_norm": 1.1328125, "learning_rate": 0.0003595, "loss": 7.1091, "mean_token_accuracy": 0.11085583940148354, "num_tokens": 1306836.0, "step": 720 }, { "entropy": 7.506947946548462, "epoch": 0.6231198968629136, "grad_norm": 1.03125, "learning_rate": 0.000362, "loss": 7.1377, "mean_token_accuracy": 0.10435779988765717, "num_tokens": 1315872.0, "step": 725 }, { "entropy": 7.4788847923278805, "epoch": 0.6274172754619682, "grad_norm": 1.1796875, "learning_rate": 0.0003645, "loss": 7.0782, "mean_token_accuracy": 0.11685637310147286, "num_tokens": 1324624.0, "step": 730 }, { "entropy": 7.444537830352783, "epoch": 0.6317146540610228, "grad_norm": 1.15625, "learning_rate": 0.000367, "loss": 7.061, "mean_token_accuracy": 0.11548577472567559, "num_tokens": 1333058.0, "step": 735 }, { "entropy": 7.262284660339356, "epoch": 0.6360120326600773, "grad_norm": 1.078125, "learning_rate": 0.0003695, "loss": 7.0248, "mean_token_accuracy": 0.11004846841096878, "num_tokens": 1342376.0, "step": 740 }, { "entropy": 7.526681852340698, "epoch": 0.6403094112591319, "grad_norm": 1.1484375, "learning_rate": 0.000372, "loss": 7.0693, "mean_token_accuracy": 0.10503109246492386, "num_tokens": 1351386.0, "step": 745 }, { "entropy": 7.364239978790283, "epoch": 0.6446067898581865, "grad_norm": 1.265625, "learning_rate": 0.0003745, "loss": 6.9832, "mean_token_accuracy": 0.11761592403054237, "num_tokens": 1358958.0, "step": 750 }, { "entropy": 7.496349859237671, "epoch": 0.6489041684572411, "grad_norm": 1.109375, "learning_rate": 0.000377, "loss": 7.1231, "mean_token_accuracy": 0.10967899858951569, "num_tokens": 1368599.0, "step": 755 }, { "entropy": 7.435608530044556, "epoch": 0.6532015470562956, "grad_norm": 1.890625, "learning_rate": 0.0003795, "loss": 7.1433, "mean_token_accuracy": 0.1064300425350666, "num_tokens": 1378529.0, "step": 760 }, { "entropy": 7.344243001937866, "epoch": 0.6574989256553503, "grad_norm": 1.25, "learning_rate": 0.000382, "loss": 6.9306, "mean_token_accuracy": 0.11750481277704239, "num_tokens": 1386993.0, "step": 765 }, { "entropy": 7.390715217590332, "epoch": 0.6617963042544048, "grad_norm": 1.5, "learning_rate": 0.0003845, "loss": 7.0322, "mean_token_accuracy": 0.11829963177442551, "num_tokens": 1395790.0, "step": 770 }, { "entropy": 7.302670812606811, "epoch": 0.6660936828534594, "grad_norm": 1.078125, "learning_rate": 0.00038700000000000003, "loss": 7.0393, "mean_token_accuracy": 0.11235549300909042, "num_tokens": 1405587.0, "step": 775 }, { "entropy": 7.348860168457032, "epoch": 0.6703910614525139, "grad_norm": 1.0390625, "learning_rate": 0.00038950000000000003, "loss": 6.9999, "mean_token_accuracy": 0.11504087448120118, "num_tokens": 1414478.0, "step": 780 }, { "entropy": 7.428205347061157, "epoch": 0.6746884400515686, "grad_norm": 1.375, "learning_rate": 0.00039200000000000004, "loss": 7.0623, "mean_token_accuracy": 0.11534775421023369, "num_tokens": 1423791.0, "step": 785 }, { "entropy": 7.467832851409912, "epoch": 0.6789858186506231, "grad_norm": 1.234375, "learning_rate": 0.00039450000000000005, "loss": 7.1014, "mean_token_accuracy": 0.10728210881352425, "num_tokens": 1432955.0, "step": 790 }, { "entropy": 7.385548782348633, "epoch": 0.6832831972496777, "grad_norm": 0.99609375, "learning_rate": 0.00039700000000000005, "loss": 7.074, "mean_token_accuracy": 0.1087567687034607, "num_tokens": 1441907.0, "step": 795 }, { "entropy": 7.290066146850586, "epoch": 0.6875805758487322, "grad_norm": 1.203125, "learning_rate": 0.0003995, "loss": 6.935, "mean_token_accuracy": 0.11768098697066307, "num_tokens": 1451062.0, "step": 800 }, { "entropy": 7.399672508239746, "epoch": 0.6918779544477869, "grad_norm": 1.0234375, "learning_rate": 0.000402, "loss": 7.0218, "mean_token_accuracy": 0.10959179401397705, "num_tokens": 1460132.0, "step": 805 }, { "entropy": 7.272280263900757, "epoch": 0.6961753330468414, "grad_norm": 1.0625, "learning_rate": 0.0004045, "loss": 6.9141, "mean_token_accuracy": 0.11885375007987023, "num_tokens": 1469582.0, "step": 810 }, { "entropy": 7.255832242965698, "epoch": 0.700472711645896, "grad_norm": 1.3515625, "learning_rate": 0.00040699999999999997, "loss": 7.012, "mean_token_accuracy": 0.10950389429926873, "num_tokens": 1479053.0, "step": 815 }, { "entropy": 7.313858604431152, "epoch": 0.7047700902449506, "grad_norm": 1.21875, "learning_rate": 0.0004095, "loss": 7.0142, "mean_token_accuracy": 0.11343196108937263, "num_tokens": 1488189.0, "step": 820 }, { "entropy": 7.236453676223755, "epoch": 0.7090674688440052, "grad_norm": 1.046875, "learning_rate": 0.000412, "loss": 6.8662, "mean_token_accuracy": 0.12046442031860352, "num_tokens": 1497324.0, "step": 825 }, { "entropy": 7.310264635086059, "epoch": 0.7133648474430597, "grad_norm": 1.015625, "learning_rate": 0.0004145, "loss": 6.9814, "mean_token_accuracy": 0.11739002540707588, "num_tokens": 1506543.0, "step": 830 }, { "entropy": 7.289929437637329, "epoch": 0.7176622260421143, "grad_norm": 1.109375, "learning_rate": 0.000417, "loss": 6.9742, "mean_token_accuracy": 0.12236066460609436, "num_tokens": 1516737.0, "step": 835 }, { "entropy": 7.161224508285523, "epoch": 0.7219596046411689, "grad_norm": 1.046875, "learning_rate": 0.0004195, "loss": 6.8503, "mean_token_accuracy": 0.11500222384929656, "num_tokens": 1525561.0, "step": 840 }, { "entropy": 7.280500030517578, "epoch": 0.7262569832402235, "grad_norm": 1.1328125, "learning_rate": 0.000422, "loss": 6.8765, "mean_token_accuracy": 0.1242159940302372, "num_tokens": 1533323.0, "step": 845 }, { "entropy": 7.292038059234619, "epoch": 0.730554361839278, "grad_norm": 1.1875, "learning_rate": 0.0004245, "loss": 6.9379, "mean_token_accuracy": 0.12142991349101066, "num_tokens": 1542632.0, "step": 850 }, { "entropy": 7.305912923812866, "epoch": 0.7348517404383326, "grad_norm": 1.265625, "learning_rate": 0.000427, "loss": 6.8775, "mean_token_accuracy": 0.12107516825199127, "num_tokens": 1551236.0, "step": 855 }, { "entropy": 7.118098545074463, "epoch": 0.7391491190373872, "grad_norm": 1.15625, "learning_rate": 0.0004295, "loss": 6.878, "mean_token_accuracy": 0.12266490310430526, "num_tokens": 1559674.0, "step": 860 }, { "entropy": 7.268103885650635, "epoch": 0.7434464976364418, "grad_norm": 1.09375, "learning_rate": 0.000432, "loss": 6.9687, "mean_token_accuracy": 0.1217973381280899, "num_tokens": 1569481.0, "step": 865 }, { "entropy": 7.2675707817077635, "epoch": 0.7477438762354963, "grad_norm": 1.0859375, "learning_rate": 0.0004345, "loss": 6.9975, "mean_token_accuracy": 0.11359266638755798, "num_tokens": 1578488.0, "step": 870 }, { "entropy": 7.171451759338379, "epoch": 0.752041254834551, "grad_norm": 1.0625, "learning_rate": 0.000437, "loss": 6.8946, "mean_token_accuracy": 0.11810402423143387, "num_tokens": 1586675.0, "step": 875 }, { "entropy": 7.285072469711304, "epoch": 0.7563386334336055, "grad_norm": 1.0859375, "learning_rate": 0.0004395, "loss": 7.0021, "mean_token_accuracy": 0.10800698548555374, "num_tokens": 1595411.0, "step": 880 }, { "entropy": 7.312672233581543, "epoch": 0.7606360120326601, "grad_norm": 1.1953125, "learning_rate": 0.000442, "loss": 6.9755, "mean_token_accuracy": 0.11759781166911125, "num_tokens": 1604046.0, "step": 885 }, { "entropy": 7.245748281478882, "epoch": 0.7649333906317146, "grad_norm": 1.0859375, "learning_rate": 0.0004445, "loss": 6.9643, "mean_token_accuracy": 0.11201045587658882, "num_tokens": 1613759.0, "step": 890 }, { "entropy": 7.238279533386231, "epoch": 0.7692307692307693, "grad_norm": 1.015625, "learning_rate": 0.000447, "loss": 6.9209, "mean_token_accuracy": 0.11877147182822227, "num_tokens": 1623323.0, "step": 895 }, { "entropy": 7.230697107315064, "epoch": 0.7735281478298238, "grad_norm": 1.1328125, "learning_rate": 0.00044950000000000003, "loss": 6.9005, "mean_token_accuracy": 0.11391794160008431, "num_tokens": 1631727.0, "step": 900 }, { "entropy": 7.194222545623779, "epoch": 0.7778255264288784, "grad_norm": 1.1875, "learning_rate": 0.00045200000000000004, "loss": 6.8583, "mean_token_accuracy": 0.12049278989434242, "num_tokens": 1639544.0, "step": 905 }, { "entropy": 7.284112405776978, "epoch": 0.7821229050279329, "grad_norm": 1.125, "learning_rate": 0.00045450000000000004, "loss": 6.9773, "mean_token_accuracy": 0.11113567724823951, "num_tokens": 1648931.0, "step": 910 }, { "entropy": 7.1627342224121096, "epoch": 0.7864202836269876, "grad_norm": 1.15625, "learning_rate": 0.00045700000000000005, "loss": 6.8345, "mean_token_accuracy": 0.12127922549843788, "num_tokens": 1657688.0, "step": 915 }, { "entropy": 7.259271335601807, "epoch": 0.7907176622260421, "grad_norm": 1.0390625, "learning_rate": 0.00045950000000000006, "loss": 6.9244, "mean_token_accuracy": 0.11565326899290085, "num_tokens": 1666879.0, "step": 920 }, { "entropy": 7.1275458335876465, "epoch": 0.7950150408250967, "grad_norm": 1.109375, "learning_rate": 0.000462, "loss": 6.8982, "mean_token_accuracy": 0.118662890791893, "num_tokens": 1676773.0, "step": 925 }, { "entropy": 7.2360998630523685, "epoch": 0.7993124194241513, "grad_norm": 1.0859375, "learning_rate": 0.0004645, "loss": 7.0092, "mean_token_accuracy": 0.11184348464012146, "num_tokens": 1686144.0, "step": 930 }, { "entropy": 7.26247010231018, "epoch": 0.8036097980232059, "grad_norm": 1.078125, "learning_rate": 0.000467, "loss": 6.9646, "mean_token_accuracy": 0.10949353277683258, "num_tokens": 1695476.0, "step": 935 }, { "entropy": 7.174946022033692, "epoch": 0.8079071766222604, "grad_norm": 1.046875, "learning_rate": 0.0004695, "loss": 6.8498, "mean_token_accuracy": 0.12084392830729485, "num_tokens": 1704907.0, "step": 940 }, { "entropy": 7.166734504699707, "epoch": 0.812204555221315, "grad_norm": 0.9609375, "learning_rate": 0.000472, "loss": 6.8948, "mean_token_accuracy": 0.12091493904590607, "num_tokens": 1714564.0, "step": 945 }, { "entropy": 7.244975614547729, "epoch": 0.8165019338203696, "grad_norm": 1.1171875, "learning_rate": 0.0004745, "loss": 6.9209, "mean_token_accuracy": 0.1155279442667961, "num_tokens": 1725285.0, "step": 950 }, { "entropy": 7.1149109363555905, "epoch": 0.8207993124194242, "grad_norm": 1.03125, "learning_rate": 0.000477, "loss": 6.9153, "mean_token_accuracy": 0.11715079098939896, "num_tokens": 1734331.0, "step": 955 }, { "entropy": 7.227117824554443, "epoch": 0.8250966910184787, "grad_norm": 1.2578125, "learning_rate": 0.0004795, "loss": 6.852, "mean_token_accuracy": 0.11185217499732972, "num_tokens": 1742340.0, "step": 960 }, { "entropy": 7.160442066192627, "epoch": 0.8293940696175333, "grad_norm": 1.109375, "learning_rate": 0.000482, "loss": 6.8351, "mean_token_accuracy": 0.12198592498898506, "num_tokens": 1751725.0, "step": 965 }, { "entropy": 6.999344539642334, "epoch": 0.8336914482165879, "grad_norm": 1.1328125, "learning_rate": 0.0004845, "loss": 6.7683, "mean_token_accuracy": 0.12398558706045151, "num_tokens": 1760294.0, "step": 970 }, { "entropy": 7.112461137771606, "epoch": 0.8379888268156425, "grad_norm": 1.0546875, "learning_rate": 0.000487, "loss": 6.8275, "mean_token_accuracy": 0.11639805063605309, "num_tokens": 1768912.0, "step": 975 }, { "entropy": 7.257990169525146, "epoch": 0.842286205414697, "grad_norm": 1.0390625, "learning_rate": 0.0004895, "loss": 7.0148, "mean_token_accuracy": 0.12016609534621239, "num_tokens": 1778633.0, "step": 980 }, { "entropy": 7.1191816329956055, "epoch": 0.8465835840137517, "grad_norm": 1.1171875, "learning_rate": 0.000492, "loss": 6.8847, "mean_token_accuracy": 0.11811531409621238, "num_tokens": 1787275.0, "step": 985 }, { "entropy": 7.235857200622559, "epoch": 0.8508809626128062, "grad_norm": 1.2578125, "learning_rate": 0.0004945, "loss": 6.8878, "mean_token_accuracy": 0.11604067236185074, "num_tokens": 1795994.0, "step": 990 }, { "entropy": 7.036646842956543, "epoch": 0.8551783412118608, "grad_norm": 0.8359375, "learning_rate": 0.000497, "loss": 6.804, "mean_token_accuracy": 0.11985133662819862, "num_tokens": 1806379.0, "step": 995 }, { "entropy": 7.154667520523072, "epoch": 0.8594757198109153, "grad_norm": 1.0546875, "learning_rate": 0.0004995, "loss": 6.8296, "mean_token_accuracy": 0.1270947828888893, "num_tokens": 1816135.0, "step": 1000 }, { "epoch": 0.8594757198109153, "eval_entropy": 6.812919497489929, "eval_loss": 6.8574419021606445, "eval_mean_token_accuracy": 0.12292942362795542, "eval_num_tokens": 1816135.0, "eval_runtime": 2.0522, "eval_samples_per_second": 1729.37, "eval_steps_per_second": 216.354, "step": 1000 }, { "entropy": 7.122643280029297, "epoch": 0.86377309840997, "grad_norm": 1.2734375, "learning_rate": 0.0004999998427807679, "loss": 6.8305, "mean_token_accuracy": 0.12133256047964096, "num_tokens": 1824777.0, "step": 1005 }, { "entropy": 7.058982563018799, "epoch": 0.8680704770090245, "grad_norm": 1.234375, "learning_rate": 0.0004999992040780138, "loss": 6.8924, "mean_token_accuracy": 0.12320492565631866, "num_tokens": 1833807.0, "step": 1010 }, { "entropy": 7.185050773620605, "epoch": 0.8723678556080791, "grad_norm": 1.0078125, "learning_rate": 0.0004999980740669294, "loss": 6.8357, "mean_token_accuracy": 0.11969011649489403, "num_tokens": 1843375.0, "step": 1015 }, { "entropy": 7.11086139678955, "epoch": 0.8766652342071336, "grad_norm": 1.140625, "learning_rate": 0.0004999964527499823, "loss": 6.9058, "mean_token_accuracy": 0.11237111985683441, "num_tokens": 1853036.0, "step": 1020 }, { "entropy": 7.120519638061523, "epoch": 0.8809626128061883, "grad_norm": 1.0703125, "learning_rate": 0.0004999943401307127, "loss": 6.8707, "mean_token_accuracy": 0.11769452393054962, "num_tokens": 1862041.0, "step": 1025 }, { "entropy": 7.087871503829956, "epoch": 0.8852599914052428, "grad_norm": 1.1015625, "learning_rate": 0.0004999917362137337, "loss": 6.7742, "mean_token_accuracy": 0.1225271351635456, "num_tokens": 1870707.0, "step": 1030 }, { "entropy": 7.055140686035156, "epoch": 0.8895573700042974, "grad_norm": 1.078125, "learning_rate": 0.0004999886410047312, "loss": 6.7705, "mean_token_accuracy": 0.11845692843198777, "num_tokens": 1879787.0, "step": 1035 }, { "entropy": 7.138674926757813, "epoch": 0.8938547486033519, "grad_norm": 0.98828125, "learning_rate": 0.0004999850545104638, "loss": 6.8315, "mean_token_accuracy": 0.1223653219640255, "num_tokens": 1889413.0, "step": 1040 }, { "entropy": 7.048402404785156, "epoch": 0.8981521272024066, "grad_norm": 1.171875, "learning_rate": 0.0004999809767387633, "loss": 6.8174, "mean_token_accuracy": 0.12110616937279702, "num_tokens": 1898283.0, "step": 1045 }, { "entropy": 7.144178056716919, "epoch": 0.9024495058014611, "grad_norm": 1.0546875, "learning_rate": 0.0004999764076985337, "loss": 6.8287, "mean_token_accuracy": 0.12670400962233544, "num_tokens": 1907175.0, "step": 1050 }, { "entropy": 6.988327312469482, "epoch": 0.9067468844005157, "grad_norm": 1.09375, "learning_rate": 0.0004999713473997519, "loss": 6.8824, "mean_token_accuracy": 0.11774980947375298, "num_tokens": 1918223.0, "step": 1055 }, { "entropy": 7.124748563766479, "epoch": 0.9110442629995703, "grad_norm": 1.09375, "learning_rate": 0.0004999657958534677, "loss": 6.8312, "mean_token_accuracy": 0.1194000355899334, "num_tokens": 1928801.0, "step": 1060 }, { "entropy": 7.008511686325074, "epoch": 0.9153416415986249, "grad_norm": 1.1328125, "learning_rate": 0.0004999597530718034, "loss": 6.7896, "mean_token_accuracy": 0.12186847031116485, "num_tokens": 1937406.0, "step": 1065 }, { "entropy": 6.997484445571899, "epoch": 0.9196390201976794, "grad_norm": 1.078125, "learning_rate": 0.000499953219067954, "loss": 6.7932, "mean_token_accuracy": 0.11857569143176079, "num_tokens": 1947184.0, "step": 1070 }, { "entropy": 7.135808944702148, "epoch": 0.923936398796734, "grad_norm": 1.09375, "learning_rate": 0.0004999461938561873, "loss": 6.8139, "mean_token_accuracy": 0.12288291603326798, "num_tokens": 1956293.0, "step": 1075 }, { "entropy": 7.027012157440185, "epoch": 0.9282337773957886, "grad_norm": 1.1328125, "learning_rate": 0.0004999386774518432, "loss": 6.7854, "mean_token_accuracy": 0.11997194737195968, "num_tokens": 1964791.0, "step": 1080 }, { "entropy": 6.975531768798828, "epoch": 0.9325311559948432, "grad_norm": 1.0703125, "learning_rate": 0.0004999306698713349, "loss": 6.7088, "mean_token_accuracy": 0.12559010088443756, "num_tokens": 1973754.0, "step": 1085 }, { "entropy": 7.052453565597534, "epoch": 0.9368285345938977, "grad_norm": 1.078125, "learning_rate": 0.0004999221711321477, "loss": 6.7738, "mean_token_accuracy": 0.12475829720497131, "num_tokens": 1983035.0, "step": 1090 }, { "entropy": 6.906819009780884, "epoch": 0.9411259131929522, "grad_norm": 1.0703125, "learning_rate": 0.0004999131812528393, "loss": 6.8003, "mean_token_accuracy": 0.12229804769158363, "num_tokens": 1992584.0, "step": 1095 }, { "entropy": 7.109902429580688, "epoch": 0.9454232917920069, "grad_norm": 0.97265625, "learning_rate": 0.00049990370025304, "loss": 6.8193, "mean_token_accuracy": 0.12188051193952561, "num_tokens": 2001876.0, "step": 1100 }, { "entropy": 7.017454195022583, "epoch": 0.9497206703910615, "grad_norm": 0.97265625, "learning_rate": 0.0004998937281534526, "loss": 6.7115, "mean_token_accuracy": 0.1300358146429062, "num_tokens": 2011067.0, "step": 1105 }, { "entropy": 7.091220808029175, "epoch": 0.954018048990116, "grad_norm": 1.09375, "learning_rate": 0.0004998832649758521, "loss": 6.8077, "mean_token_accuracy": 0.12548175528645517, "num_tokens": 2020763.0, "step": 1110 }, { "entropy": 6.9685986042022705, "epoch": 0.9583154275891707, "grad_norm": 1.1796875, "learning_rate": 0.0004998723107430862, "loss": 6.7867, "mean_token_accuracy": 0.12391732335090637, "num_tokens": 2029534.0, "step": 1115 }, { "entropy": 7.046098041534424, "epoch": 0.9626128061882252, "grad_norm": 1.09375, "learning_rate": 0.0004998608654790741, "loss": 6.7311, "mean_token_accuracy": 0.12396327033638954, "num_tokens": 2039143.0, "step": 1120 }, { "entropy": 6.939239406585694, "epoch": 0.9669101847872797, "grad_norm": 1.125, "learning_rate": 0.000499848929208808, "loss": 6.7022, "mean_token_accuracy": 0.1295892022550106, "num_tokens": 2048253.0, "step": 1125 }, { "entropy": 6.931437301635742, "epoch": 0.9712075633863343, "grad_norm": 1.1484375, "learning_rate": 0.0004998365019583519, "loss": 6.7428, "mean_token_accuracy": 0.13122318536043168, "num_tokens": 2057234.0, "step": 1130 }, { "entropy": 7.081391954421997, "epoch": 0.975504941985389, "grad_norm": 1.1953125, "learning_rate": 0.0004998235837548417, "loss": 6.7881, "mean_token_accuracy": 0.1271953523159027, "num_tokens": 2065431.0, "step": 1135 }, { "entropy": 6.974546146392822, "epoch": 0.9798023205844435, "grad_norm": 1.0625, "learning_rate": 0.000499810174626486, "loss": 6.7888, "mean_token_accuracy": 0.1228917419910431, "num_tokens": 2074723.0, "step": 1140 }, { "entropy": 7.011039209365845, "epoch": 0.984099699183498, "grad_norm": 1.1953125, "learning_rate": 0.0004997962746025646, "loss": 6.6544, "mean_token_accuracy": 0.13169871941208838, "num_tokens": 2084509.0, "step": 1145 }, { "entropy": 6.973200798034668, "epoch": 0.9883970777825526, "grad_norm": 1.21875, "learning_rate": 0.0004997818837134298, "loss": 6.8028, "mean_token_accuracy": 0.12382483929395675, "num_tokens": 2093110.0, "step": 1150 }, { "entropy": 6.879178285598755, "epoch": 0.9926944563816072, "grad_norm": 1.125, "learning_rate": 0.0004997670019905057, "loss": 6.6634, "mean_token_accuracy": 0.12532600611448289, "num_tokens": 2102355.0, "step": 1155 }, { "entropy": 6.967250823974609, "epoch": 0.9969918349806618, "grad_norm": 1.171875, "learning_rate": 0.0004997516294662876, "loss": 6.6987, "mean_token_accuracy": 0.12651606351137162, "num_tokens": 2110418.0, "step": 1160 }, { "entropy": 6.987489064534505, "epoch": 1.0008594757198108, "grad_norm": 1.1484375, "learning_rate": 0.0004997357661743433, "loss": 6.6851, "mean_token_accuracy": 0.12885562578837076, "num_tokens": 2117866.0, "step": 1165 }, { "entropy": 6.906875991821289, "epoch": 1.0051568543188656, "grad_norm": 1.09375, "learning_rate": 0.0004997194121493118, "loss": 6.5242, "mean_token_accuracy": 0.1341039627790451, "num_tokens": 2126082.0, "step": 1170 }, { "entropy": 6.9217222213745115, "epoch": 1.0094542329179201, "grad_norm": 1.078125, "learning_rate": 0.0004997025674269037, "loss": 6.496, "mean_token_accuracy": 0.14013660922646523, "num_tokens": 2134042.0, "step": 1175 }, { "entropy": 6.853777265548706, "epoch": 1.0137516115169747, "grad_norm": 1.1953125, "learning_rate": 0.0004996852320439013, "loss": 6.5756, "mean_token_accuracy": 0.13146138042211533, "num_tokens": 2142570.0, "step": 1180 }, { "entropy": 6.882978248596191, "epoch": 1.0180489901160292, "grad_norm": 0.9765625, "learning_rate": 0.0004996674060381578, "loss": 6.5116, "mean_token_accuracy": 0.13583723902702333, "num_tokens": 2151310.0, "step": 1185 }, { "entropy": 6.949011325836182, "epoch": 1.0223463687150838, "grad_norm": 1.09375, "learning_rate": 0.0004996490894485985, "loss": 6.5696, "mean_token_accuracy": 0.1317083679139614, "num_tokens": 2160662.0, "step": 1190 }, { "entropy": 6.906634664535522, "epoch": 1.0266437473141383, "grad_norm": 1.078125, "learning_rate": 0.0004996302823152193, "loss": 6.5221, "mean_token_accuracy": 0.132858457416296, "num_tokens": 2170067.0, "step": 1195 }, { "entropy": 6.835825204849243, "epoch": 1.0309411259131929, "grad_norm": 1.09375, "learning_rate": 0.0004996109846790873, "loss": 6.4844, "mean_token_accuracy": 0.13565613552927971, "num_tokens": 2178850.0, "step": 1200 }, { "entropy": 6.833173513412476, "epoch": 1.0352385045122476, "grad_norm": 0.984375, "learning_rate": 0.0004995911965823412, "loss": 6.5058, "mean_token_accuracy": 0.14241415858268738, "num_tokens": 2188307.0, "step": 1205 }, { "entropy": 6.888755178451538, "epoch": 1.0395358831113022, "grad_norm": 1.171875, "learning_rate": 0.0004995709180681899, "loss": 6.5098, "mean_token_accuracy": 0.14214854687452316, "num_tokens": 2197026.0, "step": 1210 }, { "entropy": 6.828827667236328, "epoch": 1.0438332617103567, "grad_norm": 1.109375, "learning_rate": 0.000499550149180914, "loss": 6.4795, "mean_token_accuracy": 0.13599886670708655, "num_tokens": 2205537.0, "step": 1215 }, { "entropy": 6.880095815658569, "epoch": 1.0481306403094113, "grad_norm": 1.15625, "learning_rate": 0.0004995288899658641, "loss": 6.5128, "mean_token_accuracy": 0.14047559648752211, "num_tokens": 2214508.0, "step": 1220 }, { "entropy": 6.848831415176392, "epoch": 1.0524280189084658, "grad_norm": 1.1796875, "learning_rate": 0.0004995071404694619, "loss": 6.6248, "mean_token_accuracy": 0.1286735638976097, "num_tokens": 2223084.0, "step": 1225 }, { "entropy": 6.930538558959961, "epoch": 1.0567253975075204, "grad_norm": 1.0546875, "learning_rate": 0.0004994849007391996, "loss": 6.5507, "mean_token_accuracy": 0.12893568202853203, "num_tokens": 2231406.0, "step": 1230 }, { "entropy": 6.784887790679932, "epoch": 1.061022776106575, "grad_norm": 1.0859375, "learning_rate": 0.0004994621708236401, "loss": 6.4682, "mean_token_accuracy": 0.136442781239748, "num_tokens": 2239867.0, "step": 1235 }, { "entropy": 6.8624866008758545, "epoch": 1.0653201547056295, "grad_norm": 1.203125, "learning_rate": 0.000499438950772416, "loss": 6.5264, "mean_token_accuracy": 0.1343722127377987, "num_tokens": 2248844.0, "step": 1240 }, { "entropy": 6.764705419540405, "epoch": 1.0696175333046842, "grad_norm": 1.125, "learning_rate": 0.0004994152406362311, "loss": 6.4525, "mean_token_accuracy": 0.14018251076340676, "num_tokens": 2257599.0, "step": 1245 }, { "entropy": 6.871714019775391, "epoch": 1.0739149119037388, "grad_norm": 1.2421875, "learning_rate": 0.0004993910404668586, "loss": 6.4992, "mean_token_accuracy": 0.1316287100315094, "num_tokens": 2266510.0, "step": 1250 }, { "entropy": 6.801673936843872, "epoch": 1.0782122905027933, "grad_norm": 1.0, "learning_rate": 0.000499366350317142, "loss": 6.4902, "mean_token_accuracy": 0.1355181120336056, "num_tokens": 2275462.0, "step": 1255 }, { "entropy": 6.805047512054443, "epoch": 1.0825096691018479, "grad_norm": 1.1484375, "learning_rate": 0.0004993411702409948, "loss": 6.4684, "mean_token_accuracy": 0.13499311953783036, "num_tokens": 2283826.0, "step": 1260 }, { "entropy": 6.796231460571289, "epoch": 1.0868070477009024, "grad_norm": 1.171875, "learning_rate": 0.0004993155002934002, "loss": 6.4758, "mean_token_accuracy": 0.13739539608359336, "num_tokens": 2292967.0, "step": 1265 }, { "entropy": 6.935551691055298, "epoch": 1.091104426299957, "grad_norm": 1.5078125, "learning_rate": 0.0004992893405304111, "loss": 6.6091, "mean_token_accuracy": 0.13493912890553475, "num_tokens": 2302336.0, "step": 1270 }, { "entropy": 6.757972192764282, "epoch": 1.0954018048990115, "grad_norm": 1.03125, "learning_rate": 0.00049926269100915, "loss": 6.5039, "mean_token_accuracy": 0.14085786640644074, "num_tokens": 2311465.0, "step": 1275 }, { "entropy": 6.884800767898559, "epoch": 1.0996991834980663, "grad_norm": 1.0859375, "learning_rate": 0.0004992355517878087, "loss": 6.6134, "mean_token_accuracy": 0.12797435671091079, "num_tokens": 2320281.0, "step": 1280 }, { "entropy": 6.775428581237793, "epoch": 1.1039965620971208, "grad_norm": 1.15625, "learning_rate": 0.0004992079229256484, "loss": 6.5189, "mean_token_accuracy": 0.1329084627330303, "num_tokens": 2329755.0, "step": 1285 }, { "entropy": 6.721524858474732, "epoch": 1.1082939406961754, "grad_norm": 1.015625, "learning_rate": 0.0004991798044829996, "loss": 6.4524, "mean_token_accuracy": 0.1344260886311531, "num_tokens": 2338807.0, "step": 1290 }, { "entropy": 6.870701122283935, "epoch": 1.11259131929523, "grad_norm": 1.109375, "learning_rate": 0.0004991511965212618, "loss": 6.5591, "mean_token_accuracy": 0.13554905205965043, "num_tokens": 2348056.0, "step": 1295 }, { "entropy": 6.759064626693726, "epoch": 1.1168886978942845, "grad_norm": 1.0546875, "learning_rate": 0.0004991220991029032, "loss": 6.5619, "mean_token_accuracy": 0.13164993077516557, "num_tokens": 2357780.0, "step": 1300 }, { "entropy": 6.845104169845581, "epoch": 1.121186076493339, "grad_norm": 1.296875, "learning_rate": 0.000499092512291461, "loss": 6.526, "mean_token_accuracy": 0.13971479684114457, "num_tokens": 2367060.0, "step": 1305 }, { "entropy": 6.800533056259155, "epoch": 1.1254834550923936, "grad_norm": 1.0859375, "learning_rate": 0.000499062436151541, "loss": 6.5277, "mean_token_accuracy": 0.13263508304953575, "num_tokens": 2375751.0, "step": 1310 }, { "entropy": 6.890619134902954, "epoch": 1.129780833691448, "grad_norm": 1.109375, "learning_rate": 0.0004990318707488173, "loss": 6.5788, "mean_token_accuracy": 0.12899956330657006, "num_tokens": 2385013.0, "step": 1315 }, { "entropy": 6.769053792953491, "epoch": 1.1340782122905029, "grad_norm": 1.140625, "learning_rate": 0.0004990008161500327, "loss": 6.48, "mean_token_accuracy": 0.1359359547495842, "num_tokens": 2392935.0, "step": 1320 }, { "entropy": 6.7767839431762695, "epoch": 1.1383755908895574, "grad_norm": 1.2109375, "learning_rate": 0.000498969272422998, "loss": 6.4887, "mean_token_accuracy": 0.13946662694215775, "num_tokens": 2401560.0, "step": 1325 }, { "entropy": 6.732125520706177, "epoch": 1.142672969488612, "grad_norm": 1.0546875, "learning_rate": 0.0004989372396365921, "loss": 6.4183, "mean_token_accuracy": 0.13894038647413254, "num_tokens": 2410050.0, "step": 1330 }, { "entropy": 6.8855541229248045, "epoch": 1.1469703480876665, "grad_norm": 1.1015625, "learning_rate": 0.0004989047178607618, "loss": 6.5218, "mean_token_accuracy": 0.13579266518354416, "num_tokens": 2418980.0, "step": 1335 }, { "entropy": 6.7566611766815186, "epoch": 1.151267726686721, "grad_norm": 1.09375, "learning_rate": 0.0004988717071665215, "loss": 6.5177, "mean_token_accuracy": 0.13580050468444824, "num_tokens": 2427992.0, "step": 1340 }, { "entropy": 6.821787118911743, "epoch": 1.1555651052857756, "grad_norm": 0.99609375, "learning_rate": 0.0004988382076259537, "loss": 6.4297, "mean_token_accuracy": 0.1417124703526497, "num_tokens": 2436368.0, "step": 1345 }, { "entropy": 6.65723991394043, "epoch": 1.1598624838848304, "grad_norm": 1.0, "learning_rate": 0.0004988042193122077, "loss": 6.4243, "mean_token_accuracy": 0.1399266541004181, "num_tokens": 2445499.0, "step": 1350 }, { "entropy": 6.846164894104004, "epoch": 1.164159862483885, "grad_norm": 1.171875, "learning_rate": 0.0004987697422995005, "loss": 6.4564, "mean_token_accuracy": 0.13335739225149154, "num_tokens": 2454312.0, "step": 1355 }, { "entropy": 6.705566883087158, "epoch": 1.1684572410829395, "grad_norm": 1.0625, "learning_rate": 0.0004987347766631161, "loss": 6.5179, "mean_token_accuracy": 0.13981100916862488, "num_tokens": 2462922.0, "step": 1360 }, { "entropy": 6.8054440975189205, "epoch": 1.172754619681994, "grad_norm": 1.046875, "learning_rate": 0.0004986993224794055, "loss": 6.5574, "mean_token_accuracy": 0.12931617349386215, "num_tokens": 2472195.0, "step": 1365 }, { "entropy": 6.731846857070923, "epoch": 1.1770519982810486, "grad_norm": 1.171875, "learning_rate": 0.0004986633798257865, "loss": 6.456, "mean_token_accuracy": 0.13557855412364006, "num_tokens": 2481021.0, "step": 1370 }, { "entropy": 6.709754800796508, "epoch": 1.181349376880103, "grad_norm": 1.171875, "learning_rate": 0.0004986269487807434, "loss": 6.4682, "mean_token_accuracy": 0.13462188541889192, "num_tokens": 2490250.0, "step": 1375 }, { "entropy": 6.8344573974609375, "epoch": 1.1856467554791577, "grad_norm": 1.0625, "learning_rate": 0.000498590029423827, "loss": 6.529, "mean_token_accuracy": 0.13892517015337943, "num_tokens": 2499122.0, "step": 1380 }, { "entropy": 6.794313240051269, "epoch": 1.1899441340782122, "grad_norm": 1.109375, "learning_rate": 0.0004985526218356546, "loss": 6.5102, "mean_token_accuracy": 0.13186247944831847, "num_tokens": 2508454.0, "step": 1385 }, { "entropy": 6.717947912216187, "epoch": 1.1942415126772667, "grad_norm": 1.09375, "learning_rate": 0.0004985147260979093, "loss": 6.449, "mean_token_accuracy": 0.1434843860566616, "num_tokens": 2517353.0, "step": 1390 }, { "entropy": 6.771858787536621, "epoch": 1.1985388912763215, "grad_norm": 1.140625, "learning_rate": 0.0004984763422933402, "loss": 6.4618, "mean_token_accuracy": 0.13847233429551126, "num_tokens": 2526321.0, "step": 1395 }, { "entropy": 6.732237863540649, "epoch": 1.202836269875376, "grad_norm": 0.984375, "learning_rate": 0.0004984374705057623, "loss": 6.5033, "mean_token_accuracy": 0.13528537154197692, "num_tokens": 2535924.0, "step": 1400 }, { "entropy": 6.721146202087402, "epoch": 1.2071336484744306, "grad_norm": 1.1484375, "learning_rate": 0.0004983981108200561, "loss": 6.4711, "mean_token_accuracy": 0.13535311296582223, "num_tokens": 2545606.0, "step": 1405 }, { "entropy": 6.733812093734741, "epoch": 1.2114310270734852, "grad_norm": 1.125, "learning_rate": 0.0004983582633221672, "loss": 6.4601, "mean_token_accuracy": 0.1369933992624283, "num_tokens": 2554947.0, "step": 1410 }, { "entropy": 6.855603933334351, "epoch": 1.2157284056725397, "grad_norm": 0.984375, "learning_rate": 0.0004983179280991068, "loss": 6.6134, "mean_token_accuracy": 0.12978528887033464, "num_tokens": 2564462.0, "step": 1415 }, { "entropy": 6.726688861846924, "epoch": 1.2200257842715942, "grad_norm": 1.09375, "learning_rate": 0.0004982771052389508, "loss": 6.4475, "mean_token_accuracy": 0.1368112660944462, "num_tokens": 2573124.0, "step": 1420 }, { "entropy": 6.807424783706665, "epoch": 1.224323162870649, "grad_norm": 1.1015625, "learning_rate": 0.0004982357948308401, "loss": 6.5481, "mean_token_accuracy": 0.13265790268778802, "num_tokens": 2581829.0, "step": 1425 }, { "entropy": 6.770775365829468, "epoch": 1.2286205414697036, "grad_norm": 1.1015625, "learning_rate": 0.0004981939969649799, "loss": 6.4049, "mean_token_accuracy": 0.14194427505135537, "num_tokens": 2590631.0, "step": 1430 }, { "entropy": 6.709357166290284, "epoch": 1.232917920068758, "grad_norm": 1.1640625, "learning_rate": 0.0004981517117326404, "loss": 6.5216, "mean_token_accuracy": 0.13609697446227073, "num_tokens": 2600684.0, "step": 1435 }, { "entropy": 6.725667095184326, "epoch": 1.2372152986678127, "grad_norm": 1.046875, "learning_rate": 0.0004981089392261553, "loss": 6.4349, "mean_token_accuracy": 0.14131608307361604, "num_tokens": 2609667.0, "step": 1440 }, { "entropy": 6.692513275146484, "epoch": 1.2415126772668672, "grad_norm": 0.99609375, "learning_rate": 0.000498065679538923, "loss": 6.5055, "mean_token_accuracy": 0.14114993885159494, "num_tokens": 2620025.0, "step": 1445 }, { "entropy": 6.7513340473175045, "epoch": 1.2458100558659218, "grad_norm": 1.125, "learning_rate": 0.0004980219327654049, "loss": 6.428, "mean_token_accuracy": 0.13774933964014052, "num_tokens": 2629032.0, "step": 1450 }, { "entropy": 6.702835464477539, "epoch": 1.2501074344649763, "grad_norm": 1.09375, "learning_rate": 0.000497977699001127, "loss": 6.402, "mean_token_accuracy": 0.142982679605484, "num_tokens": 2638303.0, "step": 1455 }, { "entropy": 6.761410474777222, "epoch": 1.2544048130640308, "grad_norm": 1.125, "learning_rate": 0.0004979329783426778, "loss": 6.4318, "mean_token_accuracy": 0.14380076453089713, "num_tokens": 2647902.0, "step": 1460 }, { "entropy": 6.731089019775391, "epoch": 1.2587021916630854, "grad_norm": 1.1015625, "learning_rate": 0.0004978877708877094, "loss": 6.4848, "mean_token_accuracy": 0.13676076754927635, "num_tokens": 2657902.0, "step": 1465 }, { "entropy": 6.71400637626648, "epoch": 1.2629995702621402, "grad_norm": 1.0703125, "learning_rate": 0.0004978420767349368, "loss": 6.4196, "mean_token_accuracy": 0.13780386745929718, "num_tokens": 2667082.0, "step": 1470 }, { "entropy": 6.737793684005737, "epoch": 1.2672969488611947, "grad_norm": 1.03125, "learning_rate": 0.0004977958959841379, "loss": 6.4943, "mean_token_accuracy": 0.1352358005940914, "num_tokens": 2676855.0, "step": 1475 }, { "entropy": 6.734015226364136, "epoch": 1.2715943274602493, "grad_norm": 1.0390625, "learning_rate": 0.000497749228736153, "loss": 6.4201, "mean_token_accuracy": 0.14142746701836587, "num_tokens": 2685750.0, "step": 1480 }, { "entropy": 6.656690311431885, "epoch": 1.2758917060593038, "grad_norm": 1.171875, "learning_rate": 0.0004977020750928845, "loss": 6.4771, "mean_token_accuracy": 0.14191860556602479, "num_tokens": 2695272.0, "step": 1485 }, { "entropy": 6.794925928115845, "epoch": 1.2801890846583583, "grad_norm": 1.046875, "learning_rate": 0.0004976544351572973, "loss": 6.4253, "mean_token_accuracy": 0.14196638017892838, "num_tokens": 2704806.0, "step": 1490 }, { "entropy": 6.56059627532959, "epoch": 1.2844864632574131, "grad_norm": 1.0390625, "learning_rate": 0.0004976063090334179, "loss": 6.4836, "mean_token_accuracy": 0.14093814194202423, "num_tokens": 2713521.0, "step": 1495 }, { "entropy": 6.7648594856262205, "epoch": 1.2887838418564677, "grad_norm": 1.1171875, "learning_rate": 0.0004975576968263346, "loss": 6.472, "mean_token_accuracy": 0.13531532436609267, "num_tokens": 2721848.0, "step": 1500 }, { "epoch": 1.2887838418564677, "eval_entropy": 6.583824046023257, "eval_loss": 6.552463054656982, "eval_mean_token_accuracy": 0.13841687775477096, "eval_num_tokens": 2721848.0, "eval_runtime": 2.0451, "eval_samples_per_second": 1735.359, "eval_steps_per_second": 217.103, "step": 1500 }, { "entropy": 6.6689835548400875, "epoch": 1.2930812204555222, "grad_norm": 1.0, "learning_rate": 0.000497508598642197, "loss": 6.4406, "mean_token_accuracy": 0.13946301937103273, "num_tokens": 2731473.0, "step": 1505 }, { "entropy": 6.724963998794555, "epoch": 1.2973785990545768, "grad_norm": 1.0625, "learning_rate": 0.000497459014588216, "loss": 6.5064, "mean_token_accuracy": 0.13410719558596612, "num_tokens": 2739867.0, "step": 1510 }, { "entropy": 6.701112556457519, "epoch": 1.3016759776536313, "grad_norm": 1.0859375, "learning_rate": 0.000497408944772663, "loss": 6.4165, "mean_token_accuracy": 0.14087883234024048, "num_tokens": 2748903.0, "step": 1515 }, { "entropy": 6.621306848526001, "epoch": 1.3059733562526858, "grad_norm": 1.0390625, "learning_rate": 0.0004973583893048707, "loss": 6.4144, "mean_token_accuracy": 0.13790024891495706, "num_tokens": 2757711.0, "step": 1520 }, { "entropy": 6.8078021049499515, "epoch": 1.3102707348517404, "grad_norm": 1.109375, "learning_rate": 0.0004973073482952321, "loss": 6.4178, "mean_token_accuracy": 0.14102478623390197, "num_tokens": 2765633.0, "step": 1525 }, { "entropy": 6.606275224685669, "epoch": 1.314568113450795, "grad_norm": 1.3046875, "learning_rate": 0.0004972558218552004, "loss": 6.454, "mean_token_accuracy": 0.1388860262930393, "num_tokens": 2774495.0, "step": 1530 }, { "entropy": 6.737347936630249, "epoch": 1.3188654920498495, "grad_norm": 1.1328125, "learning_rate": 0.0004972038100972885, "loss": 6.4827, "mean_token_accuracy": 0.13370617032051085, "num_tokens": 2782665.0, "step": 1535 }, { "entropy": 6.652740144729615, "epoch": 1.323162870648904, "grad_norm": 1.3125, "learning_rate": 0.0004971513131350697, "loss": 6.4163, "mean_token_accuracy": 0.13846877068281174, "num_tokens": 2791394.0, "step": 1540 }, { "entropy": 6.583173847198486, "epoch": 1.3274602492479588, "grad_norm": 1.1484375, "learning_rate": 0.0004970983310831759, "loss": 6.4113, "mean_token_accuracy": 0.13881225883960724, "num_tokens": 2800488.0, "step": 1545 }, { "entropy": 6.734278392791748, "epoch": 1.3317576278470133, "grad_norm": 1.03125, "learning_rate": 0.0004970448640572989, "loss": 6.5243, "mean_token_accuracy": 0.1339696764945984, "num_tokens": 2810116.0, "step": 1550 }, { "entropy": 6.658429765701294, "epoch": 1.336055006446068, "grad_norm": 0.94921875, "learning_rate": 0.0004969909121741895, "loss": 6.3255, "mean_token_accuracy": 0.14455484077334405, "num_tokens": 2819205.0, "step": 1555 }, { "entropy": 6.591242885589599, "epoch": 1.3403523850451224, "grad_norm": 1.109375, "learning_rate": 0.0004969364755516569, "loss": 6.4035, "mean_token_accuracy": 0.13771276026964188, "num_tokens": 2828017.0, "step": 1560 }, { "entropy": 6.73987512588501, "epoch": 1.344649763644177, "grad_norm": 1.1328125, "learning_rate": 0.0004968815543085689, "loss": 6.438, "mean_token_accuracy": 0.14133503511548043, "num_tokens": 2837125.0, "step": 1565 }, { "entropy": 6.648034620285034, "epoch": 1.3489471422432318, "grad_norm": 1.0625, "learning_rate": 0.0004968261485648516, "loss": 6.4665, "mean_token_accuracy": 0.13752973526716233, "num_tokens": 2845438.0, "step": 1570 }, { "entropy": 6.690678644180298, "epoch": 1.3532445208422863, "grad_norm": 1.015625, "learning_rate": 0.000496770258441489, "loss": 6.4311, "mean_token_accuracy": 0.14550055414438248, "num_tokens": 2854389.0, "step": 1575 }, { "entropy": 6.591717529296875, "epoch": 1.3575418994413408, "grad_norm": 1.0234375, "learning_rate": 0.0004967138840605228, "loss": 6.3947, "mean_token_accuracy": 0.1433369368314743, "num_tokens": 2863654.0, "step": 1580 }, { "entropy": 6.645109987258911, "epoch": 1.3618392780403954, "grad_norm": 1.0703125, "learning_rate": 0.000496657025545052, "loss": 6.3068, "mean_token_accuracy": 0.14519514814019202, "num_tokens": 2872871.0, "step": 1585 }, { "entropy": 6.5770776748657225, "epoch": 1.36613665663945, "grad_norm": 1.1328125, "learning_rate": 0.000496599683019233, "loss": 6.4037, "mean_token_accuracy": 0.14221980646252633, "num_tokens": 2881140.0, "step": 1590 }, { "entropy": 6.7226653575897215, "epoch": 1.3704340352385045, "grad_norm": 1.0546875, "learning_rate": 0.000496541856608279, "loss": 6.3852, "mean_token_accuracy": 0.14397331327199936, "num_tokens": 2889945.0, "step": 1595 }, { "entropy": 6.5361980438232425, "epoch": 1.374731413837559, "grad_norm": 0.95703125, "learning_rate": 0.0004964835464384595, "loss": 6.3238, "mean_token_accuracy": 0.145409494638443, "num_tokens": 2898897.0, "step": 1600 }, { "entropy": 6.686757373809814, "epoch": 1.3790287924366136, "grad_norm": 1.09375, "learning_rate": 0.000496424752637101, "loss": 6.3401, "mean_token_accuracy": 0.14611406177282332, "num_tokens": 2907717.0, "step": 1605 }, { "entropy": 6.578691530227661, "epoch": 1.3833261710356681, "grad_norm": 1.078125, "learning_rate": 0.0004963654753325853, "loss": 6.3297, "mean_token_accuracy": 0.14271921664476395, "num_tokens": 2916213.0, "step": 1610 }, { "entropy": 6.683462333679199, "epoch": 1.387623549634723, "grad_norm": 1.0, "learning_rate": 0.0004963057146543505, "loss": 6.4949, "mean_token_accuracy": 0.1387751467525959, "num_tokens": 2925706.0, "step": 1615 }, { "entropy": 6.599123191833496, "epoch": 1.3919209282337774, "grad_norm": 1.015625, "learning_rate": 0.00049624547073289, "loss": 6.4208, "mean_token_accuracy": 0.1372368849813938, "num_tokens": 2934464.0, "step": 1620 }, { "entropy": 6.672312545776367, "epoch": 1.396218306832832, "grad_norm": 1.140625, "learning_rate": 0.0004961847436997526, "loss": 6.3195, "mean_token_accuracy": 0.14415977373719216, "num_tokens": 2944095.0, "step": 1625 }, { "entropy": 6.480645990371704, "epoch": 1.4005156854318865, "grad_norm": 1.09375, "learning_rate": 0.0004961235336875416, "loss": 6.3231, "mean_token_accuracy": 0.14915895387530326, "num_tokens": 2953357.0, "step": 1630 }, { "entropy": 6.639774322509766, "epoch": 1.404813064030941, "grad_norm": 1.109375, "learning_rate": 0.0004960618408299154, "loss": 6.4687, "mean_token_accuracy": 0.13529081642627716, "num_tokens": 2963020.0, "step": 1635 }, { "entropy": 6.682909727096558, "epoch": 1.4091104426299956, "grad_norm": 1.046875, "learning_rate": 0.0004959996652615865, "loss": 6.319, "mean_token_accuracy": 0.14330243095755577, "num_tokens": 2971955.0, "step": 1640 }, { "entropy": 6.6523435592651365, "epoch": 1.4134078212290504, "grad_norm": 1.0703125, "learning_rate": 0.0004959370071183216, "loss": 6.3766, "mean_token_accuracy": 0.14444040805101394, "num_tokens": 2980662.0, "step": 1645 }, { "entropy": 6.675427007675171, "epoch": 1.417705199828105, "grad_norm": 1.1484375, "learning_rate": 0.0004958738665369407, "loss": 6.5051, "mean_token_accuracy": 0.12928852811455727, "num_tokens": 2990038.0, "step": 1650 }, { "entropy": 6.632522964477539, "epoch": 1.4220025784271595, "grad_norm": 1.1328125, "learning_rate": 0.0004958102436553179, "loss": 6.4172, "mean_token_accuracy": 0.1390580452978611, "num_tokens": 2999835.0, "step": 1655 }, { "entropy": 6.694387483596802, "epoch": 1.426299957026214, "grad_norm": 0.98828125, "learning_rate": 0.00049574613861238, "loss": 6.4118, "mean_token_accuracy": 0.13762674629688262, "num_tokens": 3009593.0, "step": 1660 }, { "entropy": 6.648862266540528, "epoch": 1.4305973356252686, "grad_norm": 0.99609375, "learning_rate": 0.0004956815515481069, "loss": 6.4348, "mean_token_accuracy": 0.144145817309618, "num_tokens": 3019187.0, "step": 1665 }, { "entropy": 6.582254266738891, "epoch": 1.4348947142243231, "grad_norm": 1.078125, "learning_rate": 0.0004956164826035309, "loss": 6.3495, "mean_token_accuracy": 0.14171260893344878, "num_tokens": 3027875.0, "step": 1670 }, { "entropy": 6.569947624206543, "epoch": 1.4391920928233777, "grad_norm": 1.1171875, "learning_rate": 0.0004955509319207363, "loss": 6.3833, "mean_token_accuracy": 0.13855091333389283, "num_tokens": 3036902.0, "step": 1675 }, { "entropy": 6.548913908004761, "epoch": 1.4434894714224322, "grad_norm": 0.9375, "learning_rate": 0.0004954848996428601, "loss": 6.36, "mean_token_accuracy": 0.14765606224536895, "num_tokens": 3046653.0, "step": 1680 }, { "entropy": 6.6836981773376465, "epoch": 1.4477868500214868, "grad_norm": 1.3515625, "learning_rate": 0.00049541838591409, "loss": 6.448, "mean_token_accuracy": 0.13707543835043906, "num_tokens": 3056273.0, "step": 1685 }, { "entropy": 6.570832586288452, "epoch": 1.4520842286205415, "grad_norm": 1.046875, "learning_rate": 0.0004953513908796657, "loss": 6.3562, "mean_token_accuracy": 0.13904846012592315, "num_tokens": 3065662.0, "step": 1690 }, { "entropy": 6.719029092788697, "epoch": 1.456381607219596, "grad_norm": 1.140625, "learning_rate": 0.0004952839146858773, "loss": 6.3883, "mean_token_accuracy": 0.14505013972520828, "num_tokens": 3073970.0, "step": 1695 }, { "entropy": 6.546349334716797, "epoch": 1.4606789858186506, "grad_norm": 1.1796875, "learning_rate": 0.0004952159574800658, "loss": 6.3978, "mean_token_accuracy": 0.13897576928138733, "num_tokens": 3082500.0, "step": 1700 }, { "entropy": 6.645324468612671, "epoch": 1.4649763644177052, "grad_norm": 1.0859375, "learning_rate": 0.0004951475194106229, "loss": 6.342, "mean_token_accuracy": 0.14458465725183486, "num_tokens": 3091574.0, "step": 1705 }, { "entropy": 6.590623474121093, "epoch": 1.4692737430167597, "grad_norm": 1.0234375, "learning_rate": 0.0004950786006269898, "loss": 6.4477, "mean_token_accuracy": 0.1356819100677967, "num_tokens": 3102402.0, "step": 1710 }, { "entropy": 6.654024839401245, "epoch": 1.4735711216158143, "grad_norm": 1.125, "learning_rate": 0.0004950092012796576, "loss": 6.2738, "mean_token_accuracy": 0.14728236198425293, "num_tokens": 3111347.0, "step": 1715 }, { "entropy": 6.553081369400024, "epoch": 1.477868500214869, "grad_norm": 1.1796875, "learning_rate": 0.0004949393215201666, "loss": 6.3455, "mean_token_accuracy": 0.14207591861486435, "num_tokens": 3120018.0, "step": 1720 }, { "entropy": 6.595822668075561, "epoch": 1.4821658788139236, "grad_norm": 0.96875, "learning_rate": 0.0004948689615011065, "loss": 6.4086, "mean_token_accuracy": 0.13704866543412209, "num_tokens": 3129669.0, "step": 1725 }, { "entropy": 6.628203105926514, "epoch": 1.4864632574129781, "grad_norm": 0.953125, "learning_rate": 0.0004947981213761154, "loss": 6.3443, "mean_token_accuracy": 0.14518199041485785, "num_tokens": 3139112.0, "step": 1730 }, { "entropy": 6.5786394596099855, "epoch": 1.4907606360120327, "grad_norm": 1.046875, "learning_rate": 0.0004947268012998797, "loss": 6.3058, "mean_token_accuracy": 0.15637002438306807, "num_tokens": 3148437.0, "step": 1735 }, { "entropy": 6.570107936859131, "epoch": 1.4950580146110872, "grad_norm": 0.9609375, "learning_rate": 0.000494655001428134, "loss": 6.2891, "mean_token_accuracy": 0.14667836502194403, "num_tokens": 3158165.0, "step": 1740 }, { "entropy": 6.586823749542236, "epoch": 1.4993553932101418, "grad_norm": 1.1015625, "learning_rate": 0.0004945827219176604, "loss": 6.3587, "mean_token_accuracy": 0.1493491068482399, "num_tokens": 3167262.0, "step": 1745 }, { "entropy": 6.514509057998657, "epoch": 1.5036527718091963, "grad_norm": 1.0078125, "learning_rate": 0.0004945099629262888, "loss": 6.3479, "mean_token_accuracy": 0.1436598651111126, "num_tokens": 3176696.0, "step": 1750 }, { "entropy": 6.673803234100342, "epoch": 1.5079501504082509, "grad_norm": 1.0546875, "learning_rate": 0.0004944367246128954, "loss": 6.4304, "mean_token_accuracy": 0.13725945726037025, "num_tokens": 3185857.0, "step": 1755 }, { "entropy": 6.5661591529846195, "epoch": 1.5122475290073054, "grad_norm": 1.0625, "learning_rate": 0.0004943630071374036, "loss": 6.2677, "mean_token_accuracy": 0.14966750741004944, "num_tokens": 3194687.0, "step": 1760 }, { "entropy": 6.554711723327637, "epoch": 1.51654490760636, "grad_norm": 1.0078125, "learning_rate": 0.0004942888106607828, "loss": 6.3291, "mean_token_accuracy": 0.14281144142150878, "num_tokens": 3204913.0, "step": 1765 }, { "entropy": 6.641019535064697, "epoch": 1.5208422862054147, "grad_norm": 1.0390625, "learning_rate": 0.0004942141353450486, "loss": 6.3145, "mean_token_accuracy": 0.1485350415110588, "num_tokens": 3213312.0, "step": 1770 }, { "entropy": 6.493930768966675, "epoch": 1.5251396648044693, "grad_norm": 0.96875, "learning_rate": 0.0004941389813532619, "loss": 6.2368, "mean_token_accuracy": 0.15905009657144548, "num_tokens": 3222992.0, "step": 1775 }, { "entropy": 6.511264657974243, "epoch": 1.5294370434035238, "grad_norm": 0.984375, "learning_rate": 0.000494063348849529, "loss": 6.2816, "mean_token_accuracy": 0.14892083406448364, "num_tokens": 3232836.0, "step": 1780 }, { "entropy": 6.616392660140991, "epoch": 1.5337344220025786, "grad_norm": 0.94140625, "learning_rate": 0.0004939872379990011, "loss": 6.4346, "mean_token_accuracy": 0.1384902000427246, "num_tokens": 3243171.0, "step": 1785 }, { "entropy": 6.671454858779907, "epoch": 1.5380318006016331, "grad_norm": 1.1796875, "learning_rate": 0.0004939106489678739, "loss": 6.3565, "mean_token_accuracy": 0.14886578172445297, "num_tokens": 3251995.0, "step": 1790 }, { "entropy": 6.483775520324707, "epoch": 1.5423291792006877, "grad_norm": 1.015625, "learning_rate": 0.000493833581923387, "loss": 6.2999, "mean_token_accuracy": 0.147441129386425, "num_tokens": 3260841.0, "step": 1795 }, { "entropy": 6.614831399917603, "epoch": 1.5466265577997422, "grad_norm": 1.0546875, "learning_rate": 0.0004937560370338244, "loss": 6.4359, "mean_token_accuracy": 0.1328293912112713, "num_tokens": 3270979.0, "step": 1800 }, { "entropy": 6.602978515625, "epoch": 1.5509239363987968, "grad_norm": 1.0859375, "learning_rate": 0.000493678014468513, "loss": 6.3703, "mean_token_accuracy": 0.14689823091030121, "num_tokens": 3279848.0, "step": 1805 }, { "entropy": 6.534598064422608, "epoch": 1.5552213149978513, "grad_norm": 0.94921875, "learning_rate": 0.0004935995143978227, "loss": 6.3674, "mean_token_accuracy": 0.14537320658564568, "num_tokens": 3289172.0, "step": 1810 }, { "entropy": 6.508708524703979, "epoch": 1.5595186935969059, "grad_norm": 1.1484375, "learning_rate": 0.0004935205369931664, "loss": 6.2677, "mean_token_accuracy": 0.1513919234275818, "num_tokens": 3297432.0, "step": 1815 }, { "entropy": 6.684668636322021, "epoch": 1.5638160721959604, "grad_norm": 0.92578125, "learning_rate": 0.0004934410824269992, "loss": 6.2954, "mean_token_accuracy": 0.1454857923090458, "num_tokens": 3307486.0, "step": 1820 }, { "entropy": 6.466551637649536, "epoch": 1.568113450795015, "grad_norm": 1.0234375, "learning_rate": 0.0004933611508728182, "loss": 6.2671, "mean_token_accuracy": 0.14967258870601655, "num_tokens": 3316296.0, "step": 1825 }, { "entropy": 6.563362693786621, "epoch": 1.5724108293940695, "grad_norm": 1.0078125, "learning_rate": 0.000493280742505162, "loss": 6.2972, "mean_token_accuracy": 0.14479405283927918, "num_tokens": 3326080.0, "step": 1830 }, { "entropy": 6.456173896789551, "epoch": 1.576708207993124, "grad_norm": 1.0546875, "learning_rate": 0.0004931998574996102, "loss": 6.217, "mean_token_accuracy": 0.15072606950998307, "num_tokens": 3334826.0, "step": 1835 }, { "entropy": 6.472858524322509, "epoch": 1.5810055865921788, "grad_norm": 1.0859375, "learning_rate": 0.0004931184960327832, "loss": 6.2177, "mean_token_accuracy": 0.1524192661046982, "num_tokens": 3343261.0, "step": 1840 }, { "entropy": 6.493236398696899, "epoch": 1.5853029651912334, "grad_norm": 1.640625, "learning_rate": 0.0004930366582823421, "loss": 6.2619, "mean_token_accuracy": 0.14549409449100495, "num_tokens": 3352513.0, "step": 1845 }, { "entropy": 6.541861534118652, "epoch": 1.589600343790288, "grad_norm": 1.1484375, "learning_rate": 0.0004929543444269879, "loss": 6.3147, "mean_token_accuracy": 0.15202615782618523, "num_tokens": 3361577.0, "step": 1850 }, { "entropy": 6.516072130203247, "epoch": 1.5938977223893425, "grad_norm": 1.1171875, "learning_rate": 0.000492871554646461, "loss": 6.3805, "mean_token_accuracy": 0.1442191444337368, "num_tokens": 3370591.0, "step": 1855 }, { "entropy": 6.489377784729004, "epoch": 1.5981951009883972, "grad_norm": 1.0703125, "learning_rate": 0.0004927882891215413, "loss": 6.2995, "mean_token_accuracy": 0.1446702793240547, "num_tokens": 3379761.0, "step": 1860 }, { "entropy": 6.6347997188568115, "epoch": 1.6024924795874518, "grad_norm": 1.203125, "learning_rate": 0.0004927045480340475, "loss": 6.3729, "mean_token_accuracy": 0.13809221014380454, "num_tokens": 3388974.0, "step": 1865 }, { "entropy": 6.515362644195557, "epoch": 1.6067898581865063, "grad_norm": 0.9765625, "learning_rate": 0.0004926203315668363, "loss": 6.2995, "mean_token_accuracy": 0.14509507045149803, "num_tokens": 3398339.0, "step": 1870 }, { "entropy": 6.501726579666138, "epoch": 1.6110872367855609, "grad_norm": 1.046875, "learning_rate": 0.0004925356399038032, "loss": 6.2645, "mean_token_accuracy": 0.14561111479997635, "num_tokens": 3408292.0, "step": 1875 }, { "entropy": 6.528331470489502, "epoch": 1.6153846153846154, "grad_norm": 1.1484375, "learning_rate": 0.0004924504732298808, "loss": 6.2363, "mean_token_accuracy": 0.15578987523913385, "num_tokens": 3417057.0, "step": 1880 }, { "entropy": 6.547144651412964, "epoch": 1.61968199398367, "grad_norm": 1.0703125, "learning_rate": 0.0004923648317310391, "loss": 6.3436, "mean_token_accuracy": 0.1472199097275734, "num_tokens": 3425830.0, "step": 1885 }, { "entropy": 6.503617954254151, "epoch": 1.6239793725827245, "grad_norm": 0.98046875, "learning_rate": 0.0004922787155942849, "loss": 6.3929, "mean_token_accuracy": 0.13893435150384903, "num_tokens": 3435513.0, "step": 1890 }, { "entropy": 6.572265768051148, "epoch": 1.628276751181779, "grad_norm": 1.03125, "learning_rate": 0.0004921921250076611, "loss": 6.2966, "mean_token_accuracy": 0.14931443706154823, "num_tokens": 3444684.0, "step": 1895 }, { "entropy": 6.4495138168334964, "epoch": 1.6325741297808336, "grad_norm": 1.1015625, "learning_rate": 0.0004921050601602475, "loss": 6.3435, "mean_token_accuracy": 0.14741323441267012, "num_tokens": 3453454.0, "step": 1900 }, { "entropy": 6.556122159957885, "epoch": 1.6368715083798882, "grad_norm": 1.0546875, "learning_rate": 0.0004920175212421587, "loss": 6.2787, "mean_token_accuracy": 0.14662181138992308, "num_tokens": 3463228.0, "step": 1905 }, { "entropy": 6.366853141784668, "epoch": 1.6411688869789427, "grad_norm": 1.03125, "learning_rate": 0.0004919295084445445, "loss": 6.166, "mean_token_accuracy": 0.15177097618579866, "num_tokens": 3472131.0, "step": 1910 }, { "entropy": 6.485814142227173, "epoch": 1.6454662655779975, "grad_norm": 0.98828125, "learning_rate": 0.0004918410219595899, "loss": 6.2547, "mean_token_accuracy": 0.1523374244570732, "num_tokens": 3480642.0, "step": 1915 }, { "entropy": 6.621995449066162, "epoch": 1.649763644177052, "grad_norm": 0.9765625, "learning_rate": 0.000491752061980514, "loss": 6.2277, "mean_token_accuracy": 0.15280286371707916, "num_tokens": 3489346.0, "step": 1920 }, { "entropy": 6.4284903049469, "epoch": 1.6540610227761066, "grad_norm": 1.1015625, "learning_rate": 0.0004916626287015697, "loss": 6.2756, "mean_token_accuracy": 0.15068823397159575, "num_tokens": 3498473.0, "step": 1925 }, { "entropy": 6.515523910522461, "epoch": 1.658358401375161, "grad_norm": 1.0, "learning_rate": 0.0004915727223180436, "loss": 6.2738, "mean_token_accuracy": 0.142893535643816, "num_tokens": 3507415.0, "step": 1930 }, { "entropy": 6.528269815444946, "epoch": 1.6626557799742159, "grad_norm": 0.984375, "learning_rate": 0.0004914823430262554, "loss": 6.3984, "mean_token_accuracy": 0.1329946205019951, "num_tokens": 3516873.0, "step": 1935 }, { "entropy": 6.484966564178467, "epoch": 1.6669531585732704, "grad_norm": 1.140625, "learning_rate": 0.0004913914910235573, "loss": 6.2479, "mean_token_accuracy": 0.14868821799755097, "num_tokens": 3525047.0, "step": 1940 }, { "entropy": 6.448112821578979, "epoch": 1.671250537172325, "grad_norm": 1.0859375, "learning_rate": 0.0004913001665083337, "loss": 6.2685, "mean_token_accuracy": 0.14392302706837654, "num_tokens": 3534354.0, "step": 1945 }, { "entropy": 6.528091144561768, "epoch": 1.6755479157713795, "grad_norm": 1.2265625, "learning_rate": 0.0004912083696800008, "loss": 6.2926, "mean_token_accuracy": 0.14562170803546906, "num_tokens": 3543830.0, "step": 1950 }, { "entropy": 6.4218017578125, "epoch": 1.679845294370434, "grad_norm": 1.09375, "learning_rate": 0.0004911161007390063, "loss": 6.1933, "mean_token_accuracy": 0.14804754853248597, "num_tokens": 3552314.0, "step": 1955 }, { "entropy": 6.470229148864746, "epoch": 1.6841426729694886, "grad_norm": 1.1875, "learning_rate": 0.0004910233598868287, "loss": 6.2765, "mean_token_accuracy": 0.14477257579565048, "num_tokens": 3561656.0, "step": 1960 }, { "entropy": 6.467269372940064, "epoch": 1.6884400515685432, "grad_norm": 1.0625, "learning_rate": 0.0004909301473259769, "loss": 6.2641, "mean_token_accuracy": 0.14551830440759658, "num_tokens": 3571784.0, "step": 1965 }, { "entropy": 6.518259859085083, "epoch": 1.6927374301675977, "grad_norm": 1.0625, "learning_rate": 0.0004908364632599899, "loss": 6.228, "mean_token_accuracy": 0.15220747292041778, "num_tokens": 3580626.0, "step": 1970 }, { "entropy": 6.378790664672851, "epoch": 1.6970348087666522, "grad_norm": 1.046875, "learning_rate": 0.0004907423078934362, "loss": 6.2467, "mean_token_accuracy": 0.14601020216941835, "num_tokens": 3589916.0, "step": 1975 }, { "entropy": 6.473833656311035, "epoch": 1.7013321873657068, "grad_norm": 1.0078125, "learning_rate": 0.0004906476814319134, "loss": 6.2572, "mean_token_accuracy": 0.14930620267987252, "num_tokens": 3599128.0, "step": 1980 }, { "entropy": 6.429199600219727, "epoch": 1.7056295659647613, "grad_norm": 0.9140625, "learning_rate": 0.0004905525840820481, "loss": 6.2686, "mean_token_accuracy": 0.1471567466855049, "num_tokens": 3608764.0, "step": 1985 }, { "entropy": 6.58309121131897, "epoch": 1.709926944563816, "grad_norm": 0.9453125, "learning_rate": 0.0004904570160514948, "loss": 6.3077, "mean_token_accuracy": 0.14043890461325645, "num_tokens": 3619082.0, "step": 1990 }, { "entropy": 6.45733323097229, "epoch": 1.7142243231628707, "grad_norm": 1.140625, "learning_rate": 0.0004903609775489358, "loss": 6.2682, "mean_token_accuracy": 0.14586469754576684, "num_tokens": 3628695.0, "step": 1995 }, { "entropy": 6.511290454864502, "epoch": 1.7185217017619252, "grad_norm": 1.015625, "learning_rate": 0.0004902644687840809, "loss": 6.267, "mean_token_accuracy": 0.14717549681663514, "num_tokens": 3637599.0, "step": 2000 }, { "epoch": 1.7185217017619252, "eval_entropy": 6.214308420817058, "eval_loss": 6.331518173217773, "eval_mean_token_accuracy": 0.14971260959702032, "eval_num_tokens": 3637599.0, "eval_runtime": 2.0415, "eval_samples_per_second": 1738.466, "eval_steps_per_second": 217.492, "step": 2000 }, { "entropy": 6.427486324310303, "epoch": 1.7228190803609797, "grad_norm": 1.1484375, "learning_rate": 0.0004901674899676667, "loss": 6.2449, "mean_token_accuracy": 0.14803531616926194, "num_tokens": 3647406.0, "step": 2005 }, { "entropy": 6.416431045532226, "epoch": 1.7271164589600345, "grad_norm": 1.03125, "learning_rate": 0.0004900700413114561, "loss": 6.1252, "mean_token_accuracy": 0.15068818926811217, "num_tokens": 3656531.0, "step": 2010 }, { "entropy": 6.388833618164062, "epoch": 1.731413837559089, "grad_norm": 1.0078125, "learning_rate": 0.000489972123028238, "loss": 6.2244, "mean_token_accuracy": 0.1465991474688053, "num_tokens": 3664922.0, "step": 2015 }, { "entropy": 6.502804613113403, "epoch": 1.7357112161581436, "grad_norm": 1.0234375, "learning_rate": 0.0004898737353318268, "loss": 6.1557, "mean_token_accuracy": 0.1519090563058853, "num_tokens": 3673283.0, "step": 2020 }, { "entropy": 6.377015924453735, "epoch": 1.7400085947571982, "grad_norm": 1.125, "learning_rate": 0.000489774878437062, "loss": 6.298, "mean_token_accuracy": 0.15162839442491532, "num_tokens": 3681760.0, "step": 2025 }, { "entropy": 6.46599555015564, "epoch": 1.7443059733562527, "grad_norm": 1.078125, "learning_rate": 0.0004896755525598074, "loss": 6.1178, "mean_token_accuracy": 0.15259039252996445, "num_tokens": 3689408.0, "step": 2030 }, { "entropy": 6.4247987270355225, "epoch": 1.7486033519553073, "grad_norm": 1.109375, "learning_rate": 0.0004895757579169511, "loss": 6.234, "mean_token_accuracy": 0.14994207322597503, "num_tokens": 3697904.0, "step": 2035 }, { "entropy": 6.579666042327881, "epoch": 1.7529007305543618, "grad_norm": 1.0078125, "learning_rate": 0.0004894754947264047, "loss": 6.2504, "mean_token_accuracy": 0.15150809586048125, "num_tokens": 3706704.0, "step": 2040 }, { "entropy": 6.433872127532959, "epoch": 1.7571981091534163, "grad_norm": 1.109375, "learning_rate": 0.000489374763207103, "loss": 6.3286, "mean_token_accuracy": 0.14471730291843415, "num_tokens": 3715690.0, "step": 2045 }, { "entropy": 6.465651893615723, "epoch": 1.761495487752471, "grad_norm": 1.109375, "learning_rate": 0.0004892735635790033, "loss": 6.125, "mean_token_accuracy": 0.15927532613277434, "num_tokens": 3724835.0, "step": 2050 }, { "entropy": 6.368647861480713, "epoch": 1.7657928663515254, "grad_norm": 0.94140625, "learning_rate": 0.000489171896063085, "loss": 6.1498, "mean_token_accuracy": 0.157290717959404, "num_tokens": 3733977.0, "step": 2055 }, { "entropy": 6.458992671966553, "epoch": 1.77009024495058, "grad_norm": 1.078125, "learning_rate": 0.0004890697608813495, "loss": 6.2682, "mean_token_accuracy": 0.14064312726259232, "num_tokens": 3742665.0, "step": 2060 }, { "entropy": 6.583484077453614, "epoch": 1.7743876235496348, "grad_norm": 1.078125, "learning_rate": 0.0004889671582568193, "loss": 6.3367, "mean_token_accuracy": 0.14621492847800255, "num_tokens": 3751647.0, "step": 2065 }, { "entropy": 6.387417125701904, "epoch": 1.7786850021486893, "grad_norm": 1.140625, "learning_rate": 0.0004888640884135374, "loss": 6.2386, "mean_token_accuracy": 0.1474798172712326, "num_tokens": 3760852.0, "step": 2070 }, { "entropy": 6.3953369617462155, "epoch": 1.7829823807477438, "grad_norm": 1.25, "learning_rate": 0.0004887605515765671, "loss": 6.1913, "mean_token_accuracy": 0.15439595878124238, "num_tokens": 3768640.0, "step": 2075 }, { "entropy": 6.503360080718994, "epoch": 1.7872797593467986, "grad_norm": 1.0546875, "learning_rate": 0.0004886565479719914, "loss": 6.2177, "mean_token_accuracy": 0.14689500331878663, "num_tokens": 3776504.0, "step": 2080 }, { "entropy": 6.52859411239624, "epoch": 1.7915771379458532, "grad_norm": 1.1875, "learning_rate": 0.0004885520778269128, "loss": 6.2515, "mean_token_accuracy": 0.1499434307217598, "num_tokens": 3786353.0, "step": 2085 }, { "entropy": 6.410916137695312, "epoch": 1.7958745165449077, "grad_norm": 1.0859375, "learning_rate": 0.0004884471413694523, "loss": 6.2783, "mean_token_accuracy": 0.15109124332666396, "num_tokens": 3795902.0, "step": 2090 }, { "entropy": 6.470384979248047, "epoch": 1.8001718951439623, "grad_norm": 0.9140625, "learning_rate": 0.0004883417388287491, "loss": 6.194, "mean_token_accuracy": 0.1435760647058487, "num_tokens": 3805986.0, "step": 2095 }, { "entropy": 6.400091123580933, "epoch": 1.8044692737430168, "grad_norm": 1.140625, "learning_rate": 0.0004882358704349603, "loss": 6.3188, "mean_token_accuracy": 0.1500417910516262, "num_tokens": 3814915.0, "step": 2100 }, { "entropy": 6.456367015838623, "epoch": 1.8087666523420713, "grad_norm": 1.15625, "learning_rate": 0.0004881295364192601, "loss": 6.2089, "mean_token_accuracy": 0.15894449651241302, "num_tokens": 3823966.0, "step": 2105 }, { "entropy": 6.510165739059448, "epoch": 1.813064030941126, "grad_norm": 1.0078125, "learning_rate": 0.0004880227370138394, "loss": 6.2729, "mean_token_accuracy": 0.142085450142622, "num_tokens": 3832775.0, "step": 2110 }, { "entropy": 6.3983588218688965, "epoch": 1.8173614095401804, "grad_norm": 0.8984375, "learning_rate": 0.0004879154724519057, "loss": 6.1809, "mean_token_accuracy": 0.15120477825403214, "num_tokens": 3842808.0, "step": 2115 }, { "entropy": 6.493490934371948, "epoch": 1.821658788139235, "grad_norm": 1.046875, "learning_rate": 0.0004878077429676816, "loss": 6.3143, "mean_token_accuracy": 0.14699392020702362, "num_tokens": 3853303.0, "step": 2120 }, { "entropy": 6.4460196018219, "epoch": 1.8259561667382895, "grad_norm": 1.046875, "learning_rate": 0.0004876995487964054, "loss": 6.2277, "mean_token_accuracy": 0.13867998719215394, "num_tokens": 3862462.0, "step": 2125 }, { "entropy": 6.459061241149902, "epoch": 1.830253545337344, "grad_norm": 1.0234375, "learning_rate": 0.00048759089017432996, "loss": 6.3388, "mean_token_accuracy": 0.14455281794071198, "num_tokens": 3871596.0, "step": 2130 }, { "entropy": 6.482069444656372, "epoch": 1.8345509239363988, "grad_norm": 1.015625, "learning_rate": 0.0004874817673387222, "loss": 6.2427, "mean_token_accuracy": 0.14856942594051362, "num_tokens": 3881276.0, "step": 2135 }, { "entropy": 6.43566927909851, "epoch": 1.8388483025354534, "grad_norm": 0.96875, "learning_rate": 0.00048737218052786275, "loss": 6.33, "mean_token_accuracy": 0.14330809488892554, "num_tokens": 3891610.0, "step": 2140 }, { "entropy": 6.498207521438599, "epoch": 1.843145681134508, "grad_norm": 0.98046875, "learning_rate": 0.00048726212998104554, "loss": 6.2531, "mean_token_accuracy": 0.14796748533844947, "num_tokens": 3900584.0, "step": 2145 }, { "entropy": 6.405120611190796, "epoch": 1.8474430597335625, "grad_norm": 1.0390625, "learning_rate": 0.0004871516159385768, "loss": 6.1817, "mean_token_accuracy": 0.1539264902472496, "num_tokens": 3910208.0, "step": 2150 }, { "entropy": 6.320563936233521, "epoch": 1.8517404383326173, "grad_norm": 1.1015625, "learning_rate": 0.0004870406386417752, "loss": 6.1061, "mean_token_accuracy": 0.15697987973690034, "num_tokens": 3918424.0, "step": 2155 }, { "entropy": 6.313277053833008, "epoch": 1.8560378169316718, "grad_norm": 1.0859375, "learning_rate": 0.0004869291983329707, "loss": 6.047, "mean_token_accuracy": 0.17023974657058716, "num_tokens": 3926206.0, "step": 2160 }, { "entropy": 6.473067951202393, "epoch": 1.8603351955307263, "grad_norm": 1.046875, "learning_rate": 0.0004868172952555044, "loss": 6.1485, "mean_token_accuracy": 0.14482472315430642, "num_tokens": 3935769.0, "step": 2165 }, { "entropy": 6.363153123855591, "epoch": 1.864632574129781, "grad_norm": 0.9453125, "learning_rate": 0.0004867049296537278, "loss": 6.1373, "mean_token_accuracy": 0.1534383252263069, "num_tokens": 3945118.0, "step": 2170 }, { "entropy": 6.399164772033691, "epoch": 1.8689299527288354, "grad_norm": 1.2578125, "learning_rate": 0.0004865921017730027, "loss": 6.2358, "mean_token_accuracy": 0.15296792089939118, "num_tokens": 3954012.0, "step": 2175 }, { "entropy": 6.471106052398682, "epoch": 1.87322733132789, "grad_norm": 0.94140625, "learning_rate": 0.00048647881185969995, "loss": 6.2355, "mean_token_accuracy": 0.15060990452766418, "num_tokens": 3964239.0, "step": 2180 }, { "entropy": 6.386410093307495, "epoch": 1.8775247099269445, "grad_norm": 1.015625, "learning_rate": 0.0004863650601611994, "loss": 6.1502, "mean_token_accuracy": 0.15660223215818406, "num_tokens": 3973694.0, "step": 2185 }, { "entropy": 6.372910404205323, "epoch": 1.881822088525999, "grad_norm": 1.0703125, "learning_rate": 0.00048625084692588937, "loss": 6.185, "mean_token_accuracy": 0.15601919442415238, "num_tokens": 3982706.0, "step": 2190 }, { "entropy": 6.401282548904419, "epoch": 1.8861194671250536, "grad_norm": 1.09375, "learning_rate": 0.00048613617240316593, "loss": 6.138, "mean_token_accuracy": 0.15665835291147232, "num_tokens": 3990934.0, "step": 2195 }, { "entropy": 6.4126348972320555, "epoch": 1.8904168457241082, "grad_norm": 1.0390625, "learning_rate": 0.0004860210368434323, "loss": 6.192, "mean_token_accuracy": 0.1556440055370331, "num_tokens": 3999864.0, "step": 2200 }, { "entropy": 6.424229860305786, "epoch": 1.8947142243231627, "grad_norm": 0.9765625, "learning_rate": 0.00048590544049809857, "loss": 6.1968, "mean_token_accuracy": 0.15178433507680894, "num_tokens": 4008273.0, "step": 2205 }, { "entropy": 6.427778577804565, "epoch": 1.8990116029222175, "grad_norm": 0.99609375, "learning_rate": 0.000485789383619581, "loss": 6.2178, "mean_token_accuracy": 0.1559001922607422, "num_tokens": 4017697.0, "step": 2210 }, { "entropy": 6.4254296779632565, "epoch": 1.903308981521272, "grad_norm": 1.09375, "learning_rate": 0.0004856728664613015, "loss": 6.2293, "mean_token_accuracy": 0.14589258283376694, "num_tokens": 4026775.0, "step": 2215 }, { "entropy": 6.351989793777466, "epoch": 1.9076063601203266, "grad_norm": 1.03125, "learning_rate": 0.00048555588927768674, "loss": 6.1972, "mean_token_accuracy": 0.15271373167634011, "num_tokens": 4036476.0, "step": 2220 }, { "entropy": 6.473893165588379, "epoch": 1.9119037387193811, "grad_norm": 1.109375, "learning_rate": 0.0004854384523241683, "loss": 6.204, "mean_token_accuracy": 0.15081721246242524, "num_tokens": 4045221.0, "step": 2225 }, { "entropy": 6.310385704040527, "epoch": 1.916201117318436, "grad_norm": 1.0078125, "learning_rate": 0.00048532055585718143, "loss": 6.1112, "mean_token_accuracy": 0.15869007259607315, "num_tokens": 4053754.0, "step": 2230 }, { "entropy": 6.390126276016235, "epoch": 1.9204984959174904, "grad_norm": 1.1015625, "learning_rate": 0.00048520220013416505, "loss": 6.1455, "mean_token_accuracy": 0.15594211518764495, "num_tokens": 4061730.0, "step": 2235 }, { "entropy": 6.3809610366821286, "epoch": 1.924795874516545, "grad_norm": 1.0390625, "learning_rate": 0.0004850833854135607, "loss": 6.197, "mean_token_accuracy": 0.15130506530404092, "num_tokens": 4070501.0, "step": 2240 }, { "entropy": 6.420936059951782, "epoch": 1.9290932531155995, "grad_norm": 0.9296875, "learning_rate": 0.0004849641119548122, "loss": 6.2763, "mean_token_accuracy": 0.1485205315053463, "num_tokens": 4079621.0, "step": 2245 }, { "entropy": 6.4735170841217045, "epoch": 1.933390631714654, "grad_norm": 1.046875, "learning_rate": 0.000484844380018365, "loss": 6.2663, "mean_token_accuracy": 0.14868344217538834, "num_tokens": 4090106.0, "step": 2250 }, { "entropy": 6.461083984375, "epoch": 1.9376880103137086, "grad_norm": 1.0, "learning_rate": 0.000484724189865666, "loss": 6.1985, "mean_token_accuracy": 0.1501224085688591, "num_tokens": 4099269.0, "step": 2255 }, { "entropy": 6.287312364578247, "epoch": 1.9419853889127632, "grad_norm": 1.046875, "learning_rate": 0.0004846035417591624, "loss": 6.1351, "mean_token_accuracy": 0.1544906511902809, "num_tokens": 4108414.0, "step": 2260 }, { "entropy": 6.426730060577393, "epoch": 1.9462827675118177, "grad_norm": 1.1328125, "learning_rate": 0.0004844824359623014, "loss": 6.2629, "mean_token_accuracy": 0.14584496468305588, "num_tokens": 4117731.0, "step": 2265 }, { "entropy": 6.451971340179443, "epoch": 1.9505801461108723, "grad_norm": 1.0703125, "learning_rate": 0.00048436087273952966, "loss": 6.2441, "mean_token_accuracy": 0.14279974550008773, "num_tokens": 4127194.0, "step": 2270 }, { "entropy": 6.396147346496582, "epoch": 1.9548775247099268, "grad_norm": 1.09375, "learning_rate": 0.00048423885235629265, "loss": 6.193, "mean_token_accuracy": 0.15773467123508453, "num_tokens": 4135594.0, "step": 2275 }, { "entropy": 6.39124755859375, "epoch": 1.9591749033089814, "grad_norm": 1.0, "learning_rate": 0.0004841163750790342, "loss": 6.2256, "mean_token_accuracy": 0.15189721137285234, "num_tokens": 4145027.0, "step": 2280 }, { "entropy": 6.383194398880005, "epoch": 1.9634722819080361, "grad_norm": 0.99609375, "learning_rate": 0.00048399344117519555, "loss": 6.087, "mean_token_accuracy": 0.15884610414505004, "num_tokens": 4153754.0, "step": 2285 }, { "entropy": 6.330159759521484, "epoch": 1.9677696605070907, "grad_norm": 0.99609375, "learning_rate": 0.00048387005091321544, "loss": 6.1553, "mean_token_accuracy": 0.15946451872587203, "num_tokens": 4162765.0, "step": 2290 }, { "entropy": 6.414357376098633, "epoch": 1.9720670391061452, "grad_norm": 1.140625, "learning_rate": 0.00048374620456252877, "loss": 6.1748, "mean_token_accuracy": 0.1570574849843979, "num_tokens": 4171589.0, "step": 2295 }, { "entropy": 6.360631132125855, "epoch": 1.9763644177052, "grad_norm": 1.015625, "learning_rate": 0.00048362190239356644, "loss": 6.1913, "mean_token_accuracy": 0.155552938580513, "num_tokens": 4181817.0, "step": 2300 }, { "entropy": 6.352840518951416, "epoch": 1.9806617963042545, "grad_norm": 0.91796875, "learning_rate": 0.00048349714467775474, "loss": 6.1462, "mean_token_accuracy": 0.1511269122362137, "num_tokens": 4191350.0, "step": 2305 }, { "entropy": 6.3630085468292235, "epoch": 1.984959174903309, "grad_norm": 1.046875, "learning_rate": 0.00048337193168751464, "loss": 6.1935, "mean_token_accuracy": 0.1461350604891777, "num_tokens": 4199888.0, "step": 2310 }, { "entropy": 6.447411775588989, "epoch": 1.9892565535023636, "grad_norm": 1.1171875, "learning_rate": 0.0004832462636962613, "loss": 6.1829, "mean_token_accuracy": 0.1507252760231495, "num_tokens": 4209509.0, "step": 2315 }, { "entropy": 6.372689247131348, "epoch": 1.9935539321014182, "grad_norm": 1.09375, "learning_rate": 0.0004831201409784034, "loss": 6.1215, "mean_token_accuracy": 0.15712654441595078, "num_tokens": 4218496.0, "step": 2320 }, { "entropy": 6.357889032363891, "epoch": 1.9978513107004727, "grad_norm": 0.99609375, "learning_rate": 0.0004829935638093424, "loss": 6.1463, "mean_token_accuracy": 0.15369027704000474, "num_tokens": 4227504.0, "step": 2325 }, { "entropy": 6.373083750406901, "epoch": 2.0017189514396216, "grad_norm": 1.046875, "learning_rate": 0.0004828665324654724, "loss": 6.0581, "mean_token_accuracy": 0.15794145895375145, "num_tokens": 4235338.0, "step": 2330 }, { "entropy": 6.4267494678497314, "epoch": 2.006016330038676, "grad_norm": 0.9765625, "learning_rate": 0.0004827390472241791, "loss": 5.8418, "mean_token_accuracy": 0.16316850185394288, "num_tokens": 4244905.0, "step": 2335 }, { "entropy": 6.314910984039306, "epoch": 2.010313708637731, "grad_norm": 0.9375, "learning_rate": 0.0004826111083638392, "loss": 5.9211, "mean_token_accuracy": 0.1677140362560749, "num_tokens": 4254533.0, "step": 2340 }, { "entropy": 6.370204210281372, "epoch": 2.0146110872367857, "grad_norm": 0.98828125, "learning_rate": 0.00048248271616382, "loss": 5.8961, "mean_token_accuracy": 0.16431671380996704, "num_tokens": 4264023.0, "step": 2345 }, { "entropy": 6.326271295547485, "epoch": 2.0189084658358403, "grad_norm": 1.015625, "learning_rate": 0.00048235387090447894, "loss": 5.9306, "mean_token_accuracy": 0.1572665750980377, "num_tokens": 4273298.0, "step": 2350 }, { "entropy": 6.378605699539184, "epoch": 2.023205844434895, "grad_norm": 1.0390625, "learning_rate": 0.00048222457286716235, "loss": 5.8756, "mean_token_accuracy": 0.16723261177539825, "num_tokens": 4283244.0, "step": 2355 }, { "entropy": 6.322220325469971, "epoch": 2.0275032230339494, "grad_norm": 1.140625, "learning_rate": 0.00048209482233420564, "loss": 5.8185, "mean_token_accuracy": 0.1769508183002472, "num_tokens": 4291677.0, "step": 2360 }, { "entropy": 6.314945793151855, "epoch": 2.031800601633004, "grad_norm": 1.0546875, "learning_rate": 0.000481964619588932, "loss": 5.8793, "mean_token_accuracy": 0.16825687736272812, "num_tokens": 4300822.0, "step": 2365 }, { "entropy": 6.339528942108155, "epoch": 2.0360979802320585, "grad_norm": 1.0859375, "learning_rate": 0.0004818339649156523, "loss": 5.8876, "mean_token_accuracy": 0.16732898950576783, "num_tokens": 4310149.0, "step": 2370 }, { "entropy": 6.19782075881958, "epoch": 2.040395358831113, "grad_norm": 1.0078125, "learning_rate": 0.00048170285859966395, "loss": 5.7924, "mean_token_accuracy": 0.17466236799955367, "num_tokens": 4319109.0, "step": 2375 }, { "entropy": 6.3286045551300045, "epoch": 2.0446927374301676, "grad_norm": 0.984375, "learning_rate": 0.00048157130092725087, "loss": 5.7843, "mean_token_accuracy": 0.1704682469367981, "num_tokens": 4327921.0, "step": 2380 }, { "entropy": 6.329291915893554, "epoch": 2.048990116029222, "grad_norm": 1.0234375, "learning_rate": 0.0004814392921856824, "loss": 5.9287, "mean_token_accuracy": 0.16586144566535949, "num_tokens": 4338026.0, "step": 2385 }, { "entropy": 6.2563072681427006, "epoch": 2.0532874946282766, "grad_norm": 0.95703125, "learning_rate": 0.0004813068326632128, "loss": 5.7762, "mean_token_accuracy": 0.17654864937067033, "num_tokens": 4347794.0, "step": 2390 }, { "entropy": 6.329816913604736, "epoch": 2.057584873227331, "grad_norm": 1.078125, "learning_rate": 0.0004811739226490809, "loss": 5.9557, "mean_token_accuracy": 0.16758598685264586, "num_tokens": 4357249.0, "step": 2395 }, { "entropy": 6.283816623687744, "epoch": 2.0618822518263857, "grad_norm": 1.0625, "learning_rate": 0.00048104056243350896, "loss": 5.9041, "mean_token_accuracy": 0.16363563090562822, "num_tokens": 4366053.0, "step": 2400 }, { "entropy": 6.297672891616822, "epoch": 2.0661796304254403, "grad_norm": 0.98046875, "learning_rate": 0.0004809067523077023, "loss": 5.9163, "mean_token_accuracy": 0.16945113092660904, "num_tokens": 4375543.0, "step": 2405 }, { "entropy": 6.2845330238342285, "epoch": 2.0704770090244953, "grad_norm": 1.0625, "learning_rate": 0.00048077249256384884, "loss": 5.8006, "mean_token_accuracy": 0.17305675595998765, "num_tokens": 4384332.0, "step": 2410 }, { "entropy": 6.210544061660767, "epoch": 2.07477438762355, "grad_norm": 1.1953125, "learning_rate": 0.0004806377834951182, "loss": 5.8994, "mean_token_accuracy": 0.16216432005167009, "num_tokens": 4393670.0, "step": 2415 }, { "entropy": 6.373771142959595, "epoch": 2.0790717662226044, "grad_norm": 1.1328125, "learning_rate": 0.00048050262539566104, "loss": 5.9012, "mean_token_accuracy": 0.16862600147724152, "num_tokens": 4402763.0, "step": 2420 }, { "entropy": 6.269940948486328, "epoch": 2.083369144821659, "grad_norm": 0.984375, "learning_rate": 0.0004803670185606087, "loss": 5.8086, "mean_token_accuracy": 0.17335692346096038, "num_tokens": 4411863.0, "step": 2425 }, { "entropy": 6.265923166275025, "epoch": 2.0876665234207135, "grad_norm": 1.078125, "learning_rate": 0.0004802309632860724, "loss": 5.9059, "mean_token_accuracy": 0.16651569604873656, "num_tokens": 4421110.0, "step": 2430 }, { "entropy": 6.352302503585816, "epoch": 2.091963902019768, "grad_norm": 1.0390625, "learning_rate": 0.00048009445986914236, "loss": 5.8854, "mean_token_accuracy": 0.16589637845754623, "num_tokens": 4430249.0, "step": 2435 }, { "entropy": 6.263960170745849, "epoch": 2.0962612806188226, "grad_norm": 1.0078125, "learning_rate": 0.00047995750860788756, "loss": 5.8661, "mean_token_accuracy": 0.15910358875989913, "num_tokens": 4439686.0, "step": 2440 }, { "entropy": 6.227327108383179, "epoch": 2.100558659217877, "grad_norm": 1.1796875, "learning_rate": 0.0004798201098013547, "loss": 5.8709, "mean_token_accuracy": 0.1692453533411026, "num_tokens": 4448645.0, "step": 2445 }, { "entropy": 6.291311168670655, "epoch": 2.1048560378169316, "grad_norm": 0.96484375, "learning_rate": 0.00047968226374956797, "loss": 5.8333, "mean_token_accuracy": 0.1675017699599266, "num_tokens": 4456870.0, "step": 2450 }, { "entropy": 6.195930767059326, "epoch": 2.109153416415986, "grad_norm": 1.03125, "learning_rate": 0.00047954397075352794, "loss": 5.8684, "mean_token_accuracy": 0.17277338951826096, "num_tokens": 4466287.0, "step": 2455 }, { "entropy": 6.2388382911682125, "epoch": 2.1134507950150407, "grad_norm": 1.0703125, "learning_rate": 0.00047940523111521136, "loss": 5.7553, "mean_token_accuracy": 0.17395039051771163, "num_tokens": 4474461.0, "step": 2460 }, { "entropy": 6.255577421188354, "epoch": 2.1177481736140953, "grad_norm": 1.1875, "learning_rate": 0.0004792660451375701, "loss": 5.835, "mean_token_accuracy": 0.16953630596399308, "num_tokens": 4483002.0, "step": 2465 }, { "entropy": 6.224816513061524, "epoch": 2.12204555221315, "grad_norm": 1.0859375, "learning_rate": 0.00047912641312453064, "loss": 5.8459, "mean_token_accuracy": 0.1695180580019951, "num_tokens": 4492061.0, "step": 2470 }, { "entropy": 6.284405374526978, "epoch": 2.1263429308122044, "grad_norm": 0.9375, "learning_rate": 0.00047898633538099363, "loss": 5.8957, "mean_token_accuracy": 0.16090027987957, "num_tokens": 4501829.0, "step": 2475 }, { "entropy": 6.258666229248047, "epoch": 2.130640309411259, "grad_norm": 0.98828125, "learning_rate": 0.0004788458122128327, "loss": 5.9181, "mean_token_accuracy": 0.1656097248196602, "num_tokens": 4511539.0, "step": 2480 }, { "entropy": 6.246809720993042, "epoch": 2.134937688010314, "grad_norm": 1.0625, "learning_rate": 0.00047870484392689434, "loss": 5.7722, "mean_token_accuracy": 0.1671189084649086, "num_tokens": 4520425.0, "step": 2485 }, { "entropy": 6.220279026031494, "epoch": 2.1392350666093685, "grad_norm": 1.0859375, "learning_rate": 0.000478563430830997, "loss": 5.8751, "mean_token_accuracy": 0.16446918100118638, "num_tokens": 4529474.0, "step": 2490 }, { "entropy": 6.2571605205535885, "epoch": 2.143532445208423, "grad_norm": 1.0546875, "learning_rate": 0.00047842157323393035, "loss": 5.8041, "mean_token_accuracy": 0.1694269135594368, "num_tokens": 4538082.0, "step": 2495 }, { "entropy": 6.218803596496582, "epoch": 2.1478298238074776, "grad_norm": 1.015625, "learning_rate": 0.0004782792714454547, "loss": 5.9987, "mean_token_accuracy": 0.16337930560112, "num_tokens": 4547340.0, "step": 2500 }, { "epoch": 2.1478298238074776, "eval_entropy": 6.073525357890773, "eval_loss": 6.213027477264404, "eval_mean_token_accuracy": 0.15643914548999016, "eval_num_tokens": 4547340.0, "eval_runtime": 2.0452, "eval_samples_per_second": 1735.325, "eval_steps_per_second": 217.099, "step": 2500 }, { "entropy": 6.266714763641358, "epoch": 2.152127202406532, "grad_norm": 1.1171875, "learning_rate": 0.0004781365257763002, "loss": 5.8423, "mean_token_accuracy": 0.16869749277830123, "num_tokens": 4556415.0, "step": 2505 }, { "entropy": 6.1728370666503904, "epoch": 2.1564245810055866, "grad_norm": 1.28125, "learning_rate": 0.00047799333653816633, "loss": 5.7293, "mean_token_accuracy": 0.17461720257997512, "num_tokens": 4565156.0, "step": 2510 }, { "entropy": 6.233670806884765, "epoch": 2.160721959604641, "grad_norm": 1.0703125, "learning_rate": 0.00047784970404372124, "loss": 5.8327, "mean_token_accuracy": 0.16848449259996415, "num_tokens": 4574678.0, "step": 2515 }, { "entropy": 6.12764801979065, "epoch": 2.1650193382036957, "grad_norm": 1.1171875, "learning_rate": 0.00047770562860660083, "loss": 5.854, "mean_token_accuracy": 0.16377500146627427, "num_tokens": 4583253.0, "step": 2520 }, { "entropy": 6.273917770385742, "epoch": 2.1693167168027503, "grad_norm": 0.91796875, "learning_rate": 0.0004775611105414083, "loss": 5.9138, "mean_token_accuracy": 0.16056130826473236, "num_tokens": 4594042.0, "step": 2525 }, { "entropy": 6.210309171676636, "epoch": 2.173614095401805, "grad_norm": 0.98828125, "learning_rate": 0.0004774161501637133, "loss": 5.8661, "mean_token_accuracy": 0.16690902709960936, "num_tokens": 4603128.0, "step": 2530 }, { "entropy": 6.207437753677368, "epoch": 2.1779114740008594, "grad_norm": 1.234375, "learning_rate": 0.0004772707477900514, "loss": 5.8489, "mean_token_accuracy": 0.17330004572868346, "num_tokens": 4611537.0, "step": 2535 }, { "entropy": 6.316633796691894, "epoch": 2.182208852599914, "grad_norm": 1.09375, "learning_rate": 0.0004771249037379232, "loss": 5.9518, "mean_token_accuracy": 0.1604529470205307, "num_tokens": 4622481.0, "step": 2540 }, { "entropy": 6.174561834335327, "epoch": 2.1865062311989685, "grad_norm": 1.0625, "learning_rate": 0.0004769786183257939, "loss": 5.8564, "mean_token_accuracy": 0.17447448074817656, "num_tokens": 4631259.0, "step": 2545 }, { "entropy": 6.186811542510986, "epoch": 2.190803609798023, "grad_norm": 1.0859375, "learning_rate": 0.0004768318918730924, "loss": 5.7986, "mean_token_accuracy": 0.1752243533730507, "num_tokens": 4640266.0, "step": 2550 }, { "entropy": 6.212873888015747, "epoch": 2.195100988397078, "grad_norm": 1.046875, "learning_rate": 0.00047668472470021044, "loss": 5.853, "mean_token_accuracy": 0.16329605877399445, "num_tokens": 4649520.0, "step": 2555 }, { "entropy": 6.257145929336548, "epoch": 2.1993983669961326, "grad_norm": 1.03125, "learning_rate": 0.0004765371171285025, "loss": 5.8079, "mean_token_accuracy": 0.1733356922864914, "num_tokens": 4658501.0, "step": 2560 }, { "entropy": 6.108858823776245, "epoch": 2.203695745595187, "grad_norm": 1.0546875, "learning_rate": 0.00047638906948028445, "loss": 5.8536, "mean_token_accuracy": 0.16747843474149704, "num_tokens": 4667567.0, "step": 2565 }, { "entropy": 6.222007703781128, "epoch": 2.2079931241942417, "grad_norm": 1.1640625, "learning_rate": 0.00047624058207883317, "loss": 5.8596, "mean_token_accuracy": 0.16799781173467637, "num_tokens": 4676618.0, "step": 2570 }, { "entropy": 6.326595973968506, "epoch": 2.212290502793296, "grad_norm": 1.0, "learning_rate": 0.00047609165524838576, "loss": 5.921, "mean_token_accuracy": 0.16489885598421097, "num_tokens": 4685967.0, "step": 2575 }, { "entropy": 6.112624216079712, "epoch": 2.2165878813923507, "grad_norm": 1.2421875, "learning_rate": 0.0004759422893141389, "loss": 5.8098, "mean_token_accuracy": 0.17214897125959397, "num_tokens": 4694568.0, "step": 2580 }, { "entropy": 6.23127293586731, "epoch": 2.2208852599914053, "grad_norm": 1.0859375, "learning_rate": 0.0004757924846022482, "loss": 5.8764, "mean_token_accuracy": 0.1683722823858261, "num_tokens": 4703648.0, "step": 2585 }, { "entropy": 6.2149560928344725, "epoch": 2.22518263859046, "grad_norm": 1.171875, "learning_rate": 0.00047564224143982714, "loss": 5.7317, "mean_token_accuracy": 0.18064576983451844, "num_tokens": 4712444.0, "step": 2590 }, { "entropy": 6.195422506332397, "epoch": 2.2294800171895144, "grad_norm": 1.1796875, "learning_rate": 0.00047549156015494676, "loss": 5.887, "mean_token_accuracy": 0.16564202010631562, "num_tokens": 4722034.0, "step": 2595 }, { "entropy": 6.179683208465576, "epoch": 2.233777395788569, "grad_norm": 1.046875, "learning_rate": 0.00047534044107663484, "loss": 5.9075, "mean_token_accuracy": 0.16279049664735795, "num_tokens": 4731344.0, "step": 2600 }, { "entropy": 6.295088148117065, "epoch": 2.2380747743876235, "grad_norm": 1.15625, "learning_rate": 0.00047518888453487496, "loss": 5.809, "mean_token_accuracy": 0.17704246044158936, "num_tokens": 4739302.0, "step": 2605 }, { "entropy": 6.1531964302062985, "epoch": 2.242372152986678, "grad_norm": 0.98046875, "learning_rate": 0.0004750368908606061, "loss": 5.9282, "mean_token_accuracy": 0.16434444785118102, "num_tokens": 4748848.0, "step": 2610 }, { "entropy": 6.262106943130493, "epoch": 2.2466695315857326, "grad_norm": 0.99609375, "learning_rate": 0.00047488446038572164, "loss": 5.9816, "mean_token_accuracy": 0.16012711673974991, "num_tokens": 4758194.0, "step": 2615 }, { "entropy": 6.268323373794556, "epoch": 2.250966910184787, "grad_norm": 1.1171875, "learning_rate": 0.0004747315934430688, "loss": 5.8908, "mean_token_accuracy": 0.164437834918499, "num_tokens": 4768081.0, "step": 2620 }, { "entropy": 6.122048091888428, "epoch": 2.2552642887838417, "grad_norm": 1.1328125, "learning_rate": 0.000474578290366448, "loss": 5.8245, "mean_token_accuracy": 0.1705750197172165, "num_tokens": 4776471.0, "step": 2625 }, { "entropy": 6.204921579360962, "epoch": 2.259561667382896, "grad_norm": 1.09375, "learning_rate": 0.0004744245514906117, "loss": 5.8253, "mean_token_accuracy": 0.1741186946630478, "num_tokens": 4784403.0, "step": 2630 }, { "entropy": 6.1283422946929935, "epoch": 2.263859045981951, "grad_norm": 1.1171875, "learning_rate": 0.00047427037715126426, "loss": 5.8029, "mean_token_accuracy": 0.16940733194351196, "num_tokens": 4792779.0, "step": 2635 }, { "entropy": 6.132787275314331, "epoch": 2.2681564245810057, "grad_norm": 0.9921875, "learning_rate": 0.0004741157676850608, "loss": 5.7827, "mean_token_accuracy": 0.1744200199842453, "num_tokens": 4801426.0, "step": 2640 }, { "entropy": 6.2156031131744385, "epoch": 2.2724538031800603, "grad_norm": 1.2578125, "learning_rate": 0.00047396072342960663, "loss": 5.8338, "mean_token_accuracy": 0.16472329795360566, "num_tokens": 4810329.0, "step": 2645 }, { "entropy": 6.1918652057647705, "epoch": 2.276751181779115, "grad_norm": 1.0234375, "learning_rate": 0.00047380524472345645, "loss": 5.8802, "mean_token_accuracy": 0.16467834115028382, "num_tokens": 4819544.0, "step": 2650 }, { "entropy": 6.203462934494018, "epoch": 2.2810485603781694, "grad_norm": 1.078125, "learning_rate": 0.0004736493319061134, "loss": 5.8876, "mean_token_accuracy": 0.16658470630645753, "num_tokens": 4828113.0, "step": 2655 }, { "entropy": 6.154991245269775, "epoch": 2.285345938977224, "grad_norm": 0.98046875, "learning_rate": 0.0004734929853180291, "loss": 5.8764, "mean_token_accuracy": 0.16575339883565904, "num_tokens": 4836989.0, "step": 2660 }, { "entropy": 6.258448839187622, "epoch": 2.2896433175762785, "grad_norm": 0.921875, "learning_rate": 0.00047333620530060175, "loss": 5.9117, "mean_token_accuracy": 0.16528864502906798, "num_tokens": 4847103.0, "step": 2665 }, { "entropy": 6.181549310684204, "epoch": 2.293940696175333, "grad_norm": 1.1328125, "learning_rate": 0.0004731789921961764, "loss": 5.9289, "mean_token_accuracy": 0.16640040427446365, "num_tokens": 4856238.0, "step": 2670 }, { "entropy": 6.227826976776123, "epoch": 2.2982380747743876, "grad_norm": 1.09375, "learning_rate": 0.0004730213463480434, "loss": 5.8189, "mean_token_accuracy": 0.17475187480449678, "num_tokens": 4864608.0, "step": 2675 }, { "entropy": 6.163301944732666, "epoch": 2.302535453373442, "grad_norm": 1.0390625, "learning_rate": 0.00047286326810043857, "loss": 5.7783, "mean_token_accuracy": 0.17075299024581908, "num_tokens": 4873889.0, "step": 2680 }, { "entropy": 6.134186220169068, "epoch": 2.3068328319724967, "grad_norm": 1.109375, "learning_rate": 0.00047270475779854137, "loss": 5.8223, "mean_token_accuracy": 0.1724078834056854, "num_tokens": 4882902.0, "step": 2685 }, { "entropy": 6.292477703094482, "epoch": 2.311130210571551, "grad_norm": 1.09375, "learning_rate": 0.00047254581578847507, "loss": 5.8426, "mean_token_accuracy": 0.16808903068304062, "num_tokens": 4892390.0, "step": 2690 }, { "entropy": 6.170593881607056, "epoch": 2.3154275891706058, "grad_norm": 1.1015625, "learning_rate": 0.0004723864424173055, "loss": 5.9683, "mean_token_accuracy": 0.1666146218776703, "num_tokens": 4901625.0, "step": 2695 }, { "entropy": 6.194738912582397, "epoch": 2.3197249677696608, "grad_norm": 1.0703125, "learning_rate": 0.0004722266380330403, "loss": 5.7718, "mean_token_accuracy": 0.17559022307395936, "num_tokens": 4910804.0, "step": 2700 }, { "entropy": 6.180141830444336, "epoch": 2.3240223463687153, "grad_norm": 1.0625, "learning_rate": 0.00047206640298462857, "loss": 5.8472, "mean_token_accuracy": 0.16781375855207442, "num_tokens": 4920441.0, "step": 2705 }, { "entropy": 6.170105838775635, "epoch": 2.32831972496777, "grad_norm": 1.109375, "learning_rate": 0.00047190573762195945, "loss": 5.8928, "mean_token_accuracy": 0.1647154539823532, "num_tokens": 4930204.0, "step": 2710 }, { "entropy": 6.171744394302368, "epoch": 2.3326171035668244, "grad_norm": 0.89453125, "learning_rate": 0.00047174464229586186, "loss": 5.9868, "mean_token_accuracy": 0.15878558307886123, "num_tokens": 4941191.0, "step": 2715 }, { "entropy": 6.294037532806397, "epoch": 2.336914482165879, "grad_norm": 1.234375, "learning_rate": 0.0004715831173581036, "loss": 5.9658, "mean_token_accuracy": 0.16081493049860002, "num_tokens": 4951825.0, "step": 2720 }, { "entropy": 6.163305330276489, "epoch": 2.3412118607649335, "grad_norm": 0.97265625, "learning_rate": 0.00047142116316139073, "loss": 5.9007, "mean_token_accuracy": 0.1701881170272827, "num_tokens": 4960632.0, "step": 2725 }, { "entropy": 6.263418245315552, "epoch": 2.345509239363988, "grad_norm": 0.97265625, "learning_rate": 0.0004712587800593663, "loss": 5.9268, "mean_token_accuracy": 0.1628424420952797, "num_tokens": 4969455.0, "step": 2730 }, { "entropy": 6.159938859939575, "epoch": 2.3498066179630426, "grad_norm": 1.234375, "learning_rate": 0.0004710959684066102, "loss": 5.822, "mean_token_accuracy": 0.1740834206342697, "num_tokens": 4978997.0, "step": 2735 }, { "entropy": 6.198467969894409, "epoch": 2.354103996562097, "grad_norm": 1.0234375, "learning_rate": 0.00047093272855863803, "loss": 5.89, "mean_token_accuracy": 0.16633735448122025, "num_tokens": 4988305.0, "step": 2740 }, { "entropy": 6.171191024780273, "epoch": 2.3584013751611517, "grad_norm": 1.03125, "learning_rate": 0.0004707690608719003, "loss": 5.8201, "mean_token_accuracy": 0.17565433084964752, "num_tokens": 4997022.0, "step": 2745 }, { "entropy": 6.182925462722778, "epoch": 2.362698753760206, "grad_norm": 1.140625, "learning_rate": 0.0004706049657037818, "loss": 5.879, "mean_token_accuracy": 0.16346064060926438, "num_tokens": 5005664.0, "step": 2750 }, { "entropy": 6.149474191665649, "epoch": 2.3669961323592608, "grad_norm": 1.0078125, "learning_rate": 0.0004704404434126009, "loss": 5.8502, "mean_token_accuracy": 0.16408389210700988, "num_tokens": 5014769.0, "step": 2755 }, { "entropy": 6.255496549606323, "epoch": 2.3712935109583153, "grad_norm": 1.0078125, "learning_rate": 0.00047027549435760843, "loss": 5.9078, "mean_token_accuracy": 0.16789433360099792, "num_tokens": 5024060.0, "step": 2760 }, { "entropy": 6.256794357299805, "epoch": 2.37559088955737, "grad_norm": 1.109375, "learning_rate": 0.0004701101188989872, "loss": 5.9544, "mean_token_accuracy": 0.1624842867255211, "num_tokens": 5033046.0, "step": 2765 }, { "entropy": 6.156686782836914, "epoch": 2.3798882681564244, "grad_norm": 1.1953125, "learning_rate": 0.00046994431739785114, "loss": 5.7991, "mean_token_accuracy": 0.18271932750940323, "num_tokens": 5040894.0, "step": 2770 }, { "entropy": 6.20210337638855, "epoch": 2.384185646755479, "grad_norm": 1.015625, "learning_rate": 0.00046977809021624454, "loss": 5.9534, "mean_token_accuracy": 0.17005517482757568, "num_tokens": 5050961.0, "step": 2775 }, { "entropy": 6.216541862487793, "epoch": 2.3884830253545335, "grad_norm": 1.078125, "learning_rate": 0.0004696114377171409, "loss": 5.8757, "mean_token_accuracy": 0.1636977568268776, "num_tokens": 5060226.0, "step": 2780 }, { "entropy": 6.160855150222778, "epoch": 2.3927804039535885, "grad_norm": 1.09375, "learning_rate": 0.0004694443602644429, "loss": 5.8457, "mean_token_accuracy": 0.16862347573041916, "num_tokens": 5069225.0, "step": 2785 }, { "entropy": 6.22788553237915, "epoch": 2.397077782552643, "grad_norm": 1.0625, "learning_rate": 0.0004692768582229808, "loss": 5.8344, "mean_token_accuracy": 0.17104473561048508, "num_tokens": 5078386.0, "step": 2790 }, { "entropy": 6.091501474380493, "epoch": 2.4013751611516976, "grad_norm": 0.96484375, "learning_rate": 0.00046910893195851213, "loss": 5.765, "mean_token_accuracy": 0.16869171112775802, "num_tokens": 5087161.0, "step": 2795 }, { "entropy": 6.183551597595215, "epoch": 2.405672539750752, "grad_norm": 1.0234375, "learning_rate": 0.00046894058183772074, "loss": 5.9281, "mean_token_accuracy": 0.16594007909297942, "num_tokens": 5096613.0, "step": 2800 }, { "entropy": 6.197868537902832, "epoch": 2.4099699183498067, "grad_norm": 1.1171875, "learning_rate": 0.000468771808228216, "loss": 5.8912, "mean_token_accuracy": 0.16417519897222518, "num_tokens": 5106534.0, "step": 2805 }, { "entropy": 6.143604946136475, "epoch": 2.414267296948861, "grad_norm": 1.078125, "learning_rate": 0.00046860261149853197, "loss": 5.9134, "mean_token_accuracy": 0.1646139517426491, "num_tokens": 5115975.0, "step": 2810 }, { "entropy": 6.127184104919434, "epoch": 2.4185646755479158, "grad_norm": 1.125, "learning_rate": 0.0004684329920181268, "loss": 5.8045, "mean_token_accuracy": 0.16945046484470366, "num_tokens": 5124635.0, "step": 2815 }, { "entropy": 6.151847076416016, "epoch": 2.4228620541469703, "grad_norm": 1.1640625, "learning_rate": 0.00046826295015738154, "loss": 5.7738, "mean_token_accuracy": 0.1773565873503685, "num_tokens": 5133226.0, "step": 2820 }, { "entropy": 6.0929807186126705, "epoch": 2.427159432746025, "grad_norm": 1.0078125, "learning_rate": 0.0004680924862875996, "loss": 5.8663, "mean_token_accuracy": 0.17087701261043547, "num_tokens": 5142257.0, "step": 2825 }, { "entropy": 6.199731492996216, "epoch": 2.4314568113450794, "grad_norm": 0.984375, "learning_rate": 0.00046792160078100605, "loss": 5.8592, "mean_token_accuracy": 0.17053601890802383, "num_tokens": 5150752.0, "step": 2830 }, { "entropy": 6.151450777053833, "epoch": 2.435754189944134, "grad_norm": 1.0078125, "learning_rate": 0.00046775029401074653, "loss": 5.7783, "mean_token_accuracy": 0.17438559532165526, "num_tokens": 5160237.0, "step": 2835 }, { "entropy": 6.171485233306885, "epoch": 2.4400515685431885, "grad_norm": 1.109375, "learning_rate": 0.00046757856635088645, "loss": 5.85, "mean_token_accuracy": 0.17521743029356002, "num_tokens": 5169752.0, "step": 2840 }, { "entropy": 6.1737254619598385, "epoch": 2.444348947142243, "grad_norm": 1.0078125, "learning_rate": 0.0004674064181764105, "loss": 5.8887, "mean_token_accuracy": 0.17213839143514634, "num_tokens": 5178892.0, "step": 2845 }, { "entropy": 6.169126319885254, "epoch": 2.448646325741298, "grad_norm": 0.9609375, "learning_rate": 0.00046723384986322147, "loss": 5.8736, "mean_token_accuracy": 0.16697555780410767, "num_tokens": 5188468.0, "step": 2850 }, { "entropy": 6.121142101287842, "epoch": 2.4529437043403526, "grad_norm": 1.0078125, "learning_rate": 0.0004670608617881395, "loss": 5.7947, "mean_token_accuracy": 0.1755498692393303, "num_tokens": 5197565.0, "step": 2855 }, { "entropy": 6.083435106277466, "epoch": 2.457241082939407, "grad_norm": 1.09375, "learning_rate": 0.0004668874543289014, "loss": 5.7851, "mean_token_accuracy": 0.1805465489625931, "num_tokens": 5205791.0, "step": 2860 }, { "entropy": 6.136435890197754, "epoch": 2.4615384615384617, "grad_norm": 1.046875, "learning_rate": 0.00046671362786415986, "loss": 5.7872, "mean_token_accuracy": 0.18155153840780258, "num_tokens": 5214773.0, "step": 2865 }, { "entropy": 6.082297658920288, "epoch": 2.465835840137516, "grad_norm": 0.9921875, "learning_rate": 0.00046653938277348237, "loss": 5.8211, "mean_token_accuracy": 0.1757299304008484, "num_tokens": 5223734.0, "step": 2870 }, { "entropy": 6.256624984741211, "epoch": 2.4701332187365708, "grad_norm": 1.1796875, "learning_rate": 0.0004663647194373505, "loss": 5.9026, "mean_token_accuracy": 0.16392517536878587, "num_tokens": 5231742.0, "step": 2875 }, { "entropy": 6.135076570510864, "epoch": 2.4744305973356253, "grad_norm": 1.0078125, "learning_rate": 0.00046618963823715913, "loss": 5.8631, "mean_token_accuracy": 0.17133675366640091, "num_tokens": 5241673.0, "step": 2880 }, { "entropy": 6.190168714523315, "epoch": 2.47872797593468, "grad_norm": 1.1171875, "learning_rate": 0.00046601413955521575, "loss": 5.8246, "mean_token_accuracy": 0.1694057285785675, "num_tokens": 5250082.0, "step": 2885 }, { "entropy": 6.136935997009277, "epoch": 2.4830253545337344, "grad_norm": 1.1484375, "learning_rate": 0.0004658382237747393, "loss": 5.8976, "mean_token_accuracy": 0.16706683337688447, "num_tokens": 5259680.0, "step": 2890 }, { "entropy": 6.16978874206543, "epoch": 2.487322733132789, "grad_norm": 0.97265625, "learning_rate": 0.00046566189127985946, "loss": 5.8769, "mean_token_accuracy": 0.1714440792798996, "num_tokens": 5269561.0, "step": 2895 }, { "entropy": 6.182620716094971, "epoch": 2.4916201117318435, "grad_norm": 0.9921875, "learning_rate": 0.000465485142455616, "loss": 5.8189, "mean_token_accuracy": 0.17375694811344147, "num_tokens": 5278659.0, "step": 2900 }, { "entropy": 6.057879829406739, "epoch": 2.495917490330898, "grad_norm": 1.0390625, "learning_rate": 0.00046530797768795765, "loss": 5.8103, "mean_token_accuracy": 0.18172994256019592, "num_tokens": 5287619.0, "step": 2905 }, { "entropy": 6.1459949016571045, "epoch": 2.5002148689299526, "grad_norm": 1.0078125, "learning_rate": 0.00046513039736374153, "loss": 5.9271, "mean_token_accuracy": 0.16282536834478378, "num_tokens": 5297334.0, "step": 2910 }, { "entropy": 6.201943445205688, "epoch": 2.504512247529007, "grad_norm": 1.109375, "learning_rate": 0.0004649524018707319, "loss": 5.8405, "mean_token_accuracy": 0.1736244261264801, "num_tokens": 5306208.0, "step": 2915 }, { "entropy": 6.117348289489746, "epoch": 2.5088096261280617, "grad_norm": 1.2109375, "learning_rate": 0.00046477399159759996, "loss": 5.7789, "mean_token_accuracy": 0.1744915708899498, "num_tokens": 5314754.0, "step": 2920 }, { "entropy": 6.022426891326904, "epoch": 2.5131070047271162, "grad_norm": 1.125, "learning_rate": 0.00046459516693392246, "loss": 5.7951, "mean_token_accuracy": 0.17653965055942536, "num_tokens": 5324000.0, "step": 2925 }, { "entropy": 6.192726993560791, "epoch": 2.517404383326171, "grad_norm": 1.0546875, "learning_rate": 0.0004644159282701808, "loss": 5.8412, "mean_token_accuracy": 0.1699216842651367, "num_tokens": 5332478.0, "step": 2930 }, { "entropy": 6.193784236907959, "epoch": 2.5217017619252258, "grad_norm": 0.99609375, "learning_rate": 0.00046423627599776076, "loss": 5.9229, "mean_token_accuracy": 0.1587831899523735, "num_tokens": 5341635.0, "step": 2935 }, { "entropy": 6.126192474365235, "epoch": 2.5259991405242803, "grad_norm": 1.0, "learning_rate": 0.000464056210508951, "loss": 5.9125, "mean_token_accuracy": 0.16348374187946318, "num_tokens": 5350144.0, "step": 2940 }, { "entropy": 6.17839298248291, "epoch": 2.530296519123335, "grad_norm": 1.078125, "learning_rate": 0.0004638757321969426, "loss": 5.8251, "mean_token_accuracy": 0.17073310166597366, "num_tokens": 5358788.0, "step": 2945 }, { "entropy": 6.144708824157715, "epoch": 2.5345938977223894, "grad_norm": 1.0859375, "learning_rate": 0.00046369484145582815, "loss": 5.9064, "mean_token_accuracy": 0.16323922872543334, "num_tokens": 5368057.0, "step": 2950 }, { "entropy": 6.069336700439453, "epoch": 2.538891276321444, "grad_norm": 1.0546875, "learning_rate": 0.00046351353868060054, "loss": 5.7586, "mean_token_accuracy": 0.174574413895607, "num_tokens": 5376739.0, "step": 2955 }, { "entropy": 6.171047353744507, "epoch": 2.5431886549204985, "grad_norm": 1.03125, "learning_rate": 0.00046333182426715273, "loss": 5.8806, "mean_token_accuracy": 0.16850085258483888, "num_tokens": 5385967.0, "step": 2960 }, { "entropy": 6.161162233352661, "epoch": 2.547486033519553, "grad_norm": 1.0390625, "learning_rate": 0.00046314969861227626, "loss": 5.9049, "mean_token_accuracy": 0.15845982432365419, "num_tokens": 5395192.0, "step": 2965 }, { "entropy": 6.14454460144043, "epoch": 2.5517834121186076, "grad_norm": 0.96484375, "learning_rate": 0.0004629671621136608, "loss": 5.8588, "mean_token_accuracy": 0.16995412558317186, "num_tokens": 5404694.0, "step": 2970 }, { "entropy": 6.158005809783935, "epoch": 2.556080790717662, "grad_norm": 1.1484375, "learning_rate": 0.0004627842151698931, "loss": 5.8623, "mean_token_accuracy": 0.16851141750812532, "num_tokens": 5413102.0, "step": 2975 }, { "entropy": 6.134857320785523, "epoch": 2.5603781693167167, "grad_norm": 1.046875, "learning_rate": 0.00046260085818045625, "loss": 5.8942, "mean_token_accuracy": 0.16586572974920272, "num_tokens": 5423339.0, "step": 2980 }, { "entropy": 6.197592544555664, "epoch": 2.5646755479157712, "grad_norm": 1.0546875, "learning_rate": 0.0004624170915457284, "loss": 5.8504, "mean_token_accuracy": 0.17059714645147322, "num_tokens": 5432377.0, "step": 2985 }, { "entropy": 6.128017950057983, "epoch": 2.5689729265148262, "grad_norm": 1.09375, "learning_rate": 0.00046223291566698264, "loss": 5.7959, "mean_token_accuracy": 0.17204724699258805, "num_tokens": 5441038.0, "step": 2990 }, { "entropy": 6.107345724105835, "epoch": 2.5732703051138808, "grad_norm": 1.046875, "learning_rate": 0.0004620483309463855, "loss": 5.7918, "mean_token_accuracy": 0.17900732010602952, "num_tokens": 5449557.0, "step": 2995 }, { "entropy": 6.1927958011627195, "epoch": 2.5775676837129353, "grad_norm": 1.0390625, "learning_rate": 0.0004618633377869961, "loss": 5.9156, "mean_token_accuracy": 0.16568114012479782, "num_tokens": 5458931.0, "step": 3000 }, { "epoch": 2.5775676837129353, "eval_entropy": 5.998430791201892, "eval_loss": 6.121789455413818, "eval_mean_token_accuracy": 0.16322041645243363, "eval_num_tokens": 5458931.0, "eval_runtime": 2.0487, "eval_samples_per_second": 1732.347, "eval_steps_per_second": 216.726, "step": 3000 }, { "entropy": 6.126945543289184, "epoch": 2.58186506231199, "grad_norm": 0.9765625, "learning_rate": 0.0004616779365927656, "loss": 5.7528, "mean_token_accuracy": 0.18461534082889558, "num_tokens": 5468539.0, "step": 3005 }, { "entropy": 5.964468240737915, "epoch": 2.5861624409110444, "grad_norm": 1.2734375, "learning_rate": 0.0004614921277685361, "loss": 5.6994, "mean_token_accuracy": 0.18173616677522658, "num_tokens": 5475710.0, "step": 3010 }, { "entropy": 6.099804162979126, "epoch": 2.590459819510099, "grad_norm": 1.0234375, "learning_rate": 0.00046130591172003976, "loss": 5.845, "mean_token_accuracy": 0.16855668723583223, "num_tokens": 5484597.0, "step": 3015 }, { "entropy": 6.216131401062012, "epoch": 2.5947571981091535, "grad_norm": 1.0234375, "learning_rate": 0.0004611192888538981, "loss": 5.9276, "mean_token_accuracy": 0.16257163286209106, "num_tokens": 5493213.0, "step": 3020 }, { "entropy": 6.1808586597442625, "epoch": 2.599054576708208, "grad_norm": 1.1484375, "learning_rate": 0.00046093225957762084, "loss": 5.903, "mean_token_accuracy": 0.16862684190273286, "num_tokens": 5502556.0, "step": 3025 }, { "entropy": 6.1216977596282955, "epoch": 2.6033519553072626, "grad_norm": 1.0703125, "learning_rate": 0.0004607448242996051, "loss": 5.8208, "mean_token_accuracy": 0.1719271272420883, "num_tokens": 5511779.0, "step": 3030 }, { "entropy": 6.1579231262207035, "epoch": 2.607649333906317, "grad_norm": 1.0625, "learning_rate": 0.0004605569834291347, "loss": 5.8058, "mean_token_accuracy": 0.18103471398353577, "num_tokens": 5520836.0, "step": 3035 }, { "entropy": 6.061151313781738, "epoch": 2.6119467125053717, "grad_norm": 1.171875, "learning_rate": 0.00046036873737637904, "loss": 5.8302, "mean_token_accuracy": 0.17482185810804368, "num_tokens": 5529285.0, "step": 3040 }, { "entropy": 6.116726493835449, "epoch": 2.6162440911044262, "grad_norm": 1.1015625, "learning_rate": 0.0004601800865523921, "loss": 5.8482, "mean_token_accuracy": 0.1684387966990471, "num_tokens": 5538160.0, "step": 3045 }, { "entropy": 6.122728109359741, "epoch": 2.620541469703481, "grad_norm": 1.0859375, "learning_rate": 0.00045999103136911204, "loss": 5.8517, "mean_token_accuracy": 0.16452286690473555, "num_tokens": 5547355.0, "step": 3050 }, { "entropy": 6.120913076400757, "epoch": 2.6248388483025353, "grad_norm": 1.0078125, "learning_rate": 0.00045980157223935965, "loss": 5.8606, "mean_token_accuracy": 0.16614654809236526, "num_tokens": 5557299.0, "step": 3055 }, { "entropy": 6.061937570571899, "epoch": 2.62913622690159, "grad_norm": 1.0, "learning_rate": 0.00045961170957683806, "loss": 5.7822, "mean_token_accuracy": 0.17485247999429704, "num_tokens": 5565469.0, "step": 3060 }, { "entropy": 6.150688505172729, "epoch": 2.6334336055006444, "grad_norm": 1.03125, "learning_rate": 0.00045942144379613147, "loss": 5.8945, "mean_token_accuracy": 0.16743394434452058, "num_tokens": 5574740.0, "step": 3065 }, { "entropy": 6.152962112426758, "epoch": 2.637730984099699, "grad_norm": 1.0546875, "learning_rate": 0.00045923077531270426, "loss": 5.8866, "mean_token_accuracy": 0.16888206750154494, "num_tokens": 5583438.0, "step": 3070 }, { "entropy": 6.126224088668823, "epoch": 2.6420283626987535, "grad_norm": 1.046875, "learning_rate": 0.0004590397045429001, "loss": 5.84, "mean_token_accuracy": 0.17367925941944123, "num_tokens": 5592389.0, "step": 3075 }, { "entropy": 6.084698152542114, "epoch": 2.646325741297808, "grad_norm": 0.9609375, "learning_rate": 0.00045884823190394134, "loss": 5.7589, "mean_token_accuracy": 0.1789909452199936, "num_tokens": 5601598.0, "step": 3080 }, { "entropy": 6.075862979888916, "epoch": 2.650623119896863, "grad_norm": 1.1171875, "learning_rate": 0.0004586563578139275, "loss": 5.8461, "mean_token_accuracy": 0.1662924975156784, "num_tokens": 5610498.0, "step": 3085 }, { "entropy": 6.096910190582276, "epoch": 2.6549204984959176, "grad_norm": 1.1796875, "learning_rate": 0.00045846408269183505, "loss": 5.7512, "mean_token_accuracy": 0.17860534340143203, "num_tokens": 5620082.0, "step": 3090 }, { "entropy": 6.1647505283355715, "epoch": 2.659217877094972, "grad_norm": 1.0234375, "learning_rate": 0.00045827140695751603, "loss": 5.8362, "mean_token_accuracy": 0.17174756973981858, "num_tokens": 5630291.0, "step": 3095 }, { "entropy": 6.091697454452515, "epoch": 2.6635152556940267, "grad_norm": 1.1484375, "learning_rate": 0.0004580783310316971, "loss": 5.8104, "mean_token_accuracy": 0.17255474478006363, "num_tokens": 5638784.0, "step": 3100 }, { "entropy": 6.026739645004272, "epoch": 2.6678126342930812, "grad_norm": 1.046875, "learning_rate": 0.00045788485533597895, "loss": 5.6819, "mean_token_accuracy": 0.18163852095603944, "num_tokens": 5647968.0, "step": 3105 }, { "entropy": 6.098209285736084, "epoch": 2.672110012892136, "grad_norm": 1.0390625, "learning_rate": 0.00045769098029283526, "loss": 5.906, "mean_token_accuracy": 0.16296559423208237, "num_tokens": 5657543.0, "step": 3110 }, { "entropy": 6.150312328338623, "epoch": 2.6764073914911903, "grad_norm": 1.1328125, "learning_rate": 0.0004574967063256115, "loss": 5.836, "mean_token_accuracy": 0.17701750695705415, "num_tokens": 5666535.0, "step": 3115 }, { "entropy": 6.1265421390533445, "epoch": 2.680704770090245, "grad_norm": 1.1015625, "learning_rate": 0.00045730203385852447, "loss": 5.9135, "mean_token_accuracy": 0.16741105765104294, "num_tokens": 5676273.0, "step": 3120 }, { "entropy": 6.052946949005127, "epoch": 2.6850021486892994, "grad_norm": 1.0703125, "learning_rate": 0.000457106963316661, "loss": 5.8151, "mean_token_accuracy": 0.1772770792245865, "num_tokens": 5684888.0, "step": 3125 }, { "entropy": 6.088335084915161, "epoch": 2.689299527288354, "grad_norm": 1.0703125, "learning_rate": 0.00045691149512597717, "loss": 5.8631, "mean_token_accuracy": 0.16669325679540634, "num_tokens": 5693626.0, "step": 3130 }, { "entropy": 6.180005502700806, "epoch": 2.6935969058874085, "grad_norm": 1.4453125, "learning_rate": 0.00045671562971329736, "loss": 5.7649, "mean_token_accuracy": 0.18092152327299119, "num_tokens": 5702542.0, "step": 3135 }, { "entropy": 6.056423187255859, "epoch": 2.6978942844864635, "grad_norm": 1.1484375, "learning_rate": 0.00045651936750631337, "loss": 5.8131, "mean_token_accuracy": 0.17378336936235428, "num_tokens": 5711440.0, "step": 3140 }, { "entropy": 6.189997816085816, "epoch": 2.702191663085518, "grad_norm": 1.0234375, "learning_rate": 0.00045632270893358333, "loss": 5.8825, "mean_token_accuracy": 0.17272377163171768, "num_tokens": 5721495.0, "step": 3145 }, { "entropy": 6.167654418945313, "epoch": 2.7064890416845726, "grad_norm": 1.109375, "learning_rate": 0.0004561256544245312, "loss": 5.9067, "mean_token_accuracy": 0.1615714728832245, "num_tokens": 5730664.0, "step": 3150 }, { "entropy": 6.04947509765625, "epoch": 2.710786420283627, "grad_norm": 1.0625, "learning_rate": 0.000455928204409445, "loss": 5.79, "mean_token_accuracy": 0.17923566401004792, "num_tokens": 5740229.0, "step": 3155 }, { "entropy": 6.107324123382568, "epoch": 2.7150837988826817, "grad_norm": 1.1328125, "learning_rate": 0.00045573035931947684, "loss": 5.7791, "mean_token_accuracy": 0.17757482677698136, "num_tokens": 5748549.0, "step": 3160 }, { "entropy": 6.101696872711182, "epoch": 2.7193811774817362, "grad_norm": 1.109375, "learning_rate": 0.0004555321195866411, "loss": 5.732, "mean_token_accuracy": 0.17644069641828536, "num_tokens": 5757603.0, "step": 3165 }, { "entropy": 6.136196327209473, "epoch": 2.723678556080791, "grad_norm": 1.2265625, "learning_rate": 0.0004553334856438143, "loss": 5.9098, "mean_token_accuracy": 0.16370768547058107, "num_tokens": 5767520.0, "step": 3170 }, { "entropy": 6.1458038806915285, "epoch": 2.7279759346798453, "grad_norm": 0.98828125, "learning_rate": 0.00045513445792473356, "loss": 5.8906, "mean_token_accuracy": 0.16408973336219787, "num_tokens": 5776778.0, "step": 3175 }, { "entropy": 6.174926614761352, "epoch": 2.7322733132789, "grad_norm": 1.109375, "learning_rate": 0.0004549350368639958, "loss": 5.9249, "mean_token_accuracy": 0.16355405300855635, "num_tokens": 5785652.0, "step": 3180 }, { "entropy": 6.212893629074097, "epoch": 2.7365706918779544, "grad_norm": 1.078125, "learning_rate": 0.00045473522289705693, "loss": 5.8811, "mean_token_accuracy": 0.1690053179860115, "num_tokens": 5795766.0, "step": 3185 }, { "entropy": 6.0142913341522215, "epoch": 2.740868070477009, "grad_norm": 1.140625, "learning_rate": 0.00045453501646023085, "loss": 5.9293, "mean_token_accuracy": 0.16316341012716293, "num_tokens": 5804504.0, "step": 3190 }, { "entropy": 6.090119218826294, "epoch": 2.7451654490760635, "grad_norm": 0.94921875, "learning_rate": 0.00045433441799068837, "loss": 5.8318, "mean_token_accuracy": 0.17157045751810074, "num_tokens": 5814161.0, "step": 3195 }, { "entropy": 6.133489179611206, "epoch": 2.749462827675118, "grad_norm": 1.0625, "learning_rate": 0.0004541334279264562, "loss": 5.7556, "mean_token_accuracy": 0.17994108349084853, "num_tokens": 5822235.0, "step": 3200 }, { "entropy": 6.069830846786499, "epoch": 2.7537602062741726, "grad_norm": 1.171875, "learning_rate": 0.00045393204670641656, "loss": 5.7589, "mean_token_accuracy": 0.17203548699617385, "num_tokens": 5831572.0, "step": 3205 }, { "entropy": 5.9929163455963135, "epoch": 2.758057584873227, "grad_norm": 1.0390625, "learning_rate": 0.0004537302747703055, "loss": 5.7621, "mean_token_accuracy": 0.18025242835283278, "num_tokens": 5839694.0, "step": 3210 }, { "entropy": 6.185488748550415, "epoch": 2.7623549634722817, "grad_norm": 1.1875, "learning_rate": 0.00045352811255871216, "loss": 5.8899, "mean_token_accuracy": 0.17093945741653443, "num_tokens": 5849131.0, "step": 3215 }, { "entropy": 6.186608505249024, "epoch": 2.7666523420713363, "grad_norm": 0.91796875, "learning_rate": 0.00045332556051307804, "loss": 5.8208, "mean_token_accuracy": 0.16853767782449722, "num_tokens": 5858861.0, "step": 3220 }, { "entropy": 6.110893869400025, "epoch": 2.770949720670391, "grad_norm": 1.0546875, "learning_rate": 0.00045312261907569585, "loss": 5.82, "mean_token_accuracy": 0.17171475738286973, "num_tokens": 5867585.0, "step": 3225 }, { "entropy": 6.081268453598023, "epoch": 2.775247099269446, "grad_norm": 1.0859375, "learning_rate": 0.00045291928868970867, "loss": 5.8317, "mean_token_accuracy": 0.16950544714927673, "num_tokens": 5876256.0, "step": 3230 }, { "entropy": 6.064776659011841, "epoch": 2.7795444778685003, "grad_norm": 1.0859375, "learning_rate": 0.0004527155697991087, "loss": 5.8911, "mean_token_accuracy": 0.16254067420959473, "num_tokens": 5885302.0, "step": 3235 }, { "entropy": 6.128396034240723, "epoch": 2.783841856467555, "grad_norm": 0.95703125, "learning_rate": 0.0004525114628487365, "loss": 5.9091, "mean_token_accuracy": 0.16473145335912703, "num_tokens": 5895066.0, "step": 3240 }, { "entropy": 6.1276613712310795, "epoch": 2.7881392350666094, "grad_norm": 1.0625, "learning_rate": 0.00045230696828428026, "loss": 5.8938, "mean_token_accuracy": 0.16614799648523332, "num_tokens": 5903258.0, "step": 3245 }, { "entropy": 6.09830675125122, "epoch": 2.792436613665664, "grad_norm": 1.125, "learning_rate": 0.0004521020865522742, "loss": 5.7738, "mean_token_accuracy": 0.1714928478002548, "num_tokens": 5911714.0, "step": 3250 }, { "entropy": 6.070488023757934, "epoch": 2.7967339922647185, "grad_norm": 1.0859375, "learning_rate": 0.00045189681810009827, "loss": 5.8635, "mean_token_accuracy": 0.16751533150672912, "num_tokens": 5920432.0, "step": 3255 }, { "entropy": 6.227630186080932, "epoch": 2.801031370863773, "grad_norm": 1.2265625, "learning_rate": 0.00045169116337597653, "loss": 5.8701, "mean_token_accuracy": 0.17065902799367905, "num_tokens": 5929202.0, "step": 3260 }, { "entropy": 6.189503717422485, "epoch": 2.8053287494628276, "grad_norm": 1.15625, "learning_rate": 0.000451485122828977, "loss": 5.9003, "mean_token_accuracy": 0.1647379770874977, "num_tokens": 5938034.0, "step": 3265 }, { "entropy": 6.010164356231689, "epoch": 2.809626128061882, "grad_norm": 0.9921875, "learning_rate": 0.00045127869690900956, "loss": 5.7485, "mean_token_accuracy": 0.17689475119113923, "num_tokens": 5946944.0, "step": 3270 }, { "entropy": 6.029814195632935, "epoch": 2.8139235066609367, "grad_norm": 1.2421875, "learning_rate": 0.00045107188606682613, "loss": 5.8498, "mean_token_accuracy": 0.17715609222650527, "num_tokens": 5956475.0, "step": 3275 }, { "entropy": 6.185597848892212, "epoch": 2.8182208852599913, "grad_norm": 1.0390625, "learning_rate": 0.0004508646907540188, "loss": 5.8236, "mean_token_accuracy": 0.16963610351085662, "num_tokens": 5965814.0, "step": 3280 }, { "entropy": 6.105741548538208, "epoch": 2.8225182638590463, "grad_norm": 1.1328125, "learning_rate": 0.0004506571114230195, "loss": 5.8687, "mean_token_accuracy": 0.16442400217056274, "num_tokens": 5973850.0, "step": 3285 }, { "entropy": 6.0313629627227785, "epoch": 2.826815642458101, "grad_norm": 1.0, "learning_rate": 0.00045044914852709824, "loss": 5.8113, "mean_token_accuracy": 0.16617825627326965, "num_tokens": 5982987.0, "step": 3290 }, { "entropy": 6.152327919006348, "epoch": 2.8311130210571553, "grad_norm": 1.1015625, "learning_rate": 0.0004502408025203631, "loss": 5.7981, "mean_token_accuracy": 0.17620996087789537, "num_tokens": 5992227.0, "step": 3295 }, { "entropy": 6.093041801452637, "epoch": 2.83541039965621, "grad_norm": 1.0546875, "learning_rate": 0.0004500320738577584, "loss": 5.7804, "mean_token_accuracy": 0.17178058624267578, "num_tokens": 6000243.0, "step": 3300 }, { "entropy": 6.071863269805908, "epoch": 2.8397077782552644, "grad_norm": 1.109375, "learning_rate": 0.00044982296299506407, "loss": 5.7959, "mean_token_accuracy": 0.1757694289088249, "num_tokens": 6009771.0, "step": 3305 }, { "entropy": 6.104401445388794, "epoch": 2.844005156854319, "grad_norm": 1.1796875, "learning_rate": 0.0004496134703888948, "loss": 5.8655, "mean_token_accuracy": 0.16886720359325408, "num_tokens": 6018683.0, "step": 3310 }, { "entropy": 6.063603019714355, "epoch": 2.8483025354533735, "grad_norm": 1.0703125, "learning_rate": 0.00044940359649669846, "loss": 5.7182, "mean_token_accuracy": 0.1814822018146515, "num_tokens": 6027422.0, "step": 3315 }, { "entropy": 6.0563880443573, "epoch": 2.852599914052428, "grad_norm": 1.09375, "learning_rate": 0.00044919334177675595, "loss": 5.8185, "mean_token_accuracy": 0.16714439690113067, "num_tokens": 6035670.0, "step": 3320 }, { "entropy": 6.098821926116943, "epoch": 2.8568972926514826, "grad_norm": 1.078125, "learning_rate": 0.00044898270668817955, "loss": 5.7433, "mean_token_accuracy": 0.17498091757297515, "num_tokens": 6044092.0, "step": 3325 }, { "entropy": 6.041405916213989, "epoch": 2.861194671250537, "grad_norm": 0.99609375, "learning_rate": 0.000448771691690912, "loss": 5.8089, "mean_token_accuracy": 0.17252034097909927, "num_tokens": 6053970.0, "step": 3330 }, { "entropy": 6.098532438278198, "epoch": 2.8654920498495917, "grad_norm": 1.0234375, "learning_rate": 0.0004485602972457257, "loss": 5.7875, "mean_token_accuracy": 0.17401470988988876, "num_tokens": 6062965.0, "step": 3335 }, { "entropy": 6.10422191619873, "epoch": 2.8697894284486463, "grad_norm": 1.078125, "learning_rate": 0.00044834852381422165, "loss": 5.8375, "mean_token_accuracy": 0.17349963784217834, "num_tokens": 6072420.0, "step": 3340 }, { "entropy": 6.048533582687378, "epoch": 2.874086807047701, "grad_norm": 1.078125, "learning_rate": 0.00044813637185882836, "loss": 5.7604, "mean_token_accuracy": 0.17201080173254013, "num_tokens": 6080915.0, "step": 3345 }, { "entropy": 6.129676723480225, "epoch": 2.8783841856467554, "grad_norm": 1.1875, "learning_rate": 0.00044792384184280106, "loss": 5.8898, "mean_token_accuracy": 0.16713710129261017, "num_tokens": 6090453.0, "step": 3350 }, { "entropy": 6.036713743209839, "epoch": 2.88268156424581, "grad_norm": 1.09375, "learning_rate": 0.00044771093423022013, "loss": 5.9178, "mean_token_accuracy": 0.16426213681697846, "num_tokens": 6099390.0, "step": 3355 }, { "entropy": 6.090553140640258, "epoch": 2.8869789428448644, "grad_norm": 0.99609375, "learning_rate": 0.0004474976494859909, "loss": 5.8439, "mean_token_accuracy": 0.17439688742160797, "num_tokens": 6108677.0, "step": 3360 }, { "entropy": 6.084423589706421, "epoch": 2.891276321443919, "grad_norm": 0.98046875, "learning_rate": 0.0004472839880758419, "loss": 5.7572, "mean_token_accuracy": 0.17288744151592256, "num_tokens": 6117151.0, "step": 3365 }, { "entropy": 6.169969892501831, "epoch": 2.8955737000429735, "grad_norm": 1.109375, "learning_rate": 0.0004470699504663242, "loss": 5.8724, "mean_token_accuracy": 0.1652231350541115, "num_tokens": 6127167.0, "step": 3370 }, { "entropy": 6.055519533157349, "epoch": 2.899871078642028, "grad_norm": 1.03125, "learning_rate": 0.0004468555371248104, "loss": 5.7663, "mean_token_accuracy": 0.17967537939548492, "num_tokens": 6136487.0, "step": 3375 }, { "entropy": 6.096647262573242, "epoch": 2.904168457241083, "grad_norm": 1.0078125, "learning_rate": 0.0004466407485194937, "loss": 5.8808, "mean_token_accuracy": 0.16516373604536055, "num_tokens": 6145334.0, "step": 3380 }, { "entropy": 6.091698265075683, "epoch": 2.9084658358401376, "grad_norm": 1.0625, "learning_rate": 0.0004464255851193864, "loss": 5.7913, "mean_token_accuracy": 0.17120025604963302, "num_tokens": 6155062.0, "step": 3385 }, { "entropy": 6.080928611755371, "epoch": 2.912763214439192, "grad_norm": 1.7265625, "learning_rate": 0.0004462100473943194, "loss": 5.7627, "mean_token_accuracy": 0.17752974182367326, "num_tokens": 6164313.0, "step": 3390 }, { "entropy": 6.061914777755737, "epoch": 2.9170605930382467, "grad_norm": 1.03125, "learning_rate": 0.000445994135814941, "loss": 5.8024, "mean_token_accuracy": 0.17023618370294571, "num_tokens": 6173513.0, "step": 3395 }, { "entropy": 6.057987403869629, "epoch": 2.9213579716373013, "grad_norm": 1.1953125, "learning_rate": 0.00044577785085271566, "loss": 5.8041, "mean_token_accuracy": 0.17476166486740113, "num_tokens": 6182000.0, "step": 3400 }, { "entropy": 6.1352544784545895, "epoch": 2.925655350236356, "grad_norm": 1.015625, "learning_rate": 0.0004455611929799235, "loss": 5.8516, "mean_token_accuracy": 0.1572086051106453, "num_tokens": 6191887.0, "step": 3405 }, { "entropy": 6.025879716873169, "epoch": 2.9299527288354104, "grad_norm": 1.015625, "learning_rate": 0.0004453441626696585, "loss": 5.885, "mean_token_accuracy": 0.16230087578296662, "num_tokens": 6202897.0, "step": 3410 }, { "entropy": 6.132012939453125, "epoch": 2.934250107434465, "grad_norm": 1.0390625, "learning_rate": 0.00044512676039582823, "loss": 5.7891, "mean_token_accuracy": 0.1754133865237236, "num_tokens": 6211811.0, "step": 3415 }, { "entropy": 6.114519882202148, "epoch": 2.9385474860335195, "grad_norm": 1.109375, "learning_rate": 0.0004449089866331524, "loss": 5.7826, "mean_token_accuracy": 0.18096065670251846, "num_tokens": 6219896.0, "step": 3420 }, { "entropy": 5.983143472671509, "epoch": 2.942844864632574, "grad_norm": 1.078125, "learning_rate": 0.0004446908418571617, "loss": 5.7734, "mean_token_accuracy": 0.1765346944332123, "num_tokens": 6228212.0, "step": 3425 }, { "entropy": 6.059330701828003, "epoch": 2.9471422432316285, "grad_norm": 1.0390625, "learning_rate": 0.0004444723265441973, "loss": 5.9301, "mean_token_accuracy": 0.1656051605939865, "num_tokens": 6238133.0, "step": 3430 }, { "entropy": 6.08131365776062, "epoch": 2.9514396218306835, "grad_norm": 0.98046875, "learning_rate": 0.0004442534411714092, "loss": 5.8366, "mean_token_accuracy": 0.1650673657655716, "num_tokens": 6247331.0, "step": 3435 }, { "entropy": 6.160918760299682, "epoch": 2.955737000429738, "grad_norm": 1.0859375, "learning_rate": 0.00044403418621675555, "loss": 5.8406, "mean_token_accuracy": 0.16983808875083922, "num_tokens": 6255280.0, "step": 3440 }, { "entropy": 6.073430061340332, "epoch": 2.9600343790287926, "grad_norm": 1.015625, "learning_rate": 0.0004438145621590017, "loss": 5.7939, "mean_token_accuracy": 0.17472269237041474, "num_tokens": 6264752.0, "step": 3445 }, { "entropy": 6.033823823928833, "epoch": 2.964331757627847, "grad_norm": 1.140625, "learning_rate": 0.00044359456947771857, "loss": 5.7495, "mean_token_accuracy": 0.172511225938797, "num_tokens": 6273258.0, "step": 3450 }, { "entropy": 5.891212129592896, "epoch": 2.9686291362269017, "grad_norm": 1.1953125, "learning_rate": 0.0004433742086532824, "loss": 5.6668, "mean_token_accuracy": 0.19016601592302323, "num_tokens": 6281584.0, "step": 3455 }, { "entropy": 6.076795339584351, "epoch": 2.9729265148259563, "grad_norm": 1.171875, "learning_rate": 0.00044315348016687317, "loss": 5.7854, "mean_token_accuracy": 0.17181758135557174, "num_tokens": 6290016.0, "step": 3460 }, { "entropy": 6.06014461517334, "epoch": 2.977223893425011, "grad_norm": 1.078125, "learning_rate": 0.0004429323845004736, "loss": 5.694, "mean_token_accuracy": 0.17798333764076232, "num_tokens": 6298569.0, "step": 3465 }, { "entropy": 5.982924079895019, "epoch": 2.9815212720240654, "grad_norm": 1.0078125, "learning_rate": 0.00044271092213686824, "loss": 5.7296, "mean_token_accuracy": 0.17693220674991608, "num_tokens": 6307684.0, "step": 3470 }, { "entropy": 6.1649445533752445, "epoch": 2.98581865062312, "grad_norm": 0.9453125, "learning_rate": 0.00044248909355964247, "loss": 5.8556, "mean_token_accuracy": 0.1716341868042946, "num_tokens": 6317767.0, "step": 3475 }, { "entropy": 6.146809720993042, "epoch": 2.9901160292221745, "grad_norm": 1.1484375, "learning_rate": 0.00044226689925318117, "loss": 5.8931, "mean_token_accuracy": 0.16468499451875687, "num_tokens": 6327457.0, "step": 3480 }, { "entropy": 5.985245990753174, "epoch": 2.994413407821229, "grad_norm": 1.0625, "learning_rate": 0.00044204433970266785, "loss": 5.6945, "mean_token_accuracy": 0.18739936202764512, "num_tokens": 6335747.0, "step": 3485 }, { "entropy": 6.050507545471191, "epoch": 2.9987107864202835, "grad_norm": 1.0625, "learning_rate": 0.0004418214153940837, "loss": 5.7846, "mean_token_accuracy": 0.1760311618447304, "num_tokens": 6344750.0, "step": 3490 }, { "entropy": 6.092853705088298, "epoch": 3.002578427159433, "grad_norm": 0.890625, "learning_rate": 0.00044159812681420624, "loss": 5.7217, "mean_token_accuracy": 0.17525596585538653, "num_tokens": 6354779.0, "step": 3495 }, { "entropy": 6.122584819793701, "epoch": 3.0068758057584875, "grad_norm": 1.0703125, "learning_rate": 0.0004413744744506086, "loss": 5.506, "mean_token_accuracy": 0.1860961213707924, "num_tokens": 6363809.0, "step": 3500 }, { "epoch": 3.0068758057584875, "eval_entropy": 5.801608745042269, "eval_loss": 6.042037010192871, "eval_mean_token_accuracy": 0.1686659706336958, "eval_num_tokens": 6363809.0, "eval_runtime": 2.0476, "eval_samples_per_second": 1733.255, "eval_steps_per_second": 216.84, "step": 3500 }, { "entropy": 5.992935609817505, "epoch": 3.011173184357542, "grad_norm": 1.046875, "learning_rate": 0.00044115045879165806, "loss": 5.563, "mean_token_accuracy": 0.18435313254594804, "num_tokens": 6373082.0, "step": 3505 }, { "entropy": 6.053584480285645, "epoch": 3.0154705629565965, "grad_norm": 1.1015625, "learning_rate": 0.00044092608032651515, "loss": 5.5261, "mean_token_accuracy": 0.1837206542491913, "num_tokens": 6381286.0, "step": 3510 }, { "entropy": 6.083251333236694, "epoch": 3.019767941555651, "grad_norm": 0.98046875, "learning_rate": 0.00044070133954513305, "loss": 5.4729, "mean_token_accuracy": 0.19432286769151688, "num_tokens": 6390217.0, "step": 3515 }, { "entropy": 6.058011102676391, "epoch": 3.0240653201547056, "grad_norm": 1.28125, "learning_rate": 0.0004404762369382555, "loss": 5.5036, "mean_token_accuracy": 0.18731357306241989, "num_tokens": 6399276.0, "step": 3520 }, { "entropy": 6.000890445709229, "epoch": 3.02836269875376, "grad_norm": 1.1640625, "learning_rate": 0.00044025077299741683, "loss": 5.4811, "mean_token_accuracy": 0.192198945581913, "num_tokens": 6407981.0, "step": 3525 }, { "entropy": 5.988429880142212, "epoch": 3.0326600773528147, "grad_norm": 1.125, "learning_rate": 0.00044002494821494007, "loss": 5.4804, "mean_token_accuracy": 0.18921354711055755, "num_tokens": 6416159.0, "step": 3530 }, { "entropy": 5.9463738918304445, "epoch": 3.0369574559518693, "grad_norm": 1.125, "learning_rate": 0.00043979876308393635, "loss": 5.531, "mean_token_accuracy": 0.1913963183760643, "num_tokens": 6424564.0, "step": 3535 }, { "entropy": 6.106854009628296, "epoch": 3.041254834550924, "grad_norm": 1.0234375, "learning_rate": 0.0004395722180983036, "loss": 5.5823, "mean_token_accuracy": 0.18249945044517518, "num_tokens": 6434163.0, "step": 3540 }, { "entropy": 5.950508308410645, "epoch": 3.0455522131499784, "grad_norm": 1.0625, "learning_rate": 0.00043934531375272535, "loss": 5.3919, "mean_token_accuracy": 0.20384220778942108, "num_tokens": 6443372.0, "step": 3545 }, { "entropy": 5.974466180801391, "epoch": 3.049849591749033, "grad_norm": 0.96875, "learning_rate": 0.00043911805054267015, "loss": 5.4833, "mean_token_accuracy": 0.18905829787254333, "num_tokens": 6452638.0, "step": 3550 }, { "entropy": 6.111138391494751, "epoch": 3.0541469703480875, "grad_norm": 1.0546875, "learning_rate": 0.00043889042896439004, "loss": 5.4924, "mean_token_accuracy": 0.19172994196414947, "num_tokens": 6461319.0, "step": 3555 }, { "entropy": 6.002539110183716, "epoch": 3.0584443489471425, "grad_norm": 1.3046875, "learning_rate": 0.00043866244951491946, "loss": 5.4305, "mean_token_accuracy": 0.1999826490879059, "num_tokens": 6469506.0, "step": 3560 }, { "entropy": 6.020529794692993, "epoch": 3.062741727546197, "grad_norm": 1.1171875, "learning_rate": 0.00043843411269207445, "loss": 5.4837, "mean_token_accuracy": 0.19121226519346238, "num_tokens": 6478404.0, "step": 3565 }, { "entropy": 5.9611005783081055, "epoch": 3.0670391061452515, "grad_norm": 1.09375, "learning_rate": 0.0004382054189944514, "loss": 5.433, "mean_token_accuracy": 0.18942490667104722, "num_tokens": 6487447.0, "step": 3570 }, { "entropy": 5.9097977638244625, "epoch": 3.071336484744306, "grad_norm": 1.0234375, "learning_rate": 0.0004379763689214259, "loss": 5.469, "mean_token_accuracy": 0.18396330773830413, "num_tokens": 6496738.0, "step": 3575 }, { "entropy": 6.013470220565796, "epoch": 3.0756338633433606, "grad_norm": 0.97265625, "learning_rate": 0.0004377469629731518, "loss": 5.4752, "mean_token_accuracy": 0.1895818755030632, "num_tokens": 6505848.0, "step": 3580 }, { "entropy": 6.006653928756714, "epoch": 3.079931241942415, "grad_norm": 1.015625, "learning_rate": 0.0004375172016505599, "loss": 5.4558, "mean_token_accuracy": 0.18824636489152907, "num_tokens": 6515731.0, "step": 3585 }, { "entropy": 5.979631328582764, "epoch": 3.0842286205414697, "grad_norm": 1.03125, "learning_rate": 0.0004372870854553572, "loss": 5.5152, "mean_token_accuracy": 0.18944674283266066, "num_tokens": 6524914.0, "step": 3590 }, { "entropy": 5.99342303276062, "epoch": 3.0885259991405243, "grad_norm": 1.0625, "learning_rate": 0.0004370566148900255, "loss": 5.4967, "mean_token_accuracy": 0.19440635293722153, "num_tokens": 6533712.0, "step": 3595 }, { "entropy": 6.0267222881317135, "epoch": 3.092823377739579, "grad_norm": 1.0625, "learning_rate": 0.00043682579045782024, "loss": 5.5786, "mean_token_accuracy": 0.18650965839624406, "num_tokens": 6543313.0, "step": 3600 }, { "entropy": 5.940178155899048, "epoch": 3.0971207563386334, "grad_norm": 1.1953125, "learning_rate": 0.0004365946126627699, "loss": 5.4649, "mean_token_accuracy": 0.19772678166627883, "num_tokens": 6551634.0, "step": 3605 }, { "entropy": 6.004144239425659, "epoch": 3.101418134937688, "grad_norm": 1.0546875, "learning_rate": 0.00043636308200967433, "loss": 5.4821, "mean_token_accuracy": 0.1942768707871437, "num_tokens": 6560695.0, "step": 3610 }, { "entropy": 5.857456827163697, "epoch": 3.1057155135367425, "grad_norm": 1.03125, "learning_rate": 0.0004361311990041039, "loss": 5.3753, "mean_token_accuracy": 0.19874223917722703, "num_tokens": 6569086.0, "step": 3615 }, { "entropy": 5.919683027267456, "epoch": 3.110012892135797, "grad_norm": 1.0859375, "learning_rate": 0.00043589896415239843, "loss": 5.4564, "mean_token_accuracy": 0.1986413672566414, "num_tokens": 6578287.0, "step": 3620 }, { "entropy": 5.956605434417725, "epoch": 3.1143102707348516, "grad_norm": 0.99609375, "learning_rate": 0.00043566637796166595, "loss": 5.5147, "mean_token_accuracy": 0.18752527385950088, "num_tokens": 6587015.0, "step": 3625 }, { "entropy": 5.9813155174255375, "epoch": 3.118607649333906, "grad_norm": 1.140625, "learning_rate": 0.00043543344093978186, "loss": 5.5585, "mean_token_accuracy": 0.18545775562524797, "num_tokens": 6596187.0, "step": 3630 }, { "entropy": 5.964481592178345, "epoch": 3.122905027932961, "grad_norm": 1.0703125, "learning_rate": 0.00043520015359538745, "loss": 5.4268, "mean_token_accuracy": 0.19721884578466414, "num_tokens": 6605226.0, "step": 3635 }, { "entropy": 5.862498092651367, "epoch": 3.1272024065320156, "grad_norm": 1.109375, "learning_rate": 0.0004349665164378891, "loss": 5.475, "mean_token_accuracy": 0.18966546505689622, "num_tokens": 6613232.0, "step": 3640 }, { "entropy": 5.976254987716675, "epoch": 3.13149978513107, "grad_norm": 1.0859375, "learning_rate": 0.00043473252997745684, "loss": 5.4789, "mean_token_accuracy": 0.18647109866142272, "num_tokens": 6622247.0, "step": 3645 }, { "entropy": 6.025827789306641, "epoch": 3.1357971637301247, "grad_norm": 1.71875, "learning_rate": 0.00043449819472502366, "loss": 5.4281, "mean_token_accuracy": 0.19298454523086547, "num_tokens": 6630883.0, "step": 3650 }, { "entropy": 5.921304559707641, "epoch": 3.1400945423291793, "grad_norm": 1.09375, "learning_rate": 0.0004342635111922841, "loss": 5.5595, "mean_token_accuracy": 0.18861598372459412, "num_tokens": 6639399.0, "step": 3655 }, { "entropy": 5.989827823638916, "epoch": 3.144391920928234, "grad_norm": 1.125, "learning_rate": 0.0004340284798916931, "loss": 5.483, "mean_token_accuracy": 0.19412256628274918, "num_tokens": 6649288.0, "step": 3660 }, { "entropy": 5.921028423309326, "epoch": 3.1486892995272884, "grad_norm": 1.0078125, "learning_rate": 0.0004337931013364653, "loss": 5.4165, "mean_token_accuracy": 0.19552054554224013, "num_tokens": 6658670.0, "step": 3665 }, { "entropy": 5.969826030731201, "epoch": 3.152986678126343, "grad_norm": 1.125, "learning_rate": 0.000433557376040573, "loss": 5.4991, "mean_token_accuracy": 0.1942813739180565, "num_tokens": 6667302.0, "step": 3670 }, { "entropy": 5.992925643920898, "epoch": 3.1572840567253975, "grad_norm": 1.0703125, "learning_rate": 0.00043332130451874645, "loss": 5.5383, "mean_token_accuracy": 0.1936521127820015, "num_tokens": 6677393.0, "step": 3675 }, { "entropy": 6.003905582427978, "epoch": 3.161581435324452, "grad_norm": 0.94140625, "learning_rate": 0.00043308488728647127, "loss": 5.5183, "mean_token_accuracy": 0.18625610321760178, "num_tokens": 6686727.0, "step": 3680 }, { "entropy": 5.899046134948731, "epoch": 3.1658788139235066, "grad_norm": 1.1796875, "learning_rate": 0.0004328481248599882, "loss": 5.4279, "mean_token_accuracy": 0.196131394803524, "num_tokens": 6696116.0, "step": 3685 }, { "entropy": 5.968793296813965, "epoch": 3.170176192522561, "grad_norm": 1.078125, "learning_rate": 0.0004326110177562918, "loss": 5.5429, "mean_token_accuracy": 0.18541710525751115, "num_tokens": 6704640.0, "step": 3690 }, { "entropy": 5.916857767105102, "epoch": 3.1744735711216157, "grad_norm": 1.203125, "learning_rate": 0.00043237356649312926, "loss": 5.3912, "mean_token_accuracy": 0.20387934297323226, "num_tokens": 6713663.0, "step": 3695 }, { "entropy": 5.932327318191528, "epoch": 3.17877094972067, "grad_norm": 1.0625, "learning_rate": 0.0004321357715889991, "loss": 5.526, "mean_token_accuracy": 0.1858012244105339, "num_tokens": 6722965.0, "step": 3700 }, { "entropy": 5.9681384563446045, "epoch": 3.1830683283197247, "grad_norm": 1.140625, "learning_rate": 0.0004318976335631505, "loss": 5.4893, "mean_token_accuracy": 0.19365193992853164, "num_tokens": 6732776.0, "step": 3705 }, { "entropy": 5.964018297195435, "epoch": 3.1873657069187797, "grad_norm": 1.046875, "learning_rate": 0.00043165915293558155, "loss": 5.4682, "mean_token_accuracy": 0.19091420918703078, "num_tokens": 6741309.0, "step": 3710 }, { "entropy": 5.944598436355591, "epoch": 3.1916630855178343, "grad_norm": 1.0546875, "learning_rate": 0.0004314203302270388, "loss": 5.5274, "mean_token_accuracy": 0.18904216587543488, "num_tokens": 6750584.0, "step": 3715 }, { "entropy": 5.97039303779602, "epoch": 3.195960464116889, "grad_norm": 1.1640625, "learning_rate": 0.0004311811659590154, "loss": 5.5007, "mean_token_accuracy": 0.1887460470199585, "num_tokens": 6759344.0, "step": 3720 }, { "entropy": 6.059423017501831, "epoch": 3.2002578427159434, "grad_norm": 0.87890625, "learning_rate": 0.0004309416606537507, "loss": 5.6563, "mean_token_accuracy": 0.18009912818670273, "num_tokens": 6770345.0, "step": 3725 }, { "entropy": 6.00485258102417, "epoch": 3.204555221314998, "grad_norm": 1.125, "learning_rate": 0.00043070181483422843, "loss": 5.5411, "mean_token_accuracy": 0.1854734942317009, "num_tokens": 6779991.0, "step": 3730 }, { "entropy": 5.88880934715271, "epoch": 3.2088525999140525, "grad_norm": 1.1953125, "learning_rate": 0.000430461629024176, "loss": 5.4983, "mean_token_accuracy": 0.19071830958127975, "num_tokens": 6788972.0, "step": 3735 }, { "entropy": 5.885913467407226, "epoch": 3.213149978513107, "grad_norm": 1.1640625, "learning_rate": 0.0004302211037480634, "loss": 5.4111, "mean_token_accuracy": 0.19531920850276946, "num_tokens": 6796967.0, "step": 3740 }, { "entropy": 5.912165975570678, "epoch": 3.2174473571121616, "grad_norm": 1.234375, "learning_rate": 0.0004299802395311015, "loss": 5.5182, "mean_token_accuracy": 0.18958668708801268, "num_tokens": 6805961.0, "step": 3745 }, { "entropy": 5.875810194015503, "epoch": 3.221744735711216, "grad_norm": 1.234375, "learning_rate": 0.0004297390368992414, "loss": 5.4233, "mean_token_accuracy": 0.19228914380073547, "num_tokens": 6814657.0, "step": 3750 }, { "entropy": 5.940344333648682, "epoch": 3.2260421143102707, "grad_norm": 1.2265625, "learning_rate": 0.00042949749637917353, "loss": 5.4718, "mean_token_accuracy": 0.1930217519402504, "num_tokens": 6823095.0, "step": 3755 }, { "entropy": 5.956659030914307, "epoch": 3.230339492909325, "grad_norm": 0.99609375, "learning_rate": 0.0004292556184983256, "loss": 5.4872, "mean_token_accuracy": 0.19027772098779677, "num_tokens": 6832195.0, "step": 3760 }, { "entropy": 6.009495830535888, "epoch": 3.2346368715083798, "grad_norm": 1.15625, "learning_rate": 0.0004290134037848623, "loss": 5.6084, "mean_token_accuracy": 0.18570149838924407, "num_tokens": 6840922.0, "step": 3765 }, { "entropy": 5.964060831069946, "epoch": 3.2389342501074343, "grad_norm": 1.171875, "learning_rate": 0.00042877085276768386, "loss": 5.46, "mean_token_accuracy": 0.19570931494235994, "num_tokens": 6849182.0, "step": 3770 }, { "entropy": 5.94105863571167, "epoch": 3.243231628706489, "grad_norm": 1.1015625, "learning_rate": 0.00042852796597642455, "loss": 5.4551, "mean_token_accuracy": 0.19768441170454026, "num_tokens": 6857932.0, "step": 3775 }, { "entropy": 5.997882509231568, "epoch": 3.247529007305544, "grad_norm": 1.0859375, "learning_rate": 0.0004282847439414522, "loss": 5.616, "mean_token_accuracy": 0.17659982144832612, "num_tokens": 6867283.0, "step": 3780 }, { "entropy": 6.0180786609649655, "epoch": 3.2518263859045984, "grad_norm": 1.078125, "learning_rate": 0.0004280411871938664, "loss": 5.5648, "mean_token_accuracy": 0.18943356424570085, "num_tokens": 6876123.0, "step": 3785 }, { "entropy": 6.006447601318359, "epoch": 3.256123764503653, "grad_norm": 1.1796875, "learning_rate": 0.0004277972962654979, "loss": 5.5082, "mean_token_accuracy": 0.18536664098501204, "num_tokens": 6885239.0, "step": 3790 }, { "entropy": 5.930108880996704, "epoch": 3.2604211431027075, "grad_norm": 1.0859375, "learning_rate": 0.0004275530716889069, "loss": 5.5573, "mean_token_accuracy": 0.18274880945682526, "num_tokens": 6895061.0, "step": 3795 }, { "entropy": 5.983970260620117, "epoch": 3.264718521701762, "grad_norm": 1.2265625, "learning_rate": 0.0004273085139973822, "loss": 5.5993, "mean_token_accuracy": 0.177694109082222, "num_tokens": 6903828.0, "step": 3800 }, { "entropy": 6.014524221420288, "epoch": 3.2690159003008166, "grad_norm": 1.140625, "learning_rate": 0.0004270636237249401, "loss": 5.5151, "mean_token_accuracy": 0.18856608420610427, "num_tokens": 6912805.0, "step": 3805 }, { "entropy": 5.941100168228149, "epoch": 3.273313278899871, "grad_norm": 1.1015625, "learning_rate": 0.00042681840140632314, "loss": 5.5616, "mean_token_accuracy": 0.18302462846040726, "num_tokens": 6922165.0, "step": 3810 }, { "entropy": 5.997183227539063, "epoch": 3.2776106574989257, "grad_norm": 1.0859375, "learning_rate": 0.0004265728475769989, "loss": 5.5322, "mean_token_accuracy": 0.18632204383611678, "num_tokens": 6931677.0, "step": 3815 }, { "entropy": 5.975349044799804, "epoch": 3.28190803609798, "grad_norm": 0.97265625, "learning_rate": 0.0004263269627731586, "loss": 5.4952, "mean_token_accuracy": 0.19264112412929535, "num_tokens": 6940486.0, "step": 3820 }, { "entropy": 5.868766260147095, "epoch": 3.2862054146970348, "grad_norm": 1.1015625, "learning_rate": 0.0004260807475317164, "loss": 5.51, "mean_token_accuracy": 0.1856775924563408, "num_tokens": 6948990.0, "step": 3825 }, { "entropy": 6.010857200622558, "epoch": 3.2905027932960893, "grad_norm": 1.0234375, "learning_rate": 0.0004258342023903081, "loss": 5.636, "mean_token_accuracy": 0.17837173044681548, "num_tokens": 6959311.0, "step": 3830 }, { "entropy": 6.02067198753357, "epoch": 3.294800171895144, "grad_norm": 1.125, "learning_rate": 0.00042558732788728975, "loss": 5.4186, "mean_token_accuracy": 0.19980644732713698, "num_tokens": 6968619.0, "step": 3835 }, { "entropy": 5.891939735412597, "epoch": 3.2990975504941984, "grad_norm": 1.09375, "learning_rate": 0.00042534012456173643, "loss": 5.4745, "mean_token_accuracy": 0.1930858761072159, "num_tokens": 6977469.0, "step": 3840 }, { "entropy": 5.908893871307373, "epoch": 3.303394929093253, "grad_norm": 1.2421875, "learning_rate": 0.00042509259295344157, "loss": 5.4637, "mean_token_accuracy": 0.18524923622608186, "num_tokens": 6986772.0, "step": 3845 }, { "entropy": 5.965682172775269, "epoch": 3.3076923076923075, "grad_norm": 1.2578125, "learning_rate": 0.00042484473360291514, "loss": 5.4722, "mean_token_accuracy": 0.1818112000823021, "num_tokens": 6993937.0, "step": 3850 }, { "entropy": 5.878727436065674, "epoch": 3.311989686291362, "grad_norm": 1.1328125, "learning_rate": 0.00042459654705138294, "loss": 5.5336, "mean_token_accuracy": 0.19061464071273804, "num_tokens": 7003222.0, "step": 3855 }, { "entropy": 5.907388973236084, "epoch": 3.316287064890417, "grad_norm": 1.109375, "learning_rate": 0.0004243480338407853, "loss": 5.5021, "mean_token_accuracy": 0.19867320060729982, "num_tokens": 7012055.0, "step": 3860 }, { "entropy": 5.968272018432617, "epoch": 3.3205844434894716, "grad_norm": 1.078125, "learning_rate": 0.0004240991945137755, "loss": 5.4952, "mean_token_accuracy": 0.1932666853070259, "num_tokens": 7021036.0, "step": 3865 }, { "entropy": 5.909445858001709, "epoch": 3.324881822088526, "grad_norm": 1.1328125, "learning_rate": 0.00042385002961371944, "loss": 5.4787, "mean_token_accuracy": 0.194594843685627, "num_tokens": 7030450.0, "step": 3870 }, { "entropy": 6.005906677246093, "epoch": 3.3291792006875807, "grad_norm": 1.1640625, "learning_rate": 0.0004236005396846935, "loss": 5.5873, "mean_token_accuracy": 0.18787091970443726, "num_tokens": 7039740.0, "step": 3875 }, { "entropy": 6.0099263191223145, "epoch": 3.333476579286635, "grad_norm": 1.125, "learning_rate": 0.00042335072527148406, "loss": 5.5642, "mean_token_accuracy": 0.18891336619853974, "num_tokens": 7050430.0, "step": 3880 }, { "entropy": 5.886811065673828, "epoch": 3.3377739578856898, "grad_norm": 1.25, "learning_rate": 0.0004231005869195859, "loss": 5.5523, "mean_token_accuracy": 0.18664977699518204, "num_tokens": 7059477.0, "step": 3885 }, { "entropy": 5.945472669601441, "epoch": 3.3420713364847443, "grad_norm": 1.4296875, "learning_rate": 0.0004228501251752011, "loss": 5.4871, "mean_token_accuracy": 0.19109417051076888, "num_tokens": 7067805.0, "step": 3890 }, { "entropy": 5.942922163009643, "epoch": 3.346368715083799, "grad_norm": 1.0625, "learning_rate": 0.00042259934058523814, "loss": 5.4972, "mean_token_accuracy": 0.18601811528205872, "num_tokens": 7077606.0, "step": 3895 }, { "entropy": 5.984446573257446, "epoch": 3.3506660936828534, "grad_norm": 1.15625, "learning_rate": 0.00042234823369731027, "loss": 5.448, "mean_token_accuracy": 0.19036031365394593, "num_tokens": 7085647.0, "step": 3900 }, { "entropy": 5.861058759689331, "epoch": 3.354963472281908, "grad_norm": 1.1171875, "learning_rate": 0.00042209680505973465, "loss": 5.4762, "mean_token_accuracy": 0.19057320803403854, "num_tokens": 7095298.0, "step": 3905 }, { "entropy": 5.868588638305664, "epoch": 3.3592608508809625, "grad_norm": 1.0546875, "learning_rate": 0.0004218450552215308, "loss": 5.5542, "mean_token_accuracy": 0.19133240431547166, "num_tokens": 7105207.0, "step": 3910 }, { "entropy": 5.973352527618408, "epoch": 3.363558229480017, "grad_norm": 1.0390625, "learning_rate": 0.0004215929847324199, "loss": 5.6046, "mean_token_accuracy": 0.18282657265663146, "num_tokens": 7114833.0, "step": 3915 }, { "entropy": 6.0064185619354244, "epoch": 3.3678556080790716, "grad_norm": 1.1875, "learning_rate": 0.000421340594142823, "loss": 5.4227, "mean_token_accuracy": 0.20140644013881684, "num_tokens": 7123608.0, "step": 3920 }, { "entropy": 5.875625896453857, "epoch": 3.3721529866781266, "grad_norm": 1.21875, "learning_rate": 0.00042108788400386035, "loss": 5.4824, "mean_token_accuracy": 0.19125625491142273, "num_tokens": 7132250.0, "step": 3925 }, { "entropy": 5.91867356300354, "epoch": 3.376450365277181, "grad_norm": 0.99609375, "learning_rate": 0.0004208348548673498, "loss": 5.5796, "mean_token_accuracy": 0.18955173790454866, "num_tokens": 7142086.0, "step": 3930 }, { "entropy": 5.989838075637818, "epoch": 3.3807477438762357, "grad_norm": 1.125, "learning_rate": 0.000420581507285806, "loss": 5.525, "mean_token_accuracy": 0.1797061249613762, "num_tokens": 7152434.0, "step": 3935 }, { "entropy": 5.870218181610108, "epoch": 3.38504512247529, "grad_norm": 1.046875, "learning_rate": 0.0004203278418124386, "loss": 5.4707, "mean_token_accuracy": 0.19644346386194228, "num_tokens": 7163041.0, "step": 3940 }, { "entropy": 5.865656518936158, "epoch": 3.3893425010743448, "grad_norm": 1.0390625, "learning_rate": 0.0004200738590011518, "loss": 5.4512, "mean_token_accuracy": 0.19743987321853637, "num_tokens": 7171875.0, "step": 3945 }, { "entropy": 5.906575489044189, "epoch": 3.3936398796733993, "grad_norm": 1.1484375, "learning_rate": 0.00041981955940654245, "loss": 5.5679, "mean_token_accuracy": 0.18974538147449493, "num_tokens": 7180803.0, "step": 3950 }, { "entropy": 5.951998472213745, "epoch": 3.397937258272454, "grad_norm": 1.1171875, "learning_rate": 0.0004195649435838992, "loss": 5.5884, "mean_token_accuracy": 0.17947447150945664, "num_tokens": 7190661.0, "step": 3955 }, { "entropy": 5.871505403518677, "epoch": 3.4022346368715084, "grad_norm": 1.09375, "learning_rate": 0.0004193100120892013, "loss": 5.418, "mean_token_accuracy": 0.19889674335718155, "num_tokens": 7199357.0, "step": 3960 }, { "entropy": 5.934350156784058, "epoch": 3.406532015470563, "grad_norm": 0.99609375, "learning_rate": 0.0004190547654791172, "loss": 5.597, "mean_token_accuracy": 0.18219801187515258, "num_tokens": 7209856.0, "step": 3965 }, { "entropy": 5.969940042495727, "epoch": 3.4108293940696175, "grad_norm": 1.2265625, "learning_rate": 0.00041879920431100347, "loss": 5.5648, "mean_token_accuracy": 0.17899948358535767, "num_tokens": 7218778.0, "step": 3970 }, { "entropy": 5.924646472930908, "epoch": 3.415126772668672, "grad_norm": 1.1171875, "learning_rate": 0.0004185433291429036, "loss": 5.5802, "mean_token_accuracy": 0.18834476321935653, "num_tokens": 7228442.0, "step": 3975 }, { "entropy": 5.978606748580932, "epoch": 3.4194241512677266, "grad_norm": 1.171875, "learning_rate": 0.00041828714053354665, "loss": 5.5653, "mean_token_accuracy": 0.18292482793331147, "num_tokens": 7238724.0, "step": 3980 }, { "entropy": 5.850194692611694, "epoch": 3.423721529866781, "grad_norm": 1.078125, "learning_rate": 0.0004180306390423462, "loss": 5.5145, "mean_token_accuracy": 0.19443774223327637, "num_tokens": 7247844.0, "step": 3985 }, { "entropy": 5.919923639297485, "epoch": 3.4280189084658357, "grad_norm": 1.0703125, "learning_rate": 0.00041777382522939884, "loss": 5.5776, "mean_token_accuracy": 0.1839929461479187, "num_tokens": 7257260.0, "step": 3990 }, { "entropy": 5.963938665390015, "epoch": 3.4323162870648902, "grad_norm": 0.9921875, "learning_rate": 0.00041751669965548344, "loss": 5.5802, "mean_token_accuracy": 0.1809097185730934, "num_tokens": 7266890.0, "step": 3995 }, { "entropy": 5.974624681472778, "epoch": 3.4366136656639448, "grad_norm": 1.1484375, "learning_rate": 0.00041725926288205945, "loss": 5.598, "mean_token_accuracy": 0.17821378856897355, "num_tokens": 7276114.0, "step": 4000 }, { "epoch": 3.4366136656639448, "eval_entropy": 5.73526575543859, "eval_loss": 6.016810417175293, "eval_mean_token_accuracy": 0.17057843910748358, "eval_num_tokens": 7276114.0, "eval_runtime": 2.0499, "eval_samples_per_second": 1731.264, "eval_steps_per_second": 216.591, "step": 4000 }, { "entropy": 5.9616344451904295, "epoch": 3.4409110442629998, "grad_norm": 1.078125, "learning_rate": 0.0004170015154712658, "loss": 5.548, "mean_token_accuracy": 0.1874366208910942, "num_tokens": 7284426.0, "step": 4005 }, { "entropy": 5.910069179534912, "epoch": 3.4452084228620543, "grad_norm": 1.015625, "learning_rate": 0.00041674345798591993, "loss": 5.5843, "mean_token_accuracy": 0.18420783281326295, "num_tokens": 7294813.0, "step": 4010 }, { "entropy": 5.961581373214722, "epoch": 3.449505801461109, "grad_norm": 1.0546875, "learning_rate": 0.0004164850909895161, "loss": 5.5619, "mean_token_accuracy": 0.18809896260499953, "num_tokens": 7304655.0, "step": 4015 }, { "entropy": 5.849625158309936, "epoch": 3.4538031800601634, "grad_norm": 1.0, "learning_rate": 0.0004162264150462247, "loss": 5.5155, "mean_token_accuracy": 0.1865479052066803, "num_tokens": 7313610.0, "step": 4020 }, { "entropy": 5.980514192581177, "epoch": 3.458100558659218, "grad_norm": 1.1171875, "learning_rate": 0.00041596743072089065, "loss": 5.5535, "mean_token_accuracy": 0.19074880033731462, "num_tokens": 7322243.0, "step": 4025 }, { "entropy": 6.062830209732056, "epoch": 3.4623979372582725, "grad_norm": 1.1953125, "learning_rate": 0.000415708138579032, "loss": 5.5229, "mean_token_accuracy": 0.17943777292966842, "num_tokens": 7331040.0, "step": 4030 }, { "entropy": 5.886963891983032, "epoch": 3.466695315857327, "grad_norm": 1.1171875, "learning_rate": 0.00041544853918683923, "loss": 5.5948, "mean_token_accuracy": 0.1817588433623314, "num_tokens": 7340771.0, "step": 4035 }, { "entropy": 5.9117542743682865, "epoch": 3.4709926944563816, "grad_norm": 1.0625, "learning_rate": 0.0004151886331111737, "loss": 5.6421, "mean_token_accuracy": 0.18092233091592788, "num_tokens": 7349960.0, "step": 4040 }, { "entropy": 5.899527883529663, "epoch": 3.475290073055436, "grad_norm": 1.1796875, "learning_rate": 0.00041492842091956646, "loss": 5.4649, "mean_token_accuracy": 0.1919792726635933, "num_tokens": 7357983.0, "step": 4045 }, { "entropy": 5.988178062438965, "epoch": 3.4795874516544907, "grad_norm": 1.1015625, "learning_rate": 0.0004146679031802167, "loss": 5.591, "mean_token_accuracy": 0.19019764959812163, "num_tokens": 7366814.0, "step": 4050 }, { "entropy": 5.9325186252594, "epoch": 3.4838848302535452, "grad_norm": 1.1953125, "learning_rate": 0.00041440708046199123, "loss": 5.452, "mean_token_accuracy": 0.19600227922201158, "num_tokens": 7374773.0, "step": 4055 }, { "entropy": 5.890796184539795, "epoch": 3.4881822088525998, "grad_norm": 1.0625, "learning_rate": 0.0004141459533344226, "loss": 5.5562, "mean_token_accuracy": 0.1825706109404564, "num_tokens": 7383937.0, "step": 4060 }, { "entropy": 5.957454347610474, "epoch": 3.4924795874516543, "grad_norm": 1.1015625, "learning_rate": 0.00041388452236770795, "loss": 5.5305, "mean_token_accuracy": 0.18163443803787233, "num_tokens": 7392577.0, "step": 4065 }, { "entropy": 5.882272720336914, "epoch": 3.4967769660507093, "grad_norm": 1.125, "learning_rate": 0.00041362278813270823, "loss": 5.4193, "mean_token_accuracy": 0.20885447710752486, "num_tokens": 7401473.0, "step": 4070 }, { "entropy": 5.992699241638183, "epoch": 3.501074344649764, "grad_norm": 1.0234375, "learning_rate": 0.00041336075120094616, "loss": 5.6214, "mean_token_accuracy": 0.17333737909793853, "num_tokens": 7410831.0, "step": 4075 }, { "entropy": 6.0088804244995115, "epoch": 3.5053717232488184, "grad_norm": 1.0390625, "learning_rate": 0.00041309841214460586, "loss": 5.6193, "mean_token_accuracy": 0.18231521993875505, "num_tokens": 7421563.0, "step": 4080 }, { "entropy": 5.887757968902588, "epoch": 3.509669101847873, "grad_norm": 1.1171875, "learning_rate": 0.0004128357715365309, "loss": 5.5266, "mean_token_accuracy": 0.191811466217041, "num_tokens": 7430174.0, "step": 4085 }, { "entropy": 5.899808502197265, "epoch": 3.5139664804469275, "grad_norm": 1.0703125, "learning_rate": 0.00041257282995022345, "loss": 5.4928, "mean_token_accuracy": 0.1953655794262886, "num_tokens": 7439034.0, "step": 4090 }, { "entropy": 5.912106704711914, "epoch": 3.518263859045982, "grad_norm": 1.359375, "learning_rate": 0.0004123095879598426, "loss": 5.5195, "mean_token_accuracy": 0.18628203123807907, "num_tokens": 7447663.0, "step": 4095 }, { "entropy": 5.960794830322266, "epoch": 3.5225612376450366, "grad_norm": 1.0625, "learning_rate": 0.00041204604614020397, "loss": 5.6081, "mean_token_accuracy": 0.17660218775272368, "num_tokens": 7456615.0, "step": 4100 }, { "entropy": 5.996097373962402, "epoch": 3.526858616244091, "grad_norm": 1.09375, "learning_rate": 0.0004117822050667773, "loss": 5.6382, "mean_token_accuracy": 0.18591019809246062, "num_tokens": 7466203.0, "step": 4105 }, { "entropy": 5.9893563747406, "epoch": 3.5311559948431457, "grad_norm": 1.09375, "learning_rate": 0.00041151806531568617, "loss": 5.5802, "mean_token_accuracy": 0.18335504829883575, "num_tokens": 7475411.0, "step": 4110 }, { "entropy": 5.906181669235229, "epoch": 3.5354533734422002, "grad_norm": 1.0390625, "learning_rate": 0.00041125362746370625, "loss": 5.6004, "mean_token_accuracy": 0.18042974472045897, "num_tokens": 7484965.0, "step": 4115 }, { "entropy": 5.995426511764526, "epoch": 3.5397507520412548, "grad_norm": 1.09375, "learning_rate": 0.0004109888920882639, "loss": 5.5249, "mean_token_accuracy": 0.19167679399251938, "num_tokens": 7494240.0, "step": 4120 }, { "entropy": 5.949258327484131, "epoch": 3.5440481306403093, "grad_norm": 1.0625, "learning_rate": 0.0004107238597674356, "loss": 5.5586, "mean_token_accuracy": 0.18614224940538407, "num_tokens": 7503560.0, "step": 4125 }, { "entropy": 5.863224458694458, "epoch": 3.548345509239364, "grad_norm": 1.0078125, "learning_rate": 0.000410458531079946, "loss": 5.4812, "mean_token_accuracy": 0.19368503391742706, "num_tokens": 7512650.0, "step": 4130 }, { "entropy": 5.9348499298095705, "epoch": 3.5526428878384184, "grad_norm": 1.1640625, "learning_rate": 0.0004101929066051668, "loss": 5.599, "mean_token_accuracy": 0.1838935688138008, "num_tokens": 7521864.0, "step": 4135 }, { "entropy": 5.878848266601563, "epoch": 3.556940266437473, "grad_norm": 1.1171875, "learning_rate": 0.0004099269869231157, "loss": 5.496, "mean_token_accuracy": 0.19109761267900466, "num_tokens": 7531013.0, "step": 4140 }, { "entropy": 5.948237895965576, "epoch": 3.5612376450365275, "grad_norm": 1.046875, "learning_rate": 0.00040966077261445495, "loss": 5.503, "mean_token_accuracy": 0.1837790846824646, "num_tokens": 7539959.0, "step": 4145 }, { "entropy": 6.009708642959595, "epoch": 3.565535023635582, "grad_norm": 1.28125, "learning_rate": 0.0004093942642604904, "loss": 5.4789, "mean_token_accuracy": 0.19033878594636916, "num_tokens": 7548354.0, "step": 4150 }, { "entropy": 5.921438217163086, "epoch": 3.5698324022346366, "grad_norm": 1.03125, "learning_rate": 0.00040912746244316944, "loss": 5.6032, "mean_token_accuracy": 0.18626796901226045, "num_tokens": 7558321.0, "step": 4155 }, { "entropy": 5.902405214309693, "epoch": 3.5741297808336916, "grad_norm": 1.046875, "learning_rate": 0.00040886036774508095, "loss": 5.4904, "mean_token_accuracy": 0.18896115869283675, "num_tokens": 7567889.0, "step": 4160 }, { "entropy": 5.9710170269012455, "epoch": 3.578427159432746, "grad_norm": 1.0859375, "learning_rate": 0.0004085929807494527, "loss": 5.5489, "mean_token_accuracy": 0.1867457315325737, "num_tokens": 7576752.0, "step": 4165 }, { "entropy": 5.900749206542969, "epoch": 3.5827245380318007, "grad_norm": 1.015625, "learning_rate": 0.0004083253020401512, "loss": 5.4498, "mean_token_accuracy": 0.19864338636398315, "num_tokens": 7585413.0, "step": 4170 }, { "entropy": 5.9034223556518555, "epoch": 3.5870219166308552, "grad_norm": 1.234375, "learning_rate": 0.0004080573322016797, "loss": 5.4085, "mean_token_accuracy": 0.19775232523679734, "num_tokens": 7593966.0, "step": 4175 }, { "entropy": 5.905447053909302, "epoch": 3.59131929522991, "grad_norm": 1.09375, "learning_rate": 0.0004077890718191773, "loss": 5.4219, "mean_token_accuracy": 0.19463559091091157, "num_tokens": 7602746.0, "step": 4180 }, { "entropy": 5.888575172424316, "epoch": 3.5956166738289643, "grad_norm": 1.15625, "learning_rate": 0.00040752052147841733, "loss": 5.485, "mean_token_accuracy": 0.18464642763137817, "num_tokens": 7611245.0, "step": 4185 }, { "entropy": 5.9167564868927, "epoch": 3.599914052428019, "grad_norm": 1.0234375, "learning_rate": 0.0004072516817658065, "loss": 5.5085, "mean_token_accuracy": 0.19180469512939452, "num_tokens": 7620234.0, "step": 4190 }, { "entropy": 5.9288722515106205, "epoch": 3.6042114310270734, "grad_norm": 1.140625, "learning_rate": 0.0004069825532683831, "loss": 5.5362, "mean_token_accuracy": 0.19008248895406724, "num_tokens": 7629794.0, "step": 4195 }, { "entropy": 5.883164501190185, "epoch": 3.608508809626128, "grad_norm": 1.046875, "learning_rate": 0.00040671313657381645, "loss": 5.4768, "mean_token_accuracy": 0.19734710156917573, "num_tokens": 7639497.0, "step": 4200 }, { "entropy": 5.833352327346802, "epoch": 3.6128061882251825, "grad_norm": 1.09375, "learning_rate": 0.00040644343227040473, "loss": 5.4305, "mean_token_accuracy": 0.192035111784935, "num_tokens": 7647647.0, "step": 4205 }, { "entropy": 5.882366132736206, "epoch": 3.617103566824237, "grad_norm": 1.046875, "learning_rate": 0.0004061734409470745, "loss": 5.6069, "mean_token_accuracy": 0.18727213144302368, "num_tokens": 7657988.0, "step": 4210 }, { "entropy": 5.946136331558227, "epoch": 3.621400945423292, "grad_norm": 1.078125, "learning_rate": 0.0004059031631933788, "loss": 5.5226, "mean_token_accuracy": 0.18810444325208664, "num_tokens": 7667498.0, "step": 4215 }, { "entropy": 5.928274488449096, "epoch": 3.6256983240223466, "grad_norm": 1.0625, "learning_rate": 0.00040563259959949615, "loss": 5.6612, "mean_token_accuracy": 0.17574882060289382, "num_tokens": 7677386.0, "step": 4220 }, { "entropy": 6.023345851898194, "epoch": 3.629995702621401, "grad_norm": 1.0546875, "learning_rate": 0.0004053617507562295, "loss": 5.4993, "mean_token_accuracy": 0.1883416697382927, "num_tokens": 7686643.0, "step": 4225 }, { "entropy": 5.927192258834839, "epoch": 3.6342930812204557, "grad_norm": 1.2265625, "learning_rate": 0.00040509061725500426, "loss": 5.5344, "mean_token_accuracy": 0.18648910969495774, "num_tokens": 7695089.0, "step": 4230 }, { "entropy": 5.855798292160034, "epoch": 3.6385904598195102, "grad_norm": 1.078125, "learning_rate": 0.0004048191996878677, "loss": 5.5169, "mean_token_accuracy": 0.18715409338474273, "num_tokens": 7703854.0, "step": 4235 }, { "entropy": 5.873931074142456, "epoch": 3.642887838418565, "grad_norm": 1.0859375, "learning_rate": 0.00040454749864748734, "loss": 5.4623, "mean_token_accuracy": 0.1924944058060646, "num_tokens": 7712903.0, "step": 4240 }, { "entropy": 5.9368483543396, "epoch": 3.6471852170176193, "grad_norm": 1.0, "learning_rate": 0.0004042755147271496, "loss": 5.4073, "mean_token_accuracy": 0.19578560292720795, "num_tokens": 7721701.0, "step": 4245 }, { "entropy": 5.814197635650634, "epoch": 3.651482595616674, "grad_norm": 0.9921875, "learning_rate": 0.0004040032485207587, "loss": 5.5316, "mean_token_accuracy": 0.18780674338340758, "num_tokens": 7731318.0, "step": 4250 }, { "entropy": 5.960366725921631, "epoch": 3.6557799742157284, "grad_norm": 0.9921875, "learning_rate": 0.0004037307006228352, "loss": 5.4563, "mean_token_accuracy": 0.19457500725984572, "num_tokens": 7740413.0, "step": 4255 }, { "entropy": 5.894597911834717, "epoch": 3.660077352814783, "grad_norm": 1.0703125, "learning_rate": 0.0004034578716285147, "loss": 5.4362, "mean_token_accuracy": 0.19790690541267394, "num_tokens": 7749054.0, "step": 4260 }, { "entropy": 5.855839014053345, "epoch": 3.6643747314138375, "grad_norm": 1.2109375, "learning_rate": 0.0004031847621335467, "loss": 5.4711, "mean_token_accuracy": 0.19566139876842498, "num_tokens": 7757366.0, "step": 4265 }, { "entropy": 5.889632892608643, "epoch": 3.668672110012892, "grad_norm": 1.2578125, "learning_rate": 0.0004029113727342933, "loss": 5.502, "mean_token_accuracy": 0.19420932680368425, "num_tokens": 7766471.0, "step": 4270 }, { "entropy": 5.851235818862915, "epoch": 3.6729694886119466, "grad_norm": 1.09375, "learning_rate": 0.00040263770402772746, "loss": 5.4897, "mean_token_accuracy": 0.1871536925435066, "num_tokens": 7775920.0, "step": 4275 }, { "entropy": 5.934095287322998, "epoch": 3.677266867211001, "grad_norm": 1.1953125, "learning_rate": 0.0004023637566114325, "loss": 5.5382, "mean_token_accuracy": 0.1889081373810768, "num_tokens": 7784530.0, "step": 4280 }, { "entropy": 5.93968391418457, "epoch": 3.6815642458100557, "grad_norm": 1.09375, "learning_rate": 0.0004020895310835999, "loss": 5.4721, "mean_token_accuracy": 0.1917961835861206, "num_tokens": 7793656.0, "step": 4285 }, { "entropy": 5.9000050067901615, "epoch": 3.6858616244091102, "grad_norm": 1.0234375, "learning_rate": 0.00040181502804302865, "loss": 5.496, "mean_token_accuracy": 0.1914617270231247, "num_tokens": 7802185.0, "step": 4290 }, { "entropy": 5.8633284091949465, "epoch": 3.690159003008165, "grad_norm": 1.171875, "learning_rate": 0.00040154024808912377, "loss": 5.483, "mean_token_accuracy": 0.19215791970491408, "num_tokens": 7810345.0, "step": 4295 }, { "entropy": 5.897251462936401, "epoch": 3.6944563816072193, "grad_norm": 1.15625, "learning_rate": 0.0004012651918218947, "loss": 5.5314, "mean_token_accuracy": 0.1837465301156044, "num_tokens": 7818998.0, "step": 4300 }, { "entropy": 5.959916353225708, "epoch": 3.6987537602062743, "grad_norm": 1.0703125, "learning_rate": 0.0004009898598419544, "loss": 5.6474, "mean_token_accuracy": 0.17348452657461166, "num_tokens": 7828638.0, "step": 4305 }, { "entropy": 5.956097745895386, "epoch": 3.703051138805329, "grad_norm": 1.171875, "learning_rate": 0.000400714252750518, "loss": 5.622, "mean_token_accuracy": 0.1802245110273361, "num_tokens": 7838812.0, "step": 4310 }, { "entropy": 5.987325286865234, "epoch": 3.7073485174043834, "grad_norm": 1.1171875, "learning_rate": 0.0004004383711494011, "loss": 5.5288, "mean_token_accuracy": 0.19345352202653884, "num_tokens": 7847458.0, "step": 4315 }, { "entropy": 5.95421142578125, "epoch": 3.711645896003438, "grad_norm": 1.03125, "learning_rate": 0.0004001622156410189, "loss": 5.5496, "mean_token_accuracy": 0.18483526557683944, "num_tokens": 7856553.0, "step": 4320 }, { "entropy": 5.850839233398437, "epoch": 3.7159432746024925, "grad_norm": 1.0546875, "learning_rate": 0.00039988578682838467, "loss": 5.4869, "mean_token_accuracy": 0.18971165865659714, "num_tokens": 7864788.0, "step": 4325 }, { "entropy": 5.903116130828858, "epoch": 3.720240653201547, "grad_norm": 1.0390625, "learning_rate": 0.00039960908531510843, "loss": 5.484, "mean_token_accuracy": 0.19329809993505478, "num_tokens": 7873850.0, "step": 4330 }, { "entropy": 5.974154853820801, "epoch": 3.7245380318006016, "grad_norm": 1.1484375, "learning_rate": 0.0003993321117053956, "loss": 5.6039, "mean_token_accuracy": 0.18225040286779404, "num_tokens": 7882775.0, "step": 4335 }, { "entropy": 5.980661678314209, "epoch": 3.728835410399656, "grad_norm": 1.1015625, "learning_rate": 0.00039905486660404604, "loss": 5.5353, "mean_token_accuracy": 0.18522801846265793, "num_tokens": 7890570.0, "step": 4340 }, { "entropy": 5.8748914241790775, "epoch": 3.7331327889987107, "grad_norm": 1.015625, "learning_rate": 0.00039877735061645206, "loss": 5.5033, "mean_token_accuracy": 0.1971554860472679, "num_tokens": 7900090.0, "step": 4345 }, { "entropy": 5.934943914413452, "epoch": 3.7374301675977653, "grad_norm": 1.171875, "learning_rate": 0.0003984995643485977, "loss": 5.5358, "mean_token_accuracy": 0.18585693091154099, "num_tokens": 7908077.0, "step": 4350 }, { "entropy": 5.9528398513793945, "epoch": 3.74172754619682, "grad_norm": 1.421875, "learning_rate": 0.00039822150840705716, "loss": 5.5391, "mean_token_accuracy": 0.19125075042247772, "num_tokens": 7916290.0, "step": 4355 }, { "entropy": 5.999798917770386, "epoch": 3.746024924795875, "grad_norm": 1.1875, "learning_rate": 0.00039794318339899347, "loss": 5.6233, "mean_token_accuracy": 0.17912040501832963, "num_tokens": 7925835.0, "step": 4360 }, { "entropy": 5.929653787612915, "epoch": 3.7503223033949293, "grad_norm": 1.0703125, "learning_rate": 0.00039766458993215726, "loss": 5.5867, "mean_token_accuracy": 0.18147629946470262, "num_tokens": 7935076.0, "step": 4365 }, { "entropy": 5.84507122039795, "epoch": 3.754619681993984, "grad_norm": 1.0390625, "learning_rate": 0.00039738572861488527, "loss": 5.4837, "mean_token_accuracy": 0.19409503191709518, "num_tokens": 7943958.0, "step": 4370 }, { "entropy": 5.907137012481689, "epoch": 3.7589170605930384, "grad_norm": 1.03125, "learning_rate": 0.000397106600056099, "loss": 5.5211, "mean_token_accuracy": 0.18553533554077148, "num_tokens": 7953189.0, "step": 4375 }, { "entropy": 5.878173971176148, "epoch": 3.763214439192093, "grad_norm": 0.9765625, "learning_rate": 0.0003968272048653039, "loss": 5.4441, "mean_token_accuracy": 0.19779548197984695, "num_tokens": 7962927.0, "step": 4380 }, { "entropy": 5.8026800632476805, "epoch": 3.7675118177911475, "grad_norm": 1.015625, "learning_rate": 0.0003965475436525873, "loss": 5.4712, "mean_token_accuracy": 0.197597499191761, "num_tokens": 7973087.0, "step": 4385 }, { "entropy": 5.8803709030151365, "epoch": 3.771809196390202, "grad_norm": 1.0078125, "learning_rate": 0.0003962676170286174, "loss": 5.4288, "mean_token_accuracy": 0.1919528603553772, "num_tokens": 7982535.0, "step": 4390 }, { "entropy": 5.943622827529907, "epoch": 3.7761065749892566, "grad_norm": 1.1796875, "learning_rate": 0.00039598742560464223, "loss": 5.507, "mean_token_accuracy": 0.19596254229545593, "num_tokens": 7990740.0, "step": 4395 }, { "entropy": 5.965104579925537, "epoch": 3.780403953588311, "grad_norm": 1.21875, "learning_rate": 0.0003957069699924877, "loss": 5.5021, "mean_token_accuracy": 0.1843058630824089, "num_tokens": 7999349.0, "step": 4400 }, { "entropy": 5.906688165664673, "epoch": 3.7847013321873657, "grad_norm": 1.0859375, "learning_rate": 0.000395426250804557, "loss": 5.5119, "mean_token_accuracy": 0.19529375731945037, "num_tokens": 8007615.0, "step": 4405 }, { "entropy": 5.893620347976684, "epoch": 3.7889987107864203, "grad_norm": 1.0234375, "learning_rate": 0.00039514526865382847, "loss": 5.4918, "mean_token_accuracy": 0.19342261105775832, "num_tokens": 8017545.0, "step": 4410 }, { "entropy": 5.898420667648315, "epoch": 3.793296089385475, "grad_norm": 1.140625, "learning_rate": 0.0003948640241538548, "loss": 5.4376, "mean_token_accuracy": 0.1940651446580887, "num_tokens": 8026381.0, "step": 4415 }, { "entropy": 5.925773334503174, "epoch": 3.7975934679845293, "grad_norm": 1.4921875, "learning_rate": 0.0003945825179187617, "loss": 5.5471, "mean_token_accuracy": 0.1862453892827034, "num_tokens": 8034745.0, "step": 4420 }, { "entropy": 5.93576078414917, "epoch": 3.801890846583584, "grad_norm": 1.203125, "learning_rate": 0.00039430075056324604, "loss": 5.4864, "mean_token_accuracy": 0.19621551632881165, "num_tokens": 8043995.0, "step": 4425 }, { "entropy": 5.9152994632720945, "epoch": 3.8061882251826384, "grad_norm": 1.1015625, "learning_rate": 0.00039401872270257546, "loss": 5.5773, "mean_token_accuracy": 0.18623047918081284, "num_tokens": 8053059.0, "step": 4430 }, { "entropy": 5.9053184509277346, "epoch": 3.810485603781693, "grad_norm": 1.0546875, "learning_rate": 0.00039373643495258567, "loss": 5.5995, "mean_token_accuracy": 0.18803995102643967, "num_tokens": 8062160.0, "step": 4435 }, { "entropy": 5.876355934143066, "epoch": 3.8147829823807475, "grad_norm": 1.15625, "learning_rate": 0.00039345388792968056, "loss": 5.4979, "mean_token_accuracy": 0.1962131142616272, "num_tokens": 8071260.0, "step": 4440 }, { "entropy": 5.975628805160523, "epoch": 3.819080360979802, "grad_norm": 1.2578125, "learning_rate": 0.00039317108225082984, "loss": 5.6148, "mean_token_accuracy": 0.1825527474284172, "num_tokens": 8081540.0, "step": 4445 }, { "entropy": 5.8768692970275875, "epoch": 3.8233777395788566, "grad_norm": 1.1640625, "learning_rate": 0.00039288801853356806, "loss": 5.5798, "mean_token_accuracy": 0.1876271441578865, "num_tokens": 8089785.0, "step": 4450 }, { "entropy": 5.926883172988892, "epoch": 3.8276751181779116, "grad_norm": 1.21875, "learning_rate": 0.0003926046973959932, "loss": 5.4322, "mean_token_accuracy": 0.1977944403886795, "num_tokens": 8098097.0, "step": 4455 }, { "entropy": 5.84870548248291, "epoch": 3.831972496776966, "grad_norm": 1.0390625, "learning_rate": 0.0003923211194567654, "loss": 5.6562, "mean_token_accuracy": 0.1832739979028702, "num_tokens": 8108693.0, "step": 4460 }, { "entropy": 5.936432361602783, "epoch": 3.8362698753760207, "grad_norm": 1.15625, "learning_rate": 0.00039203728533510556, "loss": 5.4945, "mean_token_accuracy": 0.19009887129068376, "num_tokens": 8117181.0, "step": 4465 }, { "entropy": 5.9394755363464355, "epoch": 3.8405672539750753, "grad_norm": 1.1484375, "learning_rate": 0.000391753195650794, "loss": 5.5152, "mean_token_accuracy": 0.1871207147836685, "num_tokens": 8125398.0, "step": 4470 }, { "entropy": 5.89150915145874, "epoch": 3.84486463257413, "grad_norm": 1.046875, "learning_rate": 0.00039146885102416895, "loss": 5.519, "mean_token_accuracy": 0.19240910410881043, "num_tokens": 8135320.0, "step": 4475 }, { "entropy": 5.932202434539795, "epoch": 3.8491620111731844, "grad_norm": 1.1328125, "learning_rate": 0.00039118425207612553, "loss": 5.6074, "mean_token_accuracy": 0.18543781340122223, "num_tokens": 8144320.0, "step": 4480 }, { "entropy": 5.821663093566895, "epoch": 3.853459389772239, "grad_norm": 1.1328125, "learning_rate": 0.00039089939942811396, "loss": 5.478, "mean_token_accuracy": 0.19514185637235643, "num_tokens": 8153653.0, "step": 4485 }, { "entropy": 5.937240219116211, "epoch": 3.8577567683712934, "grad_norm": 1.03125, "learning_rate": 0.00039061429370213863, "loss": 5.513, "mean_token_accuracy": 0.18825586438179015, "num_tokens": 8162741.0, "step": 4490 }, { "entropy": 5.856398630142212, "epoch": 3.862054146970348, "grad_norm": 1.0859375, "learning_rate": 0.00039032893552075646, "loss": 5.4271, "mean_token_accuracy": 0.1990933135151863, "num_tokens": 8171078.0, "step": 4495 }, { "entropy": 5.858392572402954, "epoch": 3.8663515255694025, "grad_norm": 1.1796875, "learning_rate": 0.0003900433255070758, "loss": 5.4881, "mean_token_accuracy": 0.19236364662647248, "num_tokens": 8179968.0, "step": 4500 }, { "epoch": 3.8663515255694025, "eval_entropy": 5.69006564058699, "eval_loss": 5.968277454376221, "eval_mean_token_accuracy": 0.1735342912006754, "eval_num_tokens": 8179968.0, "eval_runtime": 2.0443, "eval_samples_per_second": 1736.068, "eval_steps_per_second": 217.192, "step": 4500 }, { "entropy": 5.894122076034546, "epoch": 3.870648904168457, "grad_norm": 1.0859375, "learning_rate": 0.00038975746428475454, "loss": 5.4732, "mean_token_accuracy": 0.19004281610250473, "num_tokens": 8189261.0, "step": 4505 }, { "entropy": 5.959436702728271, "epoch": 3.874946282767512, "grad_norm": 1.15625, "learning_rate": 0.00038947135247799955, "loss": 5.4841, "mean_token_accuracy": 0.19915961623191833, "num_tokens": 8198302.0, "step": 4510 }, { "entropy": 5.907156896591187, "epoch": 3.8792436613665666, "grad_norm": 1.0546875, "learning_rate": 0.00038918499071156443, "loss": 5.4669, "mean_token_accuracy": 0.1965099200606346, "num_tokens": 8207098.0, "step": 4515 }, { "entropy": 5.902419233322144, "epoch": 3.883541039965621, "grad_norm": 1.2109375, "learning_rate": 0.000388898379610749, "loss": 5.5132, "mean_token_accuracy": 0.18933655470609664, "num_tokens": 8216831.0, "step": 4520 }, { "entropy": 5.858121109008789, "epoch": 3.8878384185646757, "grad_norm": 1.140625, "learning_rate": 0.0003886115198013973, "loss": 5.5158, "mean_token_accuracy": 0.19693622142076492, "num_tokens": 8225369.0, "step": 4525 }, { "entropy": 5.928486585617065, "epoch": 3.8921357971637303, "grad_norm": 1.1328125, "learning_rate": 0.0003883244119098965, "loss": 5.6449, "mean_token_accuracy": 0.17984056174755098, "num_tokens": 8234440.0, "step": 4530 }, { "entropy": 5.944949722290039, "epoch": 3.896433175762785, "grad_norm": 1.0625, "learning_rate": 0.0003880370565631754, "loss": 5.4373, "mean_token_accuracy": 0.19602712541818618, "num_tokens": 8243707.0, "step": 4535 }, { "entropy": 5.938224267959595, "epoch": 3.9007305543618394, "grad_norm": 1.0859375, "learning_rate": 0.00038774945438870337, "loss": 5.6105, "mean_token_accuracy": 0.18423481285572052, "num_tokens": 8254223.0, "step": 4540 }, { "entropy": 5.871773719787598, "epoch": 3.905027932960894, "grad_norm": 1.1484375, "learning_rate": 0.00038746160601448845, "loss": 5.465, "mean_token_accuracy": 0.1903871014714241, "num_tokens": 8263105.0, "step": 4545 }, { "entropy": 5.857735824584961, "epoch": 3.9093253115599484, "grad_norm": 1.03125, "learning_rate": 0.0003871735120690766, "loss": 5.5241, "mean_token_accuracy": 0.18961958587169647, "num_tokens": 8271478.0, "step": 4550 }, { "entropy": 5.936745357513428, "epoch": 3.913622690159003, "grad_norm": 1.1171875, "learning_rate": 0.0003868851731815497, "loss": 5.5649, "mean_token_accuracy": 0.1800309345126152, "num_tokens": 8280396.0, "step": 4555 }, { "entropy": 5.948010683059692, "epoch": 3.9179200687580575, "grad_norm": 1.1953125, "learning_rate": 0.0003865965899815247, "loss": 5.5559, "mean_token_accuracy": 0.18653638958930968, "num_tokens": 8290371.0, "step": 4560 }, { "entropy": 5.885638093948364, "epoch": 3.922217447357112, "grad_norm": 1.125, "learning_rate": 0.0003863077630991518, "loss": 5.4559, "mean_token_accuracy": 0.1984282374382019, "num_tokens": 8298976.0, "step": 4565 }, { "entropy": 5.830101728439331, "epoch": 3.9265148259561666, "grad_norm": 1.0546875, "learning_rate": 0.0003860186931651139, "loss": 5.5129, "mean_token_accuracy": 0.1856519967317581, "num_tokens": 8308752.0, "step": 4570 }, { "entropy": 5.904654264450073, "epoch": 3.930812204555221, "grad_norm": 1.0625, "learning_rate": 0.0003857293808106238, "loss": 5.5693, "mean_token_accuracy": 0.18588138967752457, "num_tokens": 8317343.0, "step": 4575 }, { "entropy": 5.934261655807495, "epoch": 3.9351095831542757, "grad_norm": 1.0546875, "learning_rate": 0.0003854398266674241, "loss": 5.4226, "mean_token_accuracy": 0.19770598262548447, "num_tokens": 8326956.0, "step": 4580 }, { "entropy": 5.8273883819580075, "epoch": 3.9394069617533303, "grad_norm": 1.1796875, "learning_rate": 0.00038515003136778544, "loss": 5.5387, "mean_token_accuracy": 0.18877289444208145, "num_tokens": 8335589.0, "step": 4585 }, { "entropy": 5.864310264587402, "epoch": 3.943704340352385, "grad_norm": 1.125, "learning_rate": 0.00038485999554450483, "loss": 5.5134, "mean_token_accuracy": 0.18962926417589188, "num_tokens": 8345517.0, "step": 4590 }, { "entropy": 5.81669340133667, "epoch": 3.9480017189514394, "grad_norm": 1.171875, "learning_rate": 0.00038456971983090454, "loss": 5.4482, "mean_token_accuracy": 0.19930247962474823, "num_tokens": 8354702.0, "step": 4595 }, { "entropy": 5.906301403045655, "epoch": 3.9522990975504944, "grad_norm": 1.09375, "learning_rate": 0.0003842792048608309, "loss": 5.4765, "mean_token_accuracy": 0.19456401616334915, "num_tokens": 8362940.0, "step": 4600 }, { "entropy": 5.906610107421875, "epoch": 3.956596476149549, "grad_norm": 1.015625, "learning_rate": 0.0003839884512686523, "loss": 5.5178, "mean_token_accuracy": 0.19119103550910949, "num_tokens": 8372034.0, "step": 4605 }, { "entropy": 5.910079717636108, "epoch": 3.9608938547486034, "grad_norm": 1.1015625, "learning_rate": 0.00038369745968925846, "loss": 5.5487, "mean_token_accuracy": 0.1872400775551796, "num_tokens": 8381673.0, "step": 4610 }, { "entropy": 5.925352668762207, "epoch": 3.965191233347658, "grad_norm": 1.03125, "learning_rate": 0.00038340623075805875, "loss": 5.4909, "mean_token_accuracy": 0.1889455035328865, "num_tokens": 8390804.0, "step": 4615 }, { "entropy": 5.934152221679687, "epoch": 3.9694886119467125, "grad_norm": 1.15625, "learning_rate": 0.00038311476511098053, "loss": 5.5365, "mean_token_accuracy": 0.19448018521070481, "num_tokens": 8399644.0, "step": 4620 }, { "entropy": 5.884286642074585, "epoch": 3.973785990545767, "grad_norm": 1.15625, "learning_rate": 0.0003828230633844685, "loss": 5.5523, "mean_token_accuracy": 0.19329068064689636, "num_tokens": 8409264.0, "step": 4625 }, { "entropy": 5.916780805587768, "epoch": 3.9780833691448216, "grad_norm": 1.1953125, "learning_rate": 0.00038253112621548243, "loss": 5.496, "mean_token_accuracy": 0.186178120970726, "num_tokens": 8418383.0, "step": 4630 }, { "entropy": 5.926163101196289, "epoch": 3.982380747743876, "grad_norm": 1.0625, "learning_rate": 0.0003822389542414966, "loss": 5.5232, "mean_token_accuracy": 0.18829717487096786, "num_tokens": 8427411.0, "step": 4635 }, { "entropy": 5.882813405990601, "epoch": 3.9866781263429307, "grad_norm": 1.1171875, "learning_rate": 0.00038194654810049775, "loss": 5.4629, "mean_token_accuracy": 0.18817957490682602, "num_tokens": 8435537.0, "step": 4640 }, { "entropy": 5.882016706466675, "epoch": 3.9909755049419853, "grad_norm": 1.046875, "learning_rate": 0.000381653908430984, "loss": 5.5432, "mean_token_accuracy": 0.18621994256973268, "num_tokens": 8444400.0, "step": 4645 }, { "entropy": 5.930685234069824, "epoch": 3.99527288354104, "grad_norm": 1.1015625, "learning_rate": 0.0003813610358719634, "loss": 5.5236, "mean_token_accuracy": 0.1859032317996025, "num_tokens": 8453830.0, "step": 4650 }, { "entropy": 5.866905212402344, "epoch": 3.999570262140095, "grad_norm": 1.0625, "learning_rate": 0.00038106793106295266, "loss": 5.4873, "mean_token_accuracy": 0.20101941972970963, "num_tokens": 8463033.0, "step": 4655 }, { "entropy": 5.898269759284125, "epoch": 4.003437902879243, "grad_norm": 1.0625, "learning_rate": 0.0003807745946439754, "loss": 5.2703, "mean_token_accuracy": 0.20677175455623203, "num_tokens": 8470740.0, "step": 4660 }, { "entropy": 5.857395029067993, "epoch": 4.007735281478298, "grad_norm": 1.0078125, "learning_rate": 0.0003804810272555612, "loss": 5.2529, "mean_token_accuracy": 0.20413458198308945, "num_tokens": 8480480.0, "step": 4665 }, { "entropy": 5.816273021697998, "epoch": 4.012032660077352, "grad_norm": 1.1875, "learning_rate": 0.0003801872295387439, "loss": 5.2035, "mean_token_accuracy": 0.21528093218803407, "num_tokens": 8489047.0, "step": 4670 }, { "entropy": 5.927360010147095, "epoch": 4.016330038676408, "grad_norm": 1.1015625, "learning_rate": 0.0003798932021350603, "loss": 5.2819, "mean_token_accuracy": 0.20662181824445724, "num_tokens": 8497763.0, "step": 4675 }, { "entropy": 5.861963748931885, "epoch": 4.020627417275462, "grad_norm": 1.109375, "learning_rate": 0.00037959894568654864, "loss": 5.2537, "mean_token_accuracy": 0.20978819131851195, "num_tokens": 8506814.0, "step": 4680 }, { "entropy": 5.957066392898559, "epoch": 4.024924795874517, "grad_norm": 1.0859375, "learning_rate": 0.0003793044608357474, "loss": 5.377, "mean_token_accuracy": 0.19830369651317598, "num_tokens": 8516384.0, "step": 4685 }, { "entropy": 5.93622350692749, "epoch": 4.0292221744735714, "grad_norm": 1.1875, "learning_rate": 0.0003790097482256939, "loss": 5.214, "mean_token_accuracy": 0.2048332706093788, "num_tokens": 8524822.0, "step": 4690 }, { "entropy": 5.870176839828491, "epoch": 4.033519553072626, "grad_norm": 0.98828125, "learning_rate": 0.0003787148084999225, "loss": 5.242, "mean_token_accuracy": 0.2090427428483963, "num_tokens": 8534129.0, "step": 4695 }, { "entropy": 5.8284914016723635, "epoch": 4.0378169316716805, "grad_norm": 1.1875, "learning_rate": 0.00037841964230246394, "loss": 5.3055, "mean_token_accuracy": 0.20019746124744414, "num_tokens": 8543235.0, "step": 4700 }, { "entropy": 5.8483837127685545, "epoch": 4.042114310270735, "grad_norm": 1.1875, "learning_rate": 0.0003781242502778429, "loss": 5.2003, "mean_token_accuracy": 0.22053535431623458, "num_tokens": 8551903.0, "step": 4705 }, { "entropy": 5.880414295196533, "epoch": 4.04641168886979, "grad_norm": 1.2109375, "learning_rate": 0.00037782863307107785, "loss": 5.287, "mean_token_accuracy": 0.20505535304546357, "num_tokens": 8561173.0, "step": 4710 }, { "entropy": 5.899335432052612, "epoch": 4.050709067468844, "grad_norm": 1.1640625, "learning_rate": 0.00037753279132767833, "loss": 5.1929, "mean_token_accuracy": 0.21593824326992034, "num_tokens": 8569789.0, "step": 4715 }, { "entropy": 5.804694700241089, "epoch": 4.055006446067899, "grad_norm": 1.2421875, "learning_rate": 0.00037723672569364453, "loss": 5.1963, "mean_token_accuracy": 0.20983130037784575, "num_tokens": 8577971.0, "step": 4720 }, { "entropy": 5.866218900680542, "epoch": 4.059303824666953, "grad_norm": 1.125, "learning_rate": 0.00037694043681546545, "loss": 5.2858, "mean_token_accuracy": 0.2029922142624855, "num_tokens": 8587299.0, "step": 4725 }, { "entropy": 5.831310987472534, "epoch": 4.063601203266008, "grad_norm": 1.0703125, "learning_rate": 0.0003766439253401177, "loss": 5.2472, "mean_token_accuracy": 0.20737850219011306, "num_tokens": 8595813.0, "step": 4730 }, { "entropy": 5.844350147247314, "epoch": 4.067898581865062, "grad_norm": 1.15625, "learning_rate": 0.00037634719191506367, "loss": 5.2617, "mean_token_accuracy": 0.21165675073862075, "num_tokens": 8604552.0, "step": 4735 }, { "entropy": 5.796354818344116, "epoch": 4.072195960464117, "grad_norm": 1.3203125, "learning_rate": 0.00037605023718825065, "loss": 5.2002, "mean_token_accuracy": 0.2150500625371933, "num_tokens": 8612701.0, "step": 4740 }, { "entropy": 5.846735095977783, "epoch": 4.0764933390631715, "grad_norm": 1.0078125, "learning_rate": 0.000375753061808109, "loss": 5.2598, "mean_token_accuracy": 0.20762900859117508, "num_tokens": 8622699.0, "step": 4745 }, { "entropy": 5.842225646972656, "epoch": 4.080790717662226, "grad_norm": 1.1015625, "learning_rate": 0.00037545566642355107, "loss": 5.2295, "mean_token_accuracy": 0.20560641288757325, "num_tokens": 8631821.0, "step": 4750 }, { "entropy": 5.840038156509399, "epoch": 4.0850880962612806, "grad_norm": 1.0625, "learning_rate": 0.0003751580516839695, "loss": 5.202, "mean_token_accuracy": 0.20931526124477387, "num_tokens": 8641814.0, "step": 4755 }, { "entropy": 5.884950733184814, "epoch": 4.089385474860335, "grad_norm": 1.1640625, "learning_rate": 0.00037486021823923574, "loss": 5.286, "mean_token_accuracy": 0.20766208320856094, "num_tokens": 8649649.0, "step": 4760 }, { "entropy": 5.810858106613159, "epoch": 4.09368285345939, "grad_norm": 1.15625, "learning_rate": 0.00037456216673969925, "loss": 5.2206, "mean_token_accuracy": 0.21204735338687897, "num_tokens": 8658216.0, "step": 4765 }, { "entropy": 5.874101734161377, "epoch": 4.097980232058444, "grad_norm": 1.0, "learning_rate": 0.0003742638978361851, "loss": 5.2958, "mean_token_accuracy": 0.20435795933008194, "num_tokens": 8667725.0, "step": 4770 }, { "entropy": 5.781695938110351, "epoch": 4.102277610657499, "grad_norm": 1.1484375, "learning_rate": 0.00037396541217999367, "loss": 5.1561, "mean_token_accuracy": 0.2138916879892349, "num_tokens": 8675739.0, "step": 4775 }, { "entropy": 5.839225959777832, "epoch": 4.106574989256553, "grad_norm": 1.125, "learning_rate": 0.0003736667104228981, "loss": 5.2313, "mean_token_accuracy": 0.21251195222139357, "num_tokens": 8685764.0, "step": 4780 }, { "entropy": 5.8689206600189205, "epoch": 4.110872367855608, "grad_norm": 1.3125, "learning_rate": 0.00037336779321714376, "loss": 5.2059, "mean_token_accuracy": 0.21196469962596892, "num_tokens": 8695476.0, "step": 4785 }, { "entropy": 5.80074520111084, "epoch": 4.115169746454662, "grad_norm": 1.1953125, "learning_rate": 0.00037306866121544633, "loss": 5.2825, "mean_token_accuracy": 0.20670025944709777, "num_tokens": 8705544.0, "step": 4790 }, { "entropy": 5.860075855255127, "epoch": 4.119467125053717, "grad_norm": 1.15625, "learning_rate": 0.0003727693150709904, "loss": 5.2645, "mean_token_accuracy": 0.20871647000312804, "num_tokens": 8714883.0, "step": 4795 }, { "entropy": 5.886887168884277, "epoch": 4.1237645036527715, "grad_norm": 1.0859375, "learning_rate": 0.00037246975543742843, "loss": 5.3176, "mean_token_accuracy": 0.20150526314973832, "num_tokens": 8724589.0, "step": 4800 }, { "entropy": 5.745695161819458, "epoch": 4.128061882251826, "grad_norm": 1.109375, "learning_rate": 0.000372169982968879, "loss": 5.1867, "mean_token_accuracy": 0.20965181291103363, "num_tokens": 8733771.0, "step": 4805 }, { "entropy": 5.845971202850341, "epoch": 4.132359260850881, "grad_norm": 1.234375, "learning_rate": 0.0003718699983199252, "loss": 5.2624, "mean_token_accuracy": 0.20873973071575164, "num_tokens": 8742348.0, "step": 4810 }, { "entropy": 5.7872912883758545, "epoch": 4.136656639449935, "grad_norm": 1.1171875, "learning_rate": 0.0003715698021456137, "loss": 5.2081, "mean_token_accuracy": 0.21571390181779862, "num_tokens": 8751357.0, "step": 4815 }, { "entropy": 5.7935162544250485, "epoch": 4.1409540180489905, "grad_norm": 1.09375, "learning_rate": 0.00037126939510145294, "loss": 5.2631, "mean_token_accuracy": 0.21045506447553636, "num_tokens": 8760813.0, "step": 4820 }, { "entropy": 5.919540929794311, "epoch": 4.145251396648045, "grad_norm": 1.1875, "learning_rate": 0.0003709687778434118, "loss": 5.3088, "mean_token_accuracy": 0.20338443517684937, "num_tokens": 8770228.0, "step": 4825 }, { "entropy": 5.766780090332031, "epoch": 4.1495487752471, "grad_norm": 1.3203125, "learning_rate": 0.0003706679510279183, "loss": 5.1405, "mean_token_accuracy": 0.2135200873017311, "num_tokens": 8779351.0, "step": 4830 }, { "entropy": 5.818261432647705, "epoch": 4.153846153846154, "grad_norm": 1.2109375, "learning_rate": 0.0003703669153118578, "loss": 5.3029, "mean_token_accuracy": 0.20108458995819092, "num_tokens": 8789116.0, "step": 4835 }, { "entropy": 5.810438871383667, "epoch": 4.158143532445209, "grad_norm": 1.15625, "learning_rate": 0.00037006567135257216, "loss": 5.2702, "mean_token_accuracy": 0.20288445353507994, "num_tokens": 8797790.0, "step": 4840 }, { "entropy": 5.865516614913941, "epoch": 4.162440911044263, "grad_norm": 1.0859375, "learning_rate": 0.00036976421980785764, "loss": 5.3081, "mean_token_accuracy": 0.2026110991835594, "num_tokens": 8808067.0, "step": 4845 }, { "entropy": 5.80728063583374, "epoch": 4.166738289643318, "grad_norm": 1.1640625, "learning_rate": 0.0003694625613359641, "loss": 5.2167, "mean_token_accuracy": 0.21420625150203704, "num_tokens": 8816587.0, "step": 4850 }, { "entropy": 5.843136548995972, "epoch": 4.171035668242372, "grad_norm": 1.2109375, "learning_rate": 0.0003691606965955929, "loss": 5.2734, "mean_token_accuracy": 0.20686964243650435, "num_tokens": 8826045.0, "step": 4855 }, { "entropy": 5.781480550765991, "epoch": 4.175333046841427, "grad_norm": 1.078125, "learning_rate": 0.000368858626245896, "loss": 5.2662, "mean_token_accuracy": 0.21182646304368974, "num_tokens": 8835427.0, "step": 4860 }, { "entropy": 5.802968168258667, "epoch": 4.1796304254404815, "grad_norm": 0.9609375, "learning_rate": 0.0003685563509464744, "loss": 5.2058, "mean_token_accuracy": 0.21191840171813964, "num_tokens": 8845167.0, "step": 4865 }, { "entropy": 5.854573917388916, "epoch": 4.183927804039536, "grad_norm": 1.25, "learning_rate": 0.00036825387135737647, "loss": 5.2076, "mean_token_accuracy": 0.21366898566484452, "num_tokens": 8853591.0, "step": 4870 }, { "entropy": 5.830286979675293, "epoch": 4.188225182638591, "grad_norm": 1.203125, "learning_rate": 0.00036795118813909674, "loss": 5.3259, "mean_token_accuracy": 0.19266606420278548, "num_tokens": 8863647.0, "step": 4875 }, { "entropy": 5.880206489562989, "epoch": 4.192522561237645, "grad_norm": 1.1875, "learning_rate": 0.00036764830195257437, "loss": 5.2531, "mean_token_accuracy": 0.2108171060681343, "num_tokens": 8872911.0, "step": 4880 }, { "entropy": 5.866643857955933, "epoch": 4.1968199398367, "grad_norm": 1.2578125, "learning_rate": 0.0003673452134591918, "loss": 5.2999, "mean_token_accuracy": 0.2029878944158554, "num_tokens": 8881001.0, "step": 4885 }, { "entropy": 5.772600555419922, "epoch": 4.201117318435754, "grad_norm": 1.1953125, "learning_rate": 0.000367041923320773, "loss": 5.2042, "mean_token_accuracy": 0.21341877430677414, "num_tokens": 8890323.0, "step": 4890 }, { "entropy": 5.771191835403442, "epoch": 4.205414697034809, "grad_norm": 1.125, "learning_rate": 0.00036673843219958257, "loss": 5.2368, "mean_token_accuracy": 0.21208913624286652, "num_tokens": 8900471.0, "step": 4895 }, { "entropy": 5.88256139755249, "epoch": 4.209712075633863, "grad_norm": 1.1953125, "learning_rate": 0.0003664347407583238, "loss": 5.2863, "mean_token_accuracy": 0.20272428095340728, "num_tokens": 8909320.0, "step": 4900 }, { "entropy": 5.836409950256348, "epoch": 4.214009454232918, "grad_norm": 1.1953125, "learning_rate": 0.0003661308496601373, "loss": 5.2072, "mean_token_accuracy": 0.2157358020544052, "num_tokens": 8917453.0, "step": 4905 }, { "entropy": 5.788828945159912, "epoch": 4.218306832831972, "grad_norm": 1.265625, "learning_rate": 0.00036582675956859983, "loss": 5.2828, "mean_token_accuracy": 0.2104206308722496, "num_tokens": 8925737.0, "step": 4910 }, { "entropy": 5.720648384094238, "epoch": 4.222604211431027, "grad_norm": 1.171875, "learning_rate": 0.00036552247114772263, "loss": 5.2101, "mean_token_accuracy": 0.2065804719924927, "num_tokens": 8935475.0, "step": 4915 }, { "entropy": 5.83034381866455, "epoch": 4.2269015900300815, "grad_norm": 1.0546875, "learning_rate": 0.00036521798506194996, "loss": 5.2346, "mean_token_accuracy": 0.21483660042285918, "num_tokens": 8944683.0, "step": 4920 }, { "entropy": 5.881083297729492, "epoch": 4.231198968629136, "grad_norm": 1.2421875, "learning_rate": 0.00036491330197615775, "loss": 5.2826, "mean_token_accuracy": 0.199912728369236, "num_tokens": 8953837.0, "step": 4925 }, { "entropy": 5.823856353759766, "epoch": 4.235496347228191, "grad_norm": 0.98046875, "learning_rate": 0.00036460842255565197, "loss": 5.3172, "mean_token_accuracy": 0.2043285608291626, "num_tokens": 8964822.0, "step": 4930 }, { "entropy": 5.869928026199341, "epoch": 4.239793725827245, "grad_norm": 1.328125, "learning_rate": 0.0003643033474661676, "loss": 5.2965, "mean_token_accuracy": 0.20673907697200775, "num_tokens": 8974363.0, "step": 4935 }, { "entropy": 5.82686710357666, "epoch": 4.2440911044263, "grad_norm": 1.2109375, "learning_rate": 0.00036399807737386657, "loss": 5.2074, "mean_token_accuracy": 0.21254496574401854, "num_tokens": 8983122.0, "step": 4940 }, { "entropy": 5.857899141311646, "epoch": 4.248388483025354, "grad_norm": 1.2578125, "learning_rate": 0.0003636926129453368, "loss": 5.3123, "mean_token_accuracy": 0.20272811949253083, "num_tokens": 8991618.0, "step": 4945 }, { "entropy": 5.824826383590699, "epoch": 4.252685861624409, "grad_norm": 1.171875, "learning_rate": 0.0003633869548475904, "loss": 5.2415, "mean_token_accuracy": 0.21045928597450256, "num_tokens": 9000128.0, "step": 4950 }, { "entropy": 5.775493240356445, "epoch": 4.256983240223463, "grad_norm": 1.15625, "learning_rate": 0.0003630811037480627, "loss": 5.2319, "mean_token_accuracy": 0.2093399852514267, "num_tokens": 9008951.0, "step": 4955 }, { "entropy": 5.842453670501709, "epoch": 4.261280618822518, "grad_norm": 1.078125, "learning_rate": 0.0003627750603146101, "loss": 5.2789, "mean_token_accuracy": 0.2030516341328621, "num_tokens": 9018949.0, "step": 4960 }, { "entropy": 5.883487272262573, "epoch": 4.265577997421573, "grad_norm": 1.0546875, "learning_rate": 0.0003624688252155091, "loss": 5.2747, "mean_token_accuracy": 0.20714085996150972, "num_tokens": 9028910.0, "step": 4965 }, { "entropy": 5.809985780715943, "epoch": 4.269875376020628, "grad_norm": 1.1015625, "learning_rate": 0.0003621623991194549, "loss": 5.324, "mean_token_accuracy": 0.19819179475307463, "num_tokens": 9039012.0, "step": 4970 }, { "entropy": 5.9007415771484375, "epoch": 4.274172754619682, "grad_norm": 1.109375, "learning_rate": 0.0003618557826955594, "loss": 5.2954, "mean_token_accuracy": 0.20645973831415176, "num_tokens": 9048639.0, "step": 4975 }, { "entropy": 5.815454912185669, "epoch": 4.278470133218737, "grad_norm": 1.1171875, "learning_rate": 0.00036154897661335063, "loss": 5.2517, "mean_token_accuracy": 0.2086031049489975, "num_tokens": 9057453.0, "step": 4980 }, { "entropy": 5.792014074325562, "epoch": 4.2827675118177915, "grad_norm": 1.15625, "learning_rate": 0.0003612419815427702, "loss": 5.2826, "mean_token_accuracy": 0.20074526816606522, "num_tokens": 9066761.0, "step": 4985 }, { "entropy": 5.858555936813355, "epoch": 4.287064890416846, "grad_norm": 1.4140625, "learning_rate": 0.0003609347981541726, "loss": 5.3553, "mean_token_accuracy": 0.1983863353729248, "num_tokens": 9075535.0, "step": 4990 }, { "entropy": 5.862577295303344, "epoch": 4.291362269015901, "grad_norm": 1.2109375, "learning_rate": 0.00036062742711832376, "loss": 5.257, "mean_token_accuracy": 0.2088131219148636, "num_tokens": 9084559.0, "step": 4995 }, { "entropy": 5.811804294586182, "epoch": 4.295659647614955, "grad_norm": 1.234375, "learning_rate": 0.0003603198691063991, "loss": 5.2313, "mean_token_accuracy": 0.2083360180258751, "num_tokens": 9093069.0, "step": 5000 }, { "epoch": 4.295659647614955, "eval_entropy": 5.572498395636275, "eval_loss": 5.972146987915039, "eval_mean_token_accuracy": 0.17474245199480573, "eval_num_tokens": 9093069.0, "eval_runtime": 2.0519, "eval_samples_per_second": 1729.593, "eval_steps_per_second": 216.382, "step": 5000 }, { "entropy": 5.8002112865447994, "epoch": 4.29995702621401, "grad_norm": 1.3125, "learning_rate": 0.0003600121247899824, "loss": 5.2227, "mean_token_accuracy": 0.2073623850941658, "num_tokens": 9101914.0, "step": 5005 }, { "entropy": 5.834455966949463, "epoch": 4.304254404813064, "grad_norm": 1.1484375, "learning_rate": 0.00035970419484106404, "loss": 5.2887, "mean_token_accuracy": 0.20548986196517943, "num_tokens": 9110967.0, "step": 5010 }, { "entropy": 5.891673374176025, "epoch": 4.308551783412119, "grad_norm": 1.2578125, "learning_rate": 0.0003593960799320402, "loss": 5.3822, "mean_token_accuracy": 0.19926034808158874, "num_tokens": 9119774.0, "step": 5015 }, { "entropy": 5.887394714355469, "epoch": 4.312849162011173, "grad_norm": 1.2890625, "learning_rate": 0.0003590877807357107, "loss": 5.2922, "mean_token_accuracy": 0.20317730754613877, "num_tokens": 9127738.0, "step": 5020 }, { "entropy": 5.785108280181885, "epoch": 4.317146540610228, "grad_norm": 1.203125, "learning_rate": 0.0003587792979252776, "loss": 5.2629, "mean_token_accuracy": 0.20784647464752198, "num_tokens": 9137060.0, "step": 5025 }, { "entropy": 5.777895927429199, "epoch": 4.321443919209282, "grad_norm": 1.2734375, "learning_rate": 0.0003584706321743442, "loss": 5.1962, "mean_token_accuracy": 0.2092631295323372, "num_tokens": 9145169.0, "step": 5030 }, { "entropy": 5.796663856506347, "epoch": 4.325741297808337, "grad_norm": 1.1328125, "learning_rate": 0.000358161784156913, "loss": 5.2276, "mean_token_accuracy": 0.21179858297109605, "num_tokens": 9154092.0, "step": 5035 }, { "entropy": 5.85888671875, "epoch": 4.3300386764073915, "grad_norm": 1.1171875, "learning_rate": 0.00035785275454738456, "loss": 5.286, "mean_token_accuracy": 0.19925448596477507, "num_tokens": 9162824.0, "step": 5040 }, { "entropy": 5.7883411884307865, "epoch": 4.334336055006446, "grad_norm": 1.3828125, "learning_rate": 0.00035754354402055635, "loss": 5.1959, "mean_token_accuracy": 0.21434530913829802, "num_tokens": 9170977.0, "step": 5045 }, { "entropy": 5.730002689361572, "epoch": 4.338633433605501, "grad_norm": 1.1875, "learning_rate": 0.0003572341532516202, "loss": 5.2367, "mean_token_accuracy": 0.20432866215705872, "num_tokens": 9179539.0, "step": 5050 }, { "entropy": 5.77237024307251, "epoch": 4.342930812204555, "grad_norm": 1.140625, "learning_rate": 0.0003569245829161622, "loss": 5.3173, "mean_token_accuracy": 0.20617470294237136, "num_tokens": 9188861.0, "step": 5055 }, { "entropy": 5.834583187103272, "epoch": 4.34722819080361, "grad_norm": 1.15625, "learning_rate": 0.00035661483369016004, "loss": 5.2608, "mean_token_accuracy": 0.20369923412799834, "num_tokens": 9197724.0, "step": 5060 }, { "entropy": 5.79484076499939, "epoch": 4.351525569402664, "grad_norm": 1.1640625, "learning_rate": 0.0003563049062499822, "loss": 5.2692, "mean_token_accuracy": 0.2074078604578972, "num_tokens": 9206375.0, "step": 5065 }, { "entropy": 5.755507230758667, "epoch": 4.355822948001719, "grad_norm": 1.296875, "learning_rate": 0.0003559948012723865, "loss": 5.2271, "mean_token_accuracy": 0.21173418909311295, "num_tokens": 9214675.0, "step": 5070 }, { "entropy": 5.802625036239624, "epoch": 4.360120326600773, "grad_norm": 1.1953125, "learning_rate": 0.0003556845194345181, "loss": 5.2516, "mean_token_accuracy": 0.20623590499162675, "num_tokens": 9224128.0, "step": 5075 }, { "entropy": 5.769022130966187, "epoch": 4.364417705199828, "grad_norm": 1.359375, "learning_rate": 0.0003553740614139086, "loss": 5.1773, "mean_token_accuracy": 0.21178028136491775, "num_tokens": 9232568.0, "step": 5080 }, { "entropy": 5.831740474700927, "epoch": 4.368715083798882, "grad_norm": 1.2734375, "learning_rate": 0.0003550634278884742, "loss": 5.2776, "mean_token_accuracy": 0.2081983670592308, "num_tokens": 9241809.0, "step": 5085 }, { "entropy": 5.803788042068481, "epoch": 4.373012462397937, "grad_norm": 1.140625, "learning_rate": 0.00035475261953651433, "loss": 5.272, "mean_token_accuracy": 0.20985971093177797, "num_tokens": 9250845.0, "step": 5090 }, { "entropy": 5.7017419815063475, "epoch": 4.3773098409969915, "grad_norm": 1.2265625, "learning_rate": 0.00035444163703671026, "loss": 5.2316, "mean_token_accuracy": 0.2108854666352272, "num_tokens": 9259465.0, "step": 5095 }, { "entropy": 5.795203113555909, "epoch": 4.381607219596046, "grad_norm": 1.078125, "learning_rate": 0.00035413048106812357, "loss": 5.2177, "mean_token_accuracy": 0.21499419659376146, "num_tokens": 9267853.0, "step": 5100 }, { "entropy": 5.927629661560059, "epoch": 4.385904598195101, "grad_norm": 1.125, "learning_rate": 0.00035381915231019425, "loss": 5.4268, "mean_token_accuracy": 0.19061524271965027, "num_tokens": 9276664.0, "step": 5105 }, { "entropy": 5.820791101455688, "epoch": 4.390201976794156, "grad_norm": 1.21875, "learning_rate": 0.0003535076514427401, "loss": 5.2285, "mean_token_accuracy": 0.20389644652605057, "num_tokens": 9285482.0, "step": 5110 }, { "entropy": 5.833712720870972, "epoch": 4.39449935539321, "grad_norm": 1.0859375, "learning_rate": 0.00035319597914595436, "loss": 5.3276, "mean_token_accuracy": 0.19536473900079726, "num_tokens": 9293936.0, "step": 5115 }, { "entropy": 5.812803554534912, "epoch": 4.398796733992265, "grad_norm": 1.1328125, "learning_rate": 0.0003528841361004049, "loss": 5.3509, "mean_token_accuracy": 0.19318777322769165, "num_tokens": 9303998.0, "step": 5120 }, { "entropy": 5.777164936065674, "epoch": 4.40309411259132, "grad_norm": 1.25, "learning_rate": 0.0003525721229870323, "loss": 5.3018, "mean_token_accuracy": 0.2057452142238617, "num_tokens": 9313117.0, "step": 5125 }, { "entropy": 5.843145132064819, "epoch": 4.407391491190374, "grad_norm": 1.140625, "learning_rate": 0.00035225994048714823, "loss": 5.2845, "mean_token_accuracy": 0.205299773812294, "num_tokens": 9321446.0, "step": 5130 }, { "entropy": 5.799930763244629, "epoch": 4.411688869789429, "grad_norm": 1.171875, "learning_rate": 0.0003519475892824348, "loss": 5.2629, "mean_token_accuracy": 0.20662948340177537, "num_tokens": 9330752.0, "step": 5135 }, { "entropy": 5.77738208770752, "epoch": 4.415986248388483, "grad_norm": 1.1875, "learning_rate": 0.0003516350700549419, "loss": 5.3006, "mean_token_accuracy": 0.20330240875482558, "num_tokens": 9339322.0, "step": 5140 }, { "entropy": 5.84840669631958, "epoch": 4.420283626987538, "grad_norm": 1.203125, "learning_rate": 0.00035132238348708697, "loss": 5.3297, "mean_token_accuracy": 0.19938498139381408, "num_tokens": 9349024.0, "step": 5145 }, { "entropy": 5.926564788818359, "epoch": 4.424581005586592, "grad_norm": 1.296875, "learning_rate": 0.00035100953026165224, "loss": 5.4256, "mean_token_accuracy": 0.197027026116848, "num_tokens": 9358833.0, "step": 5150 }, { "entropy": 5.868610525131226, "epoch": 4.428878384185647, "grad_norm": 1.1328125, "learning_rate": 0.0003506965110617841, "loss": 5.2718, "mean_token_accuracy": 0.2099718302488327, "num_tokens": 9368276.0, "step": 5155 }, { "entropy": 5.859810876846313, "epoch": 4.4331757627847015, "grad_norm": 1.015625, "learning_rate": 0.0003503833265709915, "loss": 5.3479, "mean_token_accuracy": 0.1974034383893013, "num_tokens": 9378501.0, "step": 5160 }, { "entropy": 5.875433778762817, "epoch": 4.437473141383756, "grad_norm": 1.265625, "learning_rate": 0.00035006997747314404, "loss": 5.3298, "mean_token_accuracy": 0.19622083157300949, "num_tokens": 9387789.0, "step": 5165 }, { "entropy": 5.835582590103149, "epoch": 4.441770519982811, "grad_norm": 1.125, "learning_rate": 0.00034975646445247106, "loss": 5.3721, "mean_token_accuracy": 0.2014732614159584, "num_tokens": 9397041.0, "step": 5170 }, { "entropy": 5.775737285614014, "epoch": 4.446067898581865, "grad_norm": 1.1953125, "learning_rate": 0.0003494427881935596, "loss": 5.3059, "mean_token_accuracy": 0.20452196449041365, "num_tokens": 9405393.0, "step": 5175 }, { "entropy": 5.779368114471436, "epoch": 4.45036527718092, "grad_norm": 1.1484375, "learning_rate": 0.00034912894938135325, "loss": 5.2582, "mean_token_accuracy": 0.20273705422878266, "num_tokens": 9415127.0, "step": 5180 }, { "entropy": 5.846761655807495, "epoch": 4.454662655779974, "grad_norm": 1.2265625, "learning_rate": 0.0003488149487011506, "loss": 5.3699, "mean_token_accuracy": 0.20174208134412766, "num_tokens": 9424416.0, "step": 5185 }, { "entropy": 5.890099239349365, "epoch": 4.458960034379029, "grad_norm": 1.1640625, "learning_rate": 0.00034850078683860346, "loss": 5.3262, "mean_token_accuracy": 0.19683828055858613, "num_tokens": 9434523.0, "step": 5190 }, { "entropy": 5.831119251251221, "epoch": 4.463257412978083, "grad_norm": 1.140625, "learning_rate": 0.0003481864644797159, "loss": 5.3245, "mean_token_accuracy": 0.2093776971101761, "num_tokens": 9443605.0, "step": 5195 }, { "entropy": 5.803278684616089, "epoch": 4.467554791577138, "grad_norm": 1.1640625, "learning_rate": 0.0003478719823108424, "loss": 5.3317, "mean_token_accuracy": 0.19572802931070327, "num_tokens": 9453268.0, "step": 5200 }, { "entropy": 5.8240362167358395, "epoch": 4.471852170176192, "grad_norm": 1.1953125, "learning_rate": 0.00034755734101868613, "loss": 5.214, "mean_token_accuracy": 0.2097940504550934, "num_tokens": 9461578.0, "step": 5205 }, { "entropy": 5.79837703704834, "epoch": 4.476149548775247, "grad_norm": 1.140625, "learning_rate": 0.00034724254129029795, "loss": 5.2436, "mean_token_accuracy": 0.2102679118514061, "num_tokens": 9470722.0, "step": 5210 }, { "entropy": 5.837281274795532, "epoch": 4.4804469273743015, "grad_norm": 1.2890625, "learning_rate": 0.0003469275838130748, "loss": 5.3607, "mean_token_accuracy": 0.19933488070964814, "num_tokens": 9479695.0, "step": 5215 }, { "entropy": 5.8430516719818115, "epoch": 4.484744305973356, "grad_norm": 1.1640625, "learning_rate": 0.0003466124692747577, "loss": 5.2646, "mean_token_accuracy": 0.2044244959950447, "num_tokens": 9488444.0, "step": 5220 }, { "entropy": 5.742202806472778, "epoch": 4.489041684572411, "grad_norm": 1.1953125, "learning_rate": 0.00034629719836343106, "loss": 5.2215, "mean_token_accuracy": 0.21403959393501282, "num_tokens": 9497413.0, "step": 5225 }, { "entropy": 5.7987758159637455, "epoch": 4.493339063171465, "grad_norm": 1.296875, "learning_rate": 0.0003459817717675203, "loss": 5.2598, "mean_token_accuracy": 0.21579257249832154, "num_tokens": 9506135.0, "step": 5230 }, { "entropy": 5.835311031341552, "epoch": 4.49763644177052, "grad_norm": 1.0625, "learning_rate": 0.0003456661901757913, "loss": 5.3341, "mean_token_accuracy": 0.20138609558343887, "num_tokens": 9516918.0, "step": 5235 }, { "entropy": 5.866192770004273, "epoch": 4.501933820369574, "grad_norm": 1.2578125, "learning_rate": 0.00034535045427734796, "loss": 5.276, "mean_token_accuracy": 0.2101076439023018, "num_tokens": 9526052.0, "step": 5240 }, { "entropy": 5.733947229385376, "epoch": 4.506231198968629, "grad_norm": 1.265625, "learning_rate": 0.0003450345647616313, "loss": 5.3369, "mean_token_accuracy": 0.2056139588356018, "num_tokens": 9535200.0, "step": 5245 }, { "entropy": 5.76122088432312, "epoch": 4.510528577567683, "grad_norm": 1.1796875, "learning_rate": 0.0003447185223184177, "loss": 5.3074, "mean_token_accuracy": 0.20514743030071259, "num_tokens": 9544786.0, "step": 5250 }, { "entropy": 5.871483230590821, "epoch": 4.514825956166739, "grad_norm": 1.1484375, "learning_rate": 0.00034440232763781765, "loss": 5.2522, "mean_token_accuracy": 0.20949897319078445, "num_tokens": 9553694.0, "step": 5255 }, { "entropy": 5.753093576431274, "epoch": 4.519123334765792, "grad_norm": 1.1953125, "learning_rate": 0.000344085981410274, "loss": 5.3192, "mean_token_accuracy": 0.20984772890806197, "num_tokens": 9563332.0, "step": 5260 }, { "entropy": 5.711885738372803, "epoch": 4.523420713364848, "grad_norm": 1.1015625, "learning_rate": 0.00034376948432656036, "loss": 5.2301, "mean_token_accuracy": 0.2115880087018013, "num_tokens": 9572367.0, "step": 5265 }, { "entropy": 5.860666131973266, "epoch": 4.527718091963902, "grad_norm": 1.0546875, "learning_rate": 0.0003434528370777798, "loss": 5.3255, "mean_token_accuracy": 0.19527169466018676, "num_tokens": 9582535.0, "step": 5270 }, { "entropy": 5.807507610321045, "epoch": 4.532015470562957, "grad_norm": 1.203125, "learning_rate": 0.00034313604035536344, "loss": 5.2775, "mean_token_accuracy": 0.21002310365438462, "num_tokens": 9590688.0, "step": 5275 }, { "entropy": 5.773982238769531, "epoch": 4.5363128491620115, "grad_norm": 1.171875, "learning_rate": 0.0003428190948510687, "loss": 5.3213, "mean_token_accuracy": 0.2039690524339676, "num_tokens": 9599209.0, "step": 5280 }, { "entropy": 5.852875804901123, "epoch": 4.540610227761066, "grad_norm": 1.2265625, "learning_rate": 0.0003425020012569778, "loss": 5.3626, "mean_token_accuracy": 0.20032234340906144, "num_tokens": 9608575.0, "step": 5285 }, { "entropy": 5.903119659423828, "epoch": 4.544907606360121, "grad_norm": 1.1796875, "learning_rate": 0.00034218476026549665, "loss": 5.3113, "mean_token_accuracy": 0.2009777992963791, "num_tokens": 9617312.0, "step": 5290 }, { "entropy": 5.826537036895752, "epoch": 4.549204984959175, "grad_norm": 1.265625, "learning_rate": 0.0003418673725693524, "loss": 5.2895, "mean_token_accuracy": 0.21229007989168167, "num_tokens": 9626398.0, "step": 5295 }, { "entropy": 5.797998762130737, "epoch": 4.55350236355823, "grad_norm": 1.1640625, "learning_rate": 0.0003415498388615932, "loss": 5.2692, "mean_token_accuracy": 0.20089106261730194, "num_tokens": 9635470.0, "step": 5300 }, { "entropy": 5.809066820144653, "epoch": 4.557799742157284, "grad_norm": 1.1640625, "learning_rate": 0.0003412321598355857, "loss": 5.213, "mean_token_accuracy": 0.21215442568063736, "num_tokens": 9644728.0, "step": 5305 }, { "entropy": 5.776236963272095, "epoch": 4.562097120756339, "grad_norm": 1.0859375, "learning_rate": 0.0003409143361850139, "loss": 5.2752, "mean_token_accuracy": 0.2105761721730232, "num_tokens": 9654129.0, "step": 5310 }, { "entropy": 5.822030639648437, "epoch": 4.566394499355393, "grad_norm": 1.25, "learning_rate": 0.0003405963686038775, "loss": 5.3633, "mean_token_accuracy": 0.1967499941587448, "num_tokens": 9662648.0, "step": 5315 }, { "entropy": 5.843867492675781, "epoch": 4.570691877954448, "grad_norm": 1.1640625, "learning_rate": 0.0003402782577864908, "loss": 5.3261, "mean_token_accuracy": 0.20646921396255494, "num_tokens": 9672082.0, "step": 5320 }, { "entropy": 5.86830587387085, "epoch": 4.574989256553502, "grad_norm": 1.2421875, "learning_rate": 0.00033996000442748056, "loss": 5.2528, "mean_token_accuracy": 0.21100070625543593, "num_tokens": 9681422.0, "step": 5325 }, { "entropy": 5.829919290542603, "epoch": 4.579286635152557, "grad_norm": 1.28125, "learning_rate": 0.00033964160922178495, "loss": 5.2957, "mean_token_accuracy": 0.206342414021492, "num_tokens": 9690675.0, "step": 5330 }, { "entropy": 5.813098335266114, "epoch": 4.5835840137516115, "grad_norm": 1.125, "learning_rate": 0.0003393230728646518, "loss": 5.2833, "mean_token_accuracy": 0.2053971081972122, "num_tokens": 9700200.0, "step": 5335 }, { "entropy": 5.761319780349732, "epoch": 4.587881392350666, "grad_norm": 1.2421875, "learning_rate": 0.00033900439605163724, "loss": 5.2785, "mean_token_accuracy": 0.2027950644493103, "num_tokens": 9709533.0, "step": 5340 }, { "entropy": 5.774492692947388, "epoch": 4.592178770949721, "grad_norm": 1.09375, "learning_rate": 0.00033868557947860407, "loss": 5.3247, "mean_token_accuracy": 0.20598720461130143, "num_tokens": 9719250.0, "step": 5345 }, { "entropy": 5.826806688308716, "epoch": 4.596476149548775, "grad_norm": 1.1484375, "learning_rate": 0.00033836662384172014, "loss": 5.243, "mean_token_accuracy": 0.20927662551403045, "num_tokens": 9727837.0, "step": 5350 }, { "entropy": 5.759864807128906, "epoch": 4.60077352814783, "grad_norm": 1.1484375, "learning_rate": 0.0003380475298374573, "loss": 5.3326, "mean_token_accuracy": 0.20309751331806183, "num_tokens": 9737125.0, "step": 5355 }, { "entropy": 5.813335514068603, "epoch": 4.605070906746884, "grad_norm": 1.109375, "learning_rate": 0.000337728298162589, "loss": 5.3499, "mean_token_accuracy": 0.19702604413032532, "num_tokens": 9746309.0, "step": 5360 }, { "entropy": 5.838102722167969, "epoch": 4.609368285345939, "grad_norm": 1.1484375, "learning_rate": 0.00033740892951418993, "loss": 5.232, "mean_token_accuracy": 0.2094883754849434, "num_tokens": 9755633.0, "step": 5365 }, { "entropy": 5.877121877670288, "epoch": 4.613665663944993, "grad_norm": 1.2734375, "learning_rate": 0.0003370894245896333, "loss": 5.2713, "mean_token_accuracy": 0.19735931158065795, "num_tokens": 9765179.0, "step": 5370 }, { "entropy": 5.872338008880615, "epoch": 4.617963042544048, "grad_norm": 1.3359375, "learning_rate": 0.00033676978408659047, "loss": 5.2987, "mean_token_accuracy": 0.2016567572951317, "num_tokens": 9774085.0, "step": 5375 }, { "entropy": 5.845898246765136, "epoch": 4.622260421143102, "grad_norm": 1.09375, "learning_rate": 0.0003364500087030283, "loss": 5.4123, "mean_token_accuracy": 0.19296547174453735, "num_tokens": 9784650.0, "step": 5380 }, { "entropy": 5.869012546539307, "epoch": 4.626557799742157, "grad_norm": 1.1171875, "learning_rate": 0.00033613009913720845, "loss": 5.2707, "mean_token_accuracy": 0.20299201905727388, "num_tokens": 9793947.0, "step": 5385 }, { "entropy": 5.734190225601196, "epoch": 4.6308551783412115, "grad_norm": 1.234375, "learning_rate": 0.00033581005608768563, "loss": 5.2453, "mean_token_accuracy": 0.2124895542860031, "num_tokens": 9803593.0, "step": 5390 }, { "entropy": 5.793021965026855, "epoch": 4.635152556940266, "grad_norm": 1.1953125, "learning_rate": 0.0003354898802533058, "loss": 5.2855, "mean_token_accuracy": 0.20431207865476608, "num_tokens": 9812295.0, "step": 5395 }, { "entropy": 5.791452312469483, "epoch": 4.6394499355393215, "grad_norm": 1.125, "learning_rate": 0.0003351695723332051, "loss": 5.2934, "mean_token_accuracy": 0.2097485601902008, "num_tokens": 9820586.0, "step": 5400 }, { "entropy": 5.798425960540771, "epoch": 4.643747314138375, "grad_norm": 1.1484375, "learning_rate": 0.00033484913302680807, "loss": 5.2279, "mean_token_accuracy": 0.21040427088737487, "num_tokens": 9829080.0, "step": 5405 }, { "entropy": 5.796739912033081, "epoch": 4.648044692737431, "grad_norm": 1.125, "learning_rate": 0.00033452856303382595, "loss": 5.2475, "mean_token_accuracy": 0.20435117036104203, "num_tokens": 9838421.0, "step": 5410 }, { "entropy": 5.759791278839112, "epoch": 4.652342071336484, "grad_norm": 1.3515625, "learning_rate": 0.0003342078630542555, "loss": 5.2524, "mean_token_accuracy": 0.21281823366880417, "num_tokens": 9847151.0, "step": 5415 }, { "entropy": 5.807016801834107, "epoch": 4.65663944993554, "grad_norm": 1.171875, "learning_rate": 0.00033388703378837737, "loss": 5.275, "mean_token_accuracy": 0.20886558741331102, "num_tokens": 9856803.0, "step": 5420 }, { "entropy": 5.791787147521973, "epoch": 4.660936828534594, "grad_norm": 1.1953125, "learning_rate": 0.0003335660759367544, "loss": 5.1847, "mean_token_accuracy": 0.22501839995384215, "num_tokens": 9865617.0, "step": 5425 }, { "entropy": 5.765948724746704, "epoch": 4.665234207133649, "grad_norm": 1.1328125, "learning_rate": 0.00033324499020023025, "loss": 5.2534, "mean_token_accuracy": 0.21098006069660186, "num_tokens": 9875454.0, "step": 5430 }, { "entropy": 5.817541313171387, "epoch": 4.669531585732703, "grad_norm": 1.1875, "learning_rate": 0.0003329237772799277, "loss": 5.2502, "mean_token_accuracy": 0.20961165130138398, "num_tokens": 9884770.0, "step": 5435 }, { "entropy": 5.783469343185425, "epoch": 4.673828964331758, "grad_norm": 1.2421875, "learning_rate": 0.0003326024378772477, "loss": 5.2538, "mean_token_accuracy": 0.2091410353779793, "num_tokens": 9893594.0, "step": 5440 }, { "entropy": 5.793620014190674, "epoch": 4.678126342930812, "grad_norm": 1.109375, "learning_rate": 0.0003322809726938667, "loss": 5.3607, "mean_token_accuracy": 0.19666333645582199, "num_tokens": 9902260.0, "step": 5445 }, { "entropy": 5.804405307769775, "epoch": 4.682423721529867, "grad_norm": 1.2265625, "learning_rate": 0.00033195938243173645, "loss": 5.2657, "mean_token_accuracy": 0.20829562693834305, "num_tokens": 9911020.0, "step": 5450 }, { "entropy": 5.8101622581481935, "epoch": 4.6867211001289215, "grad_norm": 1.3515625, "learning_rate": 0.0003316376677930814, "loss": 5.277, "mean_token_accuracy": 0.20017611235380173, "num_tokens": 9918696.0, "step": 5455 }, { "entropy": 5.745956611633301, "epoch": 4.691018478727976, "grad_norm": 1.21875, "learning_rate": 0.0003313158294803977, "loss": 5.3171, "mean_token_accuracy": 0.1995955988764763, "num_tokens": 9927638.0, "step": 5460 }, { "entropy": 5.824975442886353, "epoch": 4.695315857327031, "grad_norm": 1.2109375, "learning_rate": 0.00033099386819645176, "loss": 5.2912, "mean_token_accuracy": 0.20382552444934846, "num_tokens": 9936969.0, "step": 5465 }, { "entropy": 5.796650314331055, "epoch": 4.699613235926085, "grad_norm": 1.046875, "learning_rate": 0.0003306717846442782, "loss": 5.1993, "mean_token_accuracy": 0.20417630672454834, "num_tokens": 9945229.0, "step": 5470 }, { "entropy": 5.7901218891143795, "epoch": 4.70391061452514, "grad_norm": 1.25, "learning_rate": 0.0003303495795271788, "loss": 5.1995, "mean_token_accuracy": 0.20233412235975265, "num_tokens": 9953759.0, "step": 5475 }, { "entropy": 5.770085334777832, "epoch": 4.708207993124194, "grad_norm": 1.140625, "learning_rate": 0.00033002725354872075, "loss": 5.3092, "mean_token_accuracy": 0.2047215849161148, "num_tokens": 9962771.0, "step": 5480 }, { "entropy": 5.800899696350098, "epoch": 4.712505371723249, "grad_norm": 1.3203125, "learning_rate": 0.00032970480741273514, "loss": 5.3104, "mean_token_accuracy": 0.19106538593769073, "num_tokens": 9972481.0, "step": 5485 }, { "entropy": 5.8685791015625, "epoch": 4.716802750322303, "grad_norm": 1.390625, "learning_rate": 0.0003293822418233155, "loss": 5.256, "mean_token_accuracy": 0.2051583468914032, "num_tokens": 9980773.0, "step": 5490 }, { "entropy": 5.8781898021698, "epoch": 4.721100128921358, "grad_norm": 1.1953125, "learning_rate": 0.0003290595574848161, "loss": 5.3453, "mean_token_accuracy": 0.19384868294000626, "num_tokens": 9989830.0, "step": 5495 }, { "entropy": 5.756228923797607, "epoch": 4.725397507520412, "grad_norm": 1.15625, "learning_rate": 0.0003287367551018505, "loss": 5.272, "mean_token_accuracy": 0.20579312443733216, "num_tokens": 9999234.0, "step": 5500 }, { "epoch": 4.725397507520412, "eval_entropy": 5.592626677977072, "eval_loss": 5.931019306182861, "eval_mean_token_accuracy": 0.17753368537235367, "eval_num_tokens": 9999234.0, "eval_runtime": 2.0334, "eval_samples_per_second": 1745.336, "eval_steps_per_second": 218.351, "step": 5500 }, { "entropy": 5.8168501377105715, "epoch": 4.729694886119467, "grad_norm": 1.1015625, "learning_rate": 0.0003284138353792903, "loss": 5.3383, "mean_token_accuracy": 0.2040895164012909, "num_tokens": 10008671.0, "step": 5505 }, { "entropy": 5.784496402740478, "epoch": 4.7339922647185215, "grad_norm": 1.265625, "learning_rate": 0.0003280907990222628, "loss": 5.2985, "mean_token_accuracy": 0.2070325642824173, "num_tokens": 10017170.0, "step": 5510 }, { "entropy": 5.77425446510315, "epoch": 4.738289643317576, "grad_norm": 1.3125, "learning_rate": 0.00032776764673615055, "loss": 5.3156, "mean_token_accuracy": 0.20255803018808366, "num_tokens": 10025712.0, "step": 5515 }, { "entropy": 5.8088236331939695, "epoch": 4.742587021916631, "grad_norm": 1.09375, "learning_rate": 0.0003274443792265888, "loss": 5.3115, "mean_token_accuracy": 0.21292225122451783, "num_tokens": 10035297.0, "step": 5520 }, { "entropy": 5.837254667282105, "epoch": 4.746884400515685, "grad_norm": 1.1796875, "learning_rate": 0.00032712099719946474, "loss": 5.278, "mean_token_accuracy": 0.21366028040647506, "num_tokens": 10043903.0, "step": 5525 }, { "entropy": 5.739489364624023, "epoch": 4.75118177911474, "grad_norm": 1.2265625, "learning_rate": 0.00032679750136091533, "loss": 5.3269, "mean_token_accuracy": 0.20195425003767015, "num_tokens": 10053035.0, "step": 5530 }, { "entropy": 5.721313333511352, "epoch": 4.755479157713794, "grad_norm": 1.25, "learning_rate": 0.0003264738924173262, "loss": 5.2737, "mean_token_accuracy": 0.20684178918600082, "num_tokens": 10061911.0, "step": 5535 }, { "entropy": 5.809173583984375, "epoch": 4.759776536312849, "grad_norm": 1.1875, "learning_rate": 0.00032615017107533, "loss": 5.2765, "mean_token_accuracy": 0.2063768208026886, "num_tokens": 10070738.0, "step": 5540 }, { "entropy": 5.802584886550903, "epoch": 4.764073914911903, "grad_norm": 1.21875, "learning_rate": 0.0003258263380418047, "loss": 5.2855, "mean_token_accuracy": 0.20578781515359879, "num_tokens": 10080638.0, "step": 5545 }, { "entropy": 5.914425611495972, "epoch": 4.768371293510958, "grad_norm": 1.2421875, "learning_rate": 0.00032550239402387226, "loss": 5.3363, "mean_token_accuracy": 0.19763863384723662, "num_tokens": 10089429.0, "step": 5550 }, { "entropy": 5.7599162578582765, "epoch": 4.772668672110013, "grad_norm": 1.1953125, "learning_rate": 0.00032517833972889695, "loss": 5.206, "mean_token_accuracy": 0.2099302053451538, "num_tokens": 10098109.0, "step": 5555 }, { "entropy": 5.811598682403565, "epoch": 4.776966050709067, "grad_norm": 1.1953125, "learning_rate": 0.00032485417586448375, "loss": 5.3145, "mean_token_accuracy": 0.20163995772600174, "num_tokens": 10106808.0, "step": 5560 }, { "entropy": 5.86333212852478, "epoch": 4.781263429308122, "grad_norm": 1.28125, "learning_rate": 0.000324529903138477, "loss": 5.3143, "mean_token_accuracy": 0.20143208354711534, "num_tokens": 10116372.0, "step": 5565 }, { "entropy": 5.801443433761596, "epoch": 4.785560807907177, "grad_norm": 1.1875, "learning_rate": 0.0003242055222589587, "loss": 5.2258, "mean_token_accuracy": 0.21505694687366486, "num_tokens": 10125256.0, "step": 5570 }, { "entropy": 5.808296251296997, "epoch": 4.7898581865062315, "grad_norm": 1.15625, "learning_rate": 0.000323881033934247, "loss": 5.3535, "mean_token_accuracy": 0.19890447854995727, "num_tokens": 10134784.0, "step": 5575 }, { "entropy": 5.8784411430358885, "epoch": 4.794155565105286, "grad_norm": 1.265625, "learning_rate": 0.00032355643887289486, "loss": 5.289, "mean_token_accuracy": 0.2091620832681656, "num_tokens": 10144324.0, "step": 5580 }, { "entropy": 5.851370334625244, "epoch": 4.798452943704341, "grad_norm": 1.2109375, "learning_rate": 0.0003232317377836881, "loss": 5.329, "mean_token_accuracy": 0.19960423558950424, "num_tokens": 10152866.0, "step": 5585 }, { "entropy": 5.782239103317261, "epoch": 4.802750322303395, "grad_norm": 1.1796875, "learning_rate": 0.000322906931375644, "loss": 5.2522, "mean_token_accuracy": 0.2105662614107132, "num_tokens": 10162457.0, "step": 5590 }, { "entropy": 5.767592477798462, "epoch": 4.80704770090245, "grad_norm": 1.171875, "learning_rate": 0.00032258202035801, "loss": 5.3246, "mean_token_accuracy": 0.1998135194182396, "num_tokens": 10171604.0, "step": 5595 }, { "entropy": 5.87656307220459, "epoch": 4.811345079501504, "grad_norm": 1.15625, "learning_rate": 0.000322257005440262, "loss": 5.2848, "mean_token_accuracy": 0.20817122161388396, "num_tokens": 10180762.0, "step": 5600 }, { "entropy": 5.740377759933471, "epoch": 4.815642458100559, "grad_norm": 1.1484375, "learning_rate": 0.0003219318873321025, "loss": 5.2017, "mean_token_accuracy": 0.22599002420902253, "num_tokens": 10189122.0, "step": 5605 }, { "entropy": 5.7609411716461185, "epoch": 4.819939836699613, "grad_norm": 1.1328125, "learning_rate": 0.00032160666674345954, "loss": 5.3069, "mean_token_accuracy": 0.19678669720888137, "num_tokens": 10197280.0, "step": 5610 }, { "entropy": 5.834485340118408, "epoch": 4.824237215298668, "grad_norm": 1.0859375, "learning_rate": 0.00032128134438448504, "loss": 5.3481, "mean_token_accuracy": 0.19607715606689452, "num_tokens": 10207507.0, "step": 5615 }, { "entropy": 5.843401002883911, "epoch": 4.828534593897722, "grad_norm": 1.2265625, "learning_rate": 0.00032095592096555284, "loss": 5.3241, "mean_token_accuracy": 0.19834306091070175, "num_tokens": 10217584.0, "step": 5620 }, { "entropy": 5.804885768890381, "epoch": 4.832831972496777, "grad_norm": 1.1484375, "learning_rate": 0.0003206303971972577, "loss": 5.257, "mean_token_accuracy": 0.21136587262153625, "num_tokens": 10226388.0, "step": 5625 }, { "entropy": 5.773497581481934, "epoch": 4.8371293510958315, "grad_norm": 1.25, "learning_rate": 0.0003203047737904134, "loss": 5.2796, "mean_token_accuracy": 0.20816502273082732, "num_tokens": 10235333.0, "step": 5630 }, { "entropy": 5.809968280792236, "epoch": 4.841426729694886, "grad_norm": 1.2578125, "learning_rate": 0.00031997905145605135, "loss": 5.3218, "mean_token_accuracy": 0.20077406167984008, "num_tokens": 10243985.0, "step": 5635 }, { "entropy": 5.875359725952149, "epoch": 4.845724108293941, "grad_norm": 1.1953125, "learning_rate": 0.00031965323090541874, "loss": 5.3292, "mean_token_accuracy": 0.19166997075080872, "num_tokens": 10252968.0, "step": 5640 }, { "entropy": 5.836591100692749, "epoch": 4.850021486892995, "grad_norm": 1.1875, "learning_rate": 0.0003193273128499777, "loss": 5.1951, "mean_token_accuracy": 0.20659874528646469, "num_tokens": 10261890.0, "step": 5645 }, { "entropy": 5.755848217010498, "epoch": 4.85431886549205, "grad_norm": 1.21875, "learning_rate": 0.00031900129800140287, "loss": 5.3049, "mean_token_accuracy": 0.20563669949769975, "num_tokens": 10271363.0, "step": 5650 }, { "entropy": 5.803152656555175, "epoch": 4.858616244091104, "grad_norm": 1.34375, "learning_rate": 0.00031867518707158027, "loss": 5.335, "mean_token_accuracy": 0.19770116060972215, "num_tokens": 10280608.0, "step": 5655 }, { "entropy": 5.818397092819214, "epoch": 4.862913622690159, "grad_norm": 1.1875, "learning_rate": 0.000318348980772606, "loss": 5.2525, "mean_token_accuracy": 0.20726011395454408, "num_tokens": 10289972.0, "step": 5660 }, { "entropy": 5.860452508926391, "epoch": 4.867211001289213, "grad_norm": 1.3203125, "learning_rate": 0.00031802267981678414, "loss": 5.3123, "mean_token_accuracy": 0.20409038215875625, "num_tokens": 10298740.0, "step": 5665 }, { "entropy": 5.833805656433105, "epoch": 4.871508379888268, "grad_norm": 1.078125, "learning_rate": 0.00031769628491662563, "loss": 5.2809, "mean_token_accuracy": 0.20727547705173494, "num_tokens": 10307706.0, "step": 5670 }, { "entropy": 5.803285360336304, "epoch": 4.8758057584873224, "grad_norm": 1.0703125, "learning_rate": 0.00031736979678484634, "loss": 5.329, "mean_token_accuracy": 0.2064347133040428, "num_tokens": 10317549.0, "step": 5675 }, { "entropy": 5.820219898223877, "epoch": 4.880103137086377, "grad_norm": 1.1640625, "learning_rate": 0.00031704321613436597, "loss": 5.3611, "mean_token_accuracy": 0.19811774492263795, "num_tokens": 10327681.0, "step": 5680 }, { "entropy": 5.755207061767578, "epoch": 4.8844005156854315, "grad_norm": 1.0859375, "learning_rate": 0.0003167165436783061, "loss": 5.2873, "mean_token_accuracy": 0.21109480857849122, "num_tokens": 10336261.0, "step": 5685 }, { "entropy": 5.727659749984741, "epoch": 4.888697894284486, "grad_norm": 1.234375, "learning_rate": 0.00031638978012998875, "loss": 5.2052, "mean_token_accuracy": 0.21589273661375047, "num_tokens": 10344770.0, "step": 5690 }, { "entropy": 5.819532823562622, "epoch": 4.892995272883541, "grad_norm": 1.1796875, "learning_rate": 0.000316062926202935, "loss": 5.3654, "mean_token_accuracy": 0.196533140540123, "num_tokens": 10354246.0, "step": 5695 }, { "entropy": 5.840965127944946, "epoch": 4.897292651482596, "grad_norm": 1.328125, "learning_rate": 0.0003157359826108632, "loss": 5.2826, "mean_token_accuracy": 0.20469695180654526, "num_tokens": 10362693.0, "step": 5700 }, { "entropy": 5.81398639678955, "epoch": 4.90159003008165, "grad_norm": 1.1953125, "learning_rate": 0.00031540895006768727, "loss": 5.2798, "mean_token_accuracy": 0.20513766556978225, "num_tokens": 10371639.0, "step": 5705 }, { "entropy": 5.778346061706543, "epoch": 4.905887408680705, "grad_norm": 1.1015625, "learning_rate": 0.0003150818292875158, "loss": 5.2986, "mean_token_accuracy": 0.20309000611305236, "num_tokens": 10381237.0, "step": 5710 }, { "entropy": 5.8171275615692135, "epoch": 4.91018478727976, "grad_norm": 1.203125, "learning_rate": 0.0003147546209846497, "loss": 5.2726, "mean_token_accuracy": 0.20377830415964127, "num_tokens": 10389932.0, "step": 5715 }, { "entropy": 5.729213857650757, "epoch": 4.914482165878814, "grad_norm": 1.2109375, "learning_rate": 0.0003144273258735812, "loss": 5.1685, "mean_token_accuracy": 0.2099962517619133, "num_tokens": 10398938.0, "step": 5720 }, { "entropy": 5.770321941375732, "epoch": 4.918779544477869, "grad_norm": 1.453125, "learning_rate": 0.0003140999446689919, "loss": 5.2774, "mean_token_accuracy": 0.20088756531476976, "num_tokens": 10407980.0, "step": 5725 }, { "entropy": 5.770383071899414, "epoch": 4.923076923076923, "grad_norm": 1.421875, "learning_rate": 0.0003137724780857516, "loss": 5.3436, "mean_token_accuracy": 0.20163364857435226, "num_tokens": 10416990.0, "step": 5730 }, { "entropy": 5.863169193267822, "epoch": 4.927374301675978, "grad_norm": 1.0625, "learning_rate": 0.00031344492683891634, "loss": 5.3587, "mean_token_accuracy": 0.204886694252491, "num_tokens": 10426573.0, "step": 5735 }, { "entropy": 5.851557922363281, "epoch": 4.931671680275032, "grad_norm": 1.265625, "learning_rate": 0.0003131172916437272, "loss": 5.3233, "mean_token_accuracy": 0.19881743043661118, "num_tokens": 10435162.0, "step": 5740 }, { "entropy": 5.759703254699707, "epoch": 4.935969058874087, "grad_norm": 1.2578125, "learning_rate": 0.00031278957321560845, "loss": 5.3238, "mean_token_accuracy": 0.20171435326337814, "num_tokens": 10444374.0, "step": 5745 }, { "entropy": 5.877453994750977, "epoch": 4.9402664374731415, "grad_norm": 1.390625, "learning_rate": 0.00031246177227016615, "loss": 5.3411, "mean_token_accuracy": 0.1953754648566246, "num_tokens": 10452679.0, "step": 5750 }, { "entropy": 5.818898057937622, "epoch": 4.944563816072196, "grad_norm": 1.1640625, "learning_rate": 0.00031213388952318653, "loss": 5.2927, "mean_token_accuracy": 0.2119872346520424, "num_tokens": 10461801.0, "step": 5755 }, { "entropy": 5.807455348968506, "epoch": 4.948861194671251, "grad_norm": 1.125, "learning_rate": 0.0003118059256906345, "loss": 5.2809, "mean_token_accuracy": 0.20208909511566162, "num_tokens": 10471176.0, "step": 5760 }, { "entropy": 5.840291500091553, "epoch": 4.953158573270305, "grad_norm": 1.25, "learning_rate": 0.00031147788148865204, "loss": 5.342, "mean_token_accuracy": 0.19445150792598725, "num_tokens": 10480403.0, "step": 5765 }, { "entropy": 5.799935436248779, "epoch": 4.95745595186936, "grad_norm": 1.1484375, "learning_rate": 0.0003111497576335564, "loss": 5.2761, "mean_token_accuracy": 0.20416030585765838, "num_tokens": 10489574.0, "step": 5770 }, { "entropy": 5.798885202407837, "epoch": 4.961753330468414, "grad_norm": 1.015625, "learning_rate": 0.0003108215548418391, "loss": 5.2857, "mean_token_accuracy": 0.20692466497421264, "num_tokens": 10499631.0, "step": 5775 }, { "entropy": 5.770237159729004, "epoch": 4.966050709067469, "grad_norm": 1.4375, "learning_rate": 0.0003104932738301637, "loss": 5.2702, "mean_token_accuracy": 0.2006146103143692, "num_tokens": 10508128.0, "step": 5780 }, { "entropy": 5.782887268066406, "epoch": 4.970348087666523, "grad_norm": 1.171875, "learning_rate": 0.00031016491531536477, "loss": 5.2448, "mean_token_accuracy": 0.2088773876428604, "num_tokens": 10517544.0, "step": 5785 }, { "entropy": 5.767910051345825, "epoch": 4.974645466265578, "grad_norm": 1.234375, "learning_rate": 0.0003098364800144462, "loss": 5.3132, "mean_token_accuracy": 0.20686182081699372, "num_tokens": 10526244.0, "step": 5790 }, { "entropy": 5.888048982620239, "epoch": 4.9789428448646325, "grad_norm": 1.2109375, "learning_rate": 0.0003095079686445792, "loss": 5.3812, "mean_token_accuracy": 0.20500532984733583, "num_tokens": 10535887.0, "step": 5795 }, { "entropy": 5.821762752532959, "epoch": 4.983240223463687, "grad_norm": 1.15625, "learning_rate": 0.00030917938192310146, "loss": 5.2341, "mean_token_accuracy": 0.20750374495983123, "num_tokens": 10544420.0, "step": 5800 }, { "entropy": 5.807241201400757, "epoch": 4.9875376020627415, "grad_norm": 1.1328125, "learning_rate": 0.00030885072056751494, "loss": 5.3215, "mean_token_accuracy": 0.20071204453706742, "num_tokens": 10553114.0, "step": 5805 }, { "entropy": 5.778054094314575, "epoch": 4.991834980661796, "grad_norm": 1.1171875, "learning_rate": 0.00030852198529548476, "loss": 5.3415, "mean_token_accuracy": 0.2032615214586258, "num_tokens": 10562272.0, "step": 5810 }, { "entropy": 5.810254526138306, "epoch": 4.996132359260851, "grad_norm": 1.21875, "learning_rate": 0.0003081931768248373, "loss": 5.2967, "mean_token_accuracy": 0.20949976444244384, "num_tokens": 10571757.0, "step": 5815 }, { "entropy": 5.7521191173129615, "epoch": 5.0, "grad_norm": 1.796875, "learning_rate": 0.0003078642958735588, "loss": 5.238, "mean_token_accuracy": 0.2155479672882292, "num_tokens": 10579660.0, "step": 5820 }, { "entropy": 5.837719202041626, "epoch": 5.0042973785990545, "grad_norm": 1.203125, "learning_rate": 0.00030753534315979393, "loss": 5.1272, "mean_token_accuracy": 0.21314742416143417, "num_tokens": 10589139.0, "step": 5825 }, { "entropy": 5.797129201889038, "epoch": 5.008594757198109, "grad_norm": 1.2890625, "learning_rate": 0.0003072063194018438, "loss": 4.9242, "mean_token_accuracy": 0.2357708305120468, "num_tokens": 10597915.0, "step": 5830 }, { "entropy": 5.798255205154419, "epoch": 5.012892135797164, "grad_norm": 1.1953125, "learning_rate": 0.0003068772253181648, "loss": 5.1386, "mean_token_accuracy": 0.21062317192554475, "num_tokens": 10606491.0, "step": 5835 }, { "entropy": 5.813257837295533, "epoch": 5.017189514396218, "grad_norm": 1.0546875, "learning_rate": 0.0003065480616273671, "loss": 5.117, "mean_token_accuracy": 0.21578232049942017, "num_tokens": 10615852.0, "step": 5840 }, { "entropy": 5.866687679290772, "epoch": 5.021486892995273, "grad_norm": 1.2734375, "learning_rate": 0.0003062188290482123, "loss": 5.0954, "mean_token_accuracy": 0.21719059944152833, "num_tokens": 10625442.0, "step": 5845 }, { "entropy": 5.775999212265015, "epoch": 5.025784271594327, "grad_norm": 1.078125, "learning_rate": 0.00030588952829961304, "loss": 5.0303, "mean_token_accuracy": 0.22669098973274232, "num_tokens": 10634972.0, "step": 5850 }, { "entropy": 5.820281648635865, "epoch": 5.030081650193382, "grad_norm": 1.234375, "learning_rate": 0.0003055601601006303, "loss": 5.0305, "mean_token_accuracy": 0.21823905259370804, "num_tokens": 10644487.0, "step": 5855 }, { "entropy": 5.800490808486939, "epoch": 5.034379028792436, "grad_norm": 1.2578125, "learning_rate": 0.0003052307251704728, "loss": 5.0487, "mean_token_accuracy": 0.23768426477909088, "num_tokens": 10654144.0, "step": 5860 }, { "entropy": 5.776832151412964, "epoch": 5.038676407391491, "grad_norm": 1.4375, "learning_rate": 0.0003049012242284946, "loss": 5.0901, "mean_token_accuracy": 0.21536518186330794, "num_tokens": 10663023.0, "step": 5865 }, { "entropy": 5.837555408477783, "epoch": 5.0429737859905455, "grad_norm": 1.2109375, "learning_rate": 0.0003045716579941941, "loss": 5.1255, "mean_token_accuracy": 0.21810145378112794, "num_tokens": 10672001.0, "step": 5870 }, { "entropy": 5.780414390563965, "epoch": 5.0472711645896, "grad_norm": 1.0859375, "learning_rate": 0.00030424202718721215, "loss": 5.095, "mean_token_accuracy": 0.21664355248212813, "num_tokens": 10682654.0, "step": 5875 }, { "entropy": 5.7828668594360355, "epoch": 5.051568543188655, "grad_norm": 1.2734375, "learning_rate": 0.00030391233252733085, "loss": 5.0795, "mean_token_accuracy": 0.22158391326665877, "num_tokens": 10691429.0, "step": 5880 }, { "entropy": 5.7656014919281, "epoch": 5.055865921787709, "grad_norm": 1.1640625, "learning_rate": 0.00030358257473447144, "loss": 5.0694, "mean_token_accuracy": 0.22057975679636002, "num_tokens": 10701130.0, "step": 5885 }, { "entropy": 5.748148441314697, "epoch": 5.060163300386764, "grad_norm": 1.203125, "learning_rate": 0.00030325275452869316, "loss": 5.0128, "mean_token_accuracy": 0.2287128150463104, "num_tokens": 10709779.0, "step": 5890 }, { "entropy": 5.718500375747681, "epoch": 5.064460678985819, "grad_norm": 1.234375, "learning_rate": 0.00030292287263019153, "loss": 5.0811, "mean_token_accuracy": 0.22246713042259217, "num_tokens": 10718795.0, "step": 5895 }, { "entropy": 5.670262241363526, "epoch": 5.068758057584874, "grad_norm": 1.21875, "learning_rate": 0.00030259292975929675, "loss": 4.9893, "mean_token_accuracy": 0.23385845869779587, "num_tokens": 10728202.0, "step": 5900 }, { "entropy": 5.783095836639404, "epoch": 5.073055436183928, "grad_norm": 1.109375, "learning_rate": 0.0003022629266364723, "loss": 5.0571, "mean_token_accuracy": 0.2212306410074234, "num_tokens": 10737050.0, "step": 5905 }, { "entropy": 5.780191469192505, "epoch": 5.077352814782983, "grad_norm": 1.2890625, "learning_rate": 0.00030193286398231276, "loss": 5.0425, "mean_token_accuracy": 0.2253589302301407, "num_tokens": 10745261.0, "step": 5910 }, { "entropy": 5.737091827392578, "epoch": 5.081650193382037, "grad_norm": 1.21875, "learning_rate": 0.00030160274251754337, "loss": 5.1055, "mean_token_accuracy": 0.2190812796354294, "num_tokens": 10755008.0, "step": 5915 }, { "entropy": 5.78898344039917, "epoch": 5.085947571981092, "grad_norm": 1.2265625, "learning_rate": 0.00030127256296301724, "loss": 5.1151, "mean_token_accuracy": 0.21488914340734483, "num_tokens": 10763951.0, "step": 5920 }, { "entropy": 5.804350757598877, "epoch": 5.090244950580146, "grad_norm": 1.140625, "learning_rate": 0.0003009423260397148, "loss": 5.0493, "mean_token_accuracy": 0.2211121663451195, "num_tokens": 10772770.0, "step": 5925 }, { "entropy": 5.7645410060882565, "epoch": 5.094542329179201, "grad_norm": 1.171875, "learning_rate": 0.00030061203246874125, "loss": 5.126, "mean_token_accuracy": 0.21714986562728883, "num_tokens": 10781827.0, "step": 5930 }, { "entropy": 5.829766368865966, "epoch": 5.0988397077782555, "grad_norm": 1.2578125, "learning_rate": 0.00030028168297132593, "loss": 5.1971, "mean_token_accuracy": 0.21828972399234772, "num_tokens": 10792321.0, "step": 5935 }, { "entropy": 5.7105169773101805, "epoch": 5.10313708637731, "grad_norm": 1.328125, "learning_rate": 0.0002999512782688199, "loss": 5.1534, "mean_token_accuracy": 0.22245844155550004, "num_tokens": 10801689.0, "step": 5940 }, { "entropy": 5.806783151626587, "epoch": 5.1074344649763646, "grad_norm": 1.203125, "learning_rate": 0.0002996208190826951, "loss": 5.0674, "mean_token_accuracy": 0.21947959512472154, "num_tokens": 10810513.0, "step": 5945 }, { "entropy": 5.784919166564942, "epoch": 5.111731843575419, "grad_norm": 1.1875, "learning_rate": 0.00029929030613454227, "loss": 5.0423, "mean_token_accuracy": 0.2230915367603302, "num_tokens": 10819581.0, "step": 5950 }, { "entropy": 5.73317141532898, "epoch": 5.116029222174474, "grad_norm": 1.1953125, "learning_rate": 0.0002989597401460697, "loss": 5.0427, "mean_token_accuracy": 0.22367439568042755, "num_tokens": 10828139.0, "step": 5955 }, { "entropy": 5.761006593704224, "epoch": 5.120326600773528, "grad_norm": 1.328125, "learning_rate": 0.00029862912183910105, "loss": 5.0515, "mean_token_accuracy": 0.2288846716284752, "num_tokens": 10836256.0, "step": 5960 }, { "entropy": 5.753820896148682, "epoch": 5.124623979372583, "grad_norm": 1.1875, "learning_rate": 0.00029829845193557496, "loss": 5.0806, "mean_token_accuracy": 0.22265101224184036, "num_tokens": 10846255.0, "step": 5965 }, { "entropy": 5.755499696731567, "epoch": 5.128921357971637, "grad_norm": 1.234375, "learning_rate": 0.0002979677311575421, "loss": 5.0762, "mean_token_accuracy": 0.2280671551823616, "num_tokens": 10855546.0, "step": 5970 }, { "entropy": 5.836764097213745, "epoch": 5.133218736570692, "grad_norm": 1.296875, "learning_rate": 0.0002976369602271646, "loss": 5.1451, "mean_token_accuracy": 0.21697065979242325, "num_tokens": 10864417.0, "step": 5975 }, { "entropy": 5.814312410354614, "epoch": 5.137516115169746, "grad_norm": 1.1484375, "learning_rate": 0.0002973061398667138, "loss": 5.1172, "mean_token_accuracy": 0.21944077759981157, "num_tokens": 10874527.0, "step": 5980 }, { "entropy": 5.744282293319702, "epoch": 5.141813493768801, "grad_norm": 1.296875, "learning_rate": 0.00029697527079856916, "loss": 5.1538, "mean_token_accuracy": 0.21050577759742736, "num_tokens": 10883486.0, "step": 5985 }, { "entropy": 5.761128902435303, "epoch": 5.1461108723678555, "grad_norm": 1.2890625, "learning_rate": 0.00029664435374521665, "loss": 5.0225, "mean_token_accuracy": 0.2243912473320961, "num_tokens": 10891972.0, "step": 5990 }, { "entropy": 5.789876794815063, "epoch": 5.15040825096691, "grad_norm": 1.2109375, "learning_rate": 0.00029631338942924664, "loss": 5.0419, "mean_token_accuracy": 0.22999733984470366, "num_tokens": 10901350.0, "step": 5995 }, { "entropy": 5.719483852386475, "epoch": 5.154705629565965, "grad_norm": 1.359375, "learning_rate": 0.0002959823785733531, "loss": 5.03, "mean_token_accuracy": 0.22308178544044494, "num_tokens": 10910114.0, "step": 6000 }, { "epoch": 5.154705629565965, "eval_entropy": 5.520973839738348, "eval_loss": 5.938914775848389, "eval_mean_token_accuracy": 0.1778814101440681, "eval_num_tokens": 10910114.0, "eval_runtime": 2.0532, "eval_samples_per_second": 1728.506, "eval_steps_per_second": 216.246, "step": 6000 }, { "entropy": 5.724570035934448, "epoch": 5.159003008165019, "grad_norm": 1.296875, "learning_rate": 0.0002956513219003312, "loss": 5.1102, "mean_token_accuracy": 0.22220082879066466, "num_tokens": 10919781.0, "step": 6005 }, { "entropy": 5.794006776809693, "epoch": 5.163300386764074, "grad_norm": 1.234375, "learning_rate": 0.00029532022013307666, "loss": 5.107, "mean_token_accuracy": 0.2223549634218216, "num_tokens": 10929561.0, "step": 6010 }, { "entropy": 5.776598358154297, "epoch": 5.167597765363128, "grad_norm": 1.1640625, "learning_rate": 0.00029498907399458325, "loss": 5.0801, "mean_token_accuracy": 0.21775457113981248, "num_tokens": 10939123.0, "step": 6015 }, { "entropy": 5.791857671737671, "epoch": 5.171895143962183, "grad_norm": 1.25, "learning_rate": 0.0002946578842079418, "loss": 5.111, "mean_token_accuracy": 0.22705103009939193, "num_tokens": 10947990.0, "step": 6020 }, { "entropy": 5.7706746578216555, "epoch": 5.176192522561237, "grad_norm": 1.328125, "learning_rate": 0.0002943266514963384, "loss": 5.0366, "mean_token_accuracy": 0.23295889049768448, "num_tokens": 10956569.0, "step": 6025 }, { "entropy": 5.732059144973755, "epoch": 5.180489901160292, "grad_norm": 1.296875, "learning_rate": 0.0002939953765830529, "loss": 5.0807, "mean_token_accuracy": 0.21273524165153504, "num_tokens": 10965466.0, "step": 6030 }, { "entropy": 5.758616828918457, "epoch": 5.184787279759346, "grad_norm": 1.15625, "learning_rate": 0.00029366406019145735, "loss": 5.0975, "mean_token_accuracy": 0.22184741795063018, "num_tokens": 10975051.0, "step": 6035 }, { "entropy": 5.697403001785278, "epoch": 5.189084658358402, "grad_norm": 1.2890625, "learning_rate": 0.0002933327030450143, "loss": 5.0003, "mean_token_accuracy": 0.22943248599767685, "num_tokens": 10983940.0, "step": 6040 }, { "entropy": 5.7583231925964355, "epoch": 5.193382036957456, "grad_norm": 1.25, "learning_rate": 0.00029300130586727545, "loss": 5.1201, "mean_token_accuracy": 0.2127103865146637, "num_tokens": 10994029.0, "step": 6045 }, { "entropy": 5.817176723480225, "epoch": 5.197679415556511, "grad_norm": 1.15625, "learning_rate": 0.00029266986938187943, "loss": 5.147, "mean_token_accuracy": 0.2095362886786461, "num_tokens": 11003616.0, "step": 6050 }, { "entropy": 5.755208492279053, "epoch": 5.2019767941555655, "grad_norm": 1.2109375, "learning_rate": 0.0002923383943125514, "loss": 5.0296, "mean_token_accuracy": 0.22274913787841796, "num_tokens": 11012068.0, "step": 6055 }, { "entropy": 5.748697900772095, "epoch": 5.20627417275462, "grad_norm": 1.34375, "learning_rate": 0.0002920068813831002, "loss": 5.0779, "mean_token_accuracy": 0.21767261028289794, "num_tokens": 11020510.0, "step": 6060 }, { "entropy": 5.725340127944946, "epoch": 5.210571551353675, "grad_norm": 1.1953125, "learning_rate": 0.0002916753313174178, "loss": 5.1218, "mean_token_accuracy": 0.21479322165250778, "num_tokens": 11029804.0, "step": 6065 }, { "entropy": 5.7446118831634525, "epoch": 5.214868929952729, "grad_norm": 1.2734375, "learning_rate": 0.0002913437448394768, "loss": 5.1012, "mean_token_accuracy": 0.22081879377365113, "num_tokens": 11038586.0, "step": 6070 }, { "entropy": 5.8289836883544925, "epoch": 5.219166308551784, "grad_norm": 1.1796875, "learning_rate": 0.00029101212267332955, "loss": 5.1577, "mean_token_accuracy": 0.21826230138540267, "num_tokens": 11048240.0, "step": 6075 }, { "entropy": 5.774588584899902, "epoch": 5.223463687150838, "grad_norm": 1.1171875, "learning_rate": 0.00029068046554310637, "loss": 5.0426, "mean_token_accuracy": 0.22095113396644592, "num_tokens": 11056703.0, "step": 6080 }, { "entropy": 5.716027879714966, "epoch": 5.227761065749893, "grad_norm": 1.140625, "learning_rate": 0.0002903487741730139, "loss": 5.0819, "mean_token_accuracy": 0.21253616362810135, "num_tokens": 11066246.0, "step": 6085 }, { "entropy": 5.745535945892334, "epoch": 5.232058444348947, "grad_norm": 1.3125, "learning_rate": 0.00029001704928733354, "loss": 5.1308, "mean_token_accuracy": 0.2211642697453499, "num_tokens": 11075277.0, "step": 6090 }, { "entropy": 5.723521947860718, "epoch": 5.236355822948002, "grad_norm": 1.21875, "learning_rate": 0.0002896852916104198, "loss": 5.0242, "mean_token_accuracy": 0.2263767346739769, "num_tokens": 11083759.0, "step": 6095 }, { "entropy": 5.815294790267944, "epoch": 5.240653201547056, "grad_norm": 1.2265625, "learning_rate": 0.0002893535018666988, "loss": 5.0008, "mean_token_accuracy": 0.22687483876943587, "num_tokens": 11091960.0, "step": 6100 }, { "entropy": 5.741855907440185, "epoch": 5.244950580146111, "grad_norm": 1.453125, "learning_rate": 0.00028902168078066674, "loss": 5.1304, "mean_token_accuracy": 0.21954144090414046, "num_tokens": 11101134.0, "step": 6105 }, { "entropy": 5.715764713287354, "epoch": 5.2492479587451655, "grad_norm": 1.2578125, "learning_rate": 0.0002886898290768883, "loss": 5.0783, "mean_token_accuracy": 0.21966248154640197, "num_tokens": 11110282.0, "step": 6110 }, { "entropy": 5.766628980636597, "epoch": 5.25354533734422, "grad_norm": 1.2421875, "learning_rate": 0.000288357947479995, "loss": 5.1472, "mean_token_accuracy": 0.2185099706053734, "num_tokens": 11119591.0, "step": 6115 }, { "entropy": 5.794510555267334, "epoch": 5.257842715943275, "grad_norm": 1.125, "learning_rate": 0.00028802603671468347, "loss": 5.1333, "mean_token_accuracy": 0.21663097888231278, "num_tokens": 11129164.0, "step": 6120 }, { "entropy": 5.794670677185058, "epoch": 5.262140094542329, "grad_norm": 1.203125, "learning_rate": 0.00028769409750571413, "loss": 5.1054, "mean_token_accuracy": 0.21479454636573792, "num_tokens": 11137973.0, "step": 6125 }, { "entropy": 5.762855958938599, "epoch": 5.266437473141384, "grad_norm": 1.1640625, "learning_rate": 0.00028736213057790975, "loss": 5.1731, "mean_token_accuracy": 0.21324900835752486, "num_tokens": 11147285.0, "step": 6130 }, { "entropy": 5.80087947845459, "epoch": 5.270734851740438, "grad_norm": 1.328125, "learning_rate": 0.0002870301366561533, "loss": 5.1457, "mean_token_accuracy": 0.2189454674720764, "num_tokens": 11155303.0, "step": 6135 }, { "entropy": 5.777155160903931, "epoch": 5.275032230339493, "grad_norm": 1.375, "learning_rate": 0.0002866981164653867, "loss": 5.0068, "mean_token_accuracy": 0.2319532886147499, "num_tokens": 11163553.0, "step": 6140 }, { "entropy": 5.673604679107666, "epoch": 5.279329608938547, "grad_norm": 1.4921875, "learning_rate": 0.0002863660707306095, "loss": 4.9501, "mean_token_accuracy": 0.23503543585538864, "num_tokens": 11171865.0, "step": 6145 }, { "entropy": 5.708528232574463, "epoch": 5.283626987537602, "grad_norm": 1.1328125, "learning_rate": 0.00028603400017687675, "loss": 5.1259, "mean_token_accuracy": 0.2180730476975441, "num_tokens": 11181137.0, "step": 6150 }, { "entropy": 5.644678974151612, "epoch": 5.287924366136656, "grad_norm": 1.2265625, "learning_rate": 0.00028570190552929794, "loss": 5.0033, "mean_token_accuracy": 0.23516686558723449, "num_tokens": 11190174.0, "step": 6155 }, { "entropy": 5.731041049957275, "epoch": 5.292221744735711, "grad_norm": 1.3203125, "learning_rate": 0.000285369787513035, "loss": 5.0203, "mean_token_accuracy": 0.22731503397226333, "num_tokens": 11197964.0, "step": 6160 }, { "entropy": 5.808375072479248, "epoch": 5.2965191233347655, "grad_norm": 1.078125, "learning_rate": 0.00028503764685330077, "loss": 5.1475, "mean_token_accuracy": 0.2158605545759201, "num_tokens": 11207974.0, "step": 6165 }, { "entropy": 5.69039740562439, "epoch": 5.30081650193382, "grad_norm": 1.359375, "learning_rate": 0.00028470548427535794, "loss": 5.0855, "mean_token_accuracy": 0.2215003788471222, "num_tokens": 11216430.0, "step": 6170 }, { "entropy": 5.781186437606811, "epoch": 5.305113880532875, "grad_norm": 1.203125, "learning_rate": 0.00028437330050451654, "loss": 5.1772, "mean_token_accuracy": 0.21430482119321823, "num_tokens": 11226189.0, "step": 6175 }, { "entropy": 5.7771838188171385, "epoch": 5.309411259131929, "grad_norm": 1.3515625, "learning_rate": 0.0002840410962661334, "loss": 5.1362, "mean_token_accuracy": 0.216461843252182, "num_tokens": 11234691.0, "step": 6180 }, { "entropy": 5.6778770923614506, "epoch": 5.313708637730985, "grad_norm": 1.328125, "learning_rate": 0.0002837088722856098, "loss": 4.9647, "mean_token_accuracy": 0.23130213618278503, "num_tokens": 11243852.0, "step": 6185 }, { "entropy": 5.759836912155151, "epoch": 5.318006016330038, "grad_norm": 1.2109375, "learning_rate": 0.00028337662928838996, "loss": 5.1367, "mean_token_accuracy": 0.22025407403707503, "num_tokens": 11253416.0, "step": 6190 }, { "entropy": 5.744880151748657, "epoch": 5.322303394929094, "grad_norm": 1.09375, "learning_rate": 0.00028304436799995986, "loss": 5.0381, "mean_token_accuracy": 0.23224859237670897, "num_tokens": 11262869.0, "step": 6195 }, { "entropy": 5.794106817245483, "epoch": 5.326600773528148, "grad_norm": 1.140625, "learning_rate": 0.00028271208914584534, "loss": 5.1634, "mean_token_accuracy": 0.21408282816410065, "num_tokens": 11272386.0, "step": 6200 }, { "entropy": 5.7819318771362305, "epoch": 5.330898152127203, "grad_norm": 1.2265625, "learning_rate": 0.00028237979345161065, "loss": 5.0198, "mean_token_accuracy": 0.2246573656797409, "num_tokens": 11281590.0, "step": 6205 }, { "entropy": 5.723759984970092, "epoch": 5.335195530726257, "grad_norm": 1.3359375, "learning_rate": 0.0002820474816428568, "loss": 5.0703, "mean_token_accuracy": 0.22172184884548188, "num_tokens": 11290873.0, "step": 6210 }, { "entropy": 5.763450860977173, "epoch": 5.339492909325312, "grad_norm": 1.3125, "learning_rate": 0.0002817151544452198, "loss": 5.047, "mean_token_accuracy": 0.22653649896383285, "num_tokens": 11299064.0, "step": 6215 }, { "entropy": 5.763836097717285, "epoch": 5.343790287924366, "grad_norm": 1.21875, "learning_rate": 0.00028138281258436947, "loss": 5.0815, "mean_token_accuracy": 0.2125794693827629, "num_tokens": 11307390.0, "step": 6220 }, { "entropy": 5.6746196269989015, "epoch": 5.348087666523421, "grad_norm": 1.421875, "learning_rate": 0.0002810504567860078, "loss": 5.0321, "mean_token_accuracy": 0.22260272949934007, "num_tokens": 11315606.0, "step": 6225 }, { "entropy": 5.817198133468628, "epoch": 5.3523850451224755, "grad_norm": 1.296875, "learning_rate": 0.0002807180877758667, "loss": 5.1543, "mean_token_accuracy": 0.2130942553281784, "num_tokens": 11323821.0, "step": 6230 }, { "entropy": 5.749542999267578, "epoch": 5.35668242372153, "grad_norm": 1.4296875, "learning_rate": 0.00028038570627970754, "loss": 5.0964, "mean_token_accuracy": 0.21933864206075668, "num_tokens": 11331850.0, "step": 6235 }, { "entropy": 5.718008375167846, "epoch": 5.360979802320585, "grad_norm": 1.4140625, "learning_rate": 0.0002800533130233184, "loss": 5.0655, "mean_token_accuracy": 0.22661355137825012, "num_tokens": 11340125.0, "step": 6240 }, { "entropy": 5.798234176635742, "epoch": 5.365277180919639, "grad_norm": 1.3203125, "learning_rate": 0.0002797209087325135, "loss": 5.099, "mean_token_accuracy": 0.21534867137670516, "num_tokens": 11349184.0, "step": 6245 }, { "entropy": 5.796571254730225, "epoch": 5.369574559518694, "grad_norm": 1.25, "learning_rate": 0.00027938849413313083, "loss": 5.151, "mean_token_accuracy": 0.21026744544506074, "num_tokens": 11357536.0, "step": 6250 }, { "entropy": 5.762140798568725, "epoch": 5.373871938117748, "grad_norm": 1.1640625, "learning_rate": 0.000279056069951031, "loss": 5.1291, "mean_token_accuracy": 0.2182137981057167, "num_tokens": 11367242.0, "step": 6255 }, { "entropy": 5.744655513763428, "epoch": 5.378169316716803, "grad_norm": 1.2890625, "learning_rate": 0.00027872363691209564, "loss": 5.0854, "mean_token_accuracy": 0.22035084962844848, "num_tokens": 11374932.0, "step": 6260 }, { "entropy": 5.718549394607544, "epoch": 5.382466695315857, "grad_norm": 1.3046875, "learning_rate": 0.0002783911957422256, "loss": 5.0746, "mean_token_accuracy": 0.21616823524236678, "num_tokens": 11383575.0, "step": 6265 }, { "entropy": 5.8026519298553465, "epoch": 5.386764073914912, "grad_norm": 1.3046875, "learning_rate": 0.0002780587471673394, "loss": 5.1228, "mean_token_accuracy": 0.2199627310037613, "num_tokens": 11392285.0, "step": 6270 }, { "entropy": 5.761375951766968, "epoch": 5.391061452513966, "grad_norm": 1.2265625, "learning_rate": 0.00027772629191337206, "loss": 5.0803, "mean_token_accuracy": 0.22224314510822296, "num_tokens": 11401054.0, "step": 6275 }, { "entropy": 5.73178014755249, "epoch": 5.395358831113021, "grad_norm": 1.34375, "learning_rate": 0.00027739383070627283, "loss": 5.1133, "mean_token_accuracy": 0.21695896238088608, "num_tokens": 11410529.0, "step": 6280 }, { "entropy": 5.738652086257934, "epoch": 5.3996562097120755, "grad_norm": 1.2109375, "learning_rate": 0.0002770613642720041, "loss": 5.0726, "mean_token_accuracy": 0.22234009355306625, "num_tokens": 11419961.0, "step": 6285 }, { "entropy": 5.816117954254151, "epoch": 5.40395358831113, "grad_norm": 1.109375, "learning_rate": 0.00027672889333653984, "loss": 5.2143, "mean_token_accuracy": 0.19956380277872085, "num_tokens": 11429529.0, "step": 6290 }, { "entropy": 5.772477054595948, "epoch": 5.408250966910185, "grad_norm": 1.1640625, "learning_rate": 0.0002763964186258635, "loss": 5.0713, "mean_token_accuracy": 0.2213875100016594, "num_tokens": 11438254.0, "step": 6295 }, { "entropy": 5.781389999389648, "epoch": 5.412548345509239, "grad_norm": 1.1875, "learning_rate": 0.0002760639408659671, "loss": 5.1279, "mean_token_accuracy": 0.2134677991271019, "num_tokens": 11447587.0, "step": 6300 }, { "entropy": 5.742274236679077, "epoch": 5.416845724108294, "grad_norm": 1.3828125, "learning_rate": 0.0002757314607828489, "loss": 5.0879, "mean_token_accuracy": 0.2216594934463501, "num_tokens": 11455493.0, "step": 6305 }, { "entropy": 5.754730272293091, "epoch": 5.421143102707348, "grad_norm": 1.3046875, "learning_rate": 0.00027539897910251293, "loss": 5.0387, "mean_token_accuracy": 0.2288123995065689, "num_tokens": 11464143.0, "step": 6310 }, { "entropy": 5.715780973434448, "epoch": 5.425440481306403, "grad_norm": 1.328125, "learning_rate": 0.00027506649655096595, "loss": 5.0129, "mean_token_accuracy": 0.22209218442440032, "num_tokens": 11471813.0, "step": 6315 }, { "entropy": 5.726117277145386, "epoch": 5.429737859905457, "grad_norm": 1.3046875, "learning_rate": 0.0002747340138542171, "loss": 5.0473, "mean_token_accuracy": 0.21649690121412277, "num_tokens": 11481374.0, "step": 6320 }, { "entropy": 5.843945550918579, "epoch": 5.434035238504512, "grad_norm": 1.1953125, "learning_rate": 0.0002744015317382757, "loss": 5.2226, "mean_token_accuracy": 0.21010226905345916, "num_tokens": 11490575.0, "step": 6325 }, { "entropy": 5.77156343460083, "epoch": 5.438332617103566, "grad_norm": 1.2109375, "learning_rate": 0.0002740690509291498, "loss": 5.109, "mean_token_accuracy": 0.22323488891124726, "num_tokens": 11499898.0, "step": 6330 }, { "entropy": 5.756300687789917, "epoch": 5.442629995702621, "grad_norm": 1.3671875, "learning_rate": 0.0002737365721528445, "loss": 5.109, "mean_token_accuracy": 0.2162606492638588, "num_tokens": 11508544.0, "step": 6335 }, { "entropy": 5.75588583946228, "epoch": 5.446927374301676, "grad_norm": 1.2421875, "learning_rate": 0.0002734040961353607, "loss": 5.0941, "mean_token_accuracy": 0.21851696968078613, "num_tokens": 11519239.0, "step": 6340 }, { "entropy": 5.778530550003052, "epoch": 5.451224752900731, "grad_norm": 1.28125, "learning_rate": 0.000273071623602693, "loss": 5.0573, "mean_token_accuracy": 0.22304116934537888, "num_tokens": 11529014.0, "step": 6345 }, { "entropy": 5.792081117630005, "epoch": 5.4555221314997855, "grad_norm": 1.234375, "learning_rate": 0.00027273915528082865, "loss": 5.056, "mean_token_accuracy": 0.22800618261098862, "num_tokens": 11538367.0, "step": 6350 }, { "entropy": 5.8074750900268555, "epoch": 5.45981951009884, "grad_norm": 1.203125, "learning_rate": 0.0002724066918957455, "loss": 5.2142, "mean_token_accuracy": 0.20521747320890427, "num_tokens": 11548166.0, "step": 6355 }, { "entropy": 5.703108215332032, "epoch": 5.464116888697895, "grad_norm": 1.2109375, "learning_rate": 0.0002720742341734107, "loss": 5.0789, "mean_token_accuracy": 0.22244166433811188, "num_tokens": 11557187.0, "step": 6360 }, { "entropy": 5.819654417037964, "epoch": 5.468414267296949, "grad_norm": 1.3515625, "learning_rate": 0.00027174178283977904, "loss": 5.1156, "mean_token_accuracy": 0.21346145868301392, "num_tokens": 11566181.0, "step": 6365 }, { "entropy": 5.714676475524902, "epoch": 5.472711645896004, "grad_norm": 1.1796875, "learning_rate": 0.00027140933862079136, "loss": 5.0735, "mean_token_accuracy": 0.22364838123321534, "num_tokens": 11576157.0, "step": 6370 }, { "entropy": 5.70315842628479, "epoch": 5.477009024495058, "grad_norm": 1.296875, "learning_rate": 0.000271076902242373, "loss": 5.0464, "mean_token_accuracy": 0.22535803020000458, "num_tokens": 11585325.0, "step": 6375 }, { "entropy": 5.768360900878906, "epoch": 5.481306403094113, "grad_norm": 1.203125, "learning_rate": 0.000270744474430432, "loss": 5.0335, "mean_token_accuracy": 0.22521175146102906, "num_tokens": 11594623.0, "step": 6380 }, { "entropy": 5.806769609451294, "epoch": 5.485603781693167, "grad_norm": 1.28125, "learning_rate": 0.000270412055910858, "loss": 5.2131, "mean_token_accuracy": 0.21013156920671464, "num_tokens": 11604370.0, "step": 6385 }, { "entropy": 5.686313343048096, "epoch": 5.489901160292222, "grad_norm": 1.1796875, "learning_rate": 0.0002700796474095201, "loss": 5.0334, "mean_token_accuracy": 0.2330898493528366, "num_tokens": 11613779.0, "step": 6390 }, { "entropy": 5.772741842269897, "epoch": 5.494198538891276, "grad_norm": 1.3203125, "learning_rate": 0.0002697472496522656, "loss": 5.1181, "mean_token_accuracy": 0.2183234751224518, "num_tokens": 11623037.0, "step": 6395 }, { "entropy": 5.848201179504395, "epoch": 5.498495917490331, "grad_norm": 1.234375, "learning_rate": 0.0002694148633649184, "loss": 5.1467, "mean_token_accuracy": 0.21451639384031296, "num_tokens": 11631640.0, "step": 6400 }, { "entropy": 5.744092893600464, "epoch": 5.5027932960893855, "grad_norm": 1.328125, "learning_rate": 0.0002690824892732772, "loss": 5.1001, "mean_token_accuracy": 0.22413897514343262, "num_tokens": 11640500.0, "step": 6405 }, { "entropy": 5.730341243743896, "epoch": 5.50709067468844, "grad_norm": 1.3125, "learning_rate": 0.0002687501281031142, "loss": 5.1363, "mean_token_accuracy": 0.21347840279340743, "num_tokens": 11649173.0, "step": 6410 }, { "entropy": 5.61120228767395, "epoch": 5.511388053287495, "grad_norm": 1.296875, "learning_rate": 0.0002684177805801734, "loss": 4.9907, "mean_token_accuracy": 0.23398321270942687, "num_tokens": 11658808.0, "step": 6415 }, { "entropy": 5.78149824142456, "epoch": 5.515685431886549, "grad_norm": 1.1640625, "learning_rate": 0.00026808544743016886, "loss": 5.0821, "mean_token_accuracy": 0.21574064046144487, "num_tokens": 11667600.0, "step": 6420 }, { "entropy": 5.750644302368164, "epoch": 5.519982810485604, "grad_norm": 1.3125, "learning_rate": 0.0002677531293787835, "loss": 5.0974, "mean_token_accuracy": 0.21414555311203004, "num_tokens": 11675597.0, "step": 6425 }, { "entropy": 5.660244941711426, "epoch": 5.524280189084658, "grad_norm": 1.234375, "learning_rate": 0.000267420827151667, "loss": 5.0231, "mean_token_accuracy": 0.23164253532886506, "num_tokens": 11684549.0, "step": 6430 }, { "entropy": 5.730451011657715, "epoch": 5.528577567683713, "grad_norm": 1.28125, "learning_rate": 0.0002670885414744347, "loss": 5.1151, "mean_token_accuracy": 0.22453600615262986, "num_tokens": 11693043.0, "step": 6435 }, { "entropy": 5.851344728469849, "epoch": 5.532874946282767, "grad_norm": 1.2265625, "learning_rate": 0.0002667562730726655, "loss": 5.1998, "mean_token_accuracy": 0.21441607922315598, "num_tokens": 11702982.0, "step": 6440 }, { "entropy": 5.804118871688843, "epoch": 5.537172324881822, "grad_norm": 1.1875, "learning_rate": 0.00026642402267190095, "loss": 5.2054, "mean_token_accuracy": 0.20979426354169844, "num_tokens": 11711994.0, "step": 6445 }, { "entropy": 5.78815655708313, "epoch": 5.541469703480876, "grad_norm": 1.296875, "learning_rate": 0.00026609179099764313, "loss": 5.1463, "mean_token_accuracy": 0.2133957788348198, "num_tokens": 11722165.0, "step": 6450 }, { "entropy": 5.748242044448853, "epoch": 5.545767082079931, "grad_norm": 1.1875, "learning_rate": 0.00026575957877535323, "loss": 5.1148, "mean_token_accuracy": 0.21890448033809662, "num_tokens": 11731265.0, "step": 6455 }, { "entropy": 5.782331275939941, "epoch": 5.5500644606789855, "grad_norm": 1.21875, "learning_rate": 0.00026542738673044985, "loss": 5.1388, "mean_token_accuracy": 0.21340786814689636, "num_tokens": 11741779.0, "step": 6460 }, { "entropy": 5.744899702072144, "epoch": 5.55436183927804, "grad_norm": 1.3671875, "learning_rate": 0.0002650952155883077, "loss": 5.1048, "mean_token_accuracy": 0.2189340263605118, "num_tokens": 11749976.0, "step": 6465 }, { "entropy": 5.73995532989502, "epoch": 5.558659217877095, "grad_norm": 1.2109375, "learning_rate": 0.0002647630660742559, "loss": 5.0929, "mean_token_accuracy": 0.21515202820301055, "num_tokens": 11759781.0, "step": 6470 }, { "entropy": 5.763435029983521, "epoch": 5.56295659647615, "grad_norm": 1.1171875, "learning_rate": 0.000264430938913576, "loss": 5.0636, "mean_token_accuracy": 0.2215006723999977, "num_tokens": 11769544.0, "step": 6475 }, { "entropy": 5.762722492218018, "epoch": 5.567253975075204, "grad_norm": 1.4375, "learning_rate": 0.00026409883483150123, "loss": 5.0644, "mean_token_accuracy": 0.22043437957763673, "num_tokens": 11778831.0, "step": 6480 }, { "entropy": 5.726540517807007, "epoch": 5.571551353674259, "grad_norm": 1.21875, "learning_rate": 0.000263766754553214, "loss": 5.1385, "mean_token_accuracy": 0.20914600044488907, "num_tokens": 11788813.0, "step": 6485 }, { "entropy": 5.783534860610962, "epoch": 5.575848732273313, "grad_norm": 1.1796875, "learning_rate": 0.0002634346988038448, "loss": 5.0812, "mean_token_accuracy": 0.22230044454336167, "num_tokens": 11797335.0, "step": 6490 }, { "entropy": 5.774841403961181, "epoch": 5.580146110872368, "grad_norm": 1.3125, "learning_rate": 0.00026310266830847093, "loss": 5.105, "mean_token_accuracy": 0.21853111684322357, "num_tokens": 11806741.0, "step": 6495 }, { "entropy": 5.791754579544067, "epoch": 5.584443489471423, "grad_norm": 1.28125, "learning_rate": 0.00026277066379211406, "loss": 5.1402, "mean_token_accuracy": 0.2172718971967697, "num_tokens": 11815551.0, "step": 6500 }, { "epoch": 5.584443489471423, "eval_entropy": 5.548214195010899, "eval_loss": 5.916170597076416, "eval_mean_token_accuracy": 0.1791800964321639, "eval_num_tokens": 11815551.0, "eval_runtime": 2.2528, "eval_samples_per_second": 1575.34, "eval_steps_per_second": 197.084, "step": 6500 }, { "entropy": 5.79419641494751, "epoch": 5.588740868070477, "grad_norm": 1.3046875, "learning_rate": 0.0002624386859797396, "loss": 5.1641, "mean_token_accuracy": 0.2150591015815735, "num_tokens": 11824483.0, "step": 6505 }, { "entropy": 5.678532218933105, "epoch": 5.593038246669532, "grad_norm": 1.28125, "learning_rate": 0.00026210673559625406, "loss": 4.9558, "mean_token_accuracy": 0.23172966986894608, "num_tokens": 11832383.0, "step": 6510 }, { "entropy": 5.777034711837769, "epoch": 5.597335625268586, "grad_norm": 1.4765625, "learning_rate": 0.0002617748133665047, "loss": 5.1953, "mean_token_accuracy": 0.21313114762306212, "num_tokens": 11841430.0, "step": 6515 }, { "entropy": 5.757608795166016, "epoch": 5.601633003867641, "grad_norm": 1.2421875, "learning_rate": 0.0002614429200152768, "loss": 5.1529, "mean_token_accuracy": 0.21601863503456115, "num_tokens": 11850863.0, "step": 6520 }, { "entropy": 5.719894313812256, "epoch": 5.6059303824666955, "grad_norm": 1.359375, "learning_rate": 0.000261111056267293, "loss": 5.0249, "mean_token_accuracy": 0.22059513330459596, "num_tokens": 11859392.0, "step": 6525 }, { "entropy": 5.748900747299194, "epoch": 5.61022776106575, "grad_norm": 1.265625, "learning_rate": 0.00026077922284721084, "loss": 5.0761, "mean_token_accuracy": 0.22885973751544952, "num_tokens": 11868762.0, "step": 6530 }, { "entropy": 5.7151679515838625, "epoch": 5.614525139664805, "grad_norm": 1.3671875, "learning_rate": 0.00026044742047962206, "loss": 5.0306, "mean_token_accuracy": 0.230779293179512, "num_tokens": 11876722.0, "step": 6535 }, { "entropy": 5.73458080291748, "epoch": 5.618822518263859, "grad_norm": 1.2578125, "learning_rate": 0.00026011564988905023, "loss": 5.1741, "mean_token_accuracy": 0.21847135871648787, "num_tokens": 11885614.0, "step": 6540 }, { "entropy": 5.835862874984741, "epoch": 5.623119896862914, "grad_norm": 1.2890625, "learning_rate": 0.0002597839117999499, "loss": 5.1883, "mean_token_accuracy": 0.2149903357028961, "num_tokens": 11894702.0, "step": 6545 }, { "entropy": 5.769807910919189, "epoch": 5.627417275461968, "grad_norm": 1.3828125, "learning_rate": 0.0002594522069367044, "loss": 5.0606, "mean_token_accuracy": 0.22778366208076478, "num_tokens": 11902829.0, "step": 6550 }, { "entropy": 5.7354350090026855, "epoch": 5.631714654061023, "grad_norm": 1.1796875, "learning_rate": 0.0002591205360236245, "loss": 5.1061, "mean_token_accuracy": 0.22033643573522568, "num_tokens": 11912377.0, "step": 6555 }, { "entropy": 5.73629994392395, "epoch": 5.636012032660077, "grad_norm": 1.3671875, "learning_rate": 0.000258788899784947, "loss": 5.1083, "mean_token_accuracy": 0.21579407155513763, "num_tokens": 11920563.0, "step": 6560 }, { "entropy": 5.782598829269409, "epoch": 5.640309411259132, "grad_norm": 1.21875, "learning_rate": 0.00025845729894483283, "loss": 5.1321, "mean_token_accuracy": 0.21574058383703232, "num_tokens": 11930190.0, "step": 6565 }, { "entropy": 5.814107990264892, "epoch": 5.644606789858186, "grad_norm": 1.2109375, "learning_rate": 0.0002581257342273657, "loss": 5.1906, "mean_token_accuracy": 0.21169122010469438, "num_tokens": 11939840.0, "step": 6570 }, { "entropy": 5.767313432693482, "epoch": 5.648904168457241, "grad_norm": 1.328125, "learning_rate": 0.0002577942063565504, "loss": 5.1207, "mean_token_accuracy": 0.22112152874469757, "num_tokens": 11948260.0, "step": 6575 }, { "entropy": 5.735608530044556, "epoch": 5.6532015470562955, "grad_norm": 1.4140625, "learning_rate": 0.0002574627160563114, "loss": 5.1704, "mean_token_accuracy": 0.2230113223195076, "num_tokens": 11956776.0, "step": 6580 }, { "entropy": 5.847114753723145, "epoch": 5.65749892565535, "grad_norm": 1.3203125, "learning_rate": 0.0002571312640504909, "loss": 5.1992, "mean_token_accuracy": 0.21259045898914336, "num_tokens": 11966375.0, "step": 6585 }, { "entropy": 5.86904125213623, "epoch": 5.661796304254405, "grad_norm": 1.2265625, "learning_rate": 0.0002567998510628476, "loss": 5.2003, "mean_token_accuracy": 0.2081349566578865, "num_tokens": 11975835.0, "step": 6590 }, { "entropy": 5.757561254501343, "epoch": 5.666093682853459, "grad_norm": 1.3671875, "learning_rate": 0.00025646847781705506, "loss": 5.0878, "mean_token_accuracy": 0.21930547803640366, "num_tokens": 11984672.0, "step": 6595 }, { "entropy": 5.788959789276123, "epoch": 5.670391061452514, "grad_norm": 1.359375, "learning_rate": 0.0002561371450367, "loss": 5.1018, "mean_token_accuracy": 0.22340647727251053, "num_tokens": 11993954.0, "step": 6600 }, { "entropy": 5.792072677612305, "epoch": 5.674688440051568, "grad_norm": 1.3203125, "learning_rate": 0.00025580585344528076, "loss": 5.1573, "mean_token_accuracy": 0.2121841624379158, "num_tokens": 12002523.0, "step": 6605 }, { "entropy": 5.788462352752686, "epoch": 5.678985818650623, "grad_norm": 1.328125, "learning_rate": 0.0002554746037662058, "loss": 5.1837, "mean_token_accuracy": 0.2102429136633873, "num_tokens": 12011638.0, "step": 6610 }, { "entropy": 5.825799894332886, "epoch": 5.683283197249677, "grad_norm": 1.1328125, "learning_rate": 0.0002551433967227919, "loss": 5.1468, "mean_token_accuracy": 0.2148883506655693, "num_tokens": 12021319.0, "step": 6615 }, { "entropy": 5.772326278686523, "epoch": 5.687580575848732, "grad_norm": 1.2578125, "learning_rate": 0.000254812233038263, "loss": 5.0897, "mean_token_accuracy": 0.22315099984407424, "num_tokens": 12030255.0, "step": 6620 }, { "entropy": 5.790443277359008, "epoch": 5.691877954447786, "grad_norm": 1.359375, "learning_rate": 0.00025448111343574813, "loss": 5.093, "mean_token_accuracy": 0.22532202005386354, "num_tokens": 12038884.0, "step": 6625 }, { "entropy": 5.74477071762085, "epoch": 5.696175333046842, "grad_norm": 1.078125, "learning_rate": 0.0002541500386382802, "loss": 5.0745, "mean_token_accuracy": 0.21967335492372514, "num_tokens": 12047477.0, "step": 6630 }, { "entropy": 5.764187002182007, "epoch": 5.7004727116458955, "grad_norm": 1.2265625, "learning_rate": 0.00025381900936879433, "loss": 5.1567, "mean_token_accuracy": 0.22008973658084868, "num_tokens": 12056902.0, "step": 6635 }, { "entropy": 5.75988278388977, "epoch": 5.704770090244951, "grad_norm": 1.2265625, "learning_rate": 0.0002534880263501259, "loss": 5.1201, "mean_token_accuracy": 0.2115958884358406, "num_tokens": 12065721.0, "step": 6640 }, { "entropy": 5.747469568252564, "epoch": 5.7090674688440055, "grad_norm": 1.171875, "learning_rate": 0.0002531570903050097, "loss": 5.0979, "mean_token_accuracy": 0.22399253994226456, "num_tokens": 12074870.0, "step": 6645 }, { "entropy": 5.807171726226807, "epoch": 5.71336484744306, "grad_norm": 1.1640625, "learning_rate": 0.0002528262019560776, "loss": 5.1381, "mean_token_accuracy": 0.21587093770503998, "num_tokens": 12084557.0, "step": 6650 }, { "entropy": 5.729248476028443, "epoch": 5.717662226042115, "grad_norm": 1.15625, "learning_rate": 0.0002524953620258579, "loss": 5.0104, "mean_token_accuracy": 0.23036109060049056, "num_tokens": 12093074.0, "step": 6655 }, { "entropy": 5.726521444320679, "epoch": 5.721959604641169, "grad_norm": 1.296875, "learning_rate": 0.0002521645712367724, "loss": 5.0357, "mean_token_accuracy": 0.23260863721370698, "num_tokens": 12102785.0, "step": 6660 }, { "entropy": 5.734358453750611, "epoch": 5.726256983240224, "grad_norm": 1.34375, "learning_rate": 0.00025183383031113606, "loss": 5.0578, "mean_token_accuracy": 0.2322448804974556, "num_tokens": 12112535.0, "step": 6665 }, { "entropy": 5.652305364608765, "epoch": 5.730554361839278, "grad_norm": 1.2109375, "learning_rate": 0.00025150313997115476, "loss": 5.0056, "mean_token_accuracy": 0.2260068476200104, "num_tokens": 12121604.0, "step": 6670 }, { "entropy": 5.757162714004517, "epoch": 5.734851740438333, "grad_norm": 1.2109375, "learning_rate": 0.0002511725009389244, "loss": 5.1818, "mean_token_accuracy": 0.21061882376670837, "num_tokens": 12131276.0, "step": 6675 }, { "entropy": 5.742733335494995, "epoch": 5.739149119037387, "grad_norm": 1.390625, "learning_rate": 0.000250841913936428, "loss": 5.1347, "mean_token_accuracy": 0.2235199674963951, "num_tokens": 12140180.0, "step": 6680 }, { "entropy": 5.728972244262695, "epoch": 5.743446497636442, "grad_norm": 1.28125, "learning_rate": 0.0002505113796855357, "loss": 5.1024, "mean_token_accuracy": 0.2127378210425377, "num_tokens": 12149635.0, "step": 6685 }, { "entropy": 5.762466812133789, "epoch": 5.747743876235496, "grad_norm": 1.328125, "learning_rate": 0.00025018089890800225, "loss": 5.0582, "mean_token_accuracy": 0.22984133958816527, "num_tokens": 12157565.0, "step": 6690 }, { "entropy": 5.760795783996582, "epoch": 5.752041254834551, "grad_norm": 1.234375, "learning_rate": 0.00024985047232546544, "loss": 5.1539, "mean_token_accuracy": 0.21633774489164354, "num_tokens": 12166647.0, "step": 6695 }, { "entropy": 5.78938307762146, "epoch": 5.7563386334336055, "grad_norm": 1.2109375, "learning_rate": 0.00024952010065944485, "loss": 5.1526, "mean_token_accuracy": 0.21298279315233232, "num_tokens": 12175554.0, "step": 6700 }, { "entropy": 5.794042539596558, "epoch": 5.76063601203266, "grad_norm": 1.375, "learning_rate": 0.0002491897846313402, "loss": 5.123, "mean_token_accuracy": 0.22041986286640167, "num_tokens": 12184756.0, "step": 6705 }, { "entropy": 5.837274694442749, "epoch": 5.764933390631715, "grad_norm": 1.203125, "learning_rate": 0.0002488595249624297, "loss": 5.2088, "mean_token_accuracy": 0.20747051686048507, "num_tokens": 12194724.0, "step": 6710 }, { "entropy": 5.758440542221069, "epoch": 5.769230769230769, "grad_norm": 1.171875, "learning_rate": 0.00024852932237386837, "loss": 5.1497, "mean_token_accuracy": 0.22039461135864258, "num_tokens": 12203804.0, "step": 6715 }, { "entropy": 5.761465644836425, "epoch": 5.773528147829824, "grad_norm": 1.1875, "learning_rate": 0.00024819917758668673, "loss": 5.0999, "mean_token_accuracy": 0.219867567718029, "num_tokens": 12212868.0, "step": 6720 }, { "entropy": 5.790355968475342, "epoch": 5.777825526428878, "grad_norm": 1.2265625, "learning_rate": 0.00024786909132178906, "loss": 5.1777, "mean_token_accuracy": 0.2172165408730507, "num_tokens": 12221650.0, "step": 6725 }, { "entropy": 5.773914241790772, "epoch": 5.782122905027933, "grad_norm": 1.34375, "learning_rate": 0.00024753906429995194, "loss": 5.0614, "mean_token_accuracy": 0.22624436914920806, "num_tokens": 12231541.0, "step": 6730 }, { "entropy": 5.77492847442627, "epoch": 5.786420283626987, "grad_norm": 1.3125, "learning_rate": 0.0002472090972418222, "loss": 5.1611, "mean_token_accuracy": 0.21325822174549103, "num_tokens": 12240899.0, "step": 6735 }, { "entropy": 5.800062751770019, "epoch": 5.790717662226042, "grad_norm": 1.21875, "learning_rate": 0.0002468791908679163, "loss": 5.1479, "mean_token_accuracy": 0.20612489581108093, "num_tokens": 12250352.0, "step": 6740 }, { "entropy": 5.793203258514405, "epoch": 5.795015040825096, "grad_norm": 1.265625, "learning_rate": 0.0002465493458986175, "loss": 5.1933, "mean_token_accuracy": 0.20913417190313338, "num_tokens": 12259975.0, "step": 6745 }, { "entropy": 5.695867681503296, "epoch": 5.799312419424151, "grad_norm": 1.40625, "learning_rate": 0.00024621956305417587, "loss": 5.0425, "mean_token_accuracy": 0.22444724589586257, "num_tokens": 12269203.0, "step": 6750 }, { "entropy": 5.797395038604736, "epoch": 5.8036097980232055, "grad_norm": 1.3203125, "learning_rate": 0.000245889843054705, "loss": 5.135, "mean_token_accuracy": 0.22068165093660355, "num_tokens": 12279481.0, "step": 6755 }, { "entropy": 5.747924900054931, "epoch": 5.80790717662226, "grad_norm": 1.2421875, "learning_rate": 0.00024556018662018163, "loss": 5.1148, "mean_token_accuracy": 0.22157147377729416, "num_tokens": 12288848.0, "step": 6760 }, { "entropy": 5.782013320922852, "epoch": 5.812204555221315, "grad_norm": 1.2109375, "learning_rate": 0.00024523059447044377, "loss": 5.1238, "mean_token_accuracy": 0.2141062006354332, "num_tokens": 12297346.0, "step": 6765 }, { "entropy": 5.784574270248413, "epoch": 5.816501933820369, "grad_norm": 1.25, "learning_rate": 0.0002449010673251887, "loss": 5.1208, "mean_token_accuracy": 0.214461612701416, "num_tokens": 12306233.0, "step": 6770 }, { "entropy": 5.823604106903076, "epoch": 5.820799312419425, "grad_norm": 1.2578125, "learning_rate": 0.0002445716059039723, "loss": 5.2241, "mean_token_accuracy": 0.20766980350017547, "num_tokens": 12315609.0, "step": 6775 }, { "entropy": 5.792714786529541, "epoch": 5.825096691018478, "grad_norm": 1.4765625, "learning_rate": 0.00024424221092620644, "loss": 5.1593, "mean_token_accuracy": 0.2178465098142624, "num_tokens": 12323915.0, "step": 6780 }, { "entropy": 5.679864931106567, "epoch": 5.829394069617534, "grad_norm": 1.234375, "learning_rate": 0.00024391288311115822, "loss": 5.092, "mean_token_accuracy": 0.21546332389116288, "num_tokens": 12334077.0, "step": 6785 }, { "entropy": 5.752875280380249, "epoch": 5.833691448216588, "grad_norm": 1.3359375, "learning_rate": 0.0002435836231779478, "loss": 5.1173, "mean_token_accuracy": 0.21589842587709426, "num_tokens": 12342411.0, "step": 6790 }, { "entropy": 5.71485276222229, "epoch": 5.837988826815643, "grad_norm": 1.2109375, "learning_rate": 0.00024325443184554724, "loss": 5.0154, "mean_token_accuracy": 0.23167243152856826, "num_tokens": 12351308.0, "step": 6795 }, { "entropy": 5.741046476364136, "epoch": 5.842286205414697, "grad_norm": 1.2421875, "learning_rate": 0.00024292530983277904, "loss": 5.1886, "mean_token_accuracy": 0.2137362465262413, "num_tokens": 12359673.0, "step": 6800 }, { "entropy": 5.730896043777466, "epoch": 5.846583584013752, "grad_norm": 1.296875, "learning_rate": 0.00024259625785831408, "loss": 5.0495, "mean_token_accuracy": 0.21516438126564025, "num_tokens": 12367876.0, "step": 6805 }, { "entropy": 5.795110273361206, "epoch": 5.850880962612806, "grad_norm": 1.3046875, "learning_rate": 0.00024226727664067023, "loss": 5.1901, "mean_token_accuracy": 0.2127310201525688, "num_tokens": 12377040.0, "step": 6810 }, { "entropy": 5.86044659614563, "epoch": 5.855178341211861, "grad_norm": 1.234375, "learning_rate": 0.00024193836689821109, "loss": 5.2514, "mean_token_accuracy": 0.2128416433930397, "num_tokens": 12387622.0, "step": 6815 }, { "entropy": 5.697026014328003, "epoch": 5.8594757198109155, "grad_norm": 1.3046875, "learning_rate": 0.0002416095293491439, "loss": 5.035, "mean_token_accuracy": 0.2235540360212326, "num_tokens": 12396447.0, "step": 6820 }, { "entropy": 5.762108945846558, "epoch": 5.86377309840997, "grad_norm": 1.34375, "learning_rate": 0.0002412807647115186, "loss": 5.0562, "mean_token_accuracy": 0.23084075152873992, "num_tokens": 12405887.0, "step": 6825 }, { "entropy": 5.700514125823974, "epoch": 5.868070477009025, "grad_norm": 1.2734375, "learning_rate": 0.00024095207370322574, "loss": 5.0786, "mean_token_accuracy": 0.21588899046182633, "num_tokens": 12414543.0, "step": 6830 }, { "entropy": 5.715729188919068, "epoch": 5.872367855608079, "grad_norm": 1.3515625, "learning_rate": 0.00024062345704199507, "loss": 5.0879, "mean_token_accuracy": 0.2205901026725769, "num_tokens": 12423370.0, "step": 6835 }, { "entropy": 5.793241882324219, "epoch": 5.876665234207134, "grad_norm": 1.03125, "learning_rate": 0.00024029491544539405, "loss": 5.1822, "mean_token_accuracy": 0.2112067312002182, "num_tokens": 12433980.0, "step": 6840 }, { "entropy": 5.738333225250244, "epoch": 5.880962612806188, "grad_norm": 1.3359375, "learning_rate": 0.00023996644963082616, "loss": 5.132, "mean_token_accuracy": 0.22221640795469283, "num_tokens": 12443300.0, "step": 6845 }, { "entropy": 5.76718807220459, "epoch": 5.885259991405243, "grad_norm": 1.15625, "learning_rate": 0.00023963806031552948, "loss": 5.1758, "mean_token_accuracy": 0.2108922243118286, "num_tokens": 12452462.0, "step": 6850 }, { "entropy": 5.706634998321533, "epoch": 5.889557370004297, "grad_norm": 1.2578125, "learning_rate": 0.00023930974821657504, "loss": 5.0996, "mean_token_accuracy": 0.21803777813911437, "num_tokens": 12461605.0, "step": 6855 }, { "entropy": 5.8166309833526615, "epoch": 5.893854748603352, "grad_norm": 1.25, "learning_rate": 0.00023898151405086533, "loss": 5.1663, "mean_token_accuracy": 0.21597474217414855, "num_tokens": 12470905.0, "step": 6860 }, { "entropy": 5.821134376525879, "epoch": 5.8981521272024064, "grad_norm": 1.3515625, "learning_rate": 0.00023865335853513232, "loss": 5.1416, "mean_token_accuracy": 0.21774317771196366, "num_tokens": 12478913.0, "step": 6865 }, { "entropy": 5.761513996124267, "epoch": 5.902449505801461, "grad_norm": 1.2265625, "learning_rate": 0.00023832528238593677, "loss": 5.2181, "mean_token_accuracy": 0.21117616146802903, "num_tokens": 12487561.0, "step": 6870 }, { "entropy": 5.7301372528076175, "epoch": 5.9067468844005155, "grad_norm": 1.28125, "learning_rate": 0.00023799728631966556, "loss": 5.1255, "mean_token_accuracy": 0.2209423691034317, "num_tokens": 12496781.0, "step": 6875 }, { "entropy": 5.796772241592407, "epoch": 5.91104426299957, "grad_norm": 1.4375, "learning_rate": 0.0002376693710525313, "loss": 5.2086, "mean_token_accuracy": 0.20858777016401292, "num_tokens": 12505716.0, "step": 6880 }, { "entropy": 5.841960048675537, "epoch": 5.915341641598625, "grad_norm": 1.1640625, "learning_rate": 0.00023734153730056967, "loss": 5.1519, "mean_token_accuracy": 0.2122661292552948, "num_tokens": 12515594.0, "step": 6885 }, { "entropy": 5.72416934967041, "epoch": 5.919639020197679, "grad_norm": 1.484375, "learning_rate": 0.00023701378577963873, "loss": 4.9846, "mean_token_accuracy": 0.23235433548688889, "num_tokens": 12523439.0, "step": 6890 }, { "entropy": 5.707376480102539, "epoch": 5.923936398796734, "grad_norm": 1.328125, "learning_rate": 0.0002366861172054166, "loss": 5.0914, "mean_token_accuracy": 0.22402575612068176, "num_tokens": 12532100.0, "step": 6895 }, { "entropy": 5.742840337753296, "epoch": 5.928233777395788, "grad_norm": 1.3359375, "learning_rate": 0.00023635853229340054, "loss": 5.113, "mean_token_accuracy": 0.21385788768529893, "num_tokens": 12539689.0, "step": 6900 }, { "entropy": 5.761408472061158, "epoch": 5.932531155994843, "grad_norm": 1.2421875, "learning_rate": 0.00023603103175890512, "loss": 5.1386, "mean_token_accuracy": 0.2112519159913063, "num_tokens": 12548486.0, "step": 6905 }, { "entropy": 5.7453104019165036, "epoch": 5.936828534593897, "grad_norm": 1.2578125, "learning_rate": 0.00023570361631706062, "loss": 5.0162, "mean_token_accuracy": 0.23448468893766403, "num_tokens": 12557423.0, "step": 6910 }, { "entropy": 5.724758243560791, "epoch": 5.941125913192952, "grad_norm": 1.2890625, "learning_rate": 0.00023537628668281142, "loss": 5.1705, "mean_token_accuracy": 0.2140020415186882, "num_tokens": 12566086.0, "step": 6915 }, { "entropy": 5.7241425037384035, "epoch": 5.945423291792007, "grad_norm": 1.2109375, "learning_rate": 0.00023504904357091468, "loss": 5.0751, "mean_token_accuracy": 0.2268398404121399, "num_tokens": 12575827.0, "step": 6920 }, { "entropy": 5.759646368026734, "epoch": 5.949720670391061, "grad_norm": 1.375, "learning_rate": 0.0002347218876959384, "loss": 5.0637, "mean_token_accuracy": 0.2303071603178978, "num_tokens": 12585044.0, "step": 6925 }, { "entropy": 5.7616418361663815, "epoch": 5.954018048990116, "grad_norm": 1.2265625, "learning_rate": 0.0002343948197722604, "loss": 5.1006, "mean_token_accuracy": 0.22172485142946244, "num_tokens": 12594677.0, "step": 6930 }, { "entropy": 5.730101203918457, "epoch": 5.958315427589171, "grad_norm": 1.28125, "learning_rate": 0.00023406784051406638, "loss": 5.1346, "mean_token_accuracy": 0.21585479229688645, "num_tokens": 12604829.0, "step": 6935 }, { "entropy": 5.84703574180603, "epoch": 5.9626128061882255, "grad_norm": 1.2421875, "learning_rate": 0.00023374095063534816, "loss": 5.1052, "mean_token_accuracy": 0.22624473124742508, "num_tokens": 12613869.0, "step": 6940 }, { "entropy": 5.750234889984131, "epoch": 5.96691018478728, "grad_norm": 1.2578125, "learning_rate": 0.00023341415084990276, "loss": 5.1064, "mean_token_accuracy": 0.2171055868268013, "num_tokens": 12623248.0, "step": 6945 }, { "entropy": 5.663731479644776, "epoch": 5.971207563386335, "grad_norm": 1.1953125, "learning_rate": 0.00023308744187132996, "loss": 5.0223, "mean_token_accuracy": 0.2321384847164154, "num_tokens": 12631973.0, "step": 6950 }, { "entropy": 5.7603648662567135, "epoch": 5.975504941985389, "grad_norm": 1.2890625, "learning_rate": 0.00023276082441303197, "loss": 5.1427, "mean_token_accuracy": 0.21930764019489288, "num_tokens": 12641435.0, "step": 6955 }, { "entropy": 5.780388498306275, "epoch": 5.979802320584444, "grad_norm": 1.265625, "learning_rate": 0.00023243429918821056, "loss": 5.1286, "mean_token_accuracy": 0.22100035548210145, "num_tokens": 12651077.0, "step": 6960 }, { "entropy": 5.846546459197998, "epoch": 5.984099699183498, "grad_norm": 1.328125, "learning_rate": 0.00023210786690986646, "loss": 5.2114, "mean_token_accuracy": 0.21253881752490997, "num_tokens": 12659929.0, "step": 6965 }, { "entropy": 5.781086397171021, "epoch": 5.988397077782553, "grad_norm": 1.3125, "learning_rate": 0.00023178152829079712, "loss": 5.0692, "mean_token_accuracy": 0.2191861242055893, "num_tokens": 12670725.0, "step": 6970 }, { "entropy": 5.7392051219940186, "epoch": 5.992694456381607, "grad_norm": 1.3828125, "learning_rate": 0.00023145528404359562, "loss": 5.1194, "mean_token_accuracy": 0.22653693705797195, "num_tokens": 12680820.0, "step": 6975 }, { "entropy": 5.772517347335816, "epoch": 5.996991834980662, "grad_norm": 1.2890625, "learning_rate": 0.0002311291348806492, "loss": 5.1847, "mean_token_accuracy": 0.2081763491034508, "num_tokens": 12689785.0, "step": 6980 }, { "entropy": 5.725305080413818, "epoch": 6.000859475719811, "grad_norm": 1.2890625, "learning_rate": 0.0002308030815141372, "loss": 5.115, "mean_token_accuracy": 0.21141118307908377, "num_tokens": 12697221.0, "step": 6985 }, { "entropy": 5.731379842758178, "epoch": 6.005156854318866, "grad_norm": 1.234375, "learning_rate": 0.00023047712465602976, "loss": 4.93, "mean_token_accuracy": 0.24172718375921248, "num_tokens": 12707127.0, "step": 6990 }, { "entropy": 5.700897169113159, "epoch": 6.00945423291792, "grad_norm": 1.4140625, "learning_rate": 0.00023015126501808641, "loss": 4.9318, "mean_token_accuracy": 0.2316366359591484, "num_tokens": 12715364.0, "step": 6995 }, { "entropy": 5.7420319557189945, "epoch": 6.013751611516975, "grad_norm": 1.203125, "learning_rate": 0.00022982550331185437, "loss": 4.9289, "mean_token_accuracy": 0.2380008026957512, "num_tokens": 12724914.0, "step": 7000 }, { "epoch": 6.013751611516975, "eval_entropy": 5.5273827663413035, "eval_loss": 5.911673069000244, "eval_mean_token_accuracy": 0.17968238789487528, "eval_num_tokens": 12724914.0, "eval_runtime": 2.0516, "eval_samples_per_second": 1729.866, "eval_steps_per_second": 216.416, "step": 7000 }, { "entropy": 5.69878044128418, "epoch": 6.0180489901160295, "grad_norm": 1.171875, "learning_rate": 0.00022949984024866704, "loss": 4.9492, "mean_token_accuracy": 0.23745594918727875, "num_tokens": 12735193.0, "step": 7005 }, { "entropy": 5.753796625137329, "epoch": 6.022346368715084, "grad_norm": 1.34375, "learning_rate": 0.0002291742765396424, "loss": 4.9928, "mean_token_accuracy": 0.23509994596242906, "num_tokens": 12743945.0, "step": 7010 }, { "entropy": 5.772705030441284, "epoch": 6.0266437473141385, "grad_norm": 1.1796875, "learning_rate": 0.00022884881289568133, "loss": 4.9965, "mean_token_accuracy": 0.229385170340538, "num_tokens": 12753130.0, "step": 7015 }, { "entropy": 5.695847749710083, "epoch": 6.030941125913193, "grad_norm": 1.28125, "learning_rate": 0.0002285234500274665, "loss": 4.9808, "mean_token_accuracy": 0.23147749304771423, "num_tokens": 12762108.0, "step": 7020 }, { "entropy": 5.800906610488892, "epoch": 6.035238504512248, "grad_norm": 1.328125, "learning_rate": 0.00022819818864546016, "loss": 5.0278, "mean_token_accuracy": 0.22203525006771088, "num_tokens": 12772102.0, "step": 7025 }, { "entropy": 5.751763248443604, "epoch": 6.039535883111302, "grad_norm": 1.296875, "learning_rate": 0.00022787302945990345, "loss": 4.9848, "mean_token_accuracy": 0.23573557138442994, "num_tokens": 12781225.0, "step": 7030 }, { "entropy": 5.77105746269226, "epoch": 6.043833261710357, "grad_norm": 1.390625, "learning_rate": 0.00022754797318081383, "loss": 4.9454, "mean_token_accuracy": 0.23269859850406646, "num_tokens": 12789896.0, "step": 7035 }, { "entropy": 5.738353204727173, "epoch": 6.048130640309411, "grad_norm": 1.28125, "learning_rate": 0.00022722302051798442, "loss": 4.8836, "mean_token_accuracy": 0.23907660245895385, "num_tokens": 12798596.0, "step": 7040 }, { "entropy": 5.66996431350708, "epoch": 6.052428018908466, "grad_norm": 1.296875, "learning_rate": 0.0002268981721809819, "loss": 4.9122, "mean_token_accuracy": 0.23859167844057083, "num_tokens": 12807285.0, "step": 7045 }, { "entropy": 5.720047092437744, "epoch": 6.05672539750752, "grad_norm": 1.3046875, "learning_rate": 0.0002265734288791451, "loss": 4.9691, "mean_token_accuracy": 0.2319769710302353, "num_tokens": 12816668.0, "step": 7050 }, { "entropy": 5.782533931732178, "epoch": 6.061022776106575, "grad_norm": 1.3515625, "learning_rate": 0.00022624879132158377, "loss": 5.0621, "mean_token_accuracy": 0.22383298426866532, "num_tokens": 12825943.0, "step": 7055 }, { "entropy": 5.740294361114502, "epoch": 6.0653201547056295, "grad_norm": 1.171875, "learning_rate": 0.00022592426021717654, "loss": 4.8845, "mean_token_accuracy": 0.23752743601799012, "num_tokens": 12835693.0, "step": 7060 }, { "entropy": 5.734462833404541, "epoch": 6.069617533304684, "grad_norm": 1.328125, "learning_rate": 0.0002255998362745696, "loss": 4.7947, "mean_token_accuracy": 0.2523151770234108, "num_tokens": 12844201.0, "step": 7065 }, { "entropy": 5.674288940429688, "epoch": 6.073914911903739, "grad_norm": 1.2734375, "learning_rate": 0.00022527552020217513, "loss": 4.9312, "mean_token_accuracy": 0.23512519299983978, "num_tokens": 12853220.0, "step": 7070 }, { "entropy": 5.679785203933716, "epoch": 6.078212290502793, "grad_norm": 1.2421875, "learning_rate": 0.0002249513127081697, "loss": 5.0051, "mean_token_accuracy": 0.2304693043231964, "num_tokens": 12862486.0, "step": 7075 }, { "entropy": 5.797192478179932, "epoch": 6.082509669101848, "grad_norm": 1.2734375, "learning_rate": 0.00022462721450049316, "loss": 5.0032, "mean_token_accuracy": 0.22529298514127732, "num_tokens": 12871717.0, "step": 7080 }, { "entropy": 5.758112525939941, "epoch": 6.086807047700902, "grad_norm": 1.4296875, "learning_rate": 0.0002243032262868464, "loss": 4.9584, "mean_token_accuracy": 0.22690722793340684, "num_tokens": 12881278.0, "step": 7085 }, { "entropy": 5.750693368911743, "epoch": 6.091104426299957, "grad_norm": 1.2109375, "learning_rate": 0.00022397934877469, "loss": 4.9822, "mean_token_accuracy": 0.22972595542669297, "num_tokens": 12890720.0, "step": 7090 }, { "entropy": 5.79561676979065, "epoch": 6.095401804899011, "grad_norm": 1.2734375, "learning_rate": 0.0002236555826712432, "loss": 5.0162, "mean_token_accuracy": 0.22308289557695388, "num_tokens": 12900428.0, "step": 7095 }, { "entropy": 5.806292009353638, "epoch": 6.099699183498066, "grad_norm": 1.3125, "learning_rate": 0.00022333192868348152, "loss": 4.9924, "mean_token_accuracy": 0.22728473246097564, "num_tokens": 12910177.0, "step": 7100 }, { "entropy": 5.733801794052124, "epoch": 6.10399656209712, "grad_norm": 1.21875, "learning_rate": 0.00022300838751813606, "loss": 5.032, "mean_token_accuracy": 0.23358280062675477, "num_tokens": 12920734.0, "step": 7105 }, { "entropy": 5.729845237731934, "epoch": 6.108293940696175, "grad_norm": 1.3984375, "learning_rate": 0.00022268495988169145, "loss": 4.9171, "mean_token_accuracy": 0.2373756691813469, "num_tokens": 12929585.0, "step": 7110 }, { "entropy": 5.642404937744141, "epoch": 6.1125913192952295, "grad_norm": 1.3515625, "learning_rate": 0.00022236164648038433, "loss": 4.9177, "mean_token_accuracy": 0.2359202727675438, "num_tokens": 12938933.0, "step": 7115 }, { "entropy": 5.697567796707153, "epoch": 6.116888697894285, "grad_norm": 1.3984375, "learning_rate": 0.0002220384480202019, "loss": 4.8809, "mean_token_accuracy": 0.23985225856304168, "num_tokens": 12947461.0, "step": 7120 }, { "entropy": 5.735206222534179, "epoch": 6.1211860764933395, "grad_norm": 1.34375, "learning_rate": 0.00022171536520688046, "loss": 4.9414, "mean_token_accuracy": 0.23593441843986512, "num_tokens": 12956507.0, "step": 7125 }, { "entropy": 5.7640424251556395, "epoch": 6.125483455092394, "grad_norm": 1.3046875, "learning_rate": 0.00022139239874590362, "loss": 5.0582, "mean_token_accuracy": 0.22682570517063141, "num_tokens": 12965740.0, "step": 7130 }, { "entropy": 5.745245885848999, "epoch": 6.1297808336914485, "grad_norm": 1.2734375, "learning_rate": 0.0002210695493425013, "loss": 4.9708, "mean_token_accuracy": 0.23030567169189453, "num_tokens": 12975057.0, "step": 7135 }, { "entropy": 5.677684688568116, "epoch": 6.134078212290503, "grad_norm": 1.53125, "learning_rate": 0.00022074681770164735, "loss": 4.9075, "mean_token_accuracy": 0.23593185544013978, "num_tokens": 12984087.0, "step": 7140 }, { "entropy": 5.6709558963775635, "epoch": 6.138375590889558, "grad_norm": 1.2265625, "learning_rate": 0.00022042420452805868, "loss": 4.9296, "mean_token_accuracy": 0.24000215977430345, "num_tokens": 12992793.0, "step": 7145 }, { "entropy": 5.755114269256592, "epoch": 6.142672969488612, "grad_norm": 1.3828125, "learning_rate": 0.00022010171052619365, "loss": 4.9894, "mean_token_accuracy": 0.2350299596786499, "num_tokens": 13000769.0, "step": 7150 }, { "entropy": 5.762734317779541, "epoch": 6.146970348087667, "grad_norm": 1.28125, "learning_rate": 0.00021977933640025, "loss": 4.9752, "mean_token_accuracy": 0.23126785159111024, "num_tokens": 13010677.0, "step": 7155 }, { "entropy": 5.642547845840454, "epoch": 6.151267726686721, "grad_norm": 1.28125, "learning_rate": 0.00021945708285416434, "loss": 4.8383, "mean_token_accuracy": 0.24688103795051575, "num_tokens": 13019791.0, "step": 7160 }, { "entropy": 5.730410861968994, "epoch": 6.155565105285776, "grad_norm": 1.328125, "learning_rate": 0.0002191349505916093, "loss": 5.0057, "mean_token_accuracy": 0.23793091177940368, "num_tokens": 13029223.0, "step": 7165 }, { "entropy": 5.737023544311524, "epoch": 6.15986248388483, "grad_norm": 1.234375, "learning_rate": 0.00021881294031599318, "loss": 4.9491, "mean_token_accuracy": 0.23026928752660752, "num_tokens": 13038716.0, "step": 7170 }, { "entropy": 5.756968975067139, "epoch": 6.164159862483885, "grad_norm": 1.40625, "learning_rate": 0.0002184910527304576, "loss": 4.9858, "mean_token_accuracy": 0.23431467413902282, "num_tokens": 13047915.0, "step": 7175 }, { "entropy": 5.707231950759888, "epoch": 6.1684572410829395, "grad_norm": 1.359375, "learning_rate": 0.00021816928853787636, "loss": 4.936, "mean_token_accuracy": 0.24333883821964264, "num_tokens": 13056613.0, "step": 7180 }, { "entropy": 5.7403875350952145, "epoch": 6.172754619681994, "grad_norm": 1.28125, "learning_rate": 0.00021784764844085398, "loss": 4.9922, "mean_token_accuracy": 0.23102872520685197, "num_tokens": 13066658.0, "step": 7185 }, { "entropy": 5.814033651351929, "epoch": 6.177051998281049, "grad_norm": 1.4453125, "learning_rate": 0.0002175261331417238, "loss": 4.9941, "mean_token_accuracy": 0.2289365902543068, "num_tokens": 13074798.0, "step": 7190 }, { "entropy": 5.694831132888794, "epoch": 6.181349376880103, "grad_norm": 1.359375, "learning_rate": 0.00021720474334254675, "loss": 4.92, "mean_token_accuracy": 0.22971168160438538, "num_tokens": 13084173.0, "step": 7195 }, { "entropy": 5.730774879455566, "epoch": 6.185646755479158, "grad_norm": 1.25, "learning_rate": 0.00021688347974510962, "loss": 4.9482, "mean_token_accuracy": 0.23248852640390397, "num_tokens": 13093096.0, "step": 7200 }, { "entropy": 5.719029092788697, "epoch": 6.189944134078212, "grad_norm": 1.5078125, "learning_rate": 0.00021656234305092377, "loss": 4.9397, "mean_token_accuracy": 0.239972348511219, "num_tokens": 13101191.0, "step": 7205 }, { "entropy": 5.707985305786133, "epoch": 6.194241512677267, "grad_norm": 1.4609375, "learning_rate": 0.0002162413339612234, "loss": 4.9712, "mean_token_accuracy": 0.23447236716747283, "num_tokens": 13109829.0, "step": 7210 }, { "entropy": 5.708271741867065, "epoch": 6.198538891276321, "grad_norm": 1.4140625, "learning_rate": 0.00021592045317696406, "loss": 4.9274, "mean_token_accuracy": 0.23861967474222184, "num_tokens": 13119314.0, "step": 7215 }, { "entropy": 5.6994280338287355, "epoch": 6.202836269875376, "grad_norm": 1.3828125, "learning_rate": 0.00021559970139882102, "loss": 4.8994, "mean_token_accuracy": 0.23726629912853242, "num_tokens": 13128113.0, "step": 7220 }, { "entropy": 5.7727789878845215, "epoch": 6.20713364847443, "grad_norm": 1.3515625, "learning_rate": 0.0002152790793271881, "loss": 5.0355, "mean_token_accuracy": 0.22249855697155, "num_tokens": 13136892.0, "step": 7225 }, { "entropy": 5.766521263122558, "epoch": 6.211431027073485, "grad_norm": 1.1484375, "learning_rate": 0.00021495858766217558, "loss": 5.0147, "mean_token_accuracy": 0.22924861907958985, "num_tokens": 13146960.0, "step": 7230 }, { "entropy": 5.720303297042847, "epoch": 6.2157284056725395, "grad_norm": 1.375, "learning_rate": 0.00021463822710360932, "loss": 4.8958, "mean_token_accuracy": 0.245298570394516, "num_tokens": 13156147.0, "step": 7235 }, { "entropy": 5.748995399475097, "epoch": 6.220025784271594, "grad_norm": 1.2734375, "learning_rate": 0.00021431799835102867, "loss": 4.9738, "mean_token_accuracy": 0.22614747285842896, "num_tokens": 13164588.0, "step": 7240 }, { "entropy": 5.660120677947998, "epoch": 6.224323162870649, "grad_norm": 1.28125, "learning_rate": 0.00021399790210368524, "loss": 4.9139, "mean_token_accuracy": 0.23604709059000015, "num_tokens": 13174361.0, "step": 7245 }, { "entropy": 5.748390197753906, "epoch": 6.228620541469703, "grad_norm": 1.140625, "learning_rate": 0.00021367793906054133, "loss": 5.119, "mean_token_accuracy": 0.22031570225954056, "num_tokens": 13185266.0, "step": 7250 }, { "entropy": 5.738927364349365, "epoch": 6.232917920068758, "grad_norm": 1.375, "learning_rate": 0.00021335810992026823, "loss": 4.9654, "mean_token_accuracy": 0.24364089071750641, "num_tokens": 13194227.0, "step": 7255 }, { "entropy": 5.771244287490845, "epoch": 6.237215298667812, "grad_norm": 1.4453125, "learning_rate": 0.00021303841538124497, "loss": 5.0114, "mean_token_accuracy": 0.22773328721523284, "num_tokens": 13202569.0, "step": 7260 }, { "entropy": 5.7087109088897705, "epoch": 6.241512677266867, "grad_norm": 1.3515625, "learning_rate": 0.00021271885614155685, "loss": 4.9538, "mean_token_accuracy": 0.2368649423122406, "num_tokens": 13212201.0, "step": 7265 }, { "entropy": 5.627471113204956, "epoch": 6.245810055865922, "grad_norm": 1.4296875, "learning_rate": 0.0002123994328989932, "loss": 4.8806, "mean_token_accuracy": 0.2345322847366333, "num_tokens": 13220802.0, "step": 7270 }, { "entropy": 5.730443000793457, "epoch": 6.250107434464977, "grad_norm": 1.3046875, "learning_rate": 0.00021208014635104688, "loss": 5.0275, "mean_token_accuracy": 0.2250627398490906, "num_tokens": 13229519.0, "step": 7275 }, { "entropy": 5.741388607025146, "epoch": 6.254404813064031, "grad_norm": 1.3515625, "learning_rate": 0.00021176099719491209, "loss": 4.9881, "mean_token_accuracy": 0.22891727834939957, "num_tokens": 13238865.0, "step": 7280 }, { "entropy": 5.702636861801148, "epoch": 6.258702191663086, "grad_norm": 1.4609375, "learning_rate": 0.00021144198612748312, "loss": 4.9049, "mean_token_accuracy": 0.2440029874444008, "num_tokens": 13247259.0, "step": 7285 }, { "entropy": 5.7749049186706545, "epoch": 6.26299957026214, "grad_norm": 1.2578125, "learning_rate": 0.00021112311384535243, "loss": 5.0122, "mean_token_accuracy": 0.23096455335617067, "num_tokens": 13256692.0, "step": 7290 }, { "entropy": 5.753351926803589, "epoch": 6.267296948861195, "grad_norm": 1.2109375, "learning_rate": 0.00021080438104480976, "loss": 4.947, "mean_token_accuracy": 0.23121515959501265, "num_tokens": 13266109.0, "step": 7295 }, { "entropy": 5.79936261177063, "epoch": 6.2715943274602495, "grad_norm": 1.203125, "learning_rate": 0.00021048578842184019, "loss": 5.0452, "mean_token_accuracy": 0.22553887069225312, "num_tokens": 13275484.0, "step": 7300 }, { "entropy": 5.706963014602661, "epoch": 6.275891706059304, "grad_norm": 1.328125, "learning_rate": 0.00021016733667212245, "loss": 4.9322, "mean_token_accuracy": 0.2428619921207428, "num_tokens": 13284755.0, "step": 7305 }, { "entropy": 5.693513488769531, "epoch": 6.280189084658359, "grad_norm": 1.1171875, "learning_rate": 0.00020984902649102806, "loss": 4.9319, "mean_token_accuracy": 0.23691536039113997, "num_tokens": 13294386.0, "step": 7310 }, { "entropy": 5.711094999313355, "epoch": 6.284486463257413, "grad_norm": 1.265625, "learning_rate": 0.00020953085857361924, "loss": 4.9281, "mean_token_accuracy": 0.23375690579414368, "num_tokens": 13303926.0, "step": 7315 }, { "entropy": 5.696382474899292, "epoch": 6.288783841856468, "grad_norm": 1.3984375, "learning_rate": 0.00020921283361464754, "loss": 5.0236, "mean_token_accuracy": 0.22575154900550842, "num_tokens": 13312727.0, "step": 7320 }, { "entropy": 5.716668939590454, "epoch": 6.293081220455522, "grad_norm": 1.34375, "learning_rate": 0.00020889495230855232, "loss": 4.9784, "mean_token_accuracy": 0.22872833162546158, "num_tokens": 13321706.0, "step": 7325 }, { "entropy": 5.668966674804688, "epoch": 6.297378599054577, "grad_norm": 1.25, "learning_rate": 0.00020857721534945923, "loss": 4.9344, "mean_token_accuracy": 0.23734308630228043, "num_tokens": 13330436.0, "step": 7330 }, { "entropy": 5.757170391082764, "epoch": 6.301675977653631, "grad_norm": 1.328125, "learning_rate": 0.0002082596234311789, "loss": 5.0185, "mean_token_accuracy": 0.22471884340047837, "num_tokens": 13339334.0, "step": 7335 }, { "entropy": 5.78465838432312, "epoch": 6.305973356252686, "grad_norm": 1.34375, "learning_rate": 0.0002079421772472051, "loss": 5.0679, "mean_token_accuracy": 0.22117299884557723, "num_tokens": 13348969.0, "step": 7340 }, { "entropy": 5.793431758880615, "epoch": 6.31027073485174, "grad_norm": 1.1328125, "learning_rate": 0.0002076248774907134, "loss": 5.0456, "mean_token_accuracy": 0.22682560235261917, "num_tokens": 13358467.0, "step": 7345 }, { "entropy": 5.696144914627075, "epoch": 6.314568113450795, "grad_norm": 1.5, "learning_rate": 0.00020730772485455962, "loss": 4.8624, "mean_token_accuracy": 0.23993807286024094, "num_tokens": 13366413.0, "step": 7350 }, { "entropy": 5.7636823654174805, "epoch": 6.3188654920498495, "grad_norm": 1.1875, "learning_rate": 0.0002069907200312785, "loss": 5.0386, "mean_token_accuracy": 0.22139448076486587, "num_tokens": 13376620.0, "step": 7355 }, { "entropy": 5.723620796203614, "epoch": 6.323162870648904, "grad_norm": 1.359375, "learning_rate": 0.00020667386371308162, "loss": 5.0121, "mean_token_accuracy": 0.226824951171875, "num_tokens": 13385492.0, "step": 7360 }, { "entropy": 5.699918603897094, "epoch": 6.327460249247959, "grad_norm": 1.2265625, "learning_rate": 0.00020635715659185673, "loss": 4.9534, "mean_token_accuracy": 0.237271548807621, "num_tokens": 13395562.0, "step": 7365 }, { "entropy": 5.646546459197998, "epoch": 6.331757627847013, "grad_norm": 1.4453125, "learning_rate": 0.00020604059935916551, "loss": 4.8925, "mean_token_accuracy": 0.24219037890434264, "num_tokens": 13403357.0, "step": 7370 }, { "entropy": 5.723139715194702, "epoch": 6.336055006446068, "grad_norm": 1.4140625, "learning_rate": 0.00020572419270624255, "loss": 4.9969, "mean_token_accuracy": 0.23338112533092498, "num_tokens": 13412527.0, "step": 7375 }, { "entropy": 5.752758359909057, "epoch": 6.340352385045122, "grad_norm": 1.3046875, "learning_rate": 0.00020540793732399339, "loss": 5.0577, "mean_token_accuracy": 0.22245372980833053, "num_tokens": 13422455.0, "step": 7380 }, { "entropy": 5.807447099685669, "epoch": 6.344649763644177, "grad_norm": 1.375, "learning_rate": 0.00020509183390299325, "loss": 5.138, "mean_token_accuracy": 0.21202833354473113, "num_tokens": 13431677.0, "step": 7385 }, { "entropy": 5.653739356994629, "epoch": 6.348947142243231, "grad_norm": 1.2890625, "learning_rate": 0.00020477588313348594, "loss": 4.8809, "mean_token_accuracy": 0.24472787827253342, "num_tokens": 13440522.0, "step": 7390 }, { "entropy": 5.774398708343506, "epoch": 6.353244520842286, "grad_norm": 1.4921875, "learning_rate": 0.00020446008570538154, "loss": 5.0073, "mean_token_accuracy": 0.22441416233778, "num_tokens": 13450021.0, "step": 7395 }, { "entropy": 5.674509477615357, "epoch": 6.35754189944134, "grad_norm": 1.3515625, "learning_rate": 0.0002041444423082554, "loss": 4.9173, "mean_token_accuracy": 0.23533451408147812, "num_tokens": 13458115.0, "step": 7400 }, { "entropy": 5.764778327941895, "epoch": 6.361839278040395, "grad_norm": 1.3515625, "learning_rate": 0.00020382895363134652, "loss": 4.9923, "mean_token_accuracy": 0.22358438670635222, "num_tokens": 13466798.0, "step": 7405 }, { "entropy": 5.71299376487732, "epoch": 6.3661366566394495, "grad_norm": 1.359375, "learning_rate": 0.00020351362036355602, "loss": 5.0101, "mean_token_accuracy": 0.23004357367753983, "num_tokens": 13476096.0, "step": 7410 }, { "entropy": 5.714753818511963, "epoch": 6.370434035238505, "grad_norm": 1.28125, "learning_rate": 0.0002031984431934459, "loss": 4.9541, "mean_token_accuracy": 0.2371777668595314, "num_tokens": 13484601.0, "step": 7415 }, { "entropy": 5.716524934768676, "epoch": 6.3747314138375595, "grad_norm": 1.25, "learning_rate": 0.00020288342280923695, "loss": 4.9449, "mean_token_accuracy": 0.23932171016931533, "num_tokens": 13493994.0, "step": 7420 }, { "entropy": 5.69349045753479, "epoch": 6.379028792436614, "grad_norm": 1.3203125, "learning_rate": 0.00020256855989880785, "loss": 4.9782, "mean_token_accuracy": 0.2243503674864769, "num_tokens": 13502890.0, "step": 7425 }, { "entropy": 5.67198166847229, "epoch": 6.383326171035669, "grad_norm": 1.3515625, "learning_rate": 0.00020225385514969336, "loss": 4.9378, "mean_token_accuracy": 0.2383576363325119, "num_tokens": 13512980.0, "step": 7430 }, { "entropy": 5.71486463546753, "epoch": 6.387623549634723, "grad_norm": 1.3125, "learning_rate": 0.00020193930924908277, "loss": 5.0231, "mean_token_accuracy": 0.23398934602737426, "num_tokens": 13521558.0, "step": 7435 }, { "entropy": 5.680387735366821, "epoch": 6.391920928233778, "grad_norm": 1.171875, "learning_rate": 0.00020162492288381867, "loss": 4.9307, "mean_token_accuracy": 0.23600068539381028, "num_tokens": 13531506.0, "step": 7440 }, { "entropy": 5.672720003128052, "epoch": 6.396218306832832, "grad_norm": 1.1953125, "learning_rate": 0.0002013106967403953, "loss": 4.8982, "mean_token_accuracy": 0.2424224779009819, "num_tokens": 13540559.0, "step": 7445 }, { "entropy": 5.768481636047364, "epoch": 6.400515685431887, "grad_norm": 1.4140625, "learning_rate": 0.0002009966315049569, "loss": 5.011, "mean_token_accuracy": 0.23141635209321976, "num_tokens": 13550654.0, "step": 7450 }, { "entropy": 5.763478708267212, "epoch": 6.404813064030941, "grad_norm": 1.3125, "learning_rate": 0.0002006827278632964, "loss": 5.027, "mean_token_accuracy": 0.23014189153909684, "num_tokens": 13560708.0, "step": 7455 }, { "entropy": 5.722728872299195, "epoch": 6.409110442629996, "grad_norm": 1.28125, "learning_rate": 0.00020036898650085377, "loss": 4.9173, "mean_token_accuracy": 0.22755362838506699, "num_tokens": 13569330.0, "step": 7460 }, { "entropy": 5.764143085479736, "epoch": 6.41340782122905, "grad_norm": 1.40625, "learning_rate": 0.00020005540810271493, "loss": 5.0893, "mean_token_accuracy": 0.21666009724140167, "num_tokens": 13577500.0, "step": 7465 }, { "entropy": 5.709292697906494, "epoch": 6.417705199828105, "grad_norm": 1.3203125, "learning_rate": 0.00019974199335360976, "loss": 4.9859, "mean_token_accuracy": 0.23307044357061385, "num_tokens": 13586087.0, "step": 7470 }, { "entropy": 5.699592590332031, "epoch": 6.4220025784271595, "grad_norm": 1.3671875, "learning_rate": 0.00019942874293791068, "loss": 4.9396, "mean_token_accuracy": 0.23114058822393418, "num_tokens": 13595346.0, "step": 7475 }, { "entropy": 5.764913511276245, "epoch": 6.426299957026214, "grad_norm": 1.3515625, "learning_rate": 0.00019911565753963145, "loss": 5.0835, "mean_token_accuracy": 0.2268539473414421, "num_tokens": 13604755.0, "step": 7480 }, { "entropy": 5.752863359451294, "epoch": 6.430597335625269, "grad_norm": 1.1953125, "learning_rate": 0.0001988027378424254, "loss": 4.9844, "mean_token_accuracy": 0.22718686014413833, "num_tokens": 13613860.0, "step": 7485 }, { "entropy": 5.664366340637207, "epoch": 6.434894714224323, "grad_norm": 1.421875, "learning_rate": 0.00019848998452958429, "loss": 4.8699, "mean_token_accuracy": 0.24055294096469879, "num_tokens": 13622574.0, "step": 7490 }, { "entropy": 5.728462505340576, "epoch": 6.439192092823378, "grad_norm": 1.265625, "learning_rate": 0.00019817739828403602, "loss": 5.0143, "mean_token_accuracy": 0.22399941384792327, "num_tokens": 13632366.0, "step": 7495 }, { "entropy": 5.775394105911255, "epoch": 6.443489471422432, "grad_norm": 1.2265625, "learning_rate": 0.00019786497978834422, "loss": 4.9188, "mean_token_accuracy": 0.24379066675901412, "num_tokens": 13640682.0, "step": 7500 }, { "epoch": 6.443489471422432, "eval_entropy": 5.516042317356075, "eval_loss": 5.9212799072265625, "eval_mean_token_accuracy": 0.18048049712570402, "eval_num_tokens": 13640682.0, "eval_runtime": 2.2313, "eval_samples_per_second": 1590.525, "eval_steps_per_second": 198.984, "step": 7500 }, { "entropy": 5.710464859008789, "epoch": 6.447786850021487, "grad_norm": 1.3671875, "learning_rate": 0.00019755272972470602, "loss": 4.9985, "mean_token_accuracy": 0.2372261881828308, "num_tokens": 13649675.0, "step": 7505 }, { "entropy": 5.728733777999878, "epoch": 6.452084228620541, "grad_norm": 1.390625, "learning_rate": 0.00019724064877495057, "loss": 4.9681, "mean_token_accuracy": 0.22544475942850112, "num_tokens": 13658260.0, "step": 7510 }, { "entropy": 5.689570474624634, "epoch": 6.456381607219596, "grad_norm": 1.4296875, "learning_rate": 0.00019692873762053808, "loss": 4.9032, "mean_token_accuracy": 0.240114925801754, "num_tokens": 13666571.0, "step": 7515 }, { "entropy": 5.74085488319397, "epoch": 6.46067898581865, "grad_norm": 1.1640625, "learning_rate": 0.00019661699694255785, "loss": 4.9944, "mean_token_accuracy": 0.23399491906166076, "num_tokens": 13675707.0, "step": 7520 }, { "entropy": 5.765975475311279, "epoch": 6.464976364417705, "grad_norm": 1.46875, "learning_rate": 0.00019630542742172692, "loss": 4.9617, "mean_token_accuracy": 0.23181569278240205, "num_tokens": 13684796.0, "step": 7525 }, { "entropy": 5.7313316822052, "epoch": 6.4692737430167595, "grad_norm": 1.328125, "learning_rate": 0.00019599402973838854, "loss": 5.0548, "mean_token_accuracy": 0.2183179423213005, "num_tokens": 13693158.0, "step": 7530 }, { "entropy": 5.759279012680054, "epoch": 6.473571121615814, "grad_norm": 1.40625, "learning_rate": 0.0001956828045725107, "loss": 5.0232, "mean_token_accuracy": 0.2257732018828392, "num_tokens": 13703521.0, "step": 7535 }, { "entropy": 5.726857852935791, "epoch": 6.477868500214869, "grad_norm": 1.359375, "learning_rate": 0.0001953717526036849, "loss": 4.938, "mean_token_accuracy": 0.23097043633460998, "num_tokens": 13712337.0, "step": 7540 }, { "entropy": 5.701276779174805, "epoch": 6.482165878813923, "grad_norm": 1.4296875, "learning_rate": 0.00019506087451112437, "loss": 4.9481, "mean_token_accuracy": 0.22534014284610748, "num_tokens": 13721605.0, "step": 7545 }, { "entropy": 5.643766689300537, "epoch": 6.486463257412978, "grad_norm": 1.4609375, "learning_rate": 0.00019475017097366244, "loss": 4.8556, "mean_token_accuracy": 0.23758668601512908, "num_tokens": 13730827.0, "step": 7550 }, { "entropy": 5.768571901321411, "epoch": 6.490760636012032, "grad_norm": 1.3984375, "learning_rate": 0.00019443964266975156, "loss": 5.0255, "mean_token_accuracy": 0.22498469054698944, "num_tokens": 13740128.0, "step": 7555 }, { "entropy": 5.71935887336731, "epoch": 6.495058014611088, "grad_norm": 1.3515625, "learning_rate": 0.0001941292902774614, "loss": 5.0079, "mean_token_accuracy": 0.22391847968101503, "num_tokens": 13748428.0, "step": 7560 }, { "entropy": 5.719990539550781, "epoch": 6.499355393210142, "grad_norm": 1.25, "learning_rate": 0.00019381911447447742, "loss": 4.9749, "mean_token_accuracy": 0.23590452522039412, "num_tokens": 13757109.0, "step": 7565 }, { "entropy": 5.771088457107544, "epoch": 6.503652771809197, "grad_norm": 1.4609375, "learning_rate": 0.00019350911593809977, "loss": 5.0273, "mean_token_accuracy": 0.22215828597545623, "num_tokens": 13766281.0, "step": 7570 }, { "entropy": 5.7560049533844, "epoch": 6.507950150408251, "grad_norm": 1.265625, "learning_rate": 0.00019319929534524128, "loss": 5.0243, "mean_token_accuracy": 0.22932689040899276, "num_tokens": 13775535.0, "step": 7575 }, { "entropy": 5.734013271331787, "epoch": 6.512247529007306, "grad_norm": 1.3125, "learning_rate": 0.00019288965337242636, "loss": 4.9859, "mean_token_accuracy": 0.22449343800544738, "num_tokens": 13784099.0, "step": 7580 }, { "entropy": 5.832199907302856, "epoch": 6.51654490760636, "grad_norm": 1.3125, "learning_rate": 0.00019258019069578924, "loss": 5.1169, "mean_token_accuracy": 0.22021577656269073, "num_tokens": 13793098.0, "step": 7585 }, { "entropy": 5.665675640106201, "epoch": 6.520842286205415, "grad_norm": 1.3515625, "learning_rate": 0.00019227090799107266, "loss": 4.9582, "mean_token_accuracy": 0.23555098623037338, "num_tokens": 13801847.0, "step": 7590 }, { "entropy": 5.6672852516174315, "epoch": 6.5251396648044695, "grad_norm": 1.25, "learning_rate": 0.0001919618059336265, "loss": 4.8502, "mean_token_accuracy": 0.24223940819501877, "num_tokens": 13810599.0, "step": 7595 }, { "entropy": 5.765718603134156, "epoch": 6.529437043403524, "grad_norm": 1.4453125, "learning_rate": 0.00019165288519840617, "loss": 4.9983, "mean_token_accuracy": 0.22907198518514632, "num_tokens": 13819602.0, "step": 7600 }, { "entropy": 5.687669324874878, "epoch": 6.533734422002579, "grad_norm": 1.453125, "learning_rate": 0.000191344146459971, "loss": 4.9475, "mean_token_accuracy": 0.2387497156858444, "num_tokens": 13828254.0, "step": 7605 }, { "entropy": 5.701369762420654, "epoch": 6.538031800601633, "grad_norm": 1.296875, "learning_rate": 0.00019103559039248302, "loss": 4.9916, "mean_token_accuracy": 0.22820447534322738, "num_tokens": 13837163.0, "step": 7610 }, { "entropy": 5.764211797714234, "epoch": 6.542329179200688, "grad_norm": 1.375, "learning_rate": 0.0001907272176697052, "loss": 5.0007, "mean_token_accuracy": 0.22737849950790406, "num_tokens": 13846373.0, "step": 7615 }, { "entropy": 5.673723602294922, "epoch": 6.546626557799742, "grad_norm": 1.3046875, "learning_rate": 0.00019041902896500059, "loss": 4.9524, "mean_token_accuracy": 0.23322181403636932, "num_tokens": 13855846.0, "step": 7620 }, { "entropy": 5.736261415481567, "epoch": 6.550923936398797, "grad_norm": 1.359375, "learning_rate": 0.00019011102495132993, "loss": 4.9605, "mean_token_accuracy": 0.23272975385189057, "num_tokens": 13864723.0, "step": 7625 }, { "entropy": 5.798337507247925, "epoch": 6.555221314997851, "grad_norm": 1.140625, "learning_rate": 0.00018980320630125104, "loss": 5.1101, "mean_token_accuracy": 0.2181757315993309, "num_tokens": 13873418.0, "step": 7630 }, { "entropy": 5.75980315208435, "epoch": 6.559518693596906, "grad_norm": 1.609375, "learning_rate": 0.00018949557368691666, "loss": 5.0061, "mean_token_accuracy": 0.23227301239967346, "num_tokens": 13881890.0, "step": 7635 }, { "entropy": 5.7168203830719, "epoch": 6.56381607219596, "grad_norm": 1.21875, "learning_rate": 0.00018918812778007343, "loss": 5.0371, "mean_token_accuracy": 0.22749389559030533, "num_tokens": 13891289.0, "step": 7640 }, { "entropy": 5.723117971420288, "epoch": 6.568113450795015, "grad_norm": 1.3203125, "learning_rate": 0.00018888086925206054, "loss": 4.9546, "mean_token_accuracy": 0.23108558654785155, "num_tokens": 13900344.0, "step": 7645 }, { "entropy": 5.778712177276612, "epoch": 6.5724108293940695, "grad_norm": 1.4296875, "learning_rate": 0.00018857379877380763, "loss": 4.9952, "mean_token_accuracy": 0.23351393938064574, "num_tokens": 13909108.0, "step": 7650 }, { "entropy": 5.726499557495117, "epoch": 6.576708207993124, "grad_norm": 1.2578125, "learning_rate": 0.00018826691701583404, "loss": 5.0088, "mean_token_accuracy": 0.23075273931026458, "num_tokens": 13918458.0, "step": 7655 }, { "entropy": 5.750834035873413, "epoch": 6.581005586592179, "grad_norm": 1.3828125, "learning_rate": 0.00018796022464824663, "loss": 4.9703, "mean_token_accuracy": 0.23334225118160248, "num_tokens": 13927186.0, "step": 7660 }, { "entropy": 5.721710252761841, "epoch": 6.585302965191233, "grad_norm": 1.3671875, "learning_rate": 0.00018765372234073912, "loss": 5.0632, "mean_token_accuracy": 0.21736631840467452, "num_tokens": 13936701.0, "step": 7665 }, { "entropy": 5.700629091262817, "epoch": 6.589600343790288, "grad_norm": 1.453125, "learning_rate": 0.00018734741076259005, "loss": 5.075, "mean_token_accuracy": 0.21827920377254487, "num_tokens": 13945842.0, "step": 7670 }, { "entropy": 5.6856285572052006, "epoch": 6.593897722389342, "grad_norm": 1.2109375, "learning_rate": 0.00018704129058266152, "loss": 4.9466, "mean_token_accuracy": 0.2341363787651062, "num_tokens": 13955675.0, "step": 7675 }, { "entropy": 5.80717830657959, "epoch": 6.598195100988397, "grad_norm": 1.3125, "learning_rate": 0.00018673536246939743, "loss": 5.0459, "mean_token_accuracy": 0.23114715218544007, "num_tokens": 13964153.0, "step": 7680 }, { "entropy": 5.752105951309204, "epoch": 6.602492479587451, "grad_norm": 1.4375, "learning_rate": 0.00018642962709082274, "loss": 5.0116, "mean_token_accuracy": 0.22900762856006623, "num_tokens": 13972141.0, "step": 7685 }, { "entropy": 5.650936651229858, "epoch": 6.606789858186506, "grad_norm": 1.4140625, "learning_rate": 0.00018612408511454103, "loss": 4.8898, "mean_token_accuracy": 0.2377362921833992, "num_tokens": 13981369.0, "step": 7690 }, { "entropy": 5.690488052368164, "epoch": 6.61108723678556, "grad_norm": 1.375, "learning_rate": 0.00018581873720773423, "loss": 4.9523, "mean_token_accuracy": 0.2291845917701721, "num_tokens": 13990626.0, "step": 7695 }, { "entropy": 5.7331787109375, "epoch": 6.615384615384615, "grad_norm": 1.34375, "learning_rate": 0.00018551358403715989, "loss": 5.0515, "mean_token_accuracy": 0.22102296650409697, "num_tokens": 13999399.0, "step": 7700 }, { "entropy": 5.750496292114258, "epoch": 6.61968199398367, "grad_norm": 1.4140625, "learning_rate": 0.00018520862626915052, "loss": 4.9555, "mean_token_accuracy": 0.23201001733541488, "num_tokens": 14007487.0, "step": 7705 }, { "entropy": 5.703403472900391, "epoch": 6.623979372582724, "grad_norm": 1.3046875, "learning_rate": 0.00018490386456961223, "loss": 4.927, "mean_token_accuracy": 0.23265215754508972, "num_tokens": 14016779.0, "step": 7710 }, { "entropy": 5.726163053512574, "epoch": 6.6282767511817795, "grad_norm": 1.3359375, "learning_rate": 0.0001845992996040224, "loss": 5.0042, "mean_token_accuracy": 0.23183127194643022, "num_tokens": 14025770.0, "step": 7715 }, { "entropy": 5.670454740524292, "epoch": 6.632574129780834, "grad_norm": 1.3203125, "learning_rate": 0.00018429493203742946, "loss": 4.9045, "mean_token_accuracy": 0.2422835037112236, "num_tokens": 14035304.0, "step": 7720 }, { "entropy": 5.719083833694458, "epoch": 6.636871508379889, "grad_norm": 1.5, "learning_rate": 0.00018399076253445052, "loss": 4.8473, "mean_token_accuracy": 0.24168919026851654, "num_tokens": 14044058.0, "step": 7725 }, { "entropy": 5.739474201202393, "epoch": 6.641168886978943, "grad_norm": 1.3359375, "learning_rate": 0.00018368679175927012, "loss": 4.9136, "mean_token_accuracy": 0.24082895070314408, "num_tokens": 14052709.0, "step": 7730 }, { "entropy": 5.655467987060547, "epoch": 6.645466265577998, "grad_norm": 1.3515625, "learning_rate": 0.00018338302037563885, "loss": 4.8753, "mean_token_accuracy": 0.24130599498748778, "num_tokens": 14061346.0, "step": 7735 }, { "entropy": 5.667017984390259, "epoch": 6.649763644177052, "grad_norm": 1.4453125, "learning_rate": 0.00018307944904687211, "loss": 4.957, "mean_token_accuracy": 0.23699511587619781, "num_tokens": 14070409.0, "step": 7740 }, { "entropy": 5.748349380493164, "epoch": 6.654061022776107, "grad_norm": 1.5546875, "learning_rate": 0.0001827760784358483, "loss": 5.0411, "mean_token_accuracy": 0.2208509013056755, "num_tokens": 14079448.0, "step": 7745 }, { "entropy": 5.750911140441895, "epoch": 6.658358401375161, "grad_norm": 1.3203125, "learning_rate": 0.00018247290920500776, "loss": 5.0164, "mean_token_accuracy": 0.22797961235046388, "num_tokens": 14088452.0, "step": 7750 }, { "entropy": 5.734561491012573, "epoch": 6.662655779974216, "grad_norm": 1.3984375, "learning_rate": 0.00018216994201635062, "loss": 5.0004, "mean_token_accuracy": 0.23072549253702163, "num_tokens": 14097956.0, "step": 7755 }, { "entropy": 5.748372364044189, "epoch": 6.66695315857327, "grad_norm": 1.4140625, "learning_rate": 0.00018186717753143633, "loss": 4.9427, "mean_token_accuracy": 0.23190637975931166, "num_tokens": 14106771.0, "step": 7760 }, { "entropy": 5.754193592071533, "epoch": 6.671250537172325, "grad_norm": 1.25, "learning_rate": 0.00018156461641138133, "loss": 5.0069, "mean_token_accuracy": 0.2301826596260071, "num_tokens": 14115871.0, "step": 7765 }, { "entropy": 5.708150577545166, "epoch": 6.6755479157713795, "grad_norm": 1.2578125, "learning_rate": 0.00018126225931685836, "loss": 4.9429, "mean_token_accuracy": 0.23147647231817245, "num_tokens": 14125316.0, "step": 7770 }, { "entropy": 5.6619995594024655, "epoch": 6.679845294370434, "grad_norm": 1.3671875, "learning_rate": 0.00018096010690809444, "loss": 4.9023, "mean_token_accuracy": 0.23880022764205933, "num_tokens": 14134244.0, "step": 7775 }, { "entropy": 5.749985933303833, "epoch": 6.684142672969489, "grad_norm": 1.296875, "learning_rate": 0.00018065815984486962, "loss": 5.032, "mean_token_accuracy": 0.22433867752552034, "num_tokens": 14143600.0, "step": 7780 }, { "entropy": 5.712496757507324, "epoch": 6.688440051568543, "grad_norm": 1.3828125, "learning_rate": 0.00018035641878651548, "loss": 4.9372, "mean_token_accuracy": 0.23336533308029175, "num_tokens": 14152112.0, "step": 7785 }, { "entropy": 5.681194543838501, "epoch": 6.692737430167598, "grad_norm": 1.484375, "learning_rate": 0.00018005488439191408, "loss": 4.9184, "mean_token_accuracy": 0.235929536819458, "num_tokens": 14161044.0, "step": 7790 }, { "entropy": 5.715410423278809, "epoch": 6.697034808766652, "grad_norm": 1.453125, "learning_rate": 0.0001797535573194959, "loss": 5.0322, "mean_token_accuracy": 0.2334413096308708, "num_tokens": 14169629.0, "step": 7795 }, { "entropy": 5.7020186424255375, "epoch": 6.701332187365707, "grad_norm": 1.234375, "learning_rate": 0.0001794524382272389, "loss": 4.9562, "mean_token_accuracy": 0.2321821540594101, "num_tokens": 14179234.0, "step": 7800 }, { "entropy": 5.716729545593262, "epoch": 6.705629565964761, "grad_norm": 1.1875, "learning_rate": 0.0001791515277726667, "loss": 4.9933, "mean_token_accuracy": 0.2396368682384491, "num_tokens": 14188887.0, "step": 7805 }, { "entropy": 5.681606817245483, "epoch": 6.709926944563816, "grad_norm": 1.3046875, "learning_rate": 0.00017885082661284763, "loss": 4.9074, "mean_token_accuracy": 0.23539066463708877, "num_tokens": 14197731.0, "step": 7810 }, { "entropy": 5.743403530120849, "epoch": 6.71422432316287, "grad_norm": 1.203125, "learning_rate": 0.00017855033540439274, "loss": 5.0258, "mean_token_accuracy": 0.22792317420244218, "num_tokens": 14206851.0, "step": 7815 }, { "entropy": 5.718718433380127, "epoch": 6.718521701761925, "grad_norm": 1.515625, "learning_rate": 0.00017825005480345463, "loss": 5.0302, "mean_token_accuracy": 0.2288100838661194, "num_tokens": 14215879.0, "step": 7820 }, { "entropy": 5.6874980449676515, "epoch": 6.7228190803609795, "grad_norm": 1.4296875, "learning_rate": 0.00017794998546572627, "loss": 4.8801, "mean_token_accuracy": 0.23987502455711365, "num_tokens": 14223798.0, "step": 7825 }, { "entropy": 5.725723648071289, "epoch": 6.727116458960034, "grad_norm": 1.1796875, "learning_rate": 0.0001776501280464391, "loss": 4.9859, "mean_token_accuracy": 0.23374852985143663, "num_tokens": 14233234.0, "step": 7830 }, { "entropy": 5.729088068008423, "epoch": 6.731413837559089, "grad_norm": 1.5, "learning_rate": 0.00017735048320036197, "loss": 4.9332, "mean_token_accuracy": 0.22950732260942458, "num_tokens": 14241851.0, "step": 7835 }, { "entropy": 5.720467853546142, "epoch": 6.735711216158143, "grad_norm": 1.4765625, "learning_rate": 0.00017705105158179917, "loss": 5.0433, "mean_token_accuracy": 0.21816251277923585, "num_tokens": 14251578.0, "step": 7840 }, { "entropy": 5.7909361839294435, "epoch": 6.740008594757198, "grad_norm": 1.2265625, "learning_rate": 0.00017675183384458987, "loss": 5.0511, "mean_token_accuracy": 0.22307134717702864, "num_tokens": 14261122.0, "step": 7845 }, { "entropy": 5.771282529830932, "epoch": 6.744305973356253, "grad_norm": 1.453125, "learning_rate": 0.00017645283064210616, "loss": 4.9444, "mean_token_accuracy": 0.23607346415519714, "num_tokens": 14270594.0, "step": 7850 }, { "entropy": 5.738985013961792, "epoch": 6.748603351955307, "grad_norm": 1.359375, "learning_rate": 0.00017615404262725132, "loss": 4.9987, "mean_token_accuracy": 0.2309481084346771, "num_tokens": 14279646.0, "step": 7855 }, { "entropy": 5.714361047744751, "epoch": 6.752900730554362, "grad_norm": 1.4296875, "learning_rate": 0.00017585547045245885, "loss": 4.9405, "mean_token_accuracy": 0.23584286719560624, "num_tokens": 14288555.0, "step": 7860 }, { "entropy": 5.699155330657959, "epoch": 6.757198109153417, "grad_norm": 1.4140625, "learning_rate": 0.00017555711476969138, "loss": 4.9656, "mean_token_accuracy": 0.22831491380929947, "num_tokens": 14297813.0, "step": 7865 }, { "entropy": 5.742558240890503, "epoch": 6.761495487752471, "grad_norm": 1.4375, "learning_rate": 0.00017525897623043806, "loss": 4.9478, "mean_token_accuracy": 0.23676440864801407, "num_tokens": 14305779.0, "step": 7870 }, { "entropy": 5.777757740020752, "epoch": 6.765792866351526, "grad_norm": 1.484375, "learning_rate": 0.00017496105548571472, "loss": 5.0401, "mean_token_accuracy": 0.21774942576885223, "num_tokens": 14314419.0, "step": 7875 }, { "entropy": 5.742297220230102, "epoch": 6.77009024495058, "grad_norm": 1.4296875, "learning_rate": 0.00017466335318606086, "loss": 4.989, "mean_token_accuracy": 0.22961059510707854, "num_tokens": 14322959.0, "step": 7880 }, { "entropy": 5.7465451717376705, "epoch": 6.774387623549635, "grad_norm": 1.2109375, "learning_rate": 0.00017436586998153947, "loss": 4.948, "mean_token_accuracy": 0.2412852019071579, "num_tokens": 14332270.0, "step": 7885 }, { "entropy": 5.658016204833984, "epoch": 6.7786850021486895, "grad_norm": 1.1953125, "learning_rate": 0.00017406860652173495, "loss": 4.9692, "mean_token_accuracy": 0.2288740873336792, "num_tokens": 14341701.0, "step": 7890 }, { "entropy": 5.730817270278931, "epoch": 6.782982380747744, "grad_norm": 1.46875, "learning_rate": 0.00017377156345575176, "loss": 4.9837, "mean_token_accuracy": 0.23206369131803511, "num_tokens": 14349551.0, "step": 7895 }, { "entropy": 5.648237323760986, "epoch": 6.787279759346799, "grad_norm": 1.3515625, "learning_rate": 0.00017347474143221338, "loss": 4.8989, "mean_token_accuracy": 0.23939766883850097, "num_tokens": 14358577.0, "step": 7900 }, { "entropy": 5.807467699050903, "epoch": 6.791577137945853, "grad_norm": 1.3203125, "learning_rate": 0.00017317814109926044, "loss": 5.0141, "mean_token_accuracy": 0.22136349976062775, "num_tokens": 14367862.0, "step": 7905 }, { "entropy": 5.643885517120362, "epoch": 6.795874516544908, "grad_norm": 1.21875, "learning_rate": 0.0001728817631045495, "loss": 4.9685, "mean_token_accuracy": 0.23076968789100646, "num_tokens": 14377414.0, "step": 7910 }, { "entropy": 5.614119005203247, "epoch": 6.800171895143962, "grad_norm": 1.2421875, "learning_rate": 0.0001725856080952516, "loss": 4.8926, "mean_token_accuracy": 0.2443048432469368, "num_tokens": 14387239.0, "step": 7915 }, { "entropy": 5.6486059665679935, "epoch": 6.804469273743017, "grad_norm": 1.375, "learning_rate": 0.0001722896767180509, "loss": 4.9112, "mean_token_accuracy": 0.2445044696331024, "num_tokens": 14396076.0, "step": 7920 }, { "entropy": 5.712938213348389, "epoch": 6.808766652342071, "grad_norm": 1.234375, "learning_rate": 0.00017199396961914334, "loss": 4.9828, "mean_token_accuracy": 0.2291121393442154, "num_tokens": 14404982.0, "step": 7925 }, { "entropy": 5.704281949996949, "epoch": 6.813064030941126, "grad_norm": 1.421875, "learning_rate": 0.00017169848744423506, "loss": 4.9396, "mean_token_accuracy": 0.22628463208675384, "num_tokens": 14413364.0, "step": 7930 }, { "entropy": 5.776637268066406, "epoch": 6.81736140954018, "grad_norm": 1.1875, "learning_rate": 0.00017140323083854076, "loss": 5.0554, "mean_token_accuracy": 0.22274332046508788, "num_tokens": 14424279.0, "step": 7935 }, { "entropy": 5.7132195949554445, "epoch": 6.821658788139235, "grad_norm": 1.390625, "learning_rate": 0.00017110820044678317, "loss": 5.0418, "mean_token_accuracy": 0.21940283626317977, "num_tokens": 14432931.0, "step": 7940 }, { "entropy": 5.733008098602295, "epoch": 6.8259561667382895, "grad_norm": 1.2890625, "learning_rate": 0.00017081339691319054, "loss": 4.9758, "mean_token_accuracy": 0.22764192670583724, "num_tokens": 14442652.0, "step": 7945 }, { "entropy": 5.75035982131958, "epoch": 6.830253545337344, "grad_norm": 1.2890625, "learning_rate": 0.00017051882088149612, "loss": 4.9486, "mean_token_accuracy": 0.22507085800170898, "num_tokens": 14452061.0, "step": 7950 }, { "entropy": 5.73496150970459, "epoch": 6.834550923936399, "grad_norm": 1.296875, "learning_rate": 0.00017022447299493599, "loss": 4.9373, "mean_token_accuracy": 0.2278796076774597, "num_tokens": 14460771.0, "step": 7955 }, { "entropy": 5.793766689300537, "epoch": 6.838848302535453, "grad_norm": 1.2890625, "learning_rate": 0.00016993035389624854, "loss": 5.0537, "mean_token_accuracy": 0.22595914900302888, "num_tokens": 14469983.0, "step": 7960 }, { "entropy": 5.70825343132019, "epoch": 6.843145681134508, "grad_norm": 1.390625, "learning_rate": 0.0001696364642276722, "loss": 4.9494, "mean_token_accuracy": 0.2361249253153801, "num_tokens": 14478641.0, "step": 7965 }, { "entropy": 5.731251430511475, "epoch": 6.847443059733562, "grad_norm": 1.28125, "learning_rate": 0.00016934280463094448, "loss": 5.0319, "mean_token_accuracy": 0.2262295663356781, "num_tokens": 14487922.0, "step": 7970 }, { "entropy": 5.655464458465576, "epoch": 6.851740438332617, "grad_norm": 1.5546875, "learning_rate": 0.00016904937574730062, "loss": 4.8786, "mean_token_accuracy": 0.24183200299739838, "num_tokens": 14496259.0, "step": 7975 }, { "entropy": 5.706764793395996, "epoch": 6.856037816931671, "grad_norm": 1.546875, "learning_rate": 0.00016875617821747208, "loss": 4.9795, "mean_token_accuracy": 0.22885009348392488, "num_tokens": 14504366.0, "step": 7980 }, { "entropy": 5.688942623138428, "epoch": 6.860335195530726, "grad_norm": 1.1796875, "learning_rate": 0.00016846321268168508, "loss": 4.9992, "mean_token_accuracy": 0.2282481923699379, "num_tokens": 14513561.0, "step": 7985 }, { "entropy": 5.7252562046051025, "epoch": 6.8646325741297805, "grad_norm": 1.3046875, "learning_rate": 0.00016817047977965905, "loss": 4.9373, "mean_token_accuracy": 0.23489120304584504, "num_tokens": 14522531.0, "step": 7990 }, { "entropy": 5.773946905136109, "epoch": 6.868929952728836, "grad_norm": 1.3984375, "learning_rate": 0.0001678779801506058, "loss": 4.9565, "mean_token_accuracy": 0.2310111179947853, "num_tokens": 14531375.0, "step": 7995 }, { "entropy": 5.709077548980713, "epoch": 6.8732273313278895, "grad_norm": 1.328125, "learning_rate": 0.00016758571443322774, "loss": 4.9955, "mean_token_accuracy": 0.23008209466934204, "num_tokens": 14541081.0, "step": 8000 }, { "epoch": 6.8732273313278895, "eval_entropy": 5.525449365108937, "eval_loss": 5.902733325958252, "eval_mean_token_accuracy": 0.18115301297658737, "eval_num_tokens": 14541081.0, "eval_runtime": 2.0411, "eval_samples_per_second": 1738.786, "eval_steps_per_second": 217.532, "step": 8000 }, { "entropy": 5.63901720046997, "epoch": 6.877524709926945, "grad_norm": 1.28125, "learning_rate": 0.0001672936832657162, "loss": 4.9221, "mean_token_accuracy": 0.24134268015623092, "num_tokens": 14550129.0, "step": 8005 }, { "entropy": 5.745688438415527, "epoch": 6.8818220885259995, "grad_norm": 1.5078125, "learning_rate": 0.00016700188728575047, "loss": 5.0905, "mean_token_accuracy": 0.21997221261262895, "num_tokens": 14559051.0, "step": 8010 }, { "entropy": 5.691987371444702, "epoch": 6.886119467125054, "grad_norm": 1.3125, "learning_rate": 0.00016671032713049655, "loss": 5.0005, "mean_token_accuracy": 0.23101048469543456, "num_tokens": 14567719.0, "step": 8015 }, { "entropy": 5.759095239639282, "epoch": 6.890416845724109, "grad_norm": 1.390625, "learning_rate": 0.00016641900343660515, "loss": 5.0156, "mean_token_accuracy": 0.22946806252002716, "num_tokens": 14576256.0, "step": 8020 }, { "entropy": 5.766721200942993, "epoch": 6.894714224323163, "grad_norm": 1.1640625, "learning_rate": 0.0001661279168402107, "loss": 5.0238, "mean_token_accuracy": 0.22409347891807557, "num_tokens": 14586392.0, "step": 8025 }, { "entropy": 5.720468521118164, "epoch": 6.899011602922218, "grad_norm": 1.40625, "learning_rate": 0.00016583706797693008, "loss": 4.994, "mean_token_accuracy": 0.23179059326648713, "num_tokens": 14595448.0, "step": 8030 }, { "entropy": 5.732708072662353, "epoch": 6.903308981521272, "grad_norm": 1.296875, "learning_rate": 0.00016554645748186105, "loss": 5.029, "mean_token_accuracy": 0.23494130671024321, "num_tokens": 14604242.0, "step": 8035 }, { "entropy": 5.803254747390747, "epoch": 6.907606360120327, "grad_norm": 1.3359375, "learning_rate": 0.00016525608598958063, "loss": 5.1037, "mean_token_accuracy": 0.220962455868721, "num_tokens": 14614983.0, "step": 8040 }, { "entropy": 5.757966709136963, "epoch": 6.911903738719381, "grad_norm": 1.375, "learning_rate": 0.00016496595413414421, "loss": 4.971, "mean_token_accuracy": 0.2303234815597534, "num_tokens": 14624748.0, "step": 8045 }, { "entropy": 5.678277587890625, "epoch": 6.916201117318436, "grad_norm": 1.1875, "learning_rate": 0.00016467606254908355, "loss": 4.9261, "mean_token_accuracy": 0.23968843072652818, "num_tokens": 14633642.0, "step": 8050 }, { "entropy": 5.722575759887695, "epoch": 6.9204984959174904, "grad_norm": 1.2890625, "learning_rate": 0.00016438641186740632, "loss": 5.0191, "mean_token_accuracy": 0.2276478499174118, "num_tokens": 14642549.0, "step": 8055 }, { "entropy": 5.710235452651977, "epoch": 6.924795874516545, "grad_norm": 1.359375, "learning_rate": 0.00016409700272159371, "loss": 5.0261, "mean_token_accuracy": 0.22573624700307846, "num_tokens": 14651642.0, "step": 8060 }, { "entropy": 5.7362377643585205, "epoch": 6.9290932531155995, "grad_norm": 1.21875, "learning_rate": 0.00016380783574359957, "loss": 4.9909, "mean_token_accuracy": 0.22976325154304506, "num_tokens": 14661052.0, "step": 8065 }, { "entropy": 5.71889214515686, "epoch": 6.933390631714654, "grad_norm": 1.46875, "learning_rate": 0.0001635189115648491, "loss": 4.9958, "mean_token_accuracy": 0.22729426622390747, "num_tokens": 14670292.0, "step": 8070 }, { "entropy": 5.69280972480774, "epoch": 6.937688010313709, "grad_norm": 1.3828125, "learning_rate": 0.00016323023081623705, "loss": 4.9191, "mean_token_accuracy": 0.23388173431158066, "num_tokens": 14679735.0, "step": 8075 }, { "entropy": 5.754827642440796, "epoch": 6.941985388912763, "grad_norm": 1.296875, "learning_rate": 0.00016294179412812702, "loss": 5.0344, "mean_token_accuracy": 0.21625297963619233, "num_tokens": 14688710.0, "step": 8080 }, { "entropy": 5.741319990158081, "epoch": 6.946282767511818, "grad_norm": 1.328125, "learning_rate": 0.00016265360213034923, "loss": 5.0987, "mean_token_accuracy": 0.21535037606954574, "num_tokens": 14698523.0, "step": 8085 }, { "entropy": 5.681290912628174, "epoch": 6.950580146110872, "grad_norm": 1.1875, "learning_rate": 0.00016236565545220007, "loss": 4.9824, "mean_token_accuracy": 0.22788620889186859, "num_tokens": 14707674.0, "step": 8090 }, { "entropy": 5.701905870437622, "epoch": 6.954877524709927, "grad_norm": 1.21875, "learning_rate": 0.00016207795472243975, "loss": 4.9084, "mean_token_accuracy": 0.237464140355587, "num_tokens": 14716600.0, "step": 8095 }, { "entropy": 5.77295413017273, "epoch": 6.959174903308981, "grad_norm": 1.515625, "learning_rate": 0.00016179050056929173, "loss": 5.0906, "mean_token_accuracy": 0.21767136603593826, "num_tokens": 14726112.0, "step": 8100 }, { "entropy": 5.728647947311401, "epoch": 6.963472281908036, "grad_norm": 1.3671875, "learning_rate": 0.00016150329362044102, "loss": 4.9675, "mean_token_accuracy": 0.23759464919567108, "num_tokens": 14735126.0, "step": 8105 }, { "entropy": 5.7574504852294925, "epoch": 6.9677696605070905, "grad_norm": 1.21875, "learning_rate": 0.00016121633450303285, "loss": 4.9816, "mean_token_accuracy": 0.23083561658859253, "num_tokens": 14744346.0, "step": 8110 }, { "entropy": 5.817537593841553, "epoch": 6.972067039106145, "grad_norm": 1.2578125, "learning_rate": 0.00016092962384367122, "loss": 5.0652, "mean_token_accuracy": 0.22513457387685776, "num_tokens": 14753322.0, "step": 8115 }, { "entropy": 5.663236141204834, "epoch": 6.9763644177051995, "grad_norm": 1.3671875, "learning_rate": 0.0001606431622684176, "loss": 4.98, "mean_token_accuracy": 0.2384248659014702, "num_tokens": 14762384.0, "step": 8120 }, { "entropy": 5.715035390853882, "epoch": 6.980661796304254, "grad_norm": 1.3125, "learning_rate": 0.00016035695040278935, "loss": 5.0252, "mean_token_accuracy": 0.22985492646694183, "num_tokens": 14771451.0, "step": 8125 }, { "entropy": 5.794897556304932, "epoch": 6.984959174903309, "grad_norm": 1.265625, "learning_rate": 0.00016007098887175914, "loss": 5.04, "mean_token_accuracy": 0.22508551180362701, "num_tokens": 14780662.0, "step": 8130 }, { "entropy": 5.753342247009277, "epoch": 6.989256553502363, "grad_norm": 1.4453125, "learning_rate": 0.00015978527829975254, "loss": 4.9586, "mean_token_accuracy": 0.2316376730799675, "num_tokens": 14789201.0, "step": 8135 }, { "entropy": 5.703181743621826, "epoch": 6.993553932101419, "grad_norm": 1.40625, "learning_rate": 0.00015949981931064714, "loss": 4.9866, "mean_token_accuracy": 0.23185751140117644, "num_tokens": 14797857.0, "step": 8140 }, { "entropy": 5.75775842666626, "epoch": 6.997851310700472, "grad_norm": 1.484375, "learning_rate": 0.0001592146125277714, "loss": 4.9861, "mean_token_accuracy": 0.22932713627815246, "num_tokens": 14806271.0, "step": 8145 }, { "entropy": 5.770098368326823, "epoch": 7.001718951439622, "grad_norm": 1.1875, "learning_rate": 0.00015892965857390278, "loss": 5.0595, "mean_token_accuracy": 0.21925362944602966, "num_tokens": 14815568.0, "step": 8150 }, { "entropy": 5.790607690811157, "epoch": 7.006016330038676, "grad_norm": 1.3125, "learning_rate": 0.00015864495807126704, "loss": 4.9636, "mean_token_accuracy": 0.23844788372516632, "num_tokens": 14825140.0, "step": 8155 }, { "entropy": 5.7898753643035885, "epoch": 7.010313708637731, "grad_norm": 1.2421875, "learning_rate": 0.00015836051164153602, "loss": 4.9574, "mean_token_accuracy": 0.23473027497529983, "num_tokens": 14834459.0, "step": 8160 }, { "entropy": 5.77668514251709, "epoch": 7.014611087236785, "grad_norm": 1.4140625, "learning_rate": 0.00015807631990582733, "loss": 4.8857, "mean_token_accuracy": 0.2403940051794052, "num_tokens": 14843632.0, "step": 8165 }, { "entropy": 5.790946006774902, "epoch": 7.01890846583584, "grad_norm": 1.28125, "learning_rate": 0.00015779238348470192, "loss": 4.879, "mean_token_accuracy": 0.24623702019453048, "num_tokens": 14852626.0, "step": 8170 }, { "entropy": 5.639549970626831, "epoch": 7.023205844434894, "grad_norm": 1.453125, "learning_rate": 0.00015750870299816345, "loss": 4.8233, "mean_token_accuracy": 0.23911771923303604, "num_tokens": 14861571.0, "step": 8175 }, { "entropy": 5.668585920333863, "epoch": 7.027503223033949, "grad_norm": 1.3671875, "learning_rate": 0.00015722527906565672, "loss": 4.8115, "mean_token_accuracy": 0.24984675794839858, "num_tokens": 14870383.0, "step": 8180 }, { "entropy": 5.711289310455323, "epoch": 7.0318006016330035, "grad_norm": 1.4765625, "learning_rate": 0.00015694211230606647, "loss": 4.9099, "mean_token_accuracy": 0.23236954361200332, "num_tokens": 14880212.0, "step": 8185 }, { "entropy": 5.720697021484375, "epoch": 7.036097980232058, "grad_norm": 1.328125, "learning_rate": 0.00015665920333771564, "loss": 4.8637, "mean_token_accuracy": 0.24579361379146575, "num_tokens": 14889347.0, "step": 8190 }, { "entropy": 5.725212049484253, "epoch": 7.0403953588311134, "grad_norm": 1.28125, "learning_rate": 0.00015637655277836427, "loss": 4.9014, "mean_token_accuracy": 0.23043718487024306, "num_tokens": 14898553.0, "step": 8195 }, { "entropy": 5.726214981079101, "epoch": 7.044692737430168, "grad_norm": 1.3359375, "learning_rate": 0.0001560941612452081, "loss": 4.8983, "mean_token_accuracy": 0.23977451920509338, "num_tokens": 14907275.0, "step": 8200 }, { "entropy": 5.698152351379394, "epoch": 7.0489901160292225, "grad_norm": 1.40625, "learning_rate": 0.0001558120293548777, "loss": 4.8956, "mean_token_accuracy": 0.24299730211496354, "num_tokens": 14916409.0, "step": 8205 }, { "entropy": 5.807874727249145, "epoch": 7.053287494628277, "grad_norm": 1.390625, "learning_rate": 0.00015553015772343614, "loss": 4.9654, "mean_token_accuracy": 0.2276977479457855, "num_tokens": 14927144.0, "step": 8210 }, { "entropy": 5.668857002258301, "epoch": 7.057584873227332, "grad_norm": 1.4921875, "learning_rate": 0.00015524854696637847, "loss": 4.8286, "mean_token_accuracy": 0.25360685139894484, "num_tokens": 14936310.0, "step": 8215 }, { "entropy": 5.673094940185547, "epoch": 7.061882251826386, "grad_norm": 1.234375, "learning_rate": 0.00015496719769862981, "loss": 4.8297, "mean_token_accuracy": 0.2441292092204094, "num_tokens": 14945571.0, "step": 8220 }, { "entropy": 5.648290920257568, "epoch": 7.066179630425441, "grad_norm": 1.4375, "learning_rate": 0.00015468611053454478, "loss": 4.8496, "mean_token_accuracy": 0.24561198949813842, "num_tokens": 14954586.0, "step": 8225 }, { "entropy": 5.725588417053222, "epoch": 7.070477009024495, "grad_norm": 1.5625, "learning_rate": 0.00015440528608790533, "loss": 4.9286, "mean_token_accuracy": 0.23840467929840087, "num_tokens": 14963048.0, "step": 8230 }, { "entropy": 5.739574861526489, "epoch": 7.07477438762355, "grad_norm": 1.375, "learning_rate": 0.0001541247249719197, "loss": 4.9141, "mean_token_accuracy": 0.2364799052476883, "num_tokens": 14972158.0, "step": 8235 }, { "entropy": 5.750800275802613, "epoch": 7.079071766222604, "grad_norm": 1.328125, "learning_rate": 0.00015384442779922135, "loss": 4.8631, "mean_token_accuracy": 0.2408420354127884, "num_tokens": 14980869.0, "step": 8240 }, { "entropy": 5.7228302478790285, "epoch": 7.083369144821659, "grad_norm": 1.546875, "learning_rate": 0.00015356439518186726, "loss": 4.8409, "mean_token_accuracy": 0.24564204663038253, "num_tokens": 14989103.0, "step": 8245 }, { "entropy": 5.659767150878906, "epoch": 7.0876665234207135, "grad_norm": 1.171875, "learning_rate": 0.00015328462773133672, "loss": 4.8656, "mean_token_accuracy": 0.24564456343650817, "num_tokens": 14999378.0, "step": 8250 }, { "entropy": 5.721705627441406, "epoch": 7.091963902019768, "grad_norm": 1.390625, "learning_rate": 0.00015300512605852977, "loss": 4.9331, "mean_token_accuracy": 0.23642863035202027, "num_tokens": 15007971.0, "step": 8255 }, { "entropy": 5.755592966079712, "epoch": 7.0962612806188226, "grad_norm": 1.2734375, "learning_rate": 0.0001527258907737668, "loss": 4.936, "mean_token_accuracy": 0.23605114668607713, "num_tokens": 15017537.0, "step": 8260 }, { "entropy": 5.686761331558228, "epoch": 7.100558659217877, "grad_norm": 1.515625, "learning_rate": 0.00015244692248678586, "loss": 4.8187, "mean_token_accuracy": 0.2529310867190361, "num_tokens": 15025684.0, "step": 8265 }, { "entropy": 5.70835599899292, "epoch": 7.104856037816932, "grad_norm": 1.484375, "learning_rate": 0.0001521682218067421, "loss": 4.869, "mean_token_accuracy": 0.24218338280916213, "num_tokens": 15034753.0, "step": 8270 }, { "entropy": 5.728517532348633, "epoch": 7.109153416415986, "grad_norm": 1.3359375, "learning_rate": 0.00015188978934220642, "loss": 4.8816, "mean_token_accuracy": 0.24051901549100876, "num_tokens": 15044685.0, "step": 8275 }, { "entropy": 5.771997499465942, "epoch": 7.113450795015041, "grad_norm": 1.2109375, "learning_rate": 0.0001516116257011641, "loss": 4.9531, "mean_token_accuracy": 0.24105844050645828, "num_tokens": 15054853.0, "step": 8280 }, { "entropy": 5.715450191497803, "epoch": 7.117748173614095, "grad_norm": 1.2109375, "learning_rate": 0.0001513337314910134, "loss": 4.8978, "mean_token_accuracy": 0.23127783685922623, "num_tokens": 15065244.0, "step": 8285 }, { "entropy": 5.662863492965698, "epoch": 7.12204555221315, "grad_norm": 1.40625, "learning_rate": 0.00015105610731856416, "loss": 4.8215, "mean_token_accuracy": 0.2513589784502983, "num_tokens": 15074046.0, "step": 8290 }, { "entropy": 5.695620584487915, "epoch": 7.126342930812204, "grad_norm": 1.40625, "learning_rate": 0.00015077875379003653, "loss": 4.8739, "mean_token_accuracy": 0.23767761290073394, "num_tokens": 15083518.0, "step": 8295 }, { "entropy": 5.7220344066619875, "epoch": 7.130640309411259, "grad_norm": 1.2890625, "learning_rate": 0.00015050167151105988, "loss": 4.9374, "mean_token_accuracy": 0.23466922491788864, "num_tokens": 15092512.0, "step": 8300 }, { "entropy": 5.771974849700928, "epoch": 7.1349376880103135, "grad_norm": 1.5625, "learning_rate": 0.000150224861086671, "loss": 4.9051, "mean_token_accuracy": 0.24226571321487428, "num_tokens": 15101722.0, "step": 8305 }, { "entropy": 5.73566575050354, "epoch": 7.139235066609368, "grad_norm": 1.3125, "learning_rate": 0.00014994832312131332, "loss": 4.8418, "mean_token_accuracy": 0.24409846365451812, "num_tokens": 15110114.0, "step": 8310 }, { "entropy": 5.698592281341552, "epoch": 7.143532445208423, "grad_norm": 1.3984375, "learning_rate": 0.00014967205821883532, "loss": 4.937, "mean_token_accuracy": 0.23917200565338134, "num_tokens": 15119461.0, "step": 8315 }, { "entropy": 5.707847642898559, "epoch": 7.147829823807477, "grad_norm": 1.4765625, "learning_rate": 0.000149396066982489, "loss": 4.8737, "mean_token_accuracy": 0.24108270555734634, "num_tokens": 15127518.0, "step": 8320 }, { "entropy": 5.750520896911621, "epoch": 7.152127202406532, "grad_norm": 1.3203125, "learning_rate": 0.00014912035001492897, "loss": 4.9462, "mean_token_accuracy": 0.2327495649456978, "num_tokens": 15136741.0, "step": 8325 }, { "entropy": 5.734196424484253, "epoch": 7.156424581005586, "grad_norm": 1.4375, "learning_rate": 0.00014884490791821058, "loss": 4.8907, "mean_token_accuracy": 0.24345339983701705, "num_tokens": 15145193.0, "step": 8330 }, { "entropy": 5.699493360519409, "epoch": 7.160721959604641, "grad_norm": 1.375, "learning_rate": 0.00014856974129378981, "loss": 4.8922, "mean_token_accuracy": 0.2417183518409729, "num_tokens": 15154117.0, "step": 8335 }, { "entropy": 5.712007856369018, "epoch": 7.165019338203695, "grad_norm": 1.296875, "learning_rate": 0.0001482948507425203, "loss": 4.913, "mean_token_accuracy": 0.23444428145885468, "num_tokens": 15163221.0, "step": 8340 }, { "entropy": 5.747190380096436, "epoch": 7.169316716802751, "grad_norm": 1.2421875, "learning_rate": 0.00014802023686465314, "loss": 4.9764, "mean_token_accuracy": 0.22482520043849946, "num_tokens": 15173234.0, "step": 8345 }, { "entropy": 5.685973215103149, "epoch": 7.173614095401805, "grad_norm": 1.421875, "learning_rate": 0.00014774590025983523, "loss": 4.8529, "mean_token_accuracy": 0.24296284317970276, "num_tokens": 15181436.0, "step": 8350 }, { "entropy": 5.700090456008911, "epoch": 7.17791147400086, "grad_norm": 1.5234375, "learning_rate": 0.00014747184152710807, "loss": 4.9054, "mean_token_accuracy": 0.24234439432621002, "num_tokens": 15191697.0, "step": 8355 }, { "entropy": 5.714896535873413, "epoch": 7.182208852599914, "grad_norm": 1.3515625, "learning_rate": 0.00014719806126490658, "loss": 4.8689, "mean_token_accuracy": 0.2476797804236412, "num_tokens": 15201563.0, "step": 8360 }, { "entropy": 5.682293272018432, "epoch": 7.186506231198969, "grad_norm": 1.4296875, "learning_rate": 0.0001469245600710573, "loss": 4.87, "mean_token_accuracy": 0.24080406427383422, "num_tokens": 15210886.0, "step": 8365 }, { "entropy": 5.713798952102661, "epoch": 7.1908036097980235, "grad_norm": 1.4765625, "learning_rate": 0.00014665133854277742, "loss": 4.9057, "mean_token_accuracy": 0.23708308786153792, "num_tokens": 15219254.0, "step": 8370 }, { "entropy": 5.675087833404541, "epoch": 7.195100988397078, "grad_norm": 1.359375, "learning_rate": 0.0001463783972766737, "loss": 4.8843, "mean_token_accuracy": 0.24380502253770828, "num_tokens": 15228117.0, "step": 8375 }, { "entropy": 5.638825178146362, "epoch": 7.199398366996133, "grad_norm": 1.4765625, "learning_rate": 0.0001461057368687407, "loss": 4.8655, "mean_token_accuracy": 0.24366891533136367, "num_tokens": 15236621.0, "step": 8380 }, { "entropy": 5.668357849121094, "epoch": 7.203695745595187, "grad_norm": 1.3671875, "learning_rate": 0.00014583335791435971, "loss": 4.82, "mean_token_accuracy": 0.24267793148756028, "num_tokens": 15245487.0, "step": 8385 }, { "entropy": 5.695615434646607, "epoch": 7.207993124194242, "grad_norm": 1.3125, "learning_rate": 0.00014556126100829774, "loss": 4.8767, "mean_token_accuracy": 0.23861388117074966, "num_tokens": 15255321.0, "step": 8390 }, { "entropy": 5.645771932601929, "epoch": 7.212290502793296, "grad_norm": 1.3125, "learning_rate": 0.00014528944674470546, "loss": 4.7918, "mean_token_accuracy": 0.24924195259809495, "num_tokens": 15264788.0, "step": 8395 }, { "entropy": 5.726456499099731, "epoch": 7.216587881392351, "grad_norm": 1.4296875, "learning_rate": 0.0001450179157171166, "loss": 4.886, "mean_token_accuracy": 0.23832045942544938, "num_tokens": 15273448.0, "step": 8400 }, { "entropy": 5.758960819244384, "epoch": 7.220885259991405, "grad_norm": 1.4453125, "learning_rate": 0.00014474666851844632, "loss": 4.9719, "mean_token_accuracy": 0.22947929054498672, "num_tokens": 15283071.0, "step": 8405 }, { "entropy": 5.690451908111572, "epoch": 7.22518263859046, "grad_norm": 1.4375, "learning_rate": 0.00014447570574099028, "loss": 4.784, "mean_token_accuracy": 0.25350341796875, "num_tokens": 15291537.0, "step": 8410 }, { "entropy": 5.763349103927612, "epoch": 7.229480017189514, "grad_norm": 1.2890625, "learning_rate": 0.00014420502797642283, "loss": 4.8627, "mean_token_accuracy": 0.24792618304491043, "num_tokens": 15300531.0, "step": 8415 }, { "entropy": 5.700027704238892, "epoch": 7.233777395788569, "grad_norm": 1.453125, "learning_rate": 0.000143934635815796, "loss": 4.947, "mean_token_accuracy": 0.23724473267793655, "num_tokens": 15309820.0, "step": 8420 }, { "entropy": 5.676058435440064, "epoch": 7.2380747743876235, "grad_norm": 1.515625, "learning_rate": 0.0001436645298495381, "loss": 4.8861, "mean_token_accuracy": 0.23884514719247818, "num_tokens": 15318604.0, "step": 8425 }, { "entropy": 5.662927532196045, "epoch": 7.242372152986678, "grad_norm": 1.328125, "learning_rate": 0.00014339471066745262, "loss": 4.8956, "mean_token_accuracy": 0.23285638093948363, "num_tokens": 15327737.0, "step": 8430 }, { "entropy": 5.710688924789428, "epoch": 7.246669531585733, "grad_norm": 1.4765625, "learning_rate": 0.000143125178858717, "loss": 4.9411, "mean_token_accuracy": 0.23508805632591248, "num_tokens": 15336663.0, "step": 8435 }, { "entropy": 5.775945138931275, "epoch": 7.250966910184787, "grad_norm": 1.34375, "learning_rate": 0.00014285593501188083, "loss": 4.9471, "mean_token_accuracy": 0.23182412534952163, "num_tokens": 15345278.0, "step": 8440 }, { "entropy": 5.743250989913941, "epoch": 7.255264288783842, "grad_norm": 1.3125, "learning_rate": 0.00014258697971486492, "loss": 4.9232, "mean_token_accuracy": 0.24036518335342408, "num_tokens": 15354230.0, "step": 8445 }, { "entropy": 5.734498691558838, "epoch": 7.259561667382896, "grad_norm": 1.40625, "learning_rate": 0.00014231831355496045, "loss": 4.9345, "mean_token_accuracy": 0.2410287767648697, "num_tokens": 15362838.0, "step": 8450 }, { "entropy": 5.6637495994567875, "epoch": 7.263859045981951, "grad_norm": 1.234375, "learning_rate": 0.00014204993711882662, "loss": 4.8889, "mean_token_accuracy": 0.24409003406763077, "num_tokens": 15372593.0, "step": 8455 }, { "entropy": 5.759537553787231, "epoch": 7.268156424581005, "grad_norm": 1.359375, "learning_rate": 0.0001417818509924906, "loss": 4.9528, "mean_token_accuracy": 0.23652229011058806, "num_tokens": 15381945.0, "step": 8460 }, { "entropy": 5.695685482025146, "epoch": 7.27245380318006, "grad_norm": 1.265625, "learning_rate": 0.000141514055761345, "loss": 4.8834, "mean_token_accuracy": 0.24260732531547546, "num_tokens": 15391487.0, "step": 8465 }, { "entropy": 5.741024160385132, "epoch": 7.276751181779114, "grad_norm": 1.4375, "learning_rate": 0.00014124655201014786, "loss": 4.8414, "mean_token_accuracy": 0.24344971179962158, "num_tokens": 15399891.0, "step": 8470 }, { "entropy": 5.6841777801513675, "epoch": 7.281048560378169, "grad_norm": 1.359375, "learning_rate": 0.00014097934032302037, "loss": 4.8381, "mean_token_accuracy": 0.24351507276296616, "num_tokens": 15408693.0, "step": 8475 }, { "entropy": 5.673647069931031, "epoch": 7.2853459389772235, "grad_norm": 1.375, "learning_rate": 0.00014071242128344593, "loss": 4.9228, "mean_token_accuracy": 0.238188037276268, "num_tokens": 15417779.0, "step": 8480 }, { "entropy": 5.6682343006134035, "epoch": 7.289643317576278, "grad_norm": 1.5078125, "learning_rate": 0.0001404457954742691, "loss": 4.845, "mean_token_accuracy": 0.2480306074023247, "num_tokens": 15425826.0, "step": 8485 }, { "entropy": 5.69491925239563, "epoch": 7.2939406961753335, "grad_norm": 1.640625, "learning_rate": 0.00014017946347769423, "loss": 4.914, "mean_token_accuracy": 0.2448977291584015, "num_tokens": 15435811.0, "step": 8490 }, { "entropy": 5.648247766494751, "epoch": 7.298238074774388, "grad_norm": 1.4765625, "learning_rate": 0.00013991342587528377, "loss": 4.8112, "mean_token_accuracy": 0.2435745283961296, "num_tokens": 15444949.0, "step": 8495 }, { "entropy": 5.6486053466796875, "epoch": 7.302535453373443, "grad_norm": 1.375, "learning_rate": 0.00013964768324795752, "loss": 4.8301, "mean_token_accuracy": 0.2504597008228302, "num_tokens": 15453398.0, "step": 8500 }, { "epoch": 7.302535453373443, "eval_entropy": 5.5008021507177265, "eval_loss": 5.913280487060547, "eval_mean_token_accuracy": 0.18130035429924457, "eval_num_tokens": 15453398.0, "eval_runtime": 2.0541, "eval_samples_per_second": 1727.748, "eval_steps_per_second": 216.151, "step": 8500 }, { "entropy": 5.640936183929443, "epoch": 7.306832831972497, "grad_norm": 1.265625, "learning_rate": 0.00013938223617599124, "loss": 4.9141, "mean_token_accuracy": 0.23757578134536744, "num_tokens": 15462785.0, "step": 8505 }, { "entropy": 5.703793859481811, "epoch": 7.311130210571552, "grad_norm": 1.484375, "learning_rate": 0.00013911708523901514, "loss": 4.9328, "mean_token_accuracy": 0.23959697782993317, "num_tokens": 15471718.0, "step": 8510 }, { "entropy": 5.737540197372437, "epoch": 7.315427589170606, "grad_norm": 1.3671875, "learning_rate": 0.00013885223101601303, "loss": 4.8673, "mean_token_accuracy": 0.2403181314468384, "num_tokens": 15480204.0, "step": 8515 }, { "entropy": 5.678928709030151, "epoch": 7.319724967769661, "grad_norm": 1.3515625, "learning_rate": 0.00013858767408532051, "loss": 4.8308, "mean_token_accuracy": 0.24473243802785874, "num_tokens": 15489388.0, "step": 8520 }, { "entropy": 5.666277360916138, "epoch": 7.324022346368715, "grad_norm": 1.453125, "learning_rate": 0.00013832341502462432, "loss": 4.8509, "mean_token_accuracy": 0.2423481523990631, "num_tokens": 15498028.0, "step": 8525 }, { "entropy": 5.703804349899292, "epoch": 7.32831972496777, "grad_norm": 1.5703125, "learning_rate": 0.00013805945441096057, "loss": 4.8826, "mean_token_accuracy": 0.24215862900018692, "num_tokens": 15506382.0, "step": 8530 }, { "entropy": 5.715006399154663, "epoch": 7.332617103566824, "grad_norm": 1.4921875, "learning_rate": 0.00013779579282071364, "loss": 4.9085, "mean_token_accuracy": 0.24137408286333084, "num_tokens": 15515271.0, "step": 8535 }, { "entropy": 5.669809722900391, "epoch": 7.336914482165879, "grad_norm": 1.3359375, "learning_rate": 0.00013753243082961512, "loss": 4.8373, "mean_token_accuracy": 0.24630660563707352, "num_tokens": 15524396.0, "step": 8540 }, { "entropy": 5.671169328689575, "epoch": 7.3412118607649335, "grad_norm": 1.4453125, "learning_rate": 0.00013726936901274246, "loss": 4.816, "mean_token_accuracy": 0.24975510984659194, "num_tokens": 15532829.0, "step": 8545 }, { "entropy": 5.729752159118652, "epoch": 7.345509239363988, "grad_norm": 1.2890625, "learning_rate": 0.0001370066079445174, "loss": 4.9423, "mean_token_accuracy": 0.23485267013311387, "num_tokens": 15541726.0, "step": 8550 }, { "entropy": 5.765802001953125, "epoch": 7.349806617963043, "grad_norm": 1.3203125, "learning_rate": 0.00013674414819870502, "loss": 5.0472, "mean_token_accuracy": 0.22578038275241852, "num_tokens": 15551539.0, "step": 8555 }, { "entropy": 5.687641620635986, "epoch": 7.354103996562097, "grad_norm": 1.484375, "learning_rate": 0.00013648199034841264, "loss": 4.8888, "mean_token_accuracy": 0.23955927342176436, "num_tokens": 15560147.0, "step": 8560 }, { "entropy": 5.673187303543091, "epoch": 7.358401375161152, "grad_norm": 1.4453125, "learning_rate": 0.0001362201349660882, "loss": 4.8612, "mean_token_accuracy": 0.24715079367160797, "num_tokens": 15568983.0, "step": 8565 }, { "entropy": 5.653613901138305, "epoch": 7.362698753760206, "grad_norm": 1.421875, "learning_rate": 0.0001359585826235192, "loss": 4.8706, "mean_token_accuracy": 0.24422087669372558, "num_tokens": 15578065.0, "step": 8570 }, { "entropy": 5.7628312587738035, "epoch": 7.366996132359261, "grad_norm": 1.5390625, "learning_rate": 0.00013569733389183126, "loss": 4.9618, "mean_token_accuracy": 0.2341765359044075, "num_tokens": 15587181.0, "step": 8575 }, { "entropy": 5.705190706253052, "epoch": 7.371293510958315, "grad_norm": 1.3359375, "learning_rate": 0.00013543638934148736, "loss": 4.8957, "mean_token_accuracy": 0.23927247971296312, "num_tokens": 15596602.0, "step": 8580 }, { "entropy": 5.772485971450806, "epoch": 7.37559088955737, "grad_norm": 1.5, "learning_rate": 0.000135175749542286, "loss": 4.9429, "mean_token_accuracy": 0.22380622774362563, "num_tokens": 15605857.0, "step": 8585 }, { "entropy": 5.623779296875, "epoch": 7.379888268156424, "grad_norm": 1.2890625, "learning_rate": 0.0001349154150633604, "loss": 4.8308, "mean_token_accuracy": 0.2533808171749115, "num_tokens": 15615320.0, "step": 8590 }, { "entropy": 5.737985992431641, "epoch": 7.384185646755479, "grad_norm": 1.34375, "learning_rate": 0.000134655386473177, "loss": 4.9824, "mean_token_accuracy": 0.22759506702423096, "num_tokens": 15624193.0, "step": 8595 }, { "entropy": 5.601533651351929, "epoch": 7.3884830253545335, "grad_norm": 1.5234375, "learning_rate": 0.00013439566433953427, "loss": 4.8275, "mean_token_accuracy": 0.25197608172893526, "num_tokens": 15632924.0, "step": 8600 }, { "entropy": 5.694646692276001, "epoch": 7.392780403953588, "grad_norm": 1.4296875, "learning_rate": 0.0001341362492295616, "loss": 4.847, "mean_token_accuracy": 0.24944338649511338, "num_tokens": 15642201.0, "step": 8605 }, { "entropy": 5.757268381118775, "epoch": 7.397077782552643, "grad_norm": 1.5234375, "learning_rate": 0.00013387714170971776, "loss": 4.8893, "mean_token_accuracy": 0.23827150762081145, "num_tokens": 15651608.0, "step": 8610 }, { "entropy": 5.751231670379639, "epoch": 7.401375161151697, "grad_norm": 1.3359375, "learning_rate": 0.00013361834234579012, "loss": 4.9938, "mean_token_accuracy": 0.23161635547876358, "num_tokens": 15661768.0, "step": 8615 }, { "entropy": 5.6181495666503904, "epoch": 7.405672539750752, "grad_norm": 1.4609375, "learning_rate": 0.0001333598517028931, "loss": 4.8745, "mean_token_accuracy": 0.2390742525458336, "num_tokens": 15670270.0, "step": 8620 }, { "entropy": 5.660263729095459, "epoch": 7.409969918349806, "grad_norm": 1.40625, "learning_rate": 0.00013310167034546688, "loss": 4.8491, "mean_token_accuracy": 0.2465496301651001, "num_tokens": 15679587.0, "step": 8625 }, { "entropy": 5.721620082855225, "epoch": 7.414267296948861, "grad_norm": 1.5078125, "learning_rate": 0.0001328437988372763, "loss": 4.9246, "mean_token_accuracy": 0.23698771893978118, "num_tokens": 15688838.0, "step": 8630 }, { "entropy": 5.744182872772217, "epoch": 7.418564675547916, "grad_norm": 1.4140625, "learning_rate": 0.00013258623774140967, "loss": 4.8863, "mean_token_accuracy": 0.24282266497611998, "num_tokens": 15697744.0, "step": 8635 }, { "entropy": 5.69905276298523, "epoch": 7.422862054146971, "grad_norm": 1.4765625, "learning_rate": 0.00013232898762027766, "loss": 4.8515, "mean_token_accuracy": 0.24636502265930177, "num_tokens": 15707643.0, "step": 8640 }, { "entropy": 5.722139835357666, "epoch": 7.427159432746025, "grad_norm": 1.3828125, "learning_rate": 0.00013207204903561154, "loss": 4.9429, "mean_token_accuracy": 0.232541623711586, "num_tokens": 15717568.0, "step": 8645 }, { "entropy": 5.655771064758301, "epoch": 7.43145681134508, "grad_norm": 1.2890625, "learning_rate": 0.00013181542254846247, "loss": 4.8108, "mean_token_accuracy": 0.24847375005483627, "num_tokens": 15726467.0, "step": 8650 }, { "entropy": 5.736402368545532, "epoch": 7.435754189944134, "grad_norm": 1.4453125, "learning_rate": 0.0001315591087192002, "loss": 4.9594, "mean_token_accuracy": 0.23719182014465331, "num_tokens": 15736533.0, "step": 8655 }, { "entropy": 5.628046464920044, "epoch": 7.440051568543189, "grad_norm": 1.328125, "learning_rate": 0.00013130310810751162, "loss": 4.8607, "mean_token_accuracy": 0.2462889164686203, "num_tokens": 15745853.0, "step": 8660 }, { "entropy": 5.74335241317749, "epoch": 7.4443489471422435, "grad_norm": 1.4375, "learning_rate": 0.00013104742127239983, "loss": 4.9821, "mean_token_accuracy": 0.23338729590177537, "num_tokens": 15755534.0, "step": 8665 }, { "entropy": 5.772054052352905, "epoch": 7.448646325741298, "grad_norm": 1.2109375, "learning_rate": 0.0001307920487721826, "loss": 4.9678, "mean_token_accuracy": 0.232174876332283, "num_tokens": 15766182.0, "step": 8670 }, { "entropy": 5.706271362304688, "epoch": 7.452943704340353, "grad_norm": 1.421875, "learning_rate": 0.00013053699116449144, "loss": 4.8787, "mean_token_accuracy": 0.23836376070976256, "num_tokens": 15775454.0, "step": 8675 }, { "entropy": 5.858646392822266, "epoch": 7.457241082939407, "grad_norm": 1.296875, "learning_rate": 0.00013028224900627026, "loss": 4.9947, "mean_token_accuracy": 0.22622087746858596, "num_tokens": 15784768.0, "step": 8680 }, { "entropy": 5.753428220748901, "epoch": 7.461538461538462, "grad_norm": 1.125, "learning_rate": 0.00013002782285377395, "loss": 4.9381, "mean_token_accuracy": 0.2409772902727127, "num_tokens": 15794255.0, "step": 8685 }, { "entropy": 5.725694417953491, "epoch": 7.465835840137516, "grad_norm": 1.5234375, "learning_rate": 0.0001297737132625677, "loss": 4.9665, "mean_token_accuracy": 0.23175050765275956, "num_tokens": 15803722.0, "step": 8690 }, { "entropy": 5.731313705444336, "epoch": 7.470133218736571, "grad_norm": 1.6015625, "learning_rate": 0.00012951992078752528, "loss": 4.9028, "mean_token_accuracy": 0.23898655623197557, "num_tokens": 15811819.0, "step": 8695 }, { "entropy": 5.703043031692505, "epoch": 7.474430597335625, "grad_norm": 1.296875, "learning_rate": 0.00012926644598282798, "loss": 4.9245, "mean_token_accuracy": 0.2335854396224022, "num_tokens": 15821446.0, "step": 8700 }, { "entropy": 5.735653400421143, "epoch": 7.47872797593468, "grad_norm": 1.28125, "learning_rate": 0.0001290132894019634, "loss": 4.9445, "mean_token_accuracy": 0.24317347705364228, "num_tokens": 15830585.0, "step": 8705 }, { "entropy": 5.7284379482269285, "epoch": 7.483025354533734, "grad_norm": 1.359375, "learning_rate": 0.00012876045159772442, "loss": 4.9058, "mean_token_accuracy": 0.24099535942077638, "num_tokens": 15838872.0, "step": 8710 }, { "entropy": 5.761771535873413, "epoch": 7.487322733132789, "grad_norm": 1.453125, "learning_rate": 0.00012850793312220766, "loss": 4.9076, "mean_token_accuracy": 0.23802259117364882, "num_tokens": 15847561.0, "step": 8715 }, { "entropy": 5.646856451034546, "epoch": 7.4916201117318435, "grad_norm": 1.265625, "learning_rate": 0.00012825573452681266, "loss": 4.8555, "mean_token_accuracy": 0.25125192701816557, "num_tokens": 15856405.0, "step": 8720 }, { "entropy": 5.72300238609314, "epoch": 7.495917490330898, "grad_norm": 1.53125, "learning_rate": 0.00012800385636224017, "loss": 4.9081, "mean_token_accuracy": 0.24140879213809968, "num_tokens": 15865856.0, "step": 8725 }, { "entropy": 5.711844491958618, "epoch": 7.500214868929953, "grad_norm": 1.5078125, "learning_rate": 0.00012775229917849162, "loss": 4.8527, "mean_token_accuracy": 0.24171538800001144, "num_tokens": 15873605.0, "step": 8730 }, { "entropy": 5.730578804016114, "epoch": 7.504512247529007, "grad_norm": 1.3203125, "learning_rate": 0.00012750106352486728, "loss": 4.8656, "mean_token_accuracy": 0.24494647234678268, "num_tokens": 15883123.0, "step": 8735 }, { "entropy": 5.6782575130462645, "epoch": 7.508809626128062, "grad_norm": 1.203125, "learning_rate": 0.00012725014994996534, "loss": 4.9047, "mean_token_accuracy": 0.24687566012144088, "num_tokens": 15892713.0, "step": 8740 }, { "entropy": 5.758023262023926, "epoch": 7.513107004727116, "grad_norm": 1.3203125, "learning_rate": 0.00012699955900168075, "loss": 4.9081, "mean_token_accuracy": 0.23785718083381652, "num_tokens": 15902913.0, "step": 8745 }, { "entropy": 5.727906036376953, "epoch": 7.517404383326171, "grad_norm": 1.359375, "learning_rate": 0.00012674929122720414, "loss": 4.9398, "mean_token_accuracy": 0.2300073966383934, "num_tokens": 15912721.0, "step": 8750 }, { "entropy": 5.644121694564819, "epoch": 7.521701761925225, "grad_norm": 1.2890625, "learning_rate": 0.0001264993471730202, "loss": 4.833, "mean_token_accuracy": 0.24297845661640166, "num_tokens": 15921520.0, "step": 8755 }, { "entropy": 5.747539854049682, "epoch": 7.52599914052428, "grad_norm": 1.3203125, "learning_rate": 0.00012624972738490675, "loss": 4.9571, "mean_token_accuracy": 0.23408962190151214, "num_tokens": 15930753.0, "step": 8760 }, { "entropy": 5.780120038986206, "epoch": 7.530296519123334, "grad_norm": 1.3203125, "learning_rate": 0.00012600043240793368, "loss": 4.9331, "mean_token_accuracy": 0.2341680034995079, "num_tokens": 15939957.0, "step": 8765 }, { "entropy": 5.723352432250977, "epoch": 7.534593897722389, "grad_norm": 1.453125, "learning_rate": 0.00012575146278646175, "loss": 4.8624, "mean_token_accuracy": 0.24073042422533036, "num_tokens": 15949555.0, "step": 8770 }, { "entropy": 5.667313003540039, "epoch": 7.5388912763214435, "grad_norm": 1.3203125, "learning_rate": 0.00012550281906414097, "loss": 4.8799, "mean_token_accuracy": 0.23746145516633987, "num_tokens": 15958395.0, "step": 8775 }, { "entropy": 5.7248999118804935, "epoch": 7.543188654920499, "grad_norm": 1.3203125, "learning_rate": 0.00012525450178390972, "loss": 4.9127, "mean_token_accuracy": 0.24568843394517897, "num_tokens": 15967522.0, "step": 8780 }, { "entropy": 5.747699928283692, "epoch": 7.547486033519553, "grad_norm": 1.4140625, "learning_rate": 0.0001250065114879939, "loss": 4.9025, "mean_token_accuracy": 0.23830792605876921, "num_tokens": 15976311.0, "step": 8785 }, { "entropy": 5.6730828285217285, "epoch": 7.551783412118608, "grad_norm": 1.375, "learning_rate": 0.00012475884871790505, "loss": 4.8426, "mean_token_accuracy": 0.24330639988183975, "num_tokens": 15985202.0, "step": 8790 }, { "entropy": 5.788040351867676, "epoch": 7.556080790717663, "grad_norm": 1.421875, "learning_rate": 0.00012451151401443982, "loss": 4.9778, "mean_token_accuracy": 0.22562479078769684, "num_tokens": 15995043.0, "step": 8795 }, { "entropy": 5.629932451248169, "epoch": 7.560378169316717, "grad_norm": 1.3828125, "learning_rate": 0.00012426450791767815, "loss": 4.8205, "mean_token_accuracy": 0.24828920215368272, "num_tokens": 16004355.0, "step": 8800 }, { "entropy": 5.700480365753174, "epoch": 7.564675547915772, "grad_norm": 1.4296875, "learning_rate": 0.00012401783096698283, "loss": 4.7502, "mean_token_accuracy": 0.24835503846406937, "num_tokens": 16013069.0, "step": 8805 }, { "entropy": 5.706959199905396, "epoch": 7.568972926514826, "grad_norm": 1.28125, "learning_rate": 0.00012377148370099764, "loss": 4.9231, "mean_token_accuracy": 0.2306264817714691, "num_tokens": 16023757.0, "step": 8810 }, { "entropy": 5.7141008377075195, "epoch": 7.573270305113881, "grad_norm": 1.40625, "learning_rate": 0.00012352546665764642, "loss": 4.9245, "mean_token_accuracy": 0.23618121743202208, "num_tokens": 16032550.0, "step": 8815 }, { "entropy": 5.592525005340576, "epoch": 7.577567683712935, "grad_norm": 1.3359375, "learning_rate": 0.00012327978037413219, "loss": 4.8005, "mean_token_accuracy": 0.25415861159563063, "num_tokens": 16041580.0, "step": 8820 }, { "entropy": 5.6945716381073, "epoch": 7.58186506231199, "grad_norm": 1.4765625, "learning_rate": 0.00012303442538693564, "loss": 4.9079, "mean_token_accuracy": 0.23844119608402253, "num_tokens": 16049845.0, "step": 8825 }, { "entropy": 5.649405431747437, "epoch": 7.586162440911044, "grad_norm": 1.484375, "learning_rate": 0.00012278940223181393, "loss": 4.8096, "mean_token_accuracy": 0.23990656286478043, "num_tokens": 16059703.0, "step": 8830 }, { "entropy": 5.67500786781311, "epoch": 7.590459819510099, "grad_norm": 1.3359375, "learning_rate": 0.00012254471144379964, "loss": 4.7812, "mean_token_accuracy": 0.2586831733584404, "num_tokens": 16068416.0, "step": 8835 }, { "entropy": 5.6926109313964846, "epoch": 7.5947571981091535, "grad_norm": 1.34375, "learning_rate": 0.00012230035355719968, "loss": 4.9417, "mean_token_accuracy": 0.23565699011087418, "num_tokens": 16078067.0, "step": 8840 }, { "entropy": 5.718332052230835, "epoch": 7.599054576708208, "grad_norm": 1.3359375, "learning_rate": 0.0001220563291055941, "loss": 4.8999, "mean_token_accuracy": 0.24132361114025117, "num_tokens": 16086591.0, "step": 8845 }, { "entropy": 5.74136209487915, "epoch": 7.603351955307263, "grad_norm": 1.40625, "learning_rate": 0.0001218126386218347, "loss": 4.9064, "mean_token_accuracy": 0.2423307090997696, "num_tokens": 16096138.0, "step": 8850 }, { "entropy": 5.653201866149902, "epoch": 7.607649333906317, "grad_norm": 1.46875, "learning_rate": 0.00012156928263804403, "loss": 4.839, "mean_token_accuracy": 0.24370431303977966, "num_tokens": 16105182.0, "step": 8855 }, { "entropy": 5.720748567581177, "epoch": 7.611946712505372, "grad_norm": 1.515625, "learning_rate": 0.0001213262616856144, "loss": 4.9646, "mean_token_accuracy": 0.23470364809036254, "num_tokens": 16113940.0, "step": 8860 }, { "entropy": 5.711144542694091, "epoch": 7.616244091104426, "grad_norm": 1.5078125, "learning_rate": 0.00012108357629520635, "loss": 4.8594, "mean_token_accuracy": 0.2404816433787346, "num_tokens": 16123036.0, "step": 8865 }, { "entropy": 5.682049751281738, "epoch": 7.620541469703481, "grad_norm": 1.4296875, "learning_rate": 0.00012084122699674785, "loss": 4.8693, "mean_token_accuracy": 0.24037092477083205, "num_tokens": 16131057.0, "step": 8870 }, { "entropy": 5.707250261306763, "epoch": 7.624838848302535, "grad_norm": 1.4921875, "learning_rate": 0.00012059921431943278, "loss": 4.9119, "mean_token_accuracy": 0.23492788076400756, "num_tokens": 16140259.0, "step": 8875 }, { "entropy": 5.750310230255127, "epoch": 7.62913622690159, "grad_norm": 1.5234375, "learning_rate": 0.00012035753879172026, "loss": 5.0946, "mean_token_accuracy": 0.22113668769598008, "num_tokens": 16149585.0, "step": 8880 }, { "entropy": 5.6439436912536625, "epoch": 7.633433605500644, "grad_norm": 1.453125, "learning_rate": 0.00012011620094133296, "loss": 4.7415, "mean_token_accuracy": 0.2517879784107208, "num_tokens": 16157656.0, "step": 8885 }, { "entropy": 5.633669376373291, "epoch": 7.637730984099699, "grad_norm": 1.359375, "learning_rate": 0.00011987520129525622, "loss": 4.8953, "mean_token_accuracy": 0.2355537548661232, "num_tokens": 16166900.0, "step": 8890 }, { "entropy": 5.7096264362335205, "epoch": 7.6420283626987535, "grad_norm": 1.546875, "learning_rate": 0.000119634540379737, "loss": 4.9149, "mean_token_accuracy": 0.23628997951745986, "num_tokens": 16174859.0, "step": 8895 }, { "entropy": 5.737153196334839, "epoch": 7.646325741297808, "grad_norm": 1.421875, "learning_rate": 0.00011939421872028262, "loss": 4.9069, "mean_token_accuracy": 0.23722454458475112, "num_tokens": 16183660.0, "step": 8900 }, { "entropy": 5.666551923751831, "epoch": 7.650623119896863, "grad_norm": 1.3984375, "learning_rate": 0.00011915423684165948, "loss": 4.8487, "mean_token_accuracy": 0.24358074367046356, "num_tokens": 16192344.0, "step": 8905 }, { "entropy": 5.7381829738616945, "epoch": 7.654920498495917, "grad_norm": 1.328125, "learning_rate": 0.00011891459526789198, "loss": 4.9234, "mean_token_accuracy": 0.23702718764543534, "num_tokens": 16202060.0, "step": 8910 }, { "entropy": 5.74292402267456, "epoch": 7.659217877094972, "grad_norm": 1.359375, "learning_rate": 0.0001186752945222616, "loss": 4.9217, "mean_token_accuracy": 0.23749222308397294, "num_tokens": 16211297.0, "step": 8915 }, { "entropy": 5.719897603988647, "epoch": 7.663515255694026, "grad_norm": 1.265625, "learning_rate": 0.00011843633512730562, "loss": 4.8646, "mean_token_accuracy": 0.24102884978055955, "num_tokens": 16219812.0, "step": 8920 }, { "entropy": 5.695969915390014, "epoch": 7.667812634293082, "grad_norm": 1.34375, "learning_rate": 0.00011819771760481576, "loss": 4.8765, "mean_token_accuracy": 0.24786355644464492, "num_tokens": 16229197.0, "step": 8925 }, { "entropy": 5.634260749816894, "epoch": 7.672110012892135, "grad_norm": 1.4140625, "learning_rate": 0.00011795944247583725, "loss": 4.8107, "mean_token_accuracy": 0.2450357496738434, "num_tokens": 16238154.0, "step": 8930 }, { "entropy": 5.663767862319946, "epoch": 7.676407391491191, "grad_norm": 1.2890625, "learning_rate": 0.00011772151026066789, "loss": 4.8537, "mean_token_accuracy": 0.23516058027744294, "num_tokens": 16247206.0, "step": 8935 }, { "entropy": 5.71944785118103, "epoch": 7.680704770090245, "grad_norm": 1.3671875, "learning_rate": 0.00011748392147885642, "loss": 4.972, "mean_token_accuracy": 0.23327646702528, "num_tokens": 16256571.0, "step": 8940 }, { "entropy": 5.712430381774903, "epoch": 7.6850021486893, "grad_norm": 1.6171875, "learning_rate": 0.00011724667664920177, "loss": 4.9113, "mean_token_accuracy": 0.23777286261320113, "num_tokens": 16265429.0, "step": 8945 }, { "entropy": 5.760171937942505, "epoch": 7.689299527288354, "grad_norm": 1.46875, "learning_rate": 0.00011700977628975183, "loss": 5.0088, "mean_token_accuracy": 0.22751238495111464, "num_tokens": 16273804.0, "step": 8950 }, { "entropy": 5.7118391513824465, "epoch": 7.693596905887409, "grad_norm": 1.453125, "learning_rate": 0.00011677322091780243, "loss": 4.928, "mean_token_accuracy": 0.23360252976417542, "num_tokens": 16282894.0, "step": 8955 }, { "entropy": 5.762385988235474, "epoch": 7.6978942844864635, "grad_norm": 1.390625, "learning_rate": 0.0001165370110498958, "loss": 4.9468, "mean_token_accuracy": 0.2344141960144043, "num_tokens": 16291568.0, "step": 8960 }, { "entropy": 5.75871696472168, "epoch": 7.702191663085518, "grad_norm": 1.3671875, "learning_rate": 0.00011630114720181989, "loss": 4.943, "mean_token_accuracy": 0.23351782113313674, "num_tokens": 16300650.0, "step": 8965 }, { "entropy": 5.710498237609864, "epoch": 7.706489041684573, "grad_norm": 1.40625, "learning_rate": 0.00011606562988860711, "loss": 4.8749, "mean_token_accuracy": 0.23832377195358276, "num_tokens": 16309712.0, "step": 8970 }, { "entropy": 5.69747223854065, "epoch": 7.710786420283627, "grad_norm": 1.25, "learning_rate": 0.0001158304596245332, "loss": 4.8552, "mean_token_accuracy": 0.24203347712755202, "num_tokens": 16319440.0, "step": 8975 }, { "entropy": 5.64069881439209, "epoch": 7.715083798882682, "grad_norm": 1.3359375, "learning_rate": 0.00011559563692311595, "loss": 4.8583, "mean_token_accuracy": 0.24638050347566604, "num_tokens": 16328752.0, "step": 8980 }, { "entropy": 5.687512683868408, "epoch": 7.719381177481736, "grad_norm": 1.34375, "learning_rate": 0.00011536116229711422, "loss": 4.8477, "mean_token_accuracy": 0.24722600281238555, "num_tokens": 16338045.0, "step": 8985 }, { "entropy": 5.758387184143066, "epoch": 7.723678556080791, "grad_norm": 1.359375, "learning_rate": 0.000115127036258527, "loss": 4.9253, "mean_token_accuracy": 0.23546791225671768, "num_tokens": 16347174.0, "step": 8990 }, { "entropy": 5.679119014739991, "epoch": 7.727975934679845, "grad_norm": 1.46875, "learning_rate": 0.00011489325931859185, "loss": 4.7787, "mean_token_accuracy": 0.25743198245763776, "num_tokens": 16355371.0, "step": 8995 }, { "entropy": 5.682134628295898, "epoch": 7.7322733132789, "grad_norm": 1.34375, "learning_rate": 0.0001146598319877843, "loss": 4.8576, "mean_token_accuracy": 0.24060671776533127, "num_tokens": 16363938.0, "step": 9000 }, { "epoch": 7.7322733132789, "eval_entropy": 5.50046287893175, "eval_loss": 5.91103458404541, "eval_mean_token_accuracy": 0.1812090568814997, "eval_num_tokens": 16363938.0, "eval_runtime": 2.0492, "eval_samples_per_second": 1731.916, "eval_steps_per_second": 216.673, "step": 9000 }, { "entropy": 5.69020528793335, "epoch": 7.736570691877954, "grad_norm": 1.53125, "learning_rate": 0.00011442675477581621, "loss": 4.9217, "mean_token_accuracy": 0.23781170547008515, "num_tokens": 16373110.0, "step": 9005 }, { "entropy": 5.713038206100464, "epoch": 7.740868070477009, "grad_norm": 1.3515625, "learning_rate": 0.0001141940281916352, "loss": 4.8384, "mean_token_accuracy": 0.23741564452648162, "num_tokens": 16381521.0, "step": 9010 }, { "entropy": 5.757988119125367, "epoch": 7.7451654490760635, "grad_norm": 1.3359375, "learning_rate": 0.00011396165274342304, "loss": 4.9514, "mean_token_accuracy": 0.23201918452978135, "num_tokens": 16391322.0, "step": 9015 }, { "entropy": 5.672502326965332, "epoch": 7.749462827675118, "grad_norm": 1.296875, "learning_rate": 0.00011372962893859471, "loss": 4.8857, "mean_token_accuracy": 0.23982396721839905, "num_tokens": 16400653.0, "step": 9020 }, { "entropy": 5.67963056564331, "epoch": 7.753760206274173, "grad_norm": 1.390625, "learning_rate": 0.00011349795728379759, "loss": 4.9017, "mean_token_accuracy": 0.23146291971206664, "num_tokens": 16410133.0, "step": 9025 }, { "entropy": 5.738080835342407, "epoch": 7.758057584873227, "grad_norm": 1.375, "learning_rate": 0.00011326663828491, "loss": 4.9688, "mean_token_accuracy": 0.23214154988527297, "num_tokens": 16419302.0, "step": 9030 }, { "entropy": 5.694006776809692, "epoch": 7.762354963472282, "grad_norm": 1.5078125, "learning_rate": 0.00011303567244704015, "loss": 4.9394, "mean_token_accuracy": 0.24590874761343, "num_tokens": 16428020.0, "step": 9035 }, { "entropy": 5.726604032516479, "epoch": 7.766652342071336, "grad_norm": 1.375, "learning_rate": 0.00011280506027452502, "loss": 5.0101, "mean_token_accuracy": 0.22551458925008774, "num_tokens": 16438033.0, "step": 9040 }, { "entropy": 5.740489101409912, "epoch": 7.770949720670391, "grad_norm": 1.5, "learning_rate": 0.0001125748022709295, "loss": 4.9396, "mean_token_accuracy": 0.24004841595888138, "num_tokens": 16447067.0, "step": 9045 }, { "entropy": 5.762021446228028, "epoch": 7.775247099269445, "grad_norm": 1.3359375, "learning_rate": 0.00011234489893904509, "loss": 4.957, "mean_token_accuracy": 0.22969225496053697, "num_tokens": 16457146.0, "step": 9050 }, { "entropy": 5.696189260482788, "epoch": 7.7795444778685, "grad_norm": 1.3671875, "learning_rate": 0.00011211535078088869, "loss": 4.8183, "mean_token_accuracy": 0.2493802011013031, "num_tokens": 16466428.0, "step": 9055 }, { "entropy": 5.687802410125732, "epoch": 7.783841856467554, "grad_norm": 1.4453125, "learning_rate": 0.00011188615829770171, "loss": 4.867, "mean_token_accuracy": 0.242595311999321, "num_tokens": 16474198.0, "step": 9060 }, { "entropy": 5.6849202632904055, "epoch": 7.788139235066609, "grad_norm": 1.453125, "learning_rate": 0.00011165732198994905, "loss": 4.8999, "mean_token_accuracy": 0.23992180526256562, "num_tokens": 16483464.0, "step": 9065 }, { "entropy": 5.681135511398315, "epoch": 7.792436613665664, "grad_norm": 1.46875, "learning_rate": 0.00011142884235731756, "loss": 4.8742, "mean_token_accuracy": 0.24454833716154098, "num_tokens": 16492619.0, "step": 9070 }, { "entropy": 5.7551130771636965, "epoch": 7.796733992264718, "grad_norm": 1.5859375, "learning_rate": 0.00011120071989871564, "loss": 4.9718, "mean_token_accuracy": 0.24016929417848587, "num_tokens": 16501690.0, "step": 9075 }, { "entropy": 5.639516925811767, "epoch": 7.8010313708637735, "grad_norm": 1.2578125, "learning_rate": 0.00011097295511227134, "loss": 4.8645, "mean_token_accuracy": 0.24110034853219986, "num_tokens": 16510158.0, "step": 9080 }, { "entropy": 5.668501472473144, "epoch": 7.805328749462827, "grad_norm": 1.3828125, "learning_rate": 0.0001107455484953321, "loss": 4.8698, "mean_token_accuracy": 0.23620172441005707, "num_tokens": 16518722.0, "step": 9085 }, { "entropy": 5.657008075714112, "epoch": 7.809626128061883, "grad_norm": 1.40625, "learning_rate": 0.00011051850054446306, "loss": 4.882, "mean_token_accuracy": 0.24305418133735657, "num_tokens": 16527404.0, "step": 9090 }, { "entropy": 5.6321070194244385, "epoch": 7.813923506660937, "grad_norm": 1.5078125, "learning_rate": 0.00011029181175544603, "loss": 4.8245, "mean_token_accuracy": 0.25041354447603226, "num_tokens": 16536210.0, "step": 9095 }, { "entropy": 5.769088315963745, "epoch": 7.818220885259992, "grad_norm": 1.53125, "learning_rate": 0.00011006548262327884, "loss": 4.9854, "mean_token_accuracy": 0.23913427740335463, "num_tokens": 16544707.0, "step": 9100 }, { "entropy": 5.764179706573486, "epoch": 7.822518263859046, "grad_norm": 1.328125, "learning_rate": 0.0001098395136421739, "loss": 4.9391, "mean_token_accuracy": 0.22906827330589294, "num_tokens": 16553883.0, "step": 9105 }, { "entropy": 5.7137744426727295, "epoch": 7.826815642458101, "grad_norm": 1.2890625, "learning_rate": 0.00010961390530555712, "loss": 4.9357, "mean_token_accuracy": 0.2278410956263542, "num_tokens": 16562537.0, "step": 9110 }, { "entropy": 5.699681091308594, "epoch": 7.831113021057155, "grad_norm": 1.3359375, "learning_rate": 0.00010938865810606682, "loss": 4.9009, "mean_token_accuracy": 0.24008741080760956, "num_tokens": 16571665.0, "step": 9115 }, { "entropy": 5.712159013748169, "epoch": 7.83541039965621, "grad_norm": 1.296875, "learning_rate": 0.00010916377253555293, "loss": 4.9065, "mean_token_accuracy": 0.23339497298002243, "num_tokens": 16581102.0, "step": 9120 }, { "entropy": 5.671864986419678, "epoch": 7.839707778255264, "grad_norm": 1.4140625, "learning_rate": 0.00010893924908507573, "loss": 4.865, "mean_token_accuracy": 0.24521686434745787, "num_tokens": 16589958.0, "step": 9125 }, { "entropy": 5.765967130661011, "epoch": 7.844005156854319, "grad_norm": 1.4765625, "learning_rate": 0.0001087150882449046, "loss": 4.9268, "mean_token_accuracy": 0.2391319289803505, "num_tokens": 16598800.0, "step": 9130 }, { "entropy": 5.707003164291382, "epoch": 7.8483025354533735, "grad_norm": 1.3671875, "learning_rate": 0.00010849129050451717, "loss": 4.8968, "mean_token_accuracy": 0.23445421308279038, "num_tokens": 16607751.0, "step": 9135 }, { "entropy": 5.652349233627319, "epoch": 7.852599914052428, "grad_norm": 1.3125, "learning_rate": 0.00010826785635259842, "loss": 4.8453, "mean_token_accuracy": 0.24450734853744507, "num_tokens": 16616041.0, "step": 9140 }, { "entropy": 5.660851383209229, "epoch": 7.856897292651483, "grad_norm": 1.5625, "learning_rate": 0.00010804478627703903, "loss": 4.8055, "mean_token_accuracy": 0.2569776579737663, "num_tokens": 16624800.0, "step": 9145 }, { "entropy": 5.752418375015258, "epoch": 7.861194671250537, "grad_norm": 1.7109375, "learning_rate": 0.00010782208076493508, "loss": 4.9431, "mean_token_accuracy": 0.23038492798805238, "num_tokens": 16632808.0, "step": 9150 }, { "entropy": 5.7448986053466795, "epoch": 7.865492049849592, "grad_norm": 1.3984375, "learning_rate": 0.00010759974030258621, "loss": 4.906, "mean_token_accuracy": 0.23768896460533143, "num_tokens": 16641179.0, "step": 9155 }, { "entropy": 5.688127946853638, "epoch": 7.869789428448646, "grad_norm": 1.2421875, "learning_rate": 0.00010737776537549531, "loss": 4.8952, "mean_token_accuracy": 0.24468690007925034, "num_tokens": 16650402.0, "step": 9160 }, { "entropy": 5.64280138015747, "epoch": 7.874086807047701, "grad_norm": 1.4296875, "learning_rate": 0.00010715615646836679, "loss": 4.8555, "mean_token_accuracy": 0.2418656662106514, "num_tokens": 16659661.0, "step": 9165 }, { "entropy": 5.633242177963257, "epoch": 7.878384185646755, "grad_norm": 1.296875, "learning_rate": 0.00010693491406510585, "loss": 4.8508, "mean_token_accuracy": 0.2495250001549721, "num_tokens": 16668630.0, "step": 9170 }, { "entropy": 5.727669334411621, "epoch": 7.88268156424581, "grad_norm": 1.4296875, "learning_rate": 0.00010671403864881757, "loss": 4.9439, "mean_token_accuracy": 0.2366262823343277, "num_tokens": 16678023.0, "step": 9175 }, { "entropy": 5.733814907073975, "epoch": 7.8869789428448644, "grad_norm": 1.3359375, "learning_rate": 0.00010649353070180562, "loss": 4.9239, "mean_token_accuracy": 0.24506820291280745, "num_tokens": 16686751.0, "step": 9180 }, { "entropy": 5.7177135944366455, "epoch": 7.891276321443919, "grad_norm": 1.3515625, "learning_rate": 0.00010627339070557118, "loss": 4.9189, "mean_token_accuracy": 0.23672835975885392, "num_tokens": 16696672.0, "step": 9185 }, { "entropy": 5.688603305816651, "epoch": 7.8955737000429735, "grad_norm": 1.3671875, "learning_rate": 0.00010605361914081194, "loss": 4.8098, "mean_token_accuracy": 0.24818727225065232, "num_tokens": 16706018.0, "step": 9190 }, { "entropy": 5.71311993598938, "epoch": 7.899871078642028, "grad_norm": 1.21875, "learning_rate": 0.00010583421648742125, "loss": 4.8634, "mean_token_accuracy": 0.2426785260438919, "num_tokens": 16715206.0, "step": 9195 }, { "entropy": 5.69930329322815, "epoch": 7.904168457241083, "grad_norm": 1.4296875, "learning_rate": 0.00010561518322448673, "loss": 4.9517, "mean_token_accuracy": 0.23524065911769867, "num_tokens": 16724479.0, "step": 9200 }, { "entropy": 5.6606029033660885, "epoch": 7.908465835840137, "grad_norm": 1.484375, "learning_rate": 0.00010539651983028955, "loss": 4.8302, "mean_token_accuracy": 0.24595999121665954, "num_tokens": 16733304.0, "step": 9205 }, { "entropy": 5.674201583862304, "epoch": 7.912763214439192, "grad_norm": 1.5, "learning_rate": 0.0001051782267823031, "loss": 4.8306, "mean_token_accuracy": 0.24602922052145004, "num_tokens": 16741447.0, "step": 9210 }, { "entropy": 5.695632266998291, "epoch": 7.917060593038247, "grad_norm": 1.4765625, "learning_rate": 0.00010496030455719225, "loss": 4.8963, "mean_token_accuracy": 0.23516589403152466, "num_tokens": 16751487.0, "step": 9215 }, { "entropy": 5.6963306903839115, "epoch": 7.921357971637301, "grad_norm": 1.421875, "learning_rate": 0.00010474275363081193, "loss": 4.8813, "mean_token_accuracy": 0.23939020037651063, "num_tokens": 16760795.0, "step": 9220 }, { "entropy": 5.683042097091675, "epoch": 7.925655350236356, "grad_norm": 1.46875, "learning_rate": 0.0001045255744782064, "loss": 4.8792, "mean_token_accuracy": 0.24189150482416152, "num_tokens": 16769639.0, "step": 9225 }, { "entropy": 5.759686517715454, "epoch": 7.92995272883541, "grad_norm": 1.3515625, "learning_rate": 0.00010430876757360817, "loss": 4.9654, "mean_token_accuracy": 0.23323026299476624, "num_tokens": 16779195.0, "step": 9230 }, { "entropy": 5.640351009368897, "epoch": 7.934250107434465, "grad_norm": 1.4375, "learning_rate": 0.00010409233339043694, "loss": 4.7822, "mean_token_accuracy": 0.25166461169719695, "num_tokens": 16787531.0, "step": 9235 }, { "entropy": 5.643196535110474, "epoch": 7.93854748603352, "grad_norm": 1.453125, "learning_rate": 0.00010387627240129838, "loss": 4.8377, "mean_token_accuracy": 0.24781358987092972, "num_tokens": 16796392.0, "step": 9240 }, { "entropy": 5.7032732486724855, "epoch": 7.942844864632574, "grad_norm": 1.265625, "learning_rate": 0.00010366058507798326, "loss": 4.8837, "mean_token_accuracy": 0.23703473657369614, "num_tokens": 16804942.0, "step": 9245 }, { "entropy": 5.685716962814331, "epoch": 7.947142243231629, "grad_norm": 1.3671875, "learning_rate": 0.00010344527189146655, "loss": 4.9321, "mean_token_accuracy": 0.23355796337127685, "num_tokens": 16813754.0, "step": 9250 }, { "entropy": 5.715789413452148, "epoch": 7.9514396218306835, "grad_norm": 1.265625, "learning_rate": 0.00010323033331190626, "loss": 4.9042, "mean_token_accuracy": 0.23985962867736815, "num_tokens": 16823010.0, "step": 9255 }, { "entropy": 5.716991758346557, "epoch": 7.955737000429738, "grad_norm": 1.40625, "learning_rate": 0.00010301576980864228, "loss": 4.8258, "mean_token_accuracy": 0.25040524303913114, "num_tokens": 16831909.0, "step": 9260 }, { "entropy": 5.622262763977051, "epoch": 7.960034379028793, "grad_norm": 1.4765625, "learning_rate": 0.00010280158185019547, "loss": 4.8755, "mean_token_accuracy": 0.24880994856357574, "num_tokens": 16841460.0, "step": 9265 }, { "entropy": 5.675008153915405, "epoch": 7.964331757627847, "grad_norm": 1.2265625, "learning_rate": 0.00010258776990426686, "loss": 4.8917, "mean_token_accuracy": 0.24589523673057556, "num_tokens": 16850592.0, "step": 9270 }, { "entropy": 5.697416830062866, "epoch": 7.968629136226902, "grad_norm": 1.53125, "learning_rate": 0.00010237433443773612, "loss": 4.8994, "mean_token_accuracy": 0.2388192519545555, "num_tokens": 16859736.0, "step": 9275 }, { "entropy": 5.644947481155396, "epoch": 7.972926514825956, "grad_norm": 1.234375, "learning_rate": 0.00010216127591666115, "loss": 4.9009, "mean_token_accuracy": 0.25150825679302213, "num_tokens": 16870084.0, "step": 9280 }, { "entropy": 5.69167685508728, "epoch": 7.977223893425011, "grad_norm": 1.6484375, "learning_rate": 0.00010194859480627648, "loss": 4.9116, "mean_token_accuracy": 0.2423036977648735, "num_tokens": 16877771.0, "step": 9285 }, { "entropy": 5.679421663284302, "epoch": 7.981521272024065, "grad_norm": 1.25, "learning_rate": 0.00010173629157099279, "loss": 4.9116, "mean_token_accuracy": 0.2358901694417, "num_tokens": 16887487.0, "step": 9290 }, { "entropy": 5.679394340515136, "epoch": 7.98581865062312, "grad_norm": 1.25, "learning_rate": 0.00010152436667439537, "loss": 4.91, "mean_token_accuracy": 0.24376949667930603, "num_tokens": 16897286.0, "step": 9295 }, { "entropy": 5.705884504318237, "epoch": 7.9901160292221745, "grad_norm": 1.40625, "learning_rate": 0.00010131282057924345, "loss": 4.85, "mean_token_accuracy": 0.24412865936756134, "num_tokens": 16905968.0, "step": 9300 }, { "entropy": 5.732138919830322, "epoch": 7.994413407821229, "grad_norm": 1.34375, "learning_rate": 0.00010110165374746924, "loss": 4.8701, "mean_token_accuracy": 0.24227845817804336, "num_tokens": 16914604.0, "step": 9305 }, { "entropy": 5.702543640136719, "epoch": 7.9987107864202835, "grad_norm": 1.3671875, "learning_rate": 0.00010089086664017674, "loss": 5.0117, "mean_token_accuracy": 0.23315883576869964, "num_tokens": 16925085.0, "step": 9310 }, { "entropy": 5.676374594370524, "epoch": 8.002578427159433, "grad_norm": 1.390625, "learning_rate": 0.00010068045971764067, "loss": 4.8175, "mean_token_accuracy": 0.2474090274837282, "num_tokens": 16932717.0, "step": 9315 }, { "entropy": 5.737140417098999, "epoch": 8.006875805758487, "grad_norm": 1.421875, "learning_rate": 0.00010047043343930561, "loss": 4.9202, "mean_token_accuracy": 0.23843726366758347, "num_tokens": 16941332.0, "step": 9320 }, { "entropy": 5.6835075378417965, "epoch": 8.011173184357542, "grad_norm": 1.421875, "learning_rate": 0.00010026078826378502, "loss": 4.7602, "mean_token_accuracy": 0.2514714956283569, "num_tokens": 16949732.0, "step": 9325 }, { "entropy": 5.720363521575928, "epoch": 8.015470562956596, "grad_norm": 1.3359375, "learning_rate": 0.00010005152464886031, "loss": 4.7948, "mean_token_accuracy": 0.2467312902212143, "num_tokens": 16958013.0, "step": 9330 }, { "entropy": 5.649158191680908, "epoch": 8.019767941555651, "grad_norm": 1.3125, "learning_rate": 9.984264305147941e-05, "loss": 4.7621, "mean_token_accuracy": 0.25033883154392245, "num_tokens": 16966050.0, "step": 9335 }, { "entropy": 5.650808954238892, "epoch": 8.024065320154705, "grad_norm": 1.3203125, "learning_rate": 9.963414392775627e-05, "loss": 4.7591, "mean_token_accuracy": 0.25660623162984847, "num_tokens": 16975178.0, "step": 9340 }, { "entropy": 5.585528993606568, "epoch": 8.02836269875376, "grad_norm": 1.296875, "learning_rate": 9.942602773296971e-05, "loss": 4.7392, "mean_token_accuracy": 0.26027477383613584, "num_tokens": 16984247.0, "step": 9345 }, { "entropy": 5.767738199234008, "epoch": 8.032660077352816, "grad_norm": 1.3359375, "learning_rate": 9.921829492156223e-05, "loss": 4.9408, "mean_token_accuracy": 0.23659028112888336, "num_tokens": 16995048.0, "step": 9350 }, { "entropy": 5.726878213882446, "epoch": 8.03695745595187, "grad_norm": 1.6015625, "learning_rate": 9.901094594713933e-05, "loss": 4.9095, "mean_token_accuracy": 0.2300567016005516, "num_tokens": 17003748.0, "step": 9355 }, { "entropy": 5.753246450424195, "epoch": 8.041254834550925, "grad_norm": 1.3984375, "learning_rate": 9.88039812624682e-05, "loss": 4.8654, "mean_token_accuracy": 0.24705124497413636, "num_tokens": 17011639.0, "step": 9360 }, { "entropy": 5.767079067230225, "epoch": 8.045552213149978, "grad_norm": 1.2109375, "learning_rate": 9.859740131947715e-05, "loss": 4.8656, "mean_token_accuracy": 0.23569979518651962, "num_tokens": 17021056.0, "step": 9365 }, { "entropy": 5.730513334274292, "epoch": 8.049849591749034, "grad_norm": 1.3828125, "learning_rate": 9.839120656925407e-05, "loss": 4.9032, "mean_token_accuracy": 0.2346501335501671, "num_tokens": 17030944.0, "step": 9370 }, { "entropy": 5.669353294372558, "epoch": 8.054146970348087, "grad_norm": 1.40625, "learning_rate": 9.818539746204588e-05, "loss": 4.8452, "mean_token_accuracy": 0.24651120901107787, "num_tokens": 17040127.0, "step": 9375 }, { "entropy": 5.753323745727539, "epoch": 8.058444348947143, "grad_norm": 1.5859375, "learning_rate": 9.797997444725745e-05, "loss": 4.8358, "mean_token_accuracy": 0.24490419328212737, "num_tokens": 17049418.0, "step": 9380 }, { "entropy": 5.66489543914795, "epoch": 8.062741727546197, "grad_norm": 1.328125, "learning_rate": 9.77749379734506e-05, "loss": 4.8328, "mean_token_accuracy": 0.25109679251909256, "num_tokens": 17058270.0, "step": 9385 }, { "entropy": 5.712283277511597, "epoch": 8.067039106145252, "grad_norm": 1.4375, "learning_rate": 9.757028848834293e-05, "loss": 4.8444, "mean_token_accuracy": 0.2447240486741066, "num_tokens": 17068011.0, "step": 9390 }, { "entropy": 5.704377889633179, "epoch": 8.071336484744306, "grad_norm": 1.3828125, "learning_rate": 9.736602643880712e-05, "loss": 4.8442, "mean_token_accuracy": 0.2414647787809372, "num_tokens": 17077356.0, "step": 9395 }, { "entropy": 5.683783292770386, "epoch": 8.075633863343361, "grad_norm": 1.34375, "learning_rate": 9.716215227086997e-05, "loss": 4.8058, "mean_token_accuracy": 0.24811055809259414, "num_tokens": 17085679.0, "step": 9400 }, { "entropy": 5.713439559936523, "epoch": 8.079931241942415, "grad_norm": 1.3046875, "learning_rate": 9.695866642971098e-05, "loss": 4.828, "mean_token_accuracy": 0.2418453276157379, "num_tokens": 17094925.0, "step": 9405 }, { "entropy": 5.680472755432129, "epoch": 8.08422862054147, "grad_norm": 1.4140625, "learning_rate": 9.67555693596621e-05, "loss": 4.8917, "mean_token_accuracy": 0.24031679928302765, "num_tokens": 17105278.0, "step": 9410 }, { "entropy": 5.643775463104248, "epoch": 8.088525999140524, "grad_norm": 1.3984375, "learning_rate": 9.655286150420595e-05, "loss": 4.811, "mean_token_accuracy": 0.2506825551390648, "num_tokens": 17114070.0, "step": 9415 }, { "entropy": 5.592681741714477, "epoch": 8.09282337773958, "grad_norm": 1.2421875, "learning_rate": 9.635054330597565e-05, "loss": 4.7437, "mean_token_accuracy": 0.2564584508538246, "num_tokens": 17122862.0, "step": 9420 }, { "entropy": 5.6964335441589355, "epoch": 8.097120756338633, "grad_norm": 1.453125, "learning_rate": 9.614861520675322e-05, "loss": 4.81, "mean_token_accuracy": 0.24855268150568008, "num_tokens": 17131555.0, "step": 9425 }, { "entropy": 5.6678112030029295, "epoch": 8.101418134937688, "grad_norm": 1.40625, "learning_rate": 9.594707764746881e-05, "loss": 4.7697, "mean_token_accuracy": 0.25087203830480576, "num_tokens": 17140841.0, "step": 9430 }, { "entropy": 5.644021463394165, "epoch": 8.105715513536742, "grad_norm": 1.296875, "learning_rate": 9.57459310682e-05, "loss": 4.8181, "mean_token_accuracy": 0.25079510658979415, "num_tokens": 17150027.0, "step": 9435 }, { "entropy": 5.724911117553711, "epoch": 8.110012892135797, "grad_norm": 1.3125, "learning_rate": 9.554517590817055e-05, "loss": 4.8874, "mean_token_accuracy": 0.23362074196338653, "num_tokens": 17159589.0, "step": 9440 }, { "entropy": 5.718877220153809, "epoch": 8.114310270734851, "grad_norm": 1.2890625, "learning_rate": 9.534481260574944e-05, "loss": 4.8569, "mean_token_accuracy": 0.24165450483560563, "num_tokens": 17168219.0, "step": 9445 }, { "entropy": 5.717329406738282, "epoch": 8.118607649333907, "grad_norm": 1.34375, "learning_rate": 9.514484159844997e-05, "loss": 4.8494, "mean_token_accuracy": 0.24667494893074035, "num_tokens": 17177364.0, "step": 9450 }, { "entropy": 5.742877006530762, "epoch": 8.12290502793296, "grad_norm": 1.4765625, "learning_rate": 9.494526332292899e-05, "loss": 4.8939, "mean_token_accuracy": 0.23939796686172485, "num_tokens": 17186572.0, "step": 9455 }, { "entropy": 5.716141700744629, "epoch": 8.127202406532016, "grad_norm": 1.3671875, "learning_rate": 9.47460782149857e-05, "loss": 4.8819, "mean_token_accuracy": 0.24952531158924102, "num_tokens": 17195645.0, "step": 9460 }, { "entropy": 5.716918182373047, "epoch": 8.13149978513107, "grad_norm": 1.328125, "learning_rate": 9.454728670956073e-05, "loss": 4.8623, "mean_token_accuracy": 0.2338918313384056, "num_tokens": 17205279.0, "step": 9465 }, { "entropy": 5.674348020553589, "epoch": 8.135797163730125, "grad_norm": 1.3828125, "learning_rate": 9.43488892407352e-05, "loss": 4.7889, "mean_token_accuracy": 0.2492659032344818, "num_tokens": 17214536.0, "step": 9470 }, { "entropy": 5.7423008441925045, "epoch": 8.140094542329178, "grad_norm": 1.328125, "learning_rate": 9.415088624172997e-05, "loss": 4.8982, "mean_token_accuracy": 0.2365766152739525, "num_tokens": 17223336.0, "step": 9475 }, { "entropy": 5.725331783294678, "epoch": 8.144391920928234, "grad_norm": 1.375, "learning_rate": 9.395327814490439e-05, "loss": 4.8913, "mean_token_accuracy": 0.24087603092193605, "num_tokens": 17232991.0, "step": 9480 }, { "entropy": 5.707345008850098, "epoch": 8.148689299527287, "grad_norm": 1.4140625, "learning_rate": 9.375606538175566e-05, "loss": 4.8079, "mean_token_accuracy": 0.2476293459534645, "num_tokens": 17241760.0, "step": 9485 }, { "entropy": 5.745706558227539, "epoch": 8.152986678126343, "grad_norm": 1.3125, "learning_rate": 9.35592483829175e-05, "loss": 4.9106, "mean_token_accuracy": 0.23400688916444778, "num_tokens": 17251514.0, "step": 9490 }, { "entropy": 5.7085450172424315, "epoch": 8.157284056725398, "grad_norm": 1.3984375, "learning_rate": 9.336282757815964e-05, "loss": 4.8709, "mean_token_accuracy": 0.2447448268532753, "num_tokens": 17260876.0, "step": 9495 }, { "entropy": 5.680332851409912, "epoch": 8.161581435324452, "grad_norm": 1.40625, "learning_rate": 9.316680339638664e-05, "loss": 4.893, "mean_token_accuracy": 0.24191939532756807, "num_tokens": 17270051.0, "step": 9500 }, { "epoch": 8.161581435324452, "eval_entropy": 5.534670775001113, "eval_loss": 5.914956092834473, "eval_mean_token_accuracy": 0.18108219369776077, "eval_num_tokens": 17270051.0, "eval_runtime": 2.0509, "eval_samples_per_second": 1730.421, "eval_steps_per_second": 216.485, "step": 9500 } ], "logging_steps": 5, "max_steps": 11630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3885703429294080.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }