{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.1478298238074776, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742608070373535, "epoch": 0.004297378599054577, "grad_norm": 5.46875, "learning_rate": 2e-06, "loss": 10.7643, "mean_token_accuracy": 7.587253348901868e-05, "num_tokens": 10107.0, "step": 5 }, { "entropy": 10.742630290985108, "epoch": 0.008594757198109154, "grad_norm": 5.78125, "learning_rate": 4.5e-06, "loss": 10.7086, "mean_token_accuracy": 0.0, "num_tokens": 18391.0, "step": 10 }, { "entropy": 10.74263505935669, "epoch": 0.01289213579716373, "grad_norm": 5.3125, "learning_rate": 7e-06, "loss": 10.6888, "mean_token_accuracy": 7.022471982054412e-05, "num_tokens": 27061.0, "step": 15 }, { "entropy": 10.742604160308838, "epoch": 0.017189514396218308, "grad_norm": 6.0, "learning_rate": 9.5e-06, "loss": 10.6611, "mean_token_accuracy": 0.0008422504703048617, "num_tokens": 36339.0, "step": 20 }, { "entropy": 10.742517948150635, "epoch": 0.021486892995272882, "grad_norm": 4.75, "learning_rate": 1.2e-05, "loss": 10.5317, "mean_token_accuracy": 0.02025789166800678, "num_tokens": 45770.0, "step": 25 }, { "entropy": 10.741962242126466, "epoch": 0.02578427159432746, "grad_norm": 4.25, "learning_rate": 1.4500000000000002e-05, "loss": 10.399, "mean_token_accuracy": 0.04876907132565975, "num_tokens": 54575.0, "step": 30 }, { "entropy": 10.73945140838623, "epoch": 0.030081650193382038, "grad_norm": 3.15625, "learning_rate": 1.7000000000000003e-05, "loss": 10.3065, "mean_token_accuracy": 0.0514072135090828, "num_tokens": 66403.0, "step": 35 }, { "entropy": 10.730937385559082, "epoch": 0.034379028792436615, "grad_norm": 2.640625, "learning_rate": 1.95e-05, "loss": 10.0976, "mean_token_accuracy": 0.05973539762198925, "num_tokens": 76510.0, "step": 40 }, { "entropy": 10.715238952636719, "epoch": 0.03867640739149119, "grad_norm": 2.40625, "learning_rate": 2.2e-05, "loss": 9.9688, "mean_token_accuracy": 0.05614017099142075, "num_tokens": 84836.0, "step": 45 }, { "entropy": 10.702037715911866, "epoch": 0.042973785990545764, "grad_norm": 2.046875, "learning_rate": 2.4500000000000003e-05, "loss": 9.9015, "mean_token_accuracy": 0.053829558193683624, "num_tokens": 93197.0, "step": 50 }, { "entropy": 10.697910690307618, "epoch": 0.047271164589600345, "grad_norm": 2.40625, "learning_rate": 2.7e-05, "loss": 9.8366, "mean_token_accuracy": 0.05843428298830986, "num_tokens": 101546.0, "step": 55 }, { "entropy": 10.693470478057861, "epoch": 0.05156854318865492, "grad_norm": 1.9609375, "learning_rate": 2.95e-05, "loss": 9.8429, "mean_token_accuracy": 0.0558084711432457, "num_tokens": 111703.0, "step": 60 }, { "entropy": 10.680869865417481, "epoch": 0.055865921787709494, "grad_norm": 1.9453125, "learning_rate": 3.2e-05, "loss": 9.7131, "mean_token_accuracy": 0.0589165486395359, "num_tokens": 119894.0, "step": 65 }, { "entropy": 10.668927574157715, "epoch": 0.060163300386764075, "grad_norm": 1.9765625, "learning_rate": 3.4500000000000005e-05, "loss": 9.6682, "mean_token_accuracy": 0.06148771904408932, "num_tokens": 128885.0, "step": 70 }, { "entropy": 10.654484272003174, "epoch": 0.06446067898581866, "grad_norm": 1.953125, "learning_rate": 3.7e-05, "loss": 9.6297, "mean_token_accuracy": 0.057728851959109304, "num_tokens": 138106.0, "step": 75 }, { "entropy": 10.645826625823975, "epoch": 0.06875805758487323, "grad_norm": 1.9296875, "learning_rate": 3.95e-05, "loss": 9.5722, "mean_token_accuracy": 0.058954347297549246, "num_tokens": 146691.0, "step": 80 }, { "entropy": 10.637816619873046, "epoch": 0.0730554361839278, "grad_norm": 1.90625, "learning_rate": 4.2000000000000004e-05, "loss": 9.5126, "mean_token_accuracy": 0.059067190065979956, "num_tokens": 155792.0, "step": 85 }, { "entropy": 10.63103084564209, "epoch": 0.07735281478298238, "grad_norm": 1.7890625, "learning_rate": 4.45e-05, "loss": 9.5251, "mean_token_accuracy": 0.0552229531109333, "num_tokens": 166944.0, "step": 90 }, { "entropy": 10.616693305969239, "epoch": 0.08165019338203695, "grad_norm": 1.96875, "learning_rate": 4.7000000000000004e-05, "loss": 9.3423, "mean_token_accuracy": 0.060124922543764114, "num_tokens": 175303.0, "step": 95 }, { "entropy": 10.591300106048584, "epoch": 0.08594757198109153, "grad_norm": 1.8203125, "learning_rate": 4.9500000000000004e-05, "loss": 9.3133, "mean_token_accuracy": 0.06174388714134693, "num_tokens": 184708.0, "step": 100 }, { "entropy": 10.564336776733398, "epoch": 0.09024495058014612, "grad_norm": 1.7890625, "learning_rate": 5.2e-05, "loss": 9.2307, "mean_token_accuracy": 0.0674959484487772, "num_tokens": 193835.0, "step": 105 }, { "entropy": 10.52622423171997, "epoch": 0.09454232917920069, "grad_norm": 1.8828125, "learning_rate": 5.45e-05, "loss": 9.1379, "mean_token_accuracy": 0.07480009235441684, "num_tokens": 203344.0, "step": 110 }, { "entropy": 10.454349136352539, "epoch": 0.09883970777825526, "grad_norm": 1.6171875, "learning_rate": 5.7e-05, "loss": 9.1209, "mean_token_accuracy": 0.06218625903129578, "num_tokens": 213048.0, "step": 115 }, { "entropy": 10.415324211120605, "epoch": 0.10313708637730984, "grad_norm": 1.578125, "learning_rate": 5.9499999999999996e-05, "loss": 8.9306, "mean_token_accuracy": 0.07533645890653133, "num_tokens": 221784.0, "step": 120 }, { "entropy": 10.303644943237305, "epoch": 0.10743446497636441, "grad_norm": 1.4765625, "learning_rate": 6.2e-05, "loss": 8.8509, "mean_token_accuracy": 0.07504003196954727, "num_tokens": 230971.0, "step": 125 }, { "entropy": 10.209668159484863, "epoch": 0.11173184357541899, "grad_norm": 1.4296875, "learning_rate": 6.450000000000001e-05, "loss": 8.7412, "mean_token_accuracy": 0.07478504739701748, "num_tokens": 240524.0, "step": 130 }, { "entropy": 10.153745365142822, "epoch": 0.11602922217447358, "grad_norm": 1.3359375, "learning_rate": 6.7e-05, "loss": 8.6323, "mean_token_accuracy": 0.07354197278618813, "num_tokens": 249220.0, "step": 135 }, { "entropy": 10.068094253540039, "epoch": 0.12032660077352815, "grad_norm": 1.3125, "learning_rate": 6.950000000000001e-05, "loss": 8.61, "mean_token_accuracy": 0.07049238979816437, "num_tokens": 258934.0, "step": 140 }, { "entropy": 9.973960685729981, "epoch": 0.12462397937258272, "grad_norm": 1.2734375, "learning_rate": 7.2e-05, "loss": 8.4673, "mean_token_accuracy": 0.07534252405166626, "num_tokens": 267680.0, "step": 145 }, { "entropy": 9.815561103820801, "epoch": 0.1289213579716373, "grad_norm": 1.09375, "learning_rate": 7.45e-05, "loss": 8.3709, "mean_token_accuracy": 0.07952065020799637, "num_tokens": 276227.0, "step": 150 }, { "entropy": 9.66996259689331, "epoch": 0.1332187365706919, "grad_norm": 1.1875, "learning_rate": 7.7e-05, "loss": 8.2269, "mean_token_accuracy": 0.08225171342492103, "num_tokens": 286342.0, "step": 155 }, { "entropy": 9.510671615600586, "epoch": 0.13751611516974646, "grad_norm": 0.953125, "learning_rate": 7.950000000000001e-05, "loss": 8.1921, "mean_token_accuracy": 0.0742720566689968, "num_tokens": 294994.0, "step": 160 }, { "entropy": 9.346861934661865, "epoch": 0.14181349376880104, "grad_norm": 0.984375, "learning_rate": 8.2e-05, "loss": 8.113, "mean_token_accuracy": 0.08004417940974236, "num_tokens": 303882.0, "step": 165 }, { "entropy": 9.199288940429687, "epoch": 0.1461108723678556, "grad_norm": 0.9296875, "learning_rate": 8.450000000000001e-05, "loss": 8.0403, "mean_token_accuracy": 0.07799897268414498, "num_tokens": 312515.0, "step": 170 }, { "entropy": 8.978620052337646, "epoch": 0.15040825096691018, "grad_norm": 0.9375, "learning_rate": 8.7e-05, "loss": 7.9977, "mean_token_accuracy": 0.07381256259977817, "num_tokens": 320801.0, "step": 175 }, { "entropy": 8.861582374572754, "epoch": 0.15470562956596476, "grad_norm": 0.9765625, "learning_rate": 8.95e-05, "loss": 7.9642, "mean_token_accuracy": 0.08192512467503547, "num_tokens": 329382.0, "step": 180 }, { "entropy": 8.755144786834716, "epoch": 0.15900300816501933, "grad_norm": 0.9296875, "learning_rate": 9.2e-05, "loss": 7.9273, "mean_token_accuracy": 0.07583913430571557, "num_tokens": 337894.0, "step": 185 }, { "entropy": 8.582227611541748, "epoch": 0.1633003867640739, "grad_norm": 0.8984375, "learning_rate": 9.45e-05, "loss": 7.9012, "mean_token_accuracy": 0.07614588961005211, "num_tokens": 346380.0, "step": 190 }, { "entropy": 8.591823768615722, "epoch": 0.16759776536312848, "grad_norm": 0.9609375, "learning_rate": 9.7e-05, "loss": 7.9407, "mean_token_accuracy": 0.07390806600451469, "num_tokens": 356305.0, "step": 195 }, { "entropy": 8.515201950073243, "epoch": 0.17189514396218306, "grad_norm": 1.1328125, "learning_rate": 9.95e-05, "loss": 7.8901, "mean_token_accuracy": 0.07247771993279457, "num_tokens": 364899.0, "step": 200 }, { "entropy": 8.457213211059571, "epoch": 0.17619252256123766, "grad_norm": 0.93359375, "learning_rate": 0.000102, "loss": 7.8566, "mean_token_accuracy": 0.0781160645186901, "num_tokens": 373663.0, "step": 205 }, { "entropy": 8.381179523468017, "epoch": 0.18048990116029223, "grad_norm": 0.95703125, "learning_rate": 0.00010449999999999999, "loss": 7.8221, "mean_token_accuracy": 0.07758632972836495, "num_tokens": 382730.0, "step": 210 }, { "entropy": 8.390653896331788, "epoch": 0.1847872797593468, "grad_norm": 0.921875, "learning_rate": 0.000107, "loss": 7.8622, "mean_token_accuracy": 0.071787304058671, "num_tokens": 392676.0, "step": 215 }, { "entropy": 8.255177211761474, "epoch": 0.18908465835840138, "grad_norm": 1.1015625, "learning_rate": 0.0001095, "loss": 7.8473, "mean_token_accuracy": 0.08185218423604965, "num_tokens": 401050.0, "step": 220 }, { "entropy": 8.367721462249756, "epoch": 0.19338203695745596, "grad_norm": 0.796875, "learning_rate": 0.000112, "loss": 7.795, "mean_token_accuracy": 0.07991239950060844, "num_tokens": 410009.0, "step": 225 }, { "entropy": 8.268333339691162, "epoch": 0.19767941555651053, "grad_norm": 0.859375, "learning_rate": 0.0001145, "loss": 7.7757, "mean_token_accuracy": 0.08171008005738259, "num_tokens": 419302.0, "step": 230 }, { "entropy": 8.304029846191407, "epoch": 0.2019767941555651, "grad_norm": 0.984375, "learning_rate": 0.00011700000000000001, "loss": 7.6812, "mean_token_accuracy": 0.08820762410759926, "num_tokens": 427296.0, "step": 235 }, { "entropy": 8.16576337814331, "epoch": 0.20627417275461968, "grad_norm": 0.91796875, "learning_rate": 0.00011949999999999999, "loss": 7.8198, "mean_token_accuracy": 0.07870872803032399, "num_tokens": 436368.0, "step": 240 }, { "entropy": 8.189785575866699, "epoch": 0.21057155135367425, "grad_norm": 1.28125, "learning_rate": 0.000122, "loss": 7.7389, "mean_token_accuracy": 0.08551637679338456, "num_tokens": 445535.0, "step": 245 }, { "entropy": 8.265625381469727, "epoch": 0.21486892995272883, "grad_norm": 0.8671875, "learning_rate": 0.0001245, "loss": 7.7093, "mean_token_accuracy": 0.07919453792273998, "num_tokens": 454769.0, "step": 250 }, { "entropy": 8.1545090675354, "epoch": 0.2191663085517834, "grad_norm": 0.93359375, "learning_rate": 0.000127, "loss": 7.7315, "mean_token_accuracy": 0.0871740497648716, "num_tokens": 463975.0, "step": 255 }, { "entropy": 8.13952112197876, "epoch": 0.22346368715083798, "grad_norm": 0.88671875, "learning_rate": 0.0001295, "loss": 7.726, "mean_token_accuracy": 0.08799278363585472, "num_tokens": 472899.0, "step": 260 }, { "entropy": 8.196070003509522, "epoch": 0.22776106574989258, "grad_norm": 0.93359375, "learning_rate": 0.000132, "loss": 7.7354, "mean_token_accuracy": 0.08013860881328583, "num_tokens": 481556.0, "step": 265 }, { "entropy": 8.114658737182618, "epoch": 0.23205844434894715, "grad_norm": 0.91015625, "learning_rate": 0.00013450000000000002, "loss": 7.7023, "mean_token_accuracy": 0.0854449674487114, "num_tokens": 490253.0, "step": 270 }, { "entropy": 8.193334579467773, "epoch": 0.23635582294800173, "grad_norm": 1.09375, "learning_rate": 0.00013700000000000002, "loss": 7.7066, "mean_token_accuracy": 0.0806311085820198, "num_tokens": 498444.0, "step": 275 }, { "entropy": 8.104936504364014, "epoch": 0.2406532015470563, "grad_norm": 0.8046875, "learning_rate": 0.0001395, "loss": 7.6467, "mean_token_accuracy": 0.08675235286355018, "num_tokens": 508330.0, "step": 280 }, { "entropy": 8.113396596908569, "epoch": 0.24495058014611087, "grad_norm": 1.015625, "learning_rate": 0.00014199999999999998, "loss": 7.7405, "mean_token_accuracy": 0.08165572881698609, "num_tokens": 517900.0, "step": 285 }, { "entropy": 8.046846723556518, "epoch": 0.24924795874516545, "grad_norm": 0.93359375, "learning_rate": 0.0001445, "loss": 7.6901, "mean_token_accuracy": 0.08230286985635757, "num_tokens": 527808.0, "step": 290 }, { "entropy": 8.13338761329651, "epoch": 0.25354533734422, "grad_norm": 0.8984375, "learning_rate": 0.000147, "loss": 7.6711, "mean_token_accuracy": 0.08156475871801376, "num_tokens": 536931.0, "step": 295 }, { "entropy": 8.18837013244629, "epoch": 0.2578427159432746, "grad_norm": 1.1875, "learning_rate": 0.0001495, "loss": 7.7049, "mean_token_accuracy": 0.0835341140627861, "num_tokens": 545758.0, "step": 300 }, { "entropy": 8.025089168548584, "epoch": 0.26214009454232917, "grad_norm": 0.9921875, "learning_rate": 0.000152, "loss": 7.7131, "mean_token_accuracy": 0.08242038711905479, "num_tokens": 555165.0, "step": 305 }, { "entropy": 8.155539417266846, "epoch": 0.2664374731413838, "grad_norm": 0.86328125, "learning_rate": 0.00015450000000000001, "loss": 7.6144, "mean_token_accuracy": 0.08789716809988021, "num_tokens": 564719.0, "step": 310 }, { "entropy": 8.041153383255004, "epoch": 0.2707348517404383, "grad_norm": 1.0, "learning_rate": 0.000157, "loss": 7.594, "mean_token_accuracy": 0.09155945181846618, "num_tokens": 573572.0, "step": 315 }, { "entropy": 8.15259666442871, "epoch": 0.2750322303394929, "grad_norm": 1.0859375, "learning_rate": 0.0001595, "loss": 7.7634, "mean_token_accuracy": 0.08318910300731659, "num_tokens": 581497.0, "step": 320 }, { "entropy": 8.100253248214722, "epoch": 0.27932960893854747, "grad_norm": 1.125, "learning_rate": 0.000162, "loss": 7.6118, "mean_token_accuracy": 0.08767011985182763, "num_tokens": 591107.0, "step": 325 }, { "entropy": 7.984478855133057, "epoch": 0.28362698753760207, "grad_norm": 0.84765625, "learning_rate": 0.00016450000000000001, "loss": 7.6456, "mean_token_accuracy": 0.08353794142603874, "num_tokens": 600241.0, "step": 330 }, { "entropy": 8.057686376571656, "epoch": 0.2879243661366566, "grad_norm": 0.91796875, "learning_rate": 0.00016700000000000002, "loss": 7.5776, "mean_token_accuracy": 0.08751234114170074, "num_tokens": 608697.0, "step": 335 }, { "entropy": 8.016141748428344, "epoch": 0.2922217447357112, "grad_norm": 0.9453125, "learning_rate": 0.00016950000000000003, "loss": 7.568, "mean_token_accuracy": 0.09023259431123734, "num_tokens": 617275.0, "step": 340 }, { "entropy": 8.084819841384888, "epoch": 0.29651912333476577, "grad_norm": 0.8984375, "learning_rate": 0.00017199999999999998, "loss": 7.6405, "mean_token_accuracy": 0.08630914464592934, "num_tokens": 626644.0, "step": 345 }, { "entropy": 8.008595705032349, "epoch": 0.30081650193382037, "grad_norm": 0.98828125, "learning_rate": 0.00017449999999999999, "loss": 7.5665, "mean_token_accuracy": 0.08766811862587928, "num_tokens": 635110.0, "step": 350 }, { "entropy": 8.04712610244751, "epoch": 0.30511388053287497, "grad_norm": 0.87109375, "learning_rate": 0.000177, "loss": 7.7031, "mean_token_accuracy": 0.08570141717791557, "num_tokens": 644746.0, "step": 355 }, { "entropy": 8.179811954498291, "epoch": 0.3094112591319295, "grad_norm": 1.1015625, "learning_rate": 0.0001795, "loss": 7.5831, "mean_token_accuracy": 0.08595824986696243, "num_tokens": 654281.0, "step": 360 }, { "entropy": 7.987443113327027, "epoch": 0.3137086377309841, "grad_norm": 1.203125, "learning_rate": 0.000182, "loss": 7.585, "mean_token_accuracy": 0.09283285215497017, "num_tokens": 663174.0, "step": 365 }, { "entropy": 7.916810417175293, "epoch": 0.31800601633003867, "grad_norm": 0.90625, "learning_rate": 0.0001845, "loss": 7.511, "mean_token_accuracy": 0.08863886222243308, "num_tokens": 672178.0, "step": 370 }, { "entropy": 8.005489206314087, "epoch": 0.32230339492909327, "grad_norm": 0.96484375, "learning_rate": 0.000187, "loss": 7.5218, "mean_token_accuracy": 0.09131815880537034, "num_tokens": 681323.0, "step": 375 }, { "entropy": 7.9803643226623535, "epoch": 0.3266007735281478, "grad_norm": 0.890625, "learning_rate": 0.0001895, "loss": 7.4406, "mean_token_accuracy": 0.08985799476504326, "num_tokens": 690461.0, "step": 380 }, { "entropy": 7.829833698272705, "epoch": 0.3308981521272024, "grad_norm": 1.046875, "learning_rate": 0.000192, "loss": 7.5004, "mean_token_accuracy": 0.08490158319473266, "num_tokens": 699199.0, "step": 385 }, { "entropy": 8.038139152526856, "epoch": 0.33519553072625696, "grad_norm": 1.1484375, "learning_rate": 0.0001945, "loss": 7.4484, "mean_token_accuracy": 0.09670188426971435, "num_tokens": 707949.0, "step": 390 }, { "entropy": 7.9735198497772215, "epoch": 0.33949290932531156, "grad_norm": 1.203125, "learning_rate": 0.00019700000000000002, "loss": 7.5219, "mean_token_accuracy": 0.08999367579817771, "num_tokens": 715752.0, "step": 395 }, { "entropy": 7.93391604423523, "epoch": 0.3437902879243661, "grad_norm": 1.1171875, "learning_rate": 0.00019950000000000002, "loss": 7.4479, "mean_token_accuracy": 0.0979436494410038, "num_tokens": 724416.0, "step": 400 }, { "entropy": 7.925309085845948, "epoch": 0.3480876665234207, "grad_norm": 1.0546875, "learning_rate": 0.000202, "loss": 7.4953, "mean_token_accuracy": 0.09031900316476822, "num_tokens": 733116.0, "step": 405 }, { "entropy": 7.916099977493286, "epoch": 0.3523850451224753, "grad_norm": 1.0625, "learning_rate": 0.00020449999999999998, "loss": 7.4726, "mean_token_accuracy": 0.09227924942970275, "num_tokens": 742093.0, "step": 410 }, { "entropy": 7.918701934814453, "epoch": 0.35668242372152986, "grad_norm": 1.046875, "learning_rate": 0.000207, "loss": 7.4649, "mean_token_accuracy": 0.09618089124560356, "num_tokens": 750402.0, "step": 415 }, { "entropy": 7.816703271865845, "epoch": 0.36097980232058446, "grad_norm": 0.9140625, "learning_rate": 0.0002095, "loss": 7.4336, "mean_token_accuracy": 0.09461462944746017, "num_tokens": 760961.0, "step": 420 }, { "entropy": 7.944287586212158, "epoch": 0.365277180919639, "grad_norm": 1.0390625, "learning_rate": 0.000212, "loss": 7.4865, "mean_token_accuracy": 0.09455274268984795, "num_tokens": 770554.0, "step": 425 }, { "entropy": 7.750526332855225, "epoch": 0.3695745595186936, "grad_norm": 1.03125, "learning_rate": 0.0002145, "loss": 7.4618, "mean_token_accuracy": 0.09681151732802391, "num_tokens": 779172.0, "step": 430 }, { "entropy": 7.9787256717681885, "epoch": 0.37387193811774816, "grad_norm": 0.984375, "learning_rate": 0.00021700000000000002, "loss": 7.5123, "mean_token_accuracy": 0.08840151131153107, "num_tokens": 788040.0, "step": 435 }, { "entropy": 7.883750295639038, "epoch": 0.37816931671680276, "grad_norm": 1.109375, "learning_rate": 0.0002195, "loss": 7.4135, "mean_token_accuracy": 0.0939902700483799, "num_tokens": 796786.0, "step": 440 }, { "entropy": 7.851776885986328, "epoch": 0.3824666953158573, "grad_norm": 1.09375, "learning_rate": 0.000222, "loss": 7.4233, "mean_token_accuracy": 0.0923767201602459, "num_tokens": 805520.0, "step": 445 }, { "entropy": 7.805376100540161, "epoch": 0.3867640739149119, "grad_norm": 1.1484375, "learning_rate": 0.0002245, "loss": 7.3508, "mean_token_accuracy": 0.09647825658321381, "num_tokens": 814939.0, "step": 450 }, { "entropy": 7.874559307098389, "epoch": 0.39106145251396646, "grad_norm": 1.2265625, "learning_rate": 0.00022700000000000002, "loss": 7.3531, "mean_token_accuracy": 0.09795481041073799, "num_tokens": 823862.0, "step": 455 }, { "entropy": 7.7626677513122555, "epoch": 0.39535883111302106, "grad_norm": 1.1328125, "learning_rate": 0.00022950000000000002, "loss": 7.3918, "mean_token_accuracy": 0.09068166017532349, "num_tokens": 832820.0, "step": 460 }, { "entropy": 7.928297901153565, "epoch": 0.39965620971207566, "grad_norm": 1.1171875, "learning_rate": 0.00023200000000000003, "loss": 7.3494, "mean_token_accuracy": 0.09501236006617546, "num_tokens": 841538.0, "step": 465 }, { "entropy": 7.7496504306793215, "epoch": 0.4039535883111302, "grad_norm": 0.99609375, "learning_rate": 0.00023449999999999998, "loss": 7.4626, "mean_token_accuracy": 0.09104103595018387, "num_tokens": 851123.0, "step": 470 }, { "entropy": 7.8953351974487305, "epoch": 0.4082509669101848, "grad_norm": 1.125, "learning_rate": 0.000237, "loss": 7.4266, "mean_token_accuracy": 0.09596899375319481, "num_tokens": 860357.0, "step": 475 }, { "entropy": 7.76341495513916, "epoch": 0.41254834550923936, "grad_norm": 1.0703125, "learning_rate": 0.0002395, "loss": 7.3425, "mean_token_accuracy": 0.09861095696687698, "num_tokens": 869980.0, "step": 480 }, { "entropy": 7.82184157371521, "epoch": 0.41684572410829396, "grad_norm": 1.03125, "learning_rate": 0.000242, "loss": 7.2999, "mean_token_accuracy": 0.10065284445881843, "num_tokens": 878250.0, "step": 485 }, { "entropy": 7.76347074508667, "epoch": 0.4211431027073485, "grad_norm": 1.25, "learning_rate": 0.0002445, "loss": 7.4007, "mean_token_accuracy": 0.095355936139822, "num_tokens": 887624.0, "step": 490 }, { "entropy": 7.753844261169434, "epoch": 0.4254404813064031, "grad_norm": 1.1484375, "learning_rate": 0.000247, "loss": 7.3568, "mean_token_accuracy": 0.09853926301002502, "num_tokens": 897120.0, "step": 495 }, { "entropy": 7.802051830291748, "epoch": 0.42973785990545765, "grad_norm": 1.03125, "learning_rate": 0.0002495, "loss": 7.3179, "mean_token_accuracy": 0.10127250477671623, "num_tokens": 906215.0, "step": 500 }, { "epoch": 0.42973785990545765, "eval_entropy": 7.412716417699246, "eval_loss": 7.3790483474731445, "eval_mean_token_accuracy": 0.09986981684929347, "eval_num_tokens": 906215.0, "eval_runtime": 2.0966, "eval_samples_per_second": 1692.736, "eval_steps_per_second": 211.771, "step": 500 }, { "entropy": 7.651102495193482, "epoch": 0.43403523850451226, "grad_norm": 1.09375, "learning_rate": 0.000252, "loss": 7.3112, "mean_token_accuracy": 0.10008608102798462, "num_tokens": 915181.0, "step": 505 }, { "entropy": 7.728409194946289, "epoch": 0.4383326171035668, "grad_norm": 1.0703125, "learning_rate": 0.0002545, "loss": 7.3388, "mean_token_accuracy": 0.09651862978935241, "num_tokens": 924377.0, "step": 510 }, { "entropy": 7.770003318786621, "epoch": 0.4426299957026214, "grad_norm": 0.984375, "learning_rate": 0.000257, "loss": 7.4098, "mean_token_accuracy": 0.09438847750425339, "num_tokens": 933114.0, "step": 515 }, { "entropy": 7.86782751083374, "epoch": 0.44692737430167595, "grad_norm": 0.9375, "learning_rate": 0.0002595, "loss": 7.3692, "mean_token_accuracy": 0.09444344118237495, "num_tokens": 943306.0, "step": 520 }, { "entropy": 7.659075498580933, "epoch": 0.45122475290073055, "grad_norm": 1.1875, "learning_rate": 0.000262, "loss": 7.2626, "mean_token_accuracy": 0.10587219074368477, "num_tokens": 951515.0, "step": 525 }, { "entropy": 7.713227224349976, "epoch": 0.45552213149978515, "grad_norm": 1.015625, "learning_rate": 0.00026450000000000003, "loss": 7.3711, "mean_token_accuracy": 0.09387057200074196, "num_tokens": 962686.0, "step": 530 }, { "entropy": 7.780395078659057, "epoch": 0.4598195100988397, "grad_norm": 1.09375, "learning_rate": 0.00026700000000000004, "loss": 7.3777, "mean_token_accuracy": 0.10021266266703606, "num_tokens": 972136.0, "step": 535 }, { "entropy": 7.657458114624023, "epoch": 0.4641168886978943, "grad_norm": 1.09375, "learning_rate": 0.00026950000000000005, "loss": 7.2696, "mean_token_accuracy": 0.10345774069428444, "num_tokens": 981301.0, "step": 540 }, { "entropy": 7.700049114227295, "epoch": 0.46841426729694885, "grad_norm": 1.1484375, "learning_rate": 0.00027200000000000005, "loss": 7.2923, "mean_token_accuracy": 0.10189392492175102, "num_tokens": 990360.0, "step": 545 }, { "entropy": 7.770557546615601, "epoch": 0.47271164589600345, "grad_norm": 1.0859375, "learning_rate": 0.0002745, "loss": 7.3438, "mean_token_accuracy": 0.09953725263476372, "num_tokens": 999415.0, "step": 550 }, { "entropy": 7.656623125076294, "epoch": 0.477009024495058, "grad_norm": 1.0625, "learning_rate": 0.000277, "loss": 7.2635, "mean_token_accuracy": 0.10239741951227188, "num_tokens": 1008762.0, "step": 555 }, { "entropy": 7.690563821792603, "epoch": 0.4813064030941126, "grad_norm": 1.171875, "learning_rate": 0.0002795, "loss": 7.2652, "mean_token_accuracy": 0.10631422251462937, "num_tokens": 1017704.0, "step": 560 }, { "entropy": 7.641897583007813, "epoch": 0.48560378169316715, "grad_norm": 1.1640625, "learning_rate": 0.00028199999999999997, "loss": 7.2341, "mean_token_accuracy": 0.10428761765360832, "num_tokens": 1026251.0, "step": 565 }, { "entropy": 7.641419315338135, "epoch": 0.48990116029222175, "grad_norm": 1.03125, "learning_rate": 0.0002845, "loss": 7.2158, "mean_token_accuracy": 0.10731100514531136, "num_tokens": 1036191.0, "step": 570 }, { "entropy": 7.658735990524292, "epoch": 0.4941985388912763, "grad_norm": 1.0859375, "learning_rate": 0.000287, "loss": 7.2462, "mean_token_accuracy": 0.10594421103596688, "num_tokens": 1044936.0, "step": 575 }, { "entropy": 7.621677112579346, "epoch": 0.4984959174903309, "grad_norm": 1.1171875, "learning_rate": 0.0002895, "loss": 7.2472, "mean_token_accuracy": 0.10367096737027168, "num_tokens": 1053683.0, "step": 580 }, { "entropy": 7.570435047149658, "epoch": 0.5027932960893855, "grad_norm": 1.046875, "learning_rate": 0.000292, "loss": 7.2271, "mean_token_accuracy": 0.1076263040304184, "num_tokens": 1062932.0, "step": 585 }, { "entropy": 7.723283386230468, "epoch": 0.50709067468844, "grad_norm": 0.98828125, "learning_rate": 0.0002945, "loss": 7.2544, "mean_token_accuracy": 0.10264097228646278, "num_tokens": 1072313.0, "step": 590 }, { "entropy": 7.62511043548584, "epoch": 0.5113880532874946, "grad_norm": 1.171875, "learning_rate": 0.000297, "loss": 7.2228, "mean_token_accuracy": 0.09801378548145294, "num_tokens": 1081675.0, "step": 595 }, { "entropy": 7.608328151702881, "epoch": 0.5156854318865493, "grad_norm": 1.0703125, "learning_rate": 0.0002995, "loss": 7.2433, "mean_token_accuracy": 0.10141062065958976, "num_tokens": 1091541.0, "step": 600 }, { "entropy": 7.695394897460938, "epoch": 0.5199828104856038, "grad_norm": 1.015625, "learning_rate": 0.000302, "loss": 7.2462, "mean_token_accuracy": 0.10475782826542854, "num_tokens": 1100724.0, "step": 605 }, { "entropy": 7.50453405380249, "epoch": 0.5242801890846583, "grad_norm": 1.0546875, "learning_rate": 0.0003045, "loss": 7.1924, "mean_token_accuracy": 0.1077597513794899, "num_tokens": 1108869.0, "step": 610 }, { "entropy": 7.644835519790649, "epoch": 0.5285775676837129, "grad_norm": 1.1015625, "learning_rate": 0.000307, "loss": 7.2261, "mean_token_accuracy": 0.10431057810783387, "num_tokens": 1117314.0, "step": 615 }, { "entropy": 7.488267469406128, "epoch": 0.5328749462827675, "grad_norm": 1.109375, "learning_rate": 0.0003095, "loss": 7.148, "mean_token_accuracy": 0.10711429193615914, "num_tokens": 1126786.0, "step": 620 }, { "entropy": 7.577956056594848, "epoch": 0.5371723248818221, "grad_norm": 1.3046875, "learning_rate": 0.000312, "loss": 7.1645, "mean_token_accuracy": 0.10579404905438423, "num_tokens": 1136013.0, "step": 625 }, { "entropy": 7.527575206756592, "epoch": 0.5414697034808766, "grad_norm": 1.109375, "learning_rate": 0.0003145, "loss": 7.1969, "mean_token_accuracy": 0.10749110653996467, "num_tokens": 1144970.0, "step": 630 }, { "entropy": 7.613465976715088, "epoch": 0.5457670820799312, "grad_norm": 1.2578125, "learning_rate": 0.000317, "loss": 7.1614, "mean_token_accuracy": 0.11203600242733955, "num_tokens": 1153810.0, "step": 635 }, { "entropy": 7.521342611312866, "epoch": 0.5500644606789858, "grad_norm": 1.0546875, "learning_rate": 0.0003195, "loss": 7.1408, "mean_token_accuracy": 0.10991051346063614, "num_tokens": 1162498.0, "step": 640 }, { "entropy": 7.5313867092132565, "epoch": 0.5543618392780404, "grad_norm": 1.0546875, "learning_rate": 0.000322, "loss": 7.2164, "mean_token_accuracy": 0.1044546626508236, "num_tokens": 1172091.0, "step": 645 }, { "entropy": 7.653256607055664, "epoch": 0.5586592178770949, "grad_norm": 1.1015625, "learning_rate": 0.00032450000000000003, "loss": 7.1977, "mean_token_accuracy": 0.10631284043192864, "num_tokens": 1181400.0, "step": 650 }, { "entropy": 7.537307643890381, "epoch": 0.5629565964761496, "grad_norm": 1.2890625, "learning_rate": 0.00032700000000000003, "loss": 7.1721, "mean_token_accuracy": 0.11125476211309433, "num_tokens": 1189780.0, "step": 655 }, { "entropy": 7.477937269210815, "epoch": 0.5672539750752041, "grad_norm": 1.1875, "learning_rate": 0.00032950000000000004, "loss": 7.1315, "mean_token_accuracy": 0.1057468131184578, "num_tokens": 1198671.0, "step": 660 }, { "entropy": 7.589753818511963, "epoch": 0.5715513536742587, "grad_norm": 1.09375, "learning_rate": 0.00033200000000000005, "loss": 7.1652, "mean_token_accuracy": 0.1051194004714489, "num_tokens": 1207173.0, "step": 665 }, { "entropy": 7.461796855926513, "epoch": 0.5758487322733132, "grad_norm": 1.21875, "learning_rate": 0.00033450000000000005, "loss": 7.0998, "mean_token_accuracy": 0.11046240702271462, "num_tokens": 1216387.0, "step": 670 }, { "entropy": 7.622633552551269, "epoch": 0.5801461108723679, "grad_norm": 1.0234375, "learning_rate": 0.000337, "loss": 7.0722, "mean_token_accuracy": 0.11004948541522026, "num_tokens": 1224461.0, "step": 675 }, { "entropy": 7.451505851745606, "epoch": 0.5844434894714224, "grad_norm": 1.1796875, "learning_rate": 0.0003395, "loss": 7.1414, "mean_token_accuracy": 0.11011224165558815, "num_tokens": 1233774.0, "step": 680 }, { "entropy": 7.457524538040161, "epoch": 0.588740868070477, "grad_norm": 1.2109375, "learning_rate": 0.000342, "loss": 7.0938, "mean_token_accuracy": 0.1142980344593525, "num_tokens": 1242812.0, "step": 685 }, { "entropy": 7.605640840530396, "epoch": 0.5930382466695315, "grad_norm": 1.03125, "learning_rate": 0.00034449999999999997, "loss": 7.191, "mean_token_accuracy": 0.11035142987966537, "num_tokens": 1252872.0, "step": 690 }, { "entropy": 7.307473850250244, "epoch": 0.5973356252685862, "grad_norm": 1.1796875, "learning_rate": 0.000347, "loss": 6.983, "mean_token_accuracy": 0.11081922426819801, "num_tokens": 1260852.0, "step": 695 }, { "entropy": 7.438599157333374, "epoch": 0.6016330038676407, "grad_norm": 1.2578125, "learning_rate": 0.0003495, "loss": 7.0984, "mean_token_accuracy": 0.10763570070266723, "num_tokens": 1268925.0, "step": 700 }, { "entropy": 7.530004072189331, "epoch": 0.6059303824666953, "grad_norm": 1.109375, "learning_rate": 0.000352, "loss": 7.145, "mean_token_accuracy": 0.10653513446450233, "num_tokens": 1278994.0, "step": 705 }, { "entropy": 7.4260091304779055, "epoch": 0.6102277610657499, "grad_norm": 1.1640625, "learning_rate": 0.0003545, "loss": 7.1323, "mean_token_accuracy": 0.10368426591157913, "num_tokens": 1287698.0, "step": 710 }, { "entropy": 7.482218551635742, "epoch": 0.6145251396648045, "grad_norm": 1.0546875, "learning_rate": 0.000357, "loss": 7.0787, "mean_token_accuracy": 0.11120296269655228, "num_tokens": 1297475.0, "step": 715 }, { "entropy": 7.480340671539307, "epoch": 0.618822518263859, "grad_norm": 1.1328125, "learning_rate": 0.0003595, "loss": 7.1091, "mean_token_accuracy": 0.11085583940148354, "num_tokens": 1306836.0, "step": 720 }, { "entropy": 7.506947946548462, "epoch": 0.6231198968629136, "grad_norm": 1.03125, "learning_rate": 0.000362, "loss": 7.1377, "mean_token_accuracy": 0.10435779988765717, "num_tokens": 1315872.0, "step": 725 }, { "entropy": 7.4788847923278805, "epoch": 0.6274172754619682, "grad_norm": 1.1796875, "learning_rate": 0.0003645, "loss": 7.0782, "mean_token_accuracy": 0.11685637310147286, "num_tokens": 1324624.0, "step": 730 }, { "entropy": 7.444537830352783, "epoch": 0.6317146540610228, "grad_norm": 1.15625, "learning_rate": 0.000367, "loss": 7.061, "mean_token_accuracy": 0.11548577472567559, "num_tokens": 1333058.0, "step": 735 }, { "entropy": 7.262284660339356, "epoch": 0.6360120326600773, "grad_norm": 1.078125, "learning_rate": 0.0003695, "loss": 7.0248, "mean_token_accuracy": 0.11004846841096878, "num_tokens": 1342376.0, "step": 740 }, { "entropy": 7.526681852340698, "epoch": 0.6403094112591319, "grad_norm": 1.1484375, "learning_rate": 0.000372, "loss": 7.0693, "mean_token_accuracy": 0.10503109246492386, "num_tokens": 1351386.0, "step": 745 }, { "entropy": 7.364239978790283, "epoch": 0.6446067898581865, "grad_norm": 1.265625, "learning_rate": 0.0003745, "loss": 6.9832, "mean_token_accuracy": 0.11761592403054237, "num_tokens": 1358958.0, "step": 750 }, { "entropy": 7.496349859237671, "epoch": 0.6489041684572411, "grad_norm": 1.109375, "learning_rate": 0.000377, "loss": 7.1231, "mean_token_accuracy": 0.10967899858951569, "num_tokens": 1368599.0, "step": 755 }, { "entropy": 7.435608530044556, "epoch": 0.6532015470562956, "grad_norm": 1.890625, "learning_rate": 0.0003795, "loss": 7.1433, "mean_token_accuracy": 0.1064300425350666, "num_tokens": 1378529.0, "step": 760 }, { "entropy": 7.344243001937866, "epoch": 0.6574989256553503, "grad_norm": 1.25, "learning_rate": 0.000382, "loss": 6.9306, "mean_token_accuracy": 0.11750481277704239, "num_tokens": 1386993.0, "step": 765 }, { "entropy": 7.390715217590332, "epoch": 0.6617963042544048, "grad_norm": 1.5, "learning_rate": 0.0003845, "loss": 7.0322, "mean_token_accuracy": 0.11829963177442551, "num_tokens": 1395790.0, "step": 770 }, { "entropy": 7.302670812606811, "epoch": 0.6660936828534594, "grad_norm": 1.078125, "learning_rate": 0.00038700000000000003, "loss": 7.0393, "mean_token_accuracy": 0.11235549300909042, "num_tokens": 1405587.0, "step": 775 }, { "entropy": 7.348860168457032, "epoch": 0.6703910614525139, "grad_norm": 1.0390625, "learning_rate": 0.00038950000000000003, "loss": 6.9999, "mean_token_accuracy": 0.11504087448120118, "num_tokens": 1414478.0, "step": 780 }, { "entropy": 7.428205347061157, "epoch": 0.6746884400515686, "grad_norm": 1.375, "learning_rate": 0.00039200000000000004, "loss": 7.0623, "mean_token_accuracy": 0.11534775421023369, "num_tokens": 1423791.0, "step": 785 }, { "entropy": 7.467832851409912, "epoch": 0.6789858186506231, "grad_norm": 1.234375, "learning_rate": 0.00039450000000000005, "loss": 7.1014, "mean_token_accuracy": 0.10728210881352425, "num_tokens": 1432955.0, "step": 790 }, { "entropy": 7.385548782348633, "epoch": 0.6832831972496777, "grad_norm": 0.99609375, "learning_rate": 0.00039700000000000005, "loss": 7.074, "mean_token_accuracy": 0.1087567687034607, "num_tokens": 1441907.0, "step": 795 }, { "entropy": 7.290066146850586, "epoch": 0.6875805758487322, "grad_norm": 1.203125, "learning_rate": 0.0003995, "loss": 6.935, "mean_token_accuracy": 0.11768098697066307, "num_tokens": 1451062.0, "step": 800 }, { "entropy": 7.399672508239746, "epoch": 0.6918779544477869, "grad_norm": 1.0234375, "learning_rate": 0.000402, "loss": 7.0218, "mean_token_accuracy": 0.10959179401397705, "num_tokens": 1460132.0, "step": 805 }, { "entropy": 7.272280263900757, "epoch": 0.6961753330468414, "grad_norm": 1.0625, "learning_rate": 0.0004045, "loss": 6.9141, "mean_token_accuracy": 0.11885375007987023, "num_tokens": 1469582.0, "step": 810 }, { "entropy": 7.255832242965698, "epoch": 0.700472711645896, "grad_norm": 1.3515625, "learning_rate": 0.00040699999999999997, "loss": 7.012, "mean_token_accuracy": 0.10950389429926873, "num_tokens": 1479053.0, "step": 815 }, { "entropy": 7.313858604431152, "epoch": 0.7047700902449506, "grad_norm": 1.21875, "learning_rate": 0.0004095, "loss": 7.0142, "mean_token_accuracy": 0.11343196108937263, "num_tokens": 1488189.0, "step": 820 }, { "entropy": 7.236453676223755, "epoch": 0.7090674688440052, "grad_norm": 1.046875, "learning_rate": 0.000412, "loss": 6.8662, "mean_token_accuracy": 0.12046442031860352, "num_tokens": 1497324.0, "step": 825 }, { "entropy": 7.310264635086059, "epoch": 0.7133648474430597, "grad_norm": 1.015625, "learning_rate": 0.0004145, "loss": 6.9814, "mean_token_accuracy": 0.11739002540707588, "num_tokens": 1506543.0, "step": 830 }, { "entropy": 7.289929437637329, "epoch": 0.7176622260421143, "grad_norm": 1.109375, "learning_rate": 0.000417, "loss": 6.9742, "mean_token_accuracy": 0.12236066460609436, "num_tokens": 1516737.0, "step": 835 }, { "entropy": 7.161224508285523, "epoch": 0.7219596046411689, "grad_norm": 1.046875, "learning_rate": 0.0004195, "loss": 6.8503, "mean_token_accuracy": 0.11500222384929656, "num_tokens": 1525561.0, "step": 840 }, { "entropy": 7.280500030517578, "epoch": 0.7262569832402235, "grad_norm": 1.1328125, "learning_rate": 0.000422, "loss": 6.8765, "mean_token_accuracy": 0.1242159940302372, "num_tokens": 1533323.0, "step": 845 }, { "entropy": 7.292038059234619, "epoch": 0.730554361839278, "grad_norm": 1.1875, "learning_rate": 0.0004245, "loss": 6.9379, "mean_token_accuracy": 0.12142991349101066, "num_tokens": 1542632.0, "step": 850 }, { "entropy": 7.305912923812866, "epoch": 0.7348517404383326, "grad_norm": 1.265625, "learning_rate": 0.000427, "loss": 6.8775, "mean_token_accuracy": 0.12107516825199127, "num_tokens": 1551236.0, "step": 855 }, { "entropy": 7.118098545074463, "epoch": 0.7391491190373872, "grad_norm": 1.15625, "learning_rate": 0.0004295, "loss": 6.878, "mean_token_accuracy": 0.12266490310430526, "num_tokens": 1559674.0, "step": 860 }, { "entropy": 7.268103885650635, "epoch": 0.7434464976364418, "grad_norm": 1.09375, "learning_rate": 0.000432, "loss": 6.9687, "mean_token_accuracy": 0.1217973381280899, "num_tokens": 1569481.0, "step": 865 }, { "entropy": 7.2675707817077635, "epoch": 0.7477438762354963, "grad_norm": 1.0859375, "learning_rate": 0.0004345, "loss": 6.9975, "mean_token_accuracy": 0.11359266638755798, "num_tokens": 1578488.0, "step": 870 }, { "entropy": 7.171451759338379, "epoch": 0.752041254834551, "grad_norm": 1.0625, "learning_rate": 0.000437, "loss": 6.8946, "mean_token_accuracy": 0.11810402423143387, "num_tokens": 1586675.0, "step": 875 }, { "entropy": 7.285072469711304, "epoch": 0.7563386334336055, "grad_norm": 1.0859375, "learning_rate": 0.0004395, "loss": 7.0021, "mean_token_accuracy": 0.10800698548555374, "num_tokens": 1595411.0, "step": 880 }, { "entropy": 7.312672233581543, "epoch": 0.7606360120326601, "grad_norm": 1.1953125, "learning_rate": 0.000442, "loss": 6.9755, "mean_token_accuracy": 0.11759781166911125, "num_tokens": 1604046.0, "step": 885 }, { "entropy": 7.245748281478882, "epoch": 0.7649333906317146, "grad_norm": 1.0859375, "learning_rate": 0.0004445, "loss": 6.9643, "mean_token_accuracy": 0.11201045587658882, "num_tokens": 1613759.0, "step": 890 }, { "entropy": 7.238279533386231, "epoch": 0.7692307692307693, "grad_norm": 1.015625, "learning_rate": 0.000447, "loss": 6.9209, "mean_token_accuracy": 0.11877147182822227, "num_tokens": 1623323.0, "step": 895 }, { "entropy": 7.230697107315064, "epoch": 0.7735281478298238, "grad_norm": 1.1328125, "learning_rate": 0.00044950000000000003, "loss": 6.9005, "mean_token_accuracy": 0.11391794160008431, "num_tokens": 1631727.0, "step": 900 }, { "entropy": 7.194222545623779, "epoch": 0.7778255264288784, "grad_norm": 1.1875, "learning_rate": 0.00045200000000000004, "loss": 6.8583, "mean_token_accuracy": 0.12049278989434242, "num_tokens": 1639544.0, "step": 905 }, { "entropy": 7.284112405776978, "epoch": 0.7821229050279329, "grad_norm": 1.125, "learning_rate": 0.00045450000000000004, "loss": 6.9773, "mean_token_accuracy": 0.11113567724823951, "num_tokens": 1648931.0, "step": 910 }, { "entropy": 7.1627342224121096, "epoch": 0.7864202836269876, "grad_norm": 1.15625, "learning_rate": 0.00045700000000000005, "loss": 6.8345, "mean_token_accuracy": 0.12127922549843788, "num_tokens": 1657688.0, "step": 915 }, { "entropy": 7.259271335601807, "epoch": 0.7907176622260421, "grad_norm": 1.0390625, "learning_rate": 0.00045950000000000006, "loss": 6.9244, "mean_token_accuracy": 0.11565326899290085, "num_tokens": 1666879.0, "step": 920 }, { "entropy": 7.1275458335876465, "epoch": 0.7950150408250967, "grad_norm": 1.109375, "learning_rate": 0.000462, "loss": 6.8982, "mean_token_accuracy": 0.118662890791893, "num_tokens": 1676773.0, "step": 925 }, { "entropy": 7.2360998630523685, "epoch": 0.7993124194241513, "grad_norm": 1.0859375, "learning_rate": 0.0004645, "loss": 7.0092, "mean_token_accuracy": 0.11184348464012146, "num_tokens": 1686144.0, "step": 930 }, { "entropy": 7.26247010231018, "epoch": 0.8036097980232059, "grad_norm": 1.078125, "learning_rate": 0.000467, "loss": 6.9646, "mean_token_accuracy": 0.10949353277683258, "num_tokens": 1695476.0, "step": 935 }, { "entropy": 7.174946022033692, "epoch": 0.8079071766222604, "grad_norm": 1.046875, "learning_rate": 0.0004695, "loss": 6.8498, "mean_token_accuracy": 0.12084392830729485, "num_tokens": 1704907.0, "step": 940 }, { "entropy": 7.166734504699707, "epoch": 0.812204555221315, "grad_norm": 0.9609375, "learning_rate": 0.000472, "loss": 6.8948, "mean_token_accuracy": 0.12091493904590607, "num_tokens": 1714564.0, "step": 945 }, { "entropy": 7.244975614547729, "epoch": 0.8165019338203696, "grad_norm": 1.1171875, "learning_rate": 0.0004745, "loss": 6.9209, "mean_token_accuracy": 0.1155279442667961, "num_tokens": 1725285.0, "step": 950 }, { "entropy": 7.1149109363555905, "epoch": 0.8207993124194242, "grad_norm": 1.03125, "learning_rate": 0.000477, "loss": 6.9153, "mean_token_accuracy": 0.11715079098939896, "num_tokens": 1734331.0, "step": 955 }, { "entropy": 7.227117824554443, "epoch": 0.8250966910184787, "grad_norm": 1.2578125, "learning_rate": 0.0004795, "loss": 6.852, "mean_token_accuracy": 0.11185217499732972, "num_tokens": 1742340.0, "step": 960 }, { "entropy": 7.160442066192627, "epoch": 0.8293940696175333, "grad_norm": 1.109375, "learning_rate": 0.000482, "loss": 6.8351, "mean_token_accuracy": 0.12198592498898506, "num_tokens": 1751725.0, "step": 965 }, { "entropy": 6.999344539642334, "epoch": 0.8336914482165879, "grad_norm": 1.1328125, "learning_rate": 0.0004845, "loss": 6.7683, "mean_token_accuracy": 0.12398558706045151, "num_tokens": 1760294.0, "step": 970 }, { "entropy": 7.112461137771606, "epoch": 0.8379888268156425, "grad_norm": 1.0546875, "learning_rate": 0.000487, "loss": 6.8275, "mean_token_accuracy": 0.11639805063605309, "num_tokens": 1768912.0, "step": 975 }, { "entropy": 7.257990169525146, "epoch": 0.842286205414697, "grad_norm": 1.0390625, "learning_rate": 0.0004895, "loss": 7.0148, "mean_token_accuracy": 0.12016609534621239, "num_tokens": 1778633.0, "step": 980 }, { "entropy": 7.1191816329956055, "epoch": 0.8465835840137517, "grad_norm": 1.1171875, "learning_rate": 0.000492, "loss": 6.8847, "mean_token_accuracy": 0.11811531409621238, "num_tokens": 1787275.0, "step": 985 }, { "entropy": 7.235857200622559, "epoch": 0.8508809626128062, "grad_norm": 1.2578125, "learning_rate": 0.0004945, "loss": 6.8878, "mean_token_accuracy": 0.11604067236185074, "num_tokens": 1795994.0, "step": 990 }, { "entropy": 7.036646842956543, "epoch": 0.8551783412118608, "grad_norm": 0.8359375, "learning_rate": 0.000497, "loss": 6.804, "mean_token_accuracy": 0.11985133662819862, "num_tokens": 1806379.0, "step": 995 }, { "entropy": 7.154667520523072, "epoch": 0.8594757198109153, "grad_norm": 1.0546875, "learning_rate": 0.0004995, "loss": 6.8296, "mean_token_accuracy": 0.1270947828888893, "num_tokens": 1816135.0, "step": 1000 }, { "epoch": 0.8594757198109153, "eval_entropy": 6.812919497489929, "eval_loss": 6.8574419021606445, "eval_mean_token_accuracy": 0.12292942362795542, "eval_num_tokens": 1816135.0, "eval_runtime": 2.0522, "eval_samples_per_second": 1729.37, "eval_steps_per_second": 216.354, "step": 1000 }, { "entropy": 7.122643280029297, "epoch": 0.86377309840997, "grad_norm": 1.2734375, "learning_rate": 0.0004999998427807679, "loss": 6.8305, "mean_token_accuracy": 0.12133256047964096, "num_tokens": 1824777.0, "step": 1005 }, { "entropy": 7.058982563018799, "epoch": 0.8680704770090245, "grad_norm": 1.234375, "learning_rate": 0.0004999992040780138, "loss": 6.8924, "mean_token_accuracy": 0.12320492565631866, "num_tokens": 1833807.0, "step": 1010 }, { "entropy": 7.185050773620605, "epoch": 0.8723678556080791, "grad_norm": 1.0078125, "learning_rate": 0.0004999980740669294, "loss": 6.8357, "mean_token_accuracy": 0.11969011649489403, "num_tokens": 1843375.0, "step": 1015 }, { "entropy": 7.11086139678955, "epoch": 0.8766652342071336, "grad_norm": 1.140625, "learning_rate": 0.0004999964527499823, "loss": 6.9058, "mean_token_accuracy": 0.11237111985683441, "num_tokens": 1853036.0, "step": 1020 }, { "entropy": 7.120519638061523, "epoch": 0.8809626128061883, "grad_norm": 1.0703125, "learning_rate": 0.0004999943401307127, "loss": 6.8707, "mean_token_accuracy": 0.11769452393054962, "num_tokens": 1862041.0, "step": 1025 }, { "entropy": 7.087871503829956, "epoch": 0.8852599914052428, "grad_norm": 1.1015625, "learning_rate": 0.0004999917362137337, "loss": 6.7742, "mean_token_accuracy": 0.1225271351635456, "num_tokens": 1870707.0, "step": 1030 }, { "entropy": 7.055140686035156, "epoch": 0.8895573700042974, "grad_norm": 1.078125, "learning_rate": 0.0004999886410047312, "loss": 6.7705, "mean_token_accuracy": 0.11845692843198777, "num_tokens": 1879787.0, "step": 1035 }, { "entropy": 7.138674926757813, "epoch": 0.8938547486033519, "grad_norm": 0.98828125, "learning_rate": 0.0004999850545104638, "loss": 6.8315, "mean_token_accuracy": 0.1223653219640255, "num_tokens": 1889413.0, "step": 1040 }, { "entropy": 7.048402404785156, "epoch": 0.8981521272024066, "grad_norm": 1.171875, "learning_rate": 0.0004999809767387633, "loss": 6.8174, "mean_token_accuracy": 0.12110616937279702, "num_tokens": 1898283.0, "step": 1045 }, { "entropy": 7.144178056716919, "epoch": 0.9024495058014611, "grad_norm": 1.0546875, "learning_rate": 0.0004999764076985337, "loss": 6.8287, "mean_token_accuracy": 0.12670400962233544, "num_tokens": 1907175.0, "step": 1050 }, { "entropy": 6.988327312469482, "epoch": 0.9067468844005157, "grad_norm": 1.09375, "learning_rate": 0.0004999713473997519, "loss": 6.8824, "mean_token_accuracy": 0.11774980947375298, "num_tokens": 1918223.0, "step": 1055 }, { "entropy": 7.124748563766479, "epoch": 0.9110442629995703, "grad_norm": 1.09375, "learning_rate": 0.0004999657958534677, "loss": 6.8312, "mean_token_accuracy": 0.1194000355899334, "num_tokens": 1928801.0, "step": 1060 }, { "entropy": 7.008511686325074, "epoch": 0.9153416415986249, "grad_norm": 1.1328125, "learning_rate": 0.0004999597530718034, "loss": 6.7896, "mean_token_accuracy": 0.12186847031116485, "num_tokens": 1937406.0, "step": 1065 }, { "entropy": 6.997484445571899, "epoch": 0.9196390201976794, "grad_norm": 1.078125, "learning_rate": 0.000499953219067954, "loss": 6.7932, "mean_token_accuracy": 0.11857569143176079, "num_tokens": 1947184.0, "step": 1070 }, { "entropy": 7.135808944702148, "epoch": 0.923936398796734, "grad_norm": 1.09375, "learning_rate": 0.0004999461938561873, "loss": 6.8139, "mean_token_accuracy": 0.12288291603326798, "num_tokens": 1956293.0, "step": 1075 }, { "entropy": 7.027012157440185, "epoch": 0.9282337773957886, "grad_norm": 1.1328125, "learning_rate": 0.0004999386774518432, "loss": 6.7854, "mean_token_accuracy": 0.11997194737195968, "num_tokens": 1964791.0, "step": 1080 }, { "entropy": 6.975531768798828, "epoch": 0.9325311559948432, "grad_norm": 1.0703125, "learning_rate": 0.0004999306698713349, "loss": 6.7088, "mean_token_accuracy": 0.12559010088443756, "num_tokens": 1973754.0, "step": 1085 }, { "entropy": 7.052453565597534, "epoch": 0.9368285345938977, "grad_norm": 1.078125, "learning_rate": 0.0004999221711321477, "loss": 6.7738, "mean_token_accuracy": 0.12475829720497131, "num_tokens": 1983035.0, "step": 1090 }, { "entropy": 6.906819009780884, "epoch": 0.9411259131929522, "grad_norm": 1.0703125, "learning_rate": 0.0004999131812528393, "loss": 6.8003, "mean_token_accuracy": 0.12229804769158363, "num_tokens": 1992584.0, "step": 1095 }, { "entropy": 7.109902429580688, "epoch": 0.9454232917920069, "grad_norm": 0.97265625, "learning_rate": 0.00049990370025304, "loss": 6.8193, "mean_token_accuracy": 0.12188051193952561, "num_tokens": 2001876.0, "step": 1100 }, { "entropy": 7.017454195022583, "epoch": 0.9497206703910615, "grad_norm": 0.97265625, "learning_rate": 0.0004998937281534526, "loss": 6.7115, "mean_token_accuracy": 0.1300358146429062, "num_tokens": 2011067.0, "step": 1105 }, { "entropy": 7.091220808029175, "epoch": 0.954018048990116, "grad_norm": 1.09375, "learning_rate": 0.0004998832649758521, "loss": 6.8077, "mean_token_accuracy": 0.12548175528645517, "num_tokens": 2020763.0, "step": 1110 }, { "entropy": 6.9685986042022705, "epoch": 0.9583154275891707, "grad_norm": 1.1796875, "learning_rate": 0.0004998723107430862, "loss": 6.7867, "mean_token_accuracy": 0.12391732335090637, "num_tokens": 2029534.0, "step": 1115 }, { "entropy": 7.046098041534424, "epoch": 0.9626128061882252, "grad_norm": 1.09375, "learning_rate": 0.0004998608654790741, "loss": 6.7311, "mean_token_accuracy": 0.12396327033638954, "num_tokens": 2039143.0, "step": 1120 }, { "entropy": 6.939239406585694, "epoch": 0.9669101847872797, "grad_norm": 1.125, "learning_rate": 0.000499848929208808, "loss": 6.7022, "mean_token_accuracy": 0.1295892022550106, "num_tokens": 2048253.0, "step": 1125 }, { "entropy": 6.931437301635742, "epoch": 0.9712075633863343, "grad_norm": 1.1484375, "learning_rate": 0.0004998365019583519, "loss": 6.7428, "mean_token_accuracy": 0.13122318536043168, "num_tokens": 2057234.0, "step": 1130 }, { "entropy": 7.081391954421997, "epoch": 0.975504941985389, "grad_norm": 1.1953125, "learning_rate": 0.0004998235837548417, "loss": 6.7881, "mean_token_accuracy": 0.1271953523159027, "num_tokens": 2065431.0, "step": 1135 }, { "entropy": 6.974546146392822, "epoch": 0.9798023205844435, "grad_norm": 1.0625, "learning_rate": 0.000499810174626486, "loss": 6.7888, "mean_token_accuracy": 0.1228917419910431, "num_tokens": 2074723.0, "step": 1140 }, { "entropy": 7.011039209365845, "epoch": 0.984099699183498, "grad_norm": 1.1953125, "learning_rate": 0.0004997962746025646, "loss": 6.6544, "mean_token_accuracy": 0.13169871941208838, "num_tokens": 2084509.0, "step": 1145 }, { "entropy": 6.973200798034668, "epoch": 0.9883970777825526, "grad_norm": 1.21875, "learning_rate": 0.0004997818837134298, "loss": 6.8028, "mean_token_accuracy": 0.12382483929395675, "num_tokens": 2093110.0, "step": 1150 }, { "entropy": 6.879178285598755, "epoch": 0.9926944563816072, "grad_norm": 1.125, "learning_rate": 0.0004997670019905057, "loss": 6.6634, "mean_token_accuracy": 0.12532600611448289, "num_tokens": 2102355.0, "step": 1155 }, { "entropy": 6.967250823974609, "epoch": 0.9969918349806618, "grad_norm": 1.171875, "learning_rate": 0.0004997516294662876, "loss": 6.6987, "mean_token_accuracy": 0.12651606351137162, "num_tokens": 2110418.0, "step": 1160 }, { "entropy": 6.987489064534505, "epoch": 1.0008594757198108, "grad_norm": 1.1484375, "learning_rate": 0.0004997357661743433, "loss": 6.6851, "mean_token_accuracy": 0.12885562578837076, "num_tokens": 2117866.0, "step": 1165 }, { "entropy": 6.906875991821289, "epoch": 1.0051568543188656, "grad_norm": 1.09375, "learning_rate": 0.0004997194121493118, "loss": 6.5242, "mean_token_accuracy": 0.1341039627790451, "num_tokens": 2126082.0, "step": 1170 }, { "entropy": 6.9217222213745115, "epoch": 1.0094542329179201, "grad_norm": 1.078125, "learning_rate": 0.0004997025674269037, "loss": 6.496, "mean_token_accuracy": 0.14013660922646523, "num_tokens": 2134042.0, "step": 1175 }, { "entropy": 6.853777265548706, "epoch": 1.0137516115169747, "grad_norm": 1.1953125, "learning_rate": 0.0004996852320439013, "loss": 6.5756, "mean_token_accuracy": 0.13146138042211533, "num_tokens": 2142570.0, "step": 1180 }, { "entropy": 6.882978248596191, "epoch": 1.0180489901160292, "grad_norm": 0.9765625, "learning_rate": 0.0004996674060381578, "loss": 6.5116, "mean_token_accuracy": 0.13583723902702333, "num_tokens": 2151310.0, "step": 1185 }, { "entropy": 6.949011325836182, "epoch": 1.0223463687150838, "grad_norm": 1.09375, "learning_rate": 0.0004996490894485985, "loss": 6.5696, "mean_token_accuracy": 0.1317083679139614, "num_tokens": 2160662.0, "step": 1190 }, { "entropy": 6.906634664535522, "epoch": 1.0266437473141383, "grad_norm": 1.078125, "learning_rate": 0.0004996302823152193, "loss": 6.5221, "mean_token_accuracy": 0.132858457416296, "num_tokens": 2170067.0, "step": 1195 }, { "entropy": 6.835825204849243, "epoch": 1.0309411259131929, "grad_norm": 1.09375, "learning_rate": 0.0004996109846790873, "loss": 6.4844, "mean_token_accuracy": 0.13565613552927971, "num_tokens": 2178850.0, "step": 1200 }, { "entropy": 6.833173513412476, "epoch": 1.0352385045122476, "grad_norm": 0.984375, "learning_rate": 0.0004995911965823412, "loss": 6.5058, "mean_token_accuracy": 0.14241415858268738, "num_tokens": 2188307.0, "step": 1205 }, { "entropy": 6.888755178451538, "epoch": 1.0395358831113022, "grad_norm": 1.171875, "learning_rate": 0.0004995709180681899, "loss": 6.5098, "mean_token_accuracy": 0.14214854687452316, "num_tokens": 2197026.0, "step": 1210 }, { "entropy": 6.828827667236328, "epoch": 1.0438332617103567, "grad_norm": 1.109375, "learning_rate": 0.000499550149180914, "loss": 6.4795, "mean_token_accuracy": 0.13599886670708655, "num_tokens": 2205537.0, "step": 1215 }, { "entropy": 6.880095815658569, "epoch": 1.0481306403094113, "grad_norm": 1.15625, "learning_rate": 0.0004995288899658641, "loss": 6.5128, "mean_token_accuracy": 0.14047559648752211, "num_tokens": 2214508.0, "step": 1220 }, { "entropy": 6.848831415176392, "epoch": 1.0524280189084658, "grad_norm": 1.1796875, "learning_rate": 0.0004995071404694619, "loss": 6.6248, "mean_token_accuracy": 0.1286735638976097, "num_tokens": 2223084.0, "step": 1225 }, { "entropy": 6.930538558959961, "epoch": 1.0567253975075204, "grad_norm": 1.0546875, "learning_rate": 0.0004994849007391996, "loss": 6.5507, "mean_token_accuracy": 0.12893568202853203, "num_tokens": 2231406.0, "step": 1230 }, { "entropy": 6.784887790679932, "epoch": 1.061022776106575, "grad_norm": 1.0859375, "learning_rate": 0.0004994621708236401, "loss": 6.4682, "mean_token_accuracy": 0.136442781239748, "num_tokens": 2239867.0, "step": 1235 }, { "entropy": 6.8624866008758545, "epoch": 1.0653201547056295, "grad_norm": 1.203125, "learning_rate": 0.000499438950772416, "loss": 6.5264, "mean_token_accuracy": 0.1343722127377987, "num_tokens": 2248844.0, "step": 1240 }, { "entropy": 6.764705419540405, "epoch": 1.0696175333046842, "grad_norm": 1.125, "learning_rate": 0.0004994152406362311, "loss": 6.4525, "mean_token_accuracy": 0.14018251076340676, "num_tokens": 2257599.0, "step": 1245 }, { "entropy": 6.871714019775391, "epoch": 1.0739149119037388, "grad_norm": 1.2421875, "learning_rate": 0.0004993910404668586, "loss": 6.4992, "mean_token_accuracy": 0.1316287100315094, "num_tokens": 2266510.0, "step": 1250 }, { "entropy": 6.801673936843872, "epoch": 1.0782122905027933, "grad_norm": 1.0, "learning_rate": 0.000499366350317142, "loss": 6.4902, "mean_token_accuracy": 0.1355181120336056, "num_tokens": 2275462.0, "step": 1255 }, { "entropy": 6.805047512054443, "epoch": 1.0825096691018479, "grad_norm": 1.1484375, "learning_rate": 0.0004993411702409948, "loss": 6.4684, "mean_token_accuracy": 0.13499311953783036, "num_tokens": 2283826.0, "step": 1260 }, { "entropy": 6.796231460571289, "epoch": 1.0868070477009024, "grad_norm": 1.171875, "learning_rate": 0.0004993155002934002, "loss": 6.4758, "mean_token_accuracy": 0.13739539608359336, "num_tokens": 2292967.0, "step": 1265 }, { "entropy": 6.935551691055298, "epoch": 1.091104426299957, "grad_norm": 1.5078125, "learning_rate": 0.0004992893405304111, "loss": 6.6091, "mean_token_accuracy": 0.13493912890553475, "num_tokens": 2302336.0, "step": 1270 }, { "entropy": 6.757972192764282, "epoch": 1.0954018048990115, "grad_norm": 1.03125, "learning_rate": 0.00049926269100915, "loss": 6.5039, "mean_token_accuracy": 0.14085786640644074, "num_tokens": 2311465.0, "step": 1275 }, { "entropy": 6.884800767898559, "epoch": 1.0996991834980663, "grad_norm": 1.0859375, "learning_rate": 0.0004992355517878087, "loss": 6.6134, "mean_token_accuracy": 0.12797435671091079, "num_tokens": 2320281.0, "step": 1280 }, { "entropy": 6.775428581237793, "epoch": 1.1039965620971208, "grad_norm": 1.15625, "learning_rate": 0.0004992079229256484, "loss": 6.5189, "mean_token_accuracy": 0.1329084627330303, "num_tokens": 2329755.0, "step": 1285 }, { "entropy": 6.721524858474732, "epoch": 1.1082939406961754, "grad_norm": 1.015625, "learning_rate": 0.0004991798044829996, "loss": 6.4524, "mean_token_accuracy": 0.1344260886311531, "num_tokens": 2338807.0, "step": 1290 }, { "entropy": 6.870701122283935, "epoch": 1.11259131929523, "grad_norm": 1.109375, "learning_rate": 0.0004991511965212618, "loss": 6.5591, "mean_token_accuracy": 0.13554905205965043, "num_tokens": 2348056.0, "step": 1295 }, { "entropy": 6.759064626693726, "epoch": 1.1168886978942845, "grad_norm": 1.0546875, "learning_rate": 0.0004991220991029032, "loss": 6.5619, "mean_token_accuracy": 0.13164993077516557, "num_tokens": 2357780.0, "step": 1300 }, { "entropy": 6.845104169845581, "epoch": 1.121186076493339, "grad_norm": 1.296875, "learning_rate": 0.000499092512291461, "loss": 6.526, "mean_token_accuracy": 0.13971479684114457, "num_tokens": 2367060.0, "step": 1305 }, { "entropy": 6.800533056259155, "epoch": 1.1254834550923936, "grad_norm": 1.0859375, "learning_rate": 0.000499062436151541, "loss": 6.5277, "mean_token_accuracy": 0.13263508304953575, "num_tokens": 2375751.0, "step": 1310 }, { "entropy": 6.890619134902954, "epoch": 1.129780833691448, "grad_norm": 1.109375, "learning_rate": 0.0004990318707488173, "loss": 6.5788, "mean_token_accuracy": 0.12899956330657006, "num_tokens": 2385013.0, "step": 1315 }, { "entropy": 6.769053792953491, "epoch": 1.1340782122905029, "grad_norm": 1.140625, "learning_rate": 0.0004990008161500327, "loss": 6.48, "mean_token_accuracy": 0.1359359547495842, "num_tokens": 2392935.0, "step": 1320 }, { "entropy": 6.7767839431762695, "epoch": 1.1383755908895574, "grad_norm": 1.2109375, "learning_rate": 0.000498969272422998, "loss": 6.4887, "mean_token_accuracy": 0.13946662694215775, "num_tokens": 2401560.0, "step": 1325 }, { "entropy": 6.732125520706177, "epoch": 1.142672969488612, "grad_norm": 1.0546875, "learning_rate": 0.0004989372396365921, "loss": 6.4183, "mean_token_accuracy": 0.13894038647413254, "num_tokens": 2410050.0, "step": 1330 }, { "entropy": 6.8855541229248045, "epoch": 1.1469703480876665, "grad_norm": 1.1015625, "learning_rate": 0.0004989047178607618, "loss": 6.5218, "mean_token_accuracy": 0.13579266518354416, "num_tokens": 2418980.0, "step": 1335 }, { "entropy": 6.7566611766815186, "epoch": 1.151267726686721, "grad_norm": 1.09375, "learning_rate": 0.0004988717071665215, "loss": 6.5177, "mean_token_accuracy": 0.13580050468444824, "num_tokens": 2427992.0, "step": 1340 }, { "entropy": 6.821787118911743, "epoch": 1.1555651052857756, "grad_norm": 0.99609375, "learning_rate": 0.0004988382076259537, "loss": 6.4297, "mean_token_accuracy": 0.1417124703526497, "num_tokens": 2436368.0, "step": 1345 }, { "entropy": 6.65723991394043, "epoch": 1.1598624838848304, "grad_norm": 1.0, "learning_rate": 0.0004988042193122077, "loss": 6.4243, "mean_token_accuracy": 0.1399266541004181, "num_tokens": 2445499.0, "step": 1350 }, { "entropy": 6.846164894104004, "epoch": 1.164159862483885, "grad_norm": 1.171875, "learning_rate": 0.0004987697422995005, "loss": 6.4564, "mean_token_accuracy": 0.13335739225149154, "num_tokens": 2454312.0, "step": 1355 }, { "entropy": 6.705566883087158, "epoch": 1.1684572410829395, "grad_norm": 1.0625, "learning_rate": 0.0004987347766631161, "loss": 6.5179, "mean_token_accuracy": 0.13981100916862488, "num_tokens": 2462922.0, "step": 1360 }, { "entropy": 6.8054440975189205, "epoch": 1.172754619681994, "grad_norm": 1.046875, "learning_rate": 0.0004986993224794055, "loss": 6.5574, "mean_token_accuracy": 0.12931617349386215, "num_tokens": 2472195.0, "step": 1365 }, { "entropy": 6.731846857070923, "epoch": 1.1770519982810486, "grad_norm": 1.171875, "learning_rate": 0.0004986633798257865, "loss": 6.456, "mean_token_accuracy": 0.13557855412364006, "num_tokens": 2481021.0, "step": 1370 }, { "entropy": 6.709754800796508, "epoch": 1.181349376880103, "grad_norm": 1.171875, "learning_rate": 0.0004986269487807434, "loss": 6.4682, "mean_token_accuracy": 0.13462188541889192, "num_tokens": 2490250.0, "step": 1375 }, { "entropy": 6.8344573974609375, "epoch": 1.1856467554791577, "grad_norm": 1.0625, "learning_rate": 0.000498590029423827, "loss": 6.529, "mean_token_accuracy": 0.13892517015337943, "num_tokens": 2499122.0, "step": 1380 }, { "entropy": 6.794313240051269, "epoch": 1.1899441340782122, "grad_norm": 1.109375, "learning_rate": 0.0004985526218356546, "loss": 6.5102, "mean_token_accuracy": 0.13186247944831847, "num_tokens": 2508454.0, "step": 1385 }, { "entropy": 6.717947912216187, "epoch": 1.1942415126772667, "grad_norm": 1.09375, "learning_rate": 0.0004985147260979093, "loss": 6.449, "mean_token_accuracy": 0.1434843860566616, "num_tokens": 2517353.0, "step": 1390 }, { "entropy": 6.771858787536621, "epoch": 1.1985388912763215, "grad_norm": 1.140625, "learning_rate": 0.0004984763422933402, "loss": 6.4618, "mean_token_accuracy": 0.13847233429551126, "num_tokens": 2526321.0, "step": 1395 }, { "entropy": 6.732237863540649, "epoch": 1.202836269875376, "grad_norm": 0.984375, "learning_rate": 0.0004984374705057623, "loss": 6.5033, "mean_token_accuracy": 0.13528537154197692, "num_tokens": 2535924.0, "step": 1400 }, { "entropy": 6.721146202087402, "epoch": 1.2071336484744306, "grad_norm": 1.1484375, "learning_rate": 0.0004983981108200561, "loss": 6.4711, "mean_token_accuracy": 0.13535311296582223, "num_tokens": 2545606.0, "step": 1405 }, { "entropy": 6.733812093734741, "epoch": 1.2114310270734852, "grad_norm": 1.125, "learning_rate": 0.0004983582633221672, "loss": 6.4601, "mean_token_accuracy": 0.1369933992624283, "num_tokens": 2554947.0, "step": 1410 }, { "entropy": 6.855603933334351, "epoch": 1.2157284056725397, "grad_norm": 0.984375, "learning_rate": 0.0004983179280991068, "loss": 6.6134, "mean_token_accuracy": 0.12978528887033464, "num_tokens": 2564462.0, "step": 1415 }, { "entropy": 6.726688861846924, "epoch": 1.2200257842715942, "grad_norm": 1.09375, "learning_rate": 0.0004982771052389508, "loss": 6.4475, "mean_token_accuracy": 0.1368112660944462, "num_tokens": 2573124.0, "step": 1420 }, { "entropy": 6.807424783706665, "epoch": 1.224323162870649, "grad_norm": 1.1015625, "learning_rate": 0.0004982357948308401, "loss": 6.5481, "mean_token_accuracy": 0.13265790268778802, "num_tokens": 2581829.0, "step": 1425 }, { "entropy": 6.770775365829468, "epoch": 1.2286205414697036, "grad_norm": 1.1015625, "learning_rate": 0.0004981939969649799, "loss": 6.4049, "mean_token_accuracy": 0.14194427505135537, "num_tokens": 2590631.0, "step": 1430 }, { "entropy": 6.709357166290284, "epoch": 1.232917920068758, "grad_norm": 1.1640625, "learning_rate": 0.0004981517117326404, "loss": 6.5216, "mean_token_accuracy": 0.13609697446227073, "num_tokens": 2600684.0, "step": 1435 }, { "entropy": 6.725667095184326, "epoch": 1.2372152986678127, "grad_norm": 1.046875, "learning_rate": 0.0004981089392261553, "loss": 6.4349, "mean_token_accuracy": 0.14131608307361604, "num_tokens": 2609667.0, "step": 1440 }, { "entropy": 6.692513275146484, "epoch": 1.2415126772668672, "grad_norm": 0.99609375, "learning_rate": 0.000498065679538923, "loss": 6.5055, "mean_token_accuracy": 0.14114993885159494, "num_tokens": 2620025.0, "step": 1445 }, { "entropy": 6.7513340473175045, "epoch": 1.2458100558659218, "grad_norm": 1.125, "learning_rate": 0.0004980219327654049, "loss": 6.428, "mean_token_accuracy": 0.13774933964014052, "num_tokens": 2629032.0, "step": 1450 }, { "entropy": 6.702835464477539, "epoch": 1.2501074344649763, "grad_norm": 1.09375, "learning_rate": 0.000497977699001127, "loss": 6.402, "mean_token_accuracy": 0.142982679605484, "num_tokens": 2638303.0, "step": 1455 }, { "entropy": 6.761410474777222, "epoch": 1.2544048130640308, "grad_norm": 1.125, "learning_rate": 0.0004979329783426778, "loss": 6.4318, "mean_token_accuracy": 0.14380076453089713, "num_tokens": 2647902.0, "step": 1460 }, { "entropy": 6.731089019775391, "epoch": 1.2587021916630854, "grad_norm": 1.1015625, "learning_rate": 0.0004978877708877094, "loss": 6.4848, "mean_token_accuracy": 0.13676076754927635, "num_tokens": 2657902.0, "step": 1465 }, { "entropy": 6.71400637626648, "epoch": 1.2629995702621402, "grad_norm": 1.0703125, "learning_rate": 0.0004978420767349368, "loss": 6.4196, "mean_token_accuracy": 0.13780386745929718, "num_tokens": 2667082.0, "step": 1470 }, { "entropy": 6.737793684005737, "epoch": 1.2672969488611947, "grad_norm": 1.03125, "learning_rate": 0.0004977958959841379, "loss": 6.4943, "mean_token_accuracy": 0.1352358005940914, "num_tokens": 2676855.0, "step": 1475 }, { "entropy": 6.734015226364136, "epoch": 1.2715943274602493, "grad_norm": 1.0390625, "learning_rate": 0.000497749228736153, "loss": 6.4201, "mean_token_accuracy": 0.14142746701836587, "num_tokens": 2685750.0, "step": 1480 }, { "entropy": 6.656690311431885, "epoch": 1.2758917060593038, "grad_norm": 1.171875, "learning_rate": 0.0004977020750928845, "loss": 6.4771, "mean_token_accuracy": 0.14191860556602479, "num_tokens": 2695272.0, "step": 1485 }, { "entropy": 6.794925928115845, "epoch": 1.2801890846583583, "grad_norm": 1.046875, "learning_rate": 0.0004976544351572973, "loss": 6.4253, "mean_token_accuracy": 0.14196638017892838, "num_tokens": 2704806.0, "step": 1490 }, { "entropy": 6.56059627532959, "epoch": 1.2844864632574131, "grad_norm": 1.0390625, "learning_rate": 0.0004976063090334179, "loss": 6.4836, "mean_token_accuracy": 0.14093814194202423, "num_tokens": 2713521.0, "step": 1495 }, { "entropy": 6.7648594856262205, "epoch": 1.2887838418564677, "grad_norm": 1.1171875, "learning_rate": 0.0004975576968263346, "loss": 6.472, "mean_token_accuracy": 0.13531532436609267, "num_tokens": 2721848.0, "step": 1500 }, { "epoch": 1.2887838418564677, "eval_entropy": 6.583824046023257, "eval_loss": 6.552463054656982, "eval_mean_token_accuracy": 0.13841687775477096, "eval_num_tokens": 2721848.0, "eval_runtime": 2.0451, "eval_samples_per_second": 1735.359, "eval_steps_per_second": 217.103, "step": 1500 }, { "entropy": 6.6689835548400875, "epoch": 1.2930812204555222, "grad_norm": 1.0, "learning_rate": 0.000497508598642197, "loss": 6.4406, "mean_token_accuracy": 0.13946301937103273, "num_tokens": 2731473.0, "step": 1505 }, { "entropy": 6.724963998794555, "epoch": 1.2973785990545768, "grad_norm": 1.0625, "learning_rate": 0.000497459014588216, "loss": 6.5064, "mean_token_accuracy": 0.13410719558596612, "num_tokens": 2739867.0, "step": 1510 }, { "entropy": 6.701112556457519, "epoch": 1.3016759776536313, "grad_norm": 1.0859375, "learning_rate": 0.000497408944772663, "loss": 6.4165, "mean_token_accuracy": 0.14087883234024048, "num_tokens": 2748903.0, "step": 1515 }, { "entropy": 6.621306848526001, "epoch": 1.3059733562526858, "grad_norm": 1.0390625, "learning_rate": 0.0004973583893048707, "loss": 6.4144, "mean_token_accuracy": 0.13790024891495706, "num_tokens": 2757711.0, "step": 1520 }, { "entropy": 6.8078021049499515, "epoch": 1.3102707348517404, "grad_norm": 1.109375, "learning_rate": 0.0004973073482952321, "loss": 6.4178, "mean_token_accuracy": 0.14102478623390197, "num_tokens": 2765633.0, "step": 1525 }, { "entropy": 6.606275224685669, "epoch": 1.314568113450795, "grad_norm": 1.3046875, "learning_rate": 0.0004972558218552004, "loss": 6.454, "mean_token_accuracy": 0.1388860262930393, "num_tokens": 2774495.0, "step": 1530 }, { "entropy": 6.737347936630249, "epoch": 1.3188654920498495, "grad_norm": 1.1328125, "learning_rate": 0.0004972038100972885, "loss": 6.4827, "mean_token_accuracy": 0.13370617032051085, "num_tokens": 2782665.0, "step": 1535 }, { "entropy": 6.652740144729615, "epoch": 1.323162870648904, "grad_norm": 1.3125, "learning_rate": 0.0004971513131350697, "loss": 6.4163, "mean_token_accuracy": 0.13846877068281174, "num_tokens": 2791394.0, "step": 1540 }, { "entropy": 6.583173847198486, "epoch": 1.3274602492479588, "grad_norm": 1.1484375, "learning_rate": 0.0004970983310831759, "loss": 6.4113, "mean_token_accuracy": 0.13881225883960724, "num_tokens": 2800488.0, "step": 1545 }, { "entropy": 6.734278392791748, "epoch": 1.3317576278470133, "grad_norm": 1.03125, "learning_rate": 0.0004970448640572989, "loss": 6.5243, "mean_token_accuracy": 0.1339696764945984, "num_tokens": 2810116.0, "step": 1550 }, { "entropy": 6.658429765701294, "epoch": 1.336055006446068, "grad_norm": 0.94921875, "learning_rate": 0.0004969909121741895, "loss": 6.3255, "mean_token_accuracy": 0.14455484077334405, "num_tokens": 2819205.0, "step": 1555 }, { "entropy": 6.591242885589599, "epoch": 1.3403523850451224, "grad_norm": 1.109375, "learning_rate": 0.0004969364755516569, "loss": 6.4035, "mean_token_accuracy": 0.13771276026964188, "num_tokens": 2828017.0, "step": 1560 }, { "entropy": 6.73987512588501, "epoch": 1.344649763644177, "grad_norm": 1.1328125, "learning_rate": 0.0004968815543085689, "loss": 6.438, "mean_token_accuracy": 0.14133503511548043, "num_tokens": 2837125.0, "step": 1565 }, { "entropy": 6.648034620285034, "epoch": 1.3489471422432318, "grad_norm": 1.0625, "learning_rate": 0.0004968261485648516, "loss": 6.4665, "mean_token_accuracy": 0.13752973526716233, "num_tokens": 2845438.0, "step": 1570 }, { "entropy": 6.690678644180298, "epoch": 1.3532445208422863, "grad_norm": 1.015625, "learning_rate": 0.000496770258441489, "loss": 6.4311, "mean_token_accuracy": 0.14550055414438248, "num_tokens": 2854389.0, "step": 1575 }, { "entropy": 6.591717529296875, "epoch": 1.3575418994413408, "grad_norm": 1.0234375, "learning_rate": 0.0004967138840605228, "loss": 6.3947, "mean_token_accuracy": 0.1433369368314743, "num_tokens": 2863654.0, "step": 1580 }, { "entropy": 6.645109987258911, "epoch": 1.3618392780403954, "grad_norm": 1.0703125, "learning_rate": 0.000496657025545052, "loss": 6.3068, "mean_token_accuracy": 0.14519514814019202, "num_tokens": 2872871.0, "step": 1585 }, { "entropy": 6.5770776748657225, "epoch": 1.36613665663945, "grad_norm": 1.1328125, "learning_rate": 0.000496599683019233, "loss": 6.4037, "mean_token_accuracy": 0.14221980646252633, "num_tokens": 2881140.0, "step": 1590 }, { "entropy": 6.7226653575897215, "epoch": 1.3704340352385045, "grad_norm": 1.0546875, "learning_rate": 0.000496541856608279, "loss": 6.3852, "mean_token_accuracy": 0.14397331327199936, "num_tokens": 2889945.0, "step": 1595 }, { "entropy": 6.5361980438232425, "epoch": 1.374731413837559, "grad_norm": 0.95703125, "learning_rate": 0.0004964835464384595, "loss": 6.3238, "mean_token_accuracy": 0.145409494638443, "num_tokens": 2898897.0, "step": 1600 }, { "entropy": 6.686757373809814, "epoch": 1.3790287924366136, "grad_norm": 1.09375, "learning_rate": 0.000496424752637101, "loss": 6.3401, "mean_token_accuracy": 0.14611406177282332, "num_tokens": 2907717.0, "step": 1605 }, { "entropy": 6.578691530227661, "epoch": 1.3833261710356681, "grad_norm": 1.078125, "learning_rate": 0.0004963654753325853, "loss": 6.3297, "mean_token_accuracy": 0.14271921664476395, "num_tokens": 2916213.0, "step": 1610 }, { "entropy": 6.683462333679199, "epoch": 1.387623549634723, "grad_norm": 1.0, "learning_rate": 0.0004963057146543505, "loss": 6.4949, "mean_token_accuracy": 0.1387751467525959, "num_tokens": 2925706.0, "step": 1615 }, { "entropy": 6.599123191833496, "epoch": 1.3919209282337774, "grad_norm": 1.015625, "learning_rate": 0.00049624547073289, "loss": 6.4208, "mean_token_accuracy": 0.1372368849813938, "num_tokens": 2934464.0, "step": 1620 }, { "entropy": 6.672312545776367, "epoch": 1.396218306832832, "grad_norm": 1.140625, "learning_rate": 0.0004961847436997526, "loss": 6.3195, "mean_token_accuracy": 0.14415977373719216, "num_tokens": 2944095.0, "step": 1625 }, { "entropy": 6.480645990371704, "epoch": 1.4005156854318865, "grad_norm": 1.09375, "learning_rate": 0.0004961235336875416, "loss": 6.3231, "mean_token_accuracy": 0.14915895387530326, "num_tokens": 2953357.0, "step": 1630 }, { "entropy": 6.639774322509766, "epoch": 1.404813064030941, "grad_norm": 1.109375, "learning_rate": 0.0004960618408299154, "loss": 6.4687, "mean_token_accuracy": 0.13529081642627716, "num_tokens": 2963020.0, "step": 1635 }, { "entropy": 6.682909727096558, "epoch": 1.4091104426299956, "grad_norm": 1.046875, "learning_rate": 0.0004959996652615865, "loss": 6.319, "mean_token_accuracy": 0.14330243095755577, "num_tokens": 2971955.0, "step": 1640 }, { "entropy": 6.6523435592651365, "epoch": 1.4134078212290504, "grad_norm": 1.0703125, "learning_rate": 0.0004959370071183216, "loss": 6.3766, "mean_token_accuracy": 0.14444040805101394, "num_tokens": 2980662.0, "step": 1645 }, { "entropy": 6.675427007675171, "epoch": 1.417705199828105, "grad_norm": 1.1484375, "learning_rate": 0.0004958738665369407, "loss": 6.5051, "mean_token_accuracy": 0.12928852811455727, "num_tokens": 2990038.0, "step": 1650 }, { "entropy": 6.632522964477539, "epoch": 1.4220025784271595, "grad_norm": 1.1328125, "learning_rate": 0.0004958102436553179, "loss": 6.4172, "mean_token_accuracy": 0.1390580452978611, "num_tokens": 2999835.0, "step": 1655 }, { "entropy": 6.694387483596802, "epoch": 1.426299957026214, "grad_norm": 0.98828125, "learning_rate": 0.00049574613861238, "loss": 6.4118, "mean_token_accuracy": 0.13762674629688262, "num_tokens": 3009593.0, "step": 1660 }, { "entropy": 6.648862266540528, "epoch": 1.4305973356252686, "grad_norm": 0.99609375, "learning_rate": 0.0004956815515481069, "loss": 6.4348, "mean_token_accuracy": 0.144145817309618, "num_tokens": 3019187.0, "step": 1665 }, { "entropy": 6.582254266738891, "epoch": 1.4348947142243231, "grad_norm": 1.078125, "learning_rate": 0.0004956164826035309, "loss": 6.3495, "mean_token_accuracy": 0.14171260893344878, "num_tokens": 3027875.0, "step": 1670 }, { "entropy": 6.569947624206543, "epoch": 1.4391920928233777, "grad_norm": 1.1171875, "learning_rate": 0.0004955509319207363, "loss": 6.3833, "mean_token_accuracy": 0.13855091333389283, "num_tokens": 3036902.0, "step": 1675 }, { "entropy": 6.548913908004761, "epoch": 1.4434894714224322, "grad_norm": 0.9375, "learning_rate": 0.0004954848996428601, "loss": 6.36, "mean_token_accuracy": 0.14765606224536895, "num_tokens": 3046653.0, "step": 1680 }, { "entropy": 6.6836981773376465, "epoch": 1.4477868500214868, "grad_norm": 1.3515625, "learning_rate": 0.00049541838591409, "loss": 6.448, "mean_token_accuracy": 0.13707543835043906, "num_tokens": 3056273.0, "step": 1685 }, { "entropy": 6.570832586288452, "epoch": 1.4520842286205415, "grad_norm": 1.046875, "learning_rate": 0.0004953513908796657, "loss": 6.3562, "mean_token_accuracy": 0.13904846012592315, "num_tokens": 3065662.0, "step": 1690 }, { "entropy": 6.719029092788697, "epoch": 1.456381607219596, "grad_norm": 1.140625, "learning_rate": 0.0004952839146858773, "loss": 6.3883, "mean_token_accuracy": 0.14505013972520828, "num_tokens": 3073970.0, "step": 1695 }, { "entropy": 6.546349334716797, "epoch": 1.4606789858186506, "grad_norm": 1.1796875, "learning_rate": 0.0004952159574800658, "loss": 6.3978, "mean_token_accuracy": 0.13897576928138733, "num_tokens": 3082500.0, "step": 1700 }, { "entropy": 6.645324468612671, "epoch": 1.4649763644177052, "grad_norm": 1.0859375, "learning_rate": 0.0004951475194106229, "loss": 6.342, "mean_token_accuracy": 0.14458465725183486, "num_tokens": 3091574.0, "step": 1705 }, { "entropy": 6.590623474121093, "epoch": 1.4692737430167597, "grad_norm": 1.0234375, "learning_rate": 0.0004950786006269898, "loss": 6.4477, "mean_token_accuracy": 0.1356819100677967, "num_tokens": 3102402.0, "step": 1710 }, { "entropy": 6.654024839401245, "epoch": 1.4735711216158143, "grad_norm": 1.125, "learning_rate": 0.0004950092012796576, "loss": 6.2738, "mean_token_accuracy": 0.14728236198425293, "num_tokens": 3111347.0, "step": 1715 }, { "entropy": 6.553081369400024, "epoch": 1.477868500214869, "grad_norm": 1.1796875, "learning_rate": 0.0004949393215201666, "loss": 6.3455, "mean_token_accuracy": 0.14207591861486435, "num_tokens": 3120018.0, "step": 1720 }, { "entropy": 6.595822668075561, "epoch": 1.4821658788139236, "grad_norm": 0.96875, "learning_rate": 0.0004948689615011065, "loss": 6.4086, "mean_token_accuracy": 0.13704866543412209, "num_tokens": 3129669.0, "step": 1725 }, { "entropy": 6.628203105926514, "epoch": 1.4864632574129781, "grad_norm": 0.953125, "learning_rate": 0.0004947981213761154, "loss": 6.3443, "mean_token_accuracy": 0.14518199041485785, "num_tokens": 3139112.0, "step": 1730 }, { "entropy": 6.5786394596099855, "epoch": 1.4907606360120327, "grad_norm": 1.046875, "learning_rate": 0.0004947268012998797, "loss": 6.3058, "mean_token_accuracy": 0.15637002438306807, "num_tokens": 3148437.0, "step": 1735 }, { "entropy": 6.570107936859131, "epoch": 1.4950580146110872, "grad_norm": 0.9609375, "learning_rate": 0.000494655001428134, "loss": 6.2891, "mean_token_accuracy": 0.14667836502194403, "num_tokens": 3158165.0, "step": 1740 }, { "entropy": 6.586823749542236, "epoch": 1.4993553932101418, "grad_norm": 1.1015625, "learning_rate": 0.0004945827219176604, "loss": 6.3587, "mean_token_accuracy": 0.1493491068482399, "num_tokens": 3167262.0, "step": 1745 }, { "entropy": 6.514509057998657, "epoch": 1.5036527718091963, "grad_norm": 1.0078125, "learning_rate": 0.0004945099629262888, "loss": 6.3479, "mean_token_accuracy": 0.1436598651111126, "num_tokens": 3176696.0, "step": 1750 }, { "entropy": 6.673803234100342, "epoch": 1.5079501504082509, "grad_norm": 1.0546875, "learning_rate": 0.0004944367246128954, "loss": 6.4304, "mean_token_accuracy": 0.13725945726037025, "num_tokens": 3185857.0, "step": 1755 }, { "entropy": 6.5661591529846195, "epoch": 1.5122475290073054, "grad_norm": 1.0625, "learning_rate": 0.0004943630071374036, "loss": 6.2677, "mean_token_accuracy": 0.14966750741004944, "num_tokens": 3194687.0, "step": 1760 }, { "entropy": 6.554711723327637, "epoch": 1.51654490760636, "grad_norm": 1.0078125, "learning_rate": 0.0004942888106607828, "loss": 6.3291, "mean_token_accuracy": 0.14281144142150878, "num_tokens": 3204913.0, "step": 1765 }, { "entropy": 6.641019535064697, "epoch": 1.5208422862054147, "grad_norm": 1.0390625, "learning_rate": 0.0004942141353450486, "loss": 6.3145, "mean_token_accuracy": 0.1485350415110588, "num_tokens": 3213312.0, "step": 1770 }, { "entropy": 6.493930768966675, "epoch": 1.5251396648044693, "grad_norm": 0.96875, "learning_rate": 0.0004941389813532619, "loss": 6.2368, "mean_token_accuracy": 0.15905009657144548, "num_tokens": 3222992.0, "step": 1775 }, { "entropy": 6.511264657974243, "epoch": 1.5294370434035238, "grad_norm": 0.984375, "learning_rate": 0.000494063348849529, "loss": 6.2816, "mean_token_accuracy": 0.14892083406448364, "num_tokens": 3232836.0, "step": 1780 }, { "entropy": 6.616392660140991, "epoch": 1.5337344220025786, "grad_norm": 0.94140625, "learning_rate": 0.0004939872379990011, "loss": 6.4346, "mean_token_accuracy": 0.1384902000427246, "num_tokens": 3243171.0, "step": 1785 }, { "entropy": 6.671454858779907, "epoch": 1.5380318006016331, "grad_norm": 1.1796875, "learning_rate": 0.0004939106489678739, "loss": 6.3565, "mean_token_accuracy": 0.14886578172445297, "num_tokens": 3251995.0, "step": 1790 }, { "entropy": 6.483775520324707, "epoch": 1.5423291792006877, "grad_norm": 1.015625, "learning_rate": 0.000493833581923387, "loss": 6.2999, "mean_token_accuracy": 0.147441129386425, "num_tokens": 3260841.0, "step": 1795 }, { "entropy": 6.614831399917603, "epoch": 1.5466265577997422, "grad_norm": 1.0546875, "learning_rate": 0.0004937560370338244, "loss": 6.4359, "mean_token_accuracy": 0.1328293912112713, "num_tokens": 3270979.0, "step": 1800 }, { "entropy": 6.602978515625, "epoch": 1.5509239363987968, "grad_norm": 1.0859375, "learning_rate": 0.000493678014468513, "loss": 6.3703, "mean_token_accuracy": 0.14689823091030121, "num_tokens": 3279848.0, "step": 1805 }, { "entropy": 6.534598064422608, "epoch": 1.5552213149978513, "grad_norm": 0.94921875, "learning_rate": 0.0004935995143978227, "loss": 6.3674, "mean_token_accuracy": 0.14537320658564568, "num_tokens": 3289172.0, "step": 1810 }, { "entropy": 6.508708524703979, "epoch": 1.5595186935969059, "grad_norm": 1.1484375, "learning_rate": 0.0004935205369931664, "loss": 6.2677, "mean_token_accuracy": 0.1513919234275818, "num_tokens": 3297432.0, "step": 1815 }, { "entropy": 6.684668636322021, "epoch": 1.5638160721959604, "grad_norm": 0.92578125, "learning_rate": 0.0004934410824269992, "loss": 6.2954, "mean_token_accuracy": 0.1454857923090458, "num_tokens": 3307486.0, "step": 1820 }, { "entropy": 6.466551637649536, "epoch": 1.568113450795015, "grad_norm": 1.0234375, "learning_rate": 0.0004933611508728182, "loss": 6.2671, "mean_token_accuracy": 0.14967258870601655, "num_tokens": 3316296.0, "step": 1825 }, { "entropy": 6.563362693786621, "epoch": 1.5724108293940695, "grad_norm": 1.0078125, "learning_rate": 0.000493280742505162, "loss": 6.2972, "mean_token_accuracy": 0.14479405283927918, "num_tokens": 3326080.0, "step": 1830 }, { "entropy": 6.456173896789551, "epoch": 1.576708207993124, "grad_norm": 1.0546875, "learning_rate": 0.0004931998574996102, "loss": 6.217, "mean_token_accuracy": 0.15072606950998307, "num_tokens": 3334826.0, "step": 1835 }, { "entropy": 6.472858524322509, "epoch": 1.5810055865921788, "grad_norm": 1.0859375, "learning_rate": 0.0004931184960327832, "loss": 6.2177, "mean_token_accuracy": 0.1524192661046982, "num_tokens": 3343261.0, "step": 1840 }, { "entropy": 6.493236398696899, "epoch": 1.5853029651912334, "grad_norm": 1.640625, "learning_rate": 0.0004930366582823421, "loss": 6.2619, "mean_token_accuracy": 0.14549409449100495, "num_tokens": 3352513.0, "step": 1845 }, { "entropy": 6.541861534118652, "epoch": 1.589600343790288, "grad_norm": 1.1484375, "learning_rate": 0.0004929543444269879, "loss": 6.3147, "mean_token_accuracy": 0.15202615782618523, "num_tokens": 3361577.0, "step": 1850 }, { "entropy": 6.516072130203247, "epoch": 1.5938977223893425, "grad_norm": 1.1171875, "learning_rate": 0.000492871554646461, "loss": 6.3805, "mean_token_accuracy": 0.1442191444337368, "num_tokens": 3370591.0, "step": 1855 }, { "entropy": 6.489377784729004, "epoch": 1.5981951009883972, "grad_norm": 1.0703125, "learning_rate": 0.0004927882891215413, "loss": 6.2995, "mean_token_accuracy": 0.1446702793240547, "num_tokens": 3379761.0, "step": 1860 }, { "entropy": 6.6347997188568115, "epoch": 1.6024924795874518, "grad_norm": 1.203125, "learning_rate": 0.0004927045480340475, "loss": 6.3729, "mean_token_accuracy": 0.13809221014380454, "num_tokens": 3388974.0, "step": 1865 }, { "entropy": 6.515362644195557, "epoch": 1.6067898581865063, "grad_norm": 0.9765625, "learning_rate": 0.0004926203315668363, "loss": 6.2995, "mean_token_accuracy": 0.14509507045149803, "num_tokens": 3398339.0, "step": 1870 }, { "entropy": 6.501726579666138, "epoch": 1.6110872367855609, "grad_norm": 1.046875, "learning_rate": 0.0004925356399038032, "loss": 6.2645, "mean_token_accuracy": 0.14561111479997635, "num_tokens": 3408292.0, "step": 1875 }, { "entropy": 6.528331470489502, "epoch": 1.6153846153846154, "grad_norm": 1.1484375, "learning_rate": 0.0004924504732298808, "loss": 6.2363, "mean_token_accuracy": 0.15578987523913385, "num_tokens": 3417057.0, "step": 1880 }, { "entropy": 6.547144651412964, "epoch": 1.61968199398367, "grad_norm": 1.0703125, "learning_rate": 0.0004923648317310391, "loss": 6.3436, "mean_token_accuracy": 0.1472199097275734, "num_tokens": 3425830.0, "step": 1885 }, { "entropy": 6.503617954254151, "epoch": 1.6239793725827245, "grad_norm": 0.98046875, "learning_rate": 0.0004922787155942849, "loss": 6.3929, "mean_token_accuracy": 0.13893435150384903, "num_tokens": 3435513.0, "step": 1890 }, { "entropy": 6.572265768051148, "epoch": 1.628276751181779, "grad_norm": 1.03125, "learning_rate": 0.0004921921250076611, "loss": 6.2966, "mean_token_accuracy": 0.14931443706154823, "num_tokens": 3444684.0, "step": 1895 }, { "entropy": 6.4495138168334964, "epoch": 1.6325741297808336, "grad_norm": 1.1015625, "learning_rate": 0.0004921050601602475, "loss": 6.3435, "mean_token_accuracy": 0.14741323441267012, "num_tokens": 3453454.0, "step": 1900 }, { "entropy": 6.556122159957885, "epoch": 1.6368715083798882, "grad_norm": 1.0546875, "learning_rate": 0.0004920175212421587, "loss": 6.2787, "mean_token_accuracy": 0.14662181138992308, "num_tokens": 3463228.0, "step": 1905 }, { "entropy": 6.366853141784668, "epoch": 1.6411688869789427, "grad_norm": 1.03125, "learning_rate": 0.0004919295084445445, "loss": 6.166, "mean_token_accuracy": 0.15177097618579866, "num_tokens": 3472131.0, "step": 1910 }, { "entropy": 6.485814142227173, "epoch": 1.6454662655779975, "grad_norm": 0.98828125, "learning_rate": 0.0004918410219595899, "loss": 6.2547, "mean_token_accuracy": 0.1523374244570732, "num_tokens": 3480642.0, "step": 1915 }, { "entropy": 6.621995449066162, "epoch": 1.649763644177052, "grad_norm": 0.9765625, "learning_rate": 0.000491752061980514, "loss": 6.2277, "mean_token_accuracy": 0.15280286371707916, "num_tokens": 3489346.0, "step": 1920 }, { "entropy": 6.4284903049469, "epoch": 1.6540610227761066, "grad_norm": 1.1015625, "learning_rate": 0.0004916626287015697, "loss": 6.2756, "mean_token_accuracy": 0.15068823397159575, "num_tokens": 3498473.0, "step": 1925 }, { "entropy": 6.515523910522461, "epoch": 1.658358401375161, "grad_norm": 1.0, "learning_rate": 0.0004915727223180436, "loss": 6.2738, "mean_token_accuracy": 0.142893535643816, "num_tokens": 3507415.0, "step": 1930 }, { "entropy": 6.528269815444946, "epoch": 1.6626557799742159, "grad_norm": 0.984375, "learning_rate": 0.0004914823430262554, "loss": 6.3984, "mean_token_accuracy": 0.1329946205019951, "num_tokens": 3516873.0, "step": 1935 }, { "entropy": 6.484966564178467, "epoch": 1.6669531585732704, "grad_norm": 1.140625, "learning_rate": 0.0004913914910235573, "loss": 6.2479, "mean_token_accuracy": 0.14868821799755097, "num_tokens": 3525047.0, "step": 1940 }, { "entropy": 6.448112821578979, "epoch": 1.671250537172325, "grad_norm": 1.0859375, "learning_rate": 0.0004913001665083337, "loss": 6.2685, "mean_token_accuracy": 0.14392302706837654, "num_tokens": 3534354.0, "step": 1945 }, { "entropy": 6.528091144561768, "epoch": 1.6755479157713795, "grad_norm": 1.2265625, "learning_rate": 0.0004912083696800008, "loss": 6.2926, "mean_token_accuracy": 0.14562170803546906, "num_tokens": 3543830.0, "step": 1950 }, { "entropy": 6.4218017578125, "epoch": 1.679845294370434, "grad_norm": 1.09375, "learning_rate": 0.0004911161007390063, "loss": 6.1933, "mean_token_accuracy": 0.14804754853248597, "num_tokens": 3552314.0, "step": 1955 }, { "entropy": 6.470229148864746, "epoch": 1.6841426729694886, "grad_norm": 1.1875, "learning_rate": 0.0004910233598868287, "loss": 6.2765, "mean_token_accuracy": 0.14477257579565048, "num_tokens": 3561656.0, "step": 1960 }, { "entropy": 6.467269372940064, "epoch": 1.6884400515685432, "grad_norm": 1.0625, "learning_rate": 0.0004909301473259769, "loss": 6.2641, "mean_token_accuracy": 0.14551830440759658, "num_tokens": 3571784.0, "step": 1965 }, { "entropy": 6.518259859085083, "epoch": 1.6927374301675977, "grad_norm": 1.0625, "learning_rate": 0.0004908364632599899, "loss": 6.228, "mean_token_accuracy": 0.15220747292041778, "num_tokens": 3580626.0, "step": 1970 }, { "entropy": 6.378790664672851, "epoch": 1.6970348087666522, "grad_norm": 1.046875, "learning_rate": 0.0004907423078934362, "loss": 6.2467, "mean_token_accuracy": 0.14601020216941835, "num_tokens": 3589916.0, "step": 1975 }, { "entropy": 6.473833656311035, "epoch": 1.7013321873657068, "grad_norm": 1.0078125, "learning_rate": 0.0004906476814319134, "loss": 6.2572, "mean_token_accuracy": 0.14930620267987252, "num_tokens": 3599128.0, "step": 1980 }, { "entropy": 6.429199600219727, "epoch": 1.7056295659647613, "grad_norm": 0.9140625, "learning_rate": 0.0004905525840820481, "loss": 6.2686, "mean_token_accuracy": 0.1471567466855049, "num_tokens": 3608764.0, "step": 1985 }, { "entropy": 6.58309121131897, "epoch": 1.709926944563816, "grad_norm": 0.9453125, "learning_rate": 0.0004904570160514948, "loss": 6.3077, "mean_token_accuracy": 0.14043890461325645, "num_tokens": 3619082.0, "step": 1990 }, { "entropy": 6.45733323097229, "epoch": 1.7142243231628707, "grad_norm": 1.140625, "learning_rate": 0.0004903609775489358, "loss": 6.2682, "mean_token_accuracy": 0.14586469754576684, "num_tokens": 3628695.0, "step": 1995 }, { "entropy": 6.511290454864502, "epoch": 1.7185217017619252, "grad_norm": 1.015625, "learning_rate": 0.0004902644687840809, "loss": 6.267, "mean_token_accuracy": 0.14717549681663514, "num_tokens": 3637599.0, "step": 2000 }, { "epoch": 1.7185217017619252, "eval_entropy": 6.214308420817058, "eval_loss": 6.331518173217773, "eval_mean_token_accuracy": 0.14971260959702032, "eval_num_tokens": 3637599.0, "eval_runtime": 2.0415, "eval_samples_per_second": 1738.466, "eval_steps_per_second": 217.492, "step": 2000 }, { "entropy": 6.427486324310303, "epoch": 1.7228190803609797, "grad_norm": 1.1484375, "learning_rate": 0.0004901674899676667, "loss": 6.2449, "mean_token_accuracy": 0.14803531616926194, "num_tokens": 3647406.0, "step": 2005 }, { "entropy": 6.416431045532226, "epoch": 1.7271164589600345, "grad_norm": 1.03125, "learning_rate": 0.0004900700413114561, "loss": 6.1252, "mean_token_accuracy": 0.15068818926811217, "num_tokens": 3656531.0, "step": 2010 }, { "entropy": 6.388833618164062, "epoch": 1.731413837559089, "grad_norm": 1.0078125, "learning_rate": 0.000489972123028238, "loss": 6.2244, "mean_token_accuracy": 0.1465991474688053, "num_tokens": 3664922.0, "step": 2015 }, { "entropy": 6.502804613113403, "epoch": 1.7357112161581436, "grad_norm": 1.0234375, "learning_rate": 0.0004898737353318268, "loss": 6.1557, "mean_token_accuracy": 0.1519090563058853, "num_tokens": 3673283.0, "step": 2020 }, { "entropy": 6.377015924453735, "epoch": 1.7400085947571982, "grad_norm": 1.125, "learning_rate": 0.000489774878437062, "loss": 6.298, "mean_token_accuracy": 0.15162839442491532, "num_tokens": 3681760.0, "step": 2025 }, { "entropy": 6.46599555015564, "epoch": 1.7443059733562527, "grad_norm": 1.078125, "learning_rate": 0.0004896755525598074, "loss": 6.1178, "mean_token_accuracy": 0.15259039252996445, "num_tokens": 3689408.0, "step": 2030 }, { "entropy": 6.4247987270355225, "epoch": 1.7486033519553073, "grad_norm": 1.109375, "learning_rate": 0.0004895757579169511, "loss": 6.234, "mean_token_accuracy": 0.14994207322597503, "num_tokens": 3697904.0, "step": 2035 }, { "entropy": 6.579666042327881, "epoch": 1.7529007305543618, "grad_norm": 1.0078125, "learning_rate": 0.0004894754947264047, "loss": 6.2504, "mean_token_accuracy": 0.15150809586048125, "num_tokens": 3706704.0, "step": 2040 }, { "entropy": 6.433872127532959, "epoch": 1.7571981091534163, "grad_norm": 1.109375, "learning_rate": 0.000489374763207103, "loss": 6.3286, "mean_token_accuracy": 0.14471730291843415, "num_tokens": 3715690.0, "step": 2045 }, { "entropy": 6.465651893615723, "epoch": 1.761495487752471, "grad_norm": 1.109375, "learning_rate": 0.0004892735635790033, "loss": 6.125, "mean_token_accuracy": 0.15927532613277434, "num_tokens": 3724835.0, "step": 2050 }, { "entropy": 6.368647861480713, "epoch": 1.7657928663515254, "grad_norm": 0.94140625, "learning_rate": 0.000489171896063085, "loss": 6.1498, "mean_token_accuracy": 0.157290717959404, "num_tokens": 3733977.0, "step": 2055 }, { "entropy": 6.458992671966553, "epoch": 1.77009024495058, "grad_norm": 1.078125, "learning_rate": 0.0004890697608813495, "loss": 6.2682, "mean_token_accuracy": 0.14064312726259232, "num_tokens": 3742665.0, "step": 2060 }, { "entropy": 6.583484077453614, "epoch": 1.7743876235496348, "grad_norm": 1.078125, "learning_rate": 0.0004889671582568193, "loss": 6.3367, "mean_token_accuracy": 0.14621492847800255, "num_tokens": 3751647.0, "step": 2065 }, { "entropy": 6.387417125701904, "epoch": 1.7786850021486893, "grad_norm": 1.140625, "learning_rate": 0.0004888640884135374, "loss": 6.2386, "mean_token_accuracy": 0.1474798172712326, "num_tokens": 3760852.0, "step": 2070 }, { "entropy": 6.3953369617462155, "epoch": 1.7829823807477438, "grad_norm": 1.25, "learning_rate": 0.0004887605515765671, "loss": 6.1913, "mean_token_accuracy": 0.15439595878124238, "num_tokens": 3768640.0, "step": 2075 }, { "entropy": 6.503360080718994, "epoch": 1.7872797593467986, "grad_norm": 1.0546875, "learning_rate": 0.0004886565479719914, "loss": 6.2177, "mean_token_accuracy": 0.14689500331878663, "num_tokens": 3776504.0, "step": 2080 }, { "entropy": 6.52859411239624, "epoch": 1.7915771379458532, "grad_norm": 1.1875, "learning_rate": 0.0004885520778269128, "loss": 6.2515, "mean_token_accuracy": 0.1499434307217598, "num_tokens": 3786353.0, "step": 2085 }, { "entropy": 6.410916137695312, "epoch": 1.7958745165449077, "grad_norm": 1.0859375, "learning_rate": 0.0004884471413694523, "loss": 6.2783, "mean_token_accuracy": 0.15109124332666396, "num_tokens": 3795902.0, "step": 2090 }, { "entropy": 6.470384979248047, "epoch": 1.8001718951439623, "grad_norm": 0.9140625, "learning_rate": 0.0004883417388287491, "loss": 6.194, "mean_token_accuracy": 0.1435760647058487, "num_tokens": 3805986.0, "step": 2095 }, { "entropy": 6.400091123580933, "epoch": 1.8044692737430168, "grad_norm": 1.140625, "learning_rate": 0.0004882358704349603, "loss": 6.3188, "mean_token_accuracy": 0.1500417910516262, "num_tokens": 3814915.0, "step": 2100 }, { "entropy": 6.456367015838623, "epoch": 1.8087666523420713, "grad_norm": 1.15625, "learning_rate": 0.0004881295364192601, "loss": 6.2089, "mean_token_accuracy": 0.15894449651241302, "num_tokens": 3823966.0, "step": 2105 }, { "entropy": 6.510165739059448, "epoch": 1.813064030941126, "grad_norm": 1.0078125, "learning_rate": 0.0004880227370138394, "loss": 6.2729, "mean_token_accuracy": 0.142085450142622, "num_tokens": 3832775.0, "step": 2110 }, { "entropy": 6.3983588218688965, "epoch": 1.8173614095401804, "grad_norm": 0.8984375, "learning_rate": 0.0004879154724519057, "loss": 6.1809, "mean_token_accuracy": 0.15120477825403214, "num_tokens": 3842808.0, "step": 2115 }, { "entropy": 6.493490934371948, "epoch": 1.821658788139235, "grad_norm": 1.046875, "learning_rate": 0.0004878077429676816, "loss": 6.3143, "mean_token_accuracy": 0.14699392020702362, "num_tokens": 3853303.0, "step": 2120 }, { "entropy": 6.4460196018219, "epoch": 1.8259561667382895, "grad_norm": 1.046875, "learning_rate": 0.0004876995487964054, "loss": 6.2277, "mean_token_accuracy": 0.13867998719215394, "num_tokens": 3862462.0, "step": 2125 }, { "entropy": 6.459061241149902, "epoch": 1.830253545337344, "grad_norm": 1.0234375, "learning_rate": 0.00048759089017432996, "loss": 6.3388, "mean_token_accuracy": 0.14455281794071198, "num_tokens": 3871596.0, "step": 2130 }, { "entropy": 6.482069444656372, "epoch": 1.8345509239363988, "grad_norm": 1.015625, "learning_rate": 0.0004874817673387222, "loss": 6.2427, "mean_token_accuracy": 0.14856942594051362, "num_tokens": 3881276.0, "step": 2135 }, { "entropy": 6.43566927909851, "epoch": 1.8388483025354534, "grad_norm": 0.96875, "learning_rate": 0.00048737218052786275, "loss": 6.33, "mean_token_accuracy": 0.14330809488892554, "num_tokens": 3891610.0, "step": 2140 }, { "entropy": 6.498207521438599, "epoch": 1.843145681134508, "grad_norm": 0.98046875, "learning_rate": 0.00048726212998104554, "loss": 6.2531, "mean_token_accuracy": 0.14796748533844947, "num_tokens": 3900584.0, "step": 2145 }, { "entropy": 6.405120611190796, "epoch": 1.8474430597335625, "grad_norm": 1.0390625, "learning_rate": 0.0004871516159385768, "loss": 6.1817, "mean_token_accuracy": 0.1539264902472496, "num_tokens": 3910208.0, "step": 2150 }, { "entropy": 6.320563936233521, "epoch": 1.8517404383326173, "grad_norm": 1.1015625, "learning_rate": 0.0004870406386417752, "loss": 6.1061, "mean_token_accuracy": 0.15697987973690034, "num_tokens": 3918424.0, "step": 2155 }, { "entropy": 6.313277053833008, "epoch": 1.8560378169316718, "grad_norm": 1.0859375, "learning_rate": 0.0004869291983329707, "loss": 6.047, "mean_token_accuracy": 0.17023974657058716, "num_tokens": 3926206.0, "step": 2160 }, { "entropy": 6.473067951202393, "epoch": 1.8603351955307263, "grad_norm": 1.046875, "learning_rate": 0.0004868172952555044, "loss": 6.1485, "mean_token_accuracy": 0.14482472315430642, "num_tokens": 3935769.0, "step": 2165 }, { "entropy": 6.363153123855591, "epoch": 1.864632574129781, "grad_norm": 0.9453125, "learning_rate": 0.0004867049296537278, "loss": 6.1373, "mean_token_accuracy": 0.1534383252263069, "num_tokens": 3945118.0, "step": 2170 }, { "entropy": 6.399164772033691, "epoch": 1.8689299527288354, "grad_norm": 1.2578125, "learning_rate": 0.0004865921017730027, "loss": 6.2358, "mean_token_accuracy": 0.15296792089939118, "num_tokens": 3954012.0, "step": 2175 }, { "entropy": 6.471106052398682, "epoch": 1.87322733132789, "grad_norm": 0.94140625, "learning_rate": 0.00048647881185969995, "loss": 6.2355, "mean_token_accuracy": 0.15060990452766418, "num_tokens": 3964239.0, "step": 2180 }, { "entropy": 6.386410093307495, "epoch": 1.8775247099269445, "grad_norm": 1.015625, "learning_rate": 0.0004863650601611994, "loss": 6.1502, "mean_token_accuracy": 0.15660223215818406, "num_tokens": 3973694.0, "step": 2185 }, { "entropy": 6.372910404205323, "epoch": 1.881822088525999, "grad_norm": 1.0703125, "learning_rate": 0.00048625084692588937, "loss": 6.185, "mean_token_accuracy": 0.15601919442415238, "num_tokens": 3982706.0, "step": 2190 }, { "entropy": 6.401282548904419, "epoch": 1.8861194671250536, "grad_norm": 1.09375, "learning_rate": 0.00048613617240316593, "loss": 6.138, "mean_token_accuracy": 0.15665835291147232, "num_tokens": 3990934.0, "step": 2195 }, { "entropy": 6.4126348972320555, "epoch": 1.8904168457241082, "grad_norm": 1.0390625, "learning_rate": 0.0004860210368434323, "loss": 6.192, "mean_token_accuracy": 0.1556440055370331, "num_tokens": 3999864.0, "step": 2200 }, { "entropy": 6.424229860305786, "epoch": 1.8947142243231627, "grad_norm": 0.9765625, "learning_rate": 0.00048590544049809857, "loss": 6.1968, "mean_token_accuracy": 0.15178433507680894, "num_tokens": 4008273.0, "step": 2205 }, { "entropy": 6.427778577804565, "epoch": 1.8990116029222175, "grad_norm": 0.99609375, "learning_rate": 0.000485789383619581, "loss": 6.2178, "mean_token_accuracy": 0.1559001922607422, "num_tokens": 4017697.0, "step": 2210 }, { "entropy": 6.4254296779632565, "epoch": 1.903308981521272, "grad_norm": 1.09375, "learning_rate": 0.0004856728664613015, "loss": 6.2293, "mean_token_accuracy": 0.14589258283376694, "num_tokens": 4026775.0, "step": 2215 }, { "entropy": 6.351989793777466, "epoch": 1.9076063601203266, "grad_norm": 1.03125, "learning_rate": 0.00048555588927768674, "loss": 6.1972, "mean_token_accuracy": 0.15271373167634011, "num_tokens": 4036476.0, "step": 2220 }, { "entropy": 6.473893165588379, "epoch": 1.9119037387193811, "grad_norm": 1.109375, "learning_rate": 0.0004854384523241683, "loss": 6.204, "mean_token_accuracy": 0.15081721246242524, "num_tokens": 4045221.0, "step": 2225 }, { "entropy": 6.310385704040527, "epoch": 1.916201117318436, "grad_norm": 1.0078125, "learning_rate": 0.00048532055585718143, "loss": 6.1112, "mean_token_accuracy": 0.15869007259607315, "num_tokens": 4053754.0, "step": 2230 }, { "entropy": 6.390126276016235, "epoch": 1.9204984959174904, "grad_norm": 1.1015625, "learning_rate": 0.00048520220013416505, "loss": 6.1455, "mean_token_accuracy": 0.15594211518764495, "num_tokens": 4061730.0, "step": 2235 }, { "entropy": 6.3809610366821286, "epoch": 1.924795874516545, "grad_norm": 1.0390625, "learning_rate": 0.0004850833854135607, "loss": 6.197, "mean_token_accuracy": 0.15130506530404092, "num_tokens": 4070501.0, "step": 2240 }, { "entropy": 6.420936059951782, "epoch": 1.9290932531155995, "grad_norm": 0.9296875, "learning_rate": 0.0004849641119548122, "loss": 6.2763, "mean_token_accuracy": 0.1485205315053463, "num_tokens": 4079621.0, "step": 2245 }, { "entropy": 6.4735170841217045, "epoch": 1.933390631714654, "grad_norm": 1.046875, "learning_rate": 0.000484844380018365, "loss": 6.2663, "mean_token_accuracy": 0.14868344217538834, "num_tokens": 4090106.0, "step": 2250 }, { "entropy": 6.461083984375, "epoch": 1.9376880103137086, "grad_norm": 1.0, "learning_rate": 0.000484724189865666, "loss": 6.1985, "mean_token_accuracy": 0.1501224085688591, "num_tokens": 4099269.0, "step": 2255 }, { "entropy": 6.287312364578247, "epoch": 1.9419853889127632, "grad_norm": 1.046875, "learning_rate": 0.0004846035417591624, "loss": 6.1351, "mean_token_accuracy": 0.1544906511902809, "num_tokens": 4108414.0, "step": 2260 }, { "entropy": 6.426730060577393, "epoch": 1.9462827675118177, "grad_norm": 1.1328125, "learning_rate": 0.0004844824359623014, "loss": 6.2629, "mean_token_accuracy": 0.14584496468305588, "num_tokens": 4117731.0, "step": 2265 }, { "entropy": 6.451971340179443, "epoch": 1.9505801461108723, "grad_norm": 1.0703125, "learning_rate": 0.00048436087273952966, "loss": 6.2441, "mean_token_accuracy": 0.14279974550008773, "num_tokens": 4127194.0, "step": 2270 }, { "entropy": 6.396147346496582, "epoch": 1.9548775247099268, "grad_norm": 1.09375, "learning_rate": 0.00048423885235629265, "loss": 6.193, "mean_token_accuracy": 0.15773467123508453, "num_tokens": 4135594.0, "step": 2275 }, { "entropy": 6.39124755859375, "epoch": 1.9591749033089814, "grad_norm": 1.0, "learning_rate": 0.0004841163750790342, "loss": 6.2256, "mean_token_accuracy": 0.15189721137285234, "num_tokens": 4145027.0, "step": 2280 }, { "entropy": 6.383194398880005, "epoch": 1.9634722819080361, "grad_norm": 0.99609375, "learning_rate": 0.00048399344117519555, "loss": 6.087, "mean_token_accuracy": 0.15884610414505004, "num_tokens": 4153754.0, "step": 2285 }, { "entropy": 6.330159759521484, "epoch": 1.9677696605070907, "grad_norm": 0.99609375, "learning_rate": 0.00048387005091321544, "loss": 6.1553, "mean_token_accuracy": 0.15946451872587203, "num_tokens": 4162765.0, "step": 2290 }, { "entropy": 6.414357376098633, "epoch": 1.9720670391061452, "grad_norm": 1.140625, "learning_rate": 0.00048374620456252877, "loss": 6.1748, "mean_token_accuracy": 0.1570574849843979, "num_tokens": 4171589.0, "step": 2295 }, { "entropy": 6.360631132125855, "epoch": 1.9763644177052, "grad_norm": 1.015625, "learning_rate": 0.00048362190239356644, "loss": 6.1913, "mean_token_accuracy": 0.155552938580513, "num_tokens": 4181817.0, "step": 2300 }, { "entropy": 6.352840518951416, "epoch": 1.9806617963042545, "grad_norm": 0.91796875, "learning_rate": 0.00048349714467775474, "loss": 6.1462, "mean_token_accuracy": 0.1511269122362137, "num_tokens": 4191350.0, "step": 2305 }, { "entropy": 6.3630085468292235, "epoch": 1.984959174903309, "grad_norm": 1.046875, "learning_rate": 0.00048337193168751464, "loss": 6.1935, "mean_token_accuracy": 0.1461350604891777, "num_tokens": 4199888.0, "step": 2310 }, { "entropy": 6.447411775588989, "epoch": 1.9892565535023636, "grad_norm": 1.1171875, "learning_rate": 0.0004832462636962613, "loss": 6.1829, "mean_token_accuracy": 0.1507252760231495, "num_tokens": 4209509.0, "step": 2315 }, { "entropy": 6.372689247131348, "epoch": 1.9935539321014182, "grad_norm": 1.09375, "learning_rate": 0.0004831201409784034, "loss": 6.1215, "mean_token_accuracy": 0.15712654441595078, "num_tokens": 4218496.0, "step": 2320 }, { "entropy": 6.357889032363891, "epoch": 1.9978513107004727, "grad_norm": 0.99609375, "learning_rate": 0.0004829935638093424, "loss": 6.1463, "mean_token_accuracy": 0.15369027704000474, "num_tokens": 4227504.0, "step": 2325 }, { "entropy": 6.373083750406901, "epoch": 2.0017189514396216, "grad_norm": 1.046875, "learning_rate": 0.0004828665324654724, "loss": 6.0581, "mean_token_accuracy": 0.15794145895375145, "num_tokens": 4235338.0, "step": 2330 }, { "entropy": 6.4267494678497314, "epoch": 2.006016330038676, "grad_norm": 0.9765625, "learning_rate": 0.0004827390472241791, "loss": 5.8418, "mean_token_accuracy": 0.16316850185394288, "num_tokens": 4244905.0, "step": 2335 }, { "entropy": 6.314910984039306, "epoch": 2.010313708637731, "grad_norm": 0.9375, "learning_rate": 0.0004826111083638392, "loss": 5.9211, "mean_token_accuracy": 0.1677140362560749, "num_tokens": 4254533.0, "step": 2340 }, { "entropy": 6.370204210281372, "epoch": 2.0146110872367857, "grad_norm": 0.98828125, "learning_rate": 0.00048248271616382, "loss": 5.8961, "mean_token_accuracy": 0.16431671380996704, "num_tokens": 4264023.0, "step": 2345 }, { "entropy": 6.326271295547485, "epoch": 2.0189084658358403, "grad_norm": 1.015625, "learning_rate": 0.00048235387090447894, "loss": 5.9306, "mean_token_accuracy": 0.1572665750980377, "num_tokens": 4273298.0, "step": 2350 }, { "entropy": 6.378605699539184, "epoch": 2.023205844434895, "grad_norm": 1.0390625, "learning_rate": 0.00048222457286716235, "loss": 5.8756, "mean_token_accuracy": 0.16723261177539825, "num_tokens": 4283244.0, "step": 2355 }, { "entropy": 6.322220325469971, "epoch": 2.0275032230339494, "grad_norm": 1.140625, "learning_rate": 0.00048209482233420564, "loss": 5.8185, "mean_token_accuracy": 0.1769508183002472, "num_tokens": 4291677.0, "step": 2360 }, { "entropy": 6.314945793151855, "epoch": 2.031800601633004, "grad_norm": 1.0546875, "learning_rate": 0.000481964619588932, "loss": 5.8793, "mean_token_accuracy": 0.16825687736272812, "num_tokens": 4300822.0, "step": 2365 }, { "entropy": 6.339528942108155, "epoch": 2.0360979802320585, "grad_norm": 1.0859375, "learning_rate": 0.0004818339649156523, "loss": 5.8876, "mean_token_accuracy": 0.16732898950576783, "num_tokens": 4310149.0, "step": 2370 }, { "entropy": 6.19782075881958, "epoch": 2.040395358831113, "grad_norm": 1.0078125, "learning_rate": 0.00048170285859966395, "loss": 5.7924, "mean_token_accuracy": 0.17466236799955367, "num_tokens": 4319109.0, "step": 2375 }, { "entropy": 6.3286045551300045, "epoch": 2.0446927374301676, "grad_norm": 0.984375, "learning_rate": 0.00048157130092725087, "loss": 5.7843, "mean_token_accuracy": 0.1704682469367981, "num_tokens": 4327921.0, "step": 2380 }, { "entropy": 6.329291915893554, "epoch": 2.048990116029222, "grad_norm": 1.0234375, "learning_rate": 0.0004814392921856824, "loss": 5.9287, "mean_token_accuracy": 0.16586144566535949, "num_tokens": 4338026.0, "step": 2385 }, { "entropy": 6.2563072681427006, "epoch": 2.0532874946282766, "grad_norm": 0.95703125, "learning_rate": 0.0004813068326632128, "loss": 5.7762, "mean_token_accuracy": 0.17654864937067033, "num_tokens": 4347794.0, "step": 2390 }, { "entropy": 6.329816913604736, "epoch": 2.057584873227331, "grad_norm": 1.078125, "learning_rate": 0.0004811739226490809, "loss": 5.9557, "mean_token_accuracy": 0.16758598685264586, "num_tokens": 4357249.0, "step": 2395 }, { "entropy": 6.283816623687744, "epoch": 2.0618822518263857, "grad_norm": 1.0625, "learning_rate": 0.00048104056243350896, "loss": 5.9041, "mean_token_accuracy": 0.16363563090562822, "num_tokens": 4366053.0, "step": 2400 }, { "entropy": 6.297672891616822, "epoch": 2.0661796304254403, "grad_norm": 0.98046875, "learning_rate": 0.0004809067523077023, "loss": 5.9163, "mean_token_accuracy": 0.16945113092660904, "num_tokens": 4375543.0, "step": 2405 }, { "entropy": 6.2845330238342285, "epoch": 2.0704770090244953, "grad_norm": 1.0625, "learning_rate": 0.00048077249256384884, "loss": 5.8006, "mean_token_accuracy": 0.17305675595998765, "num_tokens": 4384332.0, "step": 2410 }, { "entropy": 6.210544061660767, "epoch": 2.07477438762355, "grad_norm": 1.1953125, "learning_rate": 0.0004806377834951182, "loss": 5.8994, "mean_token_accuracy": 0.16216432005167009, "num_tokens": 4393670.0, "step": 2415 }, { "entropy": 6.373771142959595, "epoch": 2.0790717662226044, "grad_norm": 1.1328125, "learning_rate": 0.00048050262539566104, "loss": 5.9012, "mean_token_accuracy": 0.16862600147724152, "num_tokens": 4402763.0, "step": 2420 }, { "entropy": 6.269940948486328, "epoch": 2.083369144821659, "grad_norm": 0.984375, "learning_rate": 0.0004803670185606087, "loss": 5.8086, "mean_token_accuracy": 0.17335692346096038, "num_tokens": 4411863.0, "step": 2425 }, { "entropy": 6.265923166275025, "epoch": 2.0876665234207135, "grad_norm": 1.078125, "learning_rate": 0.0004802309632860724, "loss": 5.9059, "mean_token_accuracy": 0.16651569604873656, "num_tokens": 4421110.0, "step": 2430 }, { "entropy": 6.352302503585816, "epoch": 2.091963902019768, "grad_norm": 1.0390625, "learning_rate": 0.00048009445986914236, "loss": 5.8854, "mean_token_accuracy": 0.16589637845754623, "num_tokens": 4430249.0, "step": 2435 }, { "entropy": 6.263960170745849, "epoch": 2.0962612806188226, "grad_norm": 1.0078125, "learning_rate": 0.00047995750860788756, "loss": 5.8661, "mean_token_accuracy": 0.15910358875989913, "num_tokens": 4439686.0, "step": 2440 }, { "entropy": 6.227327108383179, "epoch": 2.100558659217877, "grad_norm": 1.1796875, "learning_rate": 0.0004798201098013547, "loss": 5.8709, "mean_token_accuracy": 0.1692453533411026, "num_tokens": 4448645.0, "step": 2445 }, { "entropy": 6.291311168670655, "epoch": 2.1048560378169316, "grad_norm": 0.96484375, "learning_rate": 0.00047968226374956797, "loss": 5.8333, "mean_token_accuracy": 0.1675017699599266, "num_tokens": 4456870.0, "step": 2450 }, { "entropy": 6.195930767059326, "epoch": 2.109153416415986, "grad_norm": 1.03125, "learning_rate": 0.00047954397075352794, "loss": 5.8684, "mean_token_accuracy": 0.17277338951826096, "num_tokens": 4466287.0, "step": 2455 }, { "entropy": 6.2388382911682125, "epoch": 2.1134507950150407, "grad_norm": 1.0703125, "learning_rate": 0.00047940523111521136, "loss": 5.7553, "mean_token_accuracy": 0.17395039051771163, "num_tokens": 4474461.0, "step": 2460 }, { "entropy": 6.255577421188354, "epoch": 2.1177481736140953, "grad_norm": 1.1875, "learning_rate": 0.0004792660451375701, "loss": 5.835, "mean_token_accuracy": 0.16953630596399308, "num_tokens": 4483002.0, "step": 2465 }, { "entropy": 6.224816513061524, "epoch": 2.12204555221315, "grad_norm": 1.0859375, "learning_rate": 0.00047912641312453064, "loss": 5.8459, "mean_token_accuracy": 0.1695180580019951, "num_tokens": 4492061.0, "step": 2470 }, { "entropy": 6.284405374526978, "epoch": 2.1263429308122044, "grad_norm": 0.9375, "learning_rate": 0.00047898633538099363, "loss": 5.8957, "mean_token_accuracy": 0.16090027987957, "num_tokens": 4501829.0, "step": 2475 }, { "entropy": 6.258666229248047, "epoch": 2.130640309411259, "grad_norm": 0.98828125, "learning_rate": 0.0004788458122128327, "loss": 5.9181, "mean_token_accuracy": 0.1656097248196602, "num_tokens": 4511539.0, "step": 2480 }, { "entropy": 6.246809720993042, "epoch": 2.134937688010314, "grad_norm": 1.0625, "learning_rate": 0.00047870484392689434, "loss": 5.7722, "mean_token_accuracy": 0.1671189084649086, "num_tokens": 4520425.0, "step": 2485 }, { "entropy": 6.220279026031494, "epoch": 2.1392350666093685, "grad_norm": 1.0859375, "learning_rate": 0.000478563430830997, "loss": 5.8751, "mean_token_accuracy": 0.16446918100118638, "num_tokens": 4529474.0, "step": 2490 }, { "entropy": 6.2571605205535885, "epoch": 2.143532445208423, "grad_norm": 1.0546875, "learning_rate": 0.00047842157323393035, "loss": 5.8041, "mean_token_accuracy": 0.1694269135594368, "num_tokens": 4538082.0, "step": 2495 }, { "entropy": 6.218803596496582, "epoch": 2.1478298238074776, "grad_norm": 1.015625, "learning_rate": 0.0004782792714454547, "loss": 5.9987, "mean_token_accuracy": 0.16337930560112, "num_tokens": 4547340.0, "step": 2500 }, { "epoch": 2.1478298238074776, "eval_entropy": 6.073525357890773, "eval_loss": 6.213027477264404, "eval_mean_token_accuracy": 0.15643914548999016, "eval_num_tokens": 4547340.0, "eval_runtime": 2.0452, "eval_samples_per_second": 1735.325, "eval_steps_per_second": 217.099, "step": 2500 } ], "logging_steps": 5, "max_steps": 11630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1024889190266880.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }