{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8594757198109153, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742608070373535, "epoch": 0.004297378599054577, "grad_norm": 5.46875, "learning_rate": 2e-06, "loss": 10.7643, "mean_token_accuracy": 7.587253348901868e-05, "num_tokens": 10107.0, "step": 5 }, { "entropy": 10.742630290985108, "epoch": 0.008594757198109154, "grad_norm": 5.78125, "learning_rate": 4.5e-06, "loss": 10.7086, "mean_token_accuracy": 0.0, "num_tokens": 18391.0, "step": 10 }, { "entropy": 10.74263505935669, "epoch": 0.01289213579716373, "grad_norm": 5.3125, "learning_rate": 7e-06, "loss": 10.6888, "mean_token_accuracy": 7.022471982054412e-05, "num_tokens": 27061.0, "step": 15 }, { "entropy": 10.742604160308838, "epoch": 0.017189514396218308, "grad_norm": 6.0, "learning_rate": 9.5e-06, "loss": 10.6611, "mean_token_accuracy": 0.0008422504703048617, "num_tokens": 36339.0, "step": 20 }, { "entropy": 10.742517948150635, "epoch": 0.021486892995272882, "grad_norm": 4.75, "learning_rate": 1.2e-05, "loss": 10.5317, "mean_token_accuracy": 0.02025789166800678, "num_tokens": 45770.0, "step": 25 }, { "entropy": 10.741962242126466, "epoch": 0.02578427159432746, "grad_norm": 4.25, "learning_rate": 1.4500000000000002e-05, "loss": 10.399, "mean_token_accuracy": 0.04876907132565975, "num_tokens": 54575.0, "step": 30 }, { "entropy": 10.73945140838623, "epoch": 0.030081650193382038, "grad_norm": 3.15625, "learning_rate": 1.7000000000000003e-05, "loss": 10.3065, "mean_token_accuracy": 0.0514072135090828, "num_tokens": 66403.0, "step": 35 }, { "entropy": 10.730937385559082, "epoch": 0.034379028792436615, "grad_norm": 2.640625, "learning_rate": 1.95e-05, "loss": 10.0976, "mean_token_accuracy": 0.05973539762198925, "num_tokens": 76510.0, "step": 40 }, { "entropy": 10.715238952636719, "epoch": 0.03867640739149119, "grad_norm": 2.40625, "learning_rate": 2.2e-05, "loss": 9.9688, "mean_token_accuracy": 0.05614017099142075, "num_tokens": 84836.0, "step": 45 }, { "entropy": 10.702037715911866, "epoch": 0.042973785990545764, "grad_norm": 2.046875, "learning_rate": 2.4500000000000003e-05, "loss": 9.9015, "mean_token_accuracy": 0.053829558193683624, "num_tokens": 93197.0, "step": 50 }, { "entropy": 10.697910690307618, "epoch": 0.047271164589600345, "grad_norm": 2.40625, "learning_rate": 2.7e-05, "loss": 9.8366, "mean_token_accuracy": 0.05843428298830986, "num_tokens": 101546.0, "step": 55 }, { "entropy": 10.693470478057861, "epoch": 0.05156854318865492, "grad_norm": 1.9609375, "learning_rate": 2.95e-05, "loss": 9.8429, "mean_token_accuracy": 0.0558084711432457, "num_tokens": 111703.0, "step": 60 }, { "entropy": 10.680869865417481, "epoch": 0.055865921787709494, "grad_norm": 1.9453125, "learning_rate": 3.2e-05, "loss": 9.7131, "mean_token_accuracy": 0.0589165486395359, "num_tokens": 119894.0, "step": 65 }, { "entropy": 10.668927574157715, "epoch": 0.060163300386764075, "grad_norm": 1.9765625, "learning_rate": 3.4500000000000005e-05, "loss": 9.6682, "mean_token_accuracy": 0.06148771904408932, "num_tokens": 128885.0, "step": 70 }, { "entropy": 10.654484272003174, "epoch": 0.06446067898581866, "grad_norm": 1.953125, "learning_rate": 3.7e-05, "loss": 9.6297, "mean_token_accuracy": 0.057728851959109304, "num_tokens": 138106.0, "step": 75 }, { "entropy": 10.645826625823975, "epoch": 0.06875805758487323, "grad_norm": 1.9296875, "learning_rate": 3.95e-05, "loss": 9.5722, "mean_token_accuracy": 0.058954347297549246, "num_tokens": 146691.0, "step": 80 }, { "entropy": 10.637816619873046, "epoch": 0.0730554361839278, "grad_norm": 1.90625, "learning_rate": 4.2000000000000004e-05, "loss": 9.5126, "mean_token_accuracy": 0.059067190065979956, "num_tokens": 155792.0, "step": 85 }, { "entropy": 10.63103084564209, "epoch": 0.07735281478298238, "grad_norm": 1.7890625, "learning_rate": 4.45e-05, "loss": 9.5251, "mean_token_accuracy": 0.0552229531109333, "num_tokens": 166944.0, "step": 90 }, { "entropy": 10.616693305969239, "epoch": 0.08165019338203695, "grad_norm": 1.96875, "learning_rate": 4.7000000000000004e-05, "loss": 9.3423, "mean_token_accuracy": 0.060124922543764114, "num_tokens": 175303.0, "step": 95 }, { "entropy": 10.591300106048584, "epoch": 0.08594757198109153, "grad_norm": 1.8203125, "learning_rate": 4.9500000000000004e-05, "loss": 9.3133, "mean_token_accuracy": 0.06174388714134693, "num_tokens": 184708.0, "step": 100 }, { "entropy": 10.564336776733398, "epoch": 0.09024495058014612, "grad_norm": 1.7890625, "learning_rate": 5.2e-05, "loss": 9.2307, "mean_token_accuracy": 0.0674959484487772, "num_tokens": 193835.0, "step": 105 }, { "entropy": 10.52622423171997, "epoch": 0.09454232917920069, "grad_norm": 1.8828125, "learning_rate": 5.45e-05, "loss": 9.1379, "mean_token_accuracy": 0.07480009235441684, "num_tokens": 203344.0, "step": 110 }, { "entropy": 10.454349136352539, "epoch": 0.09883970777825526, "grad_norm": 1.6171875, "learning_rate": 5.7e-05, "loss": 9.1209, "mean_token_accuracy": 0.06218625903129578, "num_tokens": 213048.0, "step": 115 }, { "entropy": 10.415324211120605, "epoch": 0.10313708637730984, "grad_norm": 1.578125, "learning_rate": 5.9499999999999996e-05, "loss": 8.9306, "mean_token_accuracy": 0.07533645890653133, "num_tokens": 221784.0, "step": 120 }, { "entropy": 10.303644943237305, "epoch": 0.10743446497636441, "grad_norm": 1.4765625, "learning_rate": 6.2e-05, "loss": 8.8509, "mean_token_accuracy": 0.07504003196954727, "num_tokens": 230971.0, "step": 125 }, { "entropy": 10.209668159484863, "epoch": 0.11173184357541899, "grad_norm": 1.4296875, "learning_rate": 6.450000000000001e-05, "loss": 8.7412, "mean_token_accuracy": 0.07478504739701748, "num_tokens": 240524.0, "step": 130 }, { "entropy": 10.153745365142822, "epoch": 0.11602922217447358, "grad_norm": 1.3359375, "learning_rate": 6.7e-05, "loss": 8.6323, "mean_token_accuracy": 0.07354197278618813, "num_tokens": 249220.0, "step": 135 }, { "entropy": 10.068094253540039, "epoch": 0.12032660077352815, "grad_norm": 1.3125, "learning_rate": 6.950000000000001e-05, "loss": 8.61, "mean_token_accuracy": 0.07049238979816437, "num_tokens": 258934.0, "step": 140 }, { "entropy": 9.973960685729981, "epoch": 0.12462397937258272, "grad_norm": 1.2734375, "learning_rate": 7.2e-05, "loss": 8.4673, "mean_token_accuracy": 0.07534252405166626, "num_tokens": 267680.0, "step": 145 }, { "entropy": 9.815561103820801, "epoch": 0.1289213579716373, "grad_norm": 1.09375, "learning_rate": 7.45e-05, "loss": 8.3709, "mean_token_accuracy": 0.07952065020799637, "num_tokens": 276227.0, "step": 150 }, { "entropy": 9.66996259689331, "epoch": 0.1332187365706919, "grad_norm": 1.1875, "learning_rate": 7.7e-05, "loss": 8.2269, "mean_token_accuracy": 0.08225171342492103, "num_tokens": 286342.0, "step": 155 }, { "entropy": 9.510671615600586, "epoch": 0.13751611516974646, "grad_norm": 0.953125, "learning_rate": 7.950000000000001e-05, "loss": 8.1921, "mean_token_accuracy": 0.0742720566689968, "num_tokens": 294994.0, "step": 160 }, { "entropy": 9.346861934661865, "epoch": 0.14181349376880104, "grad_norm": 0.984375, "learning_rate": 8.2e-05, "loss": 8.113, "mean_token_accuracy": 0.08004417940974236, "num_tokens": 303882.0, "step": 165 }, { "entropy": 9.199288940429687, "epoch": 0.1461108723678556, "grad_norm": 0.9296875, "learning_rate": 8.450000000000001e-05, "loss": 8.0403, "mean_token_accuracy": 0.07799897268414498, "num_tokens": 312515.0, "step": 170 }, { "entropy": 8.978620052337646, "epoch": 0.15040825096691018, "grad_norm": 0.9375, "learning_rate": 8.7e-05, "loss": 7.9977, "mean_token_accuracy": 0.07381256259977817, "num_tokens": 320801.0, "step": 175 }, { "entropy": 8.861582374572754, "epoch": 0.15470562956596476, "grad_norm": 0.9765625, "learning_rate": 8.95e-05, "loss": 7.9642, "mean_token_accuracy": 0.08192512467503547, "num_tokens": 329382.0, "step": 180 }, { "entropy": 8.755144786834716, "epoch": 0.15900300816501933, "grad_norm": 0.9296875, "learning_rate": 9.2e-05, "loss": 7.9273, "mean_token_accuracy": 0.07583913430571557, "num_tokens": 337894.0, "step": 185 }, { "entropy": 8.582227611541748, "epoch": 0.1633003867640739, "grad_norm": 0.8984375, "learning_rate": 9.45e-05, "loss": 7.9012, "mean_token_accuracy": 0.07614588961005211, "num_tokens": 346380.0, "step": 190 }, { "entropy": 8.591823768615722, "epoch": 0.16759776536312848, "grad_norm": 0.9609375, "learning_rate": 9.7e-05, "loss": 7.9407, "mean_token_accuracy": 0.07390806600451469, "num_tokens": 356305.0, "step": 195 }, { "entropy": 8.515201950073243, "epoch": 0.17189514396218306, "grad_norm": 1.1328125, "learning_rate": 9.95e-05, "loss": 7.8901, "mean_token_accuracy": 0.07247771993279457, "num_tokens": 364899.0, "step": 200 }, { "entropy": 8.457213211059571, "epoch": 0.17619252256123766, "grad_norm": 0.93359375, "learning_rate": 0.000102, "loss": 7.8566, "mean_token_accuracy": 0.0781160645186901, "num_tokens": 373663.0, "step": 205 }, { "entropy": 8.381179523468017, "epoch": 0.18048990116029223, "grad_norm": 0.95703125, "learning_rate": 0.00010449999999999999, "loss": 7.8221, "mean_token_accuracy": 0.07758632972836495, "num_tokens": 382730.0, "step": 210 }, { "entropy": 8.390653896331788, "epoch": 0.1847872797593468, "grad_norm": 0.921875, "learning_rate": 0.000107, "loss": 7.8622, "mean_token_accuracy": 0.071787304058671, "num_tokens": 392676.0, "step": 215 }, { "entropy": 8.255177211761474, "epoch": 0.18908465835840138, "grad_norm": 1.1015625, "learning_rate": 0.0001095, "loss": 7.8473, "mean_token_accuracy": 0.08185218423604965, "num_tokens": 401050.0, "step": 220 }, { "entropy": 8.367721462249756, "epoch": 0.19338203695745596, "grad_norm": 0.796875, "learning_rate": 0.000112, "loss": 7.795, "mean_token_accuracy": 0.07991239950060844, "num_tokens": 410009.0, "step": 225 }, { "entropy": 8.268333339691162, "epoch": 0.19767941555651053, "grad_norm": 0.859375, "learning_rate": 0.0001145, "loss": 7.7757, "mean_token_accuracy": 0.08171008005738259, "num_tokens": 419302.0, "step": 230 }, { "entropy": 8.304029846191407, "epoch": 0.2019767941555651, "grad_norm": 0.984375, "learning_rate": 0.00011700000000000001, "loss": 7.6812, "mean_token_accuracy": 0.08820762410759926, "num_tokens": 427296.0, "step": 235 }, { "entropy": 8.16576337814331, "epoch": 0.20627417275461968, "grad_norm": 0.91796875, "learning_rate": 0.00011949999999999999, "loss": 7.8198, "mean_token_accuracy": 0.07870872803032399, "num_tokens": 436368.0, "step": 240 }, { "entropy": 8.189785575866699, "epoch": 0.21057155135367425, "grad_norm": 1.28125, "learning_rate": 0.000122, "loss": 7.7389, "mean_token_accuracy": 0.08551637679338456, "num_tokens": 445535.0, "step": 245 }, { "entropy": 8.265625381469727, "epoch": 0.21486892995272883, "grad_norm": 0.8671875, "learning_rate": 0.0001245, "loss": 7.7093, "mean_token_accuracy": 0.07919453792273998, "num_tokens": 454769.0, "step": 250 }, { "entropy": 8.1545090675354, "epoch": 0.2191663085517834, "grad_norm": 0.93359375, "learning_rate": 0.000127, "loss": 7.7315, "mean_token_accuracy": 0.0871740497648716, "num_tokens": 463975.0, "step": 255 }, { "entropy": 8.13952112197876, "epoch": 0.22346368715083798, "grad_norm": 0.88671875, "learning_rate": 0.0001295, "loss": 7.726, "mean_token_accuracy": 0.08799278363585472, "num_tokens": 472899.0, "step": 260 }, { "entropy": 8.196070003509522, "epoch": 0.22776106574989258, "grad_norm": 0.93359375, "learning_rate": 0.000132, "loss": 7.7354, "mean_token_accuracy": 0.08013860881328583, "num_tokens": 481556.0, "step": 265 }, { "entropy": 8.114658737182618, "epoch": 0.23205844434894715, "grad_norm": 0.91015625, "learning_rate": 0.00013450000000000002, "loss": 7.7023, "mean_token_accuracy": 0.0854449674487114, "num_tokens": 490253.0, "step": 270 }, { "entropy": 8.193334579467773, "epoch": 0.23635582294800173, "grad_norm": 1.09375, "learning_rate": 0.00013700000000000002, "loss": 7.7066, "mean_token_accuracy": 0.0806311085820198, "num_tokens": 498444.0, "step": 275 }, { "entropy": 8.104936504364014, "epoch": 0.2406532015470563, "grad_norm": 0.8046875, "learning_rate": 0.0001395, "loss": 7.6467, "mean_token_accuracy": 0.08675235286355018, "num_tokens": 508330.0, "step": 280 }, { "entropy": 8.113396596908569, "epoch": 0.24495058014611087, "grad_norm": 1.015625, "learning_rate": 0.00014199999999999998, "loss": 7.7405, "mean_token_accuracy": 0.08165572881698609, "num_tokens": 517900.0, "step": 285 }, { "entropy": 8.046846723556518, "epoch": 0.24924795874516545, "grad_norm": 0.93359375, "learning_rate": 0.0001445, "loss": 7.6901, "mean_token_accuracy": 0.08230286985635757, "num_tokens": 527808.0, "step": 290 }, { "entropy": 8.13338761329651, "epoch": 0.25354533734422, "grad_norm": 0.8984375, "learning_rate": 0.000147, "loss": 7.6711, "mean_token_accuracy": 0.08156475871801376, "num_tokens": 536931.0, "step": 295 }, { "entropy": 8.18837013244629, "epoch": 0.2578427159432746, "grad_norm": 1.1875, "learning_rate": 0.0001495, "loss": 7.7049, "mean_token_accuracy": 0.0835341140627861, "num_tokens": 545758.0, "step": 300 }, { "entropy": 8.025089168548584, "epoch": 0.26214009454232917, "grad_norm": 0.9921875, "learning_rate": 0.000152, "loss": 7.7131, "mean_token_accuracy": 0.08242038711905479, "num_tokens": 555165.0, "step": 305 }, { "entropy": 8.155539417266846, "epoch": 0.2664374731413838, "grad_norm": 0.86328125, "learning_rate": 0.00015450000000000001, "loss": 7.6144, "mean_token_accuracy": 0.08789716809988021, "num_tokens": 564719.0, "step": 310 }, { "entropy": 8.041153383255004, "epoch": 0.2707348517404383, "grad_norm": 1.0, "learning_rate": 0.000157, "loss": 7.594, "mean_token_accuracy": 0.09155945181846618, "num_tokens": 573572.0, "step": 315 }, { "entropy": 8.15259666442871, "epoch": 0.2750322303394929, "grad_norm": 1.0859375, "learning_rate": 0.0001595, "loss": 7.7634, "mean_token_accuracy": 0.08318910300731659, "num_tokens": 581497.0, "step": 320 }, { "entropy": 8.100253248214722, "epoch": 0.27932960893854747, "grad_norm": 1.125, "learning_rate": 0.000162, "loss": 7.6118, "mean_token_accuracy": 0.08767011985182763, "num_tokens": 591107.0, "step": 325 }, { "entropy": 7.984478855133057, "epoch": 0.28362698753760207, "grad_norm": 0.84765625, "learning_rate": 0.00016450000000000001, "loss": 7.6456, "mean_token_accuracy": 0.08353794142603874, "num_tokens": 600241.0, "step": 330 }, { "entropy": 8.057686376571656, "epoch": 0.2879243661366566, "grad_norm": 0.91796875, "learning_rate": 0.00016700000000000002, "loss": 7.5776, "mean_token_accuracy": 0.08751234114170074, "num_tokens": 608697.0, "step": 335 }, { "entropy": 8.016141748428344, "epoch": 0.2922217447357112, "grad_norm": 0.9453125, "learning_rate": 0.00016950000000000003, "loss": 7.568, "mean_token_accuracy": 0.09023259431123734, "num_tokens": 617275.0, "step": 340 }, { "entropy": 8.084819841384888, "epoch": 0.29651912333476577, "grad_norm": 0.8984375, "learning_rate": 0.00017199999999999998, "loss": 7.6405, "mean_token_accuracy": 0.08630914464592934, "num_tokens": 626644.0, "step": 345 }, { "entropy": 8.008595705032349, "epoch": 0.30081650193382037, "grad_norm": 0.98828125, "learning_rate": 0.00017449999999999999, "loss": 7.5665, "mean_token_accuracy": 0.08766811862587928, "num_tokens": 635110.0, "step": 350 }, { "entropy": 8.04712610244751, "epoch": 0.30511388053287497, "grad_norm": 0.87109375, "learning_rate": 0.000177, "loss": 7.7031, "mean_token_accuracy": 0.08570141717791557, "num_tokens": 644746.0, "step": 355 }, { "entropy": 8.179811954498291, "epoch": 0.3094112591319295, "grad_norm": 1.1015625, "learning_rate": 0.0001795, "loss": 7.5831, "mean_token_accuracy": 0.08595824986696243, "num_tokens": 654281.0, "step": 360 }, { "entropy": 7.987443113327027, "epoch": 0.3137086377309841, "grad_norm": 1.203125, "learning_rate": 0.000182, "loss": 7.585, "mean_token_accuracy": 0.09283285215497017, "num_tokens": 663174.0, "step": 365 }, { "entropy": 7.916810417175293, "epoch": 0.31800601633003867, "grad_norm": 0.90625, "learning_rate": 0.0001845, "loss": 7.511, "mean_token_accuracy": 0.08863886222243308, "num_tokens": 672178.0, "step": 370 }, { "entropy": 8.005489206314087, "epoch": 0.32230339492909327, "grad_norm": 0.96484375, "learning_rate": 0.000187, "loss": 7.5218, "mean_token_accuracy": 0.09131815880537034, "num_tokens": 681323.0, "step": 375 }, { "entropy": 7.9803643226623535, "epoch": 0.3266007735281478, "grad_norm": 0.890625, "learning_rate": 0.0001895, "loss": 7.4406, "mean_token_accuracy": 0.08985799476504326, "num_tokens": 690461.0, "step": 380 }, { "entropy": 7.829833698272705, "epoch": 0.3308981521272024, "grad_norm": 1.046875, "learning_rate": 0.000192, "loss": 7.5004, "mean_token_accuracy": 0.08490158319473266, "num_tokens": 699199.0, "step": 385 }, { "entropy": 8.038139152526856, "epoch": 0.33519553072625696, "grad_norm": 1.1484375, "learning_rate": 0.0001945, "loss": 7.4484, "mean_token_accuracy": 0.09670188426971435, "num_tokens": 707949.0, "step": 390 }, { "entropy": 7.9735198497772215, "epoch": 0.33949290932531156, "grad_norm": 1.203125, "learning_rate": 0.00019700000000000002, "loss": 7.5219, "mean_token_accuracy": 0.08999367579817771, "num_tokens": 715752.0, "step": 395 }, { "entropy": 7.93391604423523, "epoch": 0.3437902879243661, "grad_norm": 1.1171875, "learning_rate": 0.00019950000000000002, "loss": 7.4479, "mean_token_accuracy": 0.0979436494410038, "num_tokens": 724416.0, "step": 400 }, { "entropy": 7.925309085845948, "epoch": 0.3480876665234207, "grad_norm": 1.0546875, "learning_rate": 0.000202, "loss": 7.4953, "mean_token_accuracy": 0.09031900316476822, "num_tokens": 733116.0, "step": 405 }, { "entropy": 7.916099977493286, "epoch": 0.3523850451224753, "grad_norm": 1.0625, "learning_rate": 0.00020449999999999998, "loss": 7.4726, "mean_token_accuracy": 0.09227924942970275, "num_tokens": 742093.0, "step": 410 }, { "entropy": 7.918701934814453, "epoch": 0.35668242372152986, "grad_norm": 1.046875, "learning_rate": 0.000207, "loss": 7.4649, "mean_token_accuracy": 0.09618089124560356, "num_tokens": 750402.0, "step": 415 }, { "entropy": 7.816703271865845, "epoch": 0.36097980232058446, "grad_norm": 0.9140625, "learning_rate": 0.0002095, "loss": 7.4336, "mean_token_accuracy": 0.09461462944746017, "num_tokens": 760961.0, "step": 420 }, { "entropy": 7.944287586212158, "epoch": 0.365277180919639, "grad_norm": 1.0390625, "learning_rate": 0.000212, "loss": 7.4865, "mean_token_accuracy": 0.09455274268984795, "num_tokens": 770554.0, "step": 425 }, { "entropy": 7.750526332855225, "epoch": 0.3695745595186936, "grad_norm": 1.03125, "learning_rate": 0.0002145, "loss": 7.4618, "mean_token_accuracy": 0.09681151732802391, "num_tokens": 779172.0, "step": 430 }, { "entropy": 7.9787256717681885, "epoch": 0.37387193811774816, "grad_norm": 0.984375, "learning_rate": 0.00021700000000000002, "loss": 7.5123, "mean_token_accuracy": 0.08840151131153107, "num_tokens": 788040.0, "step": 435 }, { "entropy": 7.883750295639038, "epoch": 0.37816931671680276, "grad_norm": 1.109375, "learning_rate": 0.0002195, "loss": 7.4135, "mean_token_accuracy": 0.0939902700483799, "num_tokens": 796786.0, "step": 440 }, { "entropy": 7.851776885986328, "epoch": 0.3824666953158573, "grad_norm": 1.09375, "learning_rate": 0.000222, "loss": 7.4233, "mean_token_accuracy": 0.0923767201602459, "num_tokens": 805520.0, "step": 445 }, { "entropy": 7.805376100540161, "epoch": 0.3867640739149119, "grad_norm": 1.1484375, "learning_rate": 0.0002245, "loss": 7.3508, "mean_token_accuracy": 0.09647825658321381, "num_tokens": 814939.0, "step": 450 }, { "entropy": 7.874559307098389, "epoch": 0.39106145251396646, "grad_norm": 1.2265625, "learning_rate": 0.00022700000000000002, "loss": 7.3531, "mean_token_accuracy": 0.09795481041073799, "num_tokens": 823862.0, "step": 455 }, { "entropy": 7.7626677513122555, "epoch": 0.39535883111302106, "grad_norm": 1.1328125, "learning_rate": 0.00022950000000000002, "loss": 7.3918, "mean_token_accuracy": 0.09068166017532349, "num_tokens": 832820.0, "step": 460 }, { "entropy": 7.928297901153565, "epoch": 0.39965620971207566, "grad_norm": 1.1171875, "learning_rate": 0.00023200000000000003, "loss": 7.3494, "mean_token_accuracy": 0.09501236006617546, "num_tokens": 841538.0, "step": 465 }, { "entropy": 7.7496504306793215, "epoch": 0.4039535883111302, "grad_norm": 0.99609375, "learning_rate": 0.00023449999999999998, "loss": 7.4626, "mean_token_accuracy": 0.09104103595018387, "num_tokens": 851123.0, "step": 470 }, { "entropy": 7.8953351974487305, "epoch": 0.4082509669101848, "grad_norm": 1.125, "learning_rate": 0.000237, "loss": 7.4266, "mean_token_accuracy": 0.09596899375319481, "num_tokens": 860357.0, "step": 475 }, { "entropy": 7.76341495513916, "epoch": 0.41254834550923936, "grad_norm": 1.0703125, "learning_rate": 0.0002395, "loss": 7.3425, "mean_token_accuracy": 0.09861095696687698, "num_tokens": 869980.0, "step": 480 }, { "entropy": 7.82184157371521, "epoch": 0.41684572410829396, "grad_norm": 1.03125, "learning_rate": 0.000242, "loss": 7.2999, "mean_token_accuracy": 0.10065284445881843, "num_tokens": 878250.0, "step": 485 }, { "entropy": 7.76347074508667, "epoch": 0.4211431027073485, "grad_norm": 1.25, "learning_rate": 0.0002445, "loss": 7.4007, "mean_token_accuracy": 0.095355936139822, "num_tokens": 887624.0, "step": 490 }, { "entropy": 7.753844261169434, "epoch": 0.4254404813064031, "grad_norm": 1.1484375, "learning_rate": 0.000247, "loss": 7.3568, "mean_token_accuracy": 0.09853926301002502, "num_tokens": 897120.0, "step": 495 }, { "entropy": 7.802051830291748, "epoch": 0.42973785990545765, "grad_norm": 1.03125, "learning_rate": 0.0002495, "loss": 7.3179, "mean_token_accuracy": 0.10127250477671623, "num_tokens": 906215.0, "step": 500 }, { "epoch": 0.42973785990545765, "eval_entropy": 7.412716417699246, "eval_loss": 7.3790483474731445, "eval_mean_token_accuracy": 0.09986981684929347, "eval_num_tokens": 906215.0, "eval_runtime": 2.0966, "eval_samples_per_second": 1692.736, "eval_steps_per_second": 211.771, "step": 500 }, { "entropy": 7.651102495193482, "epoch": 0.43403523850451226, "grad_norm": 1.09375, "learning_rate": 0.000252, "loss": 7.3112, "mean_token_accuracy": 0.10008608102798462, "num_tokens": 915181.0, "step": 505 }, { "entropy": 7.728409194946289, "epoch": 0.4383326171035668, "grad_norm": 1.0703125, "learning_rate": 0.0002545, "loss": 7.3388, "mean_token_accuracy": 0.09651862978935241, "num_tokens": 924377.0, "step": 510 }, { "entropy": 7.770003318786621, "epoch": 0.4426299957026214, "grad_norm": 0.984375, "learning_rate": 0.000257, "loss": 7.4098, "mean_token_accuracy": 0.09438847750425339, "num_tokens": 933114.0, "step": 515 }, { "entropy": 7.86782751083374, "epoch": 0.44692737430167595, "grad_norm": 0.9375, "learning_rate": 0.0002595, "loss": 7.3692, "mean_token_accuracy": 0.09444344118237495, "num_tokens": 943306.0, "step": 520 }, { "entropy": 7.659075498580933, "epoch": 0.45122475290073055, "grad_norm": 1.1875, "learning_rate": 0.000262, "loss": 7.2626, "mean_token_accuracy": 0.10587219074368477, "num_tokens": 951515.0, "step": 525 }, { "entropy": 7.713227224349976, "epoch": 0.45552213149978515, "grad_norm": 1.015625, "learning_rate": 0.00026450000000000003, "loss": 7.3711, "mean_token_accuracy": 0.09387057200074196, "num_tokens": 962686.0, "step": 530 }, { "entropy": 7.780395078659057, "epoch": 0.4598195100988397, "grad_norm": 1.09375, "learning_rate": 0.00026700000000000004, "loss": 7.3777, "mean_token_accuracy": 0.10021266266703606, "num_tokens": 972136.0, "step": 535 }, { "entropy": 7.657458114624023, "epoch": 0.4641168886978943, "grad_norm": 1.09375, "learning_rate": 0.00026950000000000005, "loss": 7.2696, "mean_token_accuracy": 0.10345774069428444, "num_tokens": 981301.0, "step": 540 }, { "entropy": 7.700049114227295, "epoch": 0.46841426729694885, "grad_norm": 1.1484375, "learning_rate": 0.00027200000000000005, "loss": 7.2923, "mean_token_accuracy": 0.10189392492175102, "num_tokens": 990360.0, "step": 545 }, { "entropy": 7.770557546615601, "epoch": 0.47271164589600345, "grad_norm": 1.0859375, "learning_rate": 0.0002745, "loss": 7.3438, "mean_token_accuracy": 0.09953725263476372, "num_tokens": 999415.0, "step": 550 }, { "entropy": 7.656623125076294, "epoch": 0.477009024495058, "grad_norm": 1.0625, "learning_rate": 0.000277, "loss": 7.2635, "mean_token_accuracy": 0.10239741951227188, "num_tokens": 1008762.0, "step": 555 }, { "entropy": 7.690563821792603, "epoch": 0.4813064030941126, "grad_norm": 1.171875, "learning_rate": 0.0002795, "loss": 7.2652, "mean_token_accuracy": 0.10631422251462937, "num_tokens": 1017704.0, "step": 560 }, { "entropy": 7.641897583007813, "epoch": 0.48560378169316715, "grad_norm": 1.1640625, "learning_rate": 0.00028199999999999997, "loss": 7.2341, "mean_token_accuracy": 0.10428761765360832, "num_tokens": 1026251.0, "step": 565 }, { "entropy": 7.641419315338135, "epoch": 0.48990116029222175, "grad_norm": 1.03125, "learning_rate": 0.0002845, "loss": 7.2158, "mean_token_accuracy": 0.10731100514531136, "num_tokens": 1036191.0, "step": 570 }, { "entropy": 7.658735990524292, "epoch": 0.4941985388912763, "grad_norm": 1.0859375, "learning_rate": 0.000287, "loss": 7.2462, "mean_token_accuracy": 0.10594421103596688, "num_tokens": 1044936.0, "step": 575 }, { "entropy": 7.621677112579346, "epoch": 0.4984959174903309, "grad_norm": 1.1171875, "learning_rate": 0.0002895, "loss": 7.2472, "mean_token_accuracy": 0.10367096737027168, "num_tokens": 1053683.0, "step": 580 }, { "entropy": 7.570435047149658, "epoch": 0.5027932960893855, "grad_norm": 1.046875, "learning_rate": 0.000292, "loss": 7.2271, "mean_token_accuracy": 0.1076263040304184, "num_tokens": 1062932.0, "step": 585 }, { "entropy": 7.723283386230468, "epoch": 0.50709067468844, "grad_norm": 0.98828125, "learning_rate": 0.0002945, "loss": 7.2544, "mean_token_accuracy": 0.10264097228646278, "num_tokens": 1072313.0, "step": 590 }, { "entropy": 7.62511043548584, "epoch": 0.5113880532874946, "grad_norm": 1.171875, "learning_rate": 0.000297, "loss": 7.2228, "mean_token_accuracy": 0.09801378548145294, "num_tokens": 1081675.0, "step": 595 }, { "entropy": 7.608328151702881, "epoch": 0.5156854318865493, "grad_norm": 1.0703125, "learning_rate": 0.0002995, "loss": 7.2433, "mean_token_accuracy": 0.10141062065958976, "num_tokens": 1091541.0, "step": 600 }, { "entropy": 7.695394897460938, "epoch": 0.5199828104856038, "grad_norm": 1.015625, "learning_rate": 0.000302, "loss": 7.2462, "mean_token_accuracy": 0.10475782826542854, "num_tokens": 1100724.0, "step": 605 }, { "entropy": 7.50453405380249, "epoch": 0.5242801890846583, "grad_norm": 1.0546875, "learning_rate": 0.0003045, "loss": 7.1924, "mean_token_accuracy": 0.1077597513794899, "num_tokens": 1108869.0, "step": 610 }, { "entropy": 7.644835519790649, "epoch": 0.5285775676837129, "grad_norm": 1.1015625, "learning_rate": 0.000307, "loss": 7.2261, "mean_token_accuracy": 0.10431057810783387, "num_tokens": 1117314.0, "step": 615 }, { "entropy": 7.488267469406128, "epoch": 0.5328749462827675, "grad_norm": 1.109375, "learning_rate": 0.0003095, "loss": 7.148, "mean_token_accuracy": 0.10711429193615914, "num_tokens": 1126786.0, "step": 620 }, { "entropy": 7.577956056594848, "epoch": 0.5371723248818221, "grad_norm": 1.3046875, "learning_rate": 0.000312, "loss": 7.1645, "mean_token_accuracy": 0.10579404905438423, "num_tokens": 1136013.0, "step": 625 }, { "entropy": 7.527575206756592, "epoch": 0.5414697034808766, "grad_norm": 1.109375, "learning_rate": 0.0003145, "loss": 7.1969, "mean_token_accuracy": 0.10749110653996467, "num_tokens": 1144970.0, "step": 630 }, { "entropy": 7.613465976715088, "epoch": 0.5457670820799312, "grad_norm": 1.2578125, "learning_rate": 0.000317, "loss": 7.1614, "mean_token_accuracy": 0.11203600242733955, "num_tokens": 1153810.0, "step": 635 }, { "entropy": 7.521342611312866, "epoch": 0.5500644606789858, "grad_norm": 1.0546875, "learning_rate": 0.0003195, "loss": 7.1408, "mean_token_accuracy": 0.10991051346063614, "num_tokens": 1162498.0, "step": 640 }, { "entropy": 7.5313867092132565, "epoch": 0.5543618392780404, "grad_norm": 1.0546875, "learning_rate": 0.000322, "loss": 7.2164, "mean_token_accuracy": 0.1044546626508236, "num_tokens": 1172091.0, "step": 645 }, { "entropy": 7.653256607055664, "epoch": 0.5586592178770949, "grad_norm": 1.1015625, "learning_rate": 0.00032450000000000003, "loss": 7.1977, "mean_token_accuracy": 0.10631284043192864, "num_tokens": 1181400.0, "step": 650 }, { "entropy": 7.537307643890381, "epoch": 0.5629565964761496, "grad_norm": 1.2890625, "learning_rate": 0.00032700000000000003, "loss": 7.1721, "mean_token_accuracy": 0.11125476211309433, "num_tokens": 1189780.0, "step": 655 }, { "entropy": 7.477937269210815, "epoch": 0.5672539750752041, "grad_norm": 1.1875, "learning_rate": 0.00032950000000000004, "loss": 7.1315, "mean_token_accuracy": 0.1057468131184578, "num_tokens": 1198671.0, "step": 660 }, { "entropy": 7.589753818511963, "epoch": 0.5715513536742587, "grad_norm": 1.09375, "learning_rate": 0.00033200000000000005, "loss": 7.1652, "mean_token_accuracy": 0.1051194004714489, "num_tokens": 1207173.0, "step": 665 }, { "entropy": 7.461796855926513, "epoch": 0.5758487322733132, "grad_norm": 1.21875, "learning_rate": 0.00033450000000000005, "loss": 7.0998, "mean_token_accuracy": 0.11046240702271462, "num_tokens": 1216387.0, "step": 670 }, { "entropy": 7.622633552551269, "epoch": 0.5801461108723679, "grad_norm": 1.0234375, "learning_rate": 0.000337, "loss": 7.0722, "mean_token_accuracy": 0.11004948541522026, "num_tokens": 1224461.0, "step": 675 }, { "entropy": 7.451505851745606, "epoch": 0.5844434894714224, "grad_norm": 1.1796875, "learning_rate": 0.0003395, "loss": 7.1414, "mean_token_accuracy": 0.11011224165558815, "num_tokens": 1233774.0, "step": 680 }, { "entropy": 7.457524538040161, "epoch": 0.588740868070477, "grad_norm": 1.2109375, "learning_rate": 0.000342, "loss": 7.0938, "mean_token_accuracy": 0.1142980344593525, "num_tokens": 1242812.0, "step": 685 }, { "entropy": 7.605640840530396, "epoch": 0.5930382466695315, "grad_norm": 1.03125, "learning_rate": 0.00034449999999999997, "loss": 7.191, "mean_token_accuracy": 0.11035142987966537, "num_tokens": 1252872.0, "step": 690 }, { "entropy": 7.307473850250244, "epoch": 0.5973356252685862, "grad_norm": 1.1796875, "learning_rate": 0.000347, "loss": 6.983, "mean_token_accuracy": 0.11081922426819801, "num_tokens": 1260852.0, "step": 695 }, { "entropy": 7.438599157333374, "epoch": 0.6016330038676407, "grad_norm": 1.2578125, "learning_rate": 0.0003495, "loss": 7.0984, "mean_token_accuracy": 0.10763570070266723, "num_tokens": 1268925.0, "step": 700 }, { "entropy": 7.530004072189331, "epoch": 0.6059303824666953, "grad_norm": 1.109375, "learning_rate": 0.000352, "loss": 7.145, "mean_token_accuracy": 0.10653513446450233, "num_tokens": 1278994.0, "step": 705 }, { "entropy": 7.4260091304779055, "epoch": 0.6102277610657499, "grad_norm": 1.1640625, "learning_rate": 0.0003545, "loss": 7.1323, "mean_token_accuracy": 0.10368426591157913, "num_tokens": 1287698.0, "step": 710 }, { "entropy": 7.482218551635742, "epoch": 0.6145251396648045, "grad_norm": 1.0546875, "learning_rate": 0.000357, "loss": 7.0787, "mean_token_accuracy": 0.11120296269655228, "num_tokens": 1297475.0, "step": 715 }, { "entropy": 7.480340671539307, "epoch": 0.618822518263859, "grad_norm": 1.1328125, "learning_rate": 0.0003595, "loss": 7.1091, "mean_token_accuracy": 0.11085583940148354, "num_tokens": 1306836.0, "step": 720 }, { "entropy": 7.506947946548462, "epoch": 0.6231198968629136, "grad_norm": 1.03125, "learning_rate": 0.000362, "loss": 7.1377, "mean_token_accuracy": 0.10435779988765717, "num_tokens": 1315872.0, "step": 725 }, { "entropy": 7.4788847923278805, "epoch": 0.6274172754619682, "grad_norm": 1.1796875, "learning_rate": 0.0003645, "loss": 7.0782, "mean_token_accuracy": 0.11685637310147286, "num_tokens": 1324624.0, "step": 730 }, { "entropy": 7.444537830352783, "epoch": 0.6317146540610228, "grad_norm": 1.15625, "learning_rate": 0.000367, "loss": 7.061, "mean_token_accuracy": 0.11548577472567559, "num_tokens": 1333058.0, "step": 735 }, { "entropy": 7.262284660339356, "epoch": 0.6360120326600773, "grad_norm": 1.078125, "learning_rate": 0.0003695, "loss": 7.0248, "mean_token_accuracy": 0.11004846841096878, "num_tokens": 1342376.0, "step": 740 }, { "entropy": 7.526681852340698, "epoch": 0.6403094112591319, "grad_norm": 1.1484375, "learning_rate": 0.000372, "loss": 7.0693, "mean_token_accuracy": 0.10503109246492386, "num_tokens": 1351386.0, "step": 745 }, { "entropy": 7.364239978790283, "epoch": 0.6446067898581865, "grad_norm": 1.265625, "learning_rate": 0.0003745, "loss": 6.9832, "mean_token_accuracy": 0.11761592403054237, "num_tokens": 1358958.0, "step": 750 }, { "entropy": 7.496349859237671, "epoch": 0.6489041684572411, "grad_norm": 1.109375, "learning_rate": 0.000377, "loss": 7.1231, "mean_token_accuracy": 0.10967899858951569, "num_tokens": 1368599.0, "step": 755 }, { "entropy": 7.435608530044556, "epoch": 0.6532015470562956, "grad_norm": 1.890625, "learning_rate": 0.0003795, "loss": 7.1433, "mean_token_accuracy": 0.1064300425350666, "num_tokens": 1378529.0, "step": 760 }, { "entropy": 7.344243001937866, "epoch": 0.6574989256553503, "grad_norm": 1.25, "learning_rate": 0.000382, "loss": 6.9306, "mean_token_accuracy": 0.11750481277704239, "num_tokens": 1386993.0, "step": 765 }, { "entropy": 7.390715217590332, "epoch": 0.6617963042544048, "grad_norm": 1.5, "learning_rate": 0.0003845, "loss": 7.0322, "mean_token_accuracy": 0.11829963177442551, "num_tokens": 1395790.0, "step": 770 }, { "entropy": 7.302670812606811, "epoch": 0.6660936828534594, "grad_norm": 1.078125, "learning_rate": 0.00038700000000000003, "loss": 7.0393, "mean_token_accuracy": 0.11235549300909042, "num_tokens": 1405587.0, "step": 775 }, { "entropy": 7.348860168457032, "epoch": 0.6703910614525139, "grad_norm": 1.0390625, "learning_rate": 0.00038950000000000003, "loss": 6.9999, "mean_token_accuracy": 0.11504087448120118, "num_tokens": 1414478.0, "step": 780 }, { "entropy": 7.428205347061157, "epoch": 0.6746884400515686, "grad_norm": 1.375, "learning_rate": 0.00039200000000000004, "loss": 7.0623, "mean_token_accuracy": 0.11534775421023369, "num_tokens": 1423791.0, "step": 785 }, { "entropy": 7.467832851409912, "epoch": 0.6789858186506231, "grad_norm": 1.234375, "learning_rate": 0.00039450000000000005, "loss": 7.1014, "mean_token_accuracy": 0.10728210881352425, "num_tokens": 1432955.0, "step": 790 }, { "entropy": 7.385548782348633, "epoch": 0.6832831972496777, "grad_norm": 0.99609375, "learning_rate": 0.00039700000000000005, "loss": 7.074, "mean_token_accuracy": 0.1087567687034607, "num_tokens": 1441907.0, "step": 795 }, { "entropy": 7.290066146850586, "epoch": 0.6875805758487322, "grad_norm": 1.203125, "learning_rate": 0.0003995, "loss": 6.935, "mean_token_accuracy": 0.11768098697066307, "num_tokens": 1451062.0, "step": 800 }, { "entropy": 7.399672508239746, "epoch": 0.6918779544477869, "grad_norm": 1.0234375, "learning_rate": 0.000402, "loss": 7.0218, "mean_token_accuracy": 0.10959179401397705, "num_tokens": 1460132.0, "step": 805 }, { "entropy": 7.272280263900757, "epoch": 0.6961753330468414, "grad_norm": 1.0625, "learning_rate": 0.0004045, "loss": 6.9141, "mean_token_accuracy": 0.11885375007987023, "num_tokens": 1469582.0, "step": 810 }, { "entropy": 7.255832242965698, "epoch": 0.700472711645896, "grad_norm": 1.3515625, "learning_rate": 0.00040699999999999997, "loss": 7.012, "mean_token_accuracy": 0.10950389429926873, "num_tokens": 1479053.0, "step": 815 }, { "entropy": 7.313858604431152, "epoch": 0.7047700902449506, "grad_norm": 1.21875, "learning_rate": 0.0004095, "loss": 7.0142, "mean_token_accuracy": 0.11343196108937263, "num_tokens": 1488189.0, "step": 820 }, { "entropy": 7.236453676223755, "epoch": 0.7090674688440052, "grad_norm": 1.046875, "learning_rate": 0.000412, "loss": 6.8662, "mean_token_accuracy": 0.12046442031860352, "num_tokens": 1497324.0, "step": 825 }, { "entropy": 7.310264635086059, "epoch": 0.7133648474430597, "grad_norm": 1.015625, "learning_rate": 0.0004145, "loss": 6.9814, "mean_token_accuracy": 0.11739002540707588, "num_tokens": 1506543.0, "step": 830 }, { "entropy": 7.289929437637329, "epoch": 0.7176622260421143, "grad_norm": 1.109375, "learning_rate": 0.000417, "loss": 6.9742, "mean_token_accuracy": 0.12236066460609436, "num_tokens": 1516737.0, "step": 835 }, { "entropy": 7.161224508285523, "epoch": 0.7219596046411689, "grad_norm": 1.046875, "learning_rate": 0.0004195, "loss": 6.8503, "mean_token_accuracy": 0.11500222384929656, "num_tokens": 1525561.0, "step": 840 }, { "entropy": 7.280500030517578, "epoch": 0.7262569832402235, "grad_norm": 1.1328125, "learning_rate": 0.000422, "loss": 6.8765, "mean_token_accuracy": 0.1242159940302372, "num_tokens": 1533323.0, "step": 845 }, { "entropy": 7.292038059234619, "epoch": 0.730554361839278, "grad_norm": 1.1875, "learning_rate": 0.0004245, "loss": 6.9379, "mean_token_accuracy": 0.12142991349101066, "num_tokens": 1542632.0, "step": 850 }, { "entropy": 7.305912923812866, "epoch": 0.7348517404383326, "grad_norm": 1.265625, "learning_rate": 0.000427, "loss": 6.8775, "mean_token_accuracy": 0.12107516825199127, "num_tokens": 1551236.0, "step": 855 }, { "entropy": 7.118098545074463, "epoch": 0.7391491190373872, "grad_norm": 1.15625, "learning_rate": 0.0004295, "loss": 6.878, "mean_token_accuracy": 0.12266490310430526, "num_tokens": 1559674.0, "step": 860 }, { "entropy": 7.268103885650635, "epoch": 0.7434464976364418, "grad_norm": 1.09375, "learning_rate": 0.000432, "loss": 6.9687, "mean_token_accuracy": 0.1217973381280899, "num_tokens": 1569481.0, "step": 865 }, { "entropy": 7.2675707817077635, "epoch": 0.7477438762354963, "grad_norm": 1.0859375, "learning_rate": 0.0004345, "loss": 6.9975, "mean_token_accuracy": 0.11359266638755798, "num_tokens": 1578488.0, "step": 870 }, { "entropy": 7.171451759338379, "epoch": 0.752041254834551, "grad_norm": 1.0625, "learning_rate": 0.000437, "loss": 6.8946, "mean_token_accuracy": 0.11810402423143387, "num_tokens": 1586675.0, "step": 875 }, { "entropy": 7.285072469711304, "epoch": 0.7563386334336055, "grad_norm": 1.0859375, "learning_rate": 0.0004395, "loss": 7.0021, "mean_token_accuracy": 0.10800698548555374, "num_tokens": 1595411.0, "step": 880 }, { "entropy": 7.312672233581543, "epoch": 0.7606360120326601, "grad_norm": 1.1953125, "learning_rate": 0.000442, "loss": 6.9755, "mean_token_accuracy": 0.11759781166911125, "num_tokens": 1604046.0, "step": 885 }, { "entropy": 7.245748281478882, "epoch": 0.7649333906317146, "grad_norm": 1.0859375, "learning_rate": 0.0004445, "loss": 6.9643, "mean_token_accuracy": 0.11201045587658882, "num_tokens": 1613759.0, "step": 890 }, { "entropy": 7.238279533386231, "epoch": 0.7692307692307693, "grad_norm": 1.015625, "learning_rate": 0.000447, "loss": 6.9209, "mean_token_accuracy": 0.11877147182822227, "num_tokens": 1623323.0, "step": 895 }, { "entropy": 7.230697107315064, "epoch": 0.7735281478298238, "grad_norm": 1.1328125, "learning_rate": 0.00044950000000000003, "loss": 6.9005, "mean_token_accuracy": 0.11391794160008431, "num_tokens": 1631727.0, "step": 900 }, { "entropy": 7.194222545623779, "epoch": 0.7778255264288784, "grad_norm": 1.1875, "learning_rate": 0.00045200000000000004, "loss": 6.8583, "mean_token_accuracy": 0.12049278989434242, "num_tokens": 1639544.0, "step": 905 }, { "entropy": 7.284112405776978, "epoch": 0.7821229050279329, "grad_norm": 1.125, "learning_rate": 0.00045450000000000004, "loss": 6.9773, "mean_token_accuracy": 0.11113567724823951, "num_tokens": 1648931.0, "step": 910 }, { "entropy": 7.1627342224121096, "epoch": 0.7864202836269876, "grad_norm": 1.15625, "learning_rate": 0.00045700000000000005, "loss": 6.8345, "mean_token_accuracy": 0.12127922549843788, "num_tokens": 1657688.0, "step": 915 }, { "entropy": 7.259271335601807, "epoch": 0.7907176622260421, "grad_norm": 1.0390625, "learning_rate": 0.00045950000000000006, "loss": 6.9244, "mean_token_accuracy": 0.11565326899290085, "num_tokens": 1666879.0, "step": 920 }, { "entropy": 7.1275458335876465, "epoch": 0.7950150408250967, "grad_norm": 1.109375, "learning_rate": 0.000462, "loss": 6.8982, "mean_token_accuracy": 0.118662890791893, "num_tokens": 1676773.0, "step": 925 }, { "entropy": 7.2360998630523685, "epoch": 0.7993124194241513, "grad_norm": 1.0859375, "learning_rate": 0.0004645, "loss": 7.0092, "mean_token_accuracy": 0.11184348464012146, "num_tokens": 1686144.0, "step": 930 }, { "entropy": 7.26247010231018, "epoch": 0.8036097980232059, "grad_norm": 1.078125, "learning_rate": 0.000467, "loss": 6.9646, "mean_token_accuracy": 0.10949353277683258, "num_tokens": 1695476.0, "step": 935 }, { "entropy": 7.174946022033692, "epoch": 0.8079071766222604, "grad_norm": 1.046875, "learning_rate": 0.0004695, "loss": 6.8498, "mean_token_accuracy": 0.12084392830729485, "num_tokens": 1704907.0, "step": 940 }, { "entropy": 7.166734504699707, "epoch": 0.812204555221315, "grad_norm": 0.9609375, "learning_rate": 0.000472, "loss": 6.8948, "mean_token_accuracy": 0.12091493904590607, "num_tokens": 1714564.0, "step": 945 }, { "entropy": 7.244975614547729, "epoch": 0.8165019338203696, "grad_norm": 1.1171875, "learning_rate": 0.0004745, "loss": 6.9209, "mean_token_accuracy": 0.1155279442667961, "num_tokens": 1725285.0, "step": 950 }, { "entropy": 7.1149109363555905, "epoch": 0.8207993124194242, "grad_norm": 1.03125, "learning_rate": 0.000477, "loss": 6.9153, "mean_token_accuracy": 0.11715079098939896, "num_tokens": 1734331.0, "step": 955 }, { "entropy": 7.227117824554443, "epoch": 0.8250966910184787, "grad_norm": 1.2578125, "learning_rate": 0.0004795, "loss": 6.852, "mean_token_accuracy": 0.11185217499732972, "num_tokens": 1742340.0, "step": 960 }, { "entropy": 7.160442066192627, "epoch": 0.8293940696175333, "grad_norm": 1.109375, "learning_rate": 0.000482, "loss": 6.8351, "mean_token_accuracy": 0.12198592498898506, "num_tokens": 1751725.0, "step": 965 }, { "entropy": 6.999344539642334, "epoch": 0.8336914482165879, "grad_norm": 1.1328125, "learning_rate": 0.0004845, "loss": 6.7683, "mean_token_accuracy": 0.12398558706045151, "num_tokens": 1760294.0, "step": 970 }, { "entropy": 7.112461137771606, "epoch": 0.8379888268156425, "grad_norm": 1.0546875, "learning_rate": 0.000487, "loss": 6.8275, "mean_token_accuracy": 0.11639805063605309, "num_tokens": 1768912.0, "step": 975 }, { "entropy": 7.257990169525146, "epoch": 0.842286205414697, "grad_norm": 1.0390625, "learning_rate": 0.0004895, "loss": 7.0148, "mean_token_accuracy": 0.12016609534621239, "num_tokens": 1778633.0, "step": 980 }, { "entropy": 7.1191816329956055, "epoch": 0.8465835840137517, "grad_norm": 1.1171875, "learning_rate": 0.000492, "loss": 6.8847, "mean_token_accuracy": 0.11811531409621238, "num_tokens": 1787275.0, "step": 985 }, { "entropy": 7.235857200622559, "epoch": 0.8508809626128062, "grad_norm": 1.2578125, "learning_rate": 0.0004945, "loss": 6.8878, "mean_token_accuracy": 0.11604067236185074, "num_tokens": 1795994.0, "step": 990 }, { "entropy": 7.036646842956543, "epoch": 0.8551783412118608, "grad_norm": 0.8359375, "learning_rate": 0.000497, "loss": 6.804, "mean_token_accuracy": 0.11985133662819862, "num_tokens": 1806379.0, "step": 995 }, { "entropy": 7.154667520523072, "epoch": 0.8594757198109153, "grad_norm": 1.0546875, "learning_rate": 0.0004995, "loss": 6.8296, "mean_token_accuracy": 0.1270947828888893, "num_tokens": 1816135.0, "step": 1000 }, { "epoch": 0.8594757198109153, "eval_entropy": 6.812919497489929, "eval_loss": 6.8574419021606445, "eval_mean_token_accuracy": 0.12292942362795542, "eval_num_tokens": 1816135.0, "eval_runtime": 2.0522, "eval_samples_per_second": 1729.37, "eval_steps_per_second": 216.354, "step": 1000 } ], "logging_steps": 5, "max_steps": 11630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 408225012940800.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }