{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.011172310545543924, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.7426682472229, "epoch": 0.00011172310545543924, "grad_norm": 6.25, "learning_rate": 2e-06, "loss": 10.5231, "mean_token_accuracy": 0.0, "num_tokens": 4250.0, "step": 5 }, { "entropy": 10.742681884765625, "epoch": 0.00022344621091087847, "grad_norm": 7.03125, "learning_rate": 4.5e-06, "loss": 10.4891, "mean_token_accuracy": 0.0, "num_tokens": 8228.0, "step": 10 }, { "entropy": 10.742702770233155, "epoch": 0.0003351693163663177, "grad_norm": 6.8125, "learning_rate": 7e-06, "loss": 10.4445, "mean_token_accuracy": 0.0003105590119957924, "num_tokens": 12209.0, "step": 15 }, { "entropy": 10.742706871032714, "epoch": 0.00044689242182175694, "grad_norm": 6.15625, "learning_rate": 9.5e-06, "loss": 10.3987, "mean_token_accuracy": 0.0004866180010139942, "num_tokens": 16225.0, "step": 20 }, { "entropy": 10.742706108093262, "epoch": 0.0005586155272771962, "grad_norm": 5.53125, "learning_rate": 1.2e-05, "loss": 10.2787, "mean_token_accuracy": 0.010022059944458307, "num_tokens": 20212.0, "step": 25 }, { "entropy": 10.742425346374512, "epoch": 0.0006703386327326354, "grad_norm": 5.0, "learning_rate": 1.4500000000000002e-05, "loss": 10.173, "mean_token_accuracy": 0.04117634426802397, "num_tokens": 24445.0, "step": 30 }, { "entropy": 10.741415786743165, "epoch": 0.0007820617381880747, "grad_norm": 4.3125, "learning_rate": 1.7000000000000003e-05, "loss": 9.9901, "mean_token_accuracy": 0.052767305821180346, "num_tokens": 28365.0, "step": 35 }, { "entropy": 10.739409732818604, "epoch": 0.0008937848436435139, "grad_norm": 3.578125, "learning_rate": 1.95e-05, "loss": 9.9621, "mean_token_accuracy": 0.052475782483816145, "num_tokens": 33055.0, "step": 40 }, { "entropy": 10.7363787651062, "epoch": 0.0010055079490989532, "grad_norm": 2.9375, "learning_rate": 2.2e-05, "loss": 9.811, "mean_token_accuracy": 0.062037082761526106, "num_tokens": 37599.0, "step": 45 }, { "entropy": 10.733420372009277, "epoch": 0.0011172310545543925, "grad_norm": 2.59375, "learning_rate": 2.4500000000000003e-05, "loss": 9.6744, "mean_token_accuracy": 0.06838746592402459, "num_tokens": 41934.0, "step": 50 }, { "entropy": 10.731625938415528, "epoch": 0.0012289541600098315, "grad_norm": 2.53125, "learning_rate": 2.7e-05, "loss": 9.6365, "mean_token_accuracy": 0.05915887728333473, "num_tokens": 46178.0, "step": 55 }, { "entropy": 10.729843425750733, "epoch": 0.0013406772654652708, "grad_norm": 2.5, "learning_rate": 2.95e-05, "loss": 9.5494, "mean_token_accuracy": 0.0692246112972498, "num_tokens": 50513.0, "step": 60 }, { "entropy": 10.727421474456786, "epoch": 0.0014524003709207101, "grad_norm": 2.46875, "learning_rate": 3.2e-05, "loss": 9.5028, "mean_token_accuracy": 0.0702465757727623, "num_tokens": 54924.0, "step": 65 }, { "entropy": 10.722854518890381, "epoch": 0.0015641234763761494, "grad_norm": 2.46875, "learning_rate": 3.4500000000000005e-05, "loss": 9.4107, "mean_token_accuracy": 0.06344567574560642, "num_tokens": 59083.0, "step": 70 }, { "entropy": 10.71631669998169, "epoch": 0.0016758465818315885, "grad_norm": 2.65625, "learning_rate": 3.7e-05, "loss": 9.3233, "mean_token_accuracy": 0.06774163469672204, "num_tokens": 63324.0, "step": 75 }, { "entropy": 10.705671787261963, "epoch": 0.0017875696872870278, "grad_norm": 2.34375, "learning_rate": 3.95e-05, "loss": 9.3567, "mean_token_accuracy": 0.06332114227116108, "num_tokens": 67738.0, "step": 80 }, { "entropy": 10.693737983703613, "epoch": 0.001899292792742467, "grad_norm": 2.234375, "learning_rate": 4.2000000000000004e-05, "loss": 9.2866, "mean_token_accuracy": 0.06349676214158535, "num_tokens": 72305.0, "step": 85 }, { "entropy": 10.675388622283936, "epoch": 0.0020110158981979064, "grad_norm": 2.53125, "learning_rate": 4.45e-05, "loss": 9.0821, "mean_token_accuracy": 0.06834135130047798, "num_tokens": 76579.0, "step": 90 }, { "entropy": 10.65205717086792, "epoch": 0.0021227390036533456, "grad_norm": 2.234375, "learning_rate": 4.7000000000000004e-05, "loss": 9.0421, "mean_token_accuracy": 0.06946654319763183, "num_tokens": 80812.0, "step": 95 }, { "entropy": 10.615971279144286, "epoch": 0.002234462109108785, "grad_norm": 2.109375, "learning_rate": 4.9500000000000004e-05, "loss": 8.9523, "mean_token_accuracy": 0.06732719540596008, "num_tokens": 85090.0, "step": 100 }, { "entropy": 10.59008207321167, "epoch": 0.0023461852145642242, "grad_norm": 2.234375, "learning_rate": 5.2e-05, "loss": 8.888, "mean_token_accuracy": 0.06908667460083961, "num_tokens": 89578.0, "step": 105 }, { "entropy": 10.536420917510986, "epoch": 0.002457908320019663, "grad_norm": 1.96875, "learning_rate": 5.45e-05, "loss": 8.789, "mean_token_accuracy": 0.0728236336261034, "num_tokens": 94117.0, "step": 110 }, { "entropy": 10.488511657714843, "epoch": 0.0025696314254751024, "grad_norm": 2.0625, "learning_rate": 5.7e-05, "loss": 8.6132, "mean_token_accuracy": 0.07127482630312443, "num_tokens": 98082.0, "step": 115 }, { "entropy": 10.439968013763428, "epoch": 0.0026813545309305417, "grad_norm": 1.9765625, "learning_rate": 5.9499999999999996e-05, "loss": 8.5714, "mean_token_accuracy": 0.07672090865671635, "num_tokens": 102327.0, "step": 120 }, { "entropy": 10.355792045593262, "epoch": 0.002793077636385981, "grad_norm": 1.8359375, "learning_rate": 6.2e-05, "loss": 8.4426, "mean_token_accuracy": 0.0740627009421587, "num_tokens": 106567.0, "step": 125 }, { "entropy": 10.286309623718262, "epoch": 0.0029048007418414202, "grad_norm": 1.7890625, "learning_rate": 6.450000000000001e-05, "loss": 8.3003, "mean_token_accuracy": 0.07362989187240601, "num_tokens": 110654.0, "step": 130 }, { "entropy": 10.204053020477295, "epoch": 0.0030165238472968595, "grad_norm": 1.875, "learning_rate": 6.7e-05, "loss": 8.2511, "mean_token_accuracy": 0.062000279501080516, "num_tokens": 114679.0, "step": 135 }, { "entropy": 10.102067852020264, "epoch": 0.003128246952752299, "grad_norm": 1.6171875, "learning_rate": 6.950000000000001e-05, "loss": 8.1849, "mean_token_accuracy": 0.06811538599431514, "num_tokens": 118817.0, "step": 140 }, { "entropy": 9.926943397521972, "epoch": 0.003239970058207738, "grad_norm": 1.484375, "learning_rate": 7.2e-05, "loss": 8.0767, "mean_token_accuracy": 0.06979594528675079, "num_tokens": 123188.0, "step": 145 }, { "entropy": 9.793034744262695, "epoch": 0.003351693163663177, "grad_norm": 1.5078125, "learning_rate": 7.45e-05, "loss": 7.981, "mean_token_accuracy": 0.06847230046987533, "num_tokens": 127767.0, "step": 150 }, { "entropy": 9.643688774108886, "epoch": 0.0034634162691186163, "grad_norm": 1.4921875, "learning_rate": 7.7e-05, "loss": 7.7945, "mean_token_accuracy": 0.06945906654000282, "num_tokens": 131837.0, "step": 155 }, { "entropy": 9.430543518066406, "epoch": 0.0035751393745740555, "grad_norm": 1.2890625, "learning_rate": 7.950000000000001e-05, "loss": 7.7734, "mean_token_accuracy": 0.07027286775410176, "num_tokens": 136247.0, "step": 160 }, { "entropy": 9.239261722564697, "epoch": 0.003686862480029495, "grad_norm": 1.4609375, "learning_rate": 8.2e-05, "loss": 7.5788, "mean_token_accuracy": 0.07950169630348683, "num_tokens": 140170.0, "step": 165 }, { "entropy": 8.976140880584717, "epoch": 0.003798585585484934, "grad_norm": 1.21875, "learning_rate": 8.450000000000001e-05, "loss": 7.6177, "mean_token_accuracy": 0.07785017378628253, "num_tokens": 144139.0, "step": 170 }, { "entropy": 8.843453693389893, "epoch": 0.003910308690940373, "grad_norm": 1.2578125, "learning_rate": 8.7e-05, "loss": 7.5659, "mean_token_accuracy": 0.07487303391098976, "num_tokens": 148792.0, "step": 175 }, { "entropy": 8.658325004577637, "epoch": 0.004022031796395813, "grad_norm": 1.3515625, "learning_rate": 8.95e-05, "loss": 7.4988, "mean_token_accuracy": 0.07942216768860817, "num_tokens": 152844.0, "step": 180 }, { "entropy": 8.59526195526123, "epoch": 0.0041337549018512516, "grad_norm": 1.046875, "learning_rate": 9.2e-05, "loss": 7.527, "mean_token_accuracy": 0.07417443916201591, "num_tokens": 157366.0, "step": 185 }, { "entropy": 8.467089462280274, "epoch": 0.004245478007306691, "grad_norm": 1.0390625, "learning_rate": 9.45e-05, "loss": 7.3623, "mean_token_accuracy": 0.07755868881940842, "num_tokens": 161348.0, "step": 190 }, { "entropy": 8.307873630523682, "epoch": 0.00435720111276213, "grad_norm": 1.140625, "learning_rate": 9.7e-05, "loss": 7.3815, "mean_token_accuracy": 0.08716461397707462, "num_tokens": 165647.0, "step": 195 }, { "entropy": 8.236515140533447, "epoch": 0.00446892421821757, "grad_norm": 1.4453125, "learning_rate": 9.95e-05, "loss": 7.2754, "mean_token_accuracy": 0.08076057620346547, "num_tokens": 169521.0, "step": 200 }, { "entropy": 8.256762790679932, "epoch": 0.004580647323673009, "grad_norm": 1.359375, "learning_rate": 0.000102, "loss": 7.3426, "mean_token_accuracy": 0.0812241055071354, "num_tokens": 173466.0, "step": 205 }, { "entropy": 8.131280899047852, "epoch": 0.0046923704291284484, "grad_norm": 1.1484375, "learning_rate": 0.00010449999999999999, "loss": 7.2826, "mean_token_accuracy": 0.07643571458756923, "num_tokens": 177663.0, "step": 210 }, { "entropy": 8.097990989685059, "epoch": 0.004804093534583887, "grad_norm": 1.21875, "learning_rate": 0.000107, "loss": 7.2745, "mean_token_accuracy": 0.08235705867409707, "num_tokens": 181778.0, "step": 215 }, { "entropy": 8.089111948013306, "epoch": 0.004915816640039326, "grad_norm": 1.6171875, "learning_rate": 0.0001095, "loss": 7.2736, "mean_token_accuracy": 0.08633389472961425, "num_tokens": 185525.0, "step": 220 }, { "entropy": 8.083420944213866, "epoch": 0.005027539745494766, "grad_norm": 1.3671875, "learning_rate": 0.000112, "loss": 7.153, "mean_token_accuracy": 0.08806331530213356, "num_tokens": 189418.0, "step": 225 }, { "entropy": 7.933328151702881, "epoch": 0.005139262850950205, "grad_norm": 1.359375, "learning_rate": 0.0001145, "loss": 7.2217, "mean_token_accuracy": 0.08842612579464912, "num_tokens": 193494.0, "step": 230 }, { "entropy": 8.018900680541993, "epoch": 0.0052509859564056445, "grad_norm": 1.1484375, "learning_rate": 0.00011700000000000001, "loss": 7.2661, "mean_token_accuracy": 0.08137304298579692, "num_tokens": 198018.0, "step": 235 }, { "entropy": 7.955441856384278, "epoch": 0.005362709061861083, "grad_norm": 1.2578125, "learning_rate": 0.00011949999999999999, "loss": 7.1847, "mean_token_accuracy": 0.08625513166189194, "num_tokens": 202296.0, "step": 240 }, { "entropy": 7.9594367980957035, "epoch": 0.005474432167316523, "grad_norm": 1.3203125, "learning_rate": 0.000122, "loss": 7.1706, "mean_token_accuracy": 0.08195730969309807, "num_tokens": 206694.0, "step": 245 }, { "entropy": 7.792031574249267, "epoch": 0.005586155272771962, "grad_norm": 1.390625, "learning_rate": 0.0001245, "loss": 7.2007, "mean_token_accuracy": 0.08904931843280792, "num_tokens": 210810.0, "step": 250 }, { "entropy": 7.920461797714234, "epoch": 0.005697878378227402, "grad_norm": 1.1796875, "learning_rate": 0.000127, "loss": 7.1818, "mean_token_accuracy": 0.0905133418738842, "num_tokens": 215044.0, "step": 255 }, { "entropy": 7.8493430614471436, "epoch": 0.0058096014836828405, "grad_norm": 1.3828125, "learning_rate": 0.0001295, "loss": 7.28, "mean_token_accuracy": 0.08591654896736145, "num_tokens": 219235.0, "step": 260 }, { "entropy": 7.84934287071228, "epoch": 0.005921324589138279, "grad_norm": 1.1328125, "learning_rate": 0.000132, "loss": 7.0922, "mean_token_accuracy": 0.0903876356780529, "num_tokens": 223639.0, "step": 265 }, { "entropy": 7.785561227798462, "epoch": 0.006033047694593719, "grad_norm": 1.25, "learning_rate": 0.00013450000000000002, "loss": 7.1258, "mean_token_accuracy": 0.09057728350162506, "num_tokens": 227873.0, "step": 270 }, { "entropy": 7.707937574386596, "epoch": 0.006144770800049158, "grad_norm": 1.3671875, "learning_rate": 0.00013700000000000002, "loss": 7.0661, "mean_token_accuracy": 0.09807337448000908, "num_tokens": 232147.0, "step": 275 }, { "entropy": 7.739069509506225, "epoch": 0.006256493905504598, "grad_norm": 1.6015625, "learning_rate": 0.0001395, "loss": 7.1358, "mean_token_accuracy": 0.09250000454485416, "num_tokens": 236456.0, "step": 280 }, { "entropy": 7.7190714359283445, "epoch": 0.0063682170109600365, "grad_norm": 1.1953125, "learning_rate": 0.00014199999999999998, "loss": 7.1583, "mean_token_accuracy": 0.09051149562001229, "num_tokens": 241039.0, "step": 285 }, { "entropy": 7.938947439193726, "epoch": 0.006479940116415476, "grad_norm": 1.6640625, "learning_rate": 0.0001445, "loss": 7.1915, "mean_token_accuracy": 0.08653632178902626, "num_tokens": 245132.0, "step": 290 }, { "entropy": 7.673107481002807, "epoch": 0.006591663221870915, "grad_norm": 1.28125, "learning_rate": 0.000147, "loss": 7.0872, "mean_token_accuracy": 0.09988043382763863, "num_tokens": 249152.0, "step": 295 }, { "entropy": 7.712965631484986, "epoch": 0.006703386327326354, "grad_norm": 1.21875, "learning_rate": 0.0001495, "loss": 7.0503, "mean_token_accuracy": 0.09596830010414123, "num_tokens": 253439.0, "step": 300 }, { "entropy": 7.6600532054901125, "epoch": 0.006815109432781794, "grad_norm": 1.546875, "learning_rate": 0.000152, "loss": 7.0731, "mean_token_accuracy": 0.09302671104669571, "num_tokens": 258066.0, "step": 305 }, { "entropy": 7.665358448028565, "epoch": 0.0069268325382372325, "grad_norm": 1.4375, "learning_rate": 0.00015450000000000001, "loss": 7.0332, "mean_token_accuracy": 0.0973147690296173, "num_tokens": 261954.0, "step": 310 }, { "entropy": 7.616210794448852, "epoch": 0.007038555643692672, "grad_norm": 1.4453125, "learning_rate": 0.000157, "loss": 7.0779, "mean_token_accuracy": 0.10462095588445663, "num_tokens": 266650.0, "step": 315 }, { "entropy": 7.689846324920654, "epoch": 0.007150278749148111, "grad_norm": 1.3203125, "learning_rate": 0.0001595, "loss": 7.1433, "mean_token_accuracy": 0.09891897812485695, "num_tokens": 271069.0, "step": 320 }, { "entropy": 7.705677938461304, "epoch": 0.007262001854603551, "grad_norm": 1.359375, "learning_rate": 0.000162, "loss": 7.0039, "mean_token_accuracy": 0.10242248028516769, "num_tokens": 275084.0, "step": 325 }, { "entropy": 7.603102445602417, "epoch": 0.00737372496005899, "grad_norm": 1.390625, "learning_rate": 0.00016450000000000001, "loss": 7.0745, "mean_token_accuracy": 0.1031483568251133, "num_tokens": 279721.0, "step": 330 }, { "entropy": 7.619607782363891, "epoch": 0.007485448065514429, "grad_norm": 1.296875, "learning_rate": 0.00016700000000000002, "loss": 7.0708, "mean_token_accuracy": 0.10527726709842682, "num_tokens": 284317.0, "step": 335 }, { "entropy": 7.600710487365722, "epoch": 0.007597171170969868, "grad_norm": 1.3984375, "learning_rate": 0.00016950000000000003, "loss": 7.0451, "mean_token_accuracy": 0.10766607597470283, "num_tokens": 288870.0, "step": 340 }, { "entropy": 7.61973729133606, "epoch": 0.007708894276425307, "grad_norm": 1.3046875, "learning_rate": 0.00017199999999999998, "loss": 6.9812, "mean_token_accuracy": 0.11351362988352776, "num_tokens": 292996.0, "step": 345 }, { "entropy": 7.5854551792144775, "epoch": 0.007820617381880746, "grad_norm": 1.4765625, "learning_rate": 0.00017449999999999999, "loss": 6.9806, "mean_token_accuracy": 0.10384939089417458, "num_tokens": 297238.0, "step": 350 }, { "entropy": 7.531381893157959, "epoch": 0.007932340487336187, "grad_norm": 1.484375, "learning_rate": 0.000177, "loss": 6.9793, "mean_token_accuracy": 0.1117280475795269, "num_tokens": 301453.0, "step": 355 }, { "entropy": 7.653309726715088, "epoch": 0.008044063592791625, "grad_norm": 1.7890625, "learning_rate": 0.0001795, "loss": 6.9327, "mean_token_accuracy": 0.10786554217338562, "num_tokens": 305949.0, "step": 360 }, { "entropy": 7.534815788269043, "epoch": 0.008155786698247064, "grad_norm": 1.5625, "learning_rate": 0.000182, "loss": 6.9583, "mean_token_accuracy": 0.11513907313346863, "num_tokens": 310097.0, "step": 365 }, { "entropy": 7.547474193572998, "epoch": 0.008267509803702503, "grad_norm": 1.390625, "learning_rate": 0.0001845, "loss": 6.9517, "mean_token_accuracy": 0.10539396926760673, "num_tokens": 314567.0, "step": 370 }, { "entropy": 7.457708692550659, "epoch": 0.008379232909157944, "grad_norm": 1.4375, "learning_rate": 0.000187, "loss": 7.0323, "mean_token_accuracy": 0.10927818715572357, "num_tokens": 319166.0, "step": 375 }, { "entropy": 7.515052604675293, "epoch": 0.008490956014613383, "grad_norm": 1.421875, "learning_rate": 0.0001895, "loss": 6.9204, "mean_token_accuracy": 0.11223233640193939, "num_tokens": 323682.0, "step": 380 }, { "entropy": 7.488761281967163, "epoch": 0.008602679120068821, "grad_norm": 1.421875, "learning_rate": 0.000192, "loss": 6.8299, "mean_token_accuracy": 0.12143486216664315, "num_tokens": 327994.0, "step": 385 }, { "entropy": 7.412152099609375, "epoch": 0.00871440222552426, "grad_norm": 1.3671875, "learning_rate": 0.0001945, "loss": 6.9058, "mean_token_accuracy": 0.11854805946350097, "num_tokens": 332026.0, "step": 390 }, { "entropy": 7.578387832641601, "epoch": 0.0088261253309797, "grad_norm": 1.796875, "learning_rate": 0.00019700000000000002, "loss": 6.9475, "mean_token_accuracy": 0.11811894848942757, "num_tokens": 336552.0, "step": 395 }, { "entropy": 7.504688882827759, "epoch": 0.00893784843643514, "grad_norm": 1.4921875, "learning_rate": 0.00019950000000000002, "loss": 6.8884, "mean_token_accuracy": 0.1116393692791462, "num_tokens": 340643.0, "step": 400 }, { "entropy": 7.4580738067626955, "epoch": 0.009049571541890579, "grad_norm": 1.375, "learning_rate": 0.000202, "loss": 6.8029, "mean_token_accuracy": 0.12075437754392623, "num_tokens": 344886.0, "step": 405 }, { "entropy": 7.3586314678192135, "epoch": 0.009161294647346017, "grad_norm": 1.328125, "learning_rate": 0.00020449999999999998, "loss": 6.8632, "mean_token_accuracy": 0.1191755935549736, "num_tokens": 349115.0, "step": 410 }, { "entropy": 7.527571535110473, "epoch": 0.009273017752801456, "grad_norm": 1.5625, "learning_rate": 0.000207, "loss": 6.8235, "mean_token_accuracy": 0.12523479163646697, "num_tokens": 353368.0, "step": 415 }, { "entropy": 7.509571599960327, "epoch": 0.009384740858256897, "grad_norm": 1.484375, "learning_rate": 0.0002095, "loss": 6.8657, "mean_token_accuracy": 0.10757644474506378, "num_tokens": 357382.0, "step": 420 }, { "entropy": 7.441834354400635, "epoch": 0.009496463963712336, "grad_norm": 1.7265625, "learning_rate": 0.000212, "loss": 6.7799, "mean_token_accuracy": 0.12460733354091644, "num_tokens": 361542.0, "step": 425 }, { "entropy": 7.419776153564453, "epoch": 0.009608187069167775, "grad_norm": 1.2578125, "learning_rate": 0.0002145, "loss": 6.84, "mean_token_accuracy": 0.11263928636908531, "num_tokens": 366006.0, "step": 430 }, { "entropy": 7.367758464813233, "epoch": 0.009719910174623213, "grad_norm": 1.578125, "learning_rate": 0.00021700000000000002, "loss": 6.7506, "mean_token_accuracy": 0.12978531718254088, "num_tokens": 370021.0, "step": 435 }, { "entropy": 7.408233594894409, "epoch": 0.009831633280078652, "grad_norm": 1.703125, "learning_rate": 0.0002195, "loss": 6.8773, "mean_token_accuracy": 0.12414649501442909, "num_tokens": 374434.0, "step": 440 }, { "entropy": 7.441655158996582, "epoch": 0.009943356385534093, "grad_norm": 1.3515625, "learning_rate": 0.000222, "loss": 6.9027, "mean_token_accuracy": 0.11324851140379906, "num_tokens": 378634.0, "step": 445 }, { "entropy": 7.315918588638306, "epoch": 0.010055079490989532, "grad_norm": 1.4453125, "learning_rate": 0.0002245, "loss": 6.7869, "mean_token_accuracy": 0.1252473659813404, "num_tokens": 382904.0, "step": 450 }, { "entropy": 7.470485210418701, "epoch": 0.01016680259644497, "grad_norm": 1.328125, "learning_rate": 0.00022700000000000002, "loss": 6.7635, "mean_token_accuracy": 0.12090180814266205, "num_tokens": 386970.0, "step": 455 }, { "entropy": 7.348088216781616, "epoch": 0.01027852570190041, "grad_norm": 1.25, "learning_rate": 0.00022950000000000002, "loss": 6.729, "mean_token_accuracy": 0.13390202075242996, "num_tokens": 391043.0, "step": 460 }, { "entropy": 7.392842721939087, "epoch": 0.01039024880735585, "grad_norm": 1.3125, "learning_rate": 0.00023200000000000003, "loss": 6.7204, "mean_token_accuracy": 0.13383440747857095, "num_tokens": 395413.0, "step": 465 }, { "entropy": 7.40152382850647, "epoch": 0.010501971912811289, "grad_norm": 1.6015625, "learning_rate": 0.00023449999999999998, "loss": 6.8385, "mean_token_accuracy": 0.12566340565681458, "num_tokens": 399821.0, "step": 470 }, { "entropy": 7.2655271053314205, "epoch": 0.010613695018266728, "grad_norm": 1.34375, "learning_rate": 0.000237, "loss": 6.6582, "mean_token_accuracy": 0.13715523406863211, "num_tokens": 404043.0, "step": 475 }, { "entropy": 7.422811889648438, "epoch": 0.010725418123722167, "grad_norm": 1.65625, "learning_rate": 0.0002395, "loss": 6.803, "mean_token_accuracy": 0.1260749615728855, "num_tokens": 408339.0, "step": 480 }, { "entropy": 7.263138484954834, "epoch": 0.010837141229177606, "grad_norm": 1.4453125, "learning_rate": 0.000242, "loss": 6.6856, "mean_token_accuracy": 0.13459724336862564, "num_tokens": 412384.0, "step": 485 }, { "entropy": 7.3362548828125, "epoch": 0.010948864334633046, "grad_norm": 1.4921875, "learning_rate": 0.0002445, "loss": 6.8108, "mean_token_accuracy": 0.12614913210272788, "num_tokens": 416891.0, "step": 490 }, { "entropy": 7.36023063659668, "epoch": 0.011060587440088485, "grad_norm": 1.4921875, "learning_rate": 0.000247, "loss": 6.6642, "mean_token_accuracy": 0.1329216368496418, "num_tokens": 420980.0, "step": 495 }, { "entropy": 7.2991362571716305, "epoch": 0.011172310545543924, "grad_norm": 1.6640625, "learning_rate": 0.0002495, "loss": 6.7212, "mean_token_accuracy": 0.13121648952364923, "num_tokens": 425454.0, "step": 500 } ], "logging_steps": 5, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 91570824806400.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }