{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5122453266120561, "eval_steps": 3000, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742606925964356, "epoch": 0.0004200798151648813, "grad_norm": 5.21875, "learning_rate": 2e-06, "loss": 10.7358, "mean_token_accuracy": 0.0, "num_tokens": 8348.0, "step": 5 }, { "entropy": 10.74260492324829, "epoch": 0.0008401596303297626, "grad_norm": 5.15625, "learning_rate": 4.5e-06, "loss": 10.7547, "mean_token_accuracy": 0.0, "num_tokens": 17465.0, "step": 10 }, { "entropy": 10.742631721496583, "epoch": 0.001260239445494644, "grad_norm": 5.25, "learning_rate": 7e-06, "loss": 10.7247, "mean_token_accuracy": 0.00010341261513531208, "num_tokens": 26627.0, "step": 15 }, { "entropy": 10.742714214324952, "epoch": 0.0016803192606595252, "grad_norm": 4.96875, "learning_rate": 9.5e-06, "loss": 10.6807, "mean_token_accuracy": 0.0, "num_tokens": 36069.0, "step": 20 }, { "entropy": 10.742774486541748, "epoch": 0.002100399075824407, "grad_norm": 4.96875, "learning_rate": 1.2e-05, "loss": 10.564, "mean_token_accuracy": 0.0009151221020147204, "num_tokens": 44967.0, "step": 25 }, { "entropy": 10.742547607421875, "epoch": 0.002520478890989288, "grad_norm": 3.8125, "learning_rate": 1.4500000000000002e-05, "loss": 10.4843, "mean_token_accuracy": 0.0172414593398571, "num_tokens": 55132.0, "step": 30 }, { "entropy": 10.741770172119141, "epoch": 0.0029405587061541692, "grad_norm": 3.1875, "learning_rate": 1.7000000000000003e-05, "loss": 10.3322, "mean_token_accuracy": 0.044619453698396684, "num_tokens": 65141.0, "step": 35 }, { "entropy": 10.739381885528564, "epoch": 0.0033606385213190504, "grad_norm": 2.484375, "learning_rate": 1.95e-05, "loss": 10.2048, "mean_token_accuracy": 0.04063304513692856, "num_tokens": 74007.0, "step": 40 }, { "entropy": 10.735391807556152, "epoch": 0.003780718336483932, "grad_norm": 2.203125, "learning_rate": 2.2e-05, "loss": 10.1027, "mean_token_accuracy": 0.04380051270127296, "num_tokens": 83736.0, "step": 45 }, { "entropy": 10.731560325622558, "epoch": 0.004200798151648814, "grad_norm": 2.03125, "learning_rate": 2.4500000000000003e-05, "loss": 10.0024, "mean_token_accuracy": 0.04462047629058361, "num_tokens": 92525.0, "step": 50 }, { "entropy": 10.729215049743653, "epoch": 0.004620877966813695, "grad_norm": 2.046875, "learning_rate": 2.7e-05, "loss": 9.9462, "mean_token_accuracy": 0.042681990377604964, "num_tokens": 102015.0, "step": 55 }, { "entropy": 10.728453350067138, "epoch": 0.005040957781978576, "grad_norm": 1.7890625, "learning_rate": 2.95e-05, "loss": 9.9154, "mean_token_accuracy": 0.03954915180802345, "num_tokens": 110887.0, "step": 60 }, { "entropy": 10.727616500854491, "epoch": 0.005461037597143457, "grad_norm": 1.8828125, "learning_rate": 3.2e-05, "loss": 9.8453, "mean_token_accuracy": 0.04232911877334118, "num_tokens": 120442.0, "step": 65 }, { "entropy": 10.726141738891602, "epoch": 0.0058811174123083385, "grad_norm": 1.9609375, "learning_rate": 3.4500000000000005e-05, "loss": 9.7509, "mean_token_accuracy": 0.041194649040699007, "num_tokens": 129297.0, "step": 70 }, { "entropy": 10.723711013793945, "epoch": 0.00630119722747322, "grad_norm": 1.8828125, "learning_rate": 3.7e-05, "loss": 9.7015, "mean_token_accuracy": 0.04228766188025475, "num_tokens": 138305.0, "step": 75 }, { "entropy": 10.719814491271972, "epoch": 0.006721277042638101, "grad_norm": 1.96875, "learning_rate": 3.95e-05, "loss": 9.6499, "mean_token_accuracy": 0.04200226049870252, "num_tokens": 147640.0, "step": 80 }, { "entropy": 10.714290428161622, "epoch": 0.007141356857802983, "grad_norm": 1.8515625, "learning_rate": 4.2000000000000004e-05, "loss": 9.576, "mean_token_accuracy": 0.04255363866686821, "num_tokens": 157633.0, "step": 85 }, { "entropy": 10.707357215881348, "epoch": 0.007561436672967864, "grad_norm": 1.671875, "learning_rate": 4.45e-05, "loss": 9.5382, "mean_token_accuracy": 0.03800953794270754, "num_tokens": 167984.0, "step": 90 }, { "entropy": 10.699947547912597, "epoch": 0.007981516488132745, "grad_norm": 1.7421875, "learning_rate": 4.7000000000000004e-05, "loss": 9.4351, "mean_token_accuracy": 0.04883353523910046, "num_tokens": 176984.0, "step": 95 }, { "entropy": 10.683709812164306, "epoch": 0.008401596303297627, "grad_norm": 1.890625, "learning_rate": 4.9500000000000004e-05, "loss": 9.3133, "mean_token_accuracy": 0.051684480533003806, "num_tokens": 185931.0, "step": 100 }, { "entropy": 10.665494632720947, "epoch": 0.008821676118462508, "grad_norm": 1.859375, "learning_rate": 5.2e-05, "loss": 9.2723, "mean_token_accuracy": 0.05058838985860348, "num_tokens": 195065.0, "step": 105 }, { "entropy": 10.650426483154297, "epoch": 0.00924175593362739, "grad_norm": 1.703125, "learning_rate": 5.45e-05, "loss": 9.1345, "mean_token_accuracy": 0.05380081832408905, "num_tokens": 203687.0, "step": 110 }, { "entropy": 10.613165855407715, "epoch": 0.00966183574879227, "grad_norm": 1.6484375, "learning_rate": 5.7e-05, "loss": 9.0467, "mean_token_accuracy": 0.057396522164344786, "num_tokens": 212847.0, "step": 115 }, { "entropy": 10.554168796539306, "epoch": 0.010081915563957152, "grad_norm": 1.6875, "learning_rate": 5.9499999999999996e-05, "loss": 8.93, "mean_token_accuracy": 0.05599412247538567, "num_tokens": 222593.0, "step": 120 }, { "entropy": 10.50309362411499, "epoch": 0.010501995379122032, "grad_norm": 1.6875, "learning_rate": 6.2e-05, "loss": 8.7842, "mean_token_accuracy": 0.054633737355470655, "num_tokens": 231174.0, "step": 125 }, { "entropy": 10.446444129943847, "epoch": 0.010922075194286915, "grad_norm": 1.5546875, "learning_rate": 6.450000000000001e-05, "loss": 8.6507, "mean_token_accuracy": 0.05882068388164043, "num_tokens": 239833.0, "step": 130 }, { "entropy": 10.371571159362793, "epoch": 0.011342155009451797, "grad_norm": 1.53125, "learning_rate": 6.7e-05, "loss": 8.62, "mean_token_accuracy": 0.05638743191957474, "num_tokens": 248794.0, "step": 135 }, { "entropy": 10.297250938415527, "epoch": 0.011762234824616677, "grad_norm": 1.4375, "learning_rate": 6.950000000000001e-05, "loss": 8.5299, "mean_token_accuracy": 0.056220804899930955, "num_tokens": 257123.0, "step": 140 }, { "entropy": 10.228730010986329, "epoch": 0.012182314639781559, "grad_norm": 1.453125, "learning_rate": 7.2e-05, "loss": 8.2842, "mean_token_accuracy": 0.05619280487298965, "num_tokens": 266088.0, "step": 145 }, { "entropy": 10.08653745651245, "epoch": 0.01260239445494644, "grad_norm": 1.21875, "learning_rate": 7.45e-05, "loss": 8.3619, "mean_token_accuracy": 0.0516346599906683, "num_tokens": 276074.0, "step": 150 }, { "entropy": 9.963776969909668, "epoch": 0.013022474270111321, "grad_norm": 1.171875, "learning_rate": 7.7e-05, "loss": 8.1944, "mean_token_accuracy": 0.054025283083319664, "num_tokens": 285280.0, "step": 155 }, { "entropy": 9.805997848510742, "epoch": 0.013442554085276202, "grad_norm": 1.171875, "learning_rate": 7.950000000000001e-05, "loss": 8.151, "mean_token_accuracy": 0.052671706303954124, "num_tokens": 296115.0, "step": 160 }, { "entropy": 9.606755542755128, "epoch": 0.013862633900441084, "grad_norm": 0.99609375, "learning_rate": 8.2e-05, "loss": 7.9584, "mean_token_accuracy": 0.05575060956180096, "num_tokens": 305483.0, "step": 165 }, { "entropy": 9.449717140197754, "epoch": 0.014282713715605966, "grad_norm": 0.93359375, "learning_rate": 8.450000000000001e-05, "loss": 7.9165, "mean_token_accuracy": 0.058218777552247046, "num_tokens": 314000.0, "step": 170 }, { "entropy": 9.167982482910157, "epoch": 0.014702793530770846, "grad_norm": 1.1953125, "learning_rate": 8.7e-05, "loss": 7.8517, "mean_token_accuracy": 0.062257979065179825, "num_tokens": 323667.0, "step": 175 }, { "entropy": 8.951386070251464, "epoch": 0.015122873345935728, "grad_norm": 0.9296875, "learning_rate": 8.95e-05, "loss": 7.8029, "mean_token_accuracy": 0.06150264739990234, "num_tokens": 332695.0, "step": 180 }, { "entropy": 8.776250171661378, "epoch": 0.015542953161100609, "grad_norm": 0.9609375, "learning_rate": 9.2e-05, "loss": 7.643, "mean_token_accuracy": 0.05887415409088135, "num_tokens": 342428.0, "step": 185 }, { "entropy": 8.602806949615479, "epoch": 0.01596303297626549, "grad_norm": 0.79296875, "learning_rate": 9.45e-05, "loss": 7.7106, "mean_token_accuracy": 0.06374814324080944, "num_tokens": 353587.0, "step": 190 }, { "entropy": 8.474033164978028, "epoch": 0.01638311279143037, "grad_norm": 0.93359375, "learning_rate": 9.7e-05, "loss": 7.6401, "mean_token_accuracy": 0.06406850814819336, "num_tokens": 362997.0, "step": 195 }, { "entropy": 8.364265060424804, "epoch": 0.016803192606595255, "grad_norm": 0.95703125, "learning_rate": 9.95e-05, "loss": 7.6617, "mean_token_accuracy": 0.06993534453213215, "num_tokens": 372346.0, "step": 200 }, { "entropy": 8.375140285491943, "epoch": 0.017223272421760135, "grad_norm": 1.0, "learning_rate": 0.000102, "loss": 7.5334, "mean_token_accuracy": 0.06646758764982223, "num_tokens": 381575.0, "step": 205 }, { "entropy": 8.26815767288208, "epoch": 0.017643352236925015, "grad_norm": 0.90625, "learning_rate": 0.00010449999999999999, "loss": 7.5902, "mean_token_accuracy": 0.07085754275321961, "num_tokens": 390706.0, "step": 210 }, { "entropy": 8.218460845947266, "epoch": 0.018063432052089896, "grad_norm": 0.828125, "learning_rate": 0.000107, "loss": 7.5876, "mean_token_accuracy": 0.07221915200352669, "num_tokens": 400000.0, "step": 215 }, { "entropy": 8.139337062835693, "epoch": 0.01848351186725478, "grad_norm": 0.85546875, "learning_rate": 0.0001095, "loss": 7.5295, "mean_token_accuracy": 0.07644539698958397, "num_tokens": 409447.0, "step": 220 }, { "entropy": 8.122040271759033, "epoch": 0.01890359168241966, "grad_norm": 1.1328125, "learning_rate": 0.000112, "loss": 7.5068, "mean_token_accuracy": 0.07519292533397674, "num_tokens": 418417.0, "step": 225 }, { "entropy": 8.067694330215454, "epoch": 0.01932367149758454, "grad_norm": 0.9609375, "learning_rate": 0.0001145, "loss": 7.4664, "mean_token_accuracy": 0.07503528967499733, "num_tokens": 427619.0, "step": 230 }, { "entropy": 8.071773529052734, "epoch": 0.019743751312749424, "grad_norm": 0.96484375, "learning_rate": 0.00011700000000000001, "loss": 7.5131, "mean_token_accuracy": 0.07185145244002342, "num_tokens": 437931.0, "step": 235 }, { "entropy": 8.109980726242066, "epoch": 0.020163831127914304, "grad_norm": 0.9609375, "learning_rate": 0.00011949999999999999, "loss": 7.552, "mean_token_accuracy": 0.07611973807215691, "num_tokens": 447595.0, "step": 240 }, { "entropy": 8.026875400543213, "epoch": 0.020583910943079185, "grad_norm": 0.94921875, "learning_rate": 0.000122, "loss": 7.4164, "mean_token_accuracy": 0.07035953775048256, "num_tokens": 457062.0, "step": 245 }, { "entropy": 8.063331604003906, "epoch": 0.021003990758244065, "grad_norm": 1.015625, "learning_rate": 0.0001245, "loss": 7.5166, "mean_token_accuracy": 0.07237975299358368, "num_tokens": 466191.0, "step": 250 }, { "entropy": 8.050399017333984, "epoch": 0.02142407057340895, "grad_norm": 1.2734375, "learning_rate": 0.000127, "loss": 7.4443, "mean_token_accuracy": 0.07492763809859752, "num_tokens": 475693.0, "step": 255 }, { "entropy": 8.024266242980957, "epoch": 0.02184415038857383, "grad_norm": 1.0234375, "learning_rate": 0.0001295, "loss": 7.4691, "mean_token_accuracy": 0.07379123903810977, "num_tokens": 485173.0, "step": 260 }, { "entropy": 7.993921422958374, "epoch": 0.02226423020373871, "grad_norm": 0.99609375, "learning_rate": 0.000132, "loss": 7.3863, "mean_token_accuracy": 0.08008474782109261, "num_tokens": 493985.0, "step": 265 }, { "entropy": 7.907951974868775, "epoch": 0.022684310018903593, "grad_norm": 1.125, "learning_rate": 0.00013450000000000002, "loss": 7.4036, "mean_token_accuracy": 0.07586845718324184, "num_tokens": 502837.0, "step": 270 }, { "entropy": 7.981403732299805, "epoch": 0.023104389834068473, "grad_norm": 0.91015625, "learning_rate": 0.00013700000000000002, "loss": 7.3605, "mean_token_accuracy": 0.07924394458532333, "num_tokens": 511503.0, "step": 275 }, { "entropy": 7.977783203125, "epoch": 0.023524469649233354, "grad_norm": 0.92578125, "learning_rate": 0.0001395, "loss": 7.5335, "mean_token_accuracy": 0.0751778606325388, "num_tokens": 521499.0, "step": 280 }, { "entropy": 7.871473217010498, "epoch": 0.023944549464398234, "grad_norm": 1.0703125, "learning_rate": 0.00014199999999999998, "loss": 7.2955, "mean_token_accuracy": 0.0799000546336174, "num_tokens": 530067.0, "step": 285 }, { "entropy": 7.885423564910889, "epoch": 0.024364629279563118, "grad_norm": 0.921875, "learning_rate": 0.0001445, "loss": 7.2851, "mean_token_accuracy": 0.08089336939156055, "num_tokens": 538559.0, "step": 290 }, { "entropy": 7.956486988067627, "epoch": 0.024784709094728, "grad_norm": 1.0078125, "learning_rate": 0.000147, "loss": 7.4858, "mean_token_accuracy": 0.07482350952923297, "num_tokens": 547288.0, "step": 295 }, { "entropy": 7.870783424377441, "epoch": 0.02520478890989288, "grad_norm": 0.8828125, "learning_rate": 0.0001495, "loss": 7.3589, "mean_token_accuracy": 0.07514288201928139, "num_tokens": 557269.0, "step": 300 }, { "entropy": 7.939627742767334, "epoch": 0.025624868725057762, "grad_norm": 0.96484375, "learning_rate": 0.000152, "loss": 7.3914, "mean_token_accuracy": 0.07472754344344139, "num_tokens": 567280.0, "step": 305 }, { "entropy": 7.828274822235107, "epoch": 0.026044948540222643, "grad_norm": 0.91796875, "learning_rate": 0.00015450000000000001, "loss": 7.2341, "mean_token_accuracy": 0.07823858335614205, "num_tokens": 576609.0, "step": 310 }, { "entropy": 7.761577320098877, "epoch": 0.026465028355387523, "grad_norm": 1.046875, "learning_rate": 0.000157, "loss": 7.1336, "mean_token_accuracy": 0.08791142702102661, "num_tokens": 586053.0, "step": 315 }, { "entropy": 7.695616436004639, "epoch": 0.026885108170552403, "grad_norm": 0.94921875, "learning_rate": 0.0001595, "loss": 7.3339, "mean_token_accuracy": 0.08298731297254562, "num_tokens": 594649.0, "step": 320 }, { "entropy": 7.869348049163818, "epoch": 0.027305187985717287, "grad_norm": 1.109375, "learning_rate": 0.000162, "loss": 7.2862, "mean_token_accuracy": 0.07372522614896297, "num_tokens": 603445.0, "step": 325 }, { "entropy": 7.86638765335083, "epoch": 0.027725267800882167, "grad_norm": 1.0625, "learning_rate": 0.00016450000000000001, "loss": 7.3613, "mean_token_accuracy": 0.07848134562373162, "num_tokens": 613611.0, "step": 330 }, { "entropy": 7.971248960494995, "epoch": 0.028145347616047048, "grad_norm": 1.0703125, "learning_rate": 0.00016700000000000002, "loss": 7.5217, "mean_token_accuracy": 0.07931054159998893, "num_tokens": 623024.0, "step": 335 }, { "entropy": 7.725814580917358, "epoch": 0.02856542743121193, "grad_norm": 1.2734375, "learning_rate": 0.00016950000000000003, "loss": 7.225, "mean_token_accuracy": 0.08345521688461303, "num_tokens": 631624.0, "step": 340 }, { "entropy": 7.762637519836426, "epoch": 0.028985507246376812, "grad_norm": 1.0078125, "learning_rate": 0.00017199999999999998, "loss": 7.1844, "mean_token_accuracy": 0.08410112038254738, "num_tokens": 640473.0, "step": 345 }, { "entropy": 7.841788578033447, "epoch": 0.029405587061541692, "grad_norm": 1.0625, "learning_rate": 0.00017449999999999999, "loss": 7.3409, "mean_token_accuracy": 0.08037517666816711, "num_tokens": 649692.0, "step": 350 }, { "entropy": 7.800195980072021, "epoch": 0.029825666876706573, "grad_norm": 1.0390625, "learning_rate": 0.000177, "loss": 7.2995, "mean_token_accuracy": 0.08097823038697242, "num_tokens": 658236.0, "step": 355 }, { "entropy": 7.668969297409058, "epoch": 0.030245746691871456, "grad_norm": 1.0859375, "learning_rate": 0.0001795, "loss": 7.0948, "mean_token_accuracy": 0.08619136661291123, "num_tokens": 667175.0, "step": 360 }, { "entropy": 7.798488330841065, "epoch": 0.030665826507036337, "grad_norm": 1.125, "learning_rate": 0.000182, "loss": 7.3842, "mean_token_accuracy": 0.07823293879628182, "num_tokens": 676456.0, "step": 365 }, { "entropy": 7.812319660186768, "epoch": 0.031085906322201217, "grad_norm": 0.9765625, "learning_rate": 0.0001845, "loss": 7.3503, "mean_token_accuracy": 0.07726633399724961, "num_tokens": 686881.0, "step": 370 }, { "entropy": 7.688674831390381, "epoch": 0.0315059861373661, "grad_norm": 1.0234375, "learning_rate": 0.000187, "loss": 7.1373, "mean_token_accuracy": 0.0819906547665596, "num_tokens": 696045.0, "step": 375 }, { "entropy": 7.655067443847656, "epoch": 0.03192606595253098, "grad_norm": 1.1484375, "learning_rate": 0.0001895, "loss": 7.1112, "mean_token_accuracy": 0.08879919424653053, "num_tokens": 704729.0, "step": 380 }, { "entropy": 7.4980494499206545, "epoch": 0.032346145767695865, "grad_norm": 0.953125, "learning_rate": 0.000192, "loss": 7.1679, "mean_token_accuracy": 0.07921729236841202, "num_tokens": 714331.0, "step": 385 }, { "entropy": 7.735121536254883, "epoch": 0.03276622558286074, "grad_norm": 1.0625, "learning_rate": 0.0001945, "loss": 7.1229, "mean_token_accuracy": 0.08520057946443557, "num_tokens": 722788.0, "step": 390 }, { "entropy": 7.683975791931152, "epoch": 0.033186305398025626, "grad_norm": 1.2421875, "learning_rate": 0.00019700000000000002, "loss": 7.1944, "mean_token_accuracy": 0.08690556064248085, "num_tokens": 731417.0, "step": 395 }, { "entropy": 7.576824569702149, "epoch": 0.03360638521319051, "grad_norm": 0.9140625, "learning_rate": 0.00019950000000000002, "loss": 7.1549, "mean_token_accuracy": 0.08151165619492531, "num_tokens": 741034.0, "step": 400 }, { "entropy": 7.698281908035279, "epoch": 0.034026465028355386, "grad_norm": 0.9453125, "learning_rate": 0.000202, "loss": 7.156, "mean_token_accuracy": 0.08484743162989616, "num_tokens": 749596.0, "step": 405 }, { "entropy": 7.556124067306518, "epoch": 0.03444654484352027, "grad_norm": 0.921875, "learning_rate": 0.00020449999999999998, "loss": 7.1145, "mean_token_accuracy": 0.08153974264860153, "num_tokens": 758931.0, "step": 410 }, { "entropy": 7.533982944488526, "epoch": 0.03486662465868515, "grad_norm": 1.0390625, "learning_rate": 0.000207, "loss": 7.0206, "mean_token_accuracy": 0.09019657000899314, "num_tokens": 767534.0, "step": 415 }, { "entropy": 7.6061821460723875, "epoch": 0.03528670447385003, "grad_norm": 1.078125, "learning_rate": 0.0002095, "loss": 7.0789, "mean_token_accuracy": 0.08290171101689339, "num_tokens": 776456.0, "step": 420 }, { "entropy": 7.5107566833496096, "epoch": 0.035706784289014915, "grad_norm": 1.0078125, "learning_rate": 0.000212, "loss": 7.1362, "mean_token_accuracy": 0.08152465149760246, "num_tokens": 786172.0, "step": 425 }, { "entropy": 7.553678846359253, "epoch": 0.03612686410417979, "grad_norm": 0.97265625, "learning_rate": 0.0002145, "loss": 7.0139, "mean_token_accuracy": 0.09106989204883575, "num_tokens": 795081.0, "step": 430 }, { "entropy": 7.604944372177124, "epoch": 0.036546943919344675, "grad_norm": 1.03125, "learning_rate": 0.00021700000000000002, "loss": 7.0628, "mean_token_accuracy": 0.08461785838007926, "num_tokens": 804259.0, "step": 435 }, { "entropy": 7.534902191162109, "epoch": 0.03696702373450956, "grad_norm": 1.109375, "learning_rate": 0.0002195, "loss": 7.0873, "mean_token_accuracy": 0.08283074498176575, "num_tokens": 813463.0, "step": 440 }, { "entropy": 7.502531671524048, "epoch": 0.037387103549674436, "grad_norm": 1.046875, "learning_rate": 0.000222, "loss": 7.0035, "mean_token_accuracy": 0.09452007561922074, "num_tokens": 823029.0, "step": 445 }, { "entropy": 7.486780834197998, "epoch": 0.03780718336483932, "grad_norm": 1.015625, "learning_rate": 0.0002245, "loss": 7.0727, "mean_token_accuracy": 0.08529324010014534, "num_tokens": 832902.0, "step": 450 }, { "entropy": 7.476432847976684, "epoch": 0.0382272631800042, "grad_norm": 1.0, "learning_rate": 0.00022700000000000002, "loss": 7.0158, "mean_token_accuracy": 0.08854726403951645, "num_tokens": 842162.0, "step": 455 }, { "entropy": 7.52789797782898, "epoch": 0.03864734299516908, "grad_norm": 1.0625, "learning_rate": 0.00022950000000000002, "loss": 7.0493, "mean_token_accuracy": 0.08622511699795724, "num_tokens": 852328.0, "step": 460 }, { "entropy": 7.449561357498169, "epoch": 0.039067422810333964, "grad_norm": 1.046875, "learning_rate": 0.00023200000000000003, "loss": 7.0104, "mean_token_accuracy": 0.09133929386734962, "num_tokens": 860929.0, "step": 465 }, { "entropy": 7.458409357070923, "epoch": 0.03948750262549885, "grad_norm": 1.1015625, "learning_rate": 0.00023449999999999998, "loss": 7.0901, "mean_token_accuracy": 0.08522843271493911, "num_tokens": 869144.0, "step": 470 }, { "entropy": 7.584603118896484, "epoch": 0.039907582440663725, "grad_norm": 1.1484375, "learning_rate": 0.000237, "loss": 7.03, "mean_token_accuracy": 0.09454337358474732, "num_tokens": 877447.0, "step": 475 }, { "entropy": 7.431310081481934, "epoch": 0.04032766225582861, "grad_norm": 0.99609375, "learning_rate": 0.0002395, "loss": 6.9871, "mean_token_accuracy": 0.08733554184436798, "num_tokens": 887020.0, "step": 480 }, { "entropy": 7.453667879104614, "epoch": 0.040747742070993485, "grad_norm": 1.171875, "learning_rate": 0.000242, "loss": 7.0323, "mean_token_accuracy": 0.08681000843644142, "num_tokens": 895937.0, "step": 485 }, { "entropy": 7.41835618019104, "epoch": 0.04116782188615837, "grad_norm": 1.0234375, "learning_rate": 0.0002445, "loss": 7.0366, "mean_token_accuracy": 0.08261745497584343, "num_tokens": 905446.0, "step": 490 }, { "entropy": 7.464281463623047, "epoch": 0.04158790170132325, "grad_norm": 1.078125, "learning_rate": 0.000247, "loss": 6.9289, "mean_token_accuracy": 0.09576694294810295, "num_tokens": 914547.0, "step": 495 }, { "entropy": 7.421106290817261, "epoch": 0.04200798151648813, "grad_norm": 1.0703125, "learning_rate": 0.0002495, "loss": 6.9377, "mean_token_accuracy": 0.0962467186152935, "num_tokens": 922900.0, "step": 500 }, { "entropy": 7.401471900939941, "epoch": 0.042428061331653014, "grad_norm": 1.1484375, "learning_rate": 0.000252, "loss": 6.9572, "mean_token_accuracy": 0.09509932994842529, "num_tokens": 930876.0, "step": 505 }, { "entropy": 7.342588901519775, "epoch": 0.0428481411468179, "grad_norm": 0.98828125, "learning_rate": 0.0002545, "loss": 7.0021, "mean_token_accuracy": 0.09231638312339782, "num_tokens": 939871.0, "step": 510 }, { "entropy": 7.44086856842041, "epoch": 0.043268220961982774, "grad_norm": 1.1875, "learning_rate": 0.000257, "loss": 6.988, "mean_token_accuracy": 0.09245615154504776, "num_tokens": 948673.0, "step": 515 }, { "entropy": 7.274595832824707, "epoch": 0.04368830077714766, "grad_norm": 1.015625, "learning_rate": 0.0002595, "loss": 6.9409, "mean_token_accuracy": 0.08984568417072296, "num_tokens": 957603.0, "step": 520 }, { "entropy": 7.436605787277221, "epoch": 0.04410838059231254, "grad_norm": 1.1015625, "learning_rate": 0.000262, "loss": 7.0062, "mean_token_accuracy": 0.08319340422749519, "num_tokens": 967731.0, "step": 525 }, { "entropy": 7.435907888412475, "epoch": 0.04452846040747742, "grad_norm": 1.140625, "learning_rate": 0.00026450000000000003, "loss": 7.0032, "mean_token_accuracy": 0.09049810692667962, "num_tokens": 977427.0, "step": 530 }, { "entropy": 7.3634380340576175, "epoch": 0.0449485402226423, "grad_norm": 1.125, "learning_rate": 0.00026700000000000004, "loss": 6.9827, "mean_token_accuracy": 0.0860845424234867, "num_tokens": 986758.0, "step": 535 }, { "entropy": 7.425018453598023, "epoch": 0.045368620037807186, "grad_norm": 1.2578125, "learning_rate": 0.00026950000000000005, "loss": 6.9738, "mean_token_accuracy": 0.09986243322491646, "num_tokens": 996377.0, "step": 540 }, { "entropy": 7.333861589431763, "epoch": 0.04578869985297206, "grad_norm": 1.0859375, "learning_rate": 0.00027200000000000005, "loss": 7.0222, "mean_token_accuracy": 0.08520096391439438, "num_tokens": 1006483.0, "step": 545 }, { "entropy": 7.269639205932617, "epoch": 0.04620877966813695, "grad_norm": 0.984375, "learning_rate": 0.0002745, "loss": 6.9248, "mean_token_accuracy": 0.091129120439291, "num_tokens": 1016132.0, "step": 550 }, { "entropy": 7.3355879306793215, "epoch": 0.04662885948330183, "grad_norm": 1.171875, "learning_rate": 0.000277, "loss": 6.8796, "mean_token_accuracy": 0.09489664137363434, "num_tokens": 1024970.0, "step": 555 }, { "entropy": 7.3572368144989015, "epoch": 0.04704893929846671, "grad_norm": 0.96484375, "learning_rate": 0.0002795, "loss": 6.9525, "mean_token_accuracy": 0.09272714778780937, "num_tokens": 1034335.0, "step": 560 }, { "entropy": 7.423572063446045, "epoch": 0.04746901911363159, "grad_norm": 1.015625, "learning_rate": 0.00028199999999999997, "loss": 7.0075, "mean_token_accuracy": 0.09945140630006791, "num_tokens": 1043954.0, "step": 565 }, { "entropy": 7.319319725036621, "epoch": 0.04788909892879647, "grad_norm": 1.0234375, "learning_rate": 0.0002845, "loss": 6.9431, "mean_token_accuracy": 0.09524357318878174, "num_tokens": 1053554.0, "step": 570 }, { "entropy": 7.376662826538086, "epoch": 0.04830917874396135, "grad_norm": 1.0078125, "learning_rate": 0.000287, "loss": 6.8893, "mean_token_accuracy": 0.0956316351890564, "num_tokens": 1062008.0, "step": 575 }, { "entropy": 7.246560859680176, "epoch": 0.048729258559126236, "grad_norm": 1.1171875, "learning_rate": 0.0002895, "loss": 6.9602, "mean_token_accuracy": 0.09502239599823951, "num_tokens": 1070740.0, "step": 580 }, { "entropy": 7.361734390258789, "epoch": 0.04914933837429111, "grad_norm": 1.203125, "learning_rate": 0.000292, "loss": 6.9451, "mean_token_accuracy": 0.09238593950867653, "num_tokens": 1079681.0, "step": 585 }, { "entropy": 7.294089078903198, "epoch": 0.049569418189456, "grad_norm": 1.015625, "learning_rate": 0.0002945, "loss": 6.8326, "mean_token_accuracy": 0.09609337821602822, "num_tokens": 1088979.0, "step": 590 }, { "entropy": 7.192009592056275, "epoch": 0.04998949800462088, "grad_norm": 1.1171875, "learning_rate": 0.000297, "loss": 6.8381, "mean_token_accuracy": 0.09695586860179901, "num_tokens": 1097870.0, "step": 595 }, { "entropy": 7.285109043121338, "epoch": 0.05040957781978576, "grad_norm": 1.109375, "learning_rate": 0.0002995, "loss": 6.9361, "mean_token_accuracy": 0.09410082027316094, "num_tokens": 1107948.0, "step": 600 }, { "entropy": 7.2816235542297365, "epoch": 0.05082965763495064, "grad_norm": 1.109375, "learning_rate": 0.000302, "loss": 6.856, "mean_token_accuracy": 0.09758619442582131, "num_tokens": 1117032.0, "step": 605 }, { "entropy": 7.1946680545806885, "epoch": 0.051249737450115525, "grad_norm": 1.0078125, "learning_rate": 0.0003045, "loss": 6.8323, "mean_token_accuracy": 0.09758584424853325, "num_tokens": 1127834.0, "step": 610 }, { "entropy": 7.325930643081665, "epoch": 0.0516698172652804, "grad_norm": 1.234375, "learning_rate": 0.000307, "loss": 6.9314, "mean_token_accuracy": 0.10701763778924941, "num_tokens": 1137382.0, "step": 615 }, { "entropy": 7.191529178619385, "epoch": 0.052089897080445285, "grad_norm": 1.0546875, "learning_rate": 0.0003095, "loss": 6.7726, "mean_token_accuracy": 0.1016211412847042, "num_tokens": 1146095.0, "step": 620 }, { "entropy": 7.197086191177368, "epoch": 0.05250997689561017, "grad_norm": 1.0234375, "learning_rate": 0.000312, "loss": 6.8164, "mean_token_accuracy": 0.09977484568953514, "num_tokens": 1154981.0, "step": 625 }, { "entropy": 7.111207914352417, "epoch": 0.052930056710775046, "grad_norm": 1.203125, "learning_rate": 0.0003145, "loss": 6.822, "mean_token_accuracy": 0.09889646545052529, "num_tokens": 1164939.0, "step": 630 }, { "entropy": 7.286598014831543, "epoch": 0.05335013652593993, "grad_norm": 1.046875, "learning_rate": 0.000317, "loss": 6.9423, "mean_token_accuracy": 0.0905054323375225, "num_tokens": 1174991.0, "step": 635 }, { "entropy": 7.268424129486084, "epoch": 0.05377021634110481, "grad_norm": 0.98046875, "learning_rate": 0.0003195, "loss": 6.9893, "mean_token_accuracy": 0.09030458927154542, "num_tokens": 1184885.0, "step": 640 }, { "entropy": 7.25072751045227, "epoch": 0.05419029615626969, "grad_norm": 1.1640625, "learning_rate": 0.000322, "loss": 6.8843, "mean_token_accuracy": 0.09418094158172607, "num_tokens": 1193637.0, "step": 645 }, { "entropy": 7.144441413879394, "epoch": 0.054610375971434574, "grad_norm": 1.1328125, "learning_rate": 0.00032450000000000003, "loss": 6.6712, "mean_token_accuracy": 0.10373484939336777, "num_tokens": 1202188.0, "step": 650 }, { "entropy": 7.2327552318573, "epoch": 0.05503045578659945, "grad_norm": 1.1484375, "learning_rate": 0.00032700000000000003, "loss": 6.8046, "mean_token_accuracy": 0.09572408124804496, "num_tokens": 1210768.0, "step": 655 }, { "entropy": 7.196833848953247, "epoch": 0.055450535601764335, "grad_norm": 1.1171875, "learning_rate": 0.00032950000000000004, "loss": 6.8024, "mean_token_accuracy": 0.09782998114824296, "num_tokens": 1219819.0, "step": 660 }, { "entropy": 7.211909484863281, "epoch": 0.05587061541692922, "grad_norm": 0.91796875, "learning_rate": 0.00033200000000000005, "loss": 6.8553, "mean_token_accuracy": 0.09061138033866882, "num_tokens": 1229703.0, "step": 665 }, { "entropy": 7.242569494247436, "epoch": 0.056290695232094096, "grad_norm": 1.1796875, "learning_rate": 0.00033450000000000005, "loss": 6.8929, "mean_token_accuracy": 0.09304608702659607, "num_tokens": 1238942.0, "step": 670 }, { "entropy": 7.276552438735962, "epoch": 0.05671077504725898, "grad_norm": 1.015625, "learning_rate": 0.000337, "loss": 6.9316, "mean_token_accuracy": 0.09855509251356125, "num_tokens": 1248943.0, "step": 675 }, { "entropy": 7.130473899841308, "epoch": 0.05713085486242386, "grad_norm": 1.015625, "learning_rate": 0.0003395, "loss": 6.8196, "mean_token_accuracy": 0.09641827270388603, "num_tokens": 1257761.0, "step": 680 }, { "entropy": 7.069635629653931, "epoch": 0.05755093467758874, "grad_norm": 1.1328125, "learning_rate": 0.000342, "loss": 6.7531, "mean_token_accuracy": 0.09635655134916306, "num_tokens": 1267216.0, "step": 685 }, { "entropy": 7.244167423248291, "epoch": 0.057971014492753624, "grad_norm": 1.0703125, "learning_rate": 0.00034449999999999997, "loss": 6.8517, "mean_token_accuracy": 0.09775793552398682, "num_tokens": 1277210.0, "step": 690 }, { "entropy": 7.151098155975342, "epoch": 0.05839109430791851, "grad_norm": 1.078125, "learning_rate": 0.000347, "loss": 6.7848, "mean_token_accuracy": 0.09209914952516556, "num_tokens": 1285310.0, "step": 695 }, { "entropy": 7.133235788345337, "epoch": 0.058811174123083385, "grad_norm": 1.1015625, "learning_rate": 0.0003495, "loss": 6.7884, "mean_token_accuracy": 0.0997276745736599, "num_tokens": 1294421.0, "step": 700 }, { "entropy": 7.089715480804443, "epoch": 0.05923125393824827, "grad_norm": 1.0078125, "learning_rate": 0.000352, "loss": 6.6149, "mean_token_accuracy": 0.10670206919312478, "num_tokens": 1303281.0, "step": 705 }, { "entropy": 7.096017217636108, "epoch": 0.059651333753413145, "grad_norm": 1.3046875, "learning_rate": 0.0003545, "loss": 6.7841, "mean_token_accuracy": 0.1047137551009655, "num_tokens": 1312280.0, "step": 710 }, { "entropy": 7.01336669921875, "epoch": 0.06007141356857803, "grad_norm": 1.0390625, "learning_rate": 0.000357, "loss": 6.7519, "mean_token_accuracy": 0.09830996096134186, "num_tokens": 1321243.0, "step": 715 }, { "entropy": 7.150788021087647, "epoch": 0.06049149338374291, "grad_norm": 1.0234375, "learning_rate": 0.0003595, "loss": 6.8411, "mean_token_accuracy": 0.0983475923538208, "num_tokens": 1330324.0, "step": 720 }, { "entropy": 7.074830770492554, "epoch": 0.06091157319890779, "grad_norm": 1.140625, "learning_rate": 0.000362, "loss": 6.6865, "mean_token_accuracy": 0.1045832097530365, "num_tokens": 1339485.0, "step": 725 }, { "entropy": 7.180077934265137, "epoch": 0.06133165301407267, "grad_norm": 1.2578125, "learning_rate": 0.0003645, "loss": 6.8327, "mean_token_accuracy": 0.09178336262702942, "num_tokens": 1348640.0, "step": 730 }, { "entropy": 7.070912313461304, "epoch": 0.06175173282923756, "grad_norm": 1.203125, "learning_rate": 0.000367, "loss": 6.7313, "mean_token_accuracy": 0.10252036228775978, "num_tokens": 1357581.0, "step": 735 }, { "entropy": 7.097622108459473, "epoch": 0.062171812644402434, "grad_norm": 1.171875, "learning_rate": 0.0003695, "loss": 6.7976, "mean_token_accuracy": 0.09888288527727127, "num_tokens": 1367883.0, "step": 740 }, { "entropy": 7.072182083129883, "epoch": 0.06259189245956731, "grad_norm": 1.078125, "learning_rate": 0.000372, "loss": 6.7536, "mean_token_accuracy": 0.09760352596640587, "num_tokens": 1376936.0, "step": 745 }, { "entropy": 6.975026559829712, "epoch": 0.0630119722747322, "grad_norm": 1.15625, "learning_rate": 0.0003745, "loss": 6.6653, "mean_token_accuracy": 0.10172178596258163, "num_tokens": 1386359.0, "step": 750 }, { "entropy": 7.0470263957977295, "epoch": 0.06343205208989708, "grad_norm": 1.0234375, "learning_rate": 0.000377, "loss": 6.7205, "mean_token_accuracy": 0.10334330797195435, "num_tokens": 1395223.0, "step": 755 }, { "entropy": 7.237481212615966, "epoch": 0.06385213190506196, "grad_norm": 0.9375, "learning_rate": 0.0003795, "loss": 6.8854, "mean_token_accuracy": 0.09526007026433944, "num_tokens": 1404917.0, "step": 760 }, { "entropy": 7.060393810272217, "epoch": 0.06427221172022685, "grad_norm": 1.109375, "learning_rate": 0.000382, "loss": 6.7712, "mean_token_accuracy": 0.10844952017068862, "num_tokens": 1413348.0, "step": 765 }, { "entropy": 7.010181617736817, "epoch": 0.06469229153539173, "grad_norm": 1.109375, "learning_rate": 0.0003845, "loss": 6.751, "mean_token_accuracy": 0.0988110676407814, "num_tokens": 1421726.0, "step": 770 }, { "entropy": 7.068030214309692, "epoch": 0.0651123713505566, "grad_norm": 1.015625, "learning_rate": 0.00038700000000000003, "loss": 6.7626, "mean_token_accuracy": 0.10152493417263031, "num_tokens": 1430686.0, "step": 775 }, { "entropy": 7.124918842315674, "epoch": 0.06553245116572148, "grad_norm": 1.1015625, "learning_rate": 0.00038950000000000003, "loss": 6.7567, "mean_token_accuracy": 0.10261558443307876, "num_tokens": 1439499.0, "step": 780 }, { "entropy": 7.08576397895813, "epoch": 0.06595253098088637, "grad_norm": 1.1953125, "learning_rate": 0.00039200000000000004, "loss": 6.7308, "mean_token_accuracy": 0.10436978489160538, "num_tokens": 1448220.0, "step": 785 }, { "entropy": 6.918930721282959, "epoch": 0.06637261079605125, "grad_norm": 1.0, "learning_rate": 0.00039450000000000005, "loss": 6.7623, "mean_token_accuracy": 0.09306630715727807, "num_tokens": 1458217.0, "step": 790 }, { "entropy": 7.050667333602905, "epoch": 0.06679269061121614, "grad_norm": 1.0703125, "learning_rate": 0.00039700000000000005, "loss": 6.6615, "mean_token_accuracy": 0.10148273557424545, "num_tokens": 1467422.0, "step": 795 }, { "entropy": 7.04574761390686, "epoch": 0.06721277042638102, "grad_norm": 1.03125, "learning_rate": 0.0003995, "loss": 6.6428, "mean_token_accuracy": 0.10174536257982254, "num_tokens": 1476152.0, "step": 800 }, { "entropy": 6.920849370956421, "epoch": 0.06763285024154589, "grad_norm": 1.140625, "learning_rate": 0.000402, "loss": 6.7303, "mean_token_accuracy": 0.09813930094242096, "num_tokens": 1485248.0, "step": 805 }, { "entropy": 7.021937704086303, "epoch": 0.06805293005671077, "grad_norm": 1.09375, "learning_rate": 0.0004045, "loss": 6.6965, "mean_token_accuracy": 0.10005066767334939, "num_tokens": 1494248.0, "step": 810 }, { "entropy": 7.009239387512207, "epoch": 0.06847300987187566, "grad_norm": 1.078125, "learning_rate": 0.00040699999999999997, "loss": 6.7988, "mean_token_accuracy": 0.10206111744046212, "num_tokens": 1503565.0, "step": 815 }, { "entropy": 7.153907108306885, "epoch": 0.06889308968704054, "grad_norm": 1.046875, "learning_rate": 0.0004095, "loss": 6.8967, "mean_token_accuracy": 0.09253153279423713, "num_tokens": 1513227.0, "step": 820 }, { "entropy": 7.081949377059937, "epoch": 0.06931316950220542, "grad_norm": 1.0625, "learning_rate": 0.000412, "loss": 6.6785, "mean_token_accuracy": 0.10418465957045556, "num_tokens": 1522312.0, "step": 825 }, { "entropy": 6.934855031967163, "epoch": 0.0697332493173703, "grad_norm": 1.09375, "learning_rate": 0.0004145, "loss": 6.6359, "mean_token_accuracy": 0.1031254269182682, "num_tokens": 1531720.0, "step": 830 }, { "entropy": 6.970464134216309, "epoch": 0.07015332913253518, "grad_norm": 1.09375, "learning_rate": 0.000417, "loss": 6.7192, "mean_token_accuracy": 0.09493932947516441, "num_tokens": 1541238.0, "step": 835 }, { "entropy": 7.103578281402588, "epoch": 0.07057340894770006, "grad_norm": 1.1015625, "learning_rate": 0.0004195, "loss": 6.8114, "mean_token_accuracy": 0.0987453043460846, "num_tokens": 1550875.0, "step": 840 }, { "entropy": 6.948361873626709, "epoch": 0.07099348876286495, "grad_norm": 1.0234375, "learning_rate": 0.000422, "loss": 6.7522, "mean_token_accuracy": 0.10080962181091309, "num_tokens": 1560287.0, "step": 845 }, { "entropy": 6.981166744232178, "epoch": 0.07141356857802983, "grad_norm": 1.0546875, "learning_rate": 0.0004245, "loss": 6.6378, "mean_token_accuracy": 0.10372715294361115, "num_tokens": 1569043.0, "step": 850 }, { "entropy": 6.902826881408691, "epoch": 0.07183364839319471, "grad_norm": 1.0546875, "learning_rate": 0.000427, "loss": 6.6697, "mean_token_accuracy": 0.10197147876024246, "num_tokens": 1578112.0, "step": 855 }, { "entropy": 6.874331331253051, "epoch": 0.07225372820835958, "grad_norm": 1.1015625, "learning_rate": 0.0004295, "loss": 6.5725, "mean_token_accuracy": 0.1078405149281025, "num_tokens": 1586587.0, "step": 860 }, { "entropy": 7.059461355209351, "epoch": 0.07267380802352447, "grad_norm": 1.078125, "learning_rate": 0.000432, "loss": 6.7397, "mean_token_accuracy": 0.09989926218986511, "num_tokens": 1595585.0, "step": 865 }, { "entropy": 6.951946210861206, "epoch": 0.07309388783868935, "grad_norm": 1.09375, "learning_rate": 0.0004345, "loss": 6.6946, "mean_token_accuracy": 0.10353797450661659, "num_tokens": 1605355.0, "step": 870 }, { "entropy": 6.944614362716675, "epoch": 0.07351396765385423, "grad_norm": 1.1328125, "learning_rate": 0.000437, "loss": 6.7108, "mean_token_accuracy": 0.09883329644799232, "num_tokens": 1613637.0, "step": 875 }, { "entropy": 6.975859832763672, "epoch": 0.07393404746901912, "grad_norm": 1.109375, "learning_rate": 0.0004395, "loss": 6.6703, "mean_token_accuracy": 0.10343916267156601, "num_tokens": 1622731.0, "step": 880 }, { "entropy": 7.003747940063477, "epoch": 0.074354127284184, "grad_norm": 1.0390625, "learning_rate": 0.000442, "loss": 6.6373, "mean_token_accuracy": 0.10040950924158096, "num_tokens": 1632098.0, "step": 885 }, { "entropy": 6.826285457611084, "epoch": 0.07477420709934887, "grad_norm": 0.96484375, "learning_rate": 0.0004445, "loss": 6.6454, "mean_token_accuracy": 0.09755287617444992, "num_tokens": 1641259.0, "step": 890 }, { "entropy": 7.0150947093963625, "epoch": 0.07519428691451376, "grad_norm": 1.1875, "learning_rate": 0.000447, "loss": 6.7262, "mean_token_accuracy": 0.09560549557209015, "num_tokens": 1651362.0, "step": 895 }, { "entropy": 6.897852563858033, "epoch": 0.07561436672967864, "grad_norm": 1.171875, "learning_rate": 0.00044950000000000003, "loss": 6.6487, "mean_token_accuracy": 0.10112505033612251, "num_tokens": 1660190.0, "step": 900 }, { "entropy": 6.90705189704895, "epoch": 0.07603444654484352, "grad_norm": 1.1953125, "learning_rate": 0.00045200000000000004, "loss": 6.663, "mean_token_accuracy": 0.10142350941896439, "num_tokens": 1669020.0, "step": 905 }, { "entropy": 6.973592853546142, "epoch": 0.0764545263600084, "grad_norm": 1.140625, "learning_rate": 0.00045450000000000004, "loss": 6.6861, "mean_token_accuracy": 0.1048488400876522, "num_tokens": 1678158.0, "step": 910 }, { "entropy": 6.985338020324707, "epoch": 0.07687460617517328, "grad_norm": 1.1328125, "learning_rate": 0.00045700000000000005, "loss": 6.7084, "mean_token_accuracy": 0.10136276260018348, "num_tokens": 1687481.0, "step": 915 }, { "entropy": 6.876794004440308, "epoch": 0.07729468599033816, "grad_norm": 1.140625, "learning_rate": 0.00045950000000000006, "loss": 6.6666, "mean_token_accuracy": 0.10845559537410736, "num_tokens": 1696782.0, "step": 920 }, { "entropy": 6.932897567749023, "epoch": 0.07771476580550304, "grad_norm": 1.0390625, "learning_rate": 0.000462, "loss": 6.6725, "mean_token_accuracy": 0.10497085899114608, "num_tokens": 1706153.0, "step": 925 }, { "entropy": 6.9077776908874515, "epoch": 0.07813484562066793, "grad_norm": 1.0078125, "learning_rate": 0.0004645, "loss": 6.6889, "mean_token_accuracy": 0.10281107649207115, "num_tokens": 1715585.0, "step": 930 }, { "entropy": 7.106683778762817, "epoch": 0.07855492543583281, "grad_norm": 1.3359375, "learning_rate": 0.000467, "loss": 6.8042, "mean_token_accuracy": 0.10099845305085182, "num_tokens": 1724857.0, "step": 935 }, { "entropy": 6.858903789520264, "epoch": 0.0789750052509977, "grad_norm": 1.15625, "learning_rate": 0.0004695, "loss": 6.6175, "mean_token_accuracy": 0.10900806412100791, "num_tokens": 1733528.0, "step": 940 }, { "entropy": 7.006282758712769, "epoch": 0.07939508506616257, "grad_norm": 0.9140625, "learning_rate": 0.000472, "loss": 6.7383, "mean_token_accuracy": 0.10379872918128967, "num_tokens": 1742953.0, "step": 945 }, { "entropy": 6.92790584564209, "epoch": 0.07981516488132745, "grad_norm": 1.1015625, "learning_rate": 0.0004745, "loss": 6.6988, "mean_token_accuracy": 0.10636084228754043, "num_tokens": 1752155.0, "step": 950 }, { "entropy": 6.911950254440308, "epoch": 0.08023524469649233, "grad_norm": 1.171875, "learning_rate": 0.000477, "loss": 6.5687, "mean_token_accuracy": 0.10838210806250573, "num_tokens": 1760562.0, "step": 955 }, { "entropy": 6.83457088470459, "epoch": 0.08065532451165722, "grad_norm": 1.1796875, "learning_rate": 0.0004795, "loss": 6.5891, "mean_token_accuracy": 0.10088410004973411, "num_tokens": 1769631.0, "step": 960 }, { "entropy": 6.914610385894775, "epoch": 0.0810754043268221, "grad_norm": 1.21875, "learning_rate": 0.000482, "loss": 6.6346, "mean_token_accuracy": 0.10217849463224411, "num_tokens": 1779080.0, "step": 965 }, { "entropy": 6.8898755550384525, "epoch": 0.08149548414198697, "grad_norm": 1.296875, "learning_rate": 0.0004845, "loss": 6.6271, "mean_token_accuracy": 0.10570115596055984, "num_tokens": 1787830.0, "step": 970 }, { "entropy": 6.751455068588257, "epoch": 0.08191556395715185, "grad_norm": 1.125, "learning_rate": 0.000487, "loss": 6.5346, "mean_token_accuracy": 0.10223312452435493, "num_tokens": 1796998.0, "step": 975 }, { "entropy": 6.8943780899047855, "epoch": 0.08233564377231674, "grad_norm": 1.0625, "learning_rate": 0.0004895, "loss": 6.6202, "mean_token_accuracy": 0.10597362667322159, "num_tokens": 1806194.0, "step": 980 }, { "entropy": 6.700069093704224, "epoch": 0.08275572358748162, "grad_norm": 0.9609375, "learning_rate": 0.000492, "loss": 6.5072, "mean_token_accuracy": 0.10932167768478393, "num_tokens": 1815751.0, "step": 985 }, { "entropy": 6.749313592910767, "epoch": 0.0831758034026465, "grad_norm": 0.953125, "learning_rate": 0.0004945, "loss": 6.5857, "mean_token_accuracy": 0.10682184919714928, "num_tokens": 1825379.0, "step": 990 }, { "entropy": 6.845586490631104, "epoch": 0.08359588321781139, "grad_norm": 1.1328125, "learning_rate": 0.000497, "loss": 6.5541, "mean_token_accuracy": 0.10507402196526527, "num_tokens": 1834158.0, "step": 995 }, { "entropy": 6.844553852081299, "epoch": 0.08401596303297626, "grad_norm": 1.0625, "learning_rate": 0.0004995, "loss": 6.5161, "mean_token_accuracy": 0.10857650190591812, "num_tokens": 1842724.0, "step": 1000 }, { "entropy": 6.795124101638794, "epoch": 0.08443604284814114, "grad_norm": 1.046875, "learning_rate": 0.000499999998724557, "loss": 6.5362, "mean_token_accuracy": 0.10392995700240135, "num_tokens": 1852485.0, "step": 1005 }, { "entropy": 6.765092468261718, "epoch": 0.08485612266330603, "grad_norm": 1.109375, "learning_rate": 0.0004999999935430703, "loss": 6.575, "mean_token_accuracy": 0.10723726153373718, "num_tokens": 1861303.0, "step": 1010 }, { "entropy": 6.745694637298584, "epoch": 0.08527620247847091, "grad_norm": 1.125, "learning_rate": 0.0004999999843758243, "loss": 6.5409, "mean_token_accuracy": 0.1151320680975914, "num_tokens": 1870859.0, "step": 1015 }, { "entropy": 6.8996889114379885, "epoch": 0.0856962822936358, "grad_norm": 1.0078125, "learning_rate": 0.0004999999712228196, "loss": 6.7032, "mean_token_accuracy": 0.10041022300720215, "num_tokens": 1880295.0, "step": 1020 }, { "entropy": 6.899116802215576, "epoch": 0.08611636210880068, "grad_norm": 1.09375, "learning_rate": 0.0004999999540840562, "loss": 6.6176, "mean_token_accuracy": 0.10147540494799615, "num_tokens": 1889193.0, "step": 1025 }, { "entropy": 6.797919845581054, "epoch": 0.08653644192396555, "grad_norm": 1.0625, "learning_rate": 0.0004999999329595345, "loss": 6.709, "mean_token_accuracy": 0.09875654354691506, "num_tokens": 1899437.0, "step": 1030 }, { "entropy": 6.910034608840943, "epoch": 0.08695652173913043, "grad_norm": 1.03125, "learning_rate": 0.0004999999078492548, "loss": 6.6032, "mean_token_accuracy": 0.10777303576469421, "num_tokens": 1907882.0, "step": 1035 }, { "entropy": 6.728742361068726, "epoch": 0.08737660155429532, "grad_norm": 0.9375, "learning_rate": 0.0004999998787532176, "loss": 6.5131, "mean_token_accuracy": 0.1080910786986351, "num_tokens": 1916872.0, "step": 1040 }, { "entropy": 6.86653618812561, "epoch": 0.0877966813694602, "grad_norm": 1.0625, "learning_rate": 0.0004999998456714234, "loss": 6.6681, "mean_token_accuracy": 0.1074354499578476, "num_tokens": 1926636.0, "step": 1045 }, { "entropy": 6.773524904251099, "epoch": 0.08821676118462508, "grad_norm": 1.1640625, "learning_rate": 0.0004999998086038729, "loss": 6.5697, "mean_token_accuracy": 0.108617003262043, "num_tokens": 1935962.0, "step": 1050 }, { "entropy": 6.809631824493408, "epoch": 0.08863684099978995, "grad_norm": 1.078125, "learning_rate": 0.0004999997675505665, "loss": 6.5493, "mean_token_accuracy": 0.10353536382317544, "num_tokens": 1944600.0, "step": 1055 }, { "entropy": 6.8208941459655765, "epoch": 0.08905692081495484, "grad_norm": 1.015625, "learning_rate": 0.0004999997225115052, "loss": 6.7156, "mean_token_accuracy": 0.10389059409499168, "num_tokens": 1954234.0, "step": 1060 }, { "entropy": 6.95792784690857, "epoch": 0.08947700063011972, "grad_norm": 1.0625, "learning_rate": 0.0004999996734866896, "loss": 6.677, "mean_token_accuracy": 0.10057736709713935, "num_tokens": 1964499.0, "step": 1065 }, { "entropy": 6.662513589859008, "epoch": 0.0898970804452846, "grad_norm": 1.1640625, "learning_rate": 0.0004999996204761206, "loss": 6.3883, "mean_token_accuracy": 0.11360553354024887, "num_tokens": 1973635.0, "step": 1070 }, { "entropy": 6.745052719116211, "epoch": 0.09031716026044949, "grad_norm": 0.95703125, "learning_rate": 0.0004999995634797993, "loss": 6.5278, "mean_token_accuracy": 0.1087425634264946, "num_tokens": 1983509.0, "step": 1075 }, { "entropy": 6.769761800765991, "epoch": 0.09073724007561437, "grad_norm": 1.1484375, "learning_rate": 0.0004999995024977265, "loss": 6.5385, "mean_token_accuracy": 0.11216638460755349, "num_tokens": 1992336.0, "step": 1080 }, { "entropy": 6.855973386764527, "epoch": 0.09115731989077924, "grad_norm": 0.99609375, "learning_rate": 0.0004999994375299034, "loss": 6.5509, "mean_token_accuracy": 0.1137130968272686, "num_tokens": 2001931.0, "step": 1085 }, { "entropy": 6.615939617156982, "epoch": 0.09157739970594413, "grad_norm": 0.98828125, "learning_rate": 0.000499999368576331, "loss": 6.4174, "mean_token_accuracy": 0.11283476129174233, "num_tokens": 2010935.0, "step": 1090 }, { "entropy": 6.7152961730957035, "epoch": 0.09199747952110901, "grad_norm": 1.109375, "learning_rate": 0.0004999992956370109, "loss": 6.4684, "mean_token_accuracy": 0.11342488676309585, "num_tokens": 2020587.0, "step": 1095 }, { "entropy": 6.688837385177612, "epoch": 0.0924175593362739, "grad_norm": 1.046875, "learning_rate": 0.000499999218711944, "loss": 6.5046, "mean_token_accuracy": 0.10743609666824341, "num_tokens": 2029743.0, "step": 1100 }, { "entropy": 6.771305274963379, "epoch": 0.09283763915143878, "grad_norm": 1.1484375, "learning_rate": 0.0004999991378011317, "loss": 6.5286, "mean_token_accuracy": 0.11453117504715919, "num_tokens": 2038468.0, "step": 1105 }, { "entropy": 6.67022180557251, "epoch": 0.09325771896660366, "grad_norm": 1.046875, "learning_rate": 0.0004999990529045757, "loss": 6.4451, "mean_token_accuracy": 0.11554965823888778, "num_tokens": 2047456.0, "step": 1110 }, { "entropy": 6.870058679580689, "epoch": 0.09367779878176853, "grad_norm": 0.9765625, "learning_rate": 0.0004999989640222771, "loss": 6.7458, "mean_token_accuracy": 0.09942527562379837, "num_tokens": 2056691.0, "step": 1115 }, { "entropy": 6.829685544967651, "epoch": 0.09409787859693342, "grad_norm": 1.03125, "learning_rate": 0.000499998871154238, "loss": 6.5487, "mean_token_accuracy": 0.10888865366578102, "num_tokens": 2066068.0, "step": 1120 }, { "entropy": 6.725253868103027, "epoch": 0.0945179584120983, "grad_norm": 1.015625, "learning_rate": 0.0004999987743004597, "loss": 6.4837, "mean_token_accuracy": 0.11379996240139008, "num_tokens": 2075113.0, "step": 1125 }, { "entropy": 6.7777934074401855, "epoch": 0.09493803822726318, "grad_norm": 0.9609375, "learning_rate": 0.0004999986734609438, "loss": 6.6044, "mean_token_accuracy": 0.11070828661322593, "num_tokens": 2084557.0, "step": 1130 }, { "entropy": 6.817347526550293, "epoch": 0.09535811804242807, "grad_norm": 1.0625, "learning_rate": 0.0004999985686356923, "loss": 6.497, "mean_token_accuracy": 0.10584703534841537, "num_tokens": 2093424.0, "step": 1135 }, { "entropy": 6.7462608337402346, "epoch": 0.09577819785759294, "grad_norm": 1.03125, "learning_rate": 0.000499998459824707, "loss": 6.6329, "mean_token_accuracy": 0.10303654298186302, "num_tokens": 2103066.0, "step": 1140 }, { "entropy": 6.799277830123901, "epoch": 0.09619827767275782, "grad_norm": 1.046875, "learning_rate": 0.00049999834702799, "loss": 6.5085, "mean_token_accuracy": 0.11131441742181777, "num_tokens": 2112447.0, "step": 1145 }, { "entropy": 6.711055421829224, "epoch": 0.0966183574879227, "grad_norm": 0.9375, "learning_rate": 0.0004999982302455431, "loss": 6.52, "mean_token_accuracy": 0.11281892731785774, "num_tokens": 2121949.0, "step": 1150 }, { "entropy": 6.780323314666748, "epoch": 0.09703843730308759, "grad_norm": 1.015625, "learning_rate": 0.0004999981094773683, "loss": 6.4157, "mean_token_accuracy": 0.1144998162984848, "num_tokens": 2130464.0, "step": 1155 }, { "entropy": 6.697625207901001, "epoch": 0.09745851711825247, "grad_norm": 1.140625, "learning_rate": 0.000499997984723468, "loss": 6.5921, "mean_token_accuracy": 0.1068018026649952, "num_tokens": 2139577.0, "step": 1160 }, { "entropy": 6.569090557098389, "epoch": 0.09787859693341736, "grad_norm": 0.96484375, "learning_rate": 0.0004999978559838441, "loss": 6.3121, "mean_token_accuracy": 0.11300956755876541, "num_tokens": 2147919.0, "step": 1165 }, { "entropy": 6.716167974472046, "epoch": 0.09829867674858223, "grad_norm": 1.0390625, "learning_rate": 0.0004999977232584991, "loss": 6.4791, "mean_token_accuracy": 0.11262017637491226, "num_tokens": 2156936.0, "step": 1170 }, { "entropy": 6.6336616516113285, "epoch": 0.09871875656374711, "grad_norm": 1.0859375, "learning_rate": 0.0004999975865474354, "loss": 6.5492, "mean_token_accuracy": 0.10994603037834168, "num_tokens": 2165362.0, "step": 1175 }, { "entropy": 6.719806575775147, "epoch": 0.099138836378912, "grad_norm": 1.1796875, "learning_rate": 0.0004999974458506551, "loss": 6.4705, "mean_token_accuracy": 0.11214353889226913, "num_tokens": 2173665.0, "step": 1180 }, { "entropy": 6.786266422271728, "epoch": 0.09955891619407688, "grad_norm": 1.2578125, "learning_rate": 0.000499997301168161, "loss": 6.4531, "mean_token_accuracy": 0.11377902403473854, "num_tokens": 2182222.0, "step": 1185 }, { "entropy": 6.670177459716797, "epoch": 0.09997899600924176, "grad_norm": 0.9609375, "learning_rate": 0.0004999971524999556, "loss": 6.528, "mean_token_accuracy": 0.11228533461689949, "num_tokens": 2192358.0, "step": 1190 }, { "entropy": 6.779563045501709, "epoch": 0.10039907582440663, "grad_norm": 1.03125, "learning_rate": 0.0004999969998460414, "loss": 6.5039, "mean_token_accuracy": 0.10956505164504052, "num_tokens": 2201889.0, "step": 1195 }, { "entropy": 6.6560157299041744, "epoch": 0.10081915563957151, "grad_norm": 1.3359375, "learning_rate": 0.0004999968432064213, "loss": 6.5232, "mean_token_accuracy": 0.11500915959477424, "num_tokens": 2211810.0, "step": 1200 }, { "entropy": 6.652071762084961, "epoch": 0.1012392354547364, "grad_norm": 0.921875, "learning_rate": 0.0004999966825810979, "loss": 6.4474, "mean_token_accuracy": 0.11259665861725807, "num_tokens": 2221123.0, "step": 1205 }, { "entropy": 6.634405040740967, "epoch": 0.10165931526990128, "grad_norm": 1.0703125, "learning_rate": 0.0004999965179700742, "loss": 6.402, "mean_token_accuracy": 0.1181789293885231, "num_tokens": 2230129.0, "step": 1210 }, { "entropy": 6.625933122634888, "epoch": 0.10207939508506617, "grad_norm": 1.03125, "learning_rate": 0.000499996349373353, "loss": 6.4624, "mean_token_accuracy": 0.11246607527136802, "num_tokens": 2239929.0, "step": 1215 }, { "entropy": 6.709180927276611, "epoch": 0.10249947490023105, "grad_norm": 1.0390625, "learning_rate": 0.0004999961767909374, "loss": 6.4292, "mean_token_accuracy": 0.11479318514466286, "num_tokens": 2248078.0, "step": 1220 }, { "entropy": 6.59263162612915, "epoch": 0.10291955471539592, "grad_norm": 1.0625, "learning_rate": 0.0004999960002228303, "loss": 6.5262, "mean_token_accuracy": 0.11000767946243287, "num_tokens": 2256975.0, "step": 1225 }, { "entropy": 6.708470964431763, "epoch": 0.1033396345305608, "grad_norm": 1.15625, "learning_rate": 0.0004999958196690349, "loss": 6.3792, "mean_token_accuracy": 0.11624118462204933, "num_tokens": 2265797.0, "step": 1230 }, { "entropy": 6.645881128311157, "epoch": 0.10375971434572569, "grad_norm": 1.0234375, "learning_rate": 0.0004999956351295545, "loss": 6.4736, "mean_token_accuracy": 0.1176276110112667, "num_tokens": 2274099.0, "step": 1235 }, { "entropy": 6.599815797805786, "epoch": 0.10417979416089057, "grad_norm": 1.03125, "learning_rate": 0.0004999954466043922, "loss": 6.3853, "mean_token_accuracy": 0.11810432821512222, "num_tokens": 2282360.0, "step": 1240 }, { "entropy": 6.57668776512146, "epoch": 0.10459987397605545, "grad_norm": 0.96875, "learning_rate": 0.0004999952540935514, "loss": 6.4891, "mean_token_accuracy": 0.11048517748713493, "num_tokens": 2292714.0, "step": 1245 }, { "entropy": 6.675060033798218, "epoch": 0.10501995379122034, "grad_norm": 1.0859375, "learning_rate": 0.0004999950575970356, "loss": 6.4361, "mean_token_accuracy": 0.11576245203614235, "num_tokens": 2301633.0, "step": 1250 }, { "entropy": 6.642887592315674, "epoch": 0.10544003360638521, "grad_norm": 1.0234375, "learning_rate": 0.0004999948571148482, "loss": 6.3931, "mean_token_accuracy": 0.12049147412180901, "num_tokens": 2310067.0, "step": 1255 }, { "entropy": 6.610925579071045, "epoch": 0.10586011342155009, "grad_norm": 1.046875, "learning_rate": 0.0004999946526469927, "loss": 6.4927, "mean_token_accuracy": 0.11412879601120948, "num_tokens": 2320090.0, "step": 1260 }, { "entropy": 6.649963521957398, "epoch": 0.10628019323671498, "grad_norm": 1.03125, "learning_rate": 0.0004999944441934728, "loss": 6.4451, "mean_token_accuracy": 0.11852803751826287, "num_tokens": 2329255.0, "step": 1265 }, { "entropy": 6.678138732910156, "epoch": 0.10670027305187986, "grad_norm": 1.109375, "learning_rate": 0.0004999942317542922, "loss": 6.5261, "mean_token_accuracy": 0.11407028958201408, "num_tokens": 2339535.0, "step": 1270 }, { "entropy": 6.635104560852051, "epoch": 0.10712035286704474, "grad_norm": 1.0546875, "learning_rate": 0.0004999940153294546, "loss": 6.425, "mean_token_accuracy": 0.11798783987760544, "num_tokens": 2348948.0, "step": 1275 }, { "entropy": 6.629437446594238, "epoch": 0.10754043268220961, "grad_norm": 0.99609375, "learning_rate": 0.000499993794918964, "loss": 6.4518, "mean_token_accuracy": 0.10851866900920867, "num_tokens": 2359141.0, "step": 1280 }, { "entropy": 6.612447357177734, "epoch": 0.1079605124973745, "grad_norm": 1.1875, "learning_rate": 0.0004999935705228241, "loss": 6.5007, "mean_token_accuracy": 0.10988411605358124, "num_tokens": 2368906.0, "step": 1285 }, { "entropy": 6.720192527770996, "epoch": 0.10838059231253938, "grad_norm": 1.15625, "learning_rate": 0.0004999933421410389, "loss": 6.4756, "mean_token_accuracy": 0.11632761880755424, "num_tokens": 2377029.0, "step": 1290 }, { "entropy": 6.682251882553101, "epoch": 0.10880067212770426, "grad_norm": 0.84765625, "learning_rate": 0.0004999931097736125, "loss": 6.5226, "mean_token_accuracy": 0.10841714516282082, "num_tokens": 2387088.0, "step": 1295 }, { "entropy": 6.616416501998901, "epoch": 0.10922075194286915, "grad_norm": 1.015625, "learning_rate": 0.0004999928734205492, "loss": 6.4358, "mean_token_accuracy": 0.11085559725761414, "num_tokens": 2395596.0, "step": 1300 }, { "entropy": 6.630216932296753, "epoch": 0.10964083175803403, "grad_norm": 1.09375, "learning_rate": 0.0004999926330818528, "loss": 6.4278, "mean_token_accuracy": 0.11868382543325424, "num_tokens": 2404506.0, "step": 1305 }, { "entropy": 6.615355587005615, "epoch": 0.1100609115731989, "grad_norm": 1.109375, "learning_rate": 0.0004999923887575278, "loss": 6.4742, "mean_token_accuracy": 0.11464583277702331, "num_tokens": 2414342.0, "step": 1310 }, { "entropy": 6.68165545463562, "epoch": 0.11048099138836379, "grad_norm": 1.0859375, "learning_rate": 0.0004999921404475785, "loss": 6.4271, "mean_token_accuracy": 0.11960532069206238, "num_tokens": 2423076.0, "step": 1315 }, { "entropy": 6.567938899993896, "epoch": 0.11090107120352867, "grad_norm": 0.91796875, "learning_rate": 0.0004999918881520093, "loss": 6.3809, "mean_token_accuracy": 0.1204459622502327, "num_tokens": 2432492.0, "step": 1320 }, { "entropy": 6.610611057281494, "epoch": 0.11132115101869355, "grad_norm": 0.96875, "learning_rate": 0.0004999916318708246, "loss": 6.3447, "mean_token_accuracy": 0.1213211365044117, "num_tokens": 2441916.0, "step": 1325 }, { "entropy": 6.550094270706177, "epoch": 0.11174123083385844, "grad_norm": 1.1015625, "learning_rate": 0.0004999913716040291, "loss": 6.4, "mean_token_accuracy": 0.11803905665874481, "num_tokens": 2450932.0, "step": 1330 }, { "entropy": 6.5825268745422365, "epoch": 0.11216131064902331, "grad_norm": 1.0859375, "learning_rate": 0.0004999911073516272, "loss": 6.4156, "mean_token_accuracy": 0.11501810997724533, "num_tokens": 2460058.0, "step": 1335 }, { "entropy": 6.541036558151245, "epoch": 0.11258139046418819, "grad_norm": 0.98046875, "learning_rate": 0.0004999908391136237, "loss": 6.3486, "mean_token_accuracy": 0.11862518936395645, "num_tokens": 2469607.0, "step": 1340 }, { "entropy": 6.54659481048584, "epoch": 0.11300147027935308, "grad_norm": 1.09375, "learning_rate": 0.0004999905668900234, "loss": 6.4037, "mean_token_accuracy": 0.11429757624864578, "num_tokens": 2478345.0, "step": 1345 }, { "entropy": 6.665723133087158, "epoch": 0.11342155009451796, "grad_norm": 1.15625, "learning_rate": 0.000499990290680831, "loss": 6.3362, "mean_token_accuracy": 0.11939993128180504, "num_tokens": 2486662.0, "step": 1350 }, { "entropy": 6.539735174179077, "epoch": 0.11384162990968284, "grad_norm": 1.0859375, "learning_rate": 0.0004999900104860516, "loss": 6.4496, "mean_token_accuracy": 0.11450904607772827, "num_tokens": 2495392.0, "step": 1355 }, { "entropy": 6.640576314926148, "epoch": 0.11426170972484773, "grad_norm": 1.0546875, "learning_rate": 0.0004999897263056898, "loss": 6.4824, "mean_token_accuracy": 0.11427311152219773, "num_tokens": 2505254.0, "step": 1360 }, { "entropy": 6.6059410572052, "epoch": 0.1146817895400126, "grad_norm": 1.0, "learning_rate": 0.000499989438139751, "loss": 6.2902, "mean_token_accuracy": 0.12163057401776314, "num_tokens": 2514096.0, "step": 1365 }, { "entropy": 6.572102785110474, "epoch": 0.11510186935517748, "grad_norm": 0.9453125, "learning_rate": 0.0004999891459882401, "loss": 6.3036, "mean_token_accuracy": 0.12106614261865616, "num_tokens": 2523635.0, "step": 1370 }, { "entropy": 6.518535518646241, "epoch": 0.11552194917034236, "grad_norm": 0.99609375, "learning_rate": 0.0004999888498511624, "loss": 6.3872, "mean_token_accuracy": 0.117999816685915, "num_tokens": 2532528.0, "step": 1375 }, { "entropy": 6.522701168060303, "epoch": 0.11594202898550725, "grad_norm": 1.0625, "learning_rate": 0.0004999885497285229, "loss": 6.3026, "mean_token_accuracy": 0.11809839084744453, "num_tokens": 2541893.0, "step": 1380 }, { "entropy": 6.516852188110351, "epoch": 0.11636210880067213, "grad_norm": 0.99609375, "learning_rate": 0.0004999882456203273, "loss": 6.3627, "mean_token_accuracy": 0.11867272853851318, "num_tokens": 2551551.0, "step": 1385 }, { "entropy": 6.592957019805908, "epoch": 0.11678218861583702, "grad_norm": 1.1171875, "learning_rate": 0.0004999879375265806, "loss": 6.314, "mean_token_accuracy": 0.1192450650036335, "num_tokens": 2560183.0, "step": 1390 }, { "entropy": 6.526823472976685, "epoch": 0.11720226843100189, "grad_norm": 1.1484375, "learning_rate": 0.0004999876254472886, "loss": 6.2065, "mean_token_accuracy": 0.127345572412014, "num_tokens": 2568697.0, "step": 1395 }, { "entropy": 6.488171815872192, "epoch": 0.11762234824616677, "grad_norm": 0.97265625, "learning_rate": 0.0004999873093824565, "loss": 6.4136, "mean_token_accuracy": 0.1172497920691967, "num_tokens": 2578151.0, "step": 1400 }, { "entropy": 6.697162342071533, "epoch": 0.11804242806133165, "grad_norm": 1.1171875, "learning_rate": 0.0004999869893320902, "loss": 6.5415, "mean_token_accuracy": 0.11695929765701293, "num_tokens": 2585901.0, "step": 1405 }, { "entropy": 6.558137512207031, "epoch": 0.11846250787649654, "grad_norm": 1.0234375, "learning_rate": 0.0004999866652961952, "loss": 6.3565, "mean_token_accuracy": 0.11195311546325684, "num_tokens": 2595655.0, "step": 1410 }, { "entropy": 6.547592639923096, "epoch": 0.11888258769166142, "grad_norm": 0.984375, "learning_rate": 0.0004999863372747773, "loss": 6.3241, "mean_token_accuracy": 0.1137452982366085, "num_tokens": 2604949.0, "step": 1415 }, { "entropy": 6.549184036254883, "epoch": 0.11930266750682629, "grad_norm": 1.125, "learning_rate": 0.0004999860052678423, "loss": 6.3987, "mean_token_accuracy": 0.12182095795869827, "num_tokens": 2614260.0, "step": 1420 }, { "entropy": 6.533220100402832, "epoch": 0.11972274732199117, "grad_norm": 1.046875, "learning_rate": 0.0004999856692753959, "loss": 6.3846, "mean_token_accuracy": 0.11606933474540711, "num_tokens": 2623740.0, "step": 1425 }, { "entropy": 6.56026554107666, "epoch": 0.12014282713715606, "grad_norm": 1.0390625, "learning_rate": 0.0004999853292974444, "loss": 6.2829, "mean_token_accuracy": 0.1191012591123581, "num_tokens": 2631998.0, "step": 1430 }, { "entropy": 6.436700010299683, "epoch": 0.12056290695232094, "grad_norm": 0.96875, "learning_rate": 0.0004999849853339936, "loss": 6.4441, "mean_token_accuracy": 0.12089451104402542, "num_tokens": 2641169.0, "step": 1435 }, { "entropy": 6.6503981590271, "epoch": 0.12098298676748583, "grad_norm": 0.9140625, "learning_rate": 0.0004999846373850497, "loss": 6.2726, "mean_token_accuracy": 0.12328374907374381, "num_tokens": 2650576.0, "step": 1440 }, { "entropy": 6.504758834838867, "epoch": 0.12140306658265071, "grad_norm": 1.0234375, "learning_rate": 0.0004999842854506186, "loss": 6.3597, "mean_token_accuracy": 0.11508475914597512, "num_tokens": 2660817.0, "step": 1445 }, { "entropy": 6.454709720611572, "epoch": 0.12182314639781558, "grad_norm": 1.0859375, "learning_rate": 0.0004999839295307069, "loss": 6.317, "mean_token_accuracy": 0.11818674132227898, "num_tokens": 2669338.0, "step": 1450 }, { "entropy": 6.5724732875823975, "epoch": 0.12224322621298046, "grad_norm": 1.078125, "learning_rate": 0.0004999835696253206, "loss": 6.3698, "mean_token_accuracy": 0.11763316094875335, "num_tokens": 2679108.0, "step": 1455 }, { "entropy": 6.542471504211425, "epoch": 0.12266330602814535, "grad_norm": 0.9453125, "learning_rate": 0.0004999832057344664, "loss": 6.3312, "mean_token_accuracy": 0.11857884675264359, "num_tokens": 2688126.0, "step": 1460 }, { "entropy": 6.3690132141113285, "epoch": 0.12308338584331023, "grad_norm": 1.0390625, "learning_rate": 0.0004999828378581504, "loss": 6.2827, "mean_token_accuracy": 0.12631092369556426, "num_tokens": 2697245.0, "step": 1465 }, { "entropy": 6.5668089389801025, "epoch": 0.12350346565847511, "grad_norm": 1.046875, "learning_rate": 0.0004999824659963793, "loss": 6.3543, "mean_token_accuracy": 0.12048940360546112, "num_tokens": 2705934.0, "step": 1470 }, { "entropy": 6.516648006439209, "epoch": 0.12392354547364, "grad_norm": 1.125, "learning_rate": 0.0004999820901491598, "loss": 6.2753, "mean_token_accuracy": 0.12523386031389236, "num_tokens": 2714367.0, "step": 1475 }, { "entropy": 6.416815328598022, "epoch": 0.12434362528880487, "grad_norm": 1.0390625, "learning_rate": 0.0004999817103164983, "loss": 6.3117, "mean_token_accuracy": 0.12113343179225922, "num_tokens": 2724366.0, "step": 1480 }, { "entropy": 6.518594264984131, "epoch": 0.12476370510396975, "grad_norm": 0.953125, "learning_rate": 0.0004999813264984017, "loss": 6.3262, "mean_token_accuracy": 0.11913523152470588, "num_tokens": 2733980.0, "step": 1485 }, { "entropy": 6.520108652114868, "epoch": 0.12518378491913462, "grad_norm": 1.0234375, "learning_rate": 0.0004999809386948767, "loss": 6.3232, "mean_token_accuracy": 0.11875561475753785, "num_tokens": 2744013.0, "step": 1490 }, { "entropy": 6.4508843421936035, "epoch": 0.12560386473429952, "grad_norm": 1.1640625, "learning_rate": 0.0004999805469059302, "loss": 6.3917, "mean_token_accuracy": 0.1202739343047142, "num_tokens": 2753385.0, "step": 1495 }, { "entropy": 6.467165565490722, "epoch": 0.1260239445494644, "grad_norm": 1.03125, "learning_rate": 0.0004999801511315693, "loss": 6.2443, "mean_token_accuracy": 0.11950960382819176, "num_tokens": 2762875.0, "step": 1500 }, { "entropy": 6.561000490188599, "epoch": 0.1264440243646293, "grad_norm": 1.0234375, "learning_rate": 0.0004999797513718007, "loss": 6.3133, "mean_token_accuracy": 0.12554540634155273, "num_tokens": 2772182.0, "step": 1505 }, { "entropy": 6.398244476318359, "epoch": 0.12686410417979416, "grad_norm": 1.0234375, "learning_rate": 0.0004999793476266317, "loss": 6.2652, "mean_token_accuracy": 0.12494927272200584, "num_tokens": 2780814.0, "step": 1510 }, { "entropy": 6.759689664840698, "epoch": 0.12728418399495905, "grad_norm": 1.0234375, "learning_rate": 0.0004999789398960695, "loss": 6.5371, "mean_token_accuracy": 0.120218076556921, "num_tokens": 2791104.0, "step": 1515 }, { "entropy": 6.380699729919433, "epoch": 0.12770426381012392, "grad_norm": 0.9921875, "learning_rate": 0.0004999785281801212, "loss": 6.2392, "mean_token_accuracy": 0.12141881808638573, "num_tokens": 2800081.0, "step": 1520 }, { "entropy": 6.502162122726441, "epoch": 0.1281243436252888, "grad_norm": 1.0703125, "learning_rate": 0.000499978112478794, "loss": 6.3645, "mean_token_accuracy": 0.11820052862167359, "num_tokens": 2809096.0, "step": 1525 }, { "entropy": 6.559705686569214, "epoch": 0.1285444234404537, "grad_norm": 1.0, "learning_rate": 0.0004999776927920955, "loss": 6.3324, "mean_token_accuracy": 0.12376131415367127, "num_tokens": 2818857.0, "step": 1530 }, { "entropy": 6.478033876419067, "epoch": 0.12896450325561856, "grad_norm": 1.0703125, "learning_rate": 0.000499977269120033, "loss": 6.3924, "mean_token_accuracy": 0.11640017554163933, "num_tokens": 2829332.0, "step": 1535 }, { "entropy": 6.471277475357056, "epoch": 0.12938458307078346, "grad_norm": 0.9453125, "learning_rate": 0.000499976841462614, "loss": 6.3118, "mean_token_accuracy": 0.11578154116868973, "num_tokens": 2839193.0, "step": 1540 }, { "entropy": 6.515983152389526, "epoch": 0.12980466288594833, "grad_norm": 0.94921875, "learning_rate": 0.000499976409819846, "loss": 6.3126, "mean_token_accuracy": 0.1165178470313549, "num_tokens": 2848535.0, "step": 1545 }, { "entropy": 6.329218864440918, "epoch": 0.1302247427011132, "grad_norm": 0.9609375, "learning_rate": 0.0004999759741917369, "loss": 6.2119, "mean_token_accuracy": 0.12768493369221687, "num_tokens": 2858090.0, "step": 1550 }, { "entropy": 6.4847986698150635, "epoch": 0.1306448225162781, "grad_norm": 1.1640625, "learning_rate": 0.0004999755345782941, "loss": 6.3672, "mean_token_accuracy": 0.1186487466096878, "num_tokens": 2866984.0, "step": 1555 }, { "entropy": 6.419411611557007, "epoch": 0.13106490233144297, "grad_norm": 0.89453125, "learning_rate": 0.0004999750909795256, "loss": 6.1757, "mean_token_accuracy": 0.1280258044600487, "num_tokens": 2876550.0, "step": 1560 }, { "entropy": 6.461032104492188, "epoch": 0.13148498214660786, "grad_norm": 0.98046875, "learning_rate": 0.0004999746433954394, "loss": 6.2774, "mean_token_accuracy": 0.1213872842490673, "num_tokens": 2885782.0, "step": 1565 }, { "entropy": 6.447916793823242, "epoch": 0.13190506196177273, "grad_norm": 1.0, "learning_rate": 0.000499974191826043, "loss": 6.2448, "mean_token_accuracy": 0.13687582612037658, "num_tokens": 2894807.0, "step": 1570 }, { "entropy": 6.439778518676758, "epoch": 0.1323251417769376, "grad_norm": 1.171875, "learning_rate": 0.0004999737362713448, "loss": 6.2925, "mean_token_accuracy": 0.1238982230424881, "num_tokens": 2904076.0, "step": 1575 }, { "entropy": 6.471430492401123, "epoch": 0.1327452215921025, "grad_norm": 1.0390625, "learning_rate": 0.0004999732767313527, "loss": 6.2033, "mean_token_accuracy": 0.1205870471894741, "num_tokens": 2913761.0, "step": 1580 }, { "entropy": 6.509069633483887, "epoch": 0.13316530140726737, "grad_norm": 1.0546875, "learning_rate": 0.0004999728132060746, "loss": 6.4228, "mean_token_accuracy": 0.12286271527409554, "num_tokens": 2922848.0, "step": 1585 }, { "entropy": 6.5165454864501955, "epoch": 0.13358538122243227, "grad_norm": 0.953125, "learning_rate": 0.0004999723456955192, "loss": 6.3079, "mean_token_accuracy": 0.11906806230545045, "num_tokens": 2932718.0, "step": 1590 }, { "entropy": 6.353040504455566, "epoch": 0.13400546103759714, "grad_norm": 0.9765625, "learning_rate": 0.0004999718741996945, "loss": 6.2648, "mean_token_accuracy": 0.12362491562962533, "num_tokens": 2942686.0, "step": 1595 }, { "entropy": 6.480581188201905, "epoch": 0.13442554085276204, "grad_norm": 0.98046875, "learning_rate": 0.000499971398718609, "loss": 6.2304, "mean_token_accuracy": 0.12233746945858001, "num_tokens": 2952096.0, "step": 1600 }, { "entropy": 6.41249566078186, "epoch": 0.1348456206679269, "grad_norm": 1.0234375, "learning_rate": 0.0004999709192522708, "loss": 6.3139, "mean_token_accuracy": 0.12512291446328164, "num_tokens": 2960660.0, "step": 1605 }, { "entropy": 6.536613845825196, "epoch": 0.13526570048309178, "grad_norm": 0.91796875, "learning_rate": 0.0004999704358006887, "loss": 6.3118, "mean_token_accuracy": 0.12129077091813087, "num_tokens": 2969834.0, "step": 1610 }, { "entropy": 6.4085368633270265, "epoch": 0.13568578029825668, "grad_norm": 1.09375, "learning_rate": 0.0004999699483638712, "loss": 6.2906, "mean_token_accuracy": 0.12232841104269028, "num_tokens": 2979023.0, "step": 1615 }, { "entropy": 6.476312971115112, "epoch": 0.13610586011342155, "grad_norm": 1.015625, "learning_rate": 0.0004999694569418269, "loss": 6.2964, "mean_token_accuracy": 0.12233099341392517, "num_tokens": 2988083.0, "step": 1620 }, { "entropy": 6.359239149093628, "epoch": 0.13652593992858644, "grad_norm": 0.9921875, "learning_rate": 0.0004999689615345645, "loss": 6.2196, "mean_token_accuracy": 0.12490532472729683, "num_tokens": 2997240.0, "step": 1625 }, { "entropy": 6.505274820327759, "epoch": 0.1369460197437513, "grad_norm": 1.0859375, "learning_rate": 0.0004999684621420928, "loss": 6.2805, "mean_token_accuracy": 0.12174654453992843, "num_tokens": 3007077.0, "step": 1630 }, { "entropy": 6.501539659500122, "epoch": 0.13736609955891618, "grad_norm": 1.0078125, "learning_rate": 0.0004999679587644205, "loss": 6.3282, "mean_token_accuracy": 0.11869422942399979, "num_tokens": 3015821.0, "step": 1635 }, { "entropy": 6.434766483306885, "epoch": 0.13778617937408108, "grad_norm": 1.046875, "learning_rate": 0.0004999674514015568, "loss": 6.2508, "mean_token_accuracy": 0.1246812529861927, "num_tokens": 3025858.0, "step": 1640 }, { "entropy": 6.406217813491821, "epoch": 0.13820625918924595, "grad_norm": 0.98046875, "learning_rate": 0.0004999669400535105, "loss": 6.2132, "mean_token_accuracy": 0.12023203670978547, "num_tokens": 3035537.0, "step": 1645 }, { "entropy": 6.359542560577393, "epoch": 0.13862633900441085, "grad_norm": 1.140625, "learning_rate": 0.0004999664247202907, "loss": 6.152, "mean_token_accuracy": 0.12406394928693772, "num_tokens": 3044204.0, "step": 1650 }, { "entropy": 6.404636430740356, "epoch": 0.13904641881957572, "grad_norm": 1.03125, "learning_rate": 0.0004999659054019066, "loss": 6.2994, "mean_token_accuracy": 0.12448503151535988, "num_tokens": 3053111.0, "step": 1655 }, { "entropy": 6.443476963043213, "epoch": 0.1394664986347406, "grad_norm": 1.0390625, "learning_rate": 0.0004999653820983673, "loss": 6.2201, "mean_token_accuracy": 0.12843194082379342, "num_tokens": 3062456.0, "step": 1660 }, { "entropy": 6.356498098373413, "epoch": 0.13988657844990549, "grad_norm": 0.98828125, "learning_rate": 0.000499964854809682, "loss": 6.2579, "mean_token_accuracy": 0.12453076243400574, "num_tokens": 3071132.0, "step": 1665 }, { "entropy": 6.388091611862182, "epoch": 0.14030665826507036, "grad_norm": 0.98046875, "learning_rate": 0.0004999643235358602, "loss": 6.2078, "mean_token_accuracy": 0.12833356559276582, "num_tokens": 3080892.0, "step": 1670 }, { "entropy": 6.392906522750854, "epoch": 0.14072673808023525, "grad_norm": 1.015625, "learning_rate": 0.0004999637882769112, "loss": 6.1429, "mean_token_accuracy": 0.12803655937314035, "num_tokens": 3089874.0, "step": 1675 }, { "entropy": 6.369514799118042, "epoch": 0.14114681789540012, "grad_norm": 0.91796875, "learning_rate": 0.0004999632490328447, "loss": 6.2814, "mean_token_accuracy": 0.12487674206495285, "num_tokens": 3099535.0, "step": 1680 }, { "entropy": 6.432224130630493, "epoch": 0.14156689771056502, "grad_norm": 0.984375, "learning_rate": 0.0004999627058036699, "loss": 6.24, "mean_token_accuracy": 0.12075779214501381, "num_tokens": 3108772.0, "step": 1685 }, { "entropy": 6.430401134490967, "epoch": 0.1419869775257299, "grad_norm": 1.0234375, "learning_rate": 0.0004999621585893966, "loss": 6.2696, "mean_token_accuracy": 0.11704754754900933, "num_tokens": 3118333.0, "step": 1690 }, { "entropy": 6.450057506561279, "epoch": 0.14240705734089476, "grad_norm": 1.0625, "learning_rate": 0.0004999616073900346, "loss": 6.3013, "mean_token_accuracy": 0.12180939391255378, "num_tokens": 3127356.0, "step": 1695 }, { "entropy": 6.412153673171997, "epoch": 0.14282713715605966, "grad_norm": 1.0859375, "learning_rate": 0.0004999610522055935, "loss": 6.2662, "mean_token_accuracy": 0.1200573742389679, "num_tokens": 3136859.0, "step": 1700 }, { "entropy": 6.451931762695312, "epoch": 0.14324721697122453, "grad_norm": 0.9921875, "learning_rate": 0.0004999604930360832, "loss": 6.2945, "mean_token_accuracy": 0.12161469012498856, "num_tokens": 3146607.0, "step": 1705 }, { "entropy": 6.3816108226776125, "epoch": 0.14366729678638943, "grad_norm": 0.95703125, "learning_rate": 0.0004999599298815136, "loss": 6.2364, "mean_token_accuracy": 0.12764545828104018, "num_tokens": 3156327.0, "step": 1710 }, { "entropy": 6.309280204772949, "epoch": 0.1440873766015543, "grad_norm": 1.5390625, "learning_rate": 0.0004999593627418947, "loss": 6.177, "mean_token_accuracy": 0.13247063681483268, "num_tokens": 3165559.0, "step": 1715 }, { "entropy": 6.405248212814331, "epoch": 0.14450745641671917, "grad_norm": 1.0625, "learning_rate": 0.0004999587916172365, "loss": 6.2704, "mean_token_accuracy": 0.1183898076415062, "num_tokens": 3173850.0, "step": 1720 }, { "entropy": 6.435620069503784, "epoch": 0.14492753623188406, "grad_norm": 1.0078125, "learning_rate": 0.0004999582165075492, "loss": 6.22, "mean_token_accuracy": 0.11956866905093193, "num_tokens": 3182838.0, "step": 1725 }, { "entropy": 6.2884269714355465, "epoch": 0.14534761604704893, "grad_norm": 1.0234375, "learning_rate": 0.0004999576374128429, "loss": 6.202, "mean_token_accuracy": 0.1219302274286747, "num_tokens": 3191692.0, "step": 1730 }, { "entropy": 6.500776195526123, "epoch": 0.14576769586221383, "grad_norm": 1.0703125, "learning_rate": 0.0004999570543331279, "loss": 6.226, "mean_token_accuracy": 0.1263854332268238, "num_tokens": 3200069.0, "step": 1735 }, { "entropy": 6.411444854736328, "epoch": 0.1461877756773787, "grad_norm": 1.140625, "learning_rate": 0.0004999564672684145, "loss": 6.3228, "mean_token_accuracy": 0.12090336456894875, "num_tokens": 3209653.0, "step": 1740 }, { "entropy": 6.448664712905884, "epoch": 0.14660785549254357, "grad_norm": 1.03125, "learning_rate": 0.0004999558762187131, "loss": 6.1938, "mean_token_accuracy": 0.12701231315732003, "num_tokens": 3218313.0, "step": 1745 }, { "entropy": 6.32896614074707, "epoch": 0.14702793530770847, "grad_norm": 1.015625, "learning_rate": 0.0004999552811840342, "loss": 6.1297, "mean_token_accuracy": 0.12769370079040526, "num_tokens": 3227525.0, "step": 1750 }, { "entropy": 6.335414171218872, "epoch": 0.14744801512287334, "grad_norm": 0.94921875, "learning_rate": 0.0004999546821643884, "loss": 6.2408, "mean_token_accuracy": 0.12636618986725806, "num_tokens": 3237022.0, "step": 1755 }, { "entropy": 6.317769384384155, "epoch": 0.14786809493803824, "grad_norm": 0.9921875, "learning_rate": 0.0004999540791597861, "loss": 6.1464, "mean_token_accuracy": 0.12537204548716546, "num_tokens": 3246605.0, "step": 1760 }, { "entropy": 6.258312511444092, "epoch": 0.1482881747532031, "grad_norm": 1.03125, "learning_rate": 0.0004999534721702383, "loss": 6.0956, "mean_token_accuracy": 0.13141294568777084, "num_tokens": 3255587.0, "step": 1765 }, { "entropy": 6.364277791976929, "epoch": 0.148708254568368, "grad_norm": 1.0234375, "learning_rate": 0.0004999528611957553, "loss": 6.1968, "mean_token_accuracy": 0.1267327442765236, "num_tokens": 3265669.0, "step": 1770 }, { "entropy": 6.433037424087525, "epoch": 0.14912833438353287, "grad_norm": 1.078125, "learning_rate": 0.0004999522462363485, "loss": 6.1795, "mean_token_accuracy": 0.12822128161787988, "num_tokens": 3275013.0, "step": 1775 }, { "entropy": 6.372742748260498, "epoch": 0.14954841419869774, "grad_norm": 0.91796875, "learning_rate": 0.0004999516272920283, "loss": 6.2775, "mean_token_accuracy": 0.12774404734373093, "num_tokens": 3284723.0, "step": 1780 }, { "entropy": 6.256136322021485, "epoch": 0.14996849401386264, "grad_norm": 0.96484375, "learning_rate": 0.000499951004362806, "loss": 6.1087, "mean_token_accuracy": 0.13196263536810876, "num_tokens": 3293860.0, "step": 1785 }, { "entropy": 6.278848552703858, "epoch": 0.1503885738290275, "grad_norm": 0.9765625, "learning_rate": 0.0004999503774486924, "loss": 6.1623, "mean_token_accuracy": 0.13007338494062423, "num_tokens": 3303158.0, "step": 1790 }, { "entropy": 6.253765487670899, "epoch": 0.1508086536441924, "grad_norm": 0.96484375, "learning_rate": 0.0004999497465496987, "loss": 6.1083, "mean_token_accuracy": 0.1231241799890995, "num_tokens": 3313068.0, "step": 1795 }, { "entropy": 6.319281959533692, "epoch": 0.15122873345935728, "grad_norm": 1.0390625, "learning_rate": 0.000499949111665836, "loss": 6.1761, "mean_token_accuracy": 0.12510209009051323, "num_tokens": 3321885.0, "step": 1800 }, { "entropy": 6.368197298049926, "epoch": 0.15164881327452215, "grad_norm": 1.015625, "learning_rate": 0.0004999484727971158, "loss": 6.1707, "mean_token_accuracy": 0.12798358947038652, "num_tokens": 3330924.0, "step": 1805 }, { "entropy": 6.339307403564453, "epoch": 0.15206889308968705, "grad_norm": 1.0625, "learning_rate": 0.000499947829943549, "loss": 6.1964, "mean_token_accuracy": 0.12618306949734687, "num_tokens": 3340070.0, "step": 1810 }, { "entropy": 6.394219160079956, "epoch": 0.15248897290485192, "grad_norm": 0.984375, "learning_rate": 0.0004999471831051474, "loss": 6.1922, "mean_token_accuracy": 0.13684661015868188, "num_tokens": 3349870.0, "step": 1815 }, { "entropy": 6.330759143829345, "epoch": 0.1529090527200168, "grad_norm": 0.94921875, "learning_rate": 0.0004999465322819222, "loss": 6.2371, "mean_token_accuracy": 0.12111249193549156, "num_tokens": 3359573.0, "step": 1820 }, { "entropy": 6.372816276550293, "epoch": 0.15332913253518168, "grad_norm": 1.046875, "learning_rate": 0.0004999458774738851, "loss": 6.1732, "mean_token_accuracy": 0.13470285311341285, "num_tokens": 3368577.0, "step": 1825 }, { "entropy": 6.352361059188842, "epoch": 0.15374921235034655, "grad_norm": 1.078125, "learning_rate": 0.0004999452186810476, "loss": 6.1469, "mean_token_accuracy": 0.13113251850008964, "num_tokens": 3377801.0, "step": 1830 }, { "entropy": 6.3680521011352536, "epoch": 0.15416929216551145, "grad_norm": 1.046875, "learning_rate": 0.0004999445559034214, "loss": 6.1995, "mean_token_accuracy": 0.12895982414484025, "num_tokens": 3386666.0, "step": 1835 }, { "entropy": 6.443807363510132, "epoch": 0.15458937198067632, "grad_norm": 0.97265625, "learning_rate": 0.0004999438891410181, "loss": 6.3344, "mean_token_accuracy": 0.12429568618535995, "num_tokens": 3396086.0, "step": 1840 }, { "entropy": 6.371559190750122, "epoch": 0.15500945179584122, "grad_norm": 1.0234375, "learning_rate": 0.0004999432183938496, "loss": 6.2503, "mean_token_accuracy": 0.1258139818906784, "num_tokens": 3404894.0, "step": 1845 }, { "entropy": 6.40411787033081, "epoch": 0.1554295316110061, "grad_norm": 1.015625, "learning_rate": 0.0004999425436619279, "loss": 6.2301, "mean_token_accuracy": 0.1250107169151306, "num_tokens": 3414172.0, "step": 1850 }, { "entropy": 6.4263053894042965, "epoch": 0.15584961142617096, "grad_norm": 0.9375, "learning_rate": 0.000499941864945265, "loss": 6.2069, "mean_token_accuracy": 0.12341500893235206, "num_tokens": 3423409.0, "step": 1855 }, { "entropy": 6.2579625129699705, "epoch": 0.15626969124133586, "grad_norm": 0.99609375, "learning_rate": 0.0004999411822438726, "loss": 6.1554, "mean_token_accuracy": 0.12717969343066216, "num_tokens": 3433047.0, "step": 1860 }, { "entropy": 6.4037513256073, "epoch": 0.15668977105650073, "grad_norm": 1.078125, "learning_rate": 0.000499940495557763, "loss": 6.1468, "mean_token_accuracy": 0.12783457711338997, "num_tokens": 3442490.0, "step": 1865 }, { "entropy": 6.303406810760498, "epoch": 0.15710985087166562, "grad_norm": 0.9921875, "learning_rate": 0.0004999398048869485, "loss": 6.2099, "mean_token_accuracy": 0.129954195022583, "num_tokens": 3451804.0, "step": 1870 }, { "entropy": 6.385490417480469, "epoch": 0.1575299306868305, "grad_norm": 0.984375, "learning_rate": 0.000499939110231441, "loss": 6.199, "mean_token_accuracy": 0.1304432988166809, "num_tokens": 3461481.0, "step": 1875 }, { "entropy": 6.364220190048218, "epoch": 0.1579500105019954, "grad_norm": 1.0234375, "learning_rate": 0.0004999384115912531, "loss": 6.2449, "mean_token_accuracy": 0.13135363310575485, "num_tokens": 3471798.0, "step": 1880 }, { "entropy": 6.247316694259643, "epoch": 0.15837009031716026, "grad_norm": 0.96875, "learning_rate": 0.000499937708966397, "loss": 6.1296, "mean_token_accuracy": 0.12637364491820335, "num_tokens": 3481386.0, "step": 1885 }, { "entropy": 6.332306051254273, "epoch": 0.15879017013232513, "grad_norm": 0.97265625, "learning_rate": 0.0004999370023568853, "loss": 6.127, "mean_token_accuracy": 0.1316571466624737, "num_tokens": 3489981.0, "step": 1890 }, { "entropy": 6.299954462051391, "epoch": 0.15921024994749003, "grad_norm": 1.03125, "learning_rate": 0.0004999362917627304, "loss": 6.1227, "mean_token_accuracy": 0.1305247150361538, "num_tokens": 3498551.0, "step": 1895 }, { "entropy": 6.316105461120605, "epoch": 0.1596303297626549, "grad_norm": 1.046875, "learning_rate": 0.0004999355771839448, "loss": 6.0979, "mean_token_accuracy": 0.12954429015517235, "num_tokens": 3507921.0, "step": 1900 }, { "entropy": 6.470440483093261, "epoch": 0.1600504095778198, "grad_norm": 1.078125, "learning_rate": 0.0004999348586205414, "loss": 6.2729, "mean_token_accuracy": 0.13220328316092492, "num_tokens": 3517570.0, "step": 1905 }, { "entropy": 6.38808388710022, "epoch": 0.16047048939298467, "grad_norm": 1.0703125, "learning_rate": 0.0004999341360725327, "loss": 6.2438, "mean_token_accuracy": 0.123927091807127, "num_tokens": 3526774.0, "step": 1910 }, { "entropy": 6.285849714279175, "epoch": 0.16089056920814954, "grad_norm": 1.03125, "learning_rate": 0.0004999334095399317, "loss": 6.1859, "mean_token_accuracy": 0.1361298866569996, "num_tokens": 3535319.0, "step": 1915 }, { "entropy": 6.249746656417846, "epoch": 0.16131064902331443, "grad_norm": 0.98828125, "learning_rate": 0.0004999326790227512, "loss": 6.1605, "mean_token_accuracy": 0.1271871216595173, "num_tokens": 3544468.0, "step": 1920 }, { "entropy": 6.217294788360595, "epoch": 0.1617307288384793, "grad_norm": 0.9140625, "learning_rate": 0.0004999319445210041, "loss": 6.0261, "mean_token_accuracy": 0.1361843690276146, "num_tokens": 3553529.0, "step": 1925 }, { "entropy": 6.290815734863282, "epoch": 0.1621508086536442, "grad_norm": 0.96875, "learning_rate": 0.0004999312060347034, "loss": 6.1011, "mean_token_accuracy": 0.13233864828944206, "num_tokens": 3563053.0, "step": 1930 }, { "entropy": 6.224975728988648, "epoch": 0.16257088846880907, "grad_norm": 0.953125, "learning_rate": 0.0004999304635638621, "loss": 6.0288, "mean_token_accuracy": 0.1342104844748974, "num_tokens": 3571877.0, "step": 1935 }, { "entropy": 6.233099460601807, "epoch": 0.16299096828397394, "grad_norm": 0.92578125, "learning_rate": 0.0004999297171084935, "loss": 6.091, "mean_token_accuracy": 0.13373700231313707, "num_tokens": 3581496.0, "step": 1940 }, { "entropy": 6.324843549728394, "epoch": 0.16341104809913884, "grad_norm": 0.98828125, "learning_rate": 0.0004999289666686109, "loss": 6.1071, "mean_token_accuracy": 0.1308230109512806, "num_tokens": 3590752.0, "step": 1945 }, { "entropy": 6.129473495483398, "epoch": 0.1638311279143037, "grad_norm": 0.98046875, "learning_rate": 0.0004999282122442274, "loss": 6.1072, "mean_token_accuracy": 0.1328013814985752, "num_tokens": 3599885.0, "step": 1950 }, { "entropy": 6.387533235549927, "epoch": 0.1642512077294686, "grad_norm": 0.9296875, "learning_rate": 0.0004999274538353564, "loss": 6.1968, "mean_token_accuracy": 0.12293331325054169, "num_tokens": 3610039.0, "step": 1955 }, { "entropy": 6.2677867889404295, "epoch": 0.16467128754463348, "grad_norm": 1.015625, "learning_rate": 0.0004999266914420114, "loss": 6.1123, "mean_token_accuracy": 0.12491545528173446, "num_tokens": 3619954.0, "step": 1960 }, { "entropy": 6.291842746734619, "epoch": 0.16509136735979837, "grad_norm": 1.0078125, "learning_rate": 0.000499925925064206, "loss": 6.0646, "mean_token_accuracy": 0.13617814630270003, "num_tokens": 3628164.0, "step": 1965 }, { "entropy": 6.377547359466552, "epoch": 0.16551144717496324, "grad_norm": 1.046875, "learning_rate": 0.0004999251547019535, "loss": 6.2126, "mean_token_accuracy": 0.13370679765939714, "num_tokens": 3636778.0, "step": 1970 }, { "entropy": 6.318364191055298, "epoch": 0.16593152699012811, "grad_norm": 0.9609375, "learning_rate": 0.0004999243803552678, "loss": 6.1666, "mean_token_accuracy": 0.13474627435207367, "num_tokens": 3647046.0, "step": 1975 }, { "entropy": 6.2661604404449465, "epoch": 0.166351606805293, "grad_norm": 1.03125, "learning_rate": 0.0004999236020241625, "loss": 6.0969, "mean_token_accuracy": 0.1302388660609722, "num_tokens": 3656130.0, "step": 1980 }, { "entropy": 6.294794940948487, "epoch": 0.16677168662045788, "grad_norm": 0.9921875, "learning_rate": 0.0004999228197086514, "loss": 6.1791, "mean_token_accuracy": 0.12147556319832802, "num_tokens": 3666145.0, "step": 1985 }, { "entropy": 6.308886766433716, "epoch": 0.16719176643562278, "grad_norm": 0.88671875, "learning_rate": 0.0004999220334087484, "loss": 6.2221, "mean_token_accuracy": 0.12820759564638137, "num_tokens": 3676722.0, "step": 1990 }, { "entropy": 6.34148588180542, "epoch": 0.16761184625078765, "grad_norm": 1.0, "learning_rate": 0.0004999212431244673, "loss": 6.1977, "mean_token_accuracy": 0.1265730917453766, "num_tokens": 3685880.0, "step": 1995 }, { "entropy": 6.220745372772217, "epoch": 0.16803192606595252, "grad_norm": 0.98828125, "learning_rate": 0.0004999204488558222, "loss": 6.0332, "mean_token_accuracy": 0.13368572890758515, "num_tokens": 3695167.0, "step": 2000 }, { "entropy": 6.279938268661499, "epoch": 0.16845200588111742, "grad_norm": 0.96875, "learning_rate": 0.0004999196506028273, "loss": 6.1455, "mean_token_accuracy": 0.12803823873400688, "num_tokens": 3703700.0, "step": 2005 }, { "entropy": 6.340878582000732, "epoch": 0.1688720856962823, "grad_norm": 1.0390625, "learning_rate": 0.0004999188483654965, "loss": 6.0938, "mean_token_accuracy": 0.12776080071926116, "num_tokens": 3712825.0, "step": 2010 }, { "entropy": 6.229676914215088, "epoch": 0.16929216551144718, "grad_norm": 0.9453125, "learning_rate": 0.0004999180421438442, "loss": 6.0447, "mean_token_accuracy": 0.13442618474364282, "num_tokens": 3721807.0, "step": 2015 }, { "entropy": 6.3377564430236815, "epoch": 0.16971224532661205, "grad_norm": 1.0625, "learning_rate": 0.0004999172319378846, "loss": 6.2308, "mean_token_accuracy": 0.12342165559530258, "num_tokens": 3730502.0, "step": 2020 }, { "entropy": 6.334515047073364, "epoch": 0.17013232514177692, "grad_norm": 0.98828125, "learning_rate": 0.0004999164177476319, "loss": 6.1138, "mean_token_accuracy": 0.13388336971402168, "num_tokens": 3739696.0, "step": 2025 }, { "entropy": 6.170955038070678, "epoch": 0.17055240495694182, "grad_norm": 1.0625, "learning_rate": 0.0004999155995731009, "loss": 6.1168, "mean_token_accuracy": 0.1329979881644249, "num_tokens": 3748675.0, "step": 2030 }, { "entropy": 6.440923643112183, "epoch": 0.1709724847721067, "grad_norm": 1.0234375, "learning_rate": 0.0004999147774143057, "loss": 6.1895, "mean_token_accuracy": 0.12849014177918433, "num_tokens": 3757714.0, "step": 2035 }, { "entropy": 6.217456531524658, "epoch": 0.1713925645872716, "grad_norm": 1.0, "learning_rate": 0.000499913951271261, "loss": 6.0181, "mean_token_accuracy": 0.13668849244713782, "num_tokens": 3767589.0, "step": 2040 }, { "entropy": 6.216994047164917, "epoch": 0.17181264440243646, "grad_norm": 1.125, "learning_rate": 0.0004999131211439816, "loss": 6.1246, "mean_token_accuracy": 0.13397686704993247, "num_tokens": 3777261.0, "step": 2045 }, { "entropy": 6.3198566913604735, "epoch": 0.17223272421760136, "grad_norm": 1.015625, "learning_rate": 0.000499912287032482, "loss": 6.0738, "mean_token_accuracy": 0.13602124899625778, "num_tokens": 3786658.0, "step": 2050 }, { "entropy": 6.19984622001648, "epoch": 0.17265280403276623, "grad_norm": 1.0703125, "learning_rate": 0.000499911448936777, "loss": 6.0669, "mean_token_accuracy": 0.14067015573382377, "num_tokens": 3794977.0, "step": 2055 }, { "entropy": 6.179085731506348, "epoch": 0.1730728838479311, "grad_norm": 0.93359375, "learning_rate": 0.0004999106068568816, "loss": 6.1457, "mean_token_accuracy": 0.12947675883769988, "num_tokens": 3805138.0, "step": 2060 }, { "entropy": 6.279845762252807, "epoch": 0.173492963663096, "grad_norm": 1.015625, "learning_rate": 0.0004999097607928106, "loss": 6.0911, "mean_token_accuracy": 0.13879665359854698, "num_tokens": 3814444.0, "step": 2065 }, { "entropy": 6.212150764465332, "epoch": 0.17391304347826086, "grad_norm": 0.984375, "learning_rate": 0.0004999089107445788, "loss": 6.0398, "mean_token_accuracy": 0.13306153938174248, "num_tokens": 3822859.0, "step": 2070 }, { "entropy": 6.133330869674682, "epoch": 0.17433312329342576, "grad_norm": 0.9140625, "learning_rate": 0.0004999080567122016, "loss": 6.0707, "mean_token_accuracy": 0.13198764845728875, "num_tokens": 3833159.0, "step": 2075 }, { "entropy": 6.295455646514893, "epoch": 0.17475320310859063, "grad_norm": 1.015625, "learning_rate": 0.0004999071986956941, "loss": 6.0856, "mean_token_accuracy": 0.13797224685549736, "num_tokens": 3842136.0, "step": 2080 }, { "entropy": 6.208657741546631, "epoch": 0.1751732829237555, "grad_norm": 1.0234375, "learning_rate": 0.0004999063366950713, "loss": 6.1499, "mean_token_accuracy": 0.12877421900629998, "num_tokens": 3851406.0, "step": 2085 }, { "entropy": 6.217505025863647, "epoch": 0.1755933627389204, "grad_norm": 1.0078125, "learning_rate": 0.0004999054707103486, "loss": 6.0713, "mean_token_accuracy": 0.1279774695634842, "num_tokens": 3861061.0, "step": 2090 }, { "entropy": 6.265169095993042, "epoch": 0.17601344255408527, "grad_norm": 1.0234375, "learning_rate": 0.0004999046007415412, "loss": 6.0378, "mean_token_accuracy": 0.12900712937116623, "num_tokens": 3870357.0, "step": 2095 }, { "entropy": 6.2917054176330565, "epoch": 0.17643352236925017, "grad_norm": 1.0, "learning_rate": 0.0004999037267886646, "loss": 6.0715, "mean_token_accuracy": 0.13141706436872483, "num_tokens": 3879393.0, "step": 2100 }, { "entropy": 6.180794954299927, "epoch": 0.17685360218441504, "grad_norm": 1.046875, "learning_rate": 0.0004999028488517343, "loss": 6.0832, "mean_token_accuracy": 0.13525146320462228, "num_tokens": 3888030.0, "step": 2105 }, { "entropy": 6.266747093200683, "epoch": 0.1772736819995799, "grad_norm": 1.0234375, "learning_rate": 0.0004999019669307659, "loss": 6.0788, "mean_token_accuracy": 0.1376435212790966, "num_tokens": 3897430.0, "step": 2110 }, { "entropy": 6.238908100128174, "epoch": 0.1776937618147448, "grad_norm": 0.9296875, "learning_rate": 0.0004999010810257749, "loss": 6.0977, "mean_token_accuracy": 0.12719068825244903, "num_tokens": 3907711.0, "step": 2115 }, { "entropy": 6.189173746109009, "epoch": 0.17811384162990967, "grad_norm": 0.9765625, "learning_rate": 0.0004999001911367771, "loss": 6.0411, "mean_token_accuracy": 0.13638337776064874, "num_tokens": 3915816.0, "step": 2120 }, { "entropy": 6.22648811340332, "epoch": 0.17853392144507457, "grad_norm": 0.96484375, "learning_rate": 0.0004998992972637883, "loss": 6.1538, "mean_token_accuracy": 0.12582943066954613, "num_tokens": 3925162.0, "step": 2125 }, { "entropy": 6.284874153137207, "epoch": 0.17895400126023944, "grad_norm": 0.94921875, "learning_rate": 0.0004998983994068242, "loss": 6.0395, "mean_token_accuracy": 0.13122835606336594, "num_tokens": 3934476.0, "step": 2130 }, { "entropy": 6.186276054382324, "epoch": 0.17937408107540434, "grad_norm": 0.93359375, "learning_rate": 0.0004998974975659006, "loss": 6.0907, "mean_token_accuracy": 0.1297646477818489, "num_tokens": 3943501.0, "step": 2135 }, { "entropy": 6.205726194381714, "epoch": 0.1797941608905692, "grad_norm": 0.96484375, "learning_rate": 0.0004998965917410338, "loss": 6.0816, "mean_token_accuracy": 0.12778471410274506, "num_tokens": 3953663.0, "step": 2140 }, { "entropy": 6.211074018478394, "epoch": 0.18021424070573408, "grad_norm": 1.0078125, "learning_rate": 0.0004998956819322397, "loss": 6.0495, "mean_token_accuracy": 0.13608243688941002, "num_tokens": 3962634.0, "step": 2145 }, { "entropy": 6.177238512039184, "epoch": 0.18063432052089898, "grad_norm": 0.94921875, "learning_rate": 0.0004998947681395343, "loss": 6.052, "mean_token_accuracy": 0.13605224341154099, "num_tokens": 3972496.0, "step": 2150 }, { "entropy": 6.390697908401489, "epoch": 0.18105440033606385, "grad_norm": 1.03125, "learning_rate": 0.000499893850362934, "loss": 6.2977, "mean_token_accuracy": 0.12441082820296287, "num_tokens": 3980724.0, "step": 2155 }, { "entropy": 6.262918901443482, "epoch": 0.18147448015122875, "grad_norm": 0.96875, "learning_rate": 0.0004998929286024548, "loss": 6.1304, "mean_token_accuracy": 0.1300631955265999, "num_tokens": 3989842.0, "step": 2160 }, { "entropy": 6.230935716629029, "epoch": 0.18189455996639362, "grad_norm": 1.109375, "learning_rate": 0.0004998920028581133, "loss": 6.0378, "mean_token_accuracy": 0.14167480319738388, "num_tokens": 3998534.0, "step": 2165 }, { "entropy": 6.241239356994629, "epoch": 0.18231463978155849, "grad_norm": 0.9765625, "learning_rate": 0.0004998910731299258, "loss": 6.0631, "mean_token_accuracy": 0.13066420927643776, "num_tokens": 4007677.0, "step": 2170 }, { "entropy": 6.19789605140686, "epoch": 0.18273471959672338, "grad_norm": 1.0234375, "learning_rate": 0.0004998901394179085, "loss": 6.1007, "mean_token_accuracy": 0.12627347633242608, "num_tokens": 4016347.0, "step": 2175 }, { "entropy": 6.198655843734741, "epoch": 0.18315479941188825, "grad_norm": 1.046875, "learning_rate": 0.0004998892017220784, "loss": 5.9767, "mean_token_accuracy": 0.14088783264160157, "num_tokens": 4025199.0, "step": 2180 }, { "entropy": 6.262273931503296, "epoch": 0.18357487922705315, "grad_norm": 1.0859375, "learning_rate": 0.0004998882600424519, "loss": 6.0603, "mean_token_accuracy": 0.1286892294883728, "num_tokens": 4033933.0, "step": 2185 }, { "entropy": 6.162368822097778, "epoch": 0.18399495904221802, "grad_norm": 0.9609375, "learning_rate": 0.0004998873143790455, "loss": 5.9753, "mean_token_accuracy": 0.1438771367073059, "num_tokens": 4042891.0, "step": 2190 }, { "entropy": 6.274066638946533, "epoch": 0.1844150388573829, "grad_norm": 0.9609375, "learning_rate": 0.0004998863647318763, "loss": 6.1041, "mean_token_accuracy": 0.13264708146452903, "num_tokens": 4051123.0, "step": 2195 }, { "entropy": 6.144877004623413, "epoch": 0.1848351186725478, "grad_norm": 1.046875, "learning_rate": 0.0004998854111009608, "loss": 6.0715, "mean_token_accuracy": 0.12865814492106437, "num_tokens": 4060025.0, "step": 2200 }, { "entropy": 6.182585954666138, "epoch": 0.18525519848771266, "grad_norm": 0.90625, "learning_rate": 0.0004998844534863161, "loss": 5.991, "mean_token_accuracy": 0.1295328378677368, "num_tokens": 4069363.0, "step": 2205 }, { "entropy": 6.241155099868775, "epoch": 0.18567527830287756, "grad_norm": 0.99609375, "learning_rate": 0.0004998834918879592, "loss": 6.1376, "mean_token_accuracy": 0.133307021856308, "num_tokens": 4078855.0, "step": 2210 }, { "entropy": 6.206245565414429, "epoch": 0.18609535811804243, "grad_norm": 0.9453125, "learning_rate": 0.000499882526305907, "loss": 6.0804, "mean_token_accuracy": 0.12953457087278367, "num_tokens": 4087801.0, "step": 2215 }, { "entropy": 6.248236179351807, "epoch": 0.18651543793320732, "grad_norm": 0.91796875, "learning_rate": 0.0004998815567401765, "loss": 6.0926, "mean_token_accuracy": 0.1376325160264969, "num_tokens": 4096949.0, "step": 2220 }, { "entropy": 6.279425954818725, "epoch": 0.1869355177483722, "grad_norm": 1.03125, "learning_rate": 0.0004998805831907851, "loss": 6.0617, "mean_token_accuracy": 0.13082574903964997, "num_tokens": 4105399.0, "step": 2225 }, { "entropy": 6.169968605041504, "epoch": 0.18735559756353706, "grad_norm": 1.0078125, "learning_rate": 0.0004998796056577501, "loss": 6.0071, "mean_token_accuracy": 0.12926321402192115, "num_tokens": 4113873.0, "step": 2230 }, { "entropy": 6.154512643814087, "epoch": 0.18777567737870196, "grad_norm": 0.90625, "learning_rate": 0.0004998786241410886, "loss": 6.0586, "mean_token_accuracy": 0.13699585050344468, "num_tokens": 4123528.0, "step": 2235 }, { "entropy": 6.2988721370697025, "epoch": 0.18819575719386683, "grad_norm": 0.9140625, "learning_rate": 0.000499877638640818, "loss": 6.0699, "mean_token_accuracy": 0.13017342165112494, "num_tokens": 4133370.0, "step": 2240 }, { "entropy": 6.184452104568481, "epoch": 0.18861583700903173, "grad_norm": 0.94140625, "learning_rate": 0.000499876649156956, "loss": 5.9844, "mean_token_accuracy": 0.13666255846619607, "num_tokens": 4142370.0, "step": 2245 }, { "entropy": 6.133312082290649, "epoch": 0.1890359168241966, "grad_norm": 0.96875, "learning_rate": 0.0004998756556895196, "loss": 6.0725, "mean_token_accuracy": 0.1354515865445137, "num_tokens": 4152367.0, "step": 2250 }, { "entropy": 6.21663122177124, "epoch": 0.18945599663936147, "grad_norm": 1.0078125, "learning_rate": 0.000499874658238527, "loss": 6.0625, "mean_token_accuracy": 0.13495326191186904, "num_tokens": 4161126.0, "step": 2255 }, { "entropy": 6.186970901489258, "epoch": 0.18987607645452637, "grad_norm": 1.0078125, "learning_rate": 0.0004998736568039957, "loss": 5.9748, "mean_token_accuracy": 0.13723411411046982, "num_tokens": 4169910.0, "step": 2260 }, { "entropy": 6.1857301712036135, "epoch": 0.19029615626969124, "grad_norm": 0.9921875, "learning_rate": 0.0004998726513859432, "loss": 6.1067, "mean_token_accuracy": 0.12761787325143814, "num_tokens": 4179893.0, "step": 2265 }, { "entropy": 6.308238935470581, "epoch": 0.19071623608485613, "grad_norm": 0.9140625, "learning_rate": 0.0004998716419843875, "loss": 6.12, "mean_token_accuracy": 0.13745217099785806, "num_tokens": 4190065.0, "step": 2270 }, { "entropy": 6.090948486328125, "epoch": 0.191136315900021, "grad_norm": 1.015625, "learning_rate": 0.0004998706285993465, "loss": 6.0313, "mean_token_accuracy": 0.1420229621231556, "num_tokens": 4198395.0, "step": 2275 }, { "entropy": 6.282499647140503, "epoch": 0.19155639571518587, "grad_norm": 0.9453125, "learning_rate": 0.0004998696112308381, "loss": 6.0533, "mean_token_accuracy": 0.1310360386967659, "num_tokens": 4207555.0, "step": 2280 }, { "entropy": 6.088230180740356, "epoch": 0.19197647553035077, "grad_norm": 0.9296875, "learning_rate": 0.0004998685898788803, "loss": 5.9946, "mean_token_accuracy": 0.13536595478653907, "num_tokens": 4216533.0, "step": 2285 }, { "entropy": 6.274929618835449, "epoch": 0.19239655534551564, "grad_norm": 1.0390625, "learning_rate": 0.0004998675645434914, "loss": 6.1095, "mean_token_accuracy": 0.13767784610390663, "num_tokens": 4225575.0, "step": 2290 }, { "entropy": 6.153714513778686, "epoch": 0.19281663516068054, "grad_norm": 1.0234375, "learning_rate": 0.0004998665352246891, "loss": 5.8958, "mean_token_accuracy": 0.14245088025927544, "num_tokens": 4234306.0, "step": 2295 }, { "entropy": 6.08680305480957, "epoch": 0.1932367149758454, "grad_norm": 0.9609375, "learning_rate": 0.0004998655019224921, "loss": 6.0823, "mean_token_accuracy": 0.1359329827129841, "num_tokens": 4243998.0, "step": 2300 }, { "entropy": 6.237053394317627, "epoch": 0.19365679479101028, "grad_norm": 0.98828125, "learning_rate": 0.0004998644646369185, "loss": 5.9776, "mean_token_accuracy": 0.13352483361959458, "num_tokens": 4253653.0, "step": 2305 }, { "entropy": 6.139167737960816, "epoch": 0.19407687460617518, "grad_norm": 0.98828125, "learning_rate": 0.0004998634233679865, "loss": 6.0652, "mean_token_accuracy": 0.1278400629758835, "num_tokens": 4263305.0, "step": 2310 }, { "entropy": 6.127392339706421, "epoch": 0.19449695442134005, "grad_norm": 1.0078125, "learning_rate": 0.000499862378115715, "loss": 5.9342, "mean_token_accuracy": 0.14543856382369996, "num_tokens": 4272212.0, "step": 2315 }, { "entropy": 6.305202007293701, "epoch": 0.19491703423650494, "grad_norm": 1.0625, "learning_rate": 0.0004998613288801221, "loss": 6.1375, "mean_token_accuracy": 0.13151465207338334, "num_tokens": 4281445.0, "step": 2320 }, { "entropy": 6.2177956104278564, "epoch": 0.1953371140516698, "grad_norm": 0.9609375, "learning_rate": 0.0004998602756612267, "loss": 6.055, "mean_token_accuracy": 0.1372949168086052, "num_tokens": 4290938.0, "step": 2325 }, { "entropy": 6.175972557067871, "epoch": 0.1957571938668347, "grad_norm": 0.9765625, "learning_rate": 0.0004998592184590471, "loss": 6.0786, "mean_token_accuracy": 0.13233636021614076, "num_tokens": 4300022.0, "step": 2330 }, { "entropy": 6.134920358657837, "epoch": 0.19617727368199958, "grad_norm": 1.0, "learning_rate": 0.0004998581572736024, "loss": 5.9674, "mean_token_accuracy": 0.1363460712134838, "num_tokens": 4308910.0, "step": 2335 }, { "entropy": 6.092206907272339, "epoch": 0.19659735349716445, "grad_norm": 0.93359375, "learning_rate": 0.0004998570921049112, "loss": 5.9454, "mean_token_accuracy": 0.13969452679157257, "num_tokens": 4317136.0, "step": 2340 }, { "entropy": 6.112558746337891, "epoch": 0.19701743331232935, "grad_norm": 1.046875, "learning_rate": 0.0004998560229529924, "loss": 5.9993, "mean_token_accuracy": 0.1428337089717388, "num_tokens": 4326163.0, "step": 2345 }, { "entropy": 6.308993816375732, "epoch": 0.19743751312749422, "grad_norm": 0.97265625, "learning_rate": 0.0004998549498178649, "loss": 6.1402, "mean_token_accuracy": 0.13658420667052268, "num_tokens": 4335837.0, "step": 2350 }, { "entropy": 6.216946363449097, "epoch": 0.19785759294265912, "grad_norm": 1.09375, "learning_rate": 0.0004998538726995477, "loss": 6.0561, "mean_token_accuracy": 0.1374947391450405, "num_tokens": 4345108.0, "step": 2355 }, { "entropy": 6.217574787139893, "epoch": 0.198277672757824, "grad_norm": 0.953125, "learning_rate": 0.00049985279159806, "loss": 6.0722, "mean_token_accuracy": 0.1334306165575981, "num_tokens": 4353761.0, "step": 2360 }, { "entropy": 6.1630774974823, "epoch": 0.19869775257298886, "grad_norm": 0.99609375, "learning_rate": 0.0004998517065134208, "loss": 6.0354, "mean_token_accuracy": 0.13587109968066216, "num_tokens": 4363244.0, "step": 2365 }, { "entropy": 6.205533790588379, "epoch": 0.19911783238815375, "grad_norm": 0.92578125, "learning_rate": 0.0004998506174456494, "loss": 6.0386, "mean_token_accuracy": 0.13257589265704156, "num_tokens": 4373034.0, "step": 2370 }, { "entropy": 6.200410652160644, "epoch": 0.19953791220331862, "grad_norm": 0.90625, "learning_rate": 0.0004998495243947653, "loss": 5.9816, "mean_token_accuracy": 0.13029902279376984, "num_tokens": 4382554.0, "step": 2375 }, { "entropy": 6.191087865829468, "epoch": 0.19995799201848352, "grad_norm": 1.03125, "learning_rate": 0.0004998484273607875, "loss": 5.9843, "mean_token_accuracy": 0.14299238696694375, "num_tokens": 4391001.0, "step": 2380 }, { "entropy": 6.023518228530884, "epoch": 0.2003780718336484, "grad_norm": 0.9140625, "learning_rate": 0.0004998473263437356, "loss": 5.9141, "mean_token_accuracy": 0.13673870489001275, "num_tokens": 4400632.0, "step": 2385 }, { "entropy": 6.105119514465332, "epoch": 0.20079815164881326, "grad_norm": 0.97265625, "learning_rate": 0.000499846221343629, "loss": 6.0095, "mean_token_accuracy": 0.12952324375510216, "num_tokens": 4409565.0, "step": 2390 }, { "entropy": 6.128167533874512, "epoch": 0.20121823146397816, "grad_norm": 1.0234375, "learning_rate": 0.0004998451123604875, "loss": 5.944, "mean_token_accuracy": 0.14282809123396872, "num_tokens": 4418384.0, "step": 2395 }, { "entropy": 6.1983355522155765, "epoch": 0.20163831127914303, "grad_norm": 1.0546875, "learning_rate": 0.0004998439993943306, "loss": 6.0692, "mean_token_accuracy": 0.1389256276190281, "num_tokens": 4427581.0, "step": 2400 }, { "entropy": 6.267655086517334, "epoch": 0.20205839109430793, "grad_norm": 1.0078125, "learning_rate": 0.0004998428824451779, "loss": 6.0521, "mean_token_accuracy": 0.1341543450951576, "num_tokens": 4436572.0, "step": 2405 }, { "entropy": 6.1763083934783936, "epoch": 0.2024784709094728, "grad_norm": 1.0078125, "learning_rate": 0.0004998417615130495, "loss": 6.055, "mean_token_accuracy": 0.13537125810980796, "num_tokens": 4445230.0, "step": 2410 }, { "entropy": 6.247248315811158, "epoch": 0.2028985507246377, "grad_norm": 0.98046875, "learning_rate": 0.0004998406365979649, "loss": 6.1134, "mean_token_accuracy": 0.13383878991007805, "num_tokens": 4454251.0, "step": 2415 }, { "entropy": 6.136447811126709, "epoch": 0.20331863053980256, "grad_norm": 0.9375, "learning_rate": 0.0004998395076999443, "loss": 5.9699, "mean_token_accuracy": 0.13695907220244408, "num_tokens": 4463949.0, "step": 2420 }, { "entropy": 6.227413558959961, "epoch": 0.20373871035496743, "grad_norm": 1.03125, "learning_rate": 0.0004998383748190076, "loss": 6.1649, "mean_token_accuracy": 0.12917085587978364, "num_tokens": 4473373.0, "step": 2425 }, { "entropy": 6.249214363098145, "epoch": 0.20415879017013233, "grad_norm": 1.0234375, "learning_rate": 0.0004998372379551748, "loss": 5.9842, "mean_token_accuracy": 0.1414948470890522, "num_tokens": 4482303.0, "step": 2430 }, { "entropy": 6.117572832107544, "epoch": 0.2045788699852972, "grad_norm": 0.9765625, "learning_rate": 0.0004998360971084663, "loss": 5.9567, "mean_token_accuracy": 0.1317524030804634, "num_tokens": 4491214.0, "step": 2435 }, { "entropy": 6.057681226730347, "epoch": 0.2049989498004621, "grad_norm": 0.97265625, "learning_rate": 0.0004998349522789019, "loss": 5.8856, "mean_token_accuracy": 0.14377139806747435, "num_tokens": 4500099.0, "step": 2440 }, { "entropy": 6.115459060668945, "epoch": 0.20541902961562697, "grad_norm": 0.96875, "learning_rate": 0.0004998338034665021, "loss": 5.9692, "mean_token_accuracy": 0.1437109664082527, "num_tokens": 4509893.0, "step": 2445 }, { "entropy": 6.08744249343872, "epoch": 0.20583910943079184, "grad_norm": 0.98828125, "learning_rate": 0.0004998326506712872, "loss": 5.9375, "mean_token_accuracy": 0.13774847760796546, "num_tokens": 4518606.0, "step": 2450 }, { "entropy": 6.11673412322998, "epoch": 0.20625918924595674, "grad_norm": 0.99609375, "learning_rate": 0.0004998314938932778, "loss": 6.0218, "mean_token_accuracy": 0.14001012295484544, "num_tokens": 4528392.0, "step": 2455 }, { "entropy": 6.221143388748169, "epoch": 0.2066792690611216, "grad_norm": 0.96875, "learning_rate": 0.0004998303331324943, "loss": 5.9923, "mean_token_accuracy": 0.13821439668536187, "num_tokens": 4536983.0, "step": 2460 }, { "entropy": 6.041988134384155, "epoch": 0.2070993488762865, "grad_norm": 0.96875, "learning_rate": 0.0004998291683889571, "loss": 5.9145, "mean_token_accuracy": 0.1391140677034855, "num_tokens": 4544967.0, "step": 2465 }, { "entropy": 6.134957313537598, "epoch": 0.20751942869145137, "grad_norm": 1.0234375, "learning_rate": 0.000499827999662687, "loss": 5.9727, "mean_token_accuracy": 0.13200750946998596, "num_tokens": 4554646.0, "step": 2470 }, { "entropy": 6.192252588272095, "epoch": 0.20793950850661624, "grad_norm": 0.9453125, "learning_rate": 0.0004998268269537046, "loss": 5.9954, "mean_token_accuracy": 0.1370847873389721, "num_tokens": 4564040.0, "step": 2475 }, { "entropy": 6.091167068481445, "epoch": 0.20835958832178114, "grad_norm": 0.96875, "learning_rate": 0.0004998256502620308, "loss": 6.0187, "mean_token_accuracy": 0.14094985872507096, "num_tokens": 4573758.0, "step": 2480 }, { "entropy": 6.206011056900024, "epoch": 0.208779668136946, "grad_norm": 0.92578125, "learning_rate": 0.0004998244695876864, "loss": 6.0452, "mean_token_accuracy": 0.13380730673670768, "num_tokens": 4582097.0, "step": 2485 }, { "entropy": 6.0949585914611815, "epoch": 0.2091997479521109, "grad_norm": 1.015625, "learning_rate": 0.0004998232849306921, "loss": 6.0055, "mean_token_accuracy": 0.13993047401309014, "num_tokens": 4590687.0, "step": 2490 }, { "entropy": 6.1933338165283205, "epoch": 0.20961982776727578, "grad_norm": 0.9765625, "learning_rate": 0.0004998220962910693, "loss": 5.9965, "mean_token_accuracy": 0.13453714549541473, "num_tokens": 4599497.0, "step": 2495 }, { "entropy": 6.101396179199218, "epoch": 0.21003990758244068, "grad_norm": 1.0390625, "learning_rate": 0.0004998209036688386, "loss": 5.9532, "mean_token_accuracy": 0.13716981932520866, "num_tokens": 4607958.0, "step": 2500 }, { "entropy": 6.216299772262573, "epoch": 0.21045998739760555, "grad_norm": 0.96484375, "learning_rate": 0.0004998197070640216, "loss": 6.0812, "mean_token_accuracy": 0.1314453199505806, "num_tokens": 4617515.0, "step": 2505 }, { "entropy": 6.2111225605010985, "epoch": 0.21088006721277042, "grad_norm": 0.9765625, "learning_rate": 0.0004998185064766391, "loss": 5.9892, "mean_token_accuracy": 0.135587390512228, "num_tokens": 4627037.0, "step": 2510 }, { "entropy": 6.083059787750244, "epoch": 0.21130014702793531, "grad_norm": 0.91015625, "learning_rate": 0.0004998173019067127, "loss": 5.9864, "mean_token_accuracy": 0.13536423593759536, "num_tokens": 4637393.0, "step": 2515 }, { "entropy": 6.111885261535645, "epoch": 0.21172022684310018, "grad_norm": 0.98828125, "learning_rate": 0.0004998160933542633, "loss": 6.0252, "mean_token_accuracy": 0.12426691725850106, "num_tokens": 4646832.0, "step": 2520 }, { "entropy": 6.200415229797363, "epoch": 0.21214030665826508, "grad_norm": 1.0703125, "learning_rate": 0.0004998148808193128, "loss": 6.0364, "mean_token_accuracy": 0.1378290109336376, "num_tokens": 4655719.0, "step": 2525 }, { "entropy": 6.140298128128052, "epoch": 0.21256038647342995, "grad_norm": 0.953125, "learning_rate": 0.0004998136643018823, "loss": 5.9978, "mean_token_accuracy": 0.1409161224961281, "num_tokens": 4665364.0, "step": 2530 }, { "entropy": 6.113859462738037, "epoch": 0.21298046628859482, "grad_norm": 1.0234375, "learning_rate": 0.0004998124438019935, "loss": 5.9707, "mean_token_accuracy": 0.13255369514226914, "num_tokens": 4674760.0, "step": 2535 }, { "entropy": 6.032169342041016, "epoch": 0.21340054610375972, "grad_norm": 0.9375, "learning_rate": 0.0004998112193196681, "loss": 5.8954, "mean_token_accuracy": 0.1398087151348591, "num_tokens": 4683900.0, "step": 2540 }, { "entropy": 6.009505701065064, "epoch": 0.2138206259189246, "grad_norm": 0.98046875, "learning_rate": 0.0004998099908549277, "loss": 5.9487, "mean_token_accuracy": 0.1326383799314499, "num_tokens": 4693915.0, "step": 2545 }, { "entropy": 6.048102998733521, "epoch": 0.2142407057340895, "grad_norm": 0.98046875, "learning_rate": 0.000499808758407794, "loss": 5.7948, "mean_token_accuracy": 0.1494914174079895, "num_tokens": 4703102.0, "step": 2550 }, { "entropy": 6.130202150344848, "epoch": 0.21466078554925436, "grad_norm": 0.96875, "learning_rate": 0.0004998075219782889, "loss": 6.0201, "mean_token_accuracy": 0.13604088351130486, "num_tokens": 4712925.0, "step": 2555 }, { "entropy": 6.086578845977783, "epoch": 0.21508086536441923, "grad_norm": 1.0078125, "learning_rate": 0.0004998062815664344, "loss": 5.9508, "mean_token_accuracy": 0.13391971811652184, "num_tokens": 4722641.0, "step": 2560 }, { "entropy": 6.060202693939209, "epoch": 0.21550094517958412, "grad_norm": 0.9375, "learning_rate": 0.0004998050371722524, "loss": 6.028, "mean_token_accuracy": 0.13827937468886375, "num_tokens": 4732603.0, "step": 2565 }, { "entropy": 6.060051965713501, "epoch": 0.215921024994749, "grad_norm": 0.90625, "learning_rate": 0.0004998037887957649, "loss": 5.8655, "mean_token_accuracy": 0.1426350235939026, "num_tokens": 4742644.0, "step": 2570 }, { "entropy": 6.2458967685699465, "epoch": 0.2163411048099139, "grad_norm": 0.9765625, "learning_rate": 0.0004998025364369939, "loss": 6.1759, "mean_token_accuracy": 0.1332129217684269, "num_tokens": 4751482.0, "step": 2575 }, { "entropy": 6.246464967727661, "epoch": 0.21676118462507876, "grad_norm": 1.03125, "learning_rate": 0.0004998012800959619, "loss": 6.0435, "mean_token_accuracy": 0.13494925051927567, "num_tokens": 4760593.0, "step": 2580 }, { "entropy": 6.139482402801514, "epoch": 0.21718126444024366, "grad_norm": 1.046875, "learning_rate": 0.0004998000197726909, "loss": 6.041, "mean_token_accuracy": 0.14071242287755012, "num_tokens": 4769294.0, "step": 2585 }, { "entropy": 6.151182079315186, "epoch": 0.21760134425540853, "grad_norm": 0.87890625, "learning_rate": 0.0004997987554672033, "loss": 5.9433, "mean_token_accuracy": 0.13458855599164962, "num_tokens": 4779239.0, "step": 2590 }, { "entropy": 6.153560495376587, "epoch": 0.2180214240705734, "grad_norm": 0.921875, "learning_rate": 0.0004997974871795215, "loss": 6.0165, "mean_token_accuracy": 0.13904761373996735, "num_tokens": 4788211.0, "step": 2595 }, { "entropy": 6.1266923427581785, "epoch": 0.2184415038857383, "grad_norm": 0.87109375, "learning_rate": 0.000499796214909668, "loss": 5.9707, "mean_token_accuracy": 0.14307306259870528, "num_tokens": 4797921.0, "step": 2600 }, { "entropy": 6.151721715927124, "epoch": 0.21886158370090317, "grad_norm": 0.97265625, "learning_rate": 0.0004997949386576653, "loss": 5.9792, "mean_token_accuracy": 0.1372672997415066, "num_tokens": 4807772.0, "step": 2605 }, { "entropy": 5.999966764450074, "epoch": 0.21928166351606806, "grad_norm": 0.9375, "learning_rate": 0.000499793658423536, "loss": 6.0037, "mean_token_accuracy": 0.13394766226410865, "num_tokens": 4817999.0, "step": 2610 }, { "entropy": 6.197027158737183, "epoch": 0.21970174333123293, "grad_norm": 1.0625, "learning_rate": 0.0004997923742073028, "loss": 5.9552, "mean_token_accuracy": 0.14477612674236298, "num_tokens": 4826679.0, "step": 2615 }, { "entropy": 6.0403674125671385, "epoch": 0.2201218231463978, "grad_norm": 1.015625, "learning_rate": 0.0004997910860089884, "loss": 5.9647, "mean_token_accuracy": 0.13903913348913194, "num_tokens": 4834998.0, "step": 2620 }, { "entropy": 6.119702100753784, "epoch": 0.2205419029615627, "grad_norm": 1.0234375, "learning_rate": 0.0004997897938286156, "loss": 5.9173, "mean_token_accuracy": 0.13934070989489555, "num_tokens": 4843635.0, "step": 2625 }, { "entropy": 6.135205316543579, "epoch": 0.22096198277672757, "grad_norm": 1.0859375, "learning_rate": 0.0004997884976662075, "loss": 6.0334, "mean_token_accuracy": 0.13847846239805223, "num_tokens": 4852027.0, "step": 2630 }, { "entropy": 6.115947484970093, "epoch": 0.22138206259189247, "grad_norm": 1.0390625, "learning_rate": 0.0004997871975217868, "loss": 5.9555, "mean_token_accuracy": 0.1428781971335411, "num_tokens": 4861244.0, "step": 2635 }, { "entropy": 6.043252468109131, "epoch": 0.22180214240705734, "grad_norm": 0.95703125, "learning_rate": 0.0004997858933953768, "loss": 5.8579, "mean_token_accuracy": 0.14281281381845473, "num_tokens": 4869902.0, "step": 2640 }, { "entropy": 6.012739181518555, "epoch": 0.2222222222222222, "grad_norm": 0.95703125, "learning_rate": 0.0004997845852870004, "loss": 5.8421, "mean_token_accuracy": 0.1463296964764595, "num_tokens": 4878502.0, "step": 2645 }, { "entropy": 6.089871215820312, "epoch": 0.2226423020373871, "grad_norm": 0.9765625, "learning_rate": 0.0004997832731966806, "loss": 5.9032, "mean_token_accuracy": 0.14714645445346833, "num_tokens": 4888348.0, "step": 2650 }, { "entropy": 6.06225700378418, "epoch": 0.22306238185255198, "grad_norm": 1.015625, "learning_rate": 0.0004997819571244411, "loss": 5.972, "mean_token_accuracy": 0.1450254276394844, "num_tokens": 4897302.0, "step": 2655 }, { "entropy": 6.0446860790252686, "epoch": 0.22348246166771688, "grad_norm": 1.0, "learning_rate": 0.0004997806370703049, "loss": 5.9876, "mean_token_accuracy": 0.14430617392063141, "num_tokens": 4907078.0, "step": 2660 }, { "entropy": 6.057806348800659, "epoch": 0.22390254148288175, "grad_norm": 0.8671875, "learning_rate": 0.0004997793130342954, "loss": 5.8272, "mean_token_accuracy": 0.1456086441874504, "num_tokens": 4917489.0, "step": 2665 }, { "entropy": 5.973814630508423, "epoch": 0.22432262129804661, "grad_norm": 0.9765625, "learning_rate": 0.0004997779850164363, "loss": 5.9156, "mean_token_accuracy": 0.140571466088295, "num_tokens": 4927073.0, "step": 2670 }, { "entropy": 6.177860355377197, "epoch": 0.2247427011132115, "grad_norm": 0.98828125, "learning_rate": 0.0004997766530167508, "loss": 6.019, "mean_token_accuracy": 0.1344543881714344, "num_tokens": 4935464.0, "step": 2675 }, { "entropy": 6.22092981338501, "epoch": 0.22516278092837638, "grad_norm": 1.0078125, "learning_rate": 0.0004997753170352627, "loss": 6.0914, "mean_token_accuracy": 0.13605839386582375, "num_tokens": 4944718.0, "step": 2680 }, { "entropy": 6.105925226211548, "epoch": 0.22558286074354128, "grad_norm": 1.03125, "learning_rate": 0.0004997739770719955, "loss": 5.9844, "mean_token_accuracy": 0.13587288782000542, "num_tokens": 4954223.0, "step": 2685 }, { "entropy": 6.107930469512939, "epoch": 0.22600294055870615, "grad_norm": 0.921875, "learning_rate": 0.000499772633126973, "loss": 6.0132, "mean_token_accuracy": 0.13594387769699096, "num_tokens": 4963371.0, "step": 2690 }, { "entropy": 6.04271125793457, "epoch": 0.22642302037387105, "grad_norm": 0.98046875, "learning_rate": 0.0004997712852002192, "loss": 5.8679, "mean_token_accuracy": 0.1471228800714016, "num_tokens": 4972973.0, "step": 2695 }, { "entropy": 6.086397647857666, "epoch": 0.22684310018903592, "grad_norm": 1.0234375, "learning_rate": 0.0004997699332917578, "loss": 6.1119, "mean_token_accuracy": 0.12916670590639115, "num_tokens": 4982808.0, "step": 2700 }, { "entropy": 6.201492786407471, "epoch": 0.2272631800042008, "grad_norm": 0.94140625, "learning_rate": 0.0004997685774016127, "loss": 5.9896, "mean_token_accuracy": 0.13685485795140268, "num_tokens": 4992427.0, "step": 2705 }, { "entropy": 6.162964010238648, "epoch": 0.22768325981936569, "grad_norm": 0.84375, "learning_rate": 0.000499767217529808, "loss": 6.1604, "mean_token_accuracy": 0.12921097874641418, "num_tokens": 5003562.0, "step": 2710 }, { "entropy": 6.098525857925415, "epoch": 0.22810333963453056, "grad_norm": 0.890625, "learning_rate": 0.0004997658536763678, "loss": 5.8638, "mean_token_accuracy": 0.1451013281941414, "num_tokens": 5013429.0, "step": 2715 }, { "entropy": 6.117339611053467, "epoch": 0.22852341944969545, "grad_norm": 0.953125, "learning_rate": 0.0004997644858413163, "loss": 6.0022, "mean_token_accuracy": 0.14247513711452484, "num_tokens": 5022045.0, "step": 2720 }, { "entropy": 6.008642053604126, "epoch": 0.22894349926486032, "grad_norm": 0.88671875, "learning_rate": 0.0004997631140246775, "loss": 5.8287, "mean_token_accuracy": 0.14408515840768815, "num_tokens": 5032260.0, "step": 2725 }, { "entropy": 6.021863174438477, "epoch": 0.2293635790800252, "grad_norm": 0.9453125, "learning_rate": 0.000499761738226476, "loss": 5.8626, "mean_token_accuracy": 0.14258013665676117, "num_tokens": 5041688.0, "step": 2730 }, { "entropy": 6.056025457382202, "epoch": 0.2297836588951901, "grad_norm": 0.9765625, "learning_rate": 0.000499760358446736, "loss": 5.9702, "mean_token_accuracy": 0.13718490228056907, "num_tokens": 5051005.0, "step": 2735 }, { "entropy": 6.152891635894775, "epoch": 0.23020373871035496, "grad_norm": 0.96484375, "learning_rate": 0.000499758974685482, "loss": 5.9147, "mean_token_accuracy": 0.13967233374714852, "num_tokens": 5060084.0, "step": 2740 }, { "entropy": 6.059838390350341, "epoch": 0.23062381852551986, "grad_norm": 1.0859375, "learning_rate": 0.0004997575869427385, "loss": 5.9122, "mean_token_accuracy": 0.14734914749860764, "num_tokens": 5069081.0, "step": 2745 }, { "entropy": 6.0928624153137205, "epoch": 0.23104389834068473, "grad_norm": 0.9609375, "learning_rate": 0.00049975619521853, "loss": 5.9121, "mean_token_accuracy": 0.13845374211668968, "num_tokens": 5078597.0, "step": 2750 }, { "entropy": 6.052087306976318, "epoch": 0.2314639781558496, "grad_norm": 0.953125, "learning_rate": 0.0004997547995128814, "loss": 5.9554, "mean_token_accuracy": 0.14530446976423264, "num_tokens": 5087607.0, "step": 2755 }, { "entropy": 6.094136476516724, "epoch": 0.2318840579710145, "grad_norm": 1.078125, "learning_rate": 0.0004997533998258171, "loss": 5.9424, "mean_token_accuracy": 0.14329736083745956, "num_tokens": 5097412.0, "step": 2760 }, { "entropy": 6.16567211151123, "epoch": 0.23230413778617937, "grad_norm": 0.984375, "learning_rate": 0.0004997519961573622, "loss": 6.0152, "mean_token_accuracy": 0.13348544016480446, "num_tokens": 5105817.0, "step": 2765 }, { "entropy": 6.226717376708985, "epoch": 0.23272421760134426, "grad_norm": 1.0625, "learning_rate": 0.0004997505885075414, "loss": 6.0522, "mean_token_accuracy": 0.13480133637785913, "num_tokens": 5114958.0, "step": 2770 }, { "entropy": 6.084324312210083, "epoch": 0.23314429741650913, "grad_norm": 0.9609375, "learning_rate": 0.0004997491768763795, "loss": 5.9898, "mean_token_accuracy": 0.13868246227502823, "num_tokens": 5123728.0, "step": 2775 }, { "entropy": 6.100927209854126, "epoch": 0.23356437723167403, "grad_norm": 0.9921875, "learning_rate": 0.0004997477612639018, "loss": 6.0218, "mean_token_accuracy": 0.13395264372229576, "num_tokens": 5134099.0, "step": 2780 }, { "entropy": 6.162116241455078, "epoch": 0.2339844570468389, "grad_norm": 1.0, "learning_rate": 0.0004997463416701332, "loss": 6.0325, "mean_token_accuracy": 0.13172747194766998, "num_tokens": 5142934.0, "step": 2785 }, { "entropy": 6.000607919692993, "epoch": 0.23440453686200377, "grad_norm": 0.99609375, "learning_rate": 0.0004997449180950989, "loss": 5.8681, "mean_token_accuracy": 0.15649961084127426, "num_tokens": 5151835.0, "step": 2790 }, { "entropy": 6.038245487213135, "epoch": 0.23482461667716867, "grad_norm": 0.9140625, "learning_rate": 0.0004997434905388241, "loss": 5.921, "mean_token_accuracy": 0.1477814018726349, "num_tokens": 5161136.0, "step": 2795 }, { "entropy": 6.029763174057007, "epoch": 0.23524469649233354, "grad_norm": 0.921875, "learning_rate": 0.000499742059001334, "loss": 5.8684, "mean_token_accuracy": 0.14450337663292884, "num_tokens": 5170741.0, "step": 2800 }, { "entropy": 6.046102046966553, "epoch": 0.23566477630749844, "grad_norm": 0.9921875, "learning_rate": 0.0004997406234826541, "loss": 5.9001, "mean_token_accuracy": 0.14729267880320548, "num_tokens": 5180549.0, "step": 2805 }, { "entropy": 5.980107164382934, "epoch": 0.2360848561226633, "grad_norm": 0.88671875, "learning_rate": 0.0004997391839828098, "loss": 5.8667, "mean_token_accuracy": 0.14962306916713713, "num_tokens": 5189486.0, "step": 2810 }, { "entropy": 6.044159746170044, "epoch": 0.23650493593782818, "grad_norm": 0.96484375, "learning_rate": 0.0004997377405018266, "loss": 5.9303, "mean_token_accuracy": 0.13750530928373336, "num_tokens": 5198525.0, "step": 2815 }, { "entropy": 6.075648498535156, "epoch": 0.23692501575299307, "grad_norm": 0.99609375, "learning_rate": 0.00049973629303973, "loss": 5.9734, "mean_token_accuracy": 0.14086321070790292, "num_tokens": 5207124.0, "step": 2820 }, { "entropy": 5.964286422729492, "epoch": 0.23734509556815794, "grad_norm": 0.8984375, "learning_rate": 0.0004997348415965457, "loss": 5.8079, "mean_token_accuracy": 0.14603810012340546, "num_tokens": 5216529.0, "step": 2825 }, { "entropy": 6.12622709274292, "epoch": 0.23776517538332284, "grad_norm": 1.03125, "learning_rate": 0.0004997333861722995, "loss": 5.9402, "mean_token_accuracy": 0.14331007972359658, "num_tokens": 5225796.0, "step": 2830 }, { "entropy": 6.085462188720703, "epoch": 0.2381852551984877, "grad_norm": 1.0703125, "learning_rate": 0.000499731926767017, "loss": 5.9732, "mean_token_accuracy": 0.14003979936242103, "num_tokens": 5233876.0, "step": 2835 }, { "entropy": 6.016348743438721, "epoch": 0.23860533501365258, "grad_norm": 0.9375, "learning_rate": 0.0004997304633807242, "loss": 5.9695, "mean_token_accuracy": 0.13823127001523972, "num_tokens": 5244782.0, "step": 2840 }, { "entropy": 6.077929925918579, "epoch": 0.23902541482881748, "grad_norm": 0.99609375, "learning_rate": 0.0004997289960134468, "loss": 5.8993, "mean_token_accuracy": 0.14192162305116654, "num_tokens": 5253453.0, "step": 2845 }, { "entropy": 6.049857330322266, "epoch": 0.23944549464398235, "grad_norm": 1.0546875, "learning_rate": 0.0004997275246652111, "loss": 5.9414, "mean_token_accuracy": 0.14183279648423194, "num_tokens": 5262355.0, "step": 2850 }, { "entropy": 6.019342088699341, "epoch": 0.23986557445914725, "grad_norm": 1.0, "learning_rate": 0.000499726049336043, "loss": 5.8652, "mean_token_accuracy": 0.14227822795510292, "num_tokens": 5271959.0, "step": 2855 }, { "entropy": 6.045290803909301, "epoch": 0.24028565427431212, "grad_norm": 1.0546875, "learning_rate": 0.0004997245700259686, "loss": 5.8938, "mean_token_accuracy": 0.14394148513674737, "num_tokens": 5281393.0, "step": 2860 }, { "entropy": 6.126777935028076, "epoch": 0.240705734089477, "grad_norm": 0.921875, "learning_rate": 0.0004997230867350141, "loss": 6.0153, "mean_token_accuracy": 0.13795892894268036, "num_tokens": 5290979.0, "step": 2865 }, { "entropy": 6.170654964447022, "epoch": 0.24112581390464188, "grad_norm": 0.9921875, "learning_rate": 0.0004997215994632059, "loss": 5.9662, "mean_token_accuracy": 0.1420626498758793, "num_tokens": 5300263.0, "step": 2870 }, { "entropy": 6.098070096969605, "epoch": 0.24154589371980675, "grad_norm": 0.94921875, "learning_rate": 0.0004997201082105704, "loss": 5.9973, "mean_token_accuracy": 0.1376795694231987, "num_tokens": 5309522.0, "step": 2875 }, { "entropy": 6.09854941368103, "epoch": 0.24196597353497165, "grad_norm": 1.03125, "learning_rate": 0.0004997186129771338, "loss": 5.9906, "mean_token_accuracy": 0.1443823680281639, "num_tokens": 5319770.0, "step": 2880 }, { "entropy": 6.159392309188843, "epoch": 0.24238605335013652, "grad_norm": 1.015625, "learning_rate": 0.0004997171137629226, "loss": 5.9994, "mean_token_accuracy": 0.14119460731744765, "num_tokens": 5328400.0, "step": 2885 }, { "entropy": 6.00137939453125, "epoch": 0.24280613316530142, "grad_norm": 1.03125, "learning_rate": 0.0004997156105679636, "loss": 5.8054, "mean_token_accuracy": 0.15445883423089982, "num_tokens": 5336338.0, "step": 2890 }, { "entropy": 5.9904273509979244, "epoch": 0.2432262129804663, "grad_norm": 0.97265625, "learning_rate": 0.0004997141033922832, "loss": 5.8983, "mean_token_accuracy": 0.1381608746945858, "num_tokens": 5345391.0, "step": 2895 }, { "entropy": 6.080091238021851, "epoch": 0.24364629279563116, "grad_norm": 0.9921875, "learning_rate": 0.0004997125922359081, "loss": 5.9345, "mean_token_accuracy": 0.13472433462738992, "num_tokens": 5354709.0, "step": 2900 }, { "entropy": 6.0483152866363525, "epoch": 0.24406637261079606, "grad_norm": 1.0, "learning_rate": 0.0004997110770988652, "loss": 5.8441, "mean_token_accuracy": 0.14647466093301773, "num_tokens": 5363738.0, "step": 2905 }, { "entropy": 6.065390634536743, "epoch": 0.24448645242596093, "grad_norm": 1.078125, "learning_rate": 0.0004997095579811813, "loss": 5.9742, "mean_token_accuracy": 0.14132302552461623, "num_tokens": 5373583.0, "step": 2910 }, { "entropy": 6.1408384323120115, "epoch": 0.24490653224112582, "grad_norm": 0.875, "learning_rate": 0.0004997080348828833, "loss": 6.0104, "mean_token_accuracy": 0.14406906738877295, "num_tokens": 5383486.0, "step": 2915 }, { "entropy": 6.012083101272583, "epoch": 0.2453266120562907, "grad_norm": 1.0390625, "learning_rate": 0.0004997065078039981, "loss": 5.9283, "mean_token_accuracy": 0.13883504942059516, "num_tokens": 5391974.0, "step": 2920 }, { "entropy": 6.098450088500977, "epoch": 0.24574669187145556, "grad_norm": 1.03125, "learning_rate": 0.0004997049767445529, "loss": 5.9688, "mean_token_accuracy": 0.13587900176644324, "num_tokens": 5400882.0, "step": 2925 }, { "entropy": 6.1687455654144285, "epoch": 0.24616677168662046, "grad_norm": 0.96484375, "learning_rate": 0.0004997034417045746, "loss": 5.9199, "mean_token_accuracy": 0.13755179792642594, "num_tokens": 5410538.0, "step": 2930 }, { "entropy": 6.019326400756836, "epoch": 0.24658685150178533, "grad_norm": 0.99609375, "learning_rate": 0.0004997019026840907, "loss": 5.8134, "mean_token_accuracy": 0.14420632421970367, "num_tokens": 5419406.0, "step": 2935 }, { "entropy": 5.9686970710754395, "epoch": 0.24700693131695023, "grad_norm": 0.98046875, "learning_rate": 0.0004997003596831282, "loss": 5.941, "mean_token_accuracy": 0.13971618413925171, "num_tokens": 5428817.0, "step": 2940 }, { "entropy": 6.097631120681763, "epoch": 0.2474270111321151, "grad_norm": 0.98828125, "learning_rate": 0.0004996988127017145, "loss": 5.9448, "mean_token_accuracy": 0.13872243240475654, "num_tokens": 5438277.0, "step": 2945 }, { "entropy": 6.047083616256714, "epoch": 0.24784709094728, "grad_norm": 1.0234375, "learning_rate": 0.0004996972617398772, "loss": 5.974, "mean_token_accuracy": 0.13909853398799896, "num_tokens": 5447440.0, "step": 2950 }, { "entropy": 6.065885257720947, "epoch": 0.24826717076244487, "grad_norm": 0.98828125, "learning_rate": 0.0004996957067976435, "loss": 5.9005, "mean_token_accuracy": 0.13819090723991395, "num_tokens": 5455988.0, "step": 2955 }, { "entropy": 6.079396390914917, "epoch": 0.24868725057760974, "grad_norm": 0.96875, "learning_rate": 0.0004996941478750411, "loss": 5.895, "mean_token_accuracy": 0.14170320481061935, "num_tokens": 5464996.0, "step": 2960 }, { "entropy": 6.131442737579346, "epoch": 0.24910733039277463, "grad_norm": 0.9140625, "learning_rate": 0.0004996925849720975, "loss": 6.0433, "mean_token_accuracy": 0.13297844752669336, "num_tokens": 5474174.0, "step": 2965 }, { "entropy": 6.144496154785156, "epoch": 0.2495274102079395, "grad_norm": 1.0390625, "learning_rate": 0.0004996910180888405, "loss": 5.928, "mean_token_accuracy": 0.14379495605826378, "num_tokens": 5482838.0, "step": 2970 }, { "entropy": 6.089239263534546, "epoch": 0.2499474900231044, "grad_norm": 0.9609375, "learning_rate": 0.0004996894472252977, "loss": 5.9339, "mean_token_accuracy": 0.1420593172311783, "num_tokens": 5491616.0, "step": 2975 }, { "entropy": 5.992457008361816, "epoch": 0.25036756983826924, "grad_norm": 0.94921875, "learning_rate": 0.0004996878723814973, "loss": 5.9265, "mean_token_accuracy": 0.13892921283841134, "num_tokens": 5500942.0, "step": 2980 }, { "entropy": 6.117427587509155, "epoch": 0.25078764965343414, "grad_norm": 0.94921875, "learning_rate": 0.0004996862935574667, "loss": 5.8788, "mean_token_accuracy": 0.13912170454859735, "num_tokens": 5510078.0, "step": 2985 }, { "entropy": 5.943054437637329, "epoch": 0.25120772946859904, "grad_norm": 0.94140625, "learning_rate": 0.0004996847107532342, "loss": 5.9134, "mean_token_accuracy": 0.14340257570147513, "num_tokens": 5518924.0, "step": 2990 }, { "entropy": 6.108536148071289, "epoch": 0.25162780928376394, "grad_norm": 0.93359375, "learning_rate": 0.0004996831239688277, "loss": 5.9216, "mean_token_accuracy": 0.13749035373330115, "num_tokens": 5527385.0, "step": 2995 }, { "entropy": 5.977105903625488, "epoch": 0.2520478890989288, "grad_norm": 0.95703125, "learning_rate": 0.0004996815332042754, "loss": 5.766, "mean_token_accuracy": 0.15047305673360825, "num_tokens": 5536781.0, "step": 3000 }, { "epoch": 0.2520478890989288, "eval_entropy": 5.7445289912557636, "eval_loss": 5.931798458099365, "eval_mean_token_accuracy": 0.1480788363722414, "eval_num_tokens": 5536781.0, "eval_runtime": 21.0325, "eval_samples_per_second": 1776.586, "eval_steps_per_second": 222.085, "step": 3000 }, { "entropy": 6.008361387252807, "epoch": 0.2524679689140937, "grad_norm": 0.96484375, "learning_rate": 0.0004996799384596054, "loss": 5.9477, "mean_token_accuracy": 0.14386533573269844, "num_tokens": 5545893.0, "step": 3005 }, { "entropy": 6.112303066253662, "epoch": 0.2528880487292586, "grad_norm": 0.90625, "learning_rate": 0.0004996783397348461, "loss": 5.9152, "mean_token_accuracy": 0.13690555915236474, "num_tokens": 5555818.0, "step": 3010 }, { "entropy": 6.042035245895386, "epoch": 0.2533081285444234, "grad_norm": 0.8671875, "learning_rate": 0.0004996767370300256, "loss": 5.8717, "mean_token_accuracy": 0.14453656524419783, "num_tokens": 5565331.0, "step": 3015 }, { "entropy": 6.081929445266724, "epoch": 0.2537282083595883, "grad_norm": 1.0, "learning_rate": 0.0004996751303451724, "loss": 5.8599, "mean_token_accuracy": 0.14481035768985748, "num_tokens": 5574003.0, "step": 3020 }, { "entropy": 5.977067756652832, "epoch": 0.2541482881747532, "grad_norm": 0.9765625, "learning_rate": 0.0004996735196803149, "loss": 5.7815, "mean_token_accuracy": 0.15307400673627852, "num_tokens": 5582517.0, "step": 3025 }, { "entropy": 6.072621822357178, "epoch": 0.2545683679899181, "grad_norm": 0.875, "learning_rate": 0.0004996719050354818, "loss": 5.9948, "mean_token_accuracy": 0.13989571258425712, "num_tokens": 5591952.0, "step": 3030 }, { "entropy": 6.03379979133606, "epoch": 0.25498844780508295, "grad_norm": 0.953125, "learning_rate": 0.0004996702864107015, "loss": 5.8913, "mean_token_accuracy": 0.14787303507328034, "num_tokens": 5601460.0, "step": 3035 }, { "entropy": 6.189465713500977, "epoch": 0.25540852762024785, "grad_norm": 0.98828125, "learning_rate": 0.0004996686638060028, "loss": 6.0052, "mean_token_accuracy": 0.13520606160163878, "num_tokens": 5610776.0, "step": 3040 }, { "entropy": 6.085352611541748, "epoch": 0.25582860743541275, "grad_norm": 0.91015625, "learning_rate": 0.0004996670372214144, "loss": 5.9054, "mean_token_accuracy": 0.14562050476670266, "num_tokens": 5619627.0, "step": 3045 }, { "entropy": 5.9095056533813475, "epoch": 0.2562486872505776, "grad_norm": 0.87890625, "learning_rate": 0.0004996654066569651, "loss": 5.7872, "mean_token_accuracy": 0.14956104382872581, "num_tokens": 5628969.0, "step": 3050 }, { "entropy": 5.998289918899536, "epoch": 0.2566687670657425, "grad_norm": 0.9921875, "learning_rate": 0.0004996637721126839, "loss": 5.8501, "mean_token_accuracy": 0.14419863522052764, "num_tokens": 5638629.0, "step": 3055 }, { "entropy": 6.084632110595703, "epoch": 0.2570888468809074, "grad_norm": 1.046875, "learning_rate": 0.0004996621335885996, "loss": 5.9249, "mean_token_accuracy": 0.13865133970975876, "num_tokens": 5647571.0, "step": 3060 }, { "entropy": 6.059264850616455, "epoch": 0.2575089266960722, "grad_norm": 1.21875, "learning_rate": 0.0004996604910847413, "loss": 5.8418, "mean_token_accuracy": 0.1548224687576294, "num_tokens": 5656709.0, "step": 3065 }, { "entropy": 6.037788724899292, "epoch": 0.2579290065112371, "grad_norm": 0.96484375, "learning_rate": 0.000499658844601138, "loss": 6.0136, "mean_token_accuracy": 0.14061269238591195, "num_tokens": 5665714.0, "step": 3070 }, { "entropy": 6.112887382507324, "epoch": 0.258349086326402, "grad_norm": 0.91796875, "learning_rate": 0.000499657194137819, "loss": 5.9813, "mean_token_accuracy": 0.1434816040098667, "num_tokens": 5675854.0, "step": 3075 }, { "entropy": 6.10079174041748, "epoch": 0.2587691661415669, "grad_norm": 0.96875, "learning_rate": 0.0004996555396948136, "loss": 5.8062, "mean_token_accuracy": 0.14445895925164223, "num_tokens": 5685690.0, "step": 3080 }, { "entropy": 6.008033037185669, "epoch": 0.25918924595673176, "grad_norm": 0.88671875, "learning_rate": 0.0004996538812721509, "loss": 5.8654, "mean_token_accuracy": 0.14993129372596742, "num_tokens": 5695766.0, "step": 3085 }, { "entropy": 6.072084999084472, "epoch": 0.25960932577189666, "grad_norm": 1.046875, "learning_rate": 0.0004996522188698603, "loss": 5.8982, "mean_token_accuracy": 0.14610292240977288, "num_tokens": 5704365.0, "step": 3090 }, { "entropy": 6.0555907726287845, "epoch": 0.26002940558706156, "grad_norm": 1.125, "learning_rate": 0.0004996505524879714, "loss": 6.0101, "mean_token_accuracy": 0.14055205136537552, "num_tokens": 5713345.0, "step": 3095 }, { "entropy": 6.035314083099365, "epoch": 0.2604494854022264, "grad_norm": 0.91796875, "learning_rate": 0.0004996488821265137, "loss": 5.816, "mean_token_accuracy": 0.14724740535020828, "num_tokens": 5722907.0, "step": 3100 }, { "entropy": 6.007513093948364, "epoch": 0.2608695652173913, "grad_norm": 0.98828125, "learning_rate": 0.0004996472077855166, "loss": 5.8596, "mean_token_accuracy": 0.1498942032456398, "num_tokens": 5731589.0, "step": 3105 }, { "entropy": 5.998636054992676, "epoch": 0.2612896450325562, "grad_norm": 0.984375, "learning_rate": 0.00049964552946501, "loss": 5.8476, "mean_token_accuracy": 0.1439466342329979, "num_tokens": 5739922.0, "step": 3110 }, { "entropy": 5.9389458179473875, "epoch": 0.2617097248477211, "grad_norm": 0.96484375, "learning_rate": 0.0004996438471650235, "loss": 5.7675, "mean_token_accuracy": 0.15062671899795532, "num_tokens": 5749206.0, "step": 3115 }, { "entropy": 6.008351278305054, "epoch": 0.26212980466288593, "grad_norm": 0.92578125, "learning_rate": 0.0004996421608855869, "loss": 5.8288, "mean_token_accuracy": 0.15271472856402396, "num_tokens": 5758803.0, "step": 3120 }, { "entropy": 6.044885444641113, "epoch": 0.26254988447805083, "grad_norm": 0.96875, "learning_rate": 0.0004996404706267301, "loss": 5.9065, "mean_token_accuracy": 0.13532925099134446, "num_tokens": 5768368.0, "step": 3125 }, { "entropy": 5.958721733093261, "epoch": 0.26296996429321573, "grad_norm": 1.0625, "learning_rate": 0.000499638776388483, "loss": 5.7648, "mean_token_accuracy": 0.1534928262233734, "num_tokens": 5776707.0, "step": 3130 }, { "entropy": 5.986162996292114, "epoch": 0.26339004410838057, "grad_norm": 0.97265625, "learning_rate": 0.0004996370781708757, "loss": 5.9532, "mean_token_accuracy": 0.13491747826337813, "num_tokens": 5787037.0, "step": 3135 }, { "entropy": 6.018689870834351, "epoch": 0.26381012392354547, "grad_norm": 0.875, "learning_rate": 0.0004996353759739382, "loss": 5.9005, "mean_token_accuracy": 0.14967331141233445, "num_tokens": 5796630.0, "step": 3140 }, { "entropy": 5.985601377487183, "epoch": 0.26423020373871037, "grad_norm": 1.015625, "learning_rate": 0.0004996336697977007, "loss": 5.8974, "mean_token_accuracy": 0.14190822690725327, "num_tokens": 5806402.0, "step": 3145 }, { "entropy": 5.99180235862732, "epoch": 0.2646502835538752, "grad_norm": 0.98828125, "learning_rate": 0.0004996319596421933, "loss": 5.853, "mean_token_accuracy": 0.14679677560925483, "num_tokens": 5815742.0, "step": 3150 }, { "entropy": 6.00025954246521, "epoch": 0.2650703633690401, "grad_norm": 0.90625, "learning_rate": 0.0004996302455074466, "loss": 5.8679, "mean_token_accuracy": 0.14232094436883927, "num_tokens": 5824915.0, "step": 3155 }, { "entropy": 6.032740592956543, "epoch": 0.265490443184205, "grad_norm": 0.921875, "learning_rate": 0.0004996285273934906, "loss": 5.8901, "mean_token_accuracy": 0.14556412398815155, "num_tokens": 5834978.0, "step": 3160 }, { "entropy": 6.078465604782105, "epoch": 0.2659105229993699, "grad_norm": 0.87890625, "learning_rate": 0.000499626805300356, "loss": 6.0439, "mean_token_accuracy": 0.14277126342058183, "num_tokens": 5845684.0, "step": 3165 }, { "entropy": 6.094513893127441, "epoch": 0.26633060281453474, "grad_norm": 0.97265625, "learning_rate": 0.0004996250792280732, "loss": 5.9226, "mean_token_accuracy": 0.13814914003014564, "num_tokens": 5854905.0, "step": 3170 }, { "entropy": 6.054658889770508, "epoch": 0.26675068262969964, "grad_norm": 1.03125, "learning_rate": 0.0004996233491766727, "loss": 5.934, "mean_token_accuracy": 0.14257717728614808, "num_tokens": 5863654.0, "step": 3175 }, { "entropy": 6.036546421051026, "epoch": 0.26717076244486454, "grad_norm": 1.03125, "learning_rate": 0.0004996216151461854, "loss": 5.9289, "mean_token_accuracy": 0.14137156009674073, "num_tokens": 5872442.0, "step": 3180 }, { "entropy": 6.089460277557373, "epoch": 0.2675908422600294, "grad_norm": 0.9609375, "learning_rate": 0.0004996198771366417, "loss": 5.8594, "mean_token_accuracy": 0.14687168076634408, "num_tokens": 5882372.0, "step": 3185 }, { "entropy": 5.836459922790527, "epoch": 0.2680109220751943, "grad_norm": 0.98828125, "learning_rate": 0.0004996181351480726, "loss": 5.6727, "mean_token_accuracy": 0.15421667248010634, "num_tokens": 5891113.0, "step": 3190 }, { "entropy": 5.909378480911255, "epoch": 0.2684310018903592, "grad_norm": 0.94140625, "learning_rate": 0.0004996163891805089, "loss": 5.9167, "mean_token_accuracy": 0.14929258525371553, "num_tokens": 5899582.0, "step": 3195 }, { "entropy": 6.088847398757935, "epoch": 0.2688510817055241, "grad_norm": 0.94921875, "learning_rate": 0.0004996146392339815, "loss": 5.8788, "mean_token_accuracy": 0.137289460003376, "num_tokens": 5908938.0, "step": 3200 }, { "entropy": 6.025485897064209, "epoch": 0.2692711615206889, "grad_norm": 0.9453125, "learning_rate": 0.0004996128853085215, "loss": 5.8462, "mean_token_accuracy": 0.14703118950128555, "num_tokens": 5918055.0, "step": 3205 }, { "entropy": 6.024847555160522, "epoch": 0.2696912413358538, "grad_norm": 0.921875, "learning_rate": 0.0004996111274041598, "loss": 5.8169, "mean_token_accuracy": 0.14159609079360963, "num_tokens": 5926744.0, "step": 3210 }, { "entropy": 6.007894611358642, "epoch": 0.2701113211510187, "grad_norm": 0.87109375, "learning_rate": 0.0004996093655209277, "loss": 5.9028, "mean_token_accuracy": 0.1412175938487053, "num_tokens": 5936521.0, "step": 3215 }, { "entropy": 6.093644618988037, "epoch": 0.27053140096618356, "grad_norm": 0.98828125, "learning_rate": 0.0004996075996588563, "loss": 5.9689, "mean_token_accuracy": 0.1381188787519932, "num_tokens": 5945010.0, "step": 3220 }, { "entropy": 6.014964437484741, "epoch": 0.27095148078134845, "grad_norm": 0.9296875, "learning_rate": 0.000499605829817977, "loss": 5.8629, "mean_token_accuracy": 0.15120311975479125, "num_tokens": 5953766.0, "step": 3225 }, { "entropy": 5.982144498825074, "epoch": 0.27137156059651335, "grad_norm": 0.90234375, "learning_rate": 0.000499604055998321, "loss": 5.8001, "mean_token_accuracy": 0.14623286202549934, "num_tokens": 5962168.0, "step": 3230 }, { "entropy": 5.941414022445679, "epoch": 0.2717916404116782, "grad_norm": 0.890625, "learning_rate": 0.0004996022781999198, "loss": 5.8249, "mean_token_accuracy": 0.14706685170531272, "num_tokens": 5971627.0, "step": 3235 }, { "entropy": 6.00689377784729, "epoch": 0.2722117202268431, "grad_norm": 0.97265625, "learning_rate": 0.000499600496422805, "loss": 5.8993, "mean_token_accuracy": 0.14405820965766908, "num_tokens": 5981775.0, "step": 3240 }, { "entropy": 5.973731327056885, "epoch": 0.272631800042008, "grad_norm": 0.9453125, "learning_rate": 0.000499598710667008, "loss": 5.838, "mean_token_accuracy": 0.1444271594285965, "num_tokens": 5991097.0, "step": 3245 }, { "entropy": 5.973551654815674, "epoch": 0.2730518798571729, "grad_norm": 1.0234375, "learning_rate": 0.0004995969209325604, "loss": 5.8988, "mean_token_accuracy": 0.14417145103216172, "num_tokens": 5999517.0, "step": 3250 }, { "entropy": 5.939422225952148, "epoch": 0.2734719596723377, "grad_norm": 0.953125, "learning_rate": 0.0004995951272194941, "loss": 5.8778, "mean_token_accuracy": 0.139290714263916, "num_tokens": 6008545.0, "step": 3255 }, { "entropy": 6.07567138671875, "epoch": 0.2738920394875026, "grad_norm": 0.9765625, "learning_rate": 0.0004995933295278407, "loss": 5.8603, "mean_token_accuracy": 0.14346815124154091, "num_tokens": 6017366.0, "step": 3260 }, { "entropy": 5.989615488052368, "epoch": 0.2743121193026675, "grad_norm": 1.046875, "learning_rate": 0.0004995915278576321, "loss": 5.8024, "mean_token_accuracy": 0.14921536892652512, "num_tokens": 6025597.0, "step": 3265 }, { "entropy": 5.995965671539307, "epoch": 0.27473219911783237, "grad_norm": 0.87890625, "learning_rate": 0.0004995897222089004, "loss": 5.9055, "mean_token_accuracy": 0.1438031278550625, "num_tokens": 6034239.0, "step": 3270 }, { "entropy": 6.17506217956543, "epoch": 0.27515227893299726, "grad_norm": 0.953125, "learning_rate": 0.0004995879125816772, "loss": 5.9388, "mean_token_accuracy": 0.14314718097448348, "num_tokens": 6043837.0, "step": 3275 }, { "entropy": 5.962472629547119, "epoch": 0.27557235874816216, "grad_norm": 0.87109375, "learning_rate": 0.0004995860989759949, "loss": 5.8709, "mean_token_accuracy": 0.14632273614406585, "num_tokens": 6053217.0, "step": 3280 }, { "entropy": 6.029792261123657, "epoch": 0.27599243856332706, "grad_norm": 1.0078125, "learning_rate": 0.0004995842813918855, "loss": 5.8948, "mean_token_accuracy": 0.1460642173886299, "num_tokens": 6061553.0, "step": 3285 }, { "entropy": 5.981232643127441, "epoch": 0.2764125183784919, "grad_norm": 1.046875, "learning_rate": 0.0004995824598293812, "loss": 5.7712, "mean_token_accuracy": 0.1501307800412178, "num_tokens": 6070080.0, "step": 3290 }, { "entropy": 6.045267486572266, "epoch": 0.2768325981936568, "grad_norm": 0.89453125, "learning_rate": 0.0004995806342885142, "loss": 5.9245, "mean_token_accuracy": 0.14930349588394165, "num_tokens": 6078438.0, "step": 3295 }, { "entropy": 6.0462220191955565, "epoch": 0.2772526780088217, "grad_norm": 1.0078125, "learning_rate": 0.000499578804769317, "loss": 5.9092, "mean_token_accuracy": 0.13776859119534493, "num_tokens": 6087794.0, "step": 3300 }, { "entropy": 6.104273176193237, "epoch": 0.27767275782398654, "grad_norm": 0.90234375, "learning_rate": 0.0004995769712718218, "loss": 5.9152, "mean_token_accuracy": 0.14523780345916748, "num_tokens": 6096709.0, "step": 3305 }, { "entropy": 5.998883199691773, "epoch": 0.27809283763915144, "grad_norm": 1.0, "learning_rate": 0.0004995751337960613, "loss": 5.8495, "mean_token_accuracy": 0.14268894568085672, "num_tokens": 6105866.0, "step": 3310 }, { "entropy": 6.001236534118652, "epoch": 0.27851291745431633, "grad_norm": 0.953125, "learning_rate": 0.0004995732923420679, "loss": 5.8071, "mean_token_accuracy": 0.15081177204847335, "num_tokens": 6114882.0, "step": 3315 }, { "entropy": 5.930415248870849, "epoch": 0.2789329972694812, "grad_norm": 0.9453125, "learning_rate": 0.0004995714469098743, "loss": 5.7725, "mean_token_accuracy": 0.14834588766098022, "num_tokens": 6123978.0, "step": 3320 }, { "entropy": 5.966728734970093, "epoch": 0.2793530770846461, "grad_norm": 0.93359375, "learning_rate": 0.000499569597499513, "loss": 5.9104, "mean_token_accuracy": 0.1466206818819046, "num_tokens": 6133246.0, "step": 3325 }, { "entropy": 5.988458681106567, "epoch": 0.27977315689981097, "grad_norm": 0.8671875, "learning_rate": 0.0004995677441110172, "loss": 5.7702, "mean_token_accuracy": 0.14939837008714676, "num_tokens": 6142865.0, "step": 3330 }, { "entropy": 6.014625930786133, "epoch": 0.28019323671497587, "grad_norm": 0.94140625, "learning_rate": 0.0004995658867444192, "loss": 5.8654, "mean_token_accuracy": 0.13881808668375015, "num_tokens": 6152492.0, "step": 3335 }, { "entropy": 5.975307273864746, "epoch": 0.2806133165301407, "grad_norm": 1.0078125, "learning_rate": 0.0004995640253997523, "loss": 5.8652, "mean_token_accuracy": 0.1395415373146534, "num_tokens": 6161953.0, "step": 3340 }, { "entropy": 5.848208713531494, "epoch": 0.2810333963453056, "grad_norm": 0.86328125, "learning_rate": 0.0004995621600770492, "loss": 5.7285, "mean_token_accuracy": 0.1502986840903759, "num_tokens": 6171467.0, "step": 3345 }, { "entropy": 5.9759973049163815, "epoch": 0.2814534761604705, "grad_norm": 0.87890625, "learning_rate": 0.0004995602907763431, "loss": 5.8103, "mean_token_accuracy": 0.1470308281481266, "num_tokens": 6180646.0, "step": 3350 }, { "entropy": 5.981297445297241, "epoch": 0.28187355597563535, "grad_norm": 1.0234375, "learning_rate": 0.0004995584174976672, "loss": 5.8029, "mean_token_accuracy": 0.14213321059942247, "num_tokens": 6189832.0, "step": 3355 }, { "entropy": 5.966393995285034, "epoch": 0.28229363579080025, "grad_norm": 0.95703125, "learning_rate": 0.0004995565402410544, "loss": 5.7274, "mean_token_accuracy": 0.1558822512626648, "num_tokens": 6198339.0, "step": 3360 }, { "entropy": 5.935036706924438, "epoch": 0.28271371560596514, "grad_norm": 1.0859375, "learning_rate": 0.0004995546590065383, "loss": 5.8126, "mean_token_accuracy": 0.14656742215156554, "num_tokens": 6207564.0, "step": 3365 }, { "entropy": 6.000332260131836, "epoch": 0.28313379542113004, "grad_norm": 0.98828125, "learning_rate": 0.0004995527737941518, "loss": 5.8581, "mean_token_accuracy": 0.14725540429353715, "num_tokens": 6216056.0, "step": 3370 }, { "entropy": 5.969868230819702, "epoch": 0.2835538752362949, "grad_norm": 0.9609375, "learning_rate": 0.0004995508846039287, "loss": 5.8259, "mean_token_accuracy": 0.1441423200070858, "num_tokens": 6225573.0, "step": 3375 }, { "entropy": 6.054820203781128, "epoch": 0.2839739550514598, "grad_norm": 0.93359375, "learning_rate": 0.0004995489914359023, "loss": 5.9519, "mean_token_accuracy": 0.13889921978116035, "num_tokens": 6235057.0, "step": 3380 }, { "entropy": 6.0446230411529545, "epoch": 0.2843940348666247, "grad_norm": 0.98046875, "learning_rate": 0.0004995470942901061, "loss": 5.8635, "mean_token_accuracy": 0.1436339296400547, "num_tokens": 6244164.0, "step": 3385 }, { "entropy": 6.036704730987549, "epoch": 0.2848141146817895, "grad_norm": 1.0, "learning_rate": 0.0004995451931665738, "loss": 5.8685, "mean_token_accuracy": 0.14183638542890548, "num_tokens": 6253095.0, "step": 3390 }, { "entropy": 5.9995965480804445, "epoch": 0.2852341944969544, "grad_norm": 0.94921875, "learning_rate": 0.000499543288065339, "loss": 5.817, "mean_token_accuracy": 0.14616027921438218, "num_tokens": 6261134.0, "step": 3395 }, { "entropy": 5.918176984786987, "epoch": 0.2856542743121193, "grad_norm": 1.015625, "learning_rate": 0.0004995413789864354, "loss": 5.8093, "mean_token_accuracy": 0.15111583173274995, "num_tokens": 6270384.0, "step": 3400 }, { "entropy": 5.925231647491455, "epoch": 0.28607435412728416, "grad_norm": 0.90234375, "learning_rate": 0.0004995394659298971, "loss": 5.7581, "mean_token_accuracy": 0.15247000753879547, "num_tokens": 6279702.0, "step": 3405 }, { "entropy": 5.9355387687683105, "epoch": 0.28649443394244906, "grad_norm": 0.90625, "learning_rate": 0.0004995375488957576, "loss": 5.8087, "mean_token_accuracy": 0.14355491399765014, "num_tokens": 6288297.0, "step": 3410 }, { "entropy": 5.953091335296631, "epoch": 0.28691451375761395, "grad_norm": 0.95703125, "learning_rate": 0.000499535627884051, "loss": 5.8943, "mean_token_accuracy": 0.13816075548529624, "num_tokens": 6297288.0, "step": 3415 }, { "entropy": 6.1151526927947994, "epoch": 0.28733459357277885, "grad_norm": 0.93359375, "learning_rate": 0.0004995337028948115, "loss": 5.912, "mean_token_accuracy": 0.13960782587528228, "num_tokens": 6306719.0, "step": 3420 }, { "entropy": 5.956048154830933, "epoch": 0.2877546733879437, "grad_norm": 0.9609375, "learning_rate": 0.0004995317739280731, "loss": 5.7384, "mean_token_accuracy": 0.15413220077753068, "num_tokens": 6316639.0, "step": 3425 }, { "entropy": 5.9882111072540285, "epoch": 0.2881747532031086, "grad_norm": 0.9375, "learning_rate": 0.0004995298409838699, "loss": 5.8729, "mean_token_accuracy": 0.14296835884451867, "num_tokens": 6326879.0, "step": 3430 }, { "entropy": 5.922442245483398, "epoch": 0.2885948330182735, "grad_norm": 0.90234375, "learning_rate": 0.000499527904062236, "loss": 5.7735, "mean_token_accuracy": 0.15226557850837708, "num_tokens": 6335729.0, "step": 3435 }, { "entropy": 5.973740720748902, "epoch": 0.28901491283343833, "grad_norm": 0.89453125, "learning_rate": 0.0004995259631632061, "loss": 5.8537, "mean_token_accuracy": 0.1386033460497856, "num_tokens": 6345154.0, "step": 3440 }, { "entropy": 5.9747546195983885, "epoch": 0.28943499264860323, "grad_norm": 0.9609375, "learning_rate": 0.0004995240182868143, "loss": 5.8072, "mean_token_accuracy": 0.14772575795650483, "num_tokens": 6354309.0, "step": 3445 }, { "entropy": 5.879770755767822, "epoch": 0.2898550724637681, "grad_norm": 0.89453125, "learning_rate": 0.0004995220694330951, "loss": 5.764, "mean_token_accuracy": 0.14814788177609445, "num_tokens": 6363389.0, "step": 3450 }, { "entropy": 5.928126335144043, "epoch": 0.290275152278933, "grad_norm": 0.921875, "learning_rate": 0.0004995201166020832, "loss": 5.8394, "mean_token_accuracy": 0.1423036128282547, "num_tokens": 6372475.0, "step": 3455 }, { "entropy": 6.01046404838562, "epoch": 0.29069523209409787, "grad_norm": 1.015625, "learning_rate": 0.000499518159793813, "loss": 5.7909, "mean_token_accuracy": 0.15391181409358978, "num_tokens": 6380906.0, "step": 3460 }, { "entropy": 5.901024436950683, "epoch": 0.29111531190926276, "grad_norm": 0.984375, "learning_rate": 0.000499516199008319, "loss": 5.7893, "mean_token_accuracy": 0.147665573656559, "num_tokens": 6390085.0, "step": 3465 }, { "entropy": 6.005919504165649, "epoch": 0.29153539172442766, "grad_norm": 1.0, "learning_rate": 0.0004995142342456364, "loss": 5.8587, "mean_token_accuracy": 0.14177713990211488, "num_tokens": 6399441.0, "step": 3470 }, { "entropy": 6.037836742401123, "epoch": 0.2919554715395925, "grad_norm": 0.95703125, "learning_rate": 0.0004995122655057997, "loss": 5.9277, "mean_token_accuracy": 0.14434729218482972, "num_tokens": 6408995.0, "step": 3475 }, { "entropy": 5.8759626865386965, "epoch": 0.2923755513547574, "grad_norm": 0.9453125, "learning_rate": 0.0004995102927888437, "loss": 5.6769, "mean_token_accuracy": 0.15346557945013045, "num_tokens": 6418080.0, "step": 3480 }, { "entropy": 5.980447435379029, "epoch": 0.2927956311699223, "grad_norm": 1.0390625, "learning_rate": 0.0004995083160948036, "loss": 5.8654, "mean_token_accuracy": 0.14365637302398682, "num_tokens": 6426732.0, "step": 3485 }, { "entropy": 5.918527126312256, "epoch": 0.29321571098508714, "grad_norm": 0.953125, "learning_rate": 0.0004995063354237141, "loss": 5.8601, "mean_token_accuracy": 0.14886348843574523, "num_tokens": 6435957.0, "step": 3490 }, { "entropy": 5.965629720687867, "epoch": 0.29363579080025204, "grad_norm": 1.1015625, "learning_rate": 0.0004995043507756107, "loss": 5.807, "mean_token_accuracy": 0.14377646446228026, "num_tokens": 6445642.0, "step": 3495 }, { "entropy": 5.966208457946777, "epoch": 0.29405587061541694, "grad_norm": 1.0859375, "learning_rate": 0.0004995023621505282, "loss": 5.8468, "mean_token_accuracy": 0.14531085640192032, "num_tokens": 6454664.0, "step": 3500 }, { "entropy": 5.846572160720825, "epoch": 0.29447595043058183, "grad_norm": 0.96484375, "learning_rate": 0.000499500369548502, "loss": 5.7718, "mean_token_accuracy": 0.14744968637824057, "num_tokens": 6463224.0, "step": 3505 }, { "entropy": 6.10300350189209, "epoch": 0.2948960302457467, "grad_norm": 0.90625, "learning_rate": 0.0004994983729695674, "loss": 5.9886, "mean_token_accuracy": 0.13981593102216722, "num_tokens": 6473112.0, "step": 3510 }, { "entropy": 5.991326189041137, "epoch": 0.2953161100609116, "grad_norm": 1.046875, "learning_rate": 0.0004994963724137595, "loss": 5.834, "mean_token_accuracy": 0.14485643282532693, "num_tokens": 6482062.0, "step": 3515 }, { "entropy": 5.928696584701538, "epoch": 0.29573618987607647, "grad_norm": 1.0625, "learning_rate": 0.0004994943678811142, "loss": 5.8362, "mean_token_accuracy": 0.1416163809597492, "num_tokens": 6490568.0, "step": 3520 }, { "entropy": 5.993920183181762, "epoch": 0.2961562696912413, "grad_norm": 0.9296875, "learning_rate": 0.0004994923593716667, "loss": 5.8772, "mean_token_accuracy": 0.14611808955669403, "num_tokens": 6500815.0, "step": 3525 }, { "entropy": 5.930905771255493, "epoch": 0.2965763495064062, "grad_norm": 0.94921875, "learning_rate": 0.0004994903468854527, "loss": 5.7544, "mean_token_accuracy": 0.15672436058521272, "num_tokens": 6509529.0, "step": 3530 }, { "entropy": 5.8914727687835695, "epoch": 0.2969964293215711, "grad_norm": 0.9609375, "learning_rate": 0.0004994883304225077, "loss": 5.8141, "mean_token_accuracy": 0.1436660371720791, "num_tokens": 6517934.0, "step": 3535 }, { "entropy": 6.048480892181397, "epoch": 0.297416509136736, "grad_norm": 0.90234375, "learning_rate": 0.0004994863099828675, "loss": 5.7902, "mean_token_accuracy": 0.14704177230596543, "num_tokens": 6526098.0, "step": 3540 }, { "entropy": 5.920773935317993, "epoch": 0.29783658895190085, "grad_norm": 0.953125, "learning_rate": 0.000499484285566568, "loss": 5.8221, "mean_token_accuracy": 0.14378595799207688, "num_tokens": 6535831.0, "step": 3545 }, { "entropy": 5.922514152526856, "epoch": 0.29825666876706575, "grad_norm": 0.859375, "learning_rate": 0.0004994822571736449, "loss": 5.7254, "mean_token_accuracy": 0.1482064038515091, "num_tokens": 6545704.0, "step": 3550 }, { "entropy": 5.899800491333008, "epoch": 0.29867674858223064, "grad_norm": 1.046875, "learning_rate": 0.0004994802248041342, "loss": 5.7535, "mean_token_accuracy": 0.14916675686836242, "num_tokens": 6554423.0, "step": 3555 }, { "entropy": 5.932198619842529, "epoch": 0.2990968283973955, "grad_norm": 0.96875, "learning_rate": 0.000499478188458072, "loss": 5.8022, "mean_token_accuracy": 0.14890404120087625, "num_tokens": 6563989.0, "step": 3560 }, { "entropy": 5.968116617202758, "epoch": 0.2995169082125604, "grad_norm": 1.1171875, "learning_rate": 0.0004994761481354943, "loss": 5.9483, "mean_token_accuracy": 0.1441567473113537, "num_tokens": 6572745.0, "step": 3565 }, { "entropy": 6.137206792831421, "epoch": 0.2999369880277253, "grad_norm": 0.99609375, "learning_rate": 0.0004994741038364371, "loss": 5.9343, "mean_token_accuracy": 0.142555071413517, "num_tokens": 6581723.0, "step": 3570 }, { "entropy": 5.88220705986023, "epoch": 0.3003570678428901, "grad_norm": 0.96875, "learning_rate": 0.0004994720555609369, "loss": 5.6659, "mean_token_accuracy": 0.1542235180735588, "num_tokens": 6590342.0, "step": 3575 }, { "entropy": 5.829970359802246, "epoch": 0.300777147658055, "grad_norm": 1.03125, "learning_rate": 0.0004994700033090297, "loss": 5.7501, "mean_token_accuracy": 0.1582304283976555, "num_tokens": 6599206.0, "step": 3580 }, { "entropy": 6.041889762878418, "epoch": 0.3011972274732199, "grad_norm": 1.0703125, "learning_rate": 0.000499467947080752, "loss": 6.0318, "mean_token_accuracy": 0.13561916202306748, "num_tokens": 6608947.0, "step": 3585 }, { "entropy": 6.06544942855835, "epoch": 0.3016173072883848, "grad_norm": 0.9765625, "learning_rate": 0.0004994658868761402, "loss": 5.8283, "mean_token_accuracy": 0.15170362889766692, "num_tokens": 6618378.0, "step": 3590 }, { "entropy": 5.914470624923706, "epoch": 0.30203738710354966, "grad_norm": 1.0078125, "learning_rate": 0.0004994638226952307, "loss": 5.8836, "mean_token_accuracy": 0.14195557832717895, "num_tokens": 6627527.0, "step": 3595 }, { "entropy": 5.982400751113891, "epoch": 0.30245746691871456, "grad_norm": 0.98828125, "learning_rate": 0.0004994617545380604, "loss": 5.8286, "mean_token_accuracy": 0.14527858346700667, "num_tokens": 6636964.0, "step": 3600 }, { "entropy": 5.908453559875488, "epoch": 0.30287754673387945, "grad_norm": 1.03125, "learning_rate": 0.0004994596824046656, "loss": 5.7718, "mean_token_accuracy": 0.14911266565322875, "num_tokens": 6646074.0, "step": 3605 }, { "entropy": 5.99076018333435, "epoch": 0.3032976265490443, "grad_norm": 0.9296875, "learning_rate": 0.000499457606295083, "loss": 5.8447, "mean_token_accuracy": 0.14240661412477493, "num_tokens": 6655027.0, "step": 3610 }, { "entropy": 5.808787536621094, "epoch": 0.3037177063642092, "grad_norm": 1.0390625, "learning_rate": 0.0004994555262093495, "loss": 5.6321, "mean_token_accuracy": 0.1570141136646271, "num_tokens": 6663747.0, "step": 3615 }, { "entropy": 6.046371412277222, "epoch": 0.3041377861793741, "grad_norm": 0.99609375, "learning_rate": 0.000499453442147502, "loss": 5.9593, "mean_token_accuracy": 0.1389522023499012, "num_tokens": 6672922.0, "step": 3620 }, { "entropy": 5.9334362030029295, "epoch": 0.304557865994539, "grad_norm": 0.9296875, "learning_rate": 0.0004994513541095773, "loss": 5.7735, "mean_token_accuracy": 0.15685406178236008, "num_tokens": 6682233.0, "step": 3625 }, { "entropy": 5.922385549545288, "epoch": 0.30497794580970383, "grad_norm": 0.97265625, "learning_rate": 0.0004994492620956126, "loss": 5.8112, "mean_token_accuracy": 0.15047757476568221, "num_tokens": 6691593.0, "step": 3630 }, { "entropy": 5.917299842834472, "epoch": 0.30539802562486873, "grad_norm": 0.91796875, "learning_rate": 0.0004994471661056445, "loss": 5.8207, "mean_token_accuracy": 0.15176298022270202, "num_tokens": 6701318.0, "step": 3635 }, { "entropy": 6.031417036056519, "epoch": 0.3058181054400336, "grad_norm": 0.9140625, "learning_rate": 0.0004994450661397106, "loss": 5.8199, "mean_token_accuracy": 0.1515482097864151, "num_tokens": 6710059.0, "step": 3640 }, { "entropy": 6.035120582580566, "epoch": 0.30623818525519847, "grad_norm": 0.921875, "learning_rate": 0.000499442962197848, "loss": 5.9111, "mean_token_accuracy": 0.14002010971307755, "num_tokens": 6719811.0, "step": 3645 }, { "entropy": 5.872648668289185, "epoch": 0.30665826507036337, "grad_norm": 0.93359375, "learning_rate": 0.0004994408542800937, "loss": 5.7991, "mean_token_accuracy": 0.15095670521259308, "num_tokens": 6728789.0, "step": 3650 }, { "entropy": 5.943379068374634, "epoch": 0.30707834488552826, "grad_norm": 0.984375, "learning_rate": 0.0004994387423864855, "loss": 5.7834, "mean_token_accuracy": 0.1460746333003044, "num_tokens": 6737706.0, "step": 3655 }, { "entropy": 5.941844272613525, "epoch": 0.3074984247006931, "grad_norm": 0.98828125, "learning_rate": 0.0004994366265170603, "loss": 5.7446, "mean_token_accuracy": 0.16055794954299926, "num_tokens": 6746861.0, "step": 3660 }, { "entropy": 6.028618669509887, "epoch": 0.307918504515858, "grad_norm": 1.0390625, "learning_rate": 0.0004994345066718558, "loss": 5.916, "mean_token_accuracy": 0.14116688221693038, "num_tokens": 6755242.0, "step": 3665 }, { "entropy": 6.008127069473266, "epoch": 0.3083385843310229, "grad_norm": 0.9609375, "learning_rate": 0.0004994323828509098, "loss": 5.8727, "mean_token_accuracy": 0.14286566898226738, "num_tokens": 6764549.0, "step": 3670 }, { "entropy": 5.929146242141724, "epoch": 0.3087586641461878, "grad_norm": 1.0390625, "learning_rate": 0.0004994302550542596, "loss": 5.8471, "mean_token_accuracy": 0.1538454920053482, "num_tokens": 6774123.0, "step": 3675 }, { "entropy": 5.80585126876831, "epoch": 0.30917874396135264, "grad_norm": 1.046875, "learning_rate": 0.000499428123281943, "loss": 5.6317, "mean_token_accuracy": 0.1558361306786537, "num_tokens": 6782922.0, "step": 3680 }, { "entropy": 5.925417232513428, "epoch": 0.30959882377651754, "grad_norm": 0.94921875, "learning_rate": 0.0004994259875339978, "loss": 5.8838, "mean_token_accuracy": 0.14831040799617767, "num_tokens": 6792042.0, "step": 3685 }, { "entropy": 6.067014694213867, "epoch": 0.31001890359168244, "grad_norm": 1.0234375, "learning_rate": 0.0004994238478104617, "loss": 5.872, "mean_token_accuracy": 0.14466599076986314, "num_tokens": 6800994.0, "step": 3690 }, { "entropy": 5.913062810897827, "epoch": 0.3104389834068473, "grad_norm": 0.9375, "learning_rate": 0.0004994217041113727, "loss": 5.8012, "mean_token_accuracy": 0.15395486801862718, "num_tokens": 6809938.0, "step": 3695 }, { "entropy": 6.028704833984375, "epoch": 0.3108590632220122, "grad_norm": 0.8828125, "learning_rate": 0.0004994195564367688, "loss": 5.9148, "mean_token_accuracy": 0.14361433312296867, "num_tokens": 6820289.0, "step": 3700 }, { "entropy": 5.998479652404785, "epoch": 0.3112791430371771, "grad_norm": 1.0078125, "learning_rate": 0.0004994174047866882, "loss": 5.7538, "mean_token_accuracy": 0.15162525251507758, "num_tokens": 6830068.0, "step": 3705 }, { "entropy": 5.830403566360474, "epoch": 0.3116992228523419, "grad_norm": 0.98828125, "learning_rate": 0.0004994152491611686, "loss": 5.7916, "mean_token_accuracy": 0.14659319072961807, "num_tokens": 6838591.0, "step": 3710 }, { "entropy": 5.838834381103515, "epoch": 0.3121193026675068, "grad_norm": 0.94921875, "learning_rate": 0.0004994130895602485, "loss": 5.7583, "mean_token_accuracy": 0.14570422172546388, "num_tokens": 6847796.0, "step": 3715 }, { "entropy": 6.02327971458435, "epoch": 0.3125393824826717, "grad_norm": 0.92578125, "learning_rate": 0.000499410925983966, "loss": 5.8457, "mean_token_accuracy": 0.14952262938022615, "num_tokens": 6856585.0, "step": 3720 }, { "entropy": 5.887494659423828, "epoch": 0.3129594622978366, "grad_norm": 1.015625, "learning_rate": 0.0004994087584323596, "loss": 5.7583, "mean_token_accuracy": 0.15517981797456742, "num_tokens": 6865757.0, "step": 3725 }, { "entropy": 5.853988265991211, "epoch": 0.31337954211300145, "grad_norm": 0.90625, "learning_rate": 0.0004994065869054676, "loss": 5.796, "mean_token_accuracy": 0.1451224982738495, "num_tokens": 6875371.0, "step": 3730 }, { "entropy": 6.02379846572876, "epoch": 0.31379962192816635, "grad_norm": 1.03125, "learning_rate": 0.0004994044114033283, "loss": 5.8687, "mean_token_accuracy": 0.1440061092376709, "num_tokens": 6884050.0, "step": 3735 }, { "entropy": 6.026759815216065, "epoch": 0.31421970174333125, "grad_norm": 1.0859375, "learning_rate": 0.0004994022319259806, "loss": 5.8372, "mean_token_accuracy": 0.14598554819822313, "num_tokens": 6893079.0, "step": 3740 }, { "entropy": 5.911620283126831, "epoch": 0.3146397815584961, "grad_norm": 0.984375, "learning_rate": 0.0004994000484734629, "loss": 5.9136, "mean_token_accuracy": 0.15156169682741166, "num_tokens": 6903100.0, "step": 3745 }, { "entropy": 5.923766088485718, "epoch": 0.315059861373661, "grad_norm": 0.875, "learning_rate": 0.0004993978610458137, "loss": 5.7654, "mean_token_accuracy": 0.15068738907575607, "num_tokens": 6912164.0, "step": 3750 }, { "entropy": 5.878131437301636, "epoch": 0.3154799411888259, "grad_norm": 0.9296875, "learning_rate": 0.0004993956696430721, "loss": 5.7781, "mean_token_accuracy": 0.1453731819987297, "num_tokens": 6921183.0, "step": 3755 }, { "entropy": 5.950732278823852, "epoch": 0.3159000210039908, "grad_norm": 0.9296875, "learning_rate": 0.0004993934742652768, "loss": 5.8422, "mean_token_accuracy": 0.14924204498529434, "num_tokens": 6931325.0, "step": 3760 }, { "entropy": 5.98630108833313, "epoch": 0.3163201008191556, "grad_norm": 0.9140625, "learning_rate": 0.0004993912749124665, "loss": 5.7579, "mean_token_accuracy": 0.15365685075521468, "num_tokens": 6940234.0, "step": 3765 }, { "entropy": 5.933948040008545, "epoch": 0.3167401806343205, "grad_norm": 0.92578125, "learning_rate": 0.0004993890715846804, "loss": 5.8442, "mean_token_accuracy": 0.1472316324710846, "num_tokens": 6949067.0, "step": 3770 }, { "entropy": 5.98266453742981, "epoch": 0.3171602604494854, "grad_norm": 0.9375, "learning_rate": 0.0004993868642819574, "loss": 5.8092, "mean_token_accuracy": 0.14614944905042648, "num_tokens": 6959085.0, "step": 3775 }, { "entropy": 5.905980443954467, "epoch": 0.31758034026465026, "grad_norm": 1.015625, "learning_rate": 0.0004993846530043367, "loss": 5.8539, "mean_token_accuracy": 0.14434425979852678, "num_tokens": 6967392.0, "step": 3780 }, { "entropy": 5.910531997680664, "epoch": 0.31800042007981516, "grad_norm": 1.09375, "learning_rate": 0.0004993824377518574, "loss": 5.7851, "mean_token_accuracy": 0.1514693483710289, "num_tokens": 6976369.0, "step": 3785 }, { "entropy": 5.976119804382324, "epoch": 0.31842049989498006, "grad_norm": 0.94140625, "learning_rate": 0.0004993802185245587, "loss": 5.8013, "mean_token_accuracy": 0.14934585690498353, "num_tokens": 6985889.0, "step": 3790 }, { "entropy": 5.925661182403564, "epoch": 0.3188405797101449, "grad_norm": 0.9921875, "learning_rate": 0.00049937799532248, "loss": 5.8359, "mean_token_accuracy": 0.13918048441410064, "num_tokens": 6995396.0, "step": 3795 }, { "entropy": 6.0729657173156735, "epoch": 0.3192606595253098, "grad_norm": 0.921875, "learning_rate": 0.0004993757681456607, "loss": 5.8718, "mean_token_accuracy": 0.1478106528520584, "num_tokens": 7004666.0, "step": 3800 }, { "entropy": 5.967416000366211, "epoch": 0.3196807393404747, "grad_norm": 0.87890625, "learning_rate": 0.0004993735369941401, "loss": 5.8998, "mean_token_accuracy": 0.14525311812758446, "num_tokens": 7014608.0, "step": 3805 }, { "entropy": 5.966092729568482, "epoch": 0.3201008191556396, "grad_norm": 0.91015625, "learning_rate": 0.0004993713018679579, "loss": 5.7888, "mean_token_accuracy": 0.14646613076329232, "num_tokens": 7023671.0, "step": 3810 }, { "entropy": 5.904713773727417, "epoch": 0.32052089897080444, "grad_norm": 0.8984375, "learning_rate": 0.0004993690627671536, "loss": 5.8148, "mean_token_accuracy": 0.1434755489230156, "num_tokens": 7033786.0, "step": 3815 }, { "entropy": 5.907800912857056, "epoch": 0.32094097878596933, "grad_norm": 0.9609375, "learning_rate": 0.0004993668196917669, "loss": 5.7268, "mean_token_accuracy": 0.15316082686185836, "num_tokens": 7042162.0, "step": 3820 }, { "entropy": 5.994227170944214, "epoch": 0.32136105860113423, "grad_norm": 0.9140625, "learning_rate": 0.0004993645726418375, "loss": 5.8618, "mean_token_accuracy": 0.15052291825413705, "num_tokens": 7051903.0, "step": 3825 }, { "entropy": 5.900808525085449, "epoch": 0.3217811384162991, "grad_norm": 0.96875, "learning_rate": 0.0004993623216174053, "loss": 5.7121, "mean_token_accuracy": 0.161135034263134, "num_tokens": 7060229.0, "step": 3830 }, { "entropy": 5.845855093002319, "epoch": 0.32220121823146397, "grad_norm": 0.99609375, "learning_rate": 0.00049936006661851, "loss": 5.7989, "mean_token_accuracy": 0.1526742696762085, "num_tokens": 7069040.0, "step": 3835 }, { "entropy": 5.919027471542359, "epoch": 0.32262129804662887, "grad_norm": 1.03125, "learning_rate": 0.0004993578076451917, "loss": 5.6805, "mean_token_accuracy": 0.15347311198711394, "num_tokens": 7078409.0, "step": 3840 }, { "entropy": 5.853667831420898, "epoch": 0.32304137786179377, "grad_norm": 0.9453125, "learning_rate": 0.0004993555446974903, "loss": 5.765, "mean_token_accuracy": 0.14782839864492417, "num_tokens": 7087983.0, "step": 3845 }, { "entropy": 5.853893089294433, "epoch": 0.3234614576769586, "grad_norm": 1.0234375, "learning_rate": 0.000499353277775446, "loss": 5.7182, "mean_token_accuracy": 0.1580560803413391, "num_tokens": 7097277.0, "step": 3850 }, { "entropy": 5.87832407951355, "epoch": 0.3238815374921235, "grad_norm": 1.0859375, "learning_rate": 0.0004993510068790989, "loss": 5.6187, "mean_token_accuracy": 0.16494725197553634, "num_tokens": 7105918.0, "step": 3855 }, { "entropy": 5.8204621315002445, "epoch": 0.3243016173072884, "grad_norm": 0.92578125, "learning_rate": 0.0004993487320084892, "loss": 5.6885, "mean_token_accuracy": 0.1581684559583664, "num_tokens": 7115049.0, "step": 3860 }, { "entropy": 5.950232267379761, "epoch": 0.32472169712245325, "grad_norm": 0.921875, "learning_rate": 0.0004993464531636573, "loss": 5.7875, "mean_token_accuracy": 0.1498127706348896, "num_tokens": 7124862.0, "step": 3865 }, { "entropy": 5.82954216003418, "epoch": 0.32514177693761814, "grad_norm": 0.984375, "learning_rate": 0.0004993441703446435, "loss": 5.6777, "mean_token_accuracy": 0.1620057240128517, "num_tokens": 7133280.0, "step": 3870 }, { "entropy": 5.929150485992432, "epoch": 0.32556185675278304, "grad_norm": 0.9921875, "learning_rate": 0.0004993418835514882, "loss": 5.8773, "mean_token_accuracy": 0.14564588218927382, "num_tokens": 7142446.0, "step": 3875 }, { "entropy": 5.9440654754638675, "epoch": 0.3259819365679479, "grad_norm": 0.875, "learning_rate": 0.0004993395927842321, "loss": 5.7755, "mean_token_accuracy": 0.14392856359481812, "num_tokens": 7152143.0, "step": 3880 }, { "entropy": 6.021526956558228, "epoch": 0.3264020163831128, "grad_norm": 0.98046875, "learning_rate": 0.0004993372980429155, "loss": 5.8501, "mean_token_accuracy": 0.14762358814477922, "num_tokens": 7162046.0, "step": 3885 }, { "entropy": 5.937510824203491, "epoch": 0.3268220961982777, "grad_norm": 0.95703125, "learning_rate": 0.0004993349993275792, "loss": 5.7358, "mean_token_accuracy": 0.1501179426908493, "num_tokens": 7171557.0, "step": 3890 }, { "entropy": 5.722299528121948, "epoch": 0.3272421760134426, "grad_norm": 0.86328125, "learning_rate": 0.0004993326966382639, "loss": 5.6455, "mean_token_accuracy": 0.15715345591306687, "num_tokens": 7180927.0, "step": 3895 }, { "entropy": 5.841052865982055, "epoch": 0.3276622558286074, "grad_norm": 1.0546875, "learning_rate": 0.0004993303899750104, "loss": 5.728, "mean_token_accuracy": 0.15390928834676743, "num_tokens": 7189552.0, "step": 3900 }, { "entropy": 5.984076976776123, "epoch": 0.3280823356437723, "grad_norm": 1.015625, "learning_rate": 0.0004993280793378595, "loss": 5.7447, "mean_token_accuracy": 0.14799359515309335, "num_tokens": 7197857.0, "step": 3905 }, { "entropy": 5.883258295059204, "epoch": 0.3285024154589372, "grad_norm": 0.9921875, "learning_rate": 0.0004993257647268522, "loss": 5.7153, "mean_token_accuracy": 0.15892730355262757, "num_tokens": 7206785.0, "step": 3910 }, { "entropy": 5.8749652862548825, "epoch": 0.32892249527410206, "grad_norm": 0.9140625, "learning_rate": 0.0004993234461420295, "loss": 5.8032, "mean_token_accuracy": 0.1540107510983944, "num_tokens": 7216360.0, "step": 3915 }, { "entropy": 5.903149938583374, "epoch": 0.32934257508926695, "grad_norm": 1.015625, "learning_rate": 0.0004993211235834326, "loss": 5.6111, "mean_token_accuracy": 0.1713676080107689, "num_tokens": 7224890.0, "step": 3920 }, { "entropy": 5.803111982345581, "epoch": 0.32976265490443185, "grad_norm": 1.078125, "learning_rate": 0.0004993187970511023, "loss": 5.6647, "mean_token_accuracy": 0.17485086023807525, "num_tokens": 7234442.0, "step": 3925 }, { "entropy": 5.873620986938477, "epoch": 0.33018273471959675, "grad_norm": 0.94921875, "learning_rate": 0.0004993164665450801, "loss": 5.8228, "mean_token_accuracy": 0.15156899392604828, "num_tokens": 7244023.0, "step": 3930 }, { "entropy": 5.843383169174194, "epoch": 0.3306028145347616, "grad_norm": 0.9140625, "learning_rate": 0.0004993141320654072, "loss": 5.6665, "mean_token_accuracy": 0.15884078443050384, "num_tokens": 7253548.0, "step": 3935 }, { "entropy": 5.8344789981842045, "epoch": 0.3310228943499265, "grad_norm": 0.9609375, "learning_rate": 0.000499311793612125, "loss": 5.7347, "mean_token_accuracy": 0.15194563269615174, "num_tokens": 7262962.0, "step": 3940 }, { "entropy": 5.9449968338012695, "epoch": 0.3314429741650914, "grad_norm": 0.91015625, "learning_rate": 0.0004993094511852748, "loss": 5.7609, "mean_token_accuracy": 0.14924739301204681, "num_tokens": 7272234.0, "step": 3945 }, { "entropy": 5.968133401870728, "epoch": 0.33186305398025623, "grad_norm": 0.984375, "learning_rate": 0.0004993071047848983, "loss": 5.7413, "mean_token_accuracy": 0.15319221317768097, "num_tokens": 7281524.0, "step": 3950 }, { "entropy": 5.790039682388306, "epoch": 0.3322831337954211, "grad_norm": 0.98046875, "learning_rate": 0.0004993047544110368, "loss": 5.6528, "mean_token_accuracy": 0.15719158425927163, "num_tokens": 7289601.0, "step": 3955 }, { "entropy": 5.721573781967163, "epoch": 0.332703213610586, "grad_norm": 1.046875, "learning_rate": 0.0004993024000637321, "loss": 5.6074, "mean_token_accuracy": 0.16373219192028046, "num_tokens": 7298508.0, "step": 3960 }, { "entropy": 5.854639863967895, "epoch": 0.33312329342575087, "grad_norm": 0.9296875, "learning_rate": 0.0004993000417430259, "loss": 5.8333, "mean_token_accuracy": 0.14586606696248056, "num_tokens": 7309065.0, "step": 3965 }, { "entropy": 6.050255537033081, "epoch": 0.33354337324091576, "grad_norm": 0.86328125, "learning_rate": 0.00049929767944896, "loss": 5.8607, "mean_token_accuracy": 0.14968539252877236, "num_tokens": 7319669.0, "step": 3970 }, { "entropy": 5.973075866699219, "epoch": 0.33396345305608066, "grad_norm": 0.96875, "learning_rate": 0.0004992953131815761, "loss": 5.7964, "mean_token_accuracy": 0.14924187809228898, "num_tokens": 7328425.0, "step": 3975 }, { "entropy": 5.858473682403565, "epoch": 0.33438353287124556, "grad_norm": 1.0703125, "learning_rate": 0.0004992929429409164, "loss": 5.6701, "mean_token_accuracy": 0.15970652550458908, "num_tokens": 7337369.0, "step": 3980 }, { "entropy": 5.832104206085205, "epoch": 0.3348036126864104, "grad_norm": 0.92578125, "learning_rate": 0.0004992905687270225, "loss": 5.7375, "mean_token_accuracy": 0.15307654216885566, "num_tokens": 7346829.0, "step": 3985 }, { "entropy": 5.9267027378082275, "epoch": 0.3352236925015753, "grad_norm": 0.96484375, "learning_rate": 0.0004992881905399368, "loss": 5.7952, "mean_token_accuracy": 0.14916737228631974, "num_tokens": 7355976.0, "step": 3990 }, { "entropy": 5.941111850738525, "epoch": 0.3356437723167402, "grad_norm": 1.0859375, "learning_rate": 0.0004992858083797013, "loss": 5.7675, "mean_token_accuracy": 0.1473349630832672, "num_tokens": 7365210.0, "step": 3995 }, { "entropy": 5.9041369438171385, "epoch": 0.33606385213190504, "grad_norm": 1.015625, "learning_rate": 0.0004992834222463581, "loss": 5.8093, "mean_token_accuracy": 0.14046019837260246, "num_tokens": 7374175.0, "step": 4000 }, { "entropy": 5.923312139511109, "epoch": 0.33648393194706994, "grad_norm": 0.94921875, "learning_rate": 0.0004992810321399496, "loss": 5.8383, "mean_token_accuracy": 0.147621788084507, "num_tokens": 7383302.0, "step": 4005 }, { "entropy": 5.99611988067627, "epoch": 0.33690401176223483, "grad_norm": 0.96484375, "learning_rate": 0.0004992786380605182, "loss": 5.8018, "mean_token_accuracy": 0.15006497725844384, "num_tokens": 7392746.0, "step": 4010 }, { "entropy": 5.865422248840332, "epoch": 0.33732409157739973, "grad_norm": 1.0, "learning_rate": 0.0004992762400081062, "loss": 5.6537, "mean_token_accuracy": 0.1529911682009697, "num_tokens": 7401604.0, "step": 4015 }, { "entropy": 5.859767580032349, "epoch": 0.3377441713925646, "grad_norm": 0.99609375, "learning_rate": 0.0004992738379827559, "loss": 5.7575, "mean_token_accuracy": 0.15247822627425195, "num_tokens": 7410594.0, "step": 4020 }, { "entropy": 5.920141792297363, "epoch": 0.33816425120772947, "grad_norm": 0.9765625, "learning_rate": 0.0004992714319845101, "loss": 5.658, "mean_token_accuracy": 0.16050563454627992, "num_tokens": 7418831.0, "step": 4025 }, { "entropy": 5.809229993820191, "epoch": 0.33858433102289437, "grad_norm": 0.92578125, "learning_rate": 0.0004992690220134116, "loss": 5.7047, "mean_token_accuracy": 0.15451119393110274, "num_tokens": 7427731.0, "step": 4030 }, { "entropy": 5.96991548538208, "epoch": 0.3390044108380592, "grad_norm": 1.0, "learning_rate": 0.0004992666080695027, "loss": 5.8101, "mean_token_accuracy": 0.14591643139719962, "num_tokens": 7436447.0, "step": 4035 }, { "entropy": 5.9149298667907715, "epoch": 0.3394244906532241, "grad_norm": 1.015625, "learning_rate": 0.0004992641901528262, "loss": 5.7195, "mean_token_accuracy": 0.15583046823740004, "num_tokens": 7445352.0, "step": 4040 }, { "entropy": 5.908085584640503, "epoch": 0.339844570468389, "grad_norm": 0.89453125, "learning_rate": 0.0004992617682634252, "loss": 5.7887, "mean_token_accuracy": 0.1540717288851738, "num_tokens": 7454298.0, "step": 4045 }, { "entropy": 5.891385746002197, "epoch": 0.34026465028355385, "grad_norm": 0.8828125, "learning_rate": 0.0004992593424013424, "loss": 5.7978, "mean_token_accuracy": 0.15331364274024964, "num_tokens": 7463543.0, "step": 4050 }, { "entropy": 5.913450384140015, "epoch": 0.34068473009871875, "grad_norm": 0.98046875, "learning_rate": 0.0004992569125666209, "loss": 5.8148, "mean_token_accuracy": 0.14611926972866057, "num_tokens": 7472701.0, "step": 4055 }, { "entropy": 6.010456657409668, "epoch": 0.34110480991388364, "grad_norm": 0.92578125, "learning_rate": 0.0004992544787593037, "loss": 5.817, "mean_token_accuracy": 0.14246124625205994, "num_tokens": 7481123.0, "step": 4060 }, { "entropy": 5.905852317810059, "epoch": 0.34152488972904854, "grad_norm": 0.92578125, "learning_rate": 0.0004992520409794338, "loss": 5.8641, "mean_token_accuracy": 0.1508338287472725, "num_tokens": 7490439.0, "step": 4065 }, { "entropy": 5.901952314376831, "epoch": 0.3419449695442134, "grad_norm": 0.953125, "learning_rate": 0.0004992495992270544, "loss": 5.7351, "mean_token_accuracy": 0.1509379267692566, "num_tokens": 7499326.0, "step": 4070 }, { "entropy": 5.938205337524414, "epoch": 0.3423650493593783, "grad_norm": 0.92578125, "learning_rate": 0.0004992471535022089, "loss": 5.7857, "mean_token_accuracy": 0.1451237343251705, "num_tokens": 7509407.0, "step": 4075 }, { "entropy": 5.869676685333252, "epoch": 0.3427851291745432, "grad_norm": 0.92578125, "learning_rate": 0.0004992447038049405, "loss": 5.829, "mean_token_accuracy": 0.14850043952465058, "num_tokens": 7518443.0, "step": 4080 }, { "entropy": 5.861940097808838, "epoch": 0.343205208989708, "grad_norm": 1.015625, "learning_rate": 0.0004992422501352927, "loss": 5.6977, "mean_token_accuracy": 0.15755705237388612, "num_tokens": 7527609.0, "step": 4085 }, { "entropy": 5.978248167037964, "epoch": 0.3436252888048729, "grad_norm": 1.0234375, "learning_rate": 0.0004992397924933089, "loss": 5.7788, "mean_token_accuracy": 0.15250536054372787, "num_tokens": 7536890.0, "step": 4090 }, { "entropy": 5.899935388565064, "epoch": 0.3440453686200378, "grad_norm": 0.99609375, "learning_rate": 0.0004992373308790325, "loss": 5.731, "mean_token_accuracy": 0.1621832400560379, "num_tokens": 7546509.0, "step": 4095 }, { "entropy": 5.818875694274903, "epoch": 0.3444654484352027, "grad_norm": 0.9765625, "learning_rate": 0.0004992348652925074, "loss": 5.7667, "mean_token_accuracy": 0.15332106947898866, "num_tokens": 7555336.0, "step": 4100 }, { "entropy": 5.907353639602661, "epoch": 0.34488552825036756, "grad_norm": 1.046875, "learning_rate": 0.0004992323957337771, "loss": 5.7278, "mean_token_accuracy": 0.1509070634841919, "num_tokens": 7565210.0, "step": 4105 }, { "entropy": 5.929575109481812, "epoch": 0.34530560806553245, "grad_norm": 0.89453125, "learning_rate": 0.0004992299222028855, "loss": 5.8127, "mean_token_accuracy": 0.15723925679922104, "num_tokens": 7574516.0, "step": 4110 }, { "entropy": 5.839164924621582, "epoch": 0.34572568788069735, "grad_norm": 0.9921875, "learning_rate": 0.0004992274446998761, "loss": 5.6588, "mean_token_accuracy": 0.1544717237353325, "num_tokens": 7583219.0, "step": 4115 }, { "entropy": 5.916603851318359, "epoch": 0.3461457676958622, "grad_norm": 0.9765625, "learning_rate": 0.0004992249632247929, "loss": 5.902, "mean_token_accuracy": 0.14321533888578414, "num_tokens": 7592050.0, "step": 4120 }, { "entropy": 5.9809043407440186, "epoch": 0.3465658475110271, "grad_norm": 0.95703125, "learning_rate": 0.0004992224777776802, "loss": 5.732, "mean_token_accuracy": 0.1493101716041565, "num_tokens": 7600718.0, "step": 4125 }, { "entropy": 5.901517105102539, "epoch": 0.346985927326192, "grad_norm": 0.98046875, "learning_rate": 0.0004992199883585816, "loss": 5.7557, "mean_token_accuracy": 0.15382387340068818, "num_tokens": 7609191.0, "step": 4130 }, { "entropy": 5.910360288619995, "epoch": 0.34740600714135683, "grad_norm": 0.98046875, "learning_rate": 0.0004992174949675413, "loss": 5.7894, "mean_token_accuracy": 0.152114437520504, "num_tokens": 7618509.0, "step": 4135 }, { "entropy": 5.890322875976563, "epoch": 0.34782608695652173, "grad_norm": 1.03125, "learning_rate": 0.0004992149976046037, "loss": 5.7136, "mean_token_accuracy": 0.15040391087532043, "num_tokens": 7627851.0, "step": 4140 }, { "entropy": 5.837684154510498, "epoch": 0.3482461667716866, "grad_norm": 0.921875, "learning_rate": 0.0004992124962698128, "loss": 5.7584, "mean_token_accuracy": 0.15606331154704095, "num_tokens": 7636748.0, "step": 4145 }, { "entropy": 5.921899652481079, "epoch": 0.3486662465868515, "grad_norm": 1.03125, "learning_rate": 0.000499209990963213, "loss": 5.7078, "mean_token_accuracy": 0.15208663642406464, "num_tokens": 7645436.0, "step": 4150 }, { "entropy": 5.917012548446655, "epoch": 0.34908632640201637, "grad_norm": 0.98828125, "learning_rate": 0.0004992074816848487, "loss": 5.8094, "mean_token_accuracy": 0.15278877168893815, "num_tokens": 7655414.0, "step": 4155 }, { "entropy": 5.772976493835449, "epoch": 0.34950640621718126, "grad_norm": 0.98828125, "learning_rate": 0.0004992049684347642, "loss": 5.6074, "mean_token_accuracy": 0.15534141510725022, "num_tokens": 7664295.0, "step": 4160 }, { "entropy": 5.917826843261719, "epoch": 0.34992648603234616, "grad_norm": 1.0546875, "learning_rate": 0.0004992024512130042, "loss": 5.7416, "mean_token_accuracy": 0.15260617434978485, "num_tokens": 7673295.0, "step": 4165 }, { "entropy": 5.788580131530762, "epoch": 0.350346565847511, "grad_norm": 0.859375, "learning_rate": 0.0004991999300196132, "loss": 5.7469, "mean_token_accuracy": 0.15305035635828973, "num_tokens": 7682932.0, "step": 4170 }, { "entropy": 5.923834562301636, "epoch": 0.3507666456626759, "grad_norm": 1.0078125, "learning_rate": 0.0004991974048546359, "loss": 5.753, "mean_token_accuracy": 0.1500132530927658, "num_tokens": 7692105.0, "step": 4175 }, { "entropy": 5.925296068191528, "epoch": 0.3511867254778408, "grad_norm": 0.9765625, "learning_rate": 0.000499194875718117, "loss": 5.7511, "mean_token_accuracy": 0.15551865100860596, "num_tokens": 7701294.0, "step": 4180 }, { "entropy": 5.861107254028321, "epoch": 0.3516068052930057, "grad_norm": 0.92578125, "learning_rate": 0.0004991923426101013, "loss": 5.7386, "mean_token_accuracy": 0.14845747649669647, "num_tokens": 7710964.0, "step": 4185 }, { "entropy": 5.949919605255127, "epoch": 0.35202688510817054, "grad_norm": 0.953125, "learning_rate": 0.0004991898055306337, "loss": 5.8577, "mean_token_accuracy": 0.14658492356538771, "num_tokens": 7719938.0, "step": 4190 }, { "entropy": 5.951687955856324, "epoch": 0.35244696492333544, "grad_norm": 0.89453125, "learning_rate": 0.0004991872644797591, "loss": 5.7808, "mean_token_accuracy": 0.15141311138868332, "num_tokens": 7729129.0, "step": 4195 }, { "entropy": 5.855287361145019, "epoch": 0.35286704473850034, "grad_norm": 1.03125, "learning_rate": 0.0004991847194575226, "loss": 5.7901, "mean_token_accuracy": 0.14619968980550765, "num_tokens": 7738506.0, "step": 4200 }, { "entropy": 5.942954778671265, "epoch": 0.3532871245536652, "grad_norm": 0.9140625, "learning_rate": 0.0004991821704639693, "loss": 5.8959, "mean_token_accuracy": 0.14654064998030664, "num_tokens": 7749320.0, "step": 4205 }, { "entropy": 6.01116943359375, "epoch": 0.3537072043688301, "grad_norm": 1.0, "learning_rate": 0.0004991796174991443, "loss": 5.7415, "mean_token_accuracy": 0.1537883497774601, "num_tokens": 7758735.0, "step": 4210 }, { "entropy": 5.822880458831787, "epoch": 0.354127284183995, "grad_norm": 0.99609375, "learning_rate": 0.0004991770605630927, "loss": 5.7132, "mean_token_accuracy": 0.15271057039499283, "num_tokens": 7767556.0, "step": 4215 }, { "entropy": 5.818714237213134, "epoch": 0.3545473639991598, "grad_norm": 0.9765625, "learning_rate": 0.0004991744996558599, "loss": 5.7336, "mean_token_accuracy": 0.15282744243741037, "num_tokens": 7776615.0, "step": 4220 }, { "entropy": 5.915001726150512, "epoch": 0.3549674438143247, "grad_norm": 0.94921875, "learning_rate": 0.0004991719347774913, "loss": 5.7682, "mean_token_accuracy": 0.15577882081270217, "num_tokens": 7785288.0, "step": 4225 }, { "entropy": 5.918221855163575, "epoch": 0.3553875236294896, "grad_norm": 0.95703125, "learning_rate": 0.0004991693659280324, "loss": 5.6811, "mean_token_accuracy": 0.15442655980587006, "num_tokens": 7794381.0, "step": 4230 }, { "entropy": 5.821169424057007, "epoch": 0.3558076034446545, "grad_norm": 0.984375, "learning_rate": 0.0004991667931075284, "loss": 5.6546, "mean_token_accuracy": 0.15124934762716294, "num_tokens": 7803265.0, "step": 4235 }, { "entropy": 5.829122161865234, "epoch": 0.35622768325981935, "grad_norm": 0.9296875, "learning_rate": 0.0004991642163160252, "loss": 5.7671, "mean_token_accuracy": 0.15388772487640381, "num_tokens": 7812445.0, "step": 4240 }, { "entropy": 5.934730339050293, "epoch": 0.35664776307498425, "grad_norm": 0.87109375, "learning_rate": 0.0004991616355535684, "loss": 5.7542, "mean_token_accuracy": 0.15821312218904496, "num_tokens": 7822073.0, "step": 4245 }, { "entropy": 5.918817663192749, "epoch": 0.35706784289014915, "grad_norm": 0.9375, "learning_rate": 0.0004991590508202036, "loss": 5.7264, "mean_token_accuracy": 0.15280235260725022, "num_tokens": 7831193.0, "step": 4250 }, { "entropy": 5.89573392868042, "epoch": 0.357487922705314, "grad_norm": 0.9765625, "learning_rate": 0.0004991564621159766, "loss": 5.7728, "mean_token_accuracy": 0.15194582045078278, "num_tokens": 7840311.0, "step": 4255 }, { "entropy": 5.8731294631958, "epoch": 0.3579080025204789, "grad_norm": 0.9609375, "learning_rate": 0.0004991538694409334, "loss": 5.7954, "mean_token_accuracy": 0.14721263125538825, "num_tokens": 7849622.0, "step": 4260 }, { "entropy": 5.876342821121216, "epoch": 0.3583280823356438, "grad_norm": 1.03125, "learning_rate": 0.0004991512727951198, "loss": 5.7558, "mean_token_accuracy": 0.15003474354743956, "num_tokens": 7859494.0, "step": 4265 }, { "entropy": 5.9838221073150635, "epoch": 0.3587481621508087, "grad_norm": 0.93359375, "learning_rate": 0.0004991486721785818, "loss": 5.8503, "mean_token_accuracy": 0.14846469163894654, "num_tokens": 7868526.0, "step": 4270 }, { "entropy": 5.859622812271118, "epoch": 0.3591682419659735, "grad_norm": 0.99609375, "learning_rate": 0.0004991460675913655, "loss": 5.6799, "mean_token_accuracy": 0.1537486046552658, "num_tokens": 7877631.0, "step": 4275 }, { "entropy": 5.85202202796936, "epoch": 0.3595883217811384, "grad_norm": 0.96875, "learning_rate": 0.000499143459033517, "loss": 5.7338, "mean_token_accuracy": 0.15869542211294174, "num_tokens": 7886814.0, "step": 4280 }, { "entropy": 5.794212818145752, "epoch": 0.3600084015963033, "grad_norm": 0.9765625, "learning_rate": 0.0004991408465050825, "loss": 5.5727, "mean_token_accuracy": 0.1595866084098816, "num_tokens": 7896337.0, "step": 4285 }, { "entropy": 5.852896070480346, "epoch": 0.36042848141146816, "grad_norm": 0.890625, "learning_rate": 0.0004991382300061084, "loss": 5.8163, "mean_token_accuracy": 0.14354490041732787, "num_tokens": 7906071.0, "step": 4290 }, { "entropy": 5.937732839584351, "epoch": 0.36084856122663306, "grad_norm": 0.92578125, "learning_rate": 0.0004991356095366409, "loss": 5.8111, "mean_token_accuracy": 0.14974057525396348, "num_tokens": 7915003.0, "step": 4295 }, { "entropy": 5.904038953781128, "epoch": 0.36126864104179796, "grad_norm": 0.94140625, "learning_rate": 0.0004991329850967266, "loss": 5.6791, "mean_token_accuracy": 0.15475230365991594, "num_tokens": 7924408.0, "step": 4300 }, { "entropy": 5.8507331848144535, "epoch": 0.3616887208569628, "grad_norm": 0.89453125, "learning_rate": 0.0004991303566864118, "loss": 5.637, "mean_token_accuracy": 0.1542945459485054, "num_tokens": 7934717.0, "step": 4305 }, { "entropy": 5.7739667892456055, "epoch": 0.3621088006721277, "grad_norm": 0.88671875, "learning_rate": 0.0004991277243057431, "loss": 5.7101, "mean_token_accuracy": 0.1505005143582821, "num_tokens": 7944278.0, "step": 4310 }, { "entropy": 5.808600950241089, "epoch": 0.3625288804872926, "grad_norm": 0.9609375, "learning_rate": 0.0004991250879547673, "loss": 5.7235, "mean_token_accuracy": 0.1538018502295017, "num_tokens": 7953344.0, "step": 4315 }, { "entropy": 5.829892158508301, "epoch": 0.3629489603024575, "grad_norm": 0.90234375, "learning_rate": 0.0004991224476335309, "loss": 5.7448, "mean_token_accuracy": 0.149826068431139, "num_tokens": 7962869.0, "step": 4320 }, { "entropy": 5.963926601409912, "epoch": 0.36336904011762233, "grad_norm": 0.98046875, "learning_rate": 0.0004991198033420807, "loss": 5.7344, "mean_token_accuracy": 0.15306216776371, "num_tokens": 7971981.0, "step": 4325 }, { "entropy": 5.884770917892456, "epoch": 0.36378911993278723, "grad_norm": 0.91796875, "learning_rate": 0.0004991171550804636, "loss": 5.7019, "mean_token_accuracy": 0.15474960654973985, "num_tokens": 7980979.0, "step": 4330 }, { "entropy": 5.863976860046387, "epoch": 0.36420919974795213, "grad_norm": 0.9453125, "learning_rate": 0.0004991145028487266, "loss": 5.7748, "mean_token_accuracy": 0.1529791235923767, "num_tokens": 7989607.0, "step": 4335 }, { "entropy": 5.7957190990448, "epoch": 0.36462927956311697, "grad_norm": 0.91796875, "learning_rate": 0.0004991118466469165, "loss": 5.5897, "mean_token_accuracy": 0.1639975592494011, "num_tokens": 7998356.0, "step": 4340 }, { "entropy": 5.849919033050537, "epoch": 0.36504935937828187, "grad_norm": 0.9609375, "learning_rate": 0.0004991091864750805, "loss": 5.7033, "mean_token_accuracy": 0.1553362563252449, "num_tokens": 8007596.0, "step": 4345 }, { "entropy": 5.909917688369751, "epoch": 0.36546943919344677, "grad_norm": 0.94921875, "learning_rate": 0.0004991065223332655, "loss": 5.7587, "mean_token_accuracy": 0.15085091739892958, "num_tokens": 8016493.0, "step": 4350 }, { "entropy": 5.884606981277466, "epoch": 0.36588951900861166, "grad_norm": 0.984375, "learning_rate": 0.0004991038542215191, "loss": 5.7272, "mean_token_accuracy": 0.1481338232755661, "num_tokens": 8025867.0, "step": 4355 }, { "entropy": 5.814969539642334, "epoch": 0.3663095988237765, "grad_norm": 0.921875, "learning_rate": 0.0004991011821398882, "loss": 5.7464, "mean_token_accuracy": 0.15548805743455887, "num_tokens": 8036251.0, "step": 4360 }, { "entropy": 5.905033826828003, "epoch": 0.3667296786389414, "grad_norm": 1.0390625, "learning_rate": 0.0004990985060884202, "loss": 5.7024, "mean_token_accuracy": 0.1582213595509529, "num_tokens": 8045647.0, "step": 4365 }, { "entropy": 5.88990044593811, "epoch": 0.3671497584541063, "grad_norm": 0.90625, "learning_rate": 0.0004990958260671627, "loss": 5.79, "mean_token_accuracy": 0.1454270862042904, "num_tokens": 8056025.0, "step": 4370 }, { "entropy": 5.809770679473877, "epoch": 0.36756983826927114, "grad_norm": 0.98828125, "learning_rate": 0.0004990931420761629, "loss": 5.7083, "mean_token_accuracy": 0.16103482097387314, "num_tokens": 8065029.0, "step": 4375 }, { "entropy": 5.914457368850708, "epoch": 0.36798991808443604, "grad_norm": 1.015625, "learning_rate": 0.0004990904541154685, "loss": 5.6763, "mean_token_accuracy": 0.16559941172599793, "num_tokens": 8073249.0, "step": 4380 }, { "entropy": 5.894069719314575, "epoch": 0.36840999789960094, "grad_norm": 1.0, "learning_rate": 0.0004990877621851271, "loss": 5.8002, "mean_token_accuracy": 0.153408020734787, "num_tokens": 8082039.0, "step": 4385 }, { "entropy": 5.8033387660980225, "epoch": 0.3688300777147658, "grad_norm": 1.078125, "learning_rate": 0.0004990850662851863, "loss": 5.6375, "mean_token_accuracy": 0.15707656592130662, "num_tokens": 8090011.0, "step": 4390 }, { "entropy": 5.879843854904175, "epoch": 0.3692501575299307, "grad_norm": 0.95703125, "learning_rate": 0.0004990823664156941, "loss": 5.7455, "mean_token_accuracy": 0.1648575708270073, "num_tokens": 8099934.0, "step": 4395 }, { "entropy": 5.963798093795776, "epoch": 0.3696702373450956, "grad_norm": 0.97265625, "learning_rate": 0.0004990796625766981, "loss": 5.7681, "mean_token_accuracy": 0.14946894496679305, "num_tokens": 8108969.0, "step": 4400 }, { "entropy": 5.835124111175537, "epoch": 0.3700903171602605, "grad_norm": 0.98046875, "learning_rate": 0.0004990769547682462, "loss": 5.6935, "mean_token_accuracy": 0.15169232487678527, "num_tokens": 8117372.0, "step": 4405 }, { "entropy": 5.979207563400268, "epoch": 0.3705103969754253, "grad_norm": 0.9375, "learning_rate": 0.0004990742429903866, "loss": 5.8757, "mean_token_accuracy": 0.14571133852005005, "num_tokens": 8127108.0, "step": 4410 }, { "entropy": 5.961515951156616, "epoch": 0.3709304767905902, "grad_norm": 0.8984375, "learning_rate": 0.000499071527243167, "loss": 5.8507, "mean_token_accuracy": 0.14516980648040773, "num_tokens": 8137392.0, "step": 4415 }, { "entropy": 5.880073976516724, "epoch": 0.3713505566057551, "grad_norm": 0.984375, "learning_rate": 0.0004990688075266357, "loss": 5.7019, "mean_token_accuracy": 0.15986401289701463, "num_tokens": 8146257.0, "step": 4420 }, { "entropy": 5.805649709701538, "epoch": 0.37177063642091995, "grad_norm": 0.96484375, "learning_rate": 0.0004990660838408409, "loss": 5.6521, "mean_token_accuracy": 0.15721987932920456, "num_tokens": 8154952.0, "step": 4425 }, { "entropy": 5.893301391601563, "epoch": 0.37219071623608485, "grad_norm": 0.921875, "learning_rate": 0.0004990633561858308, "loss": 5.7106, "mean_token_accuracy": 0.14765800014138222, "num_tokens": 8164365.0, "step": 4430 }, { "entropy": 5.924961233139038, "epoch": 0.37261079605124975, "grad_norm": 1.0390625, "learning_rate": 0.0004990606245616537, "loss": 5.7205, "mean_token_accuracy": 0.15445269271731377, "num_tokens": 8172614.0, "step": 4435 }, { "entropy": 5.877901983261109, "epoch": 0.37303087586641465, "grad_norm": 1.0, "learning_rate": 0.0004990578889683579, "loss": 5.7888, "mean_token_accuracy": 0.150545197725296, "num_tokens": 8182445.0, "step": 4440 }, { "entropy": 5.885668134689331, "epoch": 0.3734509556815795, "grad_norm": 0.90234375, "learning_rate": 0.0004990551494059921, "loss": 5.6613, "mean_token_accuracy": 0.15747766494750975, "num_tokens": 8191871.0, "step": 4445 }, { "entropy": 5.893858480453491, "epoch": 0.3738710354967444, "grad_norm": 0.9375, "learning_rate": 0.0004990524058746047, "loss": 5.8285, "mean_token_accuracy": 0.15561486929655075, "num_tokens": 8200658.0, "step": 4450 }, { "entropy": 5.879518842697143, "epoch": 0.3742911153119093, "grad_norm": 1.015625, "learning_rate": 0.0004990496583742443, "loss": 5.7547, "mean_token_accuracy": 0.15101703256368637, "num_tokens": 8209776.0, "step": 4455 }, { "entropy": 5.868221950531006, "epoch": 0.3747111951270741, "grad_norm": 1.046875, "learning_rate": 0.0004990469069049596, "loss": 5.6747, "mean_token_accuracy": 0.15401403456926346, "num_tokens": 8219401.0, "step": 4460 }, { "entropy": 5.809508180618286, "epoch": 0.375131274942239, "grad_norm": 0.9765625, "learning_rate": 0.0004990441514667993, "loss": 5.7095, "mean_token_accuracy": 0.15698247104883195, "num_tokens": 8228762.0, "step": 4465 }, { "entropy": 5.932300424575805, "epoch": 0.3755513547574039, "grad_norm": 0.984375, "learning_rate": 0.0004990413920598121, "loss": 5.7223, "mean_token_accuracy": 0.15662275701761247, "num_tokens": 8236612.0, "step": 4470 }, { "entropy": 5.896757698059082, "epoch": 0.37597143457256876, "grad_norm": 1.0625, "learning_rate": 0.0004990386286840471, "loss": 5.7335, "mean_token_accuracy": 0.15207386016845703, "num_tokens": 8245043.0, "step": 4475 }, { "entropy": 5.995736980438233, "epoch": 0.37639151438773366, "grad_norm": 0.9921875, "learning_rate": 0.0004990358613395532, "loss": 5.8307, "mean_token_accuracy": 0.15044568330049515, "num_tokens": 8255270.0, "step": 4480 }, { "entropy": 5.938156318664551, "epoch": 0.37681159420289856, "grad_norm": 0.8828125, "learning_rate": 0.0004990330900263792, "loss": 5.7971, "mean_token_accuracy": 0.14653817862272261, "num_tokens": 8264761.0, "step": 4485 }, { "entropy": 5.8954840183258055, "epoch": 0.37723167401806346, "grad_norm": 0.9296875, "learning_rate": 0.0004990303147445745, "loss": 5.7454, "mean_token_accuracy": 0.15479619354009627, "num_tokens": 8274308.0, "step": 4490 }, { "entropy": 5.815971899032593, "epoch": 0.3776517538332283, "grad_norm": 0.96875, "learning_rate": 0.0004990275354941881, "loss": 5.6288, "mean_token_accuracy": 0.1646218091249466, "num_tokens": 8283323.0, "step": 4495 }, { "entropy": 5.893220853805542, "epoch": 0.3780718336483932, "grad_norm": 0.9296875, "learning_rate": 0.0004990247522752694, "loss": 5.9629, "mean_token_accuracy": 0.14029839560389518, "num_tokens": 8293452.0, "step": 4500 }, { "entropy": 5.897252893447876, "epoch": 0.3784919134635581, "grad_norm": 0.9453125, "learning_rate": 0.0004990219650878674, "loss": 5.6576, "mean_token_accuracy": 0.16113524734973908, "num_tokens": 8302941.0, "step": 4505 }, { "entropy": 5.781876134872436, "epoch": 0.37891199327872294, "grad_norm": 1.4140625, "learning_rate": 0.0004990191739320318, "loss": 5.6671, "mean_token_accuracy": 0.1652265876531601, "num_tokens": 8311811.0, "step": 4510 }, { "entropy": 5.76027250289917, "epoch": 0.37933207309388783, "grad_norm": 0.9296875, "learning_rate": 0.0004990163788078117, "loss": 5.5692, "mean_token_accuracy": 0.15842368602752685, "num_tokens": 8321130.0, "step": 4515 }, { "entropy": 5.842820358276367, "epoch": 0.37975215290905273, "grad_norm": 0.8984375, "learning_rate": 0.0004990135797152569, "loss": 5.6768, "mean_token_accuracy": 0.15367345213890077, "num_tokens": 8330233.0, "step": 4520 }, { "entropy": 5.770590019226074, "epoch": 0.3801722327242176, "grad_norm": 0.9609375, "learning_rate": 0.0004990107766544169, "loss": 5.6599, "mean_token_accuracy": 0.16070746779441833, "num_tokens": 8338585.0, "step": 4525 }, { "entropy": 5.844082069396973, "epoch": 0.38059231253938247, "grad_norm": 0.97265625, "learning_rate": 0.0004990079696253413, "loss": 5.7068, "mean_token_accuracy": 0.15848116278648378, "num_tokens": 8346618.0, "step": 4530 }, { "entropy": 5.902699136734009, "epoch": 0.38101239235454737, "grad_norm": 0.96484375, "learning_rate": 0.0004990051586280799, "loss": 5.6829, "mean_token_accuracy": 0.15385363698005677, "num_tokens": 8356273.0, "step": 4535 }, { "entropy": 5.847843742370605, "epoch": 0.38143247216971227, "grad_norm": 0.87890625, "learning_rate": 0.0004990023436626824, "loss": 5.674, "mean_token_accuracy": 0.15799472630023956, "num_tokens": 8366668.0, "step": 4540 }, { "entropy": 5.954341840744019, "epoch": 0.3818525519848771, "grad_norm": 1.1015625, "learning_rate": 0.0004989995247291988, "loss": 5.7933, "mean_token_accuracy": 0.15496921986341478, "num_tokens": 8375610.0, "step": 4545 }, { "entropy": 5.860501337051391, "epoch": 0.382272631800042, "grad_norm": 0.95703125, "learning_rate": 0.0004989967018276789, "loss": 5.6729, "mean_token_accuracy": 0.1558580845594406, "num_tokens": 8384455.0, "step": 4550 }, { "entropy": 5.7317808151245115, "epoch": 0.3826927116152069, "grad_norm": 0.9140625, "learning_rate": 0.0004989938749581727, "loss": 5.7105, "mean_token_accuracy": 0.14987761974334718, "num_tokens": 8393868.0, "step": 4555 }, { "entropy": 5.8618772506713865, "epoch": 0.38311279143037175, "grad_norm": 0.890625, "learning_rate": 0.0004989910441207305, "loss": 5.7312, "mean_token_accuracy": 0.15411882251501083, "num_tokens": 8402916.0, "step": 4560 }, { "entropy": 5.830321025848389, "epoch": 0.38353287124553664, "grad_norm": 1.0546875, "learning_rate": 0.0004989882093154023, "loss": 5.6485, "mean_token_accuracy": 0.1575123891234398, "num_tokens": 8411649.0, "step": 4565 }, { "entropy": 5.8616162776947025, "epoch": 0.38395295106070154, "grad_norm": 0.890625, "learning_rate": 0.0004989853705422381, "loss": 5.769, "mean_token_accuracy": 0.14645260721445083, "num_tokens": 8420393.0, "step": 4570 }, { "entropy": 5.813478136062622, "epoch": 0.38437303087586644, "grad_norm": 0.97265625, "learning_rate": 0.0004989825278012886, "loss": 5.6629, "mean_token_accuracy": 0.154879230260849, "num_tokens": 8429404.0, "step": 4575 }, { "entropy": 5.851570463180542, "epoch": 0.3847931106910313, "grad_norm": 1.078125, "learning_rate": 0.000498979681092604, "loss": 5.703, "mean_token_accuracy": 0.149764809012413, "num_tokens": 8438299.0, "step": 4580 }, { "entropy": 5.760462951660156, "epoch": 0.3852131905061962, "grad_norm": 0.88671875, "learning_rate": 0.0004989768304162345, "loss": 5.6615, "mean_token_accuracy": 0.15541962534189224, "num_tokens": 8447392.0, "step": 4585 }, { "entropy": 5.89559907913208, "epoch": 0.3856332703213611, "grad_norm": 0.90625, "learning_rate": 0.0004989739757722308, "loss": 5.7474, "mean_token_accuracy": 0.14751126170158385, "num_tokens": 8456361.0, "step": 4590 }, { "entropy": 5.852615118026733, "epoch": 0.3860533501365259, "grad_norm": 1.015625, "learning_rate": 0.0004989711171606436, "loss": 5.6747, "mean_token_accuracy": 0.15710035860538482, "num_tokens": 8465548.0, "step": 4595 }, { "entropy": 5.885403347015381, "epoch": 0.3864734299516908, "grad_norm": 0.921875, "learning_rate": 0.0004989682545815232, "loss": 5.6869, "mean_token_accuracy": 0.1525876745581627, "num_tokens": 8474454.0, "step": 4600 }, { "entropy": 5.8074538230896, "epoch": 0.3868935097668557, "grad_norm": 1.0625, "learning_rate": 0.0004989653880349207, "loss": 5.6074, "mean_token_accuracy": 0.1573283538222313, "num_tokens": 8482694.0, "step": 4605 }, { "entropy": 5.842355585098266, "epoch": 0.38731358958202056, "grad_norm": 0.99609375, "learning_rate": 0.0004989625175208864, "loss": 5.7257, "mean_token_accuracy": 0.15177675783634187, "num_tokens": 8491162.0, "step": 4610 }, { "entropy": 5.787636756896973, "epoch": 0.38773366939718545, "grad_norm": 0.9921875, "learning_rate": 0.0004989596430394717, "loss": 5.5752, "mean_token_accuracy": 0.17091956436634065, "num_tokens": 8500716.0, "step": 4615 }, { "entropy": 5.7534934997558596, "epoch": 0.38815374921235035, "grad_norm": 0.9453125, "learning_rate": 0.000498956764590727, "loss": 5.6231, "mean_token_accuracy": 0.1520329423248768, "num_tokens": 8508871.0, "step": 4620 }, { "entropy": 5.890595149993897, "epoch": 0.38857382902751525, "grad_norm": 1.03125, "learning_rate": 0.0004989538821747037, "loss": 5.8315, "mean_token_accuracy": 0.15000174939632416, "num_tokens": 8518450.0, "step": 4625 }, { "entropy": 5.941072607040406, "epoch": 0.3889939088426801, "grad_norm": 0.91015625, "learning_rate": 0.0004989509957914527, "loss": 5.7284, "mean_token_accuracy": 0.15086407959461212, "num_tokens": 8528238.0, "step": 4630 }, { "entropy": 5.794663810729981, "epoch": 0.389413988657845, "grad_norm": 0.89453125, "learning_rate": 0.0004989481054410251, "loss": 5.6258, "mean_token_accuracy": 0.1528220996260643, "num_tokens": 8537587.0, "step": 4635 }, { "entropy": 5.795312023162841, "epoch": 0.3898340684730099, "grad_norm": 0.9609375, "learning_rate": 0.0004989452111234721, "loss": 5.7462, "mean_token_accuracy": 0.1528109699487686, "num_tokens": 8547703.0, "step": 4640 }, { "entropy": 5.84535961151123, "epoch": 0.39025414828817473, "grad_norm": 1.0546875, "learning_rate": 0.000498942312838845, "loss": 5.6766, "mean_token_accuracy": 0.1572122886776924, "num_tokens": 8557001.0, "step": 4645 }, { "entropy": 5.796119689941406, "epoch": 0.3906742281033396, "grad_norm": 1.015625, "learning_rate": 0.0004989394105871952, "loss": 5.5616, "mean_token_accuracy": 0.16711176037788392, "num_tokens": 8565638.0, "step": 4650 }, { "entropy": 5.91137285232544, "epoch": 0.3910943079185045, "grad_norm": 1.046875, "learning_rate": 0.000498936504368574, "loss": 5.7305, "mean_token_accuracy": 0.15593890845775604, "num_tokens": 8574428.0, "step": 4655 }, { "entropy": 5.800365591049195, "epoch": 0.3915143877336694, "grad_norm": 0.93359375, "learning_rate": 0.0004989335941830329, "loss": 5.684, "mean_token_accuracy": 0.15439117401838304, "num_tokens": 8583157.0, "step": 4660 }, { "entropy": 5.817437553405762, "epoch": 0.39193446754883426, "grad_norm": 1.046875, "learning_rate": 0.0004989306800306236, "loss": 5.6621, "mean_token_accuracy": 0.149759341776371, "num_tokens": 8592382.0, "step": 4665 }, { "entropy": 5.7860520362854, "epoch": 0.39235454736399916, "grad_norm": 1.0078125, "learning_rate": 0.0004989277619113975, "loss": 5.6345, "mean_token_accuracy": 0.16216987669467925, "num_tokens": 8601058.0, "step": 4670 }, { "entropy": 5.875742197036743, "epoch": 0.39277462717916406, "grad_norm": 1.0234375, "learning_rate": 0.0004989248398254065, "loss": 5.7352, "mean_token_accuracy": 0.15142691284418106, "num_tokens": 8609479.0, "step": 4675 }, { "entropy": 5.859423112869263, "epoch": 0.3931947069943289, "grad_norm": 0.9296875, "learning_rate": 0.0004989219137727021, "loss": 5.7036, "mean_token_accuracy": 0.15549542009830475, "num_tokens": 8618860.0, "step": 4680 }, { "entropy": 5.81779637336731, "epoch": 0.3936147868094938, "grad_norm": 0.93359375, "learning_rate": 0.0004989189837533365, "loss": 5.6363, "mean_token_accuracy": 0.1587088495492935, "num_tokens": 8627462.0, "step": 4685 }, { "entropy": 5.924579429626465, "epoch": 0.3940348666246587, "grad_norm": 0.83203125, "learning_rate": 0.0004989160497673613, "loss": 5.8254, "mean_token_accuracy": 0.1513897880911827, "num_tokens": 8637569.0, "step": 4690 }, { "entropy": 5.850678825378418, "epoch": 0.39445494643982354, "grad_norm": 1.0546875, "learning_rate": 0.0004989131118148286, "loss": 5.6177, "mean_token_accuracy": 0.15605207085609435, "num_tokens": 8645440.0, "step": 4695 }, { "entropy": 5.835308980941773, "epoch": 0.39487502625498844, "grad_norm": 0.9453125, "learning_rate": 0.0004989101698957904, "loss": 5.7682, "mean_token_accuracy": 0.15626595616340638, "num_tokens": 8655077.0, "step": 4700 }, { "entropy": 5.830049610137939, "epoch": 0.39529510607015333, "grad_norm": 1.0390625, "learning_rate": 0.0004989072240102988, "loss": 5.6957, "mean_token_accuracy": 0.16012858897447585, "num_tokens": 8663126.0, "step": 4705 }, { "entropy": 5.901100158691406, "epoch": 0.39571518588531823, "grad_norm": 0.94921875, "learning_rate": 0.0004989042741584061, "loss": 5.6726, "mean_token_accuracy": 0.15270041525363923, "num_tokens": 8672386.0, "step": 4710 }, { "entropy": 5.7314942359924315, "epoch": 0.3961352657004831, "grad_norm": 0.91796875, "learning_rate": 0.0004989013203401645, "loss": 5.612, "mean_token_accuracy": 0.1580759972333908, "num_tokens": 8681930.0, "step": 4715 }, { "entropy": 5.797902965545655, "epoch": 0.396555345515648, "grad_norm": 0.9375, "learning_rate": 0.0004988983625556264, "loss": 5.6787, "mean_token_accuracy": 0.15581901967525483, "num_tokens": 8690993.0, "step": 4720 }, { "entropy": 5.798060894012451, "epoch": 0.39697542533081287, "grad_norm": 1.0234375, "learning_rate": 0.0004988954008048438, "loss": 5.672, "mean_token_accuracy": 0.15935962349176408, "num_tokens": 8699497.0, "step": 4725 }, { "entropy": 5.933620643615723, "epoch": 0.3973955051459777, "grad_norm": 0.9296875, "learning_rate": 0.0004988924350878697, "loss": 5.8568, "mean_token_accuracy": 0.14457278251647948, "num_tokens": 8709274.0, "step": 4730 }, { "entropy": 5.934816789627075, "epoch": 0.3978155849611426, "grad_norm": 0.96484375, "learning_rate": 0.0004988894654047563, "loss": 5.7297, "mean_token_accuracy": 0.15009873509407043, "num_tokens": 8718158.0, "step": 4735 }, { "entropy": 5.786411237716675, "epoch": 0.3982356647763075, "grad_norm": 0.93359375, "learning_rate": 0.0004988864917555562, "loss": 5.5866, "mean_token_accuracy": 0.15930677056312562, "num_tokens": 8727459.0, "step": 4740 }, { "entropy": 5.864226961135865, "epoch": 0.3986557445914724, "grad_norm": 0.9609375, "learning_rate": 0.0004988835141403224, "loss": 5.7293, "mean_token_accuracy": 0.15878916680812835, "num_tokens": 8737614.0, "step": 4745 }, { "entropy": 5.824589109420776, "epoch": 0.39907582440663725, "grad_norm": 0.9921875, "learning_rate": 0.0004988805325591073, "loss": 5.56, "mean_token_accuracy": 0.15695197582244874, "num_tokens": 8746799.0, "step": 4750 }, { "entropy": 5.8385083198547365, "epoch": 0.39949590422180214, "grad_norm": 0.96484375, "learning_rate": 0.0004988775470119639, "loss": 5.7326, "mean_token_accuracy": 0.14953183978796006, "num_tokens": 8756555.0, "step": 4755 }, { "entropy": 5.7729175090789795, "epoch": 0.39991598403696704, "grad_norm": 0.9296875, "learning_rate": 0.0004988745574989451, "loss": 5.7535, "mean_token_accuracy": 0.15938151776790618, "num_tokens": 8765849.0, "step": 4760 }, { "entropy": 5.965050411224365, "epoch": 0.4003360638521319, "grad_norm": 0.9296875, "learning_rate": 0.0004988715640201036, "loss": 5.8322, "mean_token_accuracy": 0.14530889242887496, "num_tokens": 8775713.0, "step": 4765 }, { "entropy": 5.839820480346679, "epoch": 0.4007561436672968, "grad_norm": 0.953125, "learning_rate": 0.0004988685665754928, "loss": 5.6466, "mean_token_accuracy": 0.1569948598742485, "num_tokens": 8784717.0, "step": 4770 }, { "entropy": 5.792028474807739, "epoch": 0.4011762234824617, "grad_norm": 0.98046875, "learning_rate": 0.0004988655651651656, "loss": 5.6649, "mean_token_accuracy": 0.15628512352705, "num_tokens": 8794388.0, "step": 4775 }, { "entropy": 5.755618572235107, "epoch": 0.4015963032976265, "grad_norm": 1.0078125, "learning_rate": 0.0004988625597891751, "loss": 5.6762, "mean_token_accuracy": 0.15925197303295135, "num_tokens": 8802436.0, "step": 4780 }, { "entropy": 5.85797004699707, "epoch": 0.4020163831127914, "grad_norm": 0.9140625, "learning_rate": 0.0004988595504475746, "loss": 5.6376, "mean_token_accuracy": 0.15845684409141542, "num_tokens": 8811184.0, "step": 4785 }, { "entropy": 5.920813274383545, "epoch": 0.4024364629279563, "grad_norm": 0.94921875, "learning_rate": 0.0004988565371404175, "loss": 5.7115, "mean_token_accuracy": 0.15826244726777078, "num_tokens": 8820525.0, "step": 4790 }, { "entropy": 5.790119886398315, "epoch": 0.4028565427431212, "grad_norm": 1.03125, "learning_rate": 0.0004988535198677571, "loss": 5.5798, "mean_token_accuracy": 0.16315356642007828, "num_tokens": 8828928.0, "step": 4795 }, { "entropy": 5.902295684814453, "epoch": 0.40327662255828606, "grad_norm": 1.0234375, "learning_rate": 0.0004988504986296469, "loss": 5.7884, "mean_token_accuracy": 0.1443356990814209, "num_tokens": 8838615.0, "step": 4800 }, { "entropy": 5.862144041061401, "epoch": 0.40369670237345096, "grad_norm": 0.88671875, "learning_rate": 0.0004988474734261404, "loss": 5.769, "mean_token_accuracy": 0.1485462300479412, "num_tokens": 8848709.0, "step": 4805 }, { "entropy": 5.8929126262664795, "epoch": 0.40411678218861585, "grad_norm": 0.9140625, "learning_rate": 0.0004988444442572911, "loss": 5.7251, "mean_token_accuracy": 0.14630650877952575, "num_tokens": 8858277.0, "step": 4810 }, { "entropy": 5.814572858810425, "epoch": 0.4045368620037807, "grad_norm": 0.9140625, "learning_rate": 0.0004988414111231528, "loss": 5.6716, "mean_token_accuracy": 0.15942000597715378, "num_tokens": 8868436.0, "step": 4815 }, { "entropy": 5.8521270751953125, "epoch": 0.4049569418189456, "grad_norm": 0.86328125, "learning_rate": 0.000498838374023779, "loss": 5.6738, "mean_token_accuracy": 0.15392234772443772, "num_tokens": 8877740.0, "step": 4820 }, { "entropy": 5.896619701385498, "epoch": 0.4053770216341105, "grad_norm": 0.875, "learning_rate": 0.0004988353329592239, "loss": 5.6449, "mean_token_accuracy": 0.15986622273921966, "num_tokens": 8887408.0, "step": 4825 }, { "entropy": 5.889400386810303, "epoch": 0.4057971014492754, "grad_norm": 0.984375, "learning_rate": 0.0004988322879295409, "loss": 5.8084, "mean_token_accuracy": 0.151357901096344, "num_tokens": 8897141.0, "step": 4830 }, { "entropy": 5.732660865783691, "epoch": 0.40621718126444023, "grad_norm": 0.96875, "learning_rate": 0.0004988292389347844, "loss": 5.5937, "mean_token_accuracy": 0.16834330409765244, "num_tokens": 8905747.0, "step": 4835 }, { "entropy": 5.910235500335693, "epoch": 0.40663726107960513, "grad_norm": 0.99609375, "learning_rate": 0.000498826185975008, "loss": 5.7403, "mean_token_accuracy": 0.15051692128181457, "num_tokens": 8914926.0, "step": 4840 }, { "entropy": 5.855715417861939, "epoch": 0.40705734089477, "grad_norm": 0.9375, "learning_rate": 0.0004988231290502662, "loss": 5.7351, "mean_token_accuracy": 0.15608510375022888, "num_tokens": 8923956.0, "step": 4845 }, { "entropy": 5.844746065139771, "epoch": 0.40747742070993487, "grad_norm": 0.99609375, "learning_rate": 0.0004988200681606127, "loss": 5.6105, "mean_token_accuracy": 0.15472539961338044, "num_tokens": 8932654.0, "step": 4850 }, { "entropy": 5.819759750366211, "epoch": 0.40789750052509977, "grad_norm": 0.97265625, "learning_rate": 0.000498817003306102, "loss": 5.602, "mean_token_accuracy": 0.1623125731945038, "num_tokens": 8941716.0, "step": 4855 }, { "entropy": 5.776214361190796, "epoch": 0.40831758034026466, "grad_norm": 1.0234375, "learning_rate": 0.0004988139344867884, "loss": 5.6825, "mean_token_accuracy": 0.1535426653921604, "num_tokens": 8950377.0, "step": 4860 }, { "entropy": 5.807446241378784, "epoch": 0.4087376601554295, "grad_norm": 0.984375, "learning_rate": 0.0004988108617027261, "loss": 5.6579, "mean_token_accuracy": 0.15453788191080092, "num_tokens": 8959857.0, "step": 4865 }, { "entropy": 5.781218004226685, "epoch": 0.4091577399705944, "grad_norm": 0.90234375, "learning_rate": 0.0004988077849539698, "loss": 5.5902, "mean_token_accuracy": 0.15969525128602982, "num_tokens": 8968272.0, "step": 4870 }, { "entropy": 5.820656394958496, "epoch": 0.4095778197857593, "grad_norm": 1.015625, "learning_rate": 0.0004988047042405736, "loss": 5.6674, "mean_token_accuracy": 0.15931978076696396, "num_tokens": 8977445.0, "step": 4875 }, { "entropy": 5.915397691726684, "epoch": 0.4099978996009242, "grad_norm": 0.875, "learning_rate": 0.0004988016195625924, "loss": 5.7299, "mean_token_accuracy": 0.15139664933085442, "num_tokens": 8987315.0, "step": 4880 }, { "entropy": 5.871594667434692, "epoch": 0.41041797941608904, "grad_norm": 0.90234375, "learning_rate": 0.0004987985309200807, "loss": 5.7173, "mean_token_accuracy": 0.15377188473939896, "num_tokens": 8998119.0, "step": 4885 }, { "entropy": 5.775591278076172, "epoch": 0.41083805923125394, "grad_norm": 0.97265625, "learning_rate": 0.0004987954383130934, "loss": 5.6066, "mean_token_accuracy": 0.16712582856416702, "num_tokens": 9007167.0, "step": 4890 }, { "entropy": 5.807595109939575, "epoch": 0.41125813904641884, "grad_norm": 0.9453125, "learning_rate": 0.000498792341741685, "loss": 5.6687, "mean_token_accuracy": 0.1526729181408882, "num_tokens": 9016690.0, "step": 4895 }, { "entropy": 5.874031114578247, "epoch": 0.4116782188615837, "grad_norm": 0.91015625, "learning_rate": 0.0004987892412059106, "loss": 5.758, "mean_token_accuracy": 0.15407043546438218, "num_tokens": 9026117.0, "step": 4900 }, { "entropy": 5.780725193023682, "epoch": 0.4120982986767486, "grad_norm": 0.95703125, "learning_rate": 0.0004987861367058251, "loss": 5.644, "mean_token_accuracy": 0.1559523746371269, "num_tokens": 9035754.0, "step": 4905 }, { "entropy": 5.826504945755005, "epoch": 0.4125183784919135, "grad_norm": 0.96875, "learning_rate": 0.0004987830282414833, "loss": 5.642, "mean_token_accuracy": 0.15711333677172662, "num_tokens": 9045453.0, "step": 4910 }, { "entropy": 5.873796701431274, "epoch": 0.41293845830707837, "grad_norm": 0.94921875, "learning_rate": 0.0004987799158129404, "loss": 5.7527, "mean_token_accuracy": 0.15677697360515594, "num_tokens": 9056045.0, "step": 4915 }, { "entropy": 5.820205545425415, "epoch": 0.4133585381222432, "grad_norm": 0.94921875, "learning_rate": 0.0004987767994202516, "loss": 5.6455, "mean_token_accuracy": 0.1496775045990944, "num_tokens": 9065728.0, "step": 4920 }, { "entropy": 5.809246253967285, "epoch": 0.4137786179374081, "grad_norm": 0.9375, "learning_rate": 0.0004987736790634719, "loss": 5.6661, "mean_token_accuracy": 0.15184428542852402, "num_tokens": 9075522.0, "step": 4925 }, { "entropy": 5.794481945037842, "epoch": 0.414198697752573, "grad_norm": 1.09375, "learning_rate": 0.0004987705547426568, "loss": 5.6358, "mean_token_accuracy": 0.1499626338481903, "num_tokens": 9084412.0, "step": 4930 }, { "entropy": 5.868565320968628, "epoch": 0.41461877756773785, "grad_norm": 0.88671875, "learning_rate": 0.0004987674264578615, "loss": 5.6942, "mean_token_accuracy": 0.15214097648859023, "num_tokens": 9094289.0, "step": 4935 }, { "entropy": 5.820976829528808, "epoch": 0.41503885738290275, "grad_norm": 0.9921875, "learning_rate": 0.0004987642942091414, "loss": 5.6177, "mean_token_accuracy": 0.15684758871793747, "num_tokens": 9103124.0, "step": 4940 }, { "entropy": 5.808840274810791, "epoch": 0.41545893719806765, "grad_norm": 0.8984375, "learning_rate": 0.0004987611579965523, "loss": 5.5534, "mean_token_accuracy": 0.15804969370365143, "num_tokens": 9112794.0, "step": 4945 }, { "entropy": 5.837375354766846, "epoch": 0.4158790170132325, "grad_norm": 0.8359375, "learning_rate": 0.0004987580178201492, "loss": 5.7246, "mean_token_accuracy": 0.16285934299230576, "num_tokens": 9122718.0, "step": 4950 }, { "entropy": 5.831628942489624, "epoch": 0.4162990968283974, "grad_norm": 1.0234375, "learning_rate": 0.0004987548736799882, "loss": 5.7454, "mean_token_accuracy": 0.1529500514268875, "num_tokens": 9131855.0, "step": 4955 }, { "entropy": 5.798128986358643, "epoch": 0.4167191766435623, "grad_norm": 1.0, "learning_rate": 0.0004987517255761248, "loss": 5.6019, "mean_token_accuracy": 0.1599896475672722, "num_tokens": 9141102.0, "step": 4960 }, { "entropy": 5.77801775932312, "epoch": 0.4171392564587272, "grad_norm": 1.0234375, "learning_rate": 0.0004987485735086148, "loss": 5.6601, "mean_token_accuracy": 0.16009112149477006, "num_tokens": 9150552.0, "step": 4965 }, { "entropy": 5.852486228942871, "epoch": 0.417559336273892, "grad_norm": 0.93359375, "learning_rate": 0.000498745417477514, "loss": 5.657, "mean_token_accuracy": 0.15402564853429795, "num_tokens": 9160105.0, "step": 4970 }, { "entropy": 5.779581785202026, "epoch": 0.4179794160890569, "grad_norm": 0.9296875, "learning_rate": 0.0004987422574828784, "loss": 5.6566, "mean_token_accuracy": 0.15598243325948716, "num_tokens": 9169367.0, "step": 4975 }, { "entropy": 5.786018943786621, "epoch": 0.4183994959042218, "grad_norm": 1.046875, "learning_rate": 0.0004987390935247639, "loss": 5.5264, "mean_token_accuracy": 0.16368313133716583, "num_tokens": 9177872.0, "step": 4980 }, { "entropy": 5.82407512664795, "epoch": 0.41881957571938666, "grad_norm": 1.109375, "learning_rate": 0.0004987359256032265, "loss": 5.7466, "mean_token_accuracy": 0.151212839782238, "num_tokens": 9187879.0, "step": 4985 }, { "entropy": 5.807058525085449, "epoch": 0.41923965553455156, "grad_norm": 0.8671875, "learning_rate": 0.0004987327537183225, "loss": 5.6561, "mean_token_accuracy": 0.15415959805250168, "num_tokens": 9198281.0, "step": 4990 }, { "entropy": 5.805870008468628, "epoch": 0.41965973534971646, "grad_norm": 0.89453125, "learning_rate": 0.0004987295778701078, "loss": 5.6394, "mean_token_accuracy": 0.16050323396921157, "num_tokens": 9207670.0, "step": 4995 }, { "entropy": 5.877247047424317, "epoch": 0.42007981516488135, "grad_norm": 1.046875, "learning_rate": 0.000498726398058639, "loss": 5.6482, "mean_token_accuracy": 0.16082072257995605, "num_tokens": 9216995.0, "step": 5000 }, { "entropy": 5.812716388702393, "epoch": 0.4204998949800462, "grad_norm": 0.875, "learning_rate": 0.0004987232142839723, "loss": 5.7482, "mean_token_accuracy": 0.1490781858563423, "num_tokens": 9227330.0, "step": 5005 }, { "entropy": 5.844203805923462, "epoch": 0.4209199747952111, "grad_norm": 0.91796875, "learning_rate": 0.0004987200265461638, "loss": 5.656, "mean_token_accuracy": 0.16385895162820815, "num_tokens": 9236666.0, "step": 5010 }, { "entropy": 5.85231499671936, "epoch": 0.421340054610376, "grad_norm": 0.9609375, "learning_rate": 0.0004987168348452705, "loss": 5.6595, "mean_token_accuracy": 0.16210315823554994, "num_tokens": 9246388.0, "step": 5015 }, { "entropy": 5.789185667037964, "epoch": 0.42176013442554083, "grad_norm": 0.93359375, "learning_rate": 0.0004987136391813485, "loss": 5.6096, "mean_token_accuracy": 0.16511590033769608, "num_tokens": 9255239.0, "step": 5020 }, { "entropy": 5.742922639846801, "epoch": 0.42218021424070573, "grad_norm": 0.95703125, "learning_rate": 0.0004987104395544547, "loss": 5.5924, "mean_token_accuracy": 0.15797384828329086, "num_tokens": 9264468.0, "step": 5025 }, { "entropy": 5.819699382781982, "epoch": 0.42260029405587063, "grad_norm": 0.98046875, "learning_rate": 0.0004987072359646455, "loss": 5.6607, "mean_token_accuracy": 0.16205601245164872, "num_tokens": 9274140.0, "step": 5030 }, { "entropy": 5.83985595703125, "epoch": 0.42302037387103547, "grad_norm": 0.9609375, "learning_rate": 0.0004987040284119778, "loss": 5.6327, "mean_token_accuracy": 0.1588321939110756, "num_tokens": 9283539.0, "step": 5035 }, { "entropy": 5.751109886169433, "epoch": 0.42344045368620037, "grad_norm": 1.0234375, "learning_rate": 0.0004987008168965087, "loss": 5.6403, "mean_token_accuracy": 0.1550469622015953, "num_tokens": 9292664.0, "step": 5040 }, { "entropy": 5.876785469055176, "epoch": 0.42386053350136527, "grad_norm": 0.890625, "learning_rate": 0.0004986976014182946, "loss": 5.7374, "mean_token_accuracy": 0.1531568393111229, "num_tokens": 9302814.0, "step": 5045 }, { "entropy": 5.890387535095215, "epoch": 0.42428061331653016, "grad_norm": 0.98046875, "learning_rate": 0.0004986943819773927, "loss": 5.7332, "mean_token_accuracy": 0.15649186819791794, "num_tokens": 9312654.0, "step": 5050 }, { "entropy": 5.8707475662231445, "epoch": 0.424700693131695, "grad_norm": 0.92578125, "learning_rate": 0.00049869115857386, "loss": 5.7558, "mean_token_accuracy": 0.14800945520401002, "num_tokens": 9322271.0, "step": 5055 }, { "entropy": 5.878791618347168, "epoch": 0.4251207729468599, "grad_norm": 0.86328125, "learning_rate": 0.0004986879312077536, "loss": 5.688, "mean_token_accuracy": 0.15585887283086777, "num_tokens": 9331341.0, "step": 5060 }, { "entropy": 5.796487426757812, "epoch": 0.4255408527620248, "grad_norm": 1.0078125, "learning_rate": 0.0004986846998791308, "loss": 5.6274, "mean_token_accuracy": 0.15625337660312652, "num_tokens": 9339863.0, "step": 5065 }, { "entropy": 5.72486629486084, "epoch": 0.42596093257718964, "grad_norm": 1.0078125, "learning_rate": 0.0004986814645880485, "loss": 5.5974, "mean_token_accuracy": 0.16185437515377998, "num_tokens": 9349488.0, "step": 5070 }, { "entropy": 5.7803843975067135, "epoch": 0.42638101239235454, "grad_norm": 0.89453125, "learning_rate": 0.0004986782253345645, "loss": 5.6105, "mean_token_accuracy": 0.15332376062870026, "num_tokens": 9357977.0, "step": 5075 }, { "entropy": 5.823932743072509, "epoch": 0.42680109220751944, "grad_norm": 0.92578125, "learning_rate": 0.0004986749821187358, "loss": 5.7156, "mean_token_accuracy": 0.15630935728549958, "num_tokens": 9367449.0, "step": 5080 }, { "entropy": 5.89394211769104, "epoch": 0.42722117202268434, "grad_norm": 0.97265625, "learning_rate": 0.00049867173494062, "loss": 5.7321, "mean_token_accuracy": 0.15639646500349044, "num_tokens": 9377070.0, "step": 5085 }, { "entropy": 5.765441846847534, "epoch": 0.4276412518378492, "grad_norm": 1.0, "learning_rate": 0.0004986684838002744, "loss": 5.5217, "mean_token_accuracy": 0.15419476479291916, "num_tokens": 9385881.0, "step": 5090 }, { "entropy": 5.770947122573853, "epoch": 0.4280613316530141, "grad_norm": 0.94140625, "learning_rate": 0.0004986652286977569, "loss": 5.6523, "mean_token_accuracy": 0.15255010426044463, "num_tokens": 9395159.0, "step": 5095 }, { "entropy": 5.805099630355835, "epoch": 0.428481411468179, "grad_norm": 0.91015625, "learning_rate": 0.0004986619696331252, "loss": 5.6045, "mean_token_accuracy": 0.1583484500646591, "num_tokens": 9404590.0, "step": 5100 }, { "entropy": 5.841793823242187, "epoch": 0.4289014912833438, "grad_norm": 0.8515625, "learning_rate": 0.0004986587066064367, "loss": 5.6238, "mean_token_accuracy": 0.1618543565273285, "num_tokens": 9414452.0, "step": 5105 }, { "entropy": 5.882272624969483, "epoch": 0.4293215710985087, "grad_norm": 0.96875, "learning_rate": 0.0004986554396177494, "loss": 5.7691, "mean_token_accuracy": 0.1512654058635235, "num_tokens": 9424004.0, "step": 5110 }, { "entropy": 5.826911163330078, "epoch": 0.4297416509136736, "grad_norm": 0.88671875, "learning_rate": 0.0004986521686671212, "loss": 5.6377, "mean_token_accuracy": 0.16602189987897872, "num_tokens": 9433487.0, "step": 5115 }, { "entropy": 5.761785840988159, "epoch": 0.43016173072883845, "grad_norm": 1.046875, "learning_rate": 0.00049864889375461, "loss": 5.701, "mean_token_accuracy": 0.15255770534276963, "num_tokens": 9442742.0, "step": 5120 }, { "entropy": 5.816967296600342, "epoch": 0.43058181054400335, "grad_norm": 0.8984375, "learning_rate": 0.0004986456148802738, "loss": 5.7673, "mean_token_accuracy": 0.15205237418413162, "num_tokens": 9452550.0, "step": 5125 }, { "entropy": 5.930779886245728, "epoch": 0.43100189035916825, "grad_norm": 0.94921875, "learning_rate": 0.0004986423320441707, "loss": 5.7143, "mean_token_accuracy": 0.14957663267850876, "num_tokens": 9461920.0, "step": 5130 }, { "entropy": 5.818691873550415, "epoch": 0.43142197017433315, "grad_norm": 1.0, "learning_rate": 0.0004986390452463588, "loss": 5.6211, "mean_token_accuracy": 0.15580169409513472, "num_tokens": 9470817.0, "step": 5135 }, { "entropy": 5.700370407104492, "epoch": 0.431842049989498, "grad_norm": 0.9921875, "learning_rate": 0.0004986357544868964, "loss": 5.5801, "mean_token_accuracy": 0.1596447467803955, "num_tokens": 9479936.0, "step": 5140 }, { "entropy": 5.841777086257935, "epoch": 0.4322621298046629, "grad_norm": 0.96875, "learning_rate": 0.0004986324597658418, "loss": 5.6155, "mean_token_accuracy": 0.16243926435709, "num_tokens": 9489818.0, "step": 5145 }, { "entropy": 5.728731489181518, "epoch": 0.4326822096198278, "grad_norm": 0.9375, "learning_rate": 0.0004986291610832533, "loss": 5.624, "mean_token_accuracy": 0.153781495988369, "num_tokens": 9499688.0, "step": 5150 }, { "entropy": 5.918451547622681, "epoch": 0.4331022894349926, "grad_norm": 0.99609375, "learning_rate": 0.0004986258584391892, "loss": 5.6774, "mean_token_accuracy": 0.15540721267461777, "num_tokens": 9509581.0, "step": 5155 }, { "entropy": 5.923600053787231, "epoch": 0.4335223692501575, "grad_norm": 0.96484375, "learning_rate": 0.0004986225518337084, "loss": 5.7525, "mean_token_accuracy": 0.15666318088769912, "num_tokens": 9518556.0, "step": 5160 }, { "entropy": 5.714486789703369, "epoch": 0.4339424490653224, "grad_norm": 0.91015625, "learning_rate": 0.0004986192412668692, "loss": 5.6587, "mean_token_accuracy": 0.1547637924551964, "num_tokens": 9527612.0, "step": 5165 }, { "entropy": 5.787137269973755, "epoch": 0.4343625288804873, "grad_norm": 1.03125, "learning_rate": 0.0004986159267387302, "loss": 5.5546, "mean_token_accuracy": 0.16138194501399994, "num_tokens": 9535882.0, "step": 5170 }, { "entropy": 5.797946739196777, "epoch": 0.43478260869565216, "grad_norm": 0.953125, "learning_rate": 0.0004986126082493502, "loss": 5.656, "mean_token_accuracy": 0.1613065406680107, "num_tokens": 9544799.0, "step": 5175 }, { "entropy": 5.779606723785401, "epoch": 0.43520268851081706, "grad_norm": 0.890625, "learning_rate": 0.0004986092857987881, "loss": 5.5729, "mean_token_accuracy": 0.1618928477168083, "num_tokens": 9553805.0, "step": 5180 }, { "entropy": 5.782668399810791, "epoch": 0.43562276832598196, "grad_norm": 0.94921875, "learning_rate": 0.0004986059593871026, "loss": 5.5971, "mean_token_accuracy": 0.1598972573876381, "num_tokens": 9563493.0, "step": 5185 }, { "entropy": 5.800241613388062, "epoch": 0.4360428481411468, "grad_norm": 0.89453125, "learning_rate": 0.0004986026290143527, "loss": 5.6842, "mean_token_accuracy": 0.15388598516583443, "num_tokens": 9572297.0, "step": 5190 }, { "entropy": 5.936120653152466, "epoch": 0.4364629279563117, "grad_norm": 1.0390625, "learning_rate": 0.0004985992946805973, "loss": 5.8134, "mean_token_accuracy": 0.15065453350543975, "num_tokens": 9581967.0, "step": 5195 }, { "entropy": 5.819184160232544, "epoch": 0.4368830077714766, "grad_norm": 0.8828125, "learning_rate": 0.0004985959563858955, "loss": 5.7273, "mean_token_accuracy": 0.16100031584501268, "num_tokens": 9590885.0, "step": 5200 }, { "entropy": 5.860151624679565, "epoch": 0.43730308758664144, "grad_norm": 0.953125, "learning_rate": 0.0004985926141303066, "loss": 5.6532, "mean_token_accuracy": 0.1567025899887085, "num_tokens": 9599247.0, "step": 5205 }, { "entropy": 5.818394136428833, "epoch": 0.43772316740180633, "grad_norm": 1.0546875, "learning_rate": 0.0004985892679138896, "loss": 5.571, "mean_token_accuracy": 0.16371893361210824, "num_tokens": 9608296.0, "step": 5210 }, { "entropy": 5.8166498184204105, "epoch": 0.43814324721697123, "grad_norm": 1.1015625, "learning_rate": 0.0004985859177367038, "loss": 5.6242, "mean_token_accuracy": 0.15776645839214326, "num_tokens": 9616734.0, "step": 5215 }, { "entropy": 5.83067135810852, "epoch": 0.43856332703213613, "grad_norm": 0.890625, "learning_rate": 0.0004985825635988087, "loss": 5.699, "mean_token_accuracy": 0.1571464478969574, "num_tokens": 9626246.0, "step": 5220 }, { "entropy": 5.7702131271362305, "epoch": 0.43898340684730097, "grad_norm": 0.96484375, "learning_rate": 0.0004985792055002635, "loss": 5.5794, "mean_token_accuracy": 0.16028426140546798, "num_tokens": 9634963.0, "step": 5225 }, { "entropy": 5.8400349617004395, "epoch": 0.43940348666246587, "grad_norm": 1.0078125, "learning_rate": 0.0004985758434411278, "loss": 5.6513, "mean_token_accuracy": 0.16422291100025177, "num_tokens": 9643615.0, "step": 5230 }, { "entropy": 5.810837030410767, "epoch": 0.43982356647763077, "grad_norm": 0.96484375, "learning_rate": 0.0004985724774214613, "loss": 5.6244, "mean_token_accuracy": 0.15992441177368164, "num_tokens": 9653306.0, "step": 5235 }, { "entropy": 5.767703294754028, "epoch": 0.4402436462927956, "grad_norm": 0.9453125, "learning_rate": 0.0004985691074413233, "loss": 5.6505, "mean_token_accuracy": 0.15613847076892853, "num_tokens": 9662389.0, "step": 5240 }, { "entropy": 5.753371381759644, "epoch": 0.4406637261079605, "grad_norm": 0.94921875, "learning_rate": 0.0004985657335007739, "loss": 5.6446, "mean_token_accuracy": 0.15534982979297637, "num_tokens": 9671183.0, "step": 5245 }, { "entropy": 5.836323595046997, "epoch": 0.4410838059231254, "grad_norm": 0.90234375, "learning_rate": 0.0004985623555998725, "loss": 5.6222, "mean_token_accuracy": 0.16474147886037827, "num_tokens": 9680544.0, "step": 5250 }, { "entropy": 5.819104290008545, "epoch": 0.4415038857382903, "grad_norm": 0.99609375, "learning_rate": 0.0004985589737386791, "loss": 5.6779, "mean_token_accuracy": 0.15779446437954903, "num_tokens": 9690137.0, "step": 5255 }, { "entropy": 5.74895076751709, "epoch": 0.44192396555345514, "grad_norm": 0.94921875, "learning_rate": 0.0004985555879172535, "loss": 5.6131, "mean_token_accuracy": 0.16228249818086624, "num_tokens": 9699149.0, "step": 5260 }, { "entropy": 5.830872917175293, "epoch": 0.44234404536862004, "grad_norm": 0.89453125, "learning_rate": 0.000498552198135656, "loss": 5.6857, "mean_token_accuracy": 0.16091985404491424, "num_tokens": 9709308.0, "step": 5265 }, { "entropy": 5.817913627624511, "epoch": 0.44276412518378494, "grad_norm": 1.15625, "learning_rate": 0.0004985488043939462, "loss": 5.6133, "mean_token_accuracy": 0.15377137959003448, "num_tokens": 9718462.0, "step": 5270 }, { "entropy": 5.762473201751709, "epoch": 0.4431842049989498, "grad_norm": 0.953125, "learning_rate": 0.0004985454066921846, "loss": 5.5442, "mean_token_accuracy": 0.16455349177122117, "num_tokens": 9727626.0, "step": 5275 }, { "entropy": 5.663512516021728, "epoch": 0.4436042848141147, "grad_norm": 0.91015625, "learning_rate": 0.0004985420050304312, "loss": 5.5827, "mean_token_accuracy": 0.15936666429042817, "num_tokens": 9737091.0, "step": 5280 }, { "entropy": 5.770118761062622, "epoch": 0.4440243646292796, "grad_norm": 1.015625, "learning_rate": 0.0004985385994087462, "loss": 5.6417, "mean_token_accuracy": 0.1584844209253788, "num_tokens": 9746135.0, "step": 5285 }, { "entropy": 5.844138050079346, "epoch": 0.4444444444444444, "grad_norm": 1.015625, "learning_rate": 0.0004985351898271901, "loss": 5.5853, "mean_token_accuracy": 0.1622116059064865, "num_tokens": 9754549.0, "step": 5290 }, { "entropy": 5.83607120513916, "epoch": 0.4448645242596093, "grad_norm": 0.95703125, "learning_rate": 0.0004985317762858231, "loss": 5.7065, "mean_token_accuracy": 0.1499613419175148, "num_tokens": 9764219.0, "step": 5295 }, { "entropy": 5.792026853561401, "epoch": 0.4452846040747742, "grad_norm": 0.984375, "learning_rate": 0.000498528358784706, "loss": 5.5519, "mean_token_accuracy": 0.1638228639960289, "num_tokens": 9772234.0, "step": 5300 }, { "entropy": 5.749575090408325, "epoch": 0.4457046838899391, "grad_norm": 0.9375, "learning_rate": 0.000498524937323899, "loss": 5.6106, "mean_token_accuracy": 0.16515014916658402, "num_tokens": 9781417.0, "step": 5305 }, { "entropy": 5.9356084823608395, "epoch": 0.44612476370510395, "grad_norm": 0.90625, "learning_rate": 0.0004985215119034628, "loss": 5.7505, "mean_token_accuracy": 0.14851112440228462, "num_tokens": 9791286.0, "step": 5310 }, { "entropy": 5.8016856670379635, "epoch": 0.44654484352026885, "grad_norm": 0.95703125, "learning_rate": 0.0004985180825234582, "loss": 5.7329, "mean_token_accuracy": 0.15573213249444962, "num_tokens": 9802157.0, "step": 5315 }, { "entropy": 5.89680552482605, "epoch": 0.44696492333543375, "grad_norm": 0.93359375, "learning_rate": 0.0004985146491839459, "loss": 5.7173, "mean_token_accuracy": 0.1475129798054695, "num_tokens": 9812646.0, "step": 5320 }, { "entropy": 5.870607805252075, "epoch": 0.4473850031505986, "grad_norm": 0.9765625, "learning_rate": 0.0004985112118849865, "loss": 5.7088, "mean_token_accuracy": 0.15120236873626708, "num_tokens": 9822274.0, "step": 5325 }, { "entropy": 5.753091526031494, "epoch": 0.4478050829657635, "grad_norm": 0.9609375, "learning_rate": 0.0004985077706266412, "loss": 5.5294, "mean_token_accuracy": 0.15791643261909485, "num_tokens": 9831337.0, "step": 5330 }, { "entropy": 5.79245548248291, "epoch": 0.4482251627809284, "grad_norm": 0.8828125, "learning_rate": 0.0004985043254089708, "loss": 5.6629, "mean_token_accuracy": 0.15153390020132065, "num_tokens": 9840798.0, "step": 5335 }, { "entropy": 5.723747682571411, "epoch": 0.44864524259609323, "grad_norm": 0.953125, "learning_rate": 0.0004985008762320364, "loss": 5.637, "mean_token_accuracy": 0.15859152227640153, "num_tokens": 9850117.0, "step": 5340 }, { "entropy": 5.79846601486206, "epoch": 0.4490653224112581, "grad_norm": 0.9921875, "learning_rate": 0.000498497423095899, "loss": 5.5724, "mean_token_accuracy": 0.16569938510656357, "num_tokens": 9858227.0, "step": 5345 }, { "entropy": 5.755469799041748, "epoch": 0.449485402226423, "grad_norm": 0.9140625, "learning_rate": 0.0004984939660006199, "loss": 5.6759, "mean_token_accuracy": 0.15846239179372787, "num_tokens": 9867157.0, "step": 5350 }, { "entropy": 5.7474853515625, "epoch": 0.4499054820415879, "grad_norm": 0.9453125, "learning_rate": 0.0004984905049462602, "loss": 5.5876, "mean_token_accuracy": 0.15728517472743989, "num_tokens": 9877045.0, "step": 5355 }, { "entropy": 5.918812370300293, "epoch": 0.45032556185675277, "grad_norm": 0.953125, "learning_rate": 0.0004984870399328814, "loss": 5.7228, "mean_token_accuracy": 0.15240922719240188, "num_tokens": 9886637.0, "step": 5360 }, { "entropy": 5.742618703842163, "epoch": 0.45074564167191766, "grad_norm": 0.93359375, "learning_rate": 0.0004984835709605446, "loss": 5.5883, "mean_token_accuracy": 0.16404919177293778, "num_tokens": 9895601.0, "step": 5365 }, { "entropy": 5.8194098472595215, "epoch": 0.45116572148708256, "grad_norm": 1.0, "learning_rate": 0.0004984800980293116, "loss": 5.738, "mean_token_accuracy": 0.1579892724752426, "num_tokens": 9904775.0, "step": 5370 }, { "entropy": 5.780790996551514, "epoch": 0.4515858013022474, "grad_norm": 0.9609375, "learning_rate": 0.0004984766211392435, "loss": 5.6783, "mean_token_accuracy": 0.15692917853593827, "num_tokens": 9913795.0, "step": 5375 }, { "entropy": 5.802691316604614, "epoch": 0.4520058811174123, "grad_norm": 0.90234375, "learning_rate": 0.0004984731402904024, "loss": 5.5113, "mean_token_accuracy": 0.16487460136413573, "num_tokens": 9922576.0, "step": 5380 }, { "entropy": 5.772703742980957, "epoch": 0.4524259609325772, "grad_norm": 0.93359375, "learning_rate": 0.0004984696554828496, "loss": 5.4922, "mean_token_accuracy": 0.1670244887471199, "num_tokens": 9930971.0, "step": 5385 }, { "entropy": 5.794325065612793, "epoch": 0.4528460407477421, "grad_norm": 1.0859375, "learning_rate": 0.0004984661667166468, "loss": 5.6128, "mean_token_accuracy": 0.16192587018013, "num_tokens": 9939628.0, "step": 5390 }, { "entropy": 5.7850220680236815, "epoch": 0.45326612056290694, "grad_norm": 0.9140625, "learning_rate": 0.0004984626739918561, "loss": 5.5903, "mean_token_accuracy": 0.16074153482913972, "num_tokens": 9948397.0, "step": 5395 }, { "entropy": 5.814194774627685, "epoch": 0.45368620037807184, "grad_norm": 0.87890625, "learning_rate": 0.0004984591773085391, "loss": 5.67, "mean_token_accuracy": 0.15753872096538543, "num_tokens": 9957683.0, "step": 5400 }, { "entropy": 5.814547824859619, "epoch": 0.45410628019323673, "grad_norm": 0.921875, "learning_rate": 0.0004984556766667578, "loss": 5.6587, "mean_token_accuracy": 0.1586209386587143, "num_tokens": 9966756.0, "step": 5405 }, { "entropy": 5.744683790206909, "epoch": 0.4545263600084016, "grad_norm": 0.97265625, "learning_rate": 0.0004984521720665743, "loss": 5.6532, "mean_token_accuracy": 0.16073551923036575, "num_tokens": 9976000.0, "step": 5410 }, { "entropy": 5.857652235031128, "epoch": 0.4549464398235665, "grad_norm": 0.90625, "learning_rate": 0.0004984486635080507, "loss": 5.6506, "mean_token_accuracy": 0.15694389641284942, "num_tokens": 9985509.0, "step": 5415 }, { "entropy": 5.7711996078491214, "epoch": 0.45536651963873137, "grad_norm": 0.94140625, "learning_rate": 0.0004984451509912489, "loss": 5.5899, "mean_token_accuracy": 0.1618253692984581, "num_tokens": 9994342.0, "step": 5420 }, { "entropy": 5.746224308013916, "epoch": 0.4557865994538962, "grad_norm": 0.91796875, "learning_rate": 0.0004984416345162315, "loss": 5.6478, "mean_token_accuracy": 0.15566404908895493, "num_tokens": 10004249.0, "step": 5425 }, { "entropy": 5.76487717628479, "epoch": 0.4562066792690611, "grad_norm": 1.0234375, "learning_rate": 0.0004984381140830605, "loss": 5.6061, "mean_token_accuracy": 0.16023263484239578, "num_tokens": 10012430.0, "step": 5430 }, { "entropy": 5.82148494720459, "epoch": 0.456626759084226, "grad_norm": 0.94140625, "learning_rate": 0.0004984345896917984, "loss": 5.615, "mean_token_accuracy": 0.15671578347682952, "num_tokens": 10021434.0, "step": 5435 }, { "entropy": 5.7957844734191895, "epoch": 0.4570468388993909, "grad_norm": 1.0234375, "learning_rate": 0.0004984310613425076, "loss": 5.6077, "mean_token_accuracy": 0.16273672878742218, "num_tokens": 10030473.0, "step": 5440 }, { "entropy": 5.7984706401824955, "epoch": 0.45746691871455575, "grad_norm": 1.109375, "learning_rate": 0.0004984275290352506, "loss": 5.6027, "mean_token_accuracy": 0.16592728793621064, "num_tokens": 10039057.0, "step": 5445 }, { "entropy": 5.82614917755127, "epoch": 0.45788699852972065, "grad_norm": 0.98046875, "learning_rate": 0.0004984239927700899, "loss": 5.6993, "mean_token_accuracy": 0.15564172416925431, "num_tokens": 10047998.0, "step": 5450 }, { "entropy": 5.890322923660278, "epoch": 0.45830707834488554, "grad_norm": 0.94921875, "learning_rate": 0.0004984204525470883, "loss": 5.6293, "mean_token_accuracy": 0.1547103099524975, "num_tokens": 10057479.0, "step": 5455 }, { "entropy": 5.735934209823609, "epoch": 0.4587271581600504, "grad_norm": 0.89453125, "learning_rate": 0.0004984169083663084, "loss": 5.6068, "mean_token_accuracy": 0.1534338653087616, "num_tokens": 10067754.0, "step": 5460 }, { "entropy": 5.795390987396241, "epoch": 0.4591472379752153, "grad_norm": 0.8828125, "learning_rate": 0.0004984133602278129, "loss": 5.6835, "mean_token_accuracy": 0.157898972928524, "num_tokens": 10076815.0, "step": 5465 }, { "entropy": 5.918915462493897, "epoch": 0.4595673177903802, "grad_norm": 0.95703125, "learning_rate": 0.000498409808131665, "loss": 5.6866, "mean_token_accuracy": 0.15232098400592803, "num_tokens": 10086300.0, "step": 5470 }, { "entropy": 5.7501527786254885, "epoch": 0.4599873976055451, "grad_norm": 0.8828125, "learning_rate": 0.0004984062520779272, "loss": 5.5857, "mean_token_accuracy": 0.16250389367341994, "num_tokens": 10095383.0, "step": 5475 }, { "entropy": 5.6954700469970705, "epoch": 0.4604074774207099, "grad_norm": 0.94921875, "learning_rate": 0.0004984026920666628, "loss": 5.5697, "mean_token_accuracy": 0.15912551581859588, "num_tokens": 10103971.0, "step": 5480 }, { "entropy": 5.814951801300049, "epoch": 0.4608275572358748, "grad_norm": 0.92578125, "learning_rate": 0.0004983991280979347, "loss": 5.5799, "mean_token_accuracy": 0.16145333349704744, "num_tokens": 10113028.0, "step": 5485 }, { "entropy": 5.79097695350647, "epoch": 0.4612476370510397, "grad_norm": 0.9453125, "learning_rate": 0.0004983955601718061, "loss": 5.5408, "mean_token_accuracy": 0.16365961581468583, "num_tokens": 10121890.0, "step": 5490 }, { "entropy": 5.804393863677978, "epoch": 0.46166771686620456, "grad_norm": 0.97265625, "learning_rate": 0.0004983919882883401, "loss": 5.6663, "mean_token_accuracy": 0.1603729695081711, "num_tokens": 10131655.0, "step": 5495 }, { "entropy": 5.873544406890869, "epoch": 0.46208779668136946, "grad_norm": 0.9765625, "learning_rate": 0.0004983884124476, "loss": 5.6699, "mean_token_accuracy": 0.15749045610427856, "num_tokens": 10140778.0, "step": 5500 }, { "entropy": 5.814252138137817, "epoch": 0.46250787649653435, "grad_norm": 0.92578125, "learning_rate": 0.0004983848326496494, "loss": 5.7045, "mean_token_accuracy": 0.15820754915475846, "num_tokens": 10150229.0, "step": 5505 }, { "entropy": 5.815248012542725, "epoch": 0.4629279563116992, "grad_norm": 0.98828125, "learning_rate": 0.0004983812488945513, "loss": 5.6102, "mean_token_accuracy": 0.15927310138940812, "num_tokens": 10158939.0, "step": 5510 }, { "entropy": 5.772242593765259, "epoch": 0.4633480361268641, "grad_norm": 0.984375, "learning_rate": 0.0004983776611823696, "loss": 5.6172, "mean_token_accuracy": 0.15591025203466416, "num_tokens": 10168383.0, "step": 5515 }, { "entropy": 5.762513542175293, "epoch": 0.463768115942029, "grad_norm": 0.93359375, "learning_rate": 0.0004983740695131676, "loss": 5.614, "mean_token_accuracy": 0.16522103548049927, "num_tokens": 10178678.0, "step": 5520 }, { "entropy": 5.78189172744751, "epoch": 0.4641881957571939, "grad_norm": 0.90625, "learning_rate": 0.000498370473887009, "loss": 5.5993, "mean_token_accuracy": 0.1618872195482254, "num_tokens": 10188964.0, "step": 5525 }, { "entropy": 5.797432947158813, "epoch": 0.46460827557235873, "grad_norm": 0.95703125, "learning_rate": 0.0004983668743039573, "loss": 5.626, "mean_token_accuracy": 0.16132238358259202, "num_tokens": 10198333.0, "step": 5530 }, { "entropy": 5.7464605331420895, "epoch": 0.46502835538752363, "grad_norm": 0.99609375, "learning_rate": 0.0004983632707640766, "loss": 5.6385, "mean_token_accuracy": 0.15782831460237504, "num_tokens": 10207876.0, "step": 5535 }, { "entropy": 5.7676252841949465, "epoch": 0.4654484352026885, "grad_norm": 0.95703125, "learning_rate": 0.0004983596632674306, "loss": 5.5836, "mean_token_accuracy": 0.15963911265134811, "num_tokens": 10216822.0, "step": 5540 }, { "entropy": 5.864213514328003, "epoch": 0.46586851501785337, "grad_norm": 0.93359375, "learning_rate": 0.0004983560518140831, "loss": 5.6988, "mean_token_accuracy": 0.15088534951210023, "num_tokens": 10226887.0, "step": 5545 }, { "entropy": 5.807913875579834, "epoch": 0.46628859483301827, "grad_norm": 0.96875, "learning_rate": 0.0004983524364040982, "loss": 5.5379, "mean_token_accuracy": 0.16848834306001664, "num_tokens": 10235935.0, "step": 5550 }, { "entropy": 5.751170539855957, "epoch": 0.46670867464818316, "grad_norm": 0.89453125, "learning_rate": 0.0004983488170375399, "loss": 5.5025, "mean_token_accuracy": 0.16097538769245148, "num_tokens": 10245590.0, "step": 5555 }, { "entropy": 5.752688026428222, "epoch": 0.46712875446334806, "grad_norm": 0.95703125, "learning_rate": 0.0004983451937144723, "loss": 5.5925, "mean_token_accuracy": 0.15908439457416534, "num_tokens": 10255104.0, "step": 5560 }, { "entropy": 5.625225067138672, "epoch": 0.4675488342785129, "grad_norm": 0.9140625, "learning_rate": 0.0004983415664349595, "loss": 5.4479, "mean_token_accuracy": 0.16906733959913253, "num_tokens": 10264236.0, "step": 5565 }, { "entropy": 5.791613006591797, "epoch": 0.4679689140936778, "grad_norm": 0.921875, "learning_rate": 0.0004983379351990659, "loss": 5.5634, "mean_token_accuracy": 0.16491406708955764, "num_tokens": 10273335.0, "step": 5570 }, { "entropy": 5.73756160736084, "epoch": 0.4683889939088427, "grad_norm": 0.83203125, "learning_rate": 0.0004983343000068559, "loss": 5.5392, "mean_token_accuracy": 0.16353048831224443, "num_tokens": 10282206.0, "step": 5575 }, { "entropy": 5.679240655899048, "epoch": 0.46880907372400754, "grad_norm": 1.0, "learning_rate": 0.0004983306608583937, "loss": 5.4798, "mean_token_accuracy": 0.17844018042087556, "num_tokens": 10290056.0, "step": 5580 }, { "entropy": 5.697105741500854, "epoch": 0.46922915353917244, "grad_norm": 0.92578125, "learning_rate": 0.0004983270177537438, "loss": 5.5596, "mean_token_accuracy": 0.16428319364786148, "num_tokens": 10299726.0, "step": 5585 }, { "entropy": 5.741534852981568, "epoch": 0.46964923335433734, "grad_norm": 0.99609375, "learning_rate": 0.0004983233706929708, "loss": 5.6128, "mean_token_accuracy": 0.1574200913310051, "num_tokens": 10308696.0, "step": 5590 }, { "entropy": 5.87669529914856, "epoch": 0.4700693131695022, "grad_norm": 0.92578125, "learning_rate": 0.0004983197196761392, "loss": 5.706, "mean_token_accuracy": 0.1552853010594845, "num_tokens": 10317845.0, "step": 5595 }, { "entropy": 5.774369955062866, "epoch": 0.4704893929846671, "grad_norm": 1.0546875, "learning_rate": 0.0004983160647033139, "loss": 5.5975, "mean_token_accuracy": 0.16107087433338166, "num_tokens": 10326563.0, "step": 5600 }, { "entropy": 5.75340576171875, "epoch": 0.470909472799832, "grad_norm": 0.90234375, "learning_rate": 0.0004983124057745595, "loss": 5.5791, "mean_token_accuracy": 0.15735821723937987, "num_tokens": 10335931.0, "step": 5605 }, { "entropy": 5.707799339294434, "epoch": 0.47132955261499687, "grad_norm": 0.96484375, "learning_rate": 0.0004983087428899408, "loss": 5.5773, "mean_token_accuracy": 0.15221105068922042, "num_tokens": 10344984.0, "step": 5610 }, { "entropy": 5.7647332668304445, "epoch": 0.4717496324301617, "grad_norm": 1.09375, "learning_rate": 0.0004983050760495227, "loss": 5.5966, "mean_token_accuracy": 0.1603370040655136, "num_tokens": 10353522.0, "step": 5615 }, { "entropy": 5.7834312438964846, "epoch": 0.4721697122453266, "grad_norm": 0.96875, "learning_rate": 0.0004983014052533702, "loss": 5.6121, "mean_token_accuracy": 0.15812979638576508, "num_tokens": 10363527.0, "step": 5620 }, { "entropy": 5.723613166809082, "epoch": 0.4725897920604915, "grad_norm": 0.88671875, "learning_rate": 0.0004982977305015481, "loss": 5.5439, "mean_token_accuracy": 0.15958572328090667, "num_tokens": 10372040.0, "step": 5625 }, { "entropy": 5.772522401809693, "epoch": 0.47300987187565635, "grad_norm": 0.9296875, "learning_rate": 0.0004982940517941219, "loss": 5.5227, "mean_token_accuracy": 0.16043394133448602, "num_tokens": 10381279.0, "step": 5630 }, { "entropy": 5.790616703033447, "epoch": 0.47342995169082125, "grad_norm": 1.0078125, "learning_rate": 0.0004982903691311564, "loss": 5.6984, "mean_token_accuracy": 0.15549325048923493, "num_tokens": 10390608.0, "step": 5635 }, { "entropy": 5.768335485458374, "epoch": 0.47385003150598615, "grad_norm": 0.9609375, "learning_rate": 0.0004982866825127172, "loss": 5.4862, "mean_token_accuracy": 0.16711296737194062, "num_tokens": 10399851.0, "step": 5640 }, { "entropy": 5.826428365707398, "epoch": 0.47427011132115104, "grad_norm": 0.953125, "learning_rate": 0.0004982829919388692, "loss": 5.7573, "mean_token_accuracy": 0.15294661596417428, "num_tokens": 10410425.0, "step": 5645 }, { "entropy": 5.705338096618652, "epoch": 0.4746901911363159, "grad_norm": 0.9140625, "learning_rate": 0.0004982792974096781, "loss": 5.5446, "mean_token_accuracy": 0.16691708862781524, "num_tokens": 10418783.0, "step": 5650 }, { "entropy": 5.836835145950317, "epoch": 0.4751102709514808, "grad_norm": 1.1796875, "learning_rate": 0.000498275598925209, "loss": 5.7114, "mean_token_accuracy": 0.15507804453372956, "num_tokens": 10427360.0, "step": 5655 }, { "entropy": 5.856819105148316, "epoch": 0.4755303507666457, "grad_norm": 0.94140625, "learning_rate": 0.0004982718964855277, "loss": 5.6653, "mean_token_accuracy": 0.1575305789709091, "num_tokens": 10436613.0, "step": 5660 }, { "entropy": 5.742249536514282, "epoch": 0.4759504305818105, "grad_norm": 0.98828125, "learning_rate": 0.0004982681900907, "loss": 5.7114, "mean_token_accuracy": 0.15877616107463838, "num_tokens": 10445055.0, "step": 5665 }, { "entropy": 5.744962549209594, "epoch": 0.4763705103969754, "grad_norm": 0.89453125, "learning_rate": 0.000498264479740791, "loss": 5.5379, "mean_token_accuracy": 0.16900296211242677, "num_tokens": 10454516.0, "step": 5670 }, { "entropy": 5.830320215225219, "epoch": 0.4767905902121403, "grad_norm": 0.8984375, "learning_rate": 0.0004982607654358668, "loss": 5.6596, "mean_token_accuracy": 0.15974192917346955, "num_tokens": 10463771.0, "step": 5675 }, { "entropy": 5.769126272201538, "epoch": 0.47721067002730516, "grad_norm": 0.875, "learning_rate": 0.000498257047175993, "loss": 5.5908, "mean_token_accuracy": 0.15908040702342988, "num_tokens": 10473783.0, "step": 5680 }, { "entropy": 5.78115234375, "epoch": 0.47763074984247006, "grad_norm": 0.9609375, "learning_rate": 0.0004982533249612357, "loss": 5.5629, "mean_token_accuracy": 0.16332129687070845, "num_tokens": 10483424.0, "step": 5685 }, { "entropy": 5.69402847290039, "epoch": 0.47805082965763496, "grad_norm": 0.9375, "learning_rate": 0.0004982495987916607, "loss": 5.5045, "mean_token_accuracy": 0.1687542662024498, "num_tokens": 10492536.0, "step": 5690 }, { "entropy": 5.782306718826294, "epoch": 0.47847090947279985, "grad_norm": 1.0625, "learning_rate": 0.0004982458686673339, "loss": 5.6148, "mean_token_accuracy": 0.15962855368852616, "num_tokens": 10501616.0, "step": 5695 }, { "entropy": 5.8774285316467285, "epoch": 0.4788909892879647, "grad_norm": 1.0, "learning_rate": 0.0004982421345883217, "loss": 5.6435, "mean_token_accuracy": 0.1528232589364052, "num_tokens": 10511190.0, "step": 5700 }, { "entropy": 5.737439727783203, "epoch": 0.4793110691031296, "grad_norm": 0.9609375, "learning_rate": 0.0004982383965546898, "loss": 5.5899, "mean_token_accuracy": 0.15596046000719072, "num_tokens": 10520310.0, "step": 5705 }, { "entropy": 5.806997585296631, "epoch": 0.4797311489182945, "grad_norm": 0.9765625, "learning_rate": 0.0004982346545665048, "loss": 5.563, "mean_token_accuracy": 0.16304250210523605, "num_tokens": 10528711.0, "step": 5710 }, { "entropy": 5.757972669601441, "epoch": 0.48015122873345933, "grad_norm": 1.0078125, "learning_rate": 0.0004982309086238328, "loss": 5.6498, "mean_token_accuracy": 0.15384584218263625, "num_tokens": 10538484.0, "step": 5715 }, { "entropy": 5.7749903202056885, "epoch": 0.48057130854862423, "grad_norm": 0.94140625, "learning_rate": 0.0004982271587267403, "loss": 5.5947, "mean_token_accuracy": 0.15901431441307068, "num_tokens": 10547623.0, "step": 5720 }, { "entropy": 5.7751219272613525, "epoch": 0.48099138836378913, "grad_norm": 1.015625, "learning_rate": 0.0004982234048752935, "loss": 5.5458, "mean_token_accuracy": 0.16144074499607086, "num_tokens": 10556234.0, "step": 5725 }, { "entropy": 5.856562280654908, "epoch": 0.481411468178954, "grad_norm": 0.94921875, "learning_rate": 0.000498219647069559, "loss": 5.7641, "mean_token_accuracy": 0.1533028818666935, "num_tokens": 10566308.0, "step": 5730 }, { "entropy": 5.8091706275939945, "epoch": 0.48183154799411887, "grad_norm": 1.015625, "learning_rate": 0.0004982158853096035, "loss": 5.7108, "mean_token_accuracy": 0.15445562452077866, "num_tokens": 10575212.0, "step": 5735 }, { "entropy": 5.755967473983764, "epoch": 0.48225162780928377, "grad_norm": 0.9296875, "learning_rate": 0.0004982121195954935, "loss": 5.4688, "mean_token_accuracy": 0.1693451941013336, "num_tokens": 10584590.0, "step": 5740 }, { "entropy": 5.736726951599121, "epoch": 0.48267170762444866, "grad_norm": 0.9296875, "learning_rate": 0.0004982083499272957, "loss": 5.5512, "mean_token_accuracy": 0.16557496339082717, "num_tokens": 10593997.0, "step": 5745 }, { "entropy": 5.806335926055908, "epoch": 0.4830917874396135, "grad_norm": 0.97265625, "learning_rate": 0.0004982045763050768, "loss": 5.6777, "mean_token_accuracy": 0.157341568171978, "num_tokens": 10603299.0, "step": 5750 }, { "entropy": 5.790657663345337, "epoch": 0.4835118672547784, "grad_norm": 0.953125, "learning_rate": 0.0004982007987289041, "loss": 5.5987, "mean_token_accuracy": 0.15882896780967712, "num_tokens": 10613546.0, "step": 5755 }, { "entropy": 5.743067026138306, "epoch": 0.4839319470699433, "grad_norm": 1.0234375, "learning_rate": 0.0004981970171988439, "loss": 5.5707, "mean_token_accuracy": 0.16890112310647964, "num_tokens": 10622966.0, "step": 5760 }, { "entropy": 5.773163938522339, "epoch": 0.48435202688510814, "grad_norm": 1.0859375, "learning_rate": 0.0004981932317149636, "loss": 5.6484, "mean_token_accuracy": 0.1565729409456253, "num_tokens": 10633441.0, "step": 5765 }, { "entropy": 5.843293190002441, "epoch": 0.48477210670027304, "grad_norm": 0.93359375, "learning_rate": 0.00049818944227733, "loss": 5.6374, "mean_token_accuracy": 0.15993442833423616, "num_tokens": 10643124.0, "step": 5770 }, { "entropy": 5.831496477127075, "epoch": 0.48519218651543794, "grad_norm": 0.95703125, "learning_rate": 0.0004981856488860105, "loss": 5.6117, "mean_token_accuracy": 0.1523417502641678, "num_tokens": 10652517.0, "step": 5775 }, { "entropy": 5.804540205001831, "epoch": 0.48561226633060284, "grad_norm": 0.99609375, "learning_rate": 0.0004981818515410721, "loss": 5.6591, "mean_token_accuracy": 0.1497793585062027, "num_tokens": 10663352.0, "step": 5780 }, { "entropy": 5.732200670242309, "epoch": 0.4860323461457677, "grad_norm": 0.96484375, "learning_rate": 0.0004981780502425821, "loss": 5.6688, "mean_token_accuracy": 0.15934486985206603, "num_tokens": 10672430.0, "step": 5785 }, { "entropy": 5.7780238628387455, "epoch": 0.4864524259609326, "grad_norm": 0.9140625, "learning_rate": 0.0004981742449906079, "loss": 5.6075, "mean_token_accuracy": 0.16593022048473358, "num_tokens": 10681908.0, "step": 5790 }, { "entropy": 5.821439170837403, "epoch": 0.4868725057760975, "grad_norm": 1.0078125, "learning_rate": 0.0004981704357852168, "loss": 5.6032, "mean_token_accuracy": 0.16017231941223145, "num_tokens": 10691259.0, "step": 5795 }, { "entropy": 5.739565515518189, "epoch": 0.4872925855912623, "grad_norm": 0.98046875, "learning_rate": 0.0004981666226264764, "loss": 5.5018, "mean_token_accuracy": 0.16552049070596694, "num_tokens": 10699668.0, "step": 5800 }, { "entropy": 5.741326379776001, "epoch": 0.4877126654064272, "grad_norm": 0.84765625, "learning_rate": 0.0004981628055144542, "loss": 5.5384, "mean_token_accuracy": 0.16326582431793213, "num_tokens": 10709146.0, "step": 5805 }, { "entropy": 5.826295614242554, "epoch": 0.4881327452215921, "grad_norm": 0.90234375, "learning_rate": 0.0004981589844492177, "loss": 5.6268, "mean_token_accuracy": 0.1511153683066368, "num_tokens": 10718724.0, "step": 5810 }, { "entropy": 5.774454784393311, "epoch": 0.488552825036757, "grad_norm": 0.91015625, "learning_rate": 0.0004981551594308349, "loss": 5.6002, "mean_token_accuracy": 0.16163085922598838, "num_tokens": 10728101.0, "step": 5815 }, { "entropy": 5.8604474544525145, "epoch": 0.48897290485192185, "grad_norm": 1.015625, "learning_rate": 0.0004981513304593733, "loss": 5.5894, "mean_token_accuracy": 0.16614548563957215, "num_tokens": 10736750.0, "step": 5820 }, { "entropy": 5.813880395889282, "epoch": 0.48939298466708675, "grad_norm": 0.89453125, "learning_rate": 0.0004981474975349006, "loss": 5.7934, "mean_token_accuracy": 0.15620144009590148, "num_tokens": 10746914.0, "step": 5825 }, { "entropy": 5.775779962539673, "epoch": 0.48981306448225165, "grad_norm": 1.0078125, "learning_rate": 0.000498143660657485, "loss": 5.6266, "mean_token_accuracy": 0.160403074324131, "num_tokens": 10755786.0, "step": 5830 }, { "entropy": 5.672336006164551, "epoch": 0.4902331442974165, "grad_norm": 0.90625, "learning_rate": 0.0004981398198271944, "loss": 5.512, "mean_token_accuracy": 0.16457450538873672, "num_tokens": 10764821.0, "step": 5835 }, { "entropy": 5.762319898605346, "epoch": 0.4906532241125814, "grad_norm": 0.97265625, "learning_rate": 0.0004981359750440968, "loss": 5.5981, "mean_token_accuracy": 0.15791754126548768, "num_tokens": 10773569.0, "step": 5840 }, { "entropy": 5.703838157653808, "epoch": 0.4910733039277463, "grad_norm": 0.98046875, "learning_rate": 0.0004981321263082603, "loss": 5.5547, "mean_token_accuracy": 0.15730964243412018, "num_tokens": 10782298.0, "step": 5845 }, { "entropy": 5.705076360702515, "epoch": 0.4914933837429111, "grad_norm": 0.921875, "learning_rate": 0.000498128273619753, "loss": 5.5491, "mean_token_accuracy": 0.1628515049815178, "num_tokens": 10792087.0, "step": 5850 }, { "entropy": 5.771277141571045, "epoch": 0.491913463558076, "grad_norm": 0.9140625, "learning_rate": 0.0004981244169786433, "loss": 5.6458, "mean_token_accuracy": 0.15582741051912308, "num_tokens": 10801641.0, "step": 5855 }, { "entropy": 5.861782169342041, "epoch": 0.4923335433732409, "grad_norm": 0.94140625, "learning_rate": 0.0004981205563849994, "loss": 5.7007, "mean_token_accuracy": 0.15648430287837983, "num_tokens": 10811612.0, "step": 5860 }, { "entropy": 5.788508701324463, "epoch": 0.4927536231884058, "grad_norm": 0.95703125, "learning_rate": 0.0004981166918388897, "loss": 5.5149, "mean_token_accuracy": 0.16366831362247466, "num_tokens": 10821608.0, "step": 5865 }, { "entropy": 5.720433568954467, "epoch": 0.49317370300357066, "grad_norm": 0.98828125, "learning_rate": 0.0004981128233403828, "loss": 5.4915, "mean_token_accuracy": 0.16485851109027863, "num_tokens": 10830679.0, "step": 5870 }, { "entropy": 5.718778944015503, "epoch": 0.49359378281873556, "grad_norm": 0.890625, "learning_rate": 0.000498108950889547, "loss": 5.5507, "mean_token_accuracy": 0.16066077202558518, "num_tokens": 10839669.0, "step": 5875 }, { "entropy": 5.787919759750366, "epoch": 0.49401386263390046, "grad_norm": 0.92578125, "learning_rate": 0.0004981050744864512, "loss": 5.5387, "mean_token_accuracy": 0.16012917906045915, "num_tokens": 10849666.0, "step": 5880 }, { "entropy": 5.731645965576172, "epoch": 0.4944339424490653, "grad_norm": 0.9765625, "learning_rate": 0.0004981011941311638, "loss": 5.455, "mean_token_accuracy": 0.1706133618950844, "num_tokens": 10858225.0, "step": 5885 }, { "entropy": 5.7152073860168455, "epoch": 0.4948540222642302, "grad_norm": 0.9609375, "learning_rate": 0.0004980973098237535, "loss": 5.5608, "mean_token_accuracy": 0.1573803097009659, "num_tokens": 10867466.0, "step": 5890 }, { "entropy": 5.793262910842896, "epoch": 0.4952741020793951, "grad_norm": 0.95703125, "learning_rate": 0.0004980934215642894, "loss": 5.5967, "mean_token_accuracy": 0.1668254867196083, "num_tokens": 10875850.0, "step": 5895 }, { "entropy": 5.676056289672852, "epoch": 0.49569418189456, "grad_norm": 0.98828125, "learning_rate": 0.00049808952935284, "loss": 5.5231, "mean_token_accuracy": 0.16948444843292237, "num_tokens": 10885154.0, "step": 5900 }, { "entropy": 5.739302301406861, "epoch": 0.49611426170972484, "grad_norm": 0.984375, "learning_rate": 0.0004980856331894747, "loss": 5.6296, "mean_token_accuracy": 0.16090053021907808, "num_tokens": 10894080.0, "step": 5905 }, { "entropy": 5.7569280624389645, "epoch": 0.49653434152488973, "grad_norm": 0.9453125, "learning_rate": 0.0004980817330742621, "loss": 5.6161, "mean_token_accuracy": 0.15483176559209824, "num_tokens": 10903248.0, "step": 5910 }, { "entropy": 5.768988418579101, "epoch": 0.49695442134005463, "grad_norm": 0.91015625, "learning_rate": 0.0004980778290072716, "loss": 5.5804, "mean_token_accuracy": 0.16294265836477279, "num_tokens": 10912939.0, "step": 5915 }, { "entropy": 5.777530717849731, "epoch": 0.4973745011552195, "grad_norm": 0.9765625, "learning_rate": 0.0004980739209885722, "loss": 5.6127, "mean_token_accuracy": 0.16438234001398086, "num_tokens": 10921505.0, "step": 5920 }, { "entropy": 5.802098226547241, "epoch": 0.49779458097038437, "grad_norm": 0.9375, "learning_rate": 0.0004980700090182331, "loss": 5.6819, "mean_token_accuracy": 0.16335346847772597, "num_tokens": 10931861.0, "step": 5925 }, { "entropy": 5.83542947769165, "epoch": 0.49821466078554927, "grad_norm": 0.921875, "learning_rate": 0.0004980660930963238, "loss": 5.5848, "mean_token_accuracy": 0.16074420511722565, "num_tokens": 10940810.0, "step": 5930 }, { "entropy": 5.723906135559082, "epoch": 0.4986347406007141, "grad_norm": 0.94140625, "learning_rate": 0.0004980621732229133, "loss": 5.4722, "mean_token_accuracy": 0.16402249783277512, "num_tokens": 10949514.0, "step": 5935 }, { "entropy": 5.749081373214722, "epoch": 0.499054820415879, "grad_norm": 0.96875, "learning_rate": 0.0004980582493980714, "loss": 5.6742, "mean_token_accuracy": 0.1556909427046776, "num_tokens": 10959161.0, "step": 5940 }, { "entropy": 5.750719594955444, "epoch": 0.4994749002310439, "grad_norm": 0.890625, "learning_rate": 0.0004980543216218674, "loss": 5.5569, "mean_token_accuracy": 0.17051900774240494, "num_tokens": 10968983.0, "step": 5945 }, { "entropy": 5.795907783508301, "epoch": 0.4998949800462088, "grad_norm": 0.9921875, "learning_rate": 0.0004980503898943711, "loss": 5.6755, "mean_token_accuracy": 0.16463214308023452, "num_tokens": 10978044.0, "step": 5950 }, { "entropy": 5.818535089492798, "epoch": 0.5003150598613737, "grad_norm": 0.90625, "learning_rate": 0.0004980464542156519, "loss": 5.5895, "mean_token_accuracy": 0.16786763817071915, "num_tokens": 10986980.0, "step": 5955 }, { "entropy": 5.744042301177979, "epoch": 0.5007351396765385, "grad_norm": 0.953125, "learning_rate": 0.0004980425145857796, "loss": 5.5404, "mean_token_accuracy": 0.17190210670232772, "num_tokens": 10995163.0, "step": 5960 }, { "entropy": 5.6839663028717045, "epoch": 0.5011552194917034, "grad_norm": 0.92578125, "learning_rate": 0.000498038571004824, "loss": 5.4658, "mean_token_accuracy": 0.1701178327202797, "num_tokens": 11003722.0, "step": 5965 }, { "entropy": 5.658802843093872, "epoch": 0.5015752993068683, "grad_norm": 0.93359375, "learning_rate": 0.0004980346234728549, "loss": 5.5459, "mean_token_accuracy": 0.1696319282054901, "num_tokens": 11013176.0, "step": 5970 }, { "entropy": 5.7755608558654785, "epoch": 0.5019953791220332, "grad_norm": 0.94140625, "learning_rate": 0.0004980306719899424, "loss": 5.601, "mean_token_accuracy": 0.16234323978424073, "num_tokens": 11022636.0, "step": 5975 }, { "entropy": 5.711779022216797, "epoch": 0.5024154589371981, "grad_norm": 0.96875, "learning_rate": 0.0004980267165561564, "loss": 5.5409, "mean_token_accuracy": 0.16729752868413925, "num_tokens": 11031896.0, "step": 5980 }, { "entropy": 5.725300073623657, "epoch": 0.502835538752363, "grad_norm": 0.91796875, "learning_rate": 0.0004980227571715669, "loss": 5.579, "mean_token_accuracy": 0.15976378172636033, "num_tokens": 11040802.0, "step": 5985 }, { "entropy": 5.731253290176392, "epoch": 0.5032556185675279, "grad_norm": 0.96484375, "learning_rate": 0.0004980187938362441, "loss": 5.5153, "mean_token_accuracy": 0.1588967353105545, "num_tokens": 11049701.0, "step": 5990 }, { "entropy": 5.786366033554077, "epoch": 0.5036756983826927, "grad_norm": 0.9296875, "learning_rate": 0.0004980148265502581, "loss": 5.694, "mean_token_accuracy": 0.15498168617486954, "num_tokens": 11059555.0, "step": 5995 }, { "entropy": 5.793335866928101, "epoch": 0.5040957781978576, "grad_norm": 0.9921875, "learning_rate": 0.0004980108553136795, "loss": 5.6141, "mean_token_accuracy": 0.16208730340003968, "num_tokens": 11068940.0, "step": 6000 }, { "epoch": 0.5040957781978576, "eval_entropy": 5.5702669805797465, "eval_loss": 5.591900825500488, "eval_mean_token_accuracy": 0.1687953193199262, "eval_num_tokens": 11068940.0, "eval_runtime": 21.0876, "eval_samples_per_second": 1771.942, "eval_steps_per_second": 221.505, "step": 6000 }, { "entropy": 5.811098432540893, "epoch": 0.5045158580130225, "grad_norm": 0.85546875, "learning_rate": 0.0004980068801265783, "loss": 5.5883, "mean_token_accuracy": 0.16337504461407662, "num_tokens": 11079014.0, "step": 6005 }, { "entropy": 5.81418080329895, "epoch": 0.5049359378281874, "grad_norm": 0.953125, "learning_rate": 0.0004980029009890251, "loss": 5.6866, "mean_token_accuracy": 0.15968995168805122, "num_tokens": 11089526.0, "step": 6010 }, { "entropy": 5.788970947265625, "epoch": 0.5053560176433523, "grad_norm": 0.9609375, "learning_rate": 0.0004979989179010904, "loss": 5.5593, "mean_token_accuracy": 0.16788360476493835, "num_tokens": 11099156.0, "step": 6015 }, { "entropy": 5.692927360534668, "epoch": 0.5057760974585171, "grad_norm": 1.03125, "learning_rate": 0.0004979949308628445, "loss": 5.5502, "mean_token_accuracy": 0.1613062158226967, "num_tokens": 11108242.0, "step": 6020 }, { "entropy": 5.660373067855835, "epoch": 0.506196177273682, "grad_norm": 0.9453125, "learning_rate": 0.0004979909398743584, "loss": 5.5452, "mean_token_accuracy": 0.16594227254390717, "num_tokens": 11118076.0, "step": 6025 }, { "entropy": 5.79341983795166, "epoch": 0.5066162570888468, "grad_norm": 0.87109375, "learning_rate": 0.0004979869449357026, "loss": 5.6164, "mean_token_accuracy": 0.16827214658260345, "num_tokens": 11127265.0, "step": 6030 }, { "entropy": 5.790122604370117, "epoch": 0.5070363369040117, "grad_norm": 0.90625, "learning_rate": 0.0004979829460469478, "loss": 5.537, "mean_token_accuracy": 0.16255101412534714, "num_tokens": 11136429.0, "step": 6035 }, { "entropy": 5.731788492202758, "epoch": 0.5074564167191766, "grad_norm": 0.921875, "learning_rate": 0.0004979789432081649, "loss": 5.5406, "mean_token_accuracy": 0.16618361473083496, "num_tokens": 11146201.0, "step": 6040 }, { "entropy": 5.780402135848999, "epoch": 0.5078764965343415, "grad_norm": 1.015625, "learning_rate": 0.000497974936419425, "loss": 5.5673, "mean_token_accuracy": 0.16517029255628585, "num_tokens": 11154867.0, "step": 6045 }, { "entropy": 5.6448524475097654, "epoch": 0.5082965763495064, "grad_norm": 0.96875, "learning_rate": 0.0004979709256807989, "loss": 5.6053, "mean_token_accuracy": 0.16172728240489959, "num_tokens": 11164092.0, "step": 6050 }, { "entropy": 5.781954717636109, "epoch": 0.5087166561646713, "grad_norm": 0.921875, "learning_rate": 0.0004979669109923575, "loss": 5.6388, "mean_token_accuracy": 0.15602062940597533, "num_tokens": 11173176.0, "step": 6055 }, { "entropy": 5.811630630493164, "epoch": 0.5091367359798362, "grad_norm": 0.9375, "learning_rate": 0.0004979628923541721, "loss": 5.5983, "mean_token_accuracy": 0.16064341068267823, "num_tokens": 11182397.0, "step": 6060 }, { "entropy": 5.808675527572632, "epoch": 0.509556815795001, "grad_norm": 0.92578125, "learning_rate": 0.000497958869766314, "loss": 5.6302, "mean_token_accuracy": 0.16145683825016022, "num_tokens": 11191790.0, "step": 6065 }, { "entropy": 5.783952713012695, "epoch": 0.5099768956101659, "grad_norm": 0.9140625, "learning_rate": 0.0004979548432288543, "loss": 5.541, "mean_token_accuracy": 0.1686972364783287, "num_tokens": 11201104.0, "step": 6070 }, { "entropy": 5.752194738388061, "epoch": 0.5103969754253308, "grad_norm": 1.0, "learning_rate": 0.0004979508127418643, "loss": 5.5324, "mean_token_accuracy": 0.16261855214834214, "num_tokens": 11209578.0, "step": 6075 }, { "entropy": 5.768413734436035, "epoch": 0.5108170552404957, "grad_norm": 1.0546875, "learning_rate": 0.0004979467783054155, "loss": 5.5069, "mean_token_accuracy": 0.1716530740261078, "num_tokens": 11218380.0, "step": 6080 }, { "entropy": 5.7077422618865965, "epoch": 0.5112371350556606, "grad_norm": 0.953125, "learning_rate": 0.0004979427399195793, "loss": 5.5338, "mean_token_accuracy": 0.16079539507627488, "num_tokens": 11227810.0, "step": 6085 }, { "entropy": 5.74758620262146, "epoch": 0.5116572148708255, "grad_norm": 1.1015625, "learning_rate": 0.0004979386975844274, "loss": 5.5518, "mean_token_accuracy": 0.1612395703792572, "num_tokens": 11236631.0, "step": 6090 }, { "entropy": 5.72519702911377, "epoch": 0.5120772946859904, "grad_norm": 0.8125, "learning_rate": 0.0004979346513000311, "loss": 5.5893, "mean_token_accuracy": 0.15641138106584548, "num_tokens": 11247418.0, "step": 6095 }, { "entropy": 5.719963645935058, "epoch": 0.5124973745011552, "grad_norm": 0.9296875, "learning_rate": 0.0004979306010664623, "loss": 5.5085, "mean_token_accuracy": 0.1705961272120476, "num_tokens": 11256246.0, "step": 6100 }, { "entropy": 5.627693128585816, "epoch": 0.5129174543163201, "grad_norm": 0.87890625, "learning_rate": 0.0004979265468837927, "loss": 5.4941, "mean_token_accuracy": 0.16766001135110856, "num_tokens": 11265980.0, "step": 6105 }, { "entropy": 5.759566164016723, "epoch": 0.513337534131485, "grad_norm": 0.9140625, "learning_rate": 0.000497922488752094, "loss": 5.529, "mean_token_accuracy": 0.1628105789422989, "num_tokens": 11276158.0, "step": 6110 }, { "entropy": 5.7324329853057865, "epoch": 0.5137576139466499, "grad_norm": 0.94140625, "learning_rate": 0.0004979184266714383, "loss": 5.4482, "mean_token_accuracy": 0.16801214665174485, "num_tokens": 11284957.0, "step": 6115 }, { "entropy": 5.649721574783325, "epoch": 0.5141776937618148, "grad_norm": 0.98046875, "learning_rate": 0.0004979143606418974, "loss": 5.482, "mean_token_accuracy": 0.16361449509859086, "num_tokens": 11294340.0, "step": 6120 }, { "entropy": 5.820867586135864, "epoch": 0.5145977735769797, "grad_norm": 0.9140625, "learning_rate": 0.0004979102906635435, "loss": 5.7268, "mean_token_accuracy": 0.15687822848558425, "num_tokens": 11303344.0, "step": 6125 }, { "entropy": 5.772322273254394, "epoch": 0.5150178533921445, "grad_norm": 1.09375, "learning_rate": 0.0004979062167364486, "loss": 5.5951, "mean_token_accuracy": 0.16613196283578874, "num_tokens": 11311338.0, "step": 6130 }, { "entropy": 5.675012588500977, "epoch": 0.5154379332073094, "grad_norm": 0.9765625, "learning_rate": 0.0004979021388606847, "loss": 5.4169, "mean_token_accuracy": 0.17838600128889084, "num_tokens": 11320194.0, "step": 6135 }, { "entropy": 5.737465143203735, "epoch": 0.5158580130224742, "grad_norm": 0.8671875, "learning_rate": 0.0004978980570363243, "loss": 5.6104, "mean_token_accuracy": 0.1656625747680664, "num_tokens": 11329952.0, "step": 6140 }, { "entropy": 5.681354331970215, "epoch": 0.5162780928376391, "grad_norm": 0.98046875, "learning_rate": 0.0004978939712634396, "loss": 5.5507, "mean_token_accuracy": 0.16612329334020615, "num_tokens": 11339384.0, "step": 6145 }, { "entropy": 5.825163555145264, "epoch": 0.516698172652804, "grad_norm": 0.9375, "learning_rate": 0.0004978898815421029, "loss": 5.7224, "mean_token_accuracy": 0.1597566932439804, "num_tokens": 11348409.0, "step": 6150 }, { "entropy": 5.876521301269531, "epoch": 0.5171182524679689, "grad_norm": 1.03125, "learning_rate": 0.0004978857878723867, "loss": 5.6278, "mean_token_accuracy": 0.16059536784887313, "num_tokens": 11357478.0, "step": 6155 }, { "entropy": 5.806015205383301, "epoch": 0.5175383322831338, "grad_norm": 1.0, "learning_rate": 0.0004978816902543636, "loss": 5.6454, "mean_token_accuracy": 0.15964788049459458, "num_tokens": 11366379.0, "step": 6160 }, { "entropy": 5.7852825164794925, "epoch": 0.5179584120982986, "grad_norm": 0.90625, "learning_rate": 0.0004978775886881062, "loss": 5.6466, "mean_token_accuracy": 0.15952356532216072, "num_tokens": 11376357.0, "step": 6165 }, { "entropy": 5.7297625064849855, "epoch": 0.5183784919134635, "grad_norm": 1.015625, "learning_rate": 0.000497873483173687, "loss": 5.5309, "mean_token_accuracy": 0.17101848274469375, "num_tokens": 11384995.0, "step": 6170 }, { "entropy": 5.71215410232544, "epoch": 0.5187985717286284, "grad_norm": 0.90234375, "learning_rate": 0.0004978693737111787, "loss": 5.5337, "mean_token_accuracy": 0.1644275352358818, "num_tokens": 11395363.0, "step": 6175 }, { "entropy": 5.756123781204224, "epoch": 0.5192186515437933, "grad_norm": 0.9296875, "learning_rate": 0.0004978652603006543, "loss": 5.5116, "mean_token_accuracy": 0.1630913570523262, "num_tokens": 11404511.0, "step": 6180 }, { "entropy": 5.801231575012207, "epoch": 0.5196387313589582, "grad_norm": 0.9375, "learning_rate": 0.0004978611429421866, "loss": 5.5624, "mean_token_accuracy": 0.16539832353591918, "num_tokens": 11413400.0, "step": 6185 }, { "entropy": 5.74934229850769, "epoch": 0.5200588111741231, "grad_norm": 0.95703125, "learning_rate": 0.0004978570216358485, "loss": 5.6156, "mean_token_accuracy": 0.15378101766109467, "num_tokens": 11423693.0, "step": 6190 }, { "entropy": 5.790632915496826, "epoch": 0.520478890989288, "grad_norm": 0.9296875, "learning_rate": 0.000497852896381713, "loss": 5.5801, "mean_token_accuracy": 0.15778316110372542, "num_tokens": 11433195.0, "step": 6195 }, { "entropy": 5.809474945068359, "epoch": 0.5208989708044528, "grad_norm": 0.99609375, "learning_rate": 0.0004978487671798531, "loss": 5.702, "mean_token_accuracy": 0.15822496265172958, "num_tokens": 11443416.0, "step": 6200 }, { "entropy": 5.802986145019531, "epoch": 0.5213190506196177, "grad_norm": 0.984375, "learning_rate": 0.0004978446340303422, "loss": 5.5712, "mean_token_accuracy": 0.16286559998989106, "num_tokens": 11452487.0, "step": 6205 }, { "entropy": 5.800027227401733, "epoch": 0.5217391304347826, "grad_norm": 0.98046875, "learning_rate": 0.0004978404969332533, "loss": 5.5917, "mean_token_accuracy": 0.16486820578575134, "num_tokens": 11461893.0, "step": 6210 }, { "entropy": 5.672508907318115, "epoch": 0.5221592102499475, "grad_norm": 0.91015625, "learning_rate": 0.0004978363558886597, "loss": 5.5188, "mean_token_accuracy": 0.1554739385843277, "num_tokens": 11471238.0, "step": 6215 }, { "entropy": 5.718248462677002, "epoch": 0.5225792900651124, "grad_norm": 0.9609375, "learning_rate": 0.0004978322108966348, "loss": 5.6277, "mean_token_accuracy": 0.15638385266065596, "num_tokens": 11480571.0, "step": 6220 }, { "entropy": 5.763249778747559, "epoch": 0.5229993698802773, "grad_norm": 0.90234375, "learning_rate": 0.0004978280619572521, "loss": 5.6024, "mean_token_accuracy": 0.16089674681425095, "num_tokens": 11489552.0, "step": 6225 }, { "entropy": 5.776705312728882, "epoch": 0.5234194496954422, "grad_norm": 0.95703125, "learning_rate": 0.000497823909070585, "loss": 5.6565, "mean_token_accuracy": 0.15730864256620408, "num_tokens": 11498715.0, "step": 6230 }, { "entropy": 5.766147661209106, "epoch": 0.523839529510607, "grad_norm": 0.96875, "learning_rate": 0.0004978197522367071, "loss": 5.5864, "mean_token_accuracy": 0.15774240344762802, "num_tokens": 11508472.0, "step": 6235 }, { "entropy": 5.825910902023315, "epoch": 0.5242596093257719, "grad_norm": 0.90234375, "learning_rate": 0.0004978155914556919, "loss": 5.5261, "mean_token_accuracy": 0.17228852659463884, "num_tokens": 11517620.0, "step": 6240 }, { "entropy": 5.7359106063842775, "epoch": 0.5246796891409368, "grad_norm": 0.93359375, "learning_rate": 0.0004978114267276134, "loss": 5.5822, "mean_token_accuracy": 0.1607842430472374, "num_tokens": 11526106.0, "step": 6245 }, { "entropy": 5.735781860351563, "epoch": 0.5250997689561017, "grad_norm": 0.97265625, "learning_rate": 0.0004978072580525451, "loss": 5.6084, "mean_token_accuracy": 0.16850581914186477, "num_tokens": 11535840.0, "step": 6250 }, { "entropy": 5.773589372634888, "epoch": 0.5255198487712666, "grad_norm": 0.953125, "learning_rate": 0.000497803085430561, "loss": 5.5746, "mean_token_accuracy": 0.16597797349095345, "num_tokens": 11545110.0, "step": 6255 }, { "entropy": 5.841729068756104, "epoch": 0.5259399285864315, "grad_norm": 1.0, "learning_rate": 0.0004977989088617349, "loss": 5.6189, "mean_token_accuracy": 0.158939990401268, "num_tokens": 11554382.0, "step": 6260 }, { "entropy": 5.731269979476929, "epoch": 0.5263600084015964, "grad_norm": 0.9609375, "learning_rate": 0.000497794728346141, "loss": 5.4784, "mean_token_accuracy": 0.1658696085214615, "num_tokens": 11562821.0, "step": 6265 }, { "entropy": 5.805121564865113, "epoch": 0.5267800882167611, "grad_norm": 0.99609375, "learning_rate": 0.0004977905438838531, "loss": 5.6848, "mean_token_accuracy": 0.15056246519088745, "num_tokens": 11571705.0, "step": 6270 }, { "entropy": 5.646053838729858, "epoch": 0.527200168031926, "grad_norm": 0.9375, "learning_rate": 0.0004977863554749453, "loss": 5.5048, "mean_token_accuracy": 0.1614176630973816, "num_tokens": 11580692.0, "step": 6275 }, { "entropy": 5.706112480163574, "epoch": 0.5276202478470909, "grad_norm": 0.8828125, "learning_rate": 0.0004977821631194922, "loss": 5.5261, "mean_token_accuracy": 0.15832365602254866, "num_tokens": 11589966.0, "step": 6280 }, { "entropy": 5.80370602607727, "epoch": 0.5280403276622558, "grad_norm": 0.85546875, "learning_rate": 0.0004977779668175677, "loss": 5.6014, "mean_token_accuracy": 0.15689835250377654, "num_tokens": 11599627.0, "step": 6285 }, { "entropy": 5.776365804672241, "epoch": 0.5284604074774207, "grad_norm": 0.93359375, "learning_rate": 0.0004977737665692461, "loss": 5.577, "mean_token_accuracy": 0.16786182373762132, "num_tokens": 11608431.0, "step": 6290 }, { "entropy": 5.7039391040802006, "epoch": 0.5288804872925856, "grad_norm": 0.92578125, "learning_rate": 0.0004977695623746021, "loss": 5.4668, "mean_token_accuracy": 0.1596635565161705, "num_tokens": 11617552.0, "step": 6295 }, { "entropy": 5.688570165634156, "epoch": 0.5293005671077504, "grad_norm": 1.015625, "learning_rate": 0.0004977653542337099, "loss": 5.505, "mean_token_accuracy": 0.168349389731884, "num_tokens": 11626828.0, "step": 6300 }, { "entropy": 5.760095262527466, "epoch": 0.5297206469229153, "grad_norm": 0.96484375, "learning_rate": 0.0004977611421466443, "loss": 5.5798, "mean_token_accuracy": 0.16194986999034883, "num_tokens": 11635867.0, "step": 6305 }, { "entropy": 5.784766721725464, "epoch": 0.5301407267380802, "grad_norm": 0.953125, "learning_rate": 0.0004977569261134797, "loss": 5.4934, "mean_token_accuracy": 0.1665690392255783, "num_tokens": 11644711.0, "step": 6310 }, { "entropy": 5.731612682342529, "epoch": 0.5305608065532451, "grad_norm": 1.0078125, "learning_rate": 0.0004977527061342908, "loss": 5.5935, "mean_token_accuracy": 0.16445396840572357, "num_tokens": 11653320.0, "step": 6315 }, { "entropy": 5.741483688354492, "epoch": 0.53098088636841, "grad_norm": 0.890625, "learning_rate": 0.0004977484822091524, "loss": 5.5402, "mean_token_accuracy": 0.16396106481552125, "num_tokens": 11662753.0, "step": 6320 }, { "entropy": 5.774769592285156, "epoch": 0.5314009661835749, "grad_norm": 1.140625, "learning_rate": 0.0004977442543381394, "loss": 5.5781, "mean_token_accuracy": 0.1614773690700531, "num_tokens": 11671622.0, "step": 6325 }, { "entropy": 5.791268253326416, "epoch": 0.5318210459987398, "grad_norm": 1.0078125, "learning_rate": 0.0004977400225213266, "loss": 5.5734, "mean_token_accuracy": 0.158200266957283, "num_tokens": 11679964.0, "step": 6330 }, { "entropy": 5.711006307601929, "epoch": 0.5322411258139046, "grad_norm": 0.9375, "learning_rate": 0.000497735786758789, "loss": 5.5241, "mean_token_accuracy": 0.16033022105693817, "num_tokens": 11688700.0, "step": 6335 }, { "entropy": 5.755384397506714, "epoch": 0.5326612056290695, "grad_norm": 0.96484375, "learning_rate": 0.0004977315470506016, "loss": 5.642, "mean_token_accuracy": 0.1639561802148819, "num_tokens": 11698425.0, "step": 6340 }, { "entropy": 5.840785360336303, "epoch": 0.5330812854442344, "grad_norm": 0.984375, "learning_rate": 0.0004977273033968397, "loss": 5.6163, "mean_token_accuracy": 0.15594931393861772, "num_tokens": 11707705.0, "step": 6345 }, { "entropy": 5.712259912490845, "epoch": 0.5335013652593993, "grad_norm": 0.97265625, "learning_rate": 0.0004977230557975782, "loss": 5.5145, "mean_token_accuracy": 0.16725486963987352, "num_tokens": 11717079.0, "step": 6350 }, { "entropy": 5.719585847854614, "epoch": 0.5339214450745642, "grad_norm": 0.9921875, "learning_rate": 0.0004977188042528923, "loss": 5.5149, "mean_token_accuracy": 0.16303325742483138, "num_tokens": 11725504.0, "step": 6355 }, { "entropy": 5.7473976612091064, "epoch": 0.5343415248897291, "grad_norm": 0.94921875, "learning_rate": 0.0004977145487628576, "loss": 5.5969, "mean_token_accuracy": 0.1630012646317482, "num_tokens": 11735282.0, "step": 6360 }, { "entropy": 5.7814888000488285, "epoch": 0.534761604704894, "grad_norm": 0.921875, "learning_rate": 0.0004977102893275494, "loss": 5.5763, "mean_token_accuracy": 0.15976961851119995, "num_tokens": 11744827.0, "step": 6365 }, { "entropy": 5.773612451553345, "epoch": 0.5351816845200588, "grad_norm": 1.03125, "learning_rate": 0.000497706025947043, "loss": 5.5367, "mean_token_accuracy": 0.16401349604129792, "num_tokens": 11753066.0, "step": 6370 }, { "entropy": 5.736017036437988, "epoch": 0.5356017643352237, "grad_norm": 1.0703125, "learning_rate": 0.0004977017586214142, "loss": 5.5737, "mean_token_accuracy": 0.16376062780618666, "num_tokens": 11761190.0, "step": 6375 }, { "entropy": 5.764604949951172, "epoch": 0.5360218441503886, "grad_norm": 0.94140625, "learning_rate": 0.0004976974873507382, "loss": 5.5103, "mean_token_accuracy": 0.1692323476076126, "num_tokens": 11770321.0, "step": 6380 }, { "entropy": 5.737596845626831, "epoch": 0.5364419239655535, "grad_norm": 0.9140625, "learning_rate": 0.000497693212135091, "loss": 5.5927, "mean_token_accuracy": 0.16351059675216675, "num_tokens": 11778388.0, "step": 6385 }, { "entropy": 5.7780561447143555, "epoch": 0.5368620037807184, "grad_norm": 1.015625, "learning_rate": 0.0004976889329745482, "loss": 5.4529, "mean_token_accuracy": 0.17066252157092093, "num_tokens": 11786250.0, "step": 6390 }, { "entropy": 5.686847257614136, "epoch": 0.5372820835958833, "grad_norm": 0.97265625, "learning_rate": 0.0004976846498691857, "loss": 5.4307, "mean_token_accuracy": 0.1696195513010025, "num_tokens": 11794831.0, "step": 6395 }, { "entropy": 5.72705192565918, "epoch": 0.5377021634110482, "grad_norm": 1.0703125, "learning_rate": 0.0004976803628190792, "loss": 5.4736, "mean_token_accuracy": 0.17794454842805862, "num_tokens": 11803550.0, "step": 6400 }, { "entropy": 5.74969711303711, "epoch": 0.5381222432262129, "grad_norm": 0.9375, "learning_rate": 0.0004976760718243047, "loss": 5.5546, "mean_token_accuracy": 0.16087348014116287, "num_tokens": 11812478.0, "step": 6405 }, { "entropy": 5.741421031951904, "epoch": 0.5385423230413778, "grad_norm": 0.9921875, "learning_rate": 0.0004976717768849383, "loss": 5.516, "mean_token_accuracy": 0.15771948993206025, "num_tokens": 11822463.0, "step": 6410 }, { "entropy": 5.702504682540893, "epoch": 0.5389624028565427, "grad_norm": 0.984375, "learning_rate": 0.0004976674780010561, "loss": 5.5713, "mean_token_accuracy": 0.15501011312007903, "num_tokens": 11831853.0, "step": 6415 }, { "entropy": 5.749214172363281, "epoch": 0.5393824826717076, "grad_norm": 0.94140625, "learning_rate": 0.000497663175172734, "loss": 5.5864, "mean_token_accuracy": 0.15965323597192765, "num_tokens": 11841574.0, "step": 6420 }, { "entropy": 5.805502128601074, "epoch": 0.5398025624868725, "grad_norm": 0.890625, "learning_rate": 0.0004976588684000486, "loss": 5.6666, "mean_token_accuracy": 0.1459944285452366, "num_tokens": 11852489.0, "step": 6425 }, { "entropy": 5.7659282207489015, "epoch": 0.5402226423020374, "grad_norm": 0.921875, "learning_rate": 0.0004976545576830759, "loss": 5.5435, "mean_token_accuracy": 0.15960338413715364, "num_tokens": 11861499.0, "step": 6430 }, { "entropy": 5.764361619949341, "epoch": 0.5406427221172023, "grad_norm": 0.9296875, "learning_rate": 0.0004976502430218924, "loss": 5.6215, "mean_token_accuracy": 0.15667859464883804, "num_tokens": 11871685.0, "step": 6435 }, { "entropy": 5.763283014297485, "epoch": 0.5410628019323671, "grad_norm": 0.83984375, "learning_rate": 0.0004976459244165744, "loss": 5.5216, "mean_token_accuracy": 0.16296222656965256, "num_tokens": 11881340.0, "step": 6440 }, { "entropy": 5.711437559127807, "epoch": 0.541482881747532, "grad_norm": 0.88671875, "learning_rate": 0.0004976416018671986, "loss": 5.5449, "mean_token_accuracy": 0.15986063182353974, "num_tokens": 11890700.0, "step": 6445 }, { "entropy": 5.737738609313965, "epoch": 0.5419029615626969, "grad_norm": 0.98046875, "learning_rate": 0.0004976372753738415, "loss": 5.5329, "mean_token_accuracy": 0.1589517265558243, "num_tokens": 11900329.0, "step": 6450 }, { "entropy": 5.888564586639404, "epoch": 0.5423230413778618, "grad_norm": 0.94921875, "learning_rate": 0.0004976329449365795, "loss": 5.5801, "mean_token_accuracy": 0.1566044047474861, "num_tokens": 11909915.0, "step": 6455 }, { "entropy": 5.737349987030029, "epoch": 0.5427431211930267, "grad_norm": 0.99609375, "learning_rate": 0.0004976286105554897, "loss": 5.5918, "mean_token_accuracy": 0.16518180966377258, "num_tokens": 11918302.0, "step": 6460 }, { "entropy": 5.755007314682007, "epoch": 0.5431632010081916, "grad_norm": 0.9296875, "learning_rate": 0.0004976242722306487, "loss": 5.5454, "mean_token_accuracy": 0.16296235620975494, "num_tokens": 11927794.0, "step": 6465 }, { "entropy": 5.803985500335694, "epoch": 0.5435832808233564, "grad_norm": 0.8515625, "learning_rate": 0.0004976199299621333, "loss": 5.5802, "mean_token_accuracy": 0.16151558607816696, "num_tokens": 11937701.0, "step": 6470 }, { "entropy": 5.689332914352417, "epoch": 0.5440033606385213, "grad_norm": 1.1328125, "learning_rate": 0.0004976155837500205, "loss": 5.4851, "mean_token_accuracy": 0.1696722015738487, "num_tokens": 11946106.0, "step": 6475 }, { "entropy": 5.72600040435791, "epoch": 0.5444234404536862, "grad_norm": 0.98828125, "learning_rate": 0.0004976112335943872, "loss": 5.4262, "mean_token_accuracy": 0.16228668838739396, "num_tokens": 11954604.0, "step": 6480 }, { "entropy": 5.630837154388428, "epoch": 0.5448435202688511, "grad_norm": 0.98046875, "learning_rate": 0.0004976068794953106, "loss": 5.4824, "mean_token_accuracy": 0.16968904286623002, "num_tokens": 11963664.0, "step": 6485 }, { "entropy": 5.780311393737793, "epoch": 0.545263600084016, "grad_norm": 0.8671875, "learning_rate": 0.0004976025214528677, "loss": 5.4771, "mean_token_accuracy": 0.16729624718427658, "num_tokens": 11973426.0, "step": 6490 }, { "entropy": 5.74679913520813, "epoch": 0.5456836798991809, "grad_norm": 0.9765625, "learning_rate": 0.0004975981594671359, "loss": 5.5305, "mean_token_accuracy": 0.16190839260816575, "num_tokens": 11982339.0, "step": 6495 }, { "entropy": 5.776019430160522, "epoch": 0.5461037597143458, "grad_norm": 1.0234375, "learning_rate": 0.0004975937935381921, "loss": 5.5592, "mean_token_accuracy": 0.16586280912160872, "num_tokens": 11992016.0, "step": 6500 }, { "entropy": 5.698467683792114, "epoch": 0.5465238395295106, "grad_norm": 1.1015625, "learning_rate": 0.000497589423666114, "loss": 5.565, "mean_token_accuracy": 0.16193219423294067, "num_tokens": 12000616.0, "step": 6505 }, { "entropy": 5.5959553718566895, "epoch": 0.5469439193446755, "grad_norm": 0.90234375, "learning_rate": 0.0004975850498509789, "loss": 5.4744, "mean_token_accuracy": 0.1637238934636116, "num_tokens": 12009717.0, "step": 6510 }, { "entropy": 5.679888772964477, "epoch": 0.5473639991598404, "grad_norm": 1.0234375, "learning_rate": 0.0004975806720928642, "loss": 5.5583, "mean_token_accuracy": 0.1625445678830147, "num_tokens": 12018020.0, "step": 6515 }, { "entropy": 5.791135978698731, "epoch": 0.5477840789750053, "grad_norm": 1.0, "learning_rate": 0.0004975762903918475, "loss": 5.5404, "mean_token_accuracy": 0.16019310504198075, "num_tokens": 12027119.0, "step": 6520 }, { "entropy": 5.763389539718628, "epoch": 0.5482041587901701, "grad_norm": 1.0234375, "learning_rate": 0.0004975719047480064, "loss": 5.5369, "mean_token_accuracy": 0.16848756968975068, "num_tokens": 12035566.0, "step": 6525 }, { "entropy": 5.692020082473755, "epoch": 0.548624238605335, "grad_norm": 0.9609375, "learning_rate": 0.0004975675151614187, "loss": 5.4426, "mean_token_accuracy": 0.170123191177845, "num_tokens": 12044505.0, "step": 6530 }, { "entropy": 5.619001770019532, "epoch": 0.5490443184204999, "grad_norm": 1.0390625, "learning_rate": 0.000497563121632162, "loss": 5.5066, "mean_token_accuracy": 0.16735866218805312, "num_tokens": 12053338.0, "step": 6535 }, { "entropy": 5.727146291732788, "epoch": 0.5494643982356647, "grad_norm": 0.984375, "learning_rate": 0.0004975587241603142, "loss": 5.5111, "mean_token_accuracy": 0.16334682554006577, "num_tokens": 12063235.0, "step": 6540 }, { "entropy": 5.80925874710083, "epoch": 0.5498844780508296, "grad_norm": 0.96484375, "learning_rate": 0.0004975543227459533, "loss": 5.5874, "mean_token_accuracy": 0.1605127662420273, "num_tokens": 12072490.0, "step": 6545 }, { "entropy": 5.744976043701172, "epoch": 0.5503045578659945, "grad_norm": 0.921875, "learning_rate": 0.0004975499173891571, "loss": 5.6339, "mean_token_accuracy": 0.15866934806108474, "num_tokens": 12081474.0, "step": 6550 }, { "entropy": 5.681692361831665, "epoch": 0.5507246376811594, "grad_norm": 0.921875, "learning_rate": 0.0004975455080900037, "loss": 5.5062, "mean_token_accuracy": 0.1674065738916397, "num_tokens": 12090963.0, "step": 6555 }, { "entropy": 5.727477884292602, "epoch": 0.5511447174963243, "grad_norm": 1.0078125, "learning_rate": 0.0004975410948485713, "loss": 5.5206, "mean_token_accuracy": 0.16142902970314027, "num_tokens": 12099786.0, "step": 6560 }, { "entropy": 5.699660587310791, "epoch": 0.5515647973114892, "grad_norm": 0.94140625, "learning_rate": 0.0004975366776649379, "loss": 5.5353, "mean_token_accuracy": 0.16478368937969207, "num_tokens": 12108469.0, "step": 6565 }, { "entropy": 5.764699554443359, "epoch": 0.5519848771266541, "grad_norm": 0.94140625, "learning_rate": 0.0004975322565391818, "loss": 5.4985, "mean_token_accuracy": 0.16510994732379913, "num_tokens": 12118287.0, "step": 6570 }, { "entropy": 5.802925443649292, "epoch": 0.5524049569418189, "grad_norm": 0.97265625, "learning_rate": 0.0004975278314713814, "loss": 5.6693, "mean_token_accuracy": 0.15847567915916444, "num_tokens": 12127122.0, "step": 6575 }, { "entropy": 5.770570850372314, "epoch": 0.5528250367569838, "grad_norm": 0.9921875, "learning_rate": 0.0004975234024616152, "loss": 5.5604, "mean_token_accuracy": 0.17042070776224136, "num_tokens": 12136395.0, "step": 6580 }, { "entropy": 5.65176568031311, "epoch": 0.5532451165721487, "grad_norm": 0.9921875, "learning_rate": 0.0004975189695099613, "loss": 5.5381, "mean_token_accuracy": 0.16711881011724472, "num_tokens": 12145025.0, "step": 6585 }, { "entropy": 5.764221954345703, "epoch": 0.5536651963873136, "grad_norm": 0.9375, "learning_rate": 0.0004975145326164985, "loss": 5.5774, "mean_token_accuracy": 0.15798249989748, "num_tokens": 12154352.0, "step": 6590 }, { "entropy": 5.7215704917907715, "epoch": 0.5540852762024785, "grad_norm": 0.95703125, "learning_rate": 0.0004975100917813055, "loss": 5.4733, "mean_token_accuracy": 0.16243733167648317, "num_tokens": 12163802.0, "step": 6595 }, { "entropy": 5.689832258224487, "epoch": 0.5545053560176434, "grad_norm": 0.9140625, "learning_rate": 0.0004975056470044606, "loss": 5.5086, "mean_token_accuracy": 0.16092772781848907, "num_tokens": 12173111.0, "step": 6600 }, { "entropy": 5.729002904891968, "epoch": 0.5549254358328082, "grad_norm": 0.984375, "learning_rate": 0.0004975011982860428, "loss": 5.5485, "mean_token_accuracy": 0.16289519965648652, "num_tokens": 12182048.0, "step": 6605 }, { "entropy": 5.7060850143432615, "epoch": 0.5553455156479731, "grad_norm": 0.93359375, "learning_rate": 0.0004974967456261309, "loss": 5.5435, "mean_token_accuracy": 0.16328554153442382, "num_tokens": 12191501.0, "step": 6610 }, { "entropy": 5.75695481300354, "epoch": 0.555765595463138, "grad_norm": 0.92578125, "learning_rate": 0.0004974922890248036, "loss": 5.5591, "mean_token_accuracy": 0.16566281169652938, "num_tokens": 12201132.0, "step": 6615 }, { "entropy": 5.838721704483032, "epoch": 0.5561856752783029, "grad_norm": 0.9765625, "learning_rate": 0.00049748782848214, "loss": 5.6971, "mean_token_accuracy": 0.15937435030937194, "num_tokens": 12211082.0, "step": 6620 }, { "entropy": 5.763456106185913, "epoch": 0.5566057550934678, "grad_norm": 0.88671875, "learning_rate": 0.0004974833639982192, "loss": 5.5107, "mean_token_accuracy": 0.16620800793170928, "num_tokens": 12219946.0, "step": 6625 }, { "entropy": 5.808733177185059, "epoch": 0.5570258349086327, "grad_norm": 0.98046875, "learning_rate": 0.00049747889557312, "loss": 5.6113, "mean_token_accuracy": 0.1599217653274536, "num_tokens": 12229668.0, "step": 6630 }, { "entropy": 5.788719987869262, "epoch": 0.5574459147237976, "grad_norm": 0.9296875, "learning_rate": 0.0004974744232069219, "loss": 5.6015, "mean_token_accuracy": 0.16660431921482086, "num_tokens": 12238750.0, "step": 6635 }, { "entropy": 5.708747816085816, "epoch": 0.5578659945389624, "grad_norm": 1.0390625, "learning_rate": 0.0004974699468997038, "loss": 5.5569, "mean_token_accuracy": 0.15997037440538406, "num_tokens": 12246825.0, "step": 6640 }, { "entropy": 5.660719966888427, "epoch": 0.5582860743541272, "grad_norm": 0.99609375, "learning_rate": 0.0004974654666515452, "loss": 5.4793, "mean_token_accuracy": 0.1639156773686409, "num_tokens": 12256413.0, "step": 6645 }, { "entropy": 5.72155442237854, "epoch": 0.5587061541692921, "grad_norm": 1.0703125, "learning_rate": 0.0004974609824625254, "loss": 5.5267, "mean_token_accuracy": 0.17064841985702514, "num_tokens": 12265458.0, "step": 6650 }, { "entropy": 5.6467994213104244, "epoch": 0.559126233984457, "grad_norm": 0.984375, "learning_rate": 0.0004974564943327239, "loss": 5.4547, "mean_token_accuracy": 0.164512038230896, "num_tokens": 12274124.0, "step": 6655 }, { "entropy": 5.596433830261231, "epoch": 0.5595463137996219, "grad_norm": 0.8984375, "learning_rate": 0.00049745200226222, "loss": 5.4174, "mean_token_accuracy": 0.18331650793552398, "num_tokens": 12283513.0, "step": 6660 }, { "entropy": 5.68831057548523, "epoch": 0.5599663936147868, "grad_norm": 1.03125, "learning_rate": 0.0004974475062510936, "loss": 5.5525, "mean_token_accuracy": 0.1645463690161705, "num_tokens": 12292396.0, "step": 6665 }, { "entropy": 5.747618198394775, "epoch": 0.5603864734299517, "grad_norm": 0.92578125, "learning_rate": 0.0004974430062994242, "loss": 5.5838, "mean_token_accuracy": 0.16515465825796127, "num_tokens": 12301604.0, "step": 6670 }, { "entropy": 5.796191072463989, "epoch": 0.5608065532451165, "grad_norm": 1.0, "learning_rate": 0.0004974385024072912, "loss": 5.6032, "mean_token_accuracy": 0.1587561160326004, "num_tokens": 12310458.0, "step": 6675 }, { "entropy": 5.805469274520874, "epoch": 0.5612266330602814, "grad_norm": 0.97265625, "learning_rate": 0.000497433994574775, "loss": 5.61, "mean_token_accuracy": 0.16035176664590836, "num_tokens": 12319620.0, "step": 6680 }, { "entropy": 5.804027795791626, "epoch": 0.5616467128754463, "grad_norm": 0.91796875, "learning_rate": 0.000497429482801955, "loss": 5.675, "mean_token_accuracy": 0.15980809777975083, "num_tokens": 12329518.0, "step": 6685 }, { "entropy": 5.729493474960327, "epoch": 0.5620667926906112, "grad_norm": 0.90234375, "learning_rate": 0.0004974249670889111, "loss": 5.4737, "mean_token_accuracy": 0.16783603131771088, "num_tokens": 12338244.0, "step": 6690 }, { "entropy": 5.815527057647705, "epoch": 0.5624868725057761, "grad_norm": 1.0078125, "learning_rate": 0.0004974204474357237, "loss": 5.6511, "mean_token_accuracy": 0.16130429953336717, "num_tokens": 12347962.0, "step": 6695 }, { "entropy": 5.79982476234436, "epoch": 0.562906952320941, "grad_norm": 1.0390625, "learning_rate": 0.0004974159238424723, "loss": 5.5647, "mean_token_accuracy": 0.160567244887352, "num_tokens": 12357020.0, "step": 6700 }, { "entropy": 5.701817035675049, "epoch": 0.5633270321361059, "grad_norm": 0.9453125, "learning_rate": 0.0004974113963092376, "loss": 5.5462, "mean_token_accuracy": 0.16599834561347962, "num_tokens": 12366108.0, "step": 6705 }, { "entropy": 5.80378737449646, "epoch": 0.5637471119512707, "grad_norm": 1.0, "learning_rate": 0.0004974068648360995, "loss": 5.4608, "mean_token_accuracy": 0.1770256206393242, "num_tokens": 12374508.0, "step": 6710 }, { "entropy": 5.7393152713775635, "epoch": 0.5641671917664356, "grad_norm": 0.9609375, "learning_rate": 0.0004974023294231383, "loss": 5.4842, "mean_token_accuracy": 0.17291969954967498, "num_tokens": 12383555.0, "step": 6715 }, { "entropy": 5.696082067489624, "epoch": 0.5645872715816005, "grad_norm": 0.921875, "learning_rate": 0.0004973977900704342, "loss": 5.5761, "mean_token_accuracy": 0.1606935977935791, "num_tokens": 12392680.0, "step": 6720 }, { "entropy": 5.782982110977173, "epoch": 0.5650073513967654, "grad_norm": 0.90234375, "learning_rate": 0.0004973932467780679, "loss": 5.6217, "mean_token_accuracy": 0.15996287018060684, "num_tokens": 12401881.0, "step": 6725 }, { "entropy": 5.783317613601684, "epoch": 0.5654274312119303, "grad_norm": 0.93359375, "learning_rate": 0.0004973886995461197, "loss": 5.604, "mean_token_accuracy": 0.15776107162237168, "num_tokens": 12411487.0, "step": 6730 }, { "entropy": 5.699482107162476, "epoch": 0.5658475110270952, "grad_norm": 0.91796875, "learning_rate": 0.0004973841483746703, "loss": 5.4248, "mean_token_accuracy": 0.17524536103010177, "num_tokens": 12420376.0, "step": 6735 }, { "entropy": 5.609754610061645, "epoch": 0.5662675908422601, "grad_norm": 0.8984375, "learning_rate": 0.0004973795932638001, "loss": 5.458, "mean_token_accuracy": 0.17494328320026398, "num_tokens": 12429518.0, "step": 6740 }, { "entropy": 5.715028858184814, "epoch": 0.5666876706574249, "grad_norm": 0.96484375, "learning_rate": 0.00049737503421359, "loss": 5.4491, "mean_token_accuracy": 0.17262679785490037, "num_tokens": 12438952.0, "step": 6745 }, { "entropy": 5.708644962310791, "epoch": 0.5671077504725898, "grad_norm": 1.0859375, "learning_rate": 0.0004973704712241206, "loss": 5.4558, "mean_token_accuracy": 0.16454821228981018, "num_tokens": 12448576.0, "step": 6750 }, { "entropy": 5.688517618179321, "epoch": 0.5675278302877547, "grad_norm": 0.94140625, "learning_rate": 0.0004973659042954729, "loss": 5.4982, "mean_token_accuracy": 0.1647478923201561, "num_tokens": 12458166.0, "step": 6755 }, { "entropy": 5.596537494659424, "epoch": 0.5679479101029196, "grad_norm": 0.9921875, "learning_rate": 0.0004973613334277277, "loss": 5.4163, "mean_token_accuracy": 0.17238699346780778, "num_tokens": 12467271.0, "step": 6760 }, { "entropy": 5.7394147396087645, "epoch": 0.5683679899180845, "grad_norm": 0.96484375, "learning_rate": 0.0004973567586209658, "loss": 5.5871, "mean_token_accuracy": 0.16045358031988144, "num_tokens": 12476255.0, "step": 6765 }, { "entropy": 5.756132364273071, "epoch": 0.5687880697332494, "grad_norm": 0.91796875, "learning_rate": 0.0004973521798752686, "loss": 5.5557, "mean_token_accuracy": 0.16549135744571686, "num_tokens": 12485096.0, "step": 6770 }, { "entropy": 5.816721343994141, "epoch": 0.5692081495484141, "grad_norm": 0.95703125, "learning_rate": 0.000497347597190717, "loss": 5.5779, "mean_token_accuracy": 0.16754906624555588, "num_tokens": 12494405.0, "step": 6775 }, { "entropy": 5.721153497695923, "epoch": 0.569628229363579, "grad_norm": 0.98828125, "learning_rate": 0.0004973430105673921, "loss": 5.5031, "mean_token_accuracy": 0.1665035143494606, "num_tokens": 12503349.0, "step": 6780 }, { "entropy": 5.731798124313355, "epoch": 0.5700483091787439, "grad_norm": 0.94140625, "learning_rate": 0.0004973384200053754, "loss": 5.5885, "mean_token_accuracy": 0.166769115626812, "num_tokens": 12513122.0, "step": 6785 }, { "entropy": 5.723275518417358, "epoch": 0.5704683889939088, "grad_norm": 0.98046875, "learning_rate": 0.000497333825504748, "loss": 5.5249, "mean_token_accuracy": 0.16382726579904555, "num_tokens": 12523614.0, "step": 6790 }, { "entropy": 5.735034370422364, "epoch": 0.5708884688090737, "grad_norm": 0.95703125, "learning_rate": 0.0004973292270655914, "loss": 5.5736, "mean_token_accuracy": 0.15909015834331514, "num_tokens": 12532031.0, "step": 6795 }, { "entropy": 5.809251117706299, "epoch": 0.5713085486242386, "grad_norm": 0.94921875, "learning_rate": 0.000497324624687987, "loss": 5.666, "mean_token_accuracy": 0.15627140551805496, "num_tokens": 12542239.0, "step": 6800 }, { "entropy": 5.856173467636109, "epoch": 0.5717286284394035, "grad_norm": 0.984375, "learning_rate": 0.0004973200183720164, "loss": 5.5806, "mean_token_accuracy": 0.15812304764986038, "num_tokens": 12552608.0, "step": 6805 }, { "entropy": 5.668775606155395, "epoch": 0.5721487082545683, "grad_norm": 0.97265625, "learning_rate": 0.0004973154081177611, "loss": 5.4123, "mean_token_accuracy": 0.16480085700750352, "num_tokens": 12562020.0, "step": 6810 }, { "entropy": 5.667333602905273, "epoch": 0.5725687880697332, "grad_norm": 1.03125, "learning_rate": 0.0004973107939253027, "loss": 5.4832, "mean_token_accuracy": 0.17599694728851317, "num_tokens": 12570519.0, "step": 6815 }, { "entropy": 5.634785413742065, "epoch": 0.5729888678848981, "grad_norm": 0.96875, "learning_rate": 0.0004973061757947233, "loss": 5.4905, "mean_token_accuracy": 0.1663898229598999, "num_tokens": 12579324.0, "step": 6820 }, { "entropy": 5.702479887008667, "epoch": 0.573408947700063, "grad_norm": 0.96875, "learning_rate": 0.0004973015537261043, "loss": 5.5443, "mean_token_accuracy": 0.16654133200645446, "num_tokens": 12588014.0, "step": 6825 }, { "entropy": 5.789632272720337, "epoch": 0.5738290275152279, "grad_norm": 0.921875, "learning_rate": 0.0004972969277195279, "loss": 5.5606, "mean_token_accuracy": 0.16706208139657974, "num_tokens": 12596882.0, "step": 6830 }, { "entropy": 5.706000423431396, "epoch": 0.5742491073303928, "grad_norm": 0.97265625, "learning_rate": 0.0004972922977750757, "loss": 5.4794, "mean_token_accuracy": 0.16413741260766984, "num_tokens": 12606069.0, "step": 6835 }, { "entropy": 5.719677686691284, "epoch": 0.5746691871455577, "grad_norm": 1.4765625, "learning_rate": 0.00049728766389283, "loss": 5.4886, "mean_token_accuracy": 0.16409489065408706, "num_tokens": 12615167.0, "step": 6840 }, { "entropy": 5.6706328868865965, "epoch": 0.5750892669607225, "grad_norm": 1.0546875, "learning_rate": 0.0004972830260728729, "loss": 5.5367, "mean_token_accuracy": 0.1674353748559952, "num_tokens": 12624230.0, "step": 6845 }, { "entropy": 5.757194995880127, "epoch": 0.5755093467758874, "grad_norm": 0.9921875, "learning_rate": 0.0004972783843152863, "loss": 5.5197, "mean_token_accuracy": 0.16837731450796128, "num_tokens": 12633158.0, "step": 6850 }, { "entropy": 5.757494592666626, "epoch": 0.5759294265910523, "grad_norm": 1.046875, "learning_rate": 0.0004972737386201527, "loss": 5.4829, "mean_token_accuracy": 0.16184937953948975, "num_tokens": 12641465.0, "step": 6855 }, { "entropy": 5.663051462173462, "epoch": 0.5763495064062172, "grad_norm": 1.0078125, "learning_rate": 0.0004972690889875541, "loss": 5.4432, "mean_token_accuracy": 0.16741454750299453, "num_tokens": 12650437.0, "step": 6860 }, { "entropy": 5.84966139793396, "epoch": 0.5767695862213821, "grad_norm": 1.0078125, "learning_rate": 0.0004972644354175732, "loss": 5.6532, "mean_token_accuracy": 0.16014729291200638, "num_tokens": 12660072.0, "step": 6865 }, { "entropy": 5.82061538696289, "epoch": 0.577189666036547, "grad_norm": 0.91796875, "learning_rate": 0.0004972597779102922, "loss": 5.6685, "mean_token_accuracy": 0.1602156087756157, "num_tokens": 12670405.0, "step": 6870 }, { "entropy": 5.689360618591309, "epoch": 0.5776097458517119, "grad_norm": 0.94140625, "learning_rate": 0.0004972551164657937, "loss": 5.5423, "mean_token_accuracy": 0.1655457064509392, "num_tokens": 12679992.0, "step": 6875 }, { "entropy": 5.799532318115235, "epoch": 0.5780298256668767, "grad_norm": 0.9453125, "learning_rate": 0.0004972504510841602, "loss": 5.595, "mean_token_accuracy": 0.15801346749067308, "num_tokens": 12690289.0, "step": 6880 }, { "entropy": 5.8381139755249025, "epoch": 0.5784499054820416, "grad_norm": 0.91796875, "learning_rate": 0.0004972457817654745, "loss": 5.5865, "mean_token_accuracy": 0.16085358709096909, "num_tokens": 12700518.0, "step": 6885 }, { "entropy": 5.80426549911499, "epoch": 0.5788699852972065, "grad_norm": 0.99609375, "learning_rate": 0.0004972411085098191, "loss": 5.6329, "mean_token_accuracy": 0.15670239478349685, "num_tokens": 12710603.0, "step": 6890 }, { "entropy": 5.79871392250061, "epoch": 0.5792900651123714, "grad_norm": 0.90625, "learning_rate": 0.000497236431317277, "loss": 5.5266, "mean_token_accuracy": 0.16727050095796586, "num_tokens": 12719298.0, "step": 6895 }, { "entropy": 5.751122093200683, "epoch": 0.5797101449275363, "grad_norm": 1.046875, "learning_rate": 0.000497231750187931, "loss": 5.5178, "mean_token_accuracy": 0.16697220504283905, "num_tokens": 12728368.0, "step": 6900 }, { "entropy": 5.777011489868164, "epoch": 0.5801302247427012, "grad_norm": 0.98046875, "learning_rate": 0.0004972270651218638, "loss": 5.5793, "mean_token_accuracy": 0.16862293779850007, "num_tokens": 12737898.0, "step": 6905 }, { "entropy": 5.75121955871582, "epoch": 0.580550304557866, "grad_norm": 1.046875, "learning_rate": 0.0004972223761191587, "loss": 5.5282, "mean_token_accuracy": 0.16244126260280609, "num_tokens": 12746761.0, "step": 6910 }, { "entropy": 5.668338012695313, "epoch": 0.5809703843730308, "grad_norm": 1.0390625, "learning_rate": 0.0004972176831798986, "loss": 5.4701, "mean_token_accuracy": 0.17220567017793656, "num_tokens": 12755128.0, "step": 6915 }, { "entropy": 5.745266914367676, "epoch": 0.5813904641881957, "grad_norm": 0.89453125, "learning_rate": 0.0004972129863041667, "loss": 5.6462, "mean_token_accuracy": 0.15389580130577088, "num_tokens": 12764727.0, "step": 6920 }, { "entropy": 5.753053855895996, "epoch": 0.5818105440033606, "grad_norm": 0.91796875, "learning_rate": 0.0004972082854920462, "loss": 5.4956, "mean_token_accuracy": 0.16948433965444565, "num_tokens": 12773557.0, "step": 6925 }, { "entropy": 5.731849002838135, "epoch": 0.5822306238185255, "grad_norm": 0.97265625, "learning_rate": 0.0004972035807436203, "loss": 5.5, "mean_token_accuracy": 0.1690128982067108, "num_tokens": 12782525.0, "step": 6930 }, { "entropy": 5.796383476257324, "epoch": 0.5826507036336904, "grad_norm": 0.984375, "learning_rate": 0.0004971988720589723, "loss": 5.5955, "mean_token_accuracy": 0.16173771321773528, "num_tokens": 12791534.0, "step": 6935 }, { "entropy": 5.767693090438843, "epoch": 0.5830707834488553, "grad_norm": 0.91796875, "learning_rate": 0.0004971941594381858, "loss": 5.4897, "mean_token_accuracy": 0.16691604107618332, "num_tokens": 12800662.0, "step": 6940 }, { "entropy": 5.758454275131226, "epoch": 0.5834908632640201, "grad_norm": 0.890625, "learning_rate": 0.0004971894428813441, "loss": 5.5308, "mean_token_accuracy": 0.16786673665046692, "num_tokens": 12809440.0, "step": 6945 }, { "entropy": 5.777513122558593, "epoch": 0.583910943079185, "grad_norm": 0.99609375, "learning_rate": 0.000497184722388531, "loss": 5.6051, "mean_token_accuracy": 0.1612379416823387, "num_tokens": 12818560.0, "step": 6950 }, { "entropy": 5.839304399490357, "epoch": 0.5843310228943499, "grad_norm": 0.96484375, "learning_rate": 0.0004971799979598297, "loss": 5.5324, "mean_token_accuracy": 0.16134003400802613, "num_tokens": 12827898.0, "step": 6955 }, { "entropy": 5.717963171005249, "epoch": 0.5847511027095148, "grad_norm": 0.97265625, "learning_rate": 0.0004971752695953243, "loss": 5.4782, "mean_token_accuracy": 0.16631890833377838, "num_tokens": 12837199.0, "step": 6960 }, { "entropy": 5.713119792938232, "epoch": 0.5851711825246797, "grad_norm": 0.9453125, "learning_rate": 0.0004971705372950984, "loss": 5.5118, "mean_token_accuracy": 0.163696525990963, "num_tokens": 12846493.0, "step": 6965 }, { "entropy": 5.761196613311768, "epoch": 0.5855912623398446, "grad_norm": 0.9140625, "learning_rate": 0.0004971658010592358, "loss": 5.5286, "mean_token_accuracy": 0.16277743577957154, "num_tokens": 12855026.0, "step": 6970 }, { "entropy": 5.776597166061402, "epoch": 0.5860113421550095, "grad_norm": 0.8984375, "learning_rate": 0.0004971610608878205, "loss": 5.5984, "mean_token_accuracy": 0.16150868386030198, "num_tokens": 12864563.0, "step": 6975 }, { "entropy": 5.825516939163208, "epoch": 0.5864314219701743, "grad_norm": 1.0390625, "learning_rate": 0.0004971563167809363, "loss": 5.5258, "mean_token_accuracy": 0.16631446927785873, "num_tokens": 12874358.0, "step": 6980 }, { "entropy": 5.731455850601196, "epoch": 0.5868515017853392, "grad_norm": 0.921875, "learning_rate": 0.0004971515687386674, "loss": 5.5443, "mean_token_accuracy": 0.16318022608757018, "num_tokens": 12883110.0, "step": 6985 }, { "entropy": 5.775928068161011, "epoch": 0.5872715816005041, "grad_norm": 0.8828125, "learning_rate": 0.0004971468167610978, "loss": 5.6099, "mean_token_accuracy": 0.16461408585309983, "num_tokens": 12892977.0, "step": 6990 }, { "entropy": 5.700873947143554, "epoch": 0.587691661415669, "grad_norm": 0.890625, "learning_rate": 0.0004971420608483117, "loss": 5.4117, "mean_token_accuracy": 0.1737138643860817, "num_tokens": 12902327.0, "step": 6995 }, { "entropy": 5.639215755462646, "epoch": 0.5881117412308339, "grad_norm": 0.90625, "learning_rate": 0.0004971373010003936, "loss": 5.4297, "mean_token_accuracy": 0.17889968156814576, "num_tokens": 12911957.0, "step": 7000 }, { "entropy": 5.739131689071655, "epoch": 0.5885318210459988, "grad_norm": 1.0390625, "learning_rate": 0.0004971325372174274, "loss": 5.5105, "mean_token_accuracy": 0.16423840969800949, "num_tokens": 12920380.0, "step": 7005 }, { "entropy": 5.683195638656616, "epoch": 0.5889519008611637, "grad_norm": 0.953125, "learning_rate": 0.0004971277694994976, "loss": 5.5872, "mean_token_accuracy": 0.16479117721319197, "num_tokens": 12929670.0, "step": 7010 }, { "entropy": 5.747731018066406, "epoch": 0.5893719806763285, "grad_norm": 1.0546875, "learning_rate": 0.000497122997846689, "loss": 5.5008, "mean_token_accuracy": 0.1721497356891632, "num_tokens": 12938185.0, "step": 7015 }, { "entropy": 5.772720766067505, "epoch": 0.5897920604914934, "grad_norm": 0.95703125, "learning_rate": 0.0004971182222590857, "loss": 5.5124, "mean_token_accuracy": 0.17144393175840378, "num_tokens": 12947706.0, "step": 7020 }, { "entropy": 5.684038400650024, "epoch": 0.5902121403066583, "grad_norm": 0.91796875, "learning_rate": 0.0004971134427367725, "loss": 5.5055, "mean_token_accuracy": 0.16635899543762206, "num_tokens": 12957393.0, "step": 7025 }, { "entropy": 5.712462902069092, "epoch": 0.5906322201218231, "grad_norm": 0.9296875, "learning_rate": 0.000497108659279834, "loss": 5.4101, "mean_token_accuracy": 0.1759818136692047, "num_tokens": 12967165.0, "step": 7030 }, { "entropy": 5.792285919189453, "epoch": 0.591052299936988, "grad_norm": 1.015625, "learning_rate": 0.0004971038718883551, "loss": 5.5544, "mean_token_accuracy": 0.16032245606184006, "num_tokens": 12976490.0, "step": 7035 }, { "entropy": 5.789936065673828, "epoch": 0.5914723797521529, "grad_norm": 0.9453125, "learning_rate": 0.0004970990805624203, "loss": 5.5441, "mean_token_accuracy": 0.1614286109805107, "num_tokens": 12985423.0, "step": 7040 }, { "entropy": 5.701582956314087, "epoch": 0.5918924595673178, "grad_norm": 1.0546875, "learning_rate": 0.0004970942853021147, "loss": 5.4223, "mean_token_accuracy": 0.17460384517908095, "num_tokens": 12994510.0, "step": 7045 }, { "entropy": 5.767253828048706, "epoch": 0.5923125393824826, "grad_norm": 0.890625, "learning_rate": 0.0004970894861075232, "loss": 5.5559, "mean_token_accuracy": 0.16429632902145386, "num_tokens": 13003383.0, "step": 7050 }, { "entropy": 5.748055267333984, "epoch": 0.5927326191976475, "grad_norm": 0.95703125, "learning_rate": 0.0004970846829787309, "loss": 5.495, "mean_token_accuracy": 0.16619571596384047, "num_tokens": 13012550.0, "step": 7055 }, { "entropy": 5.745292520523071, "epoch": 0.5931526990128124, "grad_norm": 1.015625, "learning_rate": 0.0004970798759158227, "loss": 5.5579, "mean_token_accuracy": 0.16078388690948486, "num_tokens": 13022066.0, "step": 7060 }, { "entropy": 5.743168926239013, "epoch": 0.5935727788279773, "grad_norm": 0.98046875, "learning_rate": 0.0004970750649188839, "loss": 5.536, "mean_token_accuracy": 0.17519628554582595, "num_tokens": 13031008.0, "step": 7065 }, { "entropy": 5.685877513885498, "epoch": 0.5939928586431422, "grad_norm": 0.9140625, "learning_rate": 0.0004970702499879998, "loss": 5.5128, "mean_token_accuracy": 0.16871291399002075, "num_tokens": 13040366.0, "step": 7070 }, { "entropy": 5.668898677825927, "epoch": 0.5944129384583071, "grad_norm": 0.88671875, "learning_rate": 0.0004970654311232554, "loss": 5.5243, "mean_token_accuracy": 0.16426745504140855, "num_tokens": 13051140.0, "step": 7075 }, { "entropy": 5.718292331695556, "epoch": 0.594833018273472, "grad_norm": 1.0234375, "learning_rate": 0.0004970606083247362, "loss": 5.4459, "mean_token_accuracy": 0.16791134625673293, "num_tokens": 13059835.0, "step": 7080 }, { "entropy": 5.667575120925903, "epoch": 0.5952530980886368, "grad_norm": 0.921875, "learning_rate": 0.0004970557815925278, "loss": 5.4135, "mean_token_accuracy": 0.16934545636177062, "num_tokens": 13068909.0, "step": 7085 }, { "entropy": 5.693837451934814, "epoch": 0.5956731779038017, "grad_norm": 0.9609375, "learning_rate": 0.0004970509509267155, "loss": 5.5084, "mean_token_accuracy": 0.16520608812570572, "num_tokens": 13078380.0, "step": 7090 }, { "entropy": 5.738383483886719, "epoch": 0.5960932577189666, "grad_norm": 0.92578125, "learning_rate": 0.0004970461163273849, "loss": 5.5358, "mean_token_accuracy": 0.1652810513973236, "num_tokens": 13087774.0, "step": 7095 }, { "entropy": 5.6614855289459225, "epoch": 0.5965133375341315, "grad_norm": 1.0078125, "learning_rate": 0.0004970412777946219, "loss": 5.3538, "mean_token_accuracy": 0.1728409618139267, "num_tokens": 13095938.0, "step": 7100 }, { "entropy": 5.661937522888183, "epoch": 0.5969334173492964, "grad_norm": 0.95703125, "learning_rate": 0.0004970364353285117, "loss": 5.5099, "mean_token_accuracy": 0.1667941018939018, "num_tokens": 13104661.0, "step": 7105 }, { "entropy": 5.769843101501465, "epoch": 0.5973534971644613, "grad_norm": 1.0078125, "learning_rate": 0.0004970315889291405, "loss": 5.5054, "mean_token_accuracy": 0.16266342252492905, "num_tokens": 13114505.0, "step": 7110 }, { "entropy": 5.647045612335205, "epoch": 0.5977735769796261, "grad_norm": 0.953125, "learning_rate": 0.0004970267385965941, "loss": 5.4399, "mean_token_accuracy": 0.1659441262483597, "num_tokens": 13124590.0, "step": 7115 }, { "entropy": 5.659963178634643, "epoch": 0.598193656794791, "grad_norm": 1.09375, "learning_rate": 0.0004970218843309583, "loss": 5.4255, "mean_token_accuracy": 0.17648224532604218, "num_tokens": 13134026.0, "step": 7120 }, { "entropy": 5.784970045089722, "epoch": 0.5986137366099559, "grad_norm": 0.98828125, "learning_rate": 0.0004970170261323192, "loss": 5.588, "mean_token_accuracy": 0.16741684675216675, "num_tokens": 13142654.0, "step": 7125 }, { "entropy": 5.68078384399414, "epoch": 0.5990338164251208, "grad_norm": 1.0078125, "learning_rate": 0.0004970121640007627, "loss": 5.4971, "mean_token_accuracy": 0.16654934138059616, "num_tokens": 13151177.0, "step": 7130 }, { "entropy": 5.7274463176727295, "epoch": 0.5994538962402857, "grad_norm": 0.99609375, "learning_rate": 0.0004970072979363751, "loss": 5.4843, "mean_token_accuracy": 0.1642145425081253, "num_tokens": 13159689.0, "step": 7135 }, { "entropy": 5.673399639129639, "epoch": 0.5998739760554506, "grad_norm": 0.98046875, "learning_rate": 0.0004970024279392425, "loss": 5.5339, "mean_token_accuracy": 0.16159643679857255, "num_tokens": 13168601.0, "step": 7140 }, { "entropy": 5.740363311767578, "epoch": 0.6002940558706155, "grad_norm": 0.99609375, "learning_rate": 0.0004969975540094513, "loss": 5.5042, "mean_token_accuracy": 0.16813595741987228, "num_tokens": 13177035.0, "step": 7145 }, { "entropy": 5.774371862411499, "epoch": 0.6007141356857802, "grad_norm": 0.96484375, "learning_rate": 0.0004969926761470876, "loss": 5.4729, "mean_token_accuracy": 0.1695594534277916, "num_tokens": 13185444.0, "step": 7150 }, { "entropy": 5.700524473190308, "epoch": 0.6011342155009451, "grad_norm": 0.96875, "learning_rate": 0.000496987794352238, "loss": 5.4721, "mean_token_accuracy": 0.1695847913622856, "num_tokens": 13194987.0, "step": 7155 }, { "entropy": 5.63314642906189, "epoch": 0.60155429531611, "grad_norm": 0.9765625, "learning_rate": 0.0004969829086249889, "loss": 5.5057, "mean_token_accuracy": 0.1687454789876938, "num_tokens": 13203807.0, "step": 7160 }, { "entropy": 5.757598972320556, "epoch": 0.6019743751312749, "grad_norm": 1.0234375, "learning_rate": 0.000496978018965427, "loss": 5.6103, "mean_token_accuracy": 0.16279578655958177, "num_tokens": 13214362.0, "step": 7165 }, { "entropy": 5.789872074127198, "epoch": 0.6023944549464398, "grad_norm": 0.9375, "learning_rate": 0.0004969731253736387, "loss": 5.6048, "mean_token_accuracy": 0.16099970787763596, "num_tokens": 13224192.0, "step": 7170 }, { "entropy": 5.700117921829223, "epoch": 0.6028145347616047, "grad_norm": 0.92578125, "learning_rate": 0.0004969682278497109, "loss": 5.5621, "mean_token_accuracy": 0.1684303879737854, "num_tokens": 13234430.0, "step": 7175 }, { "entropy": 5.711915159225464, "epoch": 0.6032346145767696, "grad_norm": 1.015625, "learning_rate": 0.0004969633263937301, "loss": 5.458, "mean_token_accuracy": 0.1688069686293602, "num_tokens": 13243681.0, "step": 7180 }, { "entropy": 5.86687707901001, "epoch": 0.6036546943919344, "grad_norm": 0.91796875, "learning_rate": 0.0004969584210057832, "loss": 5.7426, "mean_token_accuracy": 0.15597060322761536, "num_tokens": 13254334.0, "step": 7185 }, { "entropy": 5.815514802932739, "epoch": 0.6040747742070993, "grad_norm": 0.88671875, "learning_rate": 0.0004969535116859573, "loss": 5.5268, "mean_token_accuracy": 0.16894952207803726, "num_tokens": 13263781.0, "step": 7190 }, { "entropy": 5.621768617630005, "epoch": 0.6044948540222642, "grad_norm": 1.0078125, "learning_rate": 0.0004969485984343392, "loss": 5.4558, "mean_token_accuracy": 0.16743801385164261, "num_tokens": 13272831.0, "step": 7195 }, { "entropy": 5.7688243865966795, "epoch": 0.6049149338374291, "grad_norm": 1.1015625, "learning_rate": 0.000496943681251016, "loss": 5.5035, "mean_token_accuracy": 0.16227193921804428, "num_tokens": 13281621.0, "step": 7200 }, { "entropy": 5.678845548629761, "epoch": 0.605335013652594, "grad_norm": 1.0, "learning_rate": 0.0004969387601360747, "loss": 5.5005, "mean_token_accuracy": 0.161435130238533, "num_tokens": 13291021.0, "step": 7205 }, { "entropy": 5.728114938735962, "epoch": 0.6057550934677589, "grad_norm": 1.0390625, "learning_rate": 0.0004969338350896026, "loss": 5.5067, "mean_token_accuracy": 0.16854603439569474, "num_tokens": 13299752.0, "step": 7210 }, { "entropy": 5.761625099182129, "epoch": 0.6061751732829238, "grad_norm": 0.9609375, "learning_rate": 0.0004969289061116869, "loss": 5.5252, "mean_token_accuracy": 0.15588051974773406, "num_tokens": 13309112.0, "step": 7215 }, { "entropy": 5.784453392028809, "epoch": 0.6065952530980886, "grad_norm": 1.0, "learning_rate": 0.0004969239732024148, "loss": 5.5312, "mean_token_accuracy": 0.17264840453863145, "num_tokens": 13318328.0, "step": 7220 }, { "entropy": 5.609762954711914, "epoch": 0.6070153329132535, "grad_norm": 0.87109375, "learning_rate": 0.0004969190363618739, "loss": 5.4207, "mean_token_accuracy": 0.16983553618192673, "num_tokens": 13328940.0, "step": 7225 }, { "entropy": 5.653651523590088, "epoch": 0.6074354127284184, "grad_norm": 1.1015625, "learning_rate": 0.0004969140955901516, "loss": 5.4583, "mean_token_accuracy": 0.17219654768705367, "num_tokens": 13337829.0, "step": 7230 }, { "entropy": 5.808972644805908, "epoch": 0.6078554925435833, "grad_norm": 0.875, "learning_rate": 0.0004969091508873352, "loss": 5.6215, "mean_token_accuracy": 0.16035659611225128, "num_tokens": 13348289.0, "step": 7235 }, { "entropy": 5.754089260101319, "epoch": 0.6082755723587482, "grad_norm": 0.99609375, "learning_rate": 0.0004969042022535126, "loss": 5.5477, "mean_token_accuracy": 0.16541809737682342, "num_tokens": 13357292.0, "step": 7240 }, { "entropy": 5.7452880382537845, "epoch": 0.6086956521739131, "grad_norm": 0.99609375, "learning_rate": 0.0004968992496887713, "loss": 5.5828, "mean_token_accuracy": 0.16221534311771393, "num_tokens": 13366640.0, "step": 7245 }, { "entropy": 5.746783971786499, "epoch": 0.609115731989078, "grad_norm": 0.7890625, "learning_rate": 0.0004968942931931989, "loss": 5.4881, "mean_token_accuracy": 0.17632103711366653, "num_tokens": 13377509.0, "step": 7250 }, { "entropy": 5.718003177642823, "epoch": 0.6095358118042428, "grad_norm": 1.0625, "learning_rate": 0.0004968893327668835, "loss": 5.5859, "mean_token_accuracy": 0.1615411803126335, "num_tokens": 13386573.0, "step": 7255 }, { "entropy": 5.676197052001953, "epoch": 0.6099558916194077, "grad_norm": 0.9375, "learning_rate": 0.0004968843684099128, "loss": 5.4274, "mean_token_accuracy": 0.1722585678100586, "num_tokens": 13395790.0, "step": 7260 }, { "entropy": 5.6800004005432125, "epoch": 0.6103759714345726, "grad_norm": 1.0546875, "learning_rate": 0.0004968794001223747, "loss": 5.4747, "mean_token_accuracy": 0.16489816904067994, "num_tokens": 13405265.0, "step": 7265 }, { "entropy": 5.692203521728516, "epoch": 0.6107960512497375, "grad_norm": 1.015625, "learning_rate": 0.0004968744279043574, "loss": 5.4777, "mean_token_accuracy": 0.17131679356098176, "num_tokens": 13413796.0, "step": 7270 }, { "entropy": 5.744186496734619, "epoch": 0.6112161310649024, "grad_norm": 0.97265625, "learning_rate": 0.0004968694517559488, "loss": 5.5307, "mean_token_accuracy": 0.16541121006011963, "num_tokens": 13423299.0, "step": 7275 }, { "entropy": 5.668656826019287, "epoch": 0.6116362108800673, "grad_norm": 0.890625, "learning_rate": 0.0004968644716772371, "loss": 5.4529, "mean_token_accuracy": 0.17369708567857742, "num_tokens": 13432267.0, "step": 7280 }, { "entropy": 5.680675506591797, "epoch": 0.612056290695232, "grad_norm": 0.9296875, "learning_rate": 0.0004968594876683105, "loss": 5.5412, "mean_token_accuracy": 0.16298353672027588, "num_tokens": 13442332.0, "step": 7285 }, { "entropy": 5.697410249710083, "epoch": 0.6124763705103969, "grad_norm": 0.94921875, "learning_rate": 0.0004968544997292572, "loss": 5.4937, "mean_token_accuracy": 0.17212583422660827, "num_tokens": 13451700.0, "step": 7290 }, { "entropy": 5.737648773193359, "epoch": 0.6128964503255618, "grad_norm": 1.0, "learning_rate": 0.0004968495078601659, "loss": 5.5918, "mean_token_accuracy": 0.16140649616718292, "num_tokens": 13461009.0, "step": 7295 }, { "entropy": 5.7446732997894285, "epoch": 0.6133165301407267, "grad_norm": 0.953125, "learning_rate": 0.0004968445120611247, "loss": 5.5815, "mean_token_accuracy": 0.16554148495197296, "num_tokens": 13470341.0, "step": 7300 }, { "entropy": 5.743761396408081, "epoch": 0.6137366099558916, "grad_norm": 0.9140625, "learning_rate": 0.0004968395123322223, "loss": 5.5025, "mean_token_accuracy": 0.1652843788266182, "num_tokens": 13479898.0, "step": 7305 }, { "entropy": 5.698557806015015, "epoch": 0.6141566897710565, "grad_norm": 0.96875, "learning_rate": 0.000496834508673547, "loss": 5.4265, "mean_token_accuracy": 0.16626278609037398, "num_tokens": 13488116.0, "step": 7310 }, { "entropy": 5.7173277854919435, "epoch": 0.6145767695862214, "grad_norm": 0.96484375, "learning_rate": 0.0004968295010851877, "loss": 5.4667, "mean_token_accuracy": 0.1695254847407341, "num_tokens": 13497814.0, "step": 7315 }, { "entropy": 5.703423404693604, "epoch": 0.6149968494013862, "grad_norm": 1.0234375, "learning_rate": 0.0004968244895672331, "loss": 5.4664, "mean_token_accuracy": 0.16524181365966797, "num_tokens": 13506617.0, "step": 7320 }, { "entropy": 5.680415296554566, "epoch": 0.6154169292165511, "grad_norm": 0.93359375, "learning_rate": 0.0004968194741197718, "loss": 5.6305, "mean_token_accuracy": 0.16197476536035538, "num_tokens": 13516632.0, "step": 7325 }, { "entropy": 5.829664039611816, "epoch": 0.615837009031716, "grad_norm": 1.0, "learning_rate": 0.0004968144547428927, "loss": 5.5466, "mean_token_accuracy": 0.16924293488264083, "num_tokens": 13526452.0, "step": 7330 }, { "entropy": 5.744434928894043, "epoch": 0.6162570888468809, "grad_norm": 1.046875, "learning_rate": 0.0004968094314366848, "loss": 5.4566, "mean_token_accuracy": 0.16510533839464187, "num_tokens": 13535663.0, "step": 7335 }, { "entropy": 5.645727968215942, "epoch": 0.6166771686620458, "grad_norm": 0.95703125, "learning_rate": 0.000496804404201237, "loss": 5.3726, "mean_token_accuracy": 0.1793311506509781, "num_tokens": 13544574.0, "step": 7340 }, { "entropy": 5.812160348892212, "epoch": 0.6170972484772107, "grad_norm": 1.078125, "learning_rate": 0.0004967993730366385, "loss": 5.5617, "mean_token_accuracy": 0.16627434641122818, "num_tokens": 13553041.0, "step": 7345 }, { "entropy": 5.655124235153198, "epoch": 0.6175173282923756, "grad_norm": 0.9453125, "learning_rate": 0.0004967943379429781, "loss": 5.5015, "mean_token_accuracy": 0.16323864310979844, "num_tokens": 13562108.0, "step": 7350 }, { "entropy": 5.850181436538696, "epoch": 0.6179374081075404, "grad_norm": 0.93359375, "learning_rate": 0.0004967892989203454, "loss": 5.6673, "mean_token_accuracy": 0.1569588676095009, "num_tokens": 13571500.0, "step": 7355 }, { "entropy": 5.801353788375854, "epoch": 0.6183574879227053, "grad_norm": 0.94921875, "learning_rate": 0.0004967842559688295, "loss": 5.5814, "mean_token_accuracy": 0.16009139716625215, "num_tokens": 13581304.0, "step": 7360 }, { "entropy": 5.694891834259034, "epoch": 0.6187775677378702, "grad_norm": 0.93359375, "learning_rate": 0.0004967792090885195, "loss": 5.4246, "mean_token_accuracy": 0.16926718205213548, "num_tokens": 13590734.0, "step": 7365 }, { "entropy": 5.6554632663726805, "epoch": 0.6191976475530351, "grad_norm": 0.90625, "learning_rate": 0.0004967741582795052, "loss": 5.5091, "mean_token_accuracy": 0.16807454824447632, "num_tokens": 13600486.0, "step": 7370 }, { "entropy": 5.801201295852661, "epoch": 0.6196177273682, "grad_norm": 0.88671875, "learning_rate": 0.0004967691035418758, "loss": 5.5316, "mean_token_accuracy": 0.15901947170495986, "num_tokens": 13610542.0, "step": 7375 }, { "entropy": 5.691852474212647, "epoch": 0.6200378071833649, "grad_norm": 0.9921875, "learning_rate": 0.000496764044875721, "loss": 5.4888, "mean_token_accuracy": 0.16762082427740096, "num_tokens": 13619431.0, "step": 7380 }, { "entropy": 5.662903547286987, "epoch": 0.6204578869985298, "grad_norm": 0.89453125, "learning_rate": 0.0004967589822811303, "loss": 5.5149, "mean_token_accuracy": 0.1655088871717453, "num_tokens": 13629930.0, "step": 7385 }, { "entropy": 5.862593698501587, "epoch": 0.6208779668136946, "grad_norm": 0.90234375, "learning_rate": 0.0004967539157581934, "loss": 5.6389, "mean_token_accuracy": 0.1590859979391098, "num_tokens": 13639439.0, "step": 7390 }, { "entropy": 5.801116275787353, "epoch": 0.6212980466288595, "grad_norm": 0.94140625, "learning_rate": 0.000496748845307, "loss": 5.5583, "mean_token_accuracy": 0.16589334830641747, "num_tokens": 13648548.0, "step": 7395 }, { "entropy": 5.766993808746338, "epoch": 0.6217181264440244, "grad_norm": 0.9453125, "learning_rate": 0.0004967437709276401, "loss": 5.6084, "mean_token_accuracy": 0.16399488151073455, "num_tokens": 13657658.0, "step": 7400 }, { "entropy": 5.650314474105835, "epoch": 0.6221382062591893, "grad_norm": 0.94140625, "learning_rate": 0.0004967386926202034, "loss": 5.3795, "mean_token_accuracy": 0.17246145755052567, "num_tokens": 13666763.0, "step": 7405 }, { "entropy": 5.782988977432251, "epoch": 0.6225582860743542, "grad_norm": 0.96875, "learning_rate": 0.00049673361038478, "loss": 5.6075, "mean_token_accuracy": 0.15505822673439978, "num_tokens": 13676527.0, "step": 7410 }, { "entropy": 5.719121265411377, "epoch": 0.622978365889519, "grad_norm": 0.890625, "learning_rate": 0.0004967285242214599, "loss": 5.578, "mean_token_accuracy": 0.17219377309083939, "num_tokens": 13685404.0, "step": 7415 }, { "entropy": 5.698868083953857, "epoch": 0.6233984457046838, "grad_norm": 1.0625, "learning_rate": 0.000496723434130333, "loss": 5.395, "mean_token_accuracy": 0.16921012550592424, "num_tokens": 13693118.0, "step": 7420 }, { "entropy": 5.7081413745880125, "epoch": 0.6238185255198487, "grad_norm": 0.953125, "learning_rate": 0.0004967183401114898, "loss": 5.4705, "mean_token_accuracy": 0.16425008475780487, "num_tokens": 13702015.0, "step": 7425 }, { "entropy": 5.7164053440094, "epoch": 0.6242386053350136, "grad_norm": 1.5703125, "learning_rate": 0.0004967132421650203, "loss": 5.4688, "mean_token_accuracy": 0.1687057375907898, "num_tokens": 13711658.0, "step": 7430 }, { "entropy": 5.650258636474609, "epoch": 0.6246586851501785, "grad_norm": 0.9921875, "learning_rate": 0.0004967081402910149, "loss": 5.5199, "mean_token_accuracy": 0.1659772053360939, "num_tokens": 13720718.0, "step": 7435 }, { "entropy": 5.714274263381958, "epoch": 0.6250787649653434, "grad_norm": 1.015625, "learning_rate": 0.000496703034489564, "loss": 5.3741, "mean_token_accuracy": 0.17356953918933868, "num_tokens": 13729364.0, "step": 7440 }, { "entropy": 5.797601222991943, "epoch": 0.6254988447805083, "grad_norm": 0.93359375, "learning_rate": 0.0004966979247607579, "loss": 5.684, "mean_token_accuracy": 0.16203884929418563, "num_tokens": 13739436.0, "step": 7445 }, { "entropy": 5.794081306457519, "epoch": 0.6259189245956732, "grad_norm": 0.88671875, "learning_rate": 0.0004966928111046873, "loss": 5.581, "mean_token_accuracy": 0.17139979004859923, "num_tokens": 13749196.0, "step": 7450 }, { "entropy": 5.7291075706481935, "epoch": 0.626339004410838, "grad_norm": 0.90625, "learning_rate": 0.0004966876935214426, "loss": 5.4214, "mean_token_accuracy": 0.17418570071458817, "num_tokens": 13758414.0, "step": 7455 }, { "entropy": 5.686338424682617, "epoch": 0.6267590842260029, "grad_norm": 0.96875, "learning_rate": 0.0004966825720111147, "loss": 5.4894, "mean_token_accuracy": 0.1629626229405403, "num_tokens": 13767496.0, "step": 7460 }, { "entropy": 5.751576089859009, "epoch": 0.6271791640411678, "grad_norm": 1.078125, "learning_rate": 0.0004966774465737942, "loss": 5.6138, "mean_token_accuracy": 0.165596853941679, "num_tokens": 13777033.0, "step": 7465 }, { "entropy": 5.7884539604187015, "epoch": 0.6275992438563327, "grad_norm": 0.95703125, "learning_rate": 0.0004966723172095717, "loss": 5.5726, "mean_token_accuracy": 0.1648782819509506, "num_tokens": 13786313.0, "step": 7470 }, { "entropy": 5.696361017227173, "epoch": 0.6280193236714976, "grad_norm": 0.98046875, "learning_rate": 0.0004966671839185384, "loss": 5.4952, "mean_token_accuracy": 0.16794500648975372, "num_tokens": 13795257.0, "step": 7475 }, { "entropy": 5.62646861076355, "epoch": 0.6284394034866625, "grad_norm": 0.92578125, "learning_rate": 0.0004966620467007851, "loss": 5.4277, "mean_token_accuracy": 0.1720203161239624, "num_tokens": 13804582.0, "step": 7480 }, { "entropy": 5.688060522079468, "epoch": 0.6288594833018274, "grad_norm": 0.8984375, "learning_rate": 0.0004966569055564027, "loss": 5.4029, "mean_token_accuracy": 0.1695487268269062, "num_tokens": 13813248.0, "step": 7485 }, { "entropy": 5.774942111968994, "epoch": 0.6292795631169922, "grad_norm": 0.95703125, "learning_rate": 0.0004966517604854823, "loss": 5.6697, "mean_token_accuracy": 0.1593285620212555, "num_tokens": 13823301.0, "step": 7490 }, { "entropy": 5.691988086700439, "epoch": 0.6296996429321571, "grad_norm": 0.98046875, "learning_rate": 0.0004966466114881152, "loss": 5.4052, "mean_token_accuracy": 0.1739303633570671, "num_tokens": 13832040.0, "step": 7495 }, { "entropy": 5.735798263549805, "epoch": 0.630119722747322, "grad_norm": 0.90625, "learning_rate": 0.0004966414585643925, "loss": 5.6088, "mean_token_accuracy": 0.16045339405536652, "num_tokens": 13841874.0, "step": 7500 }, { "entropy": 5.665113925933838, "epoch": 0.6305398025624869, "grad_norm": 0.97265625, "learning_rate": 0.0004966363017144055, "loss": 5.4215, "mean_token_accuracy": 0.17605502754449845, "num_tokens": 13850755.0, "step": 7505 }, { "entropy": 5.679509687423706, "epoch": 0.6309598823776518, "grad_norm": 0.93359375, "learning_rate": 0.0004966311409382455, "loss": 5.4801, "mean_token_accuracy": 0.16642314195632935, "num_tokens": 13860009.0, "step": 7510 }, { "entropy": 5.653228378295898, "epoch": 0.6313799621928167, "grad_norm": 1.0546875, "learning_rate": 0.0004966259762360039, "loss": 5.4039, "mean_token_accuracy": 0.17477345317602158, "num_tokens": 13868476.0, "step": 7515 }, { "entropy": 5.598419427871704, "epoch": 0.6318000420079816, "grad_norm": 0.92578125, "learning_rate": 0.0004966208076077723, "loss": 5.4152, "mean_token_accuracy": 0.17099131792783737, "num_tokens": 13877367.0, "step": 7520 }, { "entropy": 5.699641418457031, "epoch": 0.6322201218231464, "grad_norm": 0.98828125, "learning_rate": 0.0004966156350536422, "loss": 5.5002, "mean_token_accuracy": 0.1632213681936264, "num_tokens": 13885985.0, "step": 7525 }, { "entropy": 5.636166906356811, "epoch": 0.6326402016383113, "grad_norm": 0.9296875, "learning_rate": 0.0004966104585737054, "loss": 5.4183, "mean_token_accuracy": 0.17092742025852203, "num_tokens": 13895059.0, "step": 7530 }, { "entropy": 5.690598201751709, "epoch": 0.6330602814534761, "grad_norm": 0.9453125, "learning_rate": 0.0004966052781680534, "loss": 5.4839, "mean_token_accuracy": 0.16899570524692537, "num_tokens": 13903789.0, "step": 7535 }, { "entropy": 5.756844615936279, "epoch": 0.633480361268641, "grad_norm": 0.97265625, "learning_rate": 0.0004966000938367778, "loss": 5.4591, "mean_token_accuracy": 0.16894406527280809, "num_tokens": 13913377.0, "step": 7540 }, { "entropy": 5.608310413360596, "epoch": 0.6339004410838059, "grad_norm": 0.9453125, "learning_rate": 0.0004965949055799708, "loss": 5.4127, "mean_token_accuracy": 0.18185660988092422, "num_tokens": 13922141.0, "step": 7545 }, { "entropy": 5.723646020889282, "epoch": 0.6343205208989708, "grad_norm": 1.03125, "learning_rate": 0.0004965897133977241, "loss": 5.4807, "mean_token_accuracy": 0.16371672451496125, "num_tokens": 13930717.0, "step": 7550 }, { "entropy": 5.771508407592774, "epoch": 0.6347406007141357, "grad_norm": 0.94140625, "learning_rate": 0.0004965845172901298, "loss": 5.5515, "mean_token_accuracy": 0.16789867728948593, "num_tokens": 13940344.0, "step": 7555 }, { "entropy": 5.700416374206543, "epoch": 0.6351606805293005, "grad_norm": 1.0625, "learning_rate": 0.0004965793172572798, "loss": 5.4076, "mean_token_accuracy": 0.1729632467031479, "num_tokens": 13948400.0, "step": 7560 }, { "entropy": 5.65832405090332, "epoch": 0.6355807603444654, "grad_norm": 0.921875, "learning_rate": 0.0004965741132992663, "loss": 5.5048, "mean_token_accuracy": 0.16236102432012559, "num_tokens": 13957939.0, "step": 7565 }, { "entropy": 5.741348743438721, "epoch": 0.6360008401596303, "grad_norm": 0.90625, "learning_rate": 0.0004965689054161814, "loss": 5.4767, "mean_token_accuracy": 0.17106067687273024, "num_tokens": 13966943.0, "step": 7570 }, { "entropy": 5.667167472839355, "epoch": 0.6364209199747952, "grad_norm": 0.96484375, "learning_rate": 0.0004965636936081176, "loss": 5.4057, "mean_token_accuracy": 0.16865545958280564, "num_tokens": 13975850.0, "step": 7575 }, { "entropy": 5.769311952590942, "epoch": 0.6368409997899601, "grad_norm": 1.03125, "learning_rate": 0.000496558477875167, "loss": 5.4734, "mean_token_accuracy": 0.17367706149816514, "num_tokens": 13985059.0, "step": 7580 }, { "entropy": 5.775439119338989, "epoch": 0.637261079605125, "grad_norm": 0.94921875, "learning_rate": 0.000496553258217422, "loss": 5.5413, "mean_token_accuracy": 0.1580244779586792, "num_tokens": 13993571.0, "step": 7585 }, { "entropy": 5.728666591644287, "epoch": 0.6376811594202898, "grad_norm": 0.96875, "learning_rate": 0.0004965480346349751, "loss": 5.5175, "mean_token_accuracy": 0.16710771322250367, "num_tokens": 14002326.0, "step": 7590 }, { "entropy": 5.852581930160523, "epoch": 0.6381012392354547, "grad_norm": 0.91796875, "learning_rate": 0.000496542807127919, "loss": 5.661, "mean_token_accuracy": 0.16399567797780037, "num_tokens": 14012002.0, "step": 7595 }, { "entropy": 5.758628559112549, "epoch": 0.6385213190506196, "grad_norm": 0.95703125, "learning_rate": 0.000496537575696346, "loss": 5.5335, "mean_token_accuracy": 0.16143829822540284, "num_tokens": 14022085.0, "step": 7600 }, { "entropy": 5.683157110214234, "epoch": 0.6389413988657845, "grad_norm": 0.9765625, "learning_rate": 0.0004965323403403488, "loss": 5.4223, "mean_token_accuracy": 0.1673789069056511, "num_tokens": 14030706.0, "step": 7605 }, { "entropy": 5.625469160079956, "epoch": 0.6393614786809494, "grad_norm": 0.91015625, "learning_rate": 0.0004965271010600205, "loss": 5.4607, "mean_token_accuracy": 0.1712944433093071, "num_tokens": 14039520.0, "step": 7610 }, { "entropy": 5.717896509170532, "epoch": 0.6397815584961143, "grad_norm": 0.96875, "learning_rate": 0.0004965218578554535, "loss": 5.5437, "mean_token_accuracy": 0.16942658126354218, "num_tokens": 14048407.0, "step": 7615 }, { "entropy": 5.655859279632568, "epoch": 0.6402016383112792, "grad_norm": 1.03125, "learning_rate": 0.000496516610726741, "loss": 5.467, "mean_token_accuracy": 0.17381453812122344, "num_tokens": 14057534.0, "step": 7620 }, { "entropy": 5.645468664169312, "epoch": 0.640621718126444, "grad_norm": 0.97265625, "learning_rate": 0.0004965113596739759, "loss": 5.4169, "mean_token_accuracy": 0.17614233940839769, "num_tokens": 14065992.0, "step": 7625 }, { "entropy": 5.627894401550293, "epoch": 0.6410417979416089, "grad_norm": 1.0078125, "learning_rate": 0.0004965061046972508, "loss": 5.4111, "mean_token_accuracy": 0.16821854412555695, "num_tokens": 14074806.0, "step": 7630 }, { "entropy": 5.659251022338867, "epoch": 0.6414618777567738, "grad_norm": 0.90625, "learning_rate": 0.0004965008457966594, "loss": 5.4789, "mean_token_accuracy": 0.16432067304849624, "num_tokens": 14083813.0, "step": 7635 }, { "entropy": 5.681667470932007, "epoch": 0.6418819575719387, "grad_norm": 0.984375, "learning_rate": 0.0004964955829722945, "loss": 5.4099, "mean_token_accuracy": 0.17027026712894439, "num_tokens": 14092193.0, "step": 7640 }, { "entropy": 5.813520383834839, "epoch": 0.6423020373871036, "grad_norm": 1.0078125, "learning_rate": 0.0004964903162242493, "loss": 5.6342, "mean_token_accuracy": 0.15789156556129455, "num_tokens": 14102797.0, "step": 7645 }, { "entropy": 5.684490537643432, "epoch": 0.6427221172022685, "grad_norm": 0.921875, "learning_rate": 0.0004964850455526173, "loss": 5.4773, "mean_token_accuracy": 0.17116947323083878, "num_tokens": 14112226.0, "step": 7650 }, { "entropy": 5.610788440704345, "epoch": 0.6431421970174334, "grad_norm": 1.0, "learning_rate": 0.0004964797709574917, "loss": 5.4149, "mean_token_accuracy": 0.16700370907783507, "num_tokens": 14121775.0, "step": 7655 }, { "entropy": 5.635930681228638, "epoch": 0.6435622768325981, "grad_norm": 0.91796875, "learning_rate": 0.000496474492438966, "loss": 5.4038, "mean_token_accuracy": 0.17071498185396194, "num_tokens": 14130415.0, "step": 7660 }, { "entropy": 5.693044376373291, "epoch": 0.643982356647763, "grad_norm": 0.890625, "learning_rate": 0.0004964692099971338, "loss": 5.4429, "mean_token_accuracy": 0.17001585066318511, "num_tokens": 14140204.0, "step": 7665 }, { "entropy": 5.683203983306885, "epoch": 0.6444024364629279, "grad_norm": 0.95703125, "learning_rate": 0.0004964639236320885, "loss": 5.378, "mean_token_accuracy": 0.16871996819972992, "num_tokens": 14149595.0, "step": 7670 }, { "entropy": 5.608165884017945, "epoch": 0.6448225162780928, "grad_norm": 0.94140625, "learning_rate": 0.0004964586333439239, "loss": 5.4553, "mean_token_accuracy": 0.1665859803557396, "num_tokens": 14158865.0, "step": 7675 }, { "entropy": 5.660884809494019, "epoch": 0.6452425960932577, "grad_norm": 1.03125, "learning_rate": 0.0004964533391327335, "loss": 5.4102, "mean_token_accuracy": 0.1763300195336342, "num_tokens": 14167962.0, "step": 7680 }, { "entropy": 5.667941331863403, "epoch": 0.6456626759084226, "grad_norm": 1.0390625, "learning_rate": 0.0004964480409986113, "loss": 5.4635, "mean_token_accuracy": 0.1680685743689537, "num_tokens": 14176479.0, "step": 7685 }, { "entropy": 5.753057384490967, "epoch": 0.6460827557235875, "grad_norm": 1.0234375, "learning_rate": 0.0004964427389416512, "loss": 5.4711, "mean_token_accuracy": 0.1684321254491806, "num_tokens": 14185408.0, "step": 7690 }, { "entropy": 5.6439416885375975, "epoch": 0.6465028355387523, "grad_norm": 1.046875, "learning_rate": 0.000496437432961947, "loss": 5.4766, "mean_token_accuracy": 0.17091822624206543, "num_tokens": 14194155.0, "step": 7695 }, { "entropy": 5.633873081207275, "epoch": 0.6469229153539172, "grad_norm": 0.96875, "learning_rate": 0.0004964321230595925, "loss": 5.5054, "mean_token_accuracy": 0.1647911474108696, "num_tokens": 14202779.0, "step": 7700 }, { "entropy": 5.800021934509277, "epoch": 0.6473429951690821, "grad_norm": 0.88671875, "learning_rate": 0.0004964268092346821, "loss": 5.69, "mean_token_accuracy": 0.15756986886262894, "num_tokens": 14212552.0, "step": 7705 }, { "entropy": 5.822913599014282, "epoch": 0.647763074984247, "grad_norm": 0.8984375, "learning_rate": 0.0004964214914873098, "loss": 5.4764, "mean_token_accuracy": 0.16087636500597, "num_tokens": 14222783.0, "step": 7710 }, { "entropy": 5.634449625015259, "epoch": 0.6481831547994119, "grad_norm": 1.0, "learning_rate": 0.0004964161698175697, "loss": 5.358, "mean_token_accuracy": 0.16693367213010787, "num_tokens": 14232085.0, "step": 7715 }, { "entropy": 5.668655920028686, "epoch": 0.6486032346145768, "grad_norm": 0.96484375, "learning_rate": 0.0004964108442255562, "loss": 5.5381, "mean_token_accuracy": 0.16392049193382263, "num_tokens": 14241969.0, "step": 7720 }, { "entropy": 5.647701263427734, "epoch": 0.6490233144297417, "grad_norm": 1.1015625, "learning_rate": 0.0004964055147113637, "loss": 5.4328, "mean_token_accuracy": 0.17618952840566635, "num_tokens": 14251012.0, "step": 7725 }, { "entropy": 5.772505426406861, "epoch": 0.6494433942449065, "grad_norm": 1.09375, "learning_rate": 0.0004964001812750864, "loss": 5.5328, "mean_token_accuracy": 0.1665820762515068, "num_tokens": 14261110.0, "step": 7730 }, { "entropy": 5.718127250671387, "epoch": 0.6498634740600714, "grad_norm": 0.98828125, "learning_rate": 0.000496394843916819, "loss": 5.5204, "mean_token_accuracy": 0.16651073694229127, "num_tokens": 14270869.0, "step": 7735 }, { "entropy": 5.704468631744385, "epoch": 0.6502835538752363, "grad_norm": 0.953125, "learning_rate": 0.0004963895026366558, "loss": 5.4869, "mean_token_accuracy": 0.1666564702987671, "num_tokens": 14279607.0, "step": 7740 }, { "entropy": 5.662124490737915, "epoch": 0.6507036336904012, "grad_norm": 0.9140625, "learning_rate": 0.0004963841574346917, "loss": 5.4635, "mean_token_accuracy": 0.16543798744678498, "num_tokens": 14289282.0, "step": 7745 }, { "entropy": 5.6414776802062985, "epoch": 0.6511237135055661, "grad_norm": 0.9609375, "learning_rate": 0.0004963788083110212, "loss": 5.3949, "mean_token_accuracy": 0.17210839688777924, "num_tokens": 14298658.0, "step": 7750 }, { "entropy": 5.771036195755005, "epoch": 0.651543793320731, "grad_norm": 0.91796875, "learning_rate": 0.000496373455265739, "loss": 5.486, "mean_token_accuracy": 0.16423814594745637, "num_tokens": 14307832.0, "step": 7755 }, { "entropy": 5.679357814788818, "epoch": 0.6519638731358958, "grad_norm": 0.921875, "learning_rate": 0.0004963680982989402, "loss": 5.3936, "mean_token_accuracy": 0.17518658488988875, "num_tokens": 14317122.0, "step": 7760 }, { "entropy": 5.649253225326538, "epoch": 0.6523839529510607, "grad_norm": 1.0, "learning_rate": 0.0004963627374107195, "loss": 5.4302, "mean_token_accuracy": 0.17149852067232133, "num_tokens": 14326069.0, "step": 7765 }, { "entropy": 5.62903995513916, "epoch": 0.6528040327662256, "grad_norm": 0.94140625, "learning_rate": 0.0004963573726011717, "loss": 5.4428, "mean_token_accuracy": 0.17162297070026397, "num_tokens": 14335260.0, "step": 7770 }, { "entropy": 5.761579751968384, "epoch": 0.6532241125813905, "grad_norm": 0.97265625, "learning_rate": 0.0004963520038703922, "loss": 5.5357, "mean_token_accuracy": 0.1569953978061676, "num_tokens": 14345823.0, "step": 7775 }, { "entropy": 5.697815990447998, "epoch": 0.6536441923965554, "grad_norm": 1.0390625, "learning_rate": 0.000496346631218476, "loss": 5.4087, "mean_token_accuracy": 0.1733380824327469, "num_tokens": 14354316.0, "step": 7780 }, { "entropy": 5.62894639968872, "epoch": 0.6540642722117203, "grad_norm": 0.92578125, "learning_rate": 0.000496341254645518, "loss": 5.4553, "mean_token_accuracy": 0.17333490997552872, "num_tokens": 14364539.0, "step": 7785 }, { "entropy": 5.681260681152343, "epoch": 0.6544843520268852, "grad_norm": 0.88671875, "learning_rate": 0.0004963358741516138, "loss": 5.5558, "mean_token_accuracy": 0.16200231909751892, "num_tokens": 14374081.0, "step": 7790 }, { "entropy": 5.70987491607666, "epoch": 0.6549044318420499, "grad_norm": 0.90625, "learning_rate": 0.0004963304897368585, "loss": 5.4557, "mean_token_accuracy": 0.16147168278694152, "num_tokens": 14383255.0, "step": 7795 }, { "entropy": 5.78820161819458, "epoch": 0.6553245116572148, "grad_norm": 1.1015625, "learning_rate": 0.0004963251014013475, "loss": 5.5896, "mean_token_accuracy": 0.1648208513855934, "num_tokens": 14392417.0, "step": 7800 }, { "entropy": 5.850169658660889, "epoch": 0.6557445914723797, "grad_norm": 1.140625, "learning_rate": 0.0004963197091451763, "loss": 5.6177, "mean_token_accuracy": 0.15744412541389466, "num_tokens": 14401899.0, "step": 7805 }, { "entropy": 5.796338748931885, "epoch": 0.6561646712875446, "grad_norm": 0.984375, "learning_rate": 0.0004963143129684405, "loss": 5.5729, "mean_token_accuracy": 0.1602293811738491, "num_tokens": 14411245.0, "step": 7810 }, { "entropy": 5.653019428253174, "epoch": 0.6565847511027095, "grad_norm": 1.0625, "learning_rate": 0.0004963089128712355, "loss": 5.4488, "mean_token_accuracy": 0.17309630364179612, "num_tokens": 14419710.0, "step": 7815 }, { "entropy": 5.627862644195557, "epoch": 0.6570048309178744, "grad_norm": 0.88671875, "learning_rate": 0.0004963035088536571, "loss": 5.4218, "mean_token_accuracy": 0.17990072220563888, "num_tokens": 14430266.0, "step": 7820 }, { "entropy": 5.702354001998901, "epoch": 0.6574249107330393, "grad_norm": 0.9140625, "learning_rate": 0.0004962981009158012, "loss": 5.3956, "mean_token_accuracy": 0.16377443671226502, "num_tokens": 14439515.0, "step": 7825 }, { "entropy": 5.699529790878296, "epoch": 0.6578449905482041, "grad_norm": 1.15625, "learning_rate": 0.0004962926890577632, "loss": 5.4635, "mean_token_accuracy": 0.17154118418693542, "num_tokens": 14448091.0, "step": 7830 }, { "entropy": 5.681157159805298, "epoch": 0.658265070363369, "grad_norm": 0.94140625, "learning_rate": 0.000496287273279639, "loss": 5.4892, "mean_token_accuracy": 0.1665187358856201, "num_tokens": 14457744.0, "step": 7835 }, { "entropy": 5.740079164505005, "epoch": 0.6586851501785339, "grad_norm": 0.953125, "learning_rate": 0.000496281853581525, "loss": 5.4732, "mean_token_accuracy": 0.17109596878290176, "num_tokens": 14467597.0, "step": 7840 }, { "entropy": 5.694699621200561, "epoch": 0.6591052299936988, "grad_norm": 1.015625, "learning_rate": 0.0004962764299635168, "loss": 5.4526, "mean_token_accuracy": 0.17279575616121293, "num_tokens": 14476662.0, "step": 7845 }, { "entropy": 5.751010799407959, "epoch": 0.6595253098088637, "grad_norm": 0.93359375, "learning_rate": 0.0004962710024257105, "loss": 5.5324, "mean_token_accuracy": 0.1658242180943489, "num_tokens": 14486583.0, "step": 7850 }, { "entropy": 5.749286460876465, "epoch": 0.6599453896240286, "grad_norm": 0.94140625, "learning_rate": 0.0004962655709682025, "loss": 5.5343, "mean_token_accuracy": 0.16569894403219224, "num_tokens": 14496528.0, "step": 7855 }, { "entropy": 5.736885070800781, "epoch": 0.6603654694391935, "grad_norm": 0.8671875, "learning_rate": 0.0004962601355910887, "loss": 5.5294, "mean_token_accuracy": 0.16236354857683183, "num_tokens": 14507026.0, "step": 7860 }, { "entropy": 5.597323274612426, "epoch": 0.6607855492543583, "grad_norm": 0.91015625, "learning_rate": 0.0004962546962944656, "loss": 5.3851, "mean_token_accuracy": 0.17176171392202377, "num_tokens": 14516480.0, "step": 7865 }, { "entropy": 5.6523455619812015, "epoch": 0.6612056290695232, "grad_norm": 0.97265625, "learning_rate": 0.0004962492530784295, "loss": 5.3455, "mean_token_accuracy": 0.18076795786619188, "num_tokens": 14525068.0, "step": 7870 }, { "entropy": 5.65107307434082, "epoch": 0.6616257088846881, "grad_norm": 0.921875, "learning_rate": 0.0004962438059430768, "loss": 5.4659, "mean_token_accuracy": 0.17110495269298553, "num_tokens": 14534441.0, "step": 7875 }, { "entropy": 5.705241060256958, "epoch": 0.662045788699853, "grad_norm": 0.9765625, "learning_rate": 0.0004962383548885039, "loss": 5.5629, "mean_token_accuracy": 0.16295325756072998, "num_tokens": 14543026.0, "step": 7880 }, { "entropy": 5.681460857391357, "epoch": 0.6624658685150179, "grad_norm": 0.98046875, "learning_rate": 0.0004962328999148075, "loss": 5.4079, "mean_token_accuracy": 0.17630907893180847, "num_tokens": 14552068.0, "step": 7885 }, { "entropy": 5.707177734375, "epoch": 0.6628859483301828, "grad_norm": 0.96875, "learning_rate": 0.0004962274410220842, "loss": 5.5553, "mean_token_accuracy": 0.16349743753671647, "num_tokens": 14561587.0, "step": 7890 }, { "entropy": 5.7315949440002445, "epoch": 0.6633060281453477, "grad_norm": 0.9453125, "learning_rate": 0.0004962219782104308, "loss": 5.5674, "mean_token_accuracy": 0.17078642100095748, "num_tokens": 14571020.0, "step": 7895 }, { "entropy": 5.748089981079102, "epoch": 0.6637261079605125, "grad_norm": 0.9765625, "learning_rate": 0.0004962165114799439, "loss": 5.5219, "mean_token_accuracy": 0.1581498920917511, "num_tokens": 14580638.0, "step": 7900 }, { "entropy": 5.684006977081299, "epoch": 0.6641461877756774, "grad_norm": 0.87890625, "learning_rate": 0.0004962110408307204, "loss": 5.4244, "mean_token_accuracy": 0.16590397357940673, "num_tokens": 14590173.0, "step": 7905 }, { "entropy": 5.631041860580444, "epoch": 0.6645662675908423, "grad_norm": 0.9609375, "learning_rate": 0.0004962055662628571, "loss": 5.4432, "mean_token_accuracy": 0.17124811559915543, "num_tokens": 14598635.0, "step": 7910 }, { "entropy": 5.715653800964356, "epoch": 0.6649863474060071, "grad_norm": 1.0, "learning_rate": 0.0004962000877764513, "loss": 5.4518, "mean_token_accuracy": 0.1737048864364624, "num_tokens": 14607233.0, "step": 7915 }, { "entropy": 5.8200788497924805, "epoch": 0.665406427221172, "grad_norm": 1.078125, "learning_rate": 0.0004961946053715998, "loss": 5.6133, "mean_token_accuracy": 0.1551756888628006, "num_tokens": 14617483.0, "step": 7920 }, { "entropy": 5.674804592132569, "epoch": 0.665826507036337, "grad_norm": 1.078125, "learning_rate": 0.0004961891190483997, "loss": 5.4394, "mean_token_accuracy": 0.1664857968688011, "num_tokens": 14625805.0, "step": 7925 }, { "entropy": 5.6124043464660645, "epoch": 0.6662465868515017, "grad_norm": 0.98046875, "learning_rate": 0.0004961836288069483, "loss": 5.3569, "mean_token_accuracy": 0.17260607928037644, "num_tokens": 14634605.0, "step": 7930 }, { "entropy": 5.724435472488404, "epoch": 0.6666666666666666, "grad_norm": 0.93359375, "learning_rate": 0.0004961781346473428, "loss": 5.5688, "mean_token_accuracy": 0.15710362046957016, "num_tokens": 14644970.0, "step": 7935 }, { "entropy": 5.721081972122192, "epoch": 0.6670867464818315, "grad_norm": 0.8515625, "learning_rate": 0.0004961726365696805, "loss": 5.4484, "mean_token_accuracy": 0.1679193213582039, "num_tokens": 14655043.0, "step": 7940 }, { "entropy": 5.728677034378052, "epoch": 0.6675068262969964, "grad_norm": 0.94921875, "learning_rate": 0.0004961671345740589, "loss": 5.4375, "mean_token_accuracy": 0.16690576076507568, "num_tokens": 14663994.0, "step": 7945 }, { "entropy": 5.65096526145935, "epoch": 0.6679269061121613, "grad_norm": 0.890625, "learning_rate": 0.0004961616286605753, "loss": 5.4334, "mean_token_accuracy": 0.15957258641719818, "num_tokens": 14674101.0, "step": 7950 }, { "entropy": 5.669146871566772, "epoch": 0.6683469859273262, "grad_norm": 0.921875, "learning_rate": 0.0004961561188293273, "loss": 5.5061, "mean_token_accuracy": 0.16300352662801743, "num_tokens": 14684156.0, "step": 7955 }, { "entropy": 5.617530393600464, "epoch": 0.6687670657424911, "grad_norm": 0.953125, "learning_rate": 0.0004961506050804126, "loss": 5.4206, "mean_token_accuracy": 0.17437569051980972, "num_tokens": 14693223.0, "step": 7960 }, { "entropy": 5.734630155563354, "epoch": 0.6691871455576559, "grad_norm": 0.9296875, "learning_rate": 0.000496145087413929, "loss": 5.4317, "mean_token_accuracy": 0.16657862663269044, "num_tokens": 14702959.0, "step": 7965 }, { "entropy": 5.7842125415802, "epoch": 0.6696072253728208, "grad_norm": 1.0390625, "learning_rate": 0.0004961395658299737, "loss": 5.5569, "mean_token_accuracy": 0.16241346150636674, "num_tokens": 14712146.0, "step": 7970 }, { "entropy": 5.694394731521607, "epoch": 0.6700273051879857, "grad_norm": 0.97265625, "learning_rate": 0.0004961340403286451, "loss": 5.467, "mean_token_accuracy": 0.16332777589559555, "num_tokens": 14721932.0, "step": 7975 }, { "entropy": 5.655285358428955, "epoch": 0.6704473850031506, "grad_norm": 0.96875, "learning_rate": 0.0004961285109100408, "loss": 5.3965, "mean_token_accuracy": 0.173796084523201, "num_tokens": 14731080.0, "step": 7980 }, { "entropy": 5.561992931365967, "epoch": 0.6708674648183155, "grad_norm": 1.015625, "learning_rate": 0.0004961229775742587, "loss": 5.3988, "mean_token_accuracy": 0.1744252011179924, "num_tokens": 14740057.0, "step": 7985 }, { "entropy": 5.722016334533691, "epoch": 0.6712875446334804, "grad_norm": 1.0546875, "learning_rate": 0.000496117440321397, "loss": 5.471, "mean_token_accuracy": 0.1750819519162178, "num_tokens": 14748399.0, "step": 7990 }, { "entropy": 5.704089832305908, "epoch": 0.6717076244486453, "grad_norm": 1.03125, "learning_rate": 0.0004961118991515537, "loss": 5.4962, "mean_token_accuracy": 0.16623101234436036, "num_tokens": 14757215.0, "step": 7995 }, { "entropy": 5.635396480560303, "epoch": 0.6721277042638101, "grad_norm": 0.9921875, "learning_rate": 0.000496106354064827, "loss": 5.4871, "mean_token_accuracy": 0.17475164234638213, "num_tokens": 14766191.0, "step": 8000 }, { "entropy": 5.778530263900757, "epoch": 0.672547784078975, "grad_norm": 0.90234375, "learning_rate": 0.0004961008050613149, "loss": 5.5478, "mean_token_accuracy": 0.16085838228464128, "num_tokens": 14775220.0, "step": 8005 }, { "entropy": 5.735023260116577, "epoch": 0.6729678638941399, "grad_norm": 1.0, "learning_rate": 0.0004960952521411161, "loss": 5.5139, "mean_token_accuracy": 0.16576410979032516, "num_tokens": 14784287.0, "step": 8010 }, { "entropy": 5.812222099304199, "epoch": 0.6733879437093048, "grad_norm": 0.9296875, "learning_rate": 0.0004960896953043287, "loss": 5.5705, "mean_token_accuracy": 0.16489047110080718, "num_tokens": 14794219.0, "step": 8015 }, { "entropy": 5.734062957763672, "epoch": 0.6738080235244697, "grad_norm": 1.0390625, "learning_rate": 0.0004960841345510511, "loss": 5.481, "mean_token_accuracy": 0.1697475478053093, "num_tokens": 14803324.0, "step": 8020 }, { "entropy": 5.720947408676148, "epoch": 0.6742281033396346, "grad_norm": 1.046875, "learning_rate": 0.000496078569881382, "loss": 5.4933, "mean_token_accuracy": 0.16686712205410004, "num_tokens": 14811963.0, "step": 8025 }, { "entropy": 5.6574663639068605, "epoch": 0.6746481831547995, "grad_norm": 1.046875, "learning_rate": 0.0004960730012954198, "loss": 5.442, "mean_token_accuracy": 0.16583069860935212, "num_tokens": 14821903.0, "step": 8030 }, { "entropy": 5.6191630363464355, "epoch": 0.6750682629699643, "grad_norm": 0.984375, "learning_rate": 0.0004960674287932634, "loss": 5.4474, "mean_token_accuracy": 0.16195174753665925, "num_tokens": 14831215.0, "step": 8035 }, { "entropy": 5.726200246810913, "epoch": 0.6754883427851291, "grad_norm": 0.97265625, "learning_rate": 0.0004960618523750111, "loss": 5.3643, "mean_token_accuracy": 0.17416994720697404, "num_tokens": 14840354.0, "step": 8040 }, { "entropy": 5.735062551498413, "epoch": 0.675908422600294, "grad_norm": 0.99609375, "learning_rate": 0.000496056272040762, "loss": 5.5518, "mean_token_accuracy": 0.1663343757390976, "num_tokens": 14849660.0, "step": 8045 }, { "entropy": 5.725352334976196, "epoch": 0.6763285024154589, "grad_norm": 0.94921875, "learning_rate": 0.0004960506877906149, "loss": 5.4372, "mean_token_accuracy": 0.1609252318739891, "num_tokens": 14859819.0, "step": 8050 }, { "entropy": 5.705269145965576, "epoch": 0.6767485822306238, "grad_norm": 0.95703125, "learning_rate": 0.0004960450996246686, "loss": 5.4611, "mean_token_accuracy": 0.16848595291376114, "num_tokens": 14869260.0, "step": 8055 }, { "entropy": 5.671531009674072, "epoch": 0.6771686620457887, "grad_norm": 0.99609375, "learning_rate": 0.0004960395075430222, "loss": 5.4232, "mean_token_accuracy": 0.16953370571136475, "num_tokens": 14878685.0, "step": 8060 }, { "entropy": 5.6622340202331545, "epoch": 0.6775887418609536, "grad_norm": 0.86328125, "learning_rate": 0.0004960339115457748, "loss": 5.4268, "mean_token_accuracy": 0.1659297153353691, "num_tokens": 14888456.0, "step": 8065 }, { "entropy": 5.7056151866912845, "epoch": 0.6780088216761184, "grad_norm": 1.0078125, "learning_rate": 0.0004960283116330255, "loss": 5.5355, "mean_token_accuracy": 0.16633066833019255, "num_tokens": 14897401.0, "step": 8070 }, { "entropy": 5.736236810684204, "epoch": 0.6784289014912833, "grad_norm": 0.93359375, "learning_rate": 0.0004960227078048735, "loss": 5.4573, "mean_token_accuracy": 0.16657501608133315, "num_tokens": 14906741.0, "step": 8075 }, { "entropy": 5.738412714004516, "epoch": 0.6788489813064482, "grad_norm": 0.9296875, "learning_rate": 0.0004960171000614179, "loss": 5.352, "mean_token_accuracy": 0.1779150739312172, "num_tokens": 14916002.0, "step": 8080 }, { "entropy": 5.5718223571777346, "epoch": 0.6792690611216131, "grad_norm": 1.046875, "learning_rate": 0.0004960114884027583, "loss": 5.293, "mean_token_accuracy": 0.18619335889816285, "num_tokens": 14925247.0, "step": 8085 }, { "entropy": 5.642387247085571, "epoch": 0.679689140936778, "grad_norm": 0.97265625, "learning_rate": 0.0004960058728289939, "loss": 5.4049, "mean_token_accuracy": 0.16639461666345595, "num_tokens": 14933925.0, "step": 8090 }, { "entropy": 5.771145582199097, "epoch": 0.6801092207519429, "grad_norm": 1.1015625, "learning_rate": 0.0004960002533402243, "loss": 5.4809, "mean_token_accuracy": 0.16957206577062606, "num_tokens": 14943368.0, "step": 8095 }, { "entropy": 5.723950052261353, "epoch": 0.6805293005671077, "grad_norm": 0.9375, "learning_rate": 0.0004959946299365491, "loss": 5.492, "mean_token_accuracy": 0.16733952164649962, "num_tokens": 14953710.0, "step": 8100 }, { "entropy": 5.733155155181885, "epoch": 0.6809493803822726, "grad_norm": 0.94140625, "learning_rate": 0.0004959890026180677, "loss": 5.5124, "mean_token_accuracy": 0.16363269835710526, "num_tokens": 14962814.0, "step": 8105 }, { "entropy": 5.602628993988037, "epoch": 0.6813694601974375, "grad_norm": 0.9296875, "learning_rate": 0.00049598337138488, "loss": 5.3951, "mean_token_accuracy": 0.1761382609605789, "num_tokens": 14971631.0, "step": 8110 }, { "entropy": 5.703770446777344, "epoch": 0.6817895400126024, "grad_norm": 1.0234375, "learning_rate": 0.0004959777362370855, "loss": 5.3987, "mean_token_accuracy": 0.17302963733673096, "num_tokens": 14980528.0, "step": 8115 }, { "entropy": 5.681969165802002, "epoch": 0.6822096198277673, "grad_norm": 0.953125, "learning_rate": 0.0004959720971747843, "loss": 5.4208, "mean_token_accuracy": 0.17004417777061462, "num_tokens": 14989331.0, "step": 8120 }, { "entropy": 5.671417903900147, "epoch": 0.6826296996429322, "grad_norm": 0.9609375, "learning_rate": 0.0004959664541980762, "loss": 5.4188, "mean_token_accuracy": 0.17401942908763884, "num_tokens": 14999403.0, "step": 8125 }, { "entropy": 5.710601329803467, "epoch": 0.6830497794580971, "grad_norm": 0.98046875, "learning_rate": 0.0004959608073070612, "loss": 5.5114, "mean_token_accuracy": 0.16626310646533965, "num_tokens": 15009388.0, "step": 8130 }, { "entropy": 5.728013801574707, "epoch": 0.6834698592732619, "grad_norm": 0.9609375, "learning_rate": 0.0004959551565018392, "loss": 5.4168, "mean_token_accuracy": 0.17476363331079484, "num_tokens": 15018586.0, "step": 8135 }, { "entropy": 5.666727352142334, "epoch": 0.6838899390884268, "grad_norm": 0.9375, "learning_rate": 0.0004959495017825104, "loss": 5.4379, "mean_token_accuracy": 0.1766646295785904, "num_tokens": 15027982.0, "step": 8140 }, { "entropy": 5.610442161560059, "epoch": 0.6843100189035917, "grad_norm": 0.9765625, "learning_rate": 0.0004959438431491749, "loss": 5.4186, "mean_token_accuracy": 0.1739590048789978, "num_tokens": 15037103.0, "step": 8145 }, { "entropy": 5.644532632827759, "epoch": 0.6847300987187566, "grad_norm": 0.91015625, "learning_rate": 0.000495938180601933, "loss": 5.5204, "mean_token_accuracy": 0.17002057135105134, "num_tokens": 15046739.0, "step": 8150 }, { "entropy": 5.745840978622437, "epoch": 0.6851501785339215, "grad_norm": 0.97265625, "learning_rate": 0.0004959325141408851, "loss": 5.4676, "mean_token_accuracy": 0.1691015213727951, "num_tokens": 15056586.0, "step": 8155 }, { "entropy": 5.6763612747192385, "epoch": 0.6855702583490864, "grad_norm": 1.0078125, "learning_rate": 0.0004959268437661313, "loss": 5.449, "mean_token_accuracy": 0.1687142327427864, "num_tokens": 15066622.0, "step": 8160 }, { "entropy": 5.678127193450928, "epoch": 0.6859903381642513, "grad_norm": 1.1640625, "learning_rate": 0.0004959211694777724, "loss": 5.4139, "mean_token_accuracy": 0.17046130895614625, "num_tokens": 15075415.0, "step": 8165 }, { "entropy": 5.642941427230835, "epoch": 0.686410417979416, "grad_norm": 0.90234375, "learning_rate": 0.0004959154912759086, "loss": 5.41, "mean_token_accuracy": 0.16956793367862702, "num_tokens": 15085087.0, "step": 8170 }, { "entropy": 5.674217653274536, "epoch": 0.6868304977945809, "grad_norm": 1.015625, "learning_rate": 0.0004959098091606406, "loss": 5.4276, "mean_token_accuracy": 0.17282803803682328, "num_tokens": 15093580.0, "step": 8175 }, { "entropy": 5.6009931564331055, "epoch": 0.6872505776097458, "grad_norm": 1.1328125, "learning_rate": 0.0004959041231320692, "loss": 5.4085, "mean_token_accuracy": 0.1755705252289772, "num_tokens": 15104033.0, "step": 8180 }, { "entropy": 5.684960508346558, "epoch": 0.6876706574249107, "grad_norm": 1.078125, "learning_rate": 0.0004958984331902951, "loss": 5.4874, "mean_token_accuracy": 0.16569405645132065, "num_tokens": 15113164.0, "step": 8185 }, { "entropy": 5.652985095977783, "epoch": 0.6880907372400756, "grad_norm": 0.9765625, "learning_rate": 0.0004958927393354188, "loss": 5.4253, "mean_token_accuracy": 0.1720282956957817, "num_tokens": 15122215.0, "step": 8190 }, { "entropy": 5.695438480377197, "epoch": 0.6885108170552405, "grad_norm": 0.99609375, "learning_rate": 0.0004958870415675415, "loss": 5.4113, "mean_token_accuracy": 0.1668440580368042, "num_tokens": 15130877.0, "step": 8195 }, { "entropy": 5.642064094543457, "epoch": 0.6889308968704054, "grad_norm": 0.98046875, "learning_rate": 0.0004958813398867639, "loss": 5.395, "mean_token_accuracy": 0.1738823726773262, "num_tokens": 15140227.0, "step": 8200 }, { "entropy": 5.766198778152466, "epoch": 0.6893509766855702, "grad_norm": 0.9375, "learning_rate": 0.0004958756342931872, "loss": 5.5718, "mean_token_accuracy": 0.16096531748771667, "num_tokens": 15150006.0, "step": 8205 }, { "entropy": 5.710210561752319, "epoch": 0.6897710565007351, "grad_norm": 0.94140625, "learning_rate": 0.0004958699247869122, "loss": 5.4481, "mean_token_accuracy": 0.17095823884010314, "num_tokens": 15160032.0, "step": 8210 }, { "entropy": 5.657556676864624, "epoch": 0.6901911363159, "grad_norm": 0.9140625, "learning_rate": 0.0004958642113680404, "loss": 5.4142, "mean_token_accuracy": 0.169447460770607, "num_tokens": 15168966.0, "step": 8215 }, { "entropy": 5.795995044708252, "epoch": 0.6906112161310649, "grad_norm": 1.1484375, "learning_rate": 0.0004958584940366727, "loss": 5.5844, "mean_token_accuracy": 0.16470508724451066, "num_tokens": 15179337.0, "step": 8220 }, { "entropy": 5.751201152801514, "epoch": 0.6910312959462298, "grad_norm": 0.921875, "learning_rate": 0.0004958527727929106, "loss": 5.4862, "mean_token_accuracy": 0.16709066778421403, "num_tokens": 15188395.0, "step": 8225 }, { "entropy": 5.686253881454467, "epoch": 0.6914513757613947, "grad_norm": 0.91796875, "learning_rate": 0.0004958470476368552, "loss": 5.405, "mean_token_accuracy": 0.17547234743833542, "num_tokens": 15198669.0, "step": 8230 }, { "entropy": 5.664592313766479, "epoch": 0.6918714555765595, "grad_norm": 1.03125, "learning_rate": 0.0004958413185686082, "loss": 5.4306, "mean_token_accuracy": 0.17083754986524582, "num_tokens": 15207371.0, "step": 8235 }, { "entropy": 5.721945762634277, "epoch": 0.6922915353917244, "grad_norm": 1.0078125, "learning_rate": 0.0004958355855882709, "loss": 5.4682, "mean_token_accuracy": 0.16971587836742402, "num_tokens": 15215694.0, "step": 8240 }, { "entropy": 5.71007285118103, "epoch": 0.6927116152068893, "grad_norm": 0.95703125, "learning_rate": 0.000495829848695945, "loss": 5.4325, "mean_token_accuracy": 0.16733045727014542, "num_tokens": 15224963.0, "step": 8245 }, { "entropy": 5.581372213363648, "epoch": 0.6931316950220542, "grad_norm": 0.97265625, "learning_rate": 0.000495824107891732, "loss": 5.2622, "mean_token_accuracy": 0.17676324248313904, "num_tokens": 15233569.0, "step": 8250 }, { "entropy": 5.66594181060791, "epoch": 0.6935517748372191, "grad_norm": 0.98046875, "learning_rate": 0.0004958183631757336, "loss": 5.4461, "mean_token_accuracy": 0.16882045865058898, "num_tokens": 15242671.0, "step": 8255 }, { "entropy": 5.650574827194214, "epoch": 0.693971854652384, "grad_norm": 0.9453125, "learning_rate": 0.0004958126145480517, "loss": 5.402, "mean_token_accuracy": 0.17607998102903366, "num_tokens": 15251698.0, "step": 8260 }, { "entropy": 5.7385705471038815, "epoch": 0.6943919344675489, "grad_norm": 1.0390625, "learning_rate": 0.0004958068620087879, "loss": 5.5143, "mean_token_accuracy": 0.16893487125635148, "num_tokens": 15260608.0, "step": 8265 }, { "entropy": 5.675213384628296, "epoch": 0.6948120142827137, "grad_norm": 0.97265625, "learning_rate": 0.0004958011055580443, "loss": 5.3836, "mean_token_accuracy": 0.1763719156384468, "num_tokens": 15268866.0, "step": 8270 }, { "entropy": 5.606337022781372, "epoch": 0.6952320940978786, "grad_norm": 1.0, "learning_rate": 0.0004957953451959229, "loss": 5.3531, "mean_token_accuracy": 0.18196955025196077, "num_tokens": 15277600.0, "step": 8275 }, { "entropy": 5.608048725128174, "epoch": 0.6956521739130435, "grad_norm": 0.90625, "learning_rate": 0.0004957895809225254, "loss": 5.3712, "mean_token_accuracy": 0.1749396950006485, "num_tokens": 15286016.0, "step": 8280 }, { "entropy": 5.654298305511475, "epoch": 0.6960722537282084, "grad_norm": 0.98046875, "learning_rate": 0.0004957838127379544, "loss": 5.4302, "mean_token_accuracy": 0.1747421383857727, "num_tokens": 15294676.0, "step": 8285 }, { "entropy": 5.679285192489624, "epoch": 0.6964923335433733, "grad_norm": 0.94921875, "learning_rate": 0.0004957780406423118, "loss": 5.4205, "mean_token_accuracy": 0.17060777693986892, "num_tokens": 15304084.0, "step": 8290 }, { "entropy": 5.68450517654419, "epoch": 0.6969124133585382, "grad_norm": 1.0390625, "learning_rate": 0.0004957722646356999, "loss": 5.4217, "mean_token_accuracy": 0.16955252438783647, "num_tokens": 15314182.0, "step": 8295 }, { "entropy": 5.683474969863892, "epoch": 0.697332493173703, "grad_norm": 0.94921875, "learning_rate": 0.0004957664847182209, "loss": 5.5497, "mean_token_accuracy": 0.16295086219906807, "num_tokens": 15324213.0, "step": 8300 }, { "entropy": 5.713848733901978, "epoch": 0.6977525729888678, "grad_norm": 1.078125, "learning_rate": 0.0004957607008899774, "loss": 5.4767, "mean_token_accuracy": 0.16117004156112671, "num_tokens": 15333122.0, "step": 8305 }, { "entropy": 5.745818662643432, "epoch": 0.6981726528040327, "grad_norm": 1.015625, "learning_rate": 0.0004957549131510717, "loss": 5.5546, "mean_token_accuracy": 0.1615897461771965, "num_tokens": 15342199.0, "step": 8310 }, { "entropy": 5.769120693206787, "epoch": 0.6985927326191976, "grad_norm": 0.9453125, "learning_rate": 0.0004957491215016065, "loss": 5.5272, "mean_token_accuracy": 0.1642034813761711, "num_tokens": 15352463.0, "step": 8315 }, { "entropy": 5.628503799438477, "epoch": 0.6990128124343625, "grad_norm": 1.0078125, "learning_rate": 0.0004957433259416841, "loss": 5.3547, "mean_token_accuracy": 0.16713788211345673, "num_tokens": 15361815.0, "step": 8320 }, { "entropy": 5.698371410369873, "epoch": 0.6994328922495274, "grad_norm": 0.9609375, "learning_rate": 0.0004957375264714075, "loss": 5.4572, "mean_token_accuracy": 0.16114626228809356, "num_tokens": 15371773.0, "step": 8325 }, { "entropy": 5.616191101074219, "epoch": 0.6998529720646923, "grad_norm": 0.96484375, "learning_rate": 0.0004957317230908792, "loss": 5.4195, "mean_token_accuracy": 0.16928454339504242, "num_tokens": 15380881.0, "step": 8330 }, { "entropy": 5.6188719272613525, "epoch": 0.7002730518798572, "grad_norm": 0.99609375, "learning_rate": 0.0004957259158002022, "loss": 5.2754, "mean_token_accuracy": 0.17819535881280898, "num_tokens": 15389310.0, "step": 8335 }, { "entropy": 5.60901665687561, "epoch": 0.700693131695022, "grad_norm": 0.94921875, "learning_rate": 0.0004957201045994791, "loss": 5.3881, "mean_token_accuracy": 0.17114198654890062, "num_tokens": 15398584.0, "step": 8340 }, { "entropy": 5.68906078338623, "epoch": 0.7011132115101869, "grad_norm": 1.0078125, "learning_rate": 0.0004957142894888131, "loss": 5.4298, "mean_token_accuracy": 0.17326382100582122, "num_tokens": 15407208.0, "step": 8345 }, { "entropy": 5.684459161758423, "epoch": 0.7015332913253518, "grad_norm": 0.92578125, "learning_rate": 0.0004957084704683071, "loss": 5.466, "mean_token_accuracy": 0.16816584765911102, "num_tokens": 15416474.0, "step": 8350 }, { "entropy": 5.681583738327026, "epoch": 0.7019533711405167, "grad_norm": 0.98828125, "learning_rate": 0.0004957026475380642, "loss": 5.4514, "mean_token_accuracy": 0.17442281246185304, "num_tokens": 15426101.0, "step": 8355 }, { "entropy": 5.714492893218994, "epoch": 0.7023734509556816, "grad_norm": 1.0078125, "learning_rate": 0.0004956968206981875, "loss": 5.4927, "mean_token_accuracy": 0.16649986654520035, "num_tokens": 15435910.0, "step": 8360 }, { "entropy": 5.751401758193969, "epoch": 0.7027935307708465, "grad_norm": 0.96484375, "learning_rate": 0.0004956909899487803, "loss": 5.5138, "mean_token_accuracy": 0.1684841424226761, "num_tokens": 15445494.0, "step": 8365 }, { "entropy": 5.6478063583374025, "epoch": 0.7032136105860114, "grad_norm": 0.84375, "learning_rate": 0.0004956851552899459, "loss": 5.4225, "mean_token_accuracy": 0.17504344284534454, "num_tokens": 15455332.0, "step": 8370 }, { "entropy": 5.68330488204956, "epoch": 0.7036336904011762, "grad_norm": 0.91796875, "learning_rate": 0.0004956793167217874, "loss": 5.484, "mean_token_accuracy": 0.16238084584474563, "num_tokens": 15464241.0, "step": 8375 }, { "entropy": 5.763062286376953, "epoch": 0.7040537702163411, "grad_norm": 1.109375, "learning_rate": 0.0004956734742444087, "loss": 5.4807, "mean_token_accuracy": 0.17389054596424103, "num_tokens": 15473473.0, "step": 8380 }, { "entropy": 5.65634708404541, "epoch": 0.704473850031506, "grad_norm": 1.0390625, "learning_rate": 0.0004956676278579129, "loss": 5.3614, "mean_token_accuracy": 0.1734451323747635, "num_tokens": 15482494.0, "step": 8385 }, { "entropy": 5.589113998413086, "epoch": 0.7048939298466709, "grad_norm": 0.94140625, "learning_rate": 0.0004956617775624037, "loss": 5.3843, "mean_token_accuracy": 0.17073923498392105, "num_tokens": 15491180.0, "step": 8390 }, { "entropy": 5.657165622711181, "epoch": 0.7053140096618358, "grad_norm": 1.0078125, "learning_rate": 0.0004956559233579848, "loss": 5.4323, "mean_token_accuracy": 0.16821539252996445, "num_tokens": 15501035.0, "step": 8395 }, { "entropy": 5.6735498905181885, "epoch": 0.7057340894770007, "grad_norm": 0.96875, "learning_rate": 0.0004956500652447598, "loss": 5.426, "mean_token_accuracy": 0.17123993486166, "num_tokens": 15510191.0, "step": 8400 }, { "entropy": 5.642197751998902, "epoch": 0.7061541692921655, "grad_norm": 0.9453125, "learning_rate": 0.0004956442032228324, "loss": 5.486, "mean_token_accuracy": 0.17094990164041518, "num_tokens": 15519253.0, "step": 8405 }, { "entropy": 5.679303741455078, "epoch": 0.7065742491073304, "grad_norm": 0.9921875, "learning_rate": 0.0004956383372923067, "loss": 5.4521, "mean_token_accuracy": 0.16864797472953796, "num_tokens": 15528348.0, "step": 8410 }, { "entropy": 5.756013870239258, "epoch": 0.7069943289224953, "grad_norm": 0.89453125, "learning_rate": 0.0004956324674532864, "loss": 5.5294, "mean_token_accuracy": 0.16600346565246582, "num_tokens": 15537557.0, "step": 8415 }, { "entropy": 5.761501026153565, "epoch": 0.7074144087376601, "grad_norm": 0.90234375, "learning_rate": 0.0004956265937058757, "loss": 5.4449, "mean_token_accuracy": 0.17098963260650635, "num_tokens": 15546745.0, "step": 8420 }, { "entropy": 5.6882706642150875, "epoch": 0.707834488552825, "grad_norm": 0.96875, "learning_rate": 0.0004956207160501784, "loss": 5.3722, "mean_token_accuracy": 0.17445978671312332, "num_tokens": 15555532.0, "step": 8425 }, { "entropy": 5.64253797531128, "epoch": 0.70825456836799, "grad_norm": 0.9765625, "learning_rate": 0.0004956148344862987, "loss": 5.4332, "mean_token_accuracy": 0.17582773566246032, "num_tokens": 15564189.0, "step": 8430 }, { "entropy": 5.58995246887207, "epoch": 0.7086746481831548, "grad_norm": 0.9453125, "learning_rate": 0.0004956089490143408, "loss": 5.4465, "mean_token_accuracy": 0.16621713638305663, "num_tokens": 15574116.0, "step": 8435 }, { "entropy": 5.764248561859131, "epoch": 0.7090947279983196, "grad_norm": 0.98828125, "learning_rate": 0.0004956030596344089, "loss": 5.4297, "mean_token_accuracy": 0.1704532414674759, "num_tokens": 15583031.0, "step": 8440 }, { "entropy": 5.756300067901611, "epoch": 0.7095148078134845, "grad_norm": 0.8671875, "learning_rate": 0.0004955971663466075, "loss": 5.5617, "mean_token_accuracy": 0.1687937393784523, "num_tokens": 15592576.0, "step": 8445 }, { "entropy": 5.753180599212646, "epoch": 0.7099348876286494, "grad_norm": 0.96484375, "learning_rate": 0.0004955912691510407, "loss": 5.479, "mean_token_accuracy": 0.17366782128810881, "num_tokens": 15601065.0, "step": 8450 }, { "entropy": 5.669482135772705, "epoch": 0.7103549674438143, "grad_norm": 0.98828125, "learning_rate": 0.0004955853680478134, "loss": 5.4236, "mean_token_accuracy": 0.16465443670749663, "num_tokens": 15610112.0, "step": 8455 }, { "entropy": 5.672362327575684, "epoch": 0.7107750472589792, "grad_norm": 0.984375, "learning_rate": 0.0004955794630370297, "loss": 5.4069, "mean_token_accuracy": 0.16875406056642533, "num_tokens": 15618890.0, "step": 8460 }, { "entropy": 5.661868476867676, "epoch": 0.7111951270741441, "grad_norm": 0.95703125, "learning_rate": 0.0004955735541187945, "loss": 5.4497, "mean_token_accuracy": 0.17067780196666718, "num_tokens": 15627678.0, "step": 8465 }, { "entropy": 5.752597522735596, "epoch": 0.711615206889309, "grad_norm": 1.0546875, "learning_rate": 0.0004955676412932124, "loss": 5.4364, "mean_token_accuracy": 0.17605146616697312, "num_tokens": 15636833.0, "step": 8470 }, { "entropy": 5.6645129203796385, "epoch": 0.7120352867044738, "grad_norm": 1.0390625, "learning_rate": 0.0004955617245603881, "loss": 5.4432, "mean_token_accuracy": 0.1653780534863472, "num_tokens": 15646571.0, "step": 8475 }, { "entropy": 5.627372455596924, "epoch": 0.7124553665196387, "grad_norm": 1.0546875, "learning_rate": 0.0004955558039204263, "loss": 5.4762, "mean_token_accuracy": 0.17126149088144302, "num_tokens": 15654907.0, "step": 8480 }, { "entropy": 5.70549750328064, "epoch": 0.7128754463348036, "grad_norm": 0.98828125, "learning_rate": 0.0004955498793734321, "loss": 5.4261, "mean_token_accuracy": 0.17386842668056487, "num_tokens": 15664336.0, "step": 8485 }, { "entropy": 5.71790714263916, "epoch": 0.7132955261499685, "grad_norm": 0.98828125, "learning_rate": 0.0004955439509195103, "loss": 5.4874, "mean_token_accuracy": 0.17014443427324294, "num_tokens": 15674000.0, "step": 8490 }, { "entropy": 5.714636373519897, "epoch": 0.7137156059651334, "grad_norm": 0.9921875, "learning_rate": 0.0004955380185587661, "loss": 5.4778, "mean_token_accuracy": 0.1748084157705307, "num_tokens": 15684214.0, "step": 8495 }, { "entropy": 5.718602418899536, "epoch": 0.7141356857802983, "grad_norm": 1.0390625, "learning_rate": 0.0004955320822913043, "loss": 5.4875, "mean_token_accuracy": 0.1683593288064003, "num_tokens": 15693546.0, "step": 8500 }, { "entropy": 5.689378499984741, "epoch": 0.7145557655954632, "grad_norm": 0.95703125, "learning_rate": 0.0004955261421172302, "loss": 5.3905, "mean_token_accuracy": 0.17118469923734664, "num_tokens": 15702310.0, "step": 8505 }, { "entropy": 5.666873931884766, "epoch": 0.714975845410628, "grad_norm": 1.0234375, "learning_rate": 0.0004955201980366493, "loss": 5.4483, "mean_token_accuracy": 0.17365573197603226, "num_tokens": 15711544.0, "step": 8510 }, { "entropy": 5.563192462921142, "epoch": 0.7153959252257929, "grad_norm": 1.015625, "learning_rate": 0.0004955142500496665, "loss": 5.3457, "mean_token_accuracy": 0.17584063559770585, "num_tokens": 15720914.0, "step": 8515 }, { "entropy": 5.685083055496216, "epoch": 0.7158160050409578, "grad_norm": 0.96875, "learning_rate": 0.0004955082981563872, "loss": 5.4339, "mean_token_accuracy": 0.1660164326429367, "num_tokens": 15729825.0, "step": 8520 }, { "entropy": 5.712343072891235, "epoch": 0.7162360848561227, "grad_norm": 0.97265625, "learning_rate": 0.000495502342356917, "loss": 5.4367, "mean_token_accuracy": 0.17206156849861146, "num_tokens": 15739649.0, "step": 8525 }, { "entropy": 5.704998302459717, "epoch": 0.7166561646712876, "grad_norm": 1.078125, "learning_rate": 0.0004954963826513614, "loss": 5.3455, "mean_token_accuracy": 0.1762731447815895, "num_tokens": 15747805.0, "step": 8530 }, { "entropy": 5.720434188842773, "epoch": 0.7170762444864525, "grad_norm": 0.9140625, "learning_rate": 0.000495490419039826, "loss": 5.4727, "mean_token_accuracy": 0.16803782433271408, "num_tokens": 15757267.0, "step": 8535 }, { "entropy": 5.670850324630737, "epoch": 0.7174963243016174, "grad_norm": 0.9375, "learning_rate": 0.0004954844515224162, "loss": 5.4318, "mean_token_accuracy": 0.1713361293077469, "num_tokens": 15767412.0, "step": 8540 }, { "entropy": 5.62501802444458, "epoch": 0.7179164041167821, "grad_norm": 1.0078125, "learning_rate": 0.0004954784800992379, "loss": 5.4551, "mean_token_accuracy": 0.16513469964265823, "num_tokens": 15776813.0, "step": 8545 }, { "entropy": 5.7299168586730955, "epoch": 0.718336483931947, "grad_norm": 0.953125, "learning_rate": 0.0004954725047703969, "loss": 5.4713, "mean_token_accuracy": 0.16940504908561707, "num_tokens": 15786258.0, "step": 8550 }, { "entropy": 5.700893259048462, "epoch": 0.7187565637471119, "grad_norm": 0.95703125, "learning_rate": 0.000495466525535999, "loss": 5.4489, "mean_token_accuracy": 0.1717313602566719, "num_tokens": 15795673.0, "step": 8555 }, { "entropy": 5.704733180999756, "epoch": 0.7191766435622768, "grad_norm": 0.9609375, "learning_rate": 0.0004954605423961501, "loss": 5.4469, "mean_token_accuracy": 0.16943047791719437, "num_tokens": 15805050.0, "step": 8560 }, { "entropy": 5.609813070297241, "epoch": 0.7195967233774417, "grad_norm": 0.9921875, "learning_rate": 0.0004954545553509562, "loss": 5.3888, "mean_token_accuracy": 0.181489497423172, "num_tokens": 15813347.0, "step": 8565 }, { "entropy": 5.7514173030853275, "epoch": 0.7200168031926066, "grad_norm": 0.92578125, "learning_rate": 0.0004954485644005235, "loss": 5.517, "mean_token_accuracy": 0.16980722844600676, "num_tokens": 15823528.0, "step": 8570 }, { "entropy": 5.688001346588135, "epoch": 0.7204368830077714, "grad_norm": 1.0078125, "learning_rate": 0.0004954425695449578, "loss": 5.4175, "mean_token_accuracy": 0.16882595270872117, "num_tokens": 15832727.0, "step": 8575 }, { "entropy": 5.731327629089355, "epoch": 0.7208569628229363, "grad_norm": 0.8828125, "learning_rate": 0.0004954365707843657, "loss": 5.4921, "mean_token_accuracy": 0.16411554515361787, "num_tokens": 15842402.0, "step": 8580 }, { "entropy": 5.649729061126709, "epoch": 0.7212770426381012, "grad_norm": 0.98046875, "learning_rate": 0.0004954305681188531, "loss": 5.3546, "mean_token_accuracy": 0.17328893691301345, "num_tokens": 15850886.0, "step": 8585 }, { "entropy": 5.83349871635437, "epoch": 0.7216971224532661, "grad_norm": 1.125, "learning_rate": 0.0004954245615485265, "loss": 5.6498, "mean_token_accuracy": 0.16681575551629066, "num_tokens": 15860093.0, "step": 8590 }, { "entropy": 5.686976051330566, "epoch": 0.722117202268431, "grad_norm": 1.0, "learning_rate": 0.0004954185510734924, "loss": 5.3677, "mean_token_accuracy": 0.17604906260967254, "num_tokens": 15868681.0, "step": 8595 }, { "entropy": 5.683244371414185, "epoch": 0.7225372820835959, "grad_norm": 1.0, "learning_rate": 0.0004954125366938571, "loss": 5.4332, "mean_token_accuracy": 0.1739838093519211, "num_tokens": 15878041.0, "step": 8600 }, { "entropy": 5.649758672714233, "epoch": 0.7229573618987608, "grad_norm": 0.984375, "learning_rate": 0.0004954065184097271, "loss": 5.4359, "mean_token_accuracy": 0.16881918907165527, "num_tokens": 15887562.0, "step": 8605 }, { "entropy": 5.665257740020752, "epoch": 0.7233774417139256, "grad_norm": 1.03125, "learning_rate": 0.0004954004962212092, "loss": 5.3612, "mean_token_accuracy": 0.18196807503700257, "num_tokens": 15896480.0, "step": 8610 }, { "entropy": 5.814667558670044, "epoch": 0.7237975215290905, "grad_norm": 0.9296875, "learning_rate": 0.0004953944701284101, "loss": 5.5867, "mean_token_accuracy": 0.1643269270658493, "num_tokens": 15906743.0, "step": 8615 }, { "entropy": 5.7049860000610355, "epoch": 0.7242176013442554, "grad_norm": 0.94140625, "learning_rate": 0.0004953884401314363, "loss": 5.5353, "mean_token_accuracy": 0.15585060715675353, "num_tokens": 15915981.0, "step": 8620 }, { "entropy": 5.70078763961792, "epoch": 0.7246376811594203, "grad_norm": 0.98828125, "learning_rate": 0.0004953824062303949, "loss": 5.3731, "mean_token_accuracy": 0.17181483656167984, "num_tokens": 15924117.0, "step": 8625 }, { "entropy": 5.644768333435058, "epoch": 0.7250577609745852, "grad_norm": 0.98828125, "learning_rate": 0.0004953763684253926, "loss": 5.3972, "mean_token_accuracy": 0.17694538086652756, "num_tokens": 15933124.0, "step": 8630 }, { "entropy": 5.634309864044189, "epoch": 0.7254778407897501, "grad_norm": 0.96875, "learning_rate": 0.0004953703267165364, "loss": 5.2899, "mean_token_accuracy": 0.16893458366394043, "num_tokens": 15942422.0, "step": 8635 }, { "entropy": 5.707332992553711, "epoch": 0.725897920604915, "grad_norm": 1.0078125, "learning_rate": 0.0004953642811039332, "loss": 5.5175, "mean_token_accuracy": 0.16249137073755265, "num_tokens": 15950989.0, "step": 8640 }, { "entropy": 5.731482267379761, "epoch": 0.7263180004200798, "grad_norm": 0.94921875, "learning_rate": 0.0004953582315876904, "loss": 5.5094, "mean_token_accuracy": 0.17023360580205918, "num_tokens": 15959659.0, "step": 8645 }, { "entropy": 5.667933464050293, "epoch": 0.7267380802352447, "grad_norm": 0.984375, "learning_rate": 0.000495352178167915, "loss": 5.3771, "mean_token_accuracy": 0.184345543384552, "num_tokens": 15968102.0, "step": 8650 }, { "entropy": 5.740264129638672, "epoch": 0.7271581600504096, "grad_norm": 0.99609375, "learning_rate": 0.0004953461208447143, "loss": 5.5071, "mean_token_accuracy": 0.16474466323852538, "num_tokens": 15977705.0, "step": 8655 }, { "entropy": 5.689959383010864, "epoch": 0.7275782398655745, "grad_norm": 1.0546875, "learning_rate": 0.0004953400596181953, "loss": 5.5055, "mean_token_accuracy": 0.1639854222536087, "num_tokens": 15986703.0, "step": 8660 }, { "entropy": 5.696602773666382, "epoch": 0.7279983196807394, "grad_norm": 0.9765625, "learning_rate": 0.0004953339944884657, "loss": 5.413, "mean_token_accuracy": 0.17565635591745377, "num_tokens": 15995672.0, "step": 8665 }, { "entropy": 5.609744215011597, "epoch": 0.7284183994959043, "grad_norm": 0.95703125, "learning_rate": 0.0004953279254556329, "loss": 5.3653, "mean_token_accuracy": 0.17787732928991318, "num_tokens": 16004437.0, "step": 8670 }, { "entropy": 5.684988927841187, "epoch": 0.7288384793110692, "grad_norm": 1.0625, "learning_rate": 0.0004953218525198043, "loss": 5.3973, "mean_token_accuracy": 0.16983381360769273, "num_tokens": 16012847.0, "step": 8675 }, { "entropy": 5.713296937942505, "epoch": 0.7292585591262339, "grad_norm": 0.91015625, "learning_rate": 0.0004953157756810876, "loss": 5.4282, "mean_token_accuracy": 0.1725606307387352, "num_tokens": 16022213.0, "step": 8680 }, { "entropy": 5.682115602493286, "epoch": 0.7296786389413988, "grad_norm": 0.93359375, "learning_rate": 0.0004953096949395902, "loss": 5.4937, "mean_token_accuracy": 0.17437110096216202, "num_tokens": 16031411.0, "step": 8685 }, { "entropy": 5.728541135787964, "epoch": 0.7300987187565637, "grad_norm": 0.921875, "learning_rate": 0.0004953036102954202, "loss": 5.5302, "mean_token_accuracy": 0.1656832292675972, "num_tokens": 16041227.0, "step": 8690 }, { "entropy": 5.642122364044189, "epoch": 0.7305187985717286, "grad_norm": 0.921875, "learning_rate": 0.0004952975217486852, "loss": 5.351, "mean_token_accuracy": 0.17716092318296434, "num_tokens": 16049777.0, "step": 8695 }, { "entropy": 5.650162410736084, "epoch": 0.7309388783868935, "grad_norm": 0.92578125, "learning_rate": 0.0004952914292994928, "loss": 5.4486, "mean_token_accuracy": 0.17793299108743668, "num_tokens": 16059093.0, "step": 8700 }, { "entropy": 5.73065881729126, "epoch": 0.7313589582020584, "grad_norm": 1.046875, "learning_rate": 0.0004952853329479514, "loss": 5.4722, "mean_token_accuracy": 0.17585868388414383, "num_tokens": 16068550.0, "step": 8705 }, { "entropy": 5.72199182510376, "epoch": 0.7317790380172233, "grad_norm": 1.0078125, "learning_rate": 0.0004952792326941686, "loss": 5.5006, "mean_token_accuracy": 0.1687643826007843, "num_tokens": 16078286.0, "step": 8710 }, { "entropy": 5.703016853332519, "epoch": 0.7321991178323881, "grad_norm": 0.9453125, "learning_rate": 0.0004952731285382527, "loss": 5.4409, "mean_token_accuracy": 0.17153105437755584, "num_tokens": 16087560.0, "step": 8715 }, { "entropy": 5.625837564468384, "epoch": 0.732619197647553, "grad_norm": 0.99609375, "learning_rate": 0.0004952670204803118, "loss": 5.4053, "mean_token_accuracy": 0.17746168673038482, "num_tokens": 16097478.0, "step": 8720 }, { "entropy": 5.729750633239746, "epoch": 0.7330392774627179, "grad_norm": 0.9296875, "learning_rate": 0.0004952609085204539, "loss": 5.5063, "mean_token_accuracy": 0.1720663473010063, "num_tokens": 16106884.0, "step": 8725 }, { "entropy": 5.6860052108764645, "epoch": 0.7334593572778828, "grad_norm": 0.9921875, "learning_rate": 0.0004952547926587876, "loss": 5.441, "mean_token_accuracy": 0.16728868782520295, "num_tokens": 16115689.0, "step": 8730 }, { "entropy": 5.649883890151978, "epoch": 0.7338794370930477, "grad_norm": 0.95703125, "learning_rate": 0.0004952486728954209, "loss": 5.3619, "mean_token_accuracy": 0.1765601083636284, "num_tokens": 16125237.0, "step": 8735 }, { "entropy": 5.612076044082642, "epoch": 0.7342995169082126, "grad_norm": 0.953125, "learning_rate": 0.0004952425492304624, "loss": 5.3915, "mean_token_accuracy": 0.17717382460832595, "num_tokens": 16133940.0, "step": 8740 }, { "entropy": 5.666138172149658, "epoch": 0.7347195967233774, "grad_norm": 1.0234375, "learning_rate": 0.0004952364216640207, "loss": 5.4628, "mean_token_accuracy": 0.17273564040660858, "num_tokens": 16143256.0, "step": 8745 }, { "entropy": 5.705369853973389, "epoch": 0.7351396765385423, "grad_norm": 0.8671875, "learning_rate": 0.000495230290196204, "loss": 5.3631, "mean_token_accuracy": 0.17574335932731627, "num_tokens": 16153259.0, "step": 8750 }, { "entropy": 5.7202253341674805, "epoch": 0.7355597563537072, "grad_norm": 1.1796875, "learning_rate": 0.0004952241548271212, "loss": 5.5937, "mean_token_accuracy": 0.1582058347761631, "num_tokens": 16162125.0, "step": 8755 }, { "entropy": 5.736495399475098, "epoch": 0.7359798361688721, "grad_norm": 0.96484375, "learning_rate": 0.0004952180155568809, "loss": 5.5053, "mean_token_accuracy": 0.16641440689563752, "num_tokens": 16171680.0, "step": 8760 }, { "entropy": 5.735837650299072, "epoch": 0.736399915984037, "grad_norm": 0.88671875, "learning_rate": 0.0004952118723855919, "loss": 5.5002, "mean_token_accuracy": 0.1728304609656334, "num_tokens": 16181559.0, "step": 8765 }, { "entropy": 5.693457317352295, "epoch": 0.7368199957992019, "grad_norm": 0.9609375, "learning_rate": 0.0004952057253133628, "loss": 5.4567, "mean_token_accuracy": 0.1683572053909302, "num_tokens": 16190611.0, "step": 8770 }, { "entropy": 5.697279071807861, "epoch": 0.7372400756143668, "grad_norm": 1.0, "learning_rate": 0.0004951995743403028, "loss": 5.4704, "mean_token_accuracy": 0.1670347899198532, "num_tokens": 16200156.0, "step": 8775 }, { "entropy": 5.6646442890167235, "epoch": 0.7376601554295316, "grad_norm": 0.91796875, "learning_rate": 0.0004951934194665208, "loss": 5.4466, "mean_token_accuracy": 0.16776914596557618, "num_tokens": 16209808.0, "step": 8780 }, { "entropy": 5.636610317230224, "epoch": 0.7380802352446965, "grad_norm": 0.95703125, "learning_rate": 0.0004951872606921257, "loss": 5.3929, "mean_token_accuracy": 0.17188532203435897, "num_tokens": 16219243.0, "step": 8785 }, { "entropy": 5.651103210449219, "epoch": 0.7385003150598614, "grad_norm": 0.89453125, "learning_rate": 0.0004951810980172265, "loss": 5.4078, "mean_token_accuracy": 0.1813236728310585, "num_tokens": 16228180.0, "step": 8790 }, { "entropy": 5.680718803405762, "epoch": 0.7389203948750263, "grad_norm": 1.0078125, "learning_rate": 0.0004951749314419327, "loss": 5.4423, "mean_token_accuracy": 0.1698632076382637, "num_tokens": 16237045.0, "step": 8795 }, { "entropy": 5.674114608764649, "epoch": 0.7393404746901912, "grad_norm": 1.0390625, "learning_rate": 0.0004951687609663533, "loss": 5.3539, "mean_token_accuracy": 0.17704771608114242, "num_tokens": 16245307.0, "step": 8800 }, { "entropy": 5.695535135269165, "epoch": 0.739760554505356, "grad_norm": 0.91796875, "learning_rate": 0.0004951625865905977, "loss": 5.394, "mean_token_accuracy": 0.16814257353544235, "num_tokens": 16255047.0, "step": 8805 }, { "entropy": 5.661872816085816, "epoch": 0.740180634320521, "grad_norm": 0.9765625, "learning_rate": 0.0004951564083147753, "loss": 5.4255, "mean_token_accuracy": 0.17673753798007966, "num_tokens": 16264969.0, "step": 8810 }, { "entropy": 5.688477516174316, "epoch": 0.7406007141356857, "grad_norm": 0.9140625, "learning_rate": 0.0004951502261389953, "loss": 5.5307, "mean_token_accuracy": 0.1647869899868965, "num_tokens": 16274757.0, "step": 8815 }, { "entropy": 5.648443365097046, "epoch": 0.7410207939508506, "grad_norm": 0.9921875, "learning_rate": 0.0004951440400633677, "loss": 5.4089, "mean_token_accuracy": 0.18536317497491836, "num_tokens": 16283409.0, "step": 8820 }, { "entropy": 5.6198039054870605, "epoch": 0.7414408737660155, "grad_norm": 1.0234375, "learning_rate": 0.0004951378500880015, "loss": 5.3963, "mean_token_accuracy": 0.1721627041697502, "num_tokens": 16293206.0, "step": 8825 }, { "entropy": 5.728368425369263, "epoch": 0.7418609535811804, "grad_norm": 1.0234375, "learning_rate": 0.0004951316562130067, "loss": 5.4044, "mean_token_accuracy": 0.1735895723104477, "num_tokens": 16303121.0, "step": 8830 }, { "entropy": 5.662971448898316, "epoch": 0.7422810333963453, "grad_norm": 0.87109375, "learning_rate": 0.000495125458438493, "loss": 5.3791, "mean_token_accuracy": 0.1796284094452858, "num_tokens": 16312710.0, "step": 8835 }, { "entropy": 5.774871492385865, "epoch": 0.7427011132115102, "grad_norm": 1.0078125, "learning_rate": 0.0004951192567645702, "loss": 5.59, "mean_token_accuracy": 0.16738404482603073, "num_tokens": 16322280.0, "step": 8840 }, { "entropy": 5.625352287292481, "epoch": 0.7431211930266751, "grad_norm": 0.99609375, "learning_rate": 0.0004951130511913481, "loss": 5.4247, "mean_token_accuracy": 0.17108162343502045, "num_tokens": 16331656.0, "step": 8845 }, { "entropy": 5.648809814453125, "epoch": 0.7435412728418399, "grad_norm": 0.97265625, "learning_rate": 0.0004951068417189366, "loss": 5.4397, "mean_token_accuracy": 0.17098239809274673, "num_tokens": 16341074.0, "step": 8850 }, { "entropy": 5.7191088676452635, "epoch": 0.7439613526570048, "grad_norm": 0.9609375, "learning_rate": 0.0004951006283474457, "loss": 5.4336, "mean_token_accuracy": 0.16694654077291488, "num_tokens": 16350097.0, "step": 8855 }, { "entropy": 5.541397190093994, "epoch": 0.7443814324721697, "grad_norm": 0.94140625, "learning_rate": 0.0004950944110769856, "loss": 5.3389, "mean_token_accuracy": 0.17541486024856567, "num_tokens": 16359274.0, "step": 8860 }, { "entropy": 5.590544271469116, "epoch": 0.7448015122873346, "grad_norm": 0.9921875, "learning_rate": 0.0004950881899076663, "loss": 5.3201, "mean_token_accuracy": 0.18839261084794998, "num_tokens": 16368445.0, "step": 8865 }, { "entropy": 5.711699295043945, "epoch": 0.7452215921024995, "grad_norm": 0.91796875, "learning_rate": 0.0004950819648395979, "loss": 5.4246, "mean_token_accuracy": 0.17058226317167283, "num_tokens": 16377689.0, "step": 8870 }, { "entropy": 5.673012161254883, "epoch": 0.7456416719176644, "grad_norm": 0.95703125, "learning_rate": 0.000495075735872891, "loss": 5.3794, "mean_token_accuracy": 0.16905369162559508, "num_tokens": 16386713.0, "step": 8875 }, { "entropy": 5.6853879451751705, "epoch": 0.7460617517328293, "grad_norm": 1.0390625, "learning_rate": 0.0004950695030076557, "loss": 5.4044, "mean_token_accuracy": 0.17320440262556075, "num_tokens": 16395390.0, "step": 8880 }, { "entropy": 5.667076969146729, "epoch": 0.7464818315479941, "grad_norm": 1.125, "learning_rate": 0.0004950632662440027, "loss": 5.4718, "mean_token_accuracy": 0.17193606197834016, "num_tokens": 16404531.0, "step": 8885 }, { "entropy": 5.617186498641968, "epoch": 0.746901911363159, "grad_norm": 1.015625, "learning_rate": 0.0004950570255820419, "loss": 5.366, "mean_token_accuracy": 0.1783306986093521, "num_tokens": 16413649.0, "step": 8890 }, { "entropy": 5.63929853439331, "epoch": 0.7473219911783239, "grad_norm": 0.92578125, "learning_rate": 0.0004950507810218843, "loss": 5.4949, "mean_token_accuracy": 0.16654341220855712, "num_tokens": 16423247.0, "step": 8895 }, { "entropy": 5.712381362915039, "epoch": 0.7477420709934888, "grad_norm": 1.0859375, "learning_rate": 0.0004950445325636405, "loss": 5.4008, "mean_token_accuracy": 0.1695175752043724, "num_tokens": 16432190.0, "step": 8900 }, { "entropy": 5.728821516036987, "epoch": 0.7481621508086537, "grad_norm": 0.87890625, "learning_rate": 0.0004950382802074211, "loss": 5.3723, "mean_token_accuracy": 0.18152749091386794, "num_tokens": 16443091.0, "step": 8905 }, { "entropy": 5.64563479423523, "epoch": 0.7485822306238186, "grad_norm": 0.9453125, "learning_rate": 0.0004950320239533369, "loss": 5.4228, "mean_token_accuracy": 0.17709367275238036, "num_tokens": 16452077.0, "step": 8910 }, { "entropy": 5.783951139450073, "epoch": 0.7490023104389834, "grad_norm": 1.1328125, "learning_rate": 0.0004950257638014986, "loss": 5.5452, "mean_token_accuracy": 0.16610444486141204, "num_tokens": 16461893.0, "step": 8915 }, { "entropy": 5.720726251602173, "epoch": 0.7494223902541483, "grad_norm": 0.9453125, "learning_rate": 0.0004950194997520172, "loss": 5.3846, "mean_token_accuracy": 0.1724250763654709, "num_tokens": 16470904.0, "step": 8920 }, { "entropy": 5.644548130035401, "epoch": 0.7498424700693131, "grad_norm": 1.1484375, "learning_rate": 0.0004950132318050037, "loss": 5.4305, "mean_token_accuracy": 0.1721814066171646, "num_tokens": 16480130.0, "step": 8925 }, { "entropy": 5.673683929443359, "epoch": 0.750262549884478, "grad_norm": 1.046875, "learning_rate": 0.0004950069599605691, "loss": 5.4916, "mean_token_accuracy": 0.17197586894035338, "num_tokens": 16489485.0, "step": 8930 }, { "entropy": 5.701393413543701, "epoch": 0.750682629699643, "grad_norm": 1.078125, "learning_rate": 0.0004950006842188245, "loss": 5.4405, "mean_token_accuracy": 0.17944686561822892, "num_tokens": 16498529.0, "step": 8935 }, { "entropy": 5.677621603012085, "epoch": 0.7511027095148078, "grad_norm": 0.98046875, "learning_rate": 0.000494994404579881, "loss": 5.3675, "mean_token_accuracy": 0.1745915085077286, "num_tokens": 16508094.0, "step": 8940 }, { "entropy": 5.678492450714112, "epoch": 0.7515227893299727, "grad_norm": 1.015625, "learning_rate": 0.00049498812104385, "loss": 5.4613, "mean_token_accuracy": 0.16839174926280975, "num_tokens": 16517620.0, "step": 8945 }, { "entropy": 5.660134744644165, "epoch": 0.7519428691451375, "grad_norm": 1.0078125, "learning_rate": 0.0004949818336108425, "loss": 5.4496, "mean_token_accuracy": 0.17120658308267594, "num_tokens": 16526720.0, "step": 8950 }, { "entropy": 5.636377191543579, "epoch": 0.7523629489603024, "grad_norm": 0.99609375, "learning_rate": 0.0004949755422809703, "loss": 5.4296, "mean_token_accuracy": 0.1683360904455185, "num_tokens": 16535979.0, "step": 8955 }, { "entropy": 5.661738634109497, "epoch": 0.7527830287754673, "grad_norm": 1.0390625, "learning_rate": 0.0004949692470543446, "loss": 5.3185, "mean_token_accuracy": 0.18360351473093034, "num_tokens": 16544538.0, "step": 8960 }, { "entropy": 5.621352195739746, "epoch": 0.7532031085906322, "grad_norm": 0.96875, "learning_rate": 0.0004949629479310769, "loss": 5.3992, "mean_token_accuracy": 0.174703386425972, "num_tokens": 16553962.0, "step": 8965 }, { "entropy": 5.662003660202027, "epoch": 0.7536231884057971, "grad_norm": 1.0, "learning_rate": 0.0004949566449112788, "loss": 5.3139, "mean_token_accuracy": 0.1806609570980072, "num_tokens": 16562652.0, "step": 8970 }, { "entropy": 5.713546705245972, "epoch": 0.754043268220962, "grad_norm": 1.03125, "learning_rate": 0.0004949503379950621, "loss": 5.4185, "mean_token_accuracy": 0.17080322057008743, "num_tokens": 16570887.0, "step": 8975 }, { "entropy": 5.730198669433594, "epoch": 0.7544633480361269, "grad_norm": 0.91015625, "learning_rate": 0.0004949440271825385, "loss": 5.5407, "mean_token_accuracy": 0.17042307704687118, "num_tokens": 16581469.0, "step": 8980 }, { "entropy": 5.709934711456299, "epoch": 0.7548834278512917, "grad_norm": 1.0078125, "learning_rate": 0.0004949377124738196, "loss": 5.4467, "mean_token_accuracy": 0.1689629077911377, "num_tokens": 16590213.0, "step": 8985 }, { "entropy": 5.6945000171661375, "epoch": 0.7553035076664566, "grad_norm": 1.015625, "learning_rate": 0.0004949313938690174, "loss": 5.4155, "mean_token_accuracy": 0.1702682465314865, "num_tokens": 16598384.0, "step": 8990 }, { "entropy": 5.618514919281006, "epoch": 0.7557235874816215, "grad_norm": 0.9765625, "learning_rate": 0.0004949250713682438, "loss": 5.397, "mean_token_accuracy": 0.1767728477716446, "num_tokens": 16607670.0, "step": 8995 }, { "entropy": 5.694167900085449, "epoch": 0.7561436672967864, "grad_norm": 0.98046875, "learning_rate": 0.0004949187449716107, "loss": 5.4889, "mean_token_accuracy": 0.171578086912632, "num_tokens": 16617560.0, "step": 9000 }, { "epoch": 0.7561436672967864, "eval_entropy": 5.450514390263918, "eval_loss": 5.435747146606445, "eval_mean_token_accuracy": 0.17926591218436613, "eval_num_tokens": 16617560.0, "eval_runtime": 21.0554, "eval_samples_per_second": 1774.652, "eval_steps_per_second": 221.843, "step": 9000 }, { "entropy": 5.6758159637451175, "epoch": 0.7565637471119513, "grad_norm": 0.98828125, "learning_rate": 0.0004949124146792304, "loss": 5.3883, "mean_token_accuracy": 0.17687657326459885, "num_tokens": 16626038.0, "step": 9005 }, { "entropy": 5.6274542808532715, "epoch": 0.7569838269271162, "grad_norm": 0.984375, "learning_rate": 0.0004949060804912149, "loss": 5.4018, "mean_token_accuracy": 0.17318589389324188, "num_tokens": 16636490.0, "step": 9010 }, { "entropy": 5.682026958465576, "epoch": 0.7574039067422811, "grad_norm": 0.984375, "learning_rate": 0.0004948997424076764, "loss": 5.413, "mean_token_accuracy": 0.17051018327474593, "num_tokens": 16645369.0, "step": 9015 }, { "entropy": 5.732332229614258, "epoch": 0.7578239865574459, "grad_norm": 1.0, "learning_rate": 0.0004948934004287272, "loss": 5.484, "mean_token_accuracy": 0.17207153737545014, "num_tokens": 16654348.0, "step": 9020 }, { "entropy": 5.762863874435425, "epoch": 0.7582440663726108, "grad_norm": 0.9375, "learning_rate": 0.0004948870545544796, "loss": 5.4804, "mean_token_accuracy": 0.1658631831407547, "num_tokens": 16664009.0, "step": 9025 }, { "entropy": 5.6739654541015625, "epoch": 0.7586641461877757, "grad_norm": 0.98828125, "learning_rate": 0.000494880704785046, "loss": 5.4876, "mean_token_accuracy": 0.17170191258192063, "num_tokens": 16674079.0, "step": 9030 }, { "entropy": 5.731717777252197, "epoch": 0.7590842260029406, "grad_norm": 0.9609375, "learning_rate": 0.0004948743511205392, "loss": 5.4384, "mean_token_accuracy": 0.1691107079386711, "num_tokens": 16683687.0, "step": 9035 }, { "entropy": 5.668745756149292, "epoch": 0.7595043058181055, "grad_norm": 0.97265625, "learning_rate": 0.0004948679935610712, "loss": 5.3443, "mean_token_accuracy": 0.18468340039253234, "num_tokens": 16693311.0, "step": 9040 }, { "entropy": 5.64910397529602, "epoch": 0.7599243856332704, "grad_norm": 0.99609375, "learning_rate": 0.000494861632106755, "loss": 5.3904, "mean_token_accuracy": 0.17058712840080262, "num_tokens": 16702121.0, "step": 9045 }, { "entropy": 5.669192361831665, "epoch": 0.7603444654484351, "grad_norm": 1.03125, "learning_rate": 0.0004948552667577033, "loss": 5.406, "mean_token_accuracy": 0.1719843327999115, "num_tokens": 16711883.0, "step": 9050 }, { "entropy": 5.662943029403687, "epoch": 0.7607645452636, "grad_norm": 0.98828125, "learning_rate": 0.0004948488975140286, "loss": 5.4657, "mean_token_accuracy": 0.16901538372039795, "num_tokens": 16721449.0, "step": 9055 }, { "entropy": 5.680603170394898, "epoch": 0.7611846250787649, "grad_norm": 1.0, "learning_rate": 0.000494842524375844, "loss": 5.4104, "mean_token_accuracy": 0.1730765789747238, "num_tokens": 16730068.0, "step": 9060 }, { "entropy": 5.667465734481811, "epoch": 0.7616047048939298, "grad_norm": 0.890625, "learning_rate": 0.0004948361473432623, "loss": 5.4285, "mean_token_accuracy": 0.17619971334934234, "num_tokens": 16739970.0, "step": 9065 }, { "entropy": 5.72866530418396, "epoch": 0.7620247847090947, "grad_norm": 0.9609375, "learning_rate": 0.0004948297664163964, "loss": 5.4784, "mean_token_accuracy": 0.1656711742281914, "num_tokens": 16749461.0, "step": 9070 }, { "entropy": 5.713016700744629, "epoch": 0.7624448645242596, "grad_norm": 0.97265625, "learning_rate": 0.0004948233815953593, "loss": 5.5532, "mean_token_accuracy": 0.1602052167057991, "num_tokens": 16758747.0, "step": 9075 }, { "entropy": 5.6398927688598635, "epoch": 0.7628649443394245, "grad_norm": 1.015625, "learning_rate": 0.0004948169928802643, "loss": 5.2702, "mean_token_accuracy": 0.17992345839738846, "num_tokens": 16767212.0, "step": 9080 }, { "entropy": 5.667152070999146, "epoch": 0.7632850241545893, "grad_norm": 0.93359375, "learning_rate": 0.0004948106002712245, "loss": 5.425, "mean_token_accuracy": 0.17433855682611465, "num_tokens": 16776514.0, "step": 9085 }, { "entropy": 5.701340246200561, "epoch": 0.7637051039697542, "grad_norm": 0.94140625, "learning_rate": 0.0004948042037683529, "loss": 5.4129, "mean_token_accuracy": 0.16563538014888762, "num_tokens": 16786310.0, "step": 9090 }, { "entropy": 5.709695339202881, "epoch": 0.7641251837849191, "grad_norm": 0.98046875, "learning_rate": 0.0004947978033717632, "loss": 5.4413, "mean_token_accuracy": 0.17060359418392182, "num_tokens": 16795551.0, "step": 9095 }, { "entropy": 5.68322868347168, "epoch": 0.764545263600084, "grad_norm": 0.90625, "learning_rate": 0.0004947913990815684, "loss": 5.4055, "mean_token_accuracy": 0.17115796357393265, "num_tokens": 16805099.0, "step": 9100 }, { "entropy": 5.681840467453003, "epoch": 0.7649653434152489, "grad_norm": 0.9609375, "learning_rate": 0.0004947849908978824, "loss": 5.4718, "mean_token_accuracy": 0.16847485899925232, "num_tokens": 16813963.0, "step": 9105 }, { "entropy": 5.740784549713135, "epoch": 0.7653854232304138, "grad_norm": 0.98828125, "learning_rate": 0.0004947785788208182, "loss": 5.4893, "mean_token_accuracy": 0.16571144610643387, "num_tokens": 16822814.0, "step": 9110 }, { "entropy": 5.741289281845093, "epoch": 0.7658055030455787, "grad_norm": 0.96875, "learning_rate": 0.0004947721628504898, "loss": 5.5179, "mean_token_accuracy": 0.16745201647281646, "num_tokens": 16831906.0, "step": 9115 }, { "entropy": 5.615249156951904, "epoch": 0.7662255828607435, "grad_norm": 0.98828125, "learning_rate": 0.0004947657429870108, "loss": 5.3477, "mean_token_accuracy": 0.17425337433815002, "num_tokens": 16840050.0, "step": 9120 }, { "entropy": 5.595571756362915, "epoch": 0.7666456626759084, "grad_norm": 1.0078125, "learning_rate": 0.0004947593192304946, "loss": 5.372, "mean_token_accuracy": 0.17037794291973113, "num_tokens": 16848404.0, "step": 9125 }, { "entropy": 5.659189987182617, "epoch": 0.7670657424910733, "grad_norm": 1.0546875, "learning_rate": 0.0004947528915810554, "loss": 5.3628, "mean_token_accuracy": 0.17191173434257506, "num_tokens": 16856568.0, "step": 9130 }, { "entropy": 5.687161254882812, "epoch": 0.7674858223062382, "grad_norm": 1.0546875, "learning_rate": 0.0004947464600388066, "loss": 5.3941, "mean_token_accuracy": 0.1742753341794014, "num_tokens": 16864936.0, "step": 9135 }, { "entropy": 5.821179676055908, "epoch": 0.7679059021214031, "grad_norm": 1.0546875, "learning_rate": 0.0004947400246038627, "loss": 5.5132, "mean_token_accuracy": 0.1674949362874031, "num_tokens": 16874504.0, "step": 9140 }, { "entropy": 5.598482227325439, "epoch": 0.768325981936568, "grad_norm": 1.046875, "learning_rate": 0.0004947335852763374, "loss": 5.2836, "mean_token_accuracy": 0.1745596244931221, "num_tokens": 16883365.0, "step": 9145 }, { "entropy": 5.6922197341918945, "epoch": 0.7687460617517329, "grad_norm": 0.98828125, "learning_rate": 0.0004947271420563447, "loss": 5.5242, "mean_token_accuracy": 0.15882697999477385, "num_tokens": 16892701.0, "step": 9150 }, { "entropy": 5.697641658782959, "epoch": 0.7691661415668977, "grad_norm": 0.96484375, "learning_rate": 0.0004947206949439989, "loss": 5.3554, "mean_token_accuracy": 0.16590554565191268, "num_tokens": 16901864.0, "step": 9155 }, { "entropy": 5.706714868545532, "epoch": 0.7695862213820626, "grad_norm": 1.0546875, "learning_rate": 0.000494714243939414, "loss": 5.3969, "mean_token_accuracy": 0.17753439396619797, "num_tokens": 16910908.0, "step": 9160 }, { "entropy": 5.638397073745727, "epoch": 0.7700063011972275, "grad_norm": 1.046875, "learning_rate": 0.0004947077890427045, "loss": 5.4198, "mean_token_accuracy": 0.17354961782693862, "num_tokens": 16920299.0, "step": 9165 }, { "entropy": 5.777770471572876, "epoch": 0.7704263810123924, "grad_norm": 0.9140625, "learning_rate": 0.0004947013302539846, "loss": 5.5436, "mean_token_accuracy": 0.15923818945884705, "num_tokens": 16930027.0, "step": 9170 }, { "entropy": 5.793980407714844, "epoch": 0.7708464608275573, "grad_norm": 1.0078125, "learning_rate": 0.0004946948675733688, "loss": 5.436, "mean_token_accuracy": 0.1715144395828247, "num_tokens": 16939387.0, "step": 9175 }, { "entropy": 5.651628112792968, "epoch": 0.7712665406427222, "grad_norm": 0.9453125, "learning_rate": 0.0004946884010009714, "loss": 5.4081, "mean_token_accuracy": 0.17500587105751036, "num_tokens": 16950024.0, "step": 9180 }, { "entropy": 5.615873241424561, "epoch": 0.771686620457887, "grad_norm": 1.1171875, "learning_rate": 0.0004946819305369073, "loss": 5.318, "mean_token_accuracy": 0.18173859417438507, "num_tokens": 16958219.0, "step": 9185 }, { "entropy": 5.6286674499511715, "epoch": 0.7721067002730518, "grad_norm": 0.93359375, "learning_rate": 0.0004946754561812909, "loss": 5.2861, "mean_token_accuracy": 0.1802246168255806, "num_tokens": 16966829.0, "step": 9190 }, { "entropy": 5.635646390914917, "epoch": 0.7725267800882167, "grad_norm": 1.03125, "learning_rate": 0.0004946689779342367, "loss": 5.3906, "mean_token_accuracy": 0.172538061439991, "num_tokens": 16975585.0, "step": 9195 }, { "entropy": 5.672124814987183, "epoch": 0.7729468599033816, "grad_norm": 0.90625, "learning_rate": 0.0004946624957958599, "loss": 5.3695, "mean_token_accuracy": 0.17662529796361923, "num_tokens": 16984848.0, "step": 9200 }, { "entropy": 5.6059465408325195, "epoch": 0.7733669397185465, "grad_norm": 1.046875, "learning_rate": 0.000494656009766275, "loss": 5.3718, "mean_token_accuracy": 0.18152703046798707, "num_tokens": 16993179.0, "step": 9205 }, { "entropy": 5.59189395904541, "epoch": 0.7737870195337114, "grad_norm": 1.0, "learning_rate": 0.000494649519845597, "loss": 5.405, "mean_token_accuracy": 0.17100384682416916, "num_tokens": 17002563.0, "step": 9210 }, { "entropy": 5.68180480003357, "epoch": 0.7742070993488763, "grad_norm": 0.94921875, "learning_rate": 0.0004946430260339409, "loss": 5.4273, "mean_token_accuracy": 0.17044441252946854, "num_tokens": 17011805.0, "step": 9215 }, { "entropy": 5.6784077167510985, "epoch": 0.7746271791640411, "grad_norm": 0.8984375, "learning_rate": 0.0004946365283314216, "loss": 5.3632, "mean_token_accuracy": 0.1790889710187912, "num_tokens": 17020398.0, "step": 9220 }, { "entropy": 5.606709718704224, "epoch": 0.775047258979206, "grad_norm": 0.921875, "learning_rate": 0.0004946300267381545, "loss": 5.3653, "mean_token_accuracy": 0.17063608020544052, "num_tokens": 17030805.0, "step": 9225 }, { "entropy": 5.677940416336059, "epoch": 0.7754673387943709, "grad_norm": 0.94140625, "learning_rate": 0.0004946235212542544, "loss": 5.4126, "mean_token_accuracy": 0.17253420054912566, "num_tokens": 17040164.0, "step": 9230 }, { "entropy": 5.6994188785552975, "epoch": 0.7758874186095358, "grad_norm": 0.97265625, "learning_rate": 0.0004946170118798367, "loss": 5.4582, "mean_token_accuracy": 0.17055218815803527, "num_tokens": 17049519.0, "step": 9235 }, { "entropy": 5.653487968444824, "epoch": 0.7763074984247007, "grad_norm": 1.0546875, "learning_rate": 0.0004946104986150167, "loss": 5.3967, "mean_token_accuracy": 0.1756555050611496, "num_tokens": 17058042.0, "step": 9240 }, { "entropy": 5.6883646011352536, "epoch": 0.7767275782398656, "grad_norm": 1.046875, "learning_rate": 0.0004946039814599099, "loss": 5.4105, "mean_token_accuracy": 0.17587194144725798, "num_tokens": 17067107.0, "step": 9245 }, { "entropy": 5.6508525848388675, "epoch": 0.7771476580550305, "grad_norm": 0.9921875, "learning_rate": 0.0004945974604146316, "loss": 5.5087, "mean_token_accuracy": 0.17771736830472945, "num_tokens": 17076975.0, "step": 9250 }, { "entropy": 5.6237695693969725, "epoch": 0.7775677378701953, "grad_norm": 0.9296875, "learning_rate": 0.0004945909354792974, "loss": 5.3663, "mean_token_accuracy": 0.17659880518913268, "num_tokens": 17086405.0, "step": 9255 }, { "entropy": 5.692893171310425, "epoch": 0.7779878176853602, "grad_norm": 1.015625, "learning_rate": 0.0004945844066540229, "loss": 5.4305, "mean_token_accuracy": 0.17194305509328842, "num_tokens": 17095333.0, "step": 9260 }, { "entropy": 5.725101661682129, "epoch": 0.7784078975005251, "grad_norm": 1.015625, "learning_rate": 0.0004945778739389236, "loss": 5.473, "mean_token_accuracy": 0.16830769330263137, "num_tokens": 17103631.0, "step": 9265 }, { "entropy": 5.69491868019104, "epoch": 0.77882797731569, "grad_norm": 0.94921875, "learning_rate": 0.0004945713373341152, "loss": 5.3804, "mean_token_accuracy": 0.16958286464214326, "num_tokens": 17112612.0, "step": 9270 }, { "entropy": 5.71229772567749, "epoch": 0.7792480571308549, "grad_norm": 0.94140625, "learning_rate": 0.0004945647968397139, "loss": 5.4127, "mean_token_accuracy": 0.16797387152910231, "num_tokens": 17121592.0, "step": 9275 }, { "entropy": 5.642221212387085, "epoch": 0.7796681369460198, "grad_norm": 0.98828125, "learning_rate": 0.0004945582524558352, "loss": 5.4148, "mean_token_accuracy": 0.17645399272441864, "num_tokens": 17131003.0, "step": 9280 }, { "entropy": 5.6932426452636715, "epoch": 0.7800882167611847, "grad_norm": 0.96875, "learning_rate": 0.000494551704182595, "loss": 5.4247, "mean_token_accuracy": 0.1688477948307991, "num_tokens": 17140013.0, "step": 9285 }, { "entropy": 5.773512697219848, "epoch": 0.7805082965763495, "grad_norm": 0.9453125, "learning_rate": 0.0004945451520201095, "loss": 5.5571, "mean_token_accuracy": 0.15975792109966278, "num_tokens": 17150406.0, "step": 9290 }, { "entropy": 5.720021438598633, "epoch": 0.7809283763915144, "grad_norm": 0.9296875, "learning_rate": 0.0004945385959684947, "loss": 5.4174, "mean_token_accuracy": 0.172013159096241, "num_tokens": 17159757.0, "step": 9295 }, { "entropy": 5.721838426589966, "epoch": 0.7813484562066793, "grad_norm": 0.90625, "learning_rate": 0.0004945320360278667, "loss": 5.4373, "mean_token_accuracy": 0.1781976416707039, "num_tokens": 17169317.0, "step": 9300 }, { "entropy": 5.723817920684814, "epoch": 0.7817685360218442, "grad_norm": 1.34375, "learning_rate": 0.0004945254721983416, "loss": 5.4298, "mean_token_accuracy": 0.1794099360704422, "num_tokens": 17178410.0, "step": 9305 }, { "entropy": 5.652346086502075, "epoch": 0.782188615837009, "grad_norm": 1.0, "learning_rate": 0.000494518904480036, "loss": 5.3687, "mean_token_accuracy": 0.1771707609295845, "num_tokens": 17186922.0, "step": 9310 }, { "entropy": 5.725099563598633, "epoch": 0.782608695652174, "grad_norm": 1.0078125, "learning_rate": 0.0004945123328730659, "loss": 5.4633, "mean_token_accuracy": 0.1683615952730179, "num_tokens": 17197125.0, "step": 9315 }, { "entropy": 5.631491565704346, "epoch": 0.7830287754673388, "grad_norm": 1.0078125, "learning_rate": 0.000494505757377548, "loss": 5.3865, "mean_token_accuracy": 0.16576175540685653, "num_tokens": 17206169.0, "step": 9320 }, { "entropy": 5.567856502532959, "epoch": 0.7834488552825036, "grad_norm": 0.98046875, "learning_rate": 0.0004944991779935985, "loss": 5.3104, "mean_token_accuracy": 0.17814598530530928, "num_tokens": 17214607.0, "step": 9325 }, { "entropy": 5.618004560470581, "epoch": 0.7838689350976685, "grad_norm": 1.0, "learning_rate": 0.000494492594721334, "loss": 5.2968, "mean_token_accuracy": 0.17776899933815002, "num_tokens": 17223616.0, "step": 9330 }, { "entropy": 5.717865657806397, "epoch": 0.7842890149128334, "grad_norm": 0.98828125, "learning_rate": 0.0004944860075608715, "loss": 5.3919, "mean_token_accuracy": 0.1704510435461998, "num_tokens": 17232729.0, "step": 9335 }, { "entropy": 5.664163494110108, "epoch": 0.7847090947279983, "grad_norm": 0.9921875, "learning_rate": 0.0004944794165123272, "loss": 5.4389, "mean_token_accuracy": 0.17305568903684615, "num_tokens": 17242128.0, "step": 9340 }, { "entropy": 5.6507659435272215, "epoch": 0.7851291745431632, "grad_norm": 1.0234375, "learning_rate": 0.000494472821575818, "loss": 5.3466, "mean_token_accuracy": 0.1746900722384453, "num_tokens": 17250806.0, "step": 9345 }, { "entropy": 5.771410274505615, "epoch": 0.7855492543583281, "grad_norm": 0.91796875, "learning_rate": 0.0004944662227514609, "loss": 5.5973, "mean_token_accuracy": 0.1623526006937027, "num_tokens": 17260888.0, "step": 9350 }, { "entropy": 5.636618947982788, "epoch": 0.785969334173493, "grad_norm": 0.88671875, "learning_rate": 0.0004944596200393726, "loss": 5.35, "mean_token_accuracy": 0.1740969493985176, "num_tokens": 17270387.0, "step": 9355 }, { "entropy": 5.668696594238281, "epoch": 0.7863894139886578, "grad_norm": 0.93359375, "learning_rate": 0.0004944530134396702, "loss": 5.3817, "mean_token_accuracy": 0.17400230541825296, "num_tokens": 17279866.0, "step": 9360 }, { "entropy": 5.594512128829956, "epoch": 0.7868094938038227, "grad_norm": 1.0, "learning_rate": 0.0004944464029524707, "loss": 5.3741, "mean_token_accuracy": 0.17609733641147612, "num_tokens": 17289233.0, "step": 9365 }, { "entropy": 5.7164270877838135, "epoch": 0.7872295736189876, "grad_norm": 0.98828125, "learning_rate": 0.000494439788577891, "loss": 5.4587, "mean_token_accuracy": 0.16937633454799653, "num_tokens": 17298705.0, "step": 9370 }, { "entropy": 5.700407218933106, "epoch": 0.7876496534341525, "grad_norm": 1.015625, "learning_rate": 0.0004944331703160486, "loss": 5.4213, "mean_token_accuracy": 0.17114079892635345, "num_tokens": 17307793.0, "step": 9375 }, { "entropy": 5.637381267547608, "epoch": 0.7880697332493174, "grad_norm": 0.98828125, "learning_rate": 0.0004944265481670605, "loss": 5.4801, "mean_token_accuracy": 0.16938104182481767, "num_tokens": 17318248.0, "step": 9380 }, { "entropy": 5.712375593185425, "epoch": 0.7884898130644823, "grad_norm": 0.97265625, "learning_rate": 0.0004944199221310441, "loss": 5.3981, "mean_token_accuracy": 0.16927841305732727, "num_tokens": 17327281.0, "step": 9385 }, { "entropy": 5.70103087425232, "epoch": 0.7889098928796471, "grad_norm": 0.96484375, "learning_rate": 0.0004944132922081168, "loss": 5.3907, "mean_token_accuracy": 0.17860170900821687, "num_tokens": 17336805.0, "step": 9390 }, { "entropy": 5.641070079803467, "epoch": 0.789329972694812, "grad_norm": 0.9375, "learning_rate": 0.0004944066583983961, "loss": 5.3746, "mean_token_accuracy": 0.17205886095762252, "num_tokens": 17346024.0, "step": 9395 }, { "entropy": 5.695710468292236, "epoch": 0.7897500525099769, "grad_norm": 1.078125, "learning_rate": 0.0004944000207019992, "loss": 5.4554, "mean_token_accuracy": 0.1663645848631859, "num_tokens": 17355100.0, "step": 9400 }, { "entropy": 5.7563300132751465, "epoch": 0.7901701323251418, "grad_norm": 0.890625, "learning_rate": 0.0004943933791190441, "loss": 5.4802, "mean_token_accuracy": 0.16857121735811234, "num_tokens": 17364769.0, "step": 9405 }, { "entropy": 5.674240827560425, "epoch": 0.7905902121403067, "grad_norm": 0.97265625, "learning_rate": 0.0004943867336496482, "loss": 5.3386, "mean_token_accuracy": 0.17411471754312516, "num_tokens": 17374082.0, "step": 9410 }, { "entropy": 5.630914258956909, "epoch": 0.7910102919554716, "grad_norm": 0.8984375, "learning_rate": 0.0004943800842939293, "loss": 5.3914, "mean_token_accuracy": 0.1749129608273506, "num_tokens": 17383570.0, "step": 9415 }, { "entropy": 5.6687126636505125, "epoch": 0.7914303717706365, "grad_norm": 1.0, "learning_rate": 0.000494373431052005, "loss": 5.378, "mean_token_accuracy": 0.17296216636896133, "num_tokens": 17392105.0, "step": 9420 }, { "entropy": 5.600577068328858, "epoch": 0.7918504515858013, "grad_norm": 1.0234375, "learning_rate": 0.0004943667739239935, "loss": 5.3472, "mean_token_accuracy": 0.17239360362291337, "num_tokens": 17401363.0, "step": 9425 }, { "entropy": 5.712628173828125, "epoch": 0.7922705314009661, "grad_norm": 0.96484375, "learning_rate": 0.0004943601129100125, "loss": 5.3737, "mean_token_accuracy": 0.17288942486047745, "num_tokens": 17411333.0, "step": 9430 }, { "entropy": 5.6918822765350345, "epoch": 0.792690611216131, "grad_norm": 1.0234375, "learning_rate": 0.0004943534480101801, "loss": 5.4239, "mean_token_accuracy": 0.1769603595137596, "num_tokens": 17421162.0, "step": 9435 }, { "entropy": 5.650821733474731, "epoch": 0.793110691031296, "grad_norm": 0.9609375, "learning_rate": 0.0004943467792246142, "loss": 5.3695, "mean_token_accuracy": 0.17250554263591766, "num_tokens": 17430119.0, "step": 9440 }, { "entropy": 5.6431236743927, "epoch": 0.7935307708464608, "grad_norm": 1.0625, "learning_rate": 0.0004943401065534332, "loss": 5.3832, "mean_token_accuracy": 0.17289817184209824, "num_tokens": 17439617.0, "step": 9445 }, { "entropy": 5.6074810981750485, "epoch": 0.7939508506616257, "grad_norm": 0.953125, "learning_rate": 0.0004943334299967551, "loss": 5.5164, "mean_token_accuracy": 0.1691092312335968, "num_tokens": 17448720.0, "step": 9450 }, { "entropy": 5.643406248092651, "epoch": 0.7943709304767906, "grad_norm": 0.90234375, "learning_rate": 0.0004943267495546982, "loss": 5.3735, "mean_token_accuracy": 0.17951832860708236, "num_tokens": 17457458.0, "step": 9455 }, { "entropy": 5.729147338867188, "epoch": 0.7947910102919554, "grad_norm": 0.90234375, "learning_rate": 0.0004943200652273809, "loss": 5.4062, "mean_token_accuracy": 0.17448301315307618, "num_tokens": 17467095.0, "step": 9460 }, { "entropy": 5.666112852096558, "epoch": 0.7952110901071203, "grad_norm": 1.0859375, "learning_rate": 0.0004943133770149216, "loss": 5.4427, "mean_token_accuracy": 0.16660526692867278, "num_tokens": 17476247.0, "step": 9465 }, { "entropy": 5.716661071777343, "epoch": 0.7956311699222852, "grad_norm": 0.8984375, "learning_rate": 0.0004943066849174386, "loss": 5.4292, "mean_token_accuracy": 0.17646256387233733, "num_tokens": 17486352.0, "step": 9470 }, { "entropy": 5.712063312530518, "epoch": 0.7960512497374501, "grad_norm": 0.953125, "learning_rate": 0.0004942999889350508, "loss": 5.4027, "mean_token_accuracy": 0.1734911397099495, "num_tokens": 17495633.0, "step": 9475 }, { "entropy": 5.705736541748047, "epoch": 0.796471329552615, "grad_norm": 0.9375, "learning_rate": 0.0004942932890678765, "loss": 5.4501, "mean_token_accuracy": 0.16955653727054595, "num_tokens": 17504325.0, "step": 9480 }, { "entropy": 5.712059164047242, "epoch": 0.7968914093677799, "grad_norm": 1.09375, "learning_rate": 0.0004942865853160346, "loss": 5.4562, "mean_token_accuracy": 0.16996009796857833, "num_tokens": 17513265.0, "step": 9485 }, { "entropy": 5.7019891262054445, "epoch": 0.7973114891829448, "grad_norm": 0.9609375, "learning_rate": 0.0004942798776796436, "loss": 5.4773, "mean_token_accuracy": 0.16690149605274202, "num_tokens": 17522939.0, "step": 9490 }, { "entropy": 5.746654319763183, "epoch": 0.7977315689981096, "grad_norm": 1.09375, "learning_rate": 0.0004942731661588226, "loss": 5.4931, "mean_token_accuracy": 0.16729601174592973, "num_tokens": 17532250.0, "step": 9495 }, { "entropy": 5.792857694625854, "epoch": 0.7981516488132745, "grad_norm": 0.9140625, "learning_rate": 0.0004942664507536904, "loss": 5.5005, "mean_token_accuracy": 0.1796313926577568, "num_tokens": 17541368.0, "step": 9500 }, { "entropy": 5.658935594558716, "epoch": 0.7985717286284394, "grad_norm": 1.03125, "learning_rate": 0.0004942597314643659, "loss": 5.4308, "mean_token_accuracy": 0.1705750435590744, "num_tokens": 17550871.0, "step": 9505 }, { "entropy": 5.671110582351685, "epoch": 0.7989918084436043, "grad_norm": 0.99609375, "learning_rate": 0.0004942530082909681, "loss": 5.3488, "mean_token_accuracy": 0.1821463704109192, "num_tokens": 17559683.0, "step": 9510 }, { "entropy": 5.663429307937622, "epoch": 0.7994118882587692, "grad_norm": 0.984375, "learning_rate": 0.0004942462812336163, "loss": 5.3657, "mean_token_accuracy": 0.17269751131534578, "num_tokens": 17568877.0, "step": 9515 }, { "entropy": 5.799855709075928, "epoch": 0.7998319680739341, "grad_norm": 1.046875, "learning_rate": 0.0004942395502924293, "loss": 5.5168, "mean_token_accuracy": 0.16290245950222015, "num_tokens": 17578202.0, "step": 9520 }, { "entropy": 5.667945051193238, "epoch": 0.800252047889099, "grad_norm": 1.015625, "learning_rate": 0.0004942328154675268, "loss": 5.3608, "mean_token_accuracy": 0.17819298058748245, "num_tokens": 17587342.0, "step": 9525 }, { "entropy": 5.60935788154602, "epoch": 0.8006721277042638, "grad_norm": 1.0546875, "learning_rate": 0.0004942260767590277, "loss": 5.2463, "mean_token_accuracy": 0.17922573685646057, "num_tokens": 17595671.0, "step": 9530 }, { "entropy": 5.69202733039856, "epoch": 0.8010922075194287, "grad_norm": 0.99609375, "learning_rate": 0.0004942193341670516, "loss": 5.547, "mean_token_accuracy": 0.16388892084360124, "num_tokens": 17605649.0, "step": 9535 }, { "entropy": 5.740158987045288, "epoch": 0.8015122873345936, "grad_norm": 0.8671875, "learning_rate": 0.0004942125876917178, "loss": 5.428, "mean_token_accuracy": 0.1680595800280571, "num_tokens": 17615286.0, "step": 9540 }, { "entropy": 5.624123573303223, "epoch": 0.8019323671497585, "grad_norm": 0.94921875, "learning_rate": 0.000494205837333146, "loss": 5.4067, "mean_token_accuracy": 0.17107090801000596, "num_tokens": 17624583.0, "step": 9545 }, { "entropy": 5.651376008987427, "epoch": 0.8023524469649234, "grad_norm": 1.03125, "learning_rate": 0.0004941990830914557, "loss": 5.3933, "mean_token_accuracy": 0.174582402408123, "num_tokens": 17633894.0, "step": 9550 }, { "entropy": 5.756941223144532, "epoch": 0.8027725267800883, "grad_norm": 0.9765625, "learning_rate": 0.0004941923249667663, "loss": 5.4906, "mean_token_accuracy": 0.16728989481925965, "num_tokens": 17643172.0, "step": 9555 }, { "entropy": 5.737690401077271, "epoch": 0.803192606595253, "grad_norm": 1.0234375, "learning_rate": 0.0004941855629591979, "loss": 5.365, "mean_token_accuracy": 0.17130153775215148, "num_tokens": 17651901.0, "step": 9560 }, { "entropy": 5.62269434928894, "epoch": 0.8036126864104179, "grad_norm": 0.94921875, "learning_rate": 0.0004941787970688701, "loss": 5.3557, "mean_token_accuracy": 0.18150702863931656, "num_tokens": 17660806.0, "step": 9565 }, { "entropy": 5.668051624298096, "epoch": 0.8040327662255828, "grad_norm": 1.0390625, "learning_rate": 0.0004941720272959027, "loss": 5.4259, "mean_token_accuracy": 0.17730468660593032, "num_tokens": 17669157.0, "step": 9570 }, { "entropy": 5.628180742263794, "epoch": 0.8044528460407477, "grad_norm": 0.91015625, "learning_rate": 0.0004941652536404157, "loss": 5.3344, "mean_token_accuracy": 0.1761705294251442, "num_tokens": 17678664.0, "step": 9575 }, { "entropy": 5.681865787506103, "epoch": 0.8048729258559126, "grad_norm": 0.9765625, "learning_rate": 0.0004941584761025291, "loss": 5.3889, "mean_token_accuracy": 0.17209957987070085, "num_tokens": 17688252.0, "step": 9580 }, { "entropy": 5.639874029159546, "epoch": 0.8052930056710775, "grad_norm": 1.078125, "learning_rate": 0.000494151694682363, "loss": 5.3959, "mean_token_accuracy": 0.17382072359323503, "num_tokens": 17696473.0, "step": 9585 }, { "entropy": 5.655614233016967, "epoch": 0.8057130854862424, "grad_norm": 0.94921875, "learning_rate": 0.0004941449093800374, "loss": 5.4367, "mean_token_accuracy": 0.16963195502758027, "num_tokens": 17706177.0, "step": 9590 }, { "entropy": 5.614628314971924, "epoch": 0.8061331653014072, "grad_norm": 1.0234375, "learning_rate": 0.0004941381201956726, "loss": 5.2762, "mean_token_accuracy": 0.1837581515312195, "num_tokens": 17715355.0, "step": 9595 }, { "entropy": 5.603265762329102, "epoch": 0.8065532451165721, "grad_norm": 0.87890625, "learning_rate": 0.0004941313271293889, "loss": 5.3795, "mean_token_accuracy": 0.1786747917532921, "num_tokens": 17724345.0, "step": 9600 }, { "entropy": 5.635318088531494, "epoch": 0.806973324931737, "grad_norm": 1.015625, "learning_rate": 0.0004941245301813065, "loss": 5.3068, "mean_token_accuracy": 0.18749790042638778, "num_tokens": 17732805.0, "step": 9605 }, { "entropy": 5.628582906723023, "epoch": 0.8073934047469019, "grad_norm": 0.95703125, "learning_rate": 0.0004941177293515459, "loss": 5.3601, "mean_token_accuracy": 0.17732518911361694, "num_tokens": 17741963.0, "step": 9610 }, { "entropy": 5.5815001964569095, "epoch": 0.8078134845620668, "grad_norm": 0.96875, "learning_rate": 0.0004941109246402275, "loss": 5.3704, "mean_token_accuracy": 0.17502357810735703, "num_tokens": 17751858.0, "step": 9615 }, { "entropy": 5.737301635742187, "epoch": 0.8082335643772317, "grad_norm": 0.93359375, "learning_rate": 0.0004941041160474721, "loss": 5.4819, "mean_token_accuracy": 0.172784486413002, "num_tokens": 17761152.0, "step": 9620 }, { "entropy": 5.7025964736938475, "epoch": 0.8086536441923966, "grad_norm": 1.1015625, "learning_rate": 0.0004940973035733999, "loss": 5.4168, "mean_token_accuracy": 0.17396082878112792, "num_tokens": 17770493.0, "step": 9625 }, { "entropy": 5.751491403579712, "epoch": 0.8090737240075614, "grad_norm": 1.0078125, "learning_rate": 0.0004940904872181318, "loss": 5.4209, "mean_token_accuracy": 0.17233685702085494, "num_tokens": 17779871.0, "step": 9630 }, { "entropy": 5.772523689270019, "epoch": 0.8094938038227263, "grad_norm": 1.0859375, "learning_rate": 0.0004940836669817887, "loss": 5.4554, "mean_token_accuracy": 0.17020966857671738, "num_tokens": 17788606.0, "step": 9635 }, { "entropy": 5.6202620506286625, "epoch": 0.8099138836378912, "grad_norm": 0.9921875, "learning_rate": 0.0004940768428644911, "loss": 5.3808, "mean_token_accuracy": 0.17673110961914062, "num_tokens": 17797458.0, "step": 9640 }, { "entropy": 5.547230815887451, "epoch": 0.8103339634530561, "grad_norm": 0.98046875, "learning_rate": 0.0004940700148663601, "loss": 5.3138, "mean_token_accuracy": 0.17491302639245987, "num_tokens": 17806902.0, "step": 9645 }, { "entropy": 5.712979698181153, "epoch": 0.810754043268221, "grad_norm": 0.91796875, "learning_rate": 0.0004940631829875165, "loss": 5.4385, "mean_token_accuracy": 0.16947968900203705, "num_tokens": 17816374.0, "step": 9650 }, { "entropy": 5.68047456741333, "epoch": 0.8111741230833859, "grad_norm": 1.015625, "learning_rate": 0.0004940563472280815, "loss": 5.4252, "mean_token_accuracy": 0.17470954358577728, "num_tokens": 17825267.0, "step": 9655 }, { "entropy": 5.672005701065063, "epoch": 0.8115942028985508, "grad_norm": 0.9765625, "learning_rate": 0.0004940495075881761, "loss": 5.3575, "mean_token_accuracy": 0.17719605565071106, "num_tokens": 17834027.0, "step": 9660 }, { "entropy": 5.637646913528442, "epoch": 0.8120142827137156, "grad_norm": 0.9375, "learning_rate": 0.0004940426640679214, "loss": 5.3568, "mean_token_accuracy": 0.16857334673404695, "num_tokens": 17843587.0, "step": 9665 }, { "entropy": 5.6841387271881105, "epoch": 0.8124343625288805, "grad_norm": 1.0625, "learning_rate": 0.0004940358166674388, "loss": 5.3734, "mean_token_accuracy": 0.17750074714422226, "num_tokens": 17852284.0, "step": 9670 }, { "entropy": 5.7013506412506105, "epoch": 0.8128544423440454, "grad_norm": 1.09375, "learning_rate": 0.0004940289653868494, "loss": 5.3848, "mean_token_accuracy": 0.17865592539310454, "num_tokens": 17860896.0, "step": 9675 }, { "entropy": 5.612197351455689, "epoch": 0.8132745221592103, "grad_norm": 0.8984375, "learning_rate": 0.0004940221102262747, "loss": 5.3734, "mean_token_accuracy": 0.16798353642225267, "num_tokens": 17870796.0, "step": 9680 }, { "entropy": 5.683709669113159, "epoch": 0.8136946019743752, "grad_norm": 0.88671875, "learning_rate": 0.0004940152511858361, "loss": 5.446, "mean_token_accuracy": 0.17569046914577485, "num_tokens": 17880016.0, "step": 9685 }, { "entropy": 5.7279314517974855, "epoch": 0.81411468178954, "grad_norm": 1.0625, "learning_rate": 0.0004940083882656551, "loss": 5.4975, "mean_token_accuracy": 0.17040724009275438, "num_tokens": 17889348.0, "step": 9690 }, { "entropy": 5.667710590362549, "epoch": 0.814534761604705, "grad_norm": 0.921875, "learning_rate": 0.0004940015214658532, "loss": 5.3621, "mean_token_accuracy": 0.17611367851495743, "num_tokens": 17898392.0, "step": 9695 }, { "entropy": 5.7369222164154055, "epoch": 0.8149548414198697, "grad_norm": 1.0078125, "learning_rate": 0.0004939946507865522, "loss": 5.4452, "mean_token_accuracy": 0.1715082749724388, "num_tokens": 17907141.0, "step": 9700 }, { "entropy": 5.6773934841156, "epoch": 0.8153749212350346, "grad_norm": 1.0234375, "learning_rate": 0.0004939877762278737, "loss": 5.3374, "mean_token_accuracy": 0.17571880370378495, "num_tokens": 17915792.0, "step": 9705 }, { "entropy": 5.693300676345825, "epoch": 0.8157950010501995, "grad_norm": 0.984375, "learning_rate": 0.0004939808977899396, "loss": 5.4848, "mean_token_accuracy": 0.16464592069387435, "num_tokens": 17925603.0, "step": 9710 }, { "entropy": 5.679123973846435, "epoch": 0.8162150808653644, "grad_norm": 1.0390625, "learning_rate": 0.0004939740154728716, "loss": 5.4281, "mean_token_accuracy": 0.17061283215880393, "num_tokens": 17934436.0, "step": 9715 }, { "entropy": 5.694638204574585, "epoch": 0.8166351606805293, "grad_norm": 1.171875, "learning_rate": 0.0004939671292767915, "loss": 5.3757, "mean_token_accuracy": 0.18426200300455092, "num_tokens": 17942969.0, "step": 9720 }, { "entropy": 5.741085481643677, "epoch": 0.8170552404956942, "grad_norm": 0.9609375, "learning_rate": 0.0004939602392018216, "loss": 5.4741, "mean_token_accuracy": 0.16958497911691667, "num_tokens": 17952053.0, "step": 9725 }, { "entropy": 5.683176898956299, "epoch": 0.817475320310859, "grad_norm": 1.0390625, "learning_rate": 0.0004939533452480839, "loss": 5.4302, "mean_token_accuracy": 0.17523318082094191, "num_tokens": 17960707.0, "step": 9730 }, { "entropy": 5.739932584762573, "epoch": 0.8178954001260239, "grad_norm": 1.09375, "learning_rate": 0.0004939464474157003, "loss": 5.5009, "mean_token_accuracy": 0.16142288893461226, "num_tokens": 17971035.0, "step": 9735 }, { "entropy": 5.691849613189698, "epoch": 0.8183154799411888, "grad_norm": 1.0, "learning_rate": 0.0004939395457047932, "loss": 5.3844, "mean_token_accuracy": 0.16710171103477478, "num_tokens": 17980656.0, "step": 9740 }, { "entropy": 5.74945387840271, "epoch": 0.8187355597563537, "grad_norm": 0.9296875, "learning_rate": 0.0004939326401154847, "loss": 5.4101, "mean_token_accuracy": 0.1662217080593109, "num_tokens": 17990977.0, "step": 9745 }, { "entropy": 5.6544195175170895, "epoch": 0.8191556395715186, "grad_norm": 1.0390625, "learning_rate": 0.0004939257306478973, "loss": 5.417, "mean_token_accuracy": 0.17341124564409255, "num_tokens": 18000186.0, "step": 9750 }, { "entropy": 5.658962821960449, "epoch": 0.8195757193866835, "grad_norm": 0.92578125, "learning_rate": 0.0004939188173021532, "loss": 5.3835, "mean_token_accuracy": 0.17590223550796508, "num_tokens": 18010269.0, "step": 9755 }, { "entropy": 5.665806579589844, "epoch": 0.8199957992018484, "grad_norm": 1.109375, "learning_rate": 0.0004939119000783751, "loss": 5.3359, "mean_token_accuracy": 0.18055529445409774, "num_tokens": 18018461.0, "step": 9760 }, { "entropy": 5.642783117294312, "epoch": 0.8204158790170132, "grad_norm": 1.0625, "learning_rate": 0.0004939049789766855, "loss": 5.3346, "mean_token_accuracy": 0.17124349772930145, "num_tokens": 18027173.0, "step": 9765 }, { "entropy": 5.655908679962158, "epoch": 0.8208359588321781, "grad_norm": 0.984375, "learning_rate": 0.0004938980539972068, "loss": 5.4516, "mean_token_accuracy": 0.17004227191209792, "num_tokens": 18036791.0, "step": 9770 }, { "entropy": 5.636841058731079, "epoch": 0.821256038647343, "grad_norm": 0.984375, "learning_rate": 0.0004938911251400617, "loss": 5.3929, "mean_token_accuracy": 0.17712161391973497, "num_tokens": 18046908.0, "step": 9775 }, { "entropy": 5.639864301681518, "epoch": 0.8216761184625079, "grad_norm": 0.9296875, "learning_rate": 0.0004938841924053731, "loss": 5.3136, "mean_token_accuracy": 0.18543954640626908, "num_tokens": 18055825.0, "step": 9780 }, { "entropy": 5.77684268951416, "epoch": 0.8220961982776728, "grad_norm": 1.0234375, "learning_rate": 0.0004938772557932637, "loss": 5.45, "mean_token_accuracy": 0.1684145912528038, "num_tokens": 18065334.0, "step": 9785 }, { "entropy": 5.690200519561768, "epoch": 0.8225162780928377, "grad_norm": 0.9609375, "learning_rate": 0.0004938703153038565, "loss": 5.3402, "mean_token_accuracy": 0.17953094094991684, "num_tokens": 18073999.0, "step": 9790 }, { "entropy": 5.615552663803101, "epoch": 0.8229363579080026, "grad_norm": 0.953125, "learning_rate": 0.0004938633709372744, "loss": 5.3765, "mean_token_accuracy": 0.17298656851053237, "num_tokens": 18083665.0, "step": 9795 }, { "entropy": 5.665377140045166, "epoch": 0.8233564377231674, "grad_norm": 0.99609375, "learning_rate": 0.0004938564226936403, "loss": 5.3777, "mean_token_accuracy": 0.17178147584199904, "num_tokens": 18092501.0, "step": 9800 }, { "entropy": 5.634387683868408, "epoch": 0.8237765175383323, "grad_norm": 0.9921875, "learning_rate": 0.0004938494705730773, "loss": 5.3701, "mean_token_accuracy": 0.17193470448255538, "num_tokens": 18101320.0, "step": 9805 }, { "entropy": 5.626653051376342, "epoch": 0.8241965973534972, "grad_norm": 0.9375, "learning_rate": 0.0004938425145757087, "loss": 5.404, "mean_token_accuracy": 0.16800276488065718, "num_tokens": 18110190.0, "step": 9810 }, { "entropy": 5.6710772037506105, "epoch": 0.824616677168662, "grad_norm": 0.921875, "learning_rate": 0.0004938355547016577, "loss": 5.3732, "mean_token_accuracy": 0.17679596990346907, "num_tokens": 18119301.0, "step": 9815 }, { "entropy": 5.773650026321411, "epoch": 0.825036756983827, "grad_norm": 0.890625, "learning_rate": 0.0004938285909510474, "loss": 5.4424, "mean_token_accuracy": 0.16882517337799072, "num_tokens": 18128959.0, "step": 9820 }, { "entropy": 5.685972213745117, "epoch": 0.8254568367989918, "grad_norm": 0.96484375, "learning_rate": 0.0004938216233240014, "loss": 5.4202, "mean_token_accuracy": 0.17675695270299913, "num_tokens": 18138156.0, "step": 9825 }, { "entropy": 5.7510429382324215, "epoch": 0.8258769166141567, "grad_norm": 1.09375, "learning_rate": 0.000493814651820643, "loss": 5.4371, "mean_token_accuracy": 0.17143779397010803, "num_tokens": 18147244.0, "step": 9830 }, { "entropy": 5.71959228515625, "epoch": 0.8262969964293215, "grad_norm": 1.0, "learning_rate": 0.0004938076764410956, "loss": 5.437, "mean_token_accuracy": 0.16952096670866013, "num_tokens": 18156040.0, "step": 9835 }, { "entropy": 5.6597912311553955, "epoch": 0.8267170762444864, "grad_norm": 1.0546875, "learning_rate": 0.000493800697185483, "loss": 5.3655, "mean_token_accuracy": 0.17140455543994904, "num_tokens": 18165210.0, "step": 9840 }, { "entropy": 5.730273342132568, "epoch": 0.8271371560596513, "grad_norm": 0.9453125, "learning_rate": 0.0004937937140539288, "loss": 5.4458, "mean_token_accuracy": 0.17019027918577195, "num_tokens": 18174841.0, "step": 9845 }, { "entropy": 5.632910060882568, "epoch": 0.8275572358748162, "grad_norm": 1.0234375, "learning_rate": 0.0004937867270465564, "loss": 5.3141, "mean_token_accuracy": 0.17400020807981492, "num_tokens": 18184112.0, "step": 9850 }, { "entropy": 5.716655921936035, "epoch": 0.8279773156899811, "grad_norm": 0.96484375, "learning_rate": 0.0004937797361634899, "loss": 5.5002, "mean_token_accuracy": 0.16874652355909348, "num_tokens": 18193564.0, "step": 9855 }, { "entropy": 5.611610460281372, "epoch": 0.828397395505146, "grad_norm": 0.9375, "learning_rate": 0.000493772741404853, "loss": 5.2407, "mean_token_accuracy": 0.18054767847061157, "num_tokens": 18202836.0, "step": 9860 }, { "entropy": 5.683074474334717, "epoch": 0.8288174753203108, "grad_norm": 0.9375, "learning_rate": 0.0004937657427707698, "loss": 5.3623, "mean_token_accuracy": 0.18435101211071014, "num_tokens": 18212098.0, "step": 9865 }, { "entropy": 5.6672765731811525, "epoch": 0.8292375551354757, "grad_norm": 0.97265625, "learning_rate": 0.0004937587402613639, "loss": 5.4042, "mean_token_accuracy": 0.17455776929855346, "num_tokens": 18221541.0, "step": 9870 }, { "entropy": 5.645079851150513, "epoch": 0.8296576349506406, "grad_norm": 1.0703125, "learning_rate": 0.0004937517338767597, "loss": 5.3999, "mean_token_accuracy": 0.1701893076300621, "num_tokens": 18231015.0, "step": 9875 }, { "entropy": 5.732269382476806, "epoch": 0.8300777147658055, "grad_norm": 1.078125, "learning_rate": 0.0004937447236170811, "loss": 5.4257, "mean_token_accuracy": 0.172461798787117, "num_tokens": 18239729.0, "step": 9880 }, { "entropy": 5.719193649291992, "epoch": 0.8304977945809704, "grad_norm": 1.0, "learning_rate": 0.0004937377094824523, "loss": 5.4551, "mean_token_accuracy": 0.17185138463973998, "num_tokens": 18249773.0, "step": 9885 }, { "entropy": 5.6862548828125, "epoch": 0.8309178743961353, "grad_norm": 1.046875, "learning_rate": 0.0004937306914729977, "loss": 5.4294, "mean_token_accuracy": 0.17101262509822845, "num_tokens": 18259179.0, "step": 9890 }, { "entropy": 5.597580003738403, "epoch": 0.8313379542113002, "grad_norm": 0.91015625, "learning_rate": 0.0004937236695888416, "loss": 5.3092, "mean_token_accuracy": 0.1850727081298828, "num_tokens": 18268164.0, "step": 9895 }, { "entropy": 5.67072491645813, "epoch": 0.831758034026465, "grad_norm": 1.03125, "learning_rate": 0.0004937166438301082, "loss": 5.4835, "mean_token_accuracy": 0.17437790036201478, "num_tokens": 18276259.0, "step": 9900 }, { "entropy": 5.679338598251343, "epoch": 0.8321781138416299, "grad_norm": 0.99609375, "learning_rate": 0.0004937096141969221, "loss": 5.4607, "mean_token_accuracy": 0.16842739880084992, "num_tokens": 18285729.0, "step": 9905 }, { "entropy": 5.786121082305908, "epoch": 0.8325981936567948, "grad_norm": 1.0078125, "learning_rate": 0.0004937025806894077, "loss": 5.6006, "mean_token_accuracy": 0.15972199216485022, "num_tokens": 18295873.0, "step": 9910 }, { "entropy": 5.7330389499664305, "epoch": 0.8330182734719597, "grad_norm": 0.9375, "learning_rate": 0.0004936955433076899, "loss": 5.4548, "mean_token_accuracy": 0.17373140752315522, "num_tokens": 18305135.0, "step": 9915 }, { "entropy": 5.766932106018066, "epoch": 0.8334383532871246, "grad_norm": 0.92578125, "learning_rate": 0.000493688502051893, "loss": 5.4746, "mean_token_accuracy": 0.1683718182146549, "num_tokens": 18314251.0, "step": 9920 }, { "entropy": 5.648023509979248, "epoch": 0.8338584331022895, "grad_norm": 1.125, "learning_rate": 0.0004936814569221421, "loss": 5.3062, "mean_token_accuracy": 0.1848917841911316, "num_tokens": 18322863.0, "step": 9925 }, { "entropy": 5.617942428588867, "epoch": 0.8342785129174544, "grad_norm": 1.0078125, "learning_rate": 0.0004936744079185616, "loss": 5.3203, "mean_token_accuracy": 0.1726486638188362, "num_tokens": 18332129.0, "step": 9930 }, { "entropy": 5.689961910247803, "epoch": 0.8346985927326191, "grad_norm": 0.9609375, "learning_rate": 0.0004936673550412767, "loss": 5.4305, "mean_token_accuracy": 0.16863320767879486, "num_tokens": 18341457.0, "step": 9935 }, { "entropy": 5.67056131362915, "epoch": 0.835118672547784, "grad_norm": 0.91796875, "learning_rate": 0.000493660298290412, "loss": 5.4006, "mean_token_accuracy": 0.16785395741462708, "num_tokens": 18351397.0, "step": 9940 }, { "entropy": 5.674264669418335, "epoch": 0.8355387523629489, "grad_norm": 0.984375, "learning_rate": 0.0004936532376660929, "loss": 5.3538, "mean_token_accuracy": 0.17242660373449326, "num_tokens": 18360005.0, "step": 9945 }, { "entropy": 5.705958461761474, "epoch": 0.8359588321781138, "grad_norm": 0.9453125, "learning_rate": 0.0004936461731684442, "loss": 5.4323, "mean_token_accuracy": 0.1716474175453186, "num_tokens": 18369707.0, "step": 9950 }, { "entropy": 5.765536737442017, "epoch": 0.8363789119932787, "grad_norm": 1.0, "learning_rate": 0.0004936391047975912, "loss": 5.5547, "mean_token_accuracy": 0.17000313699245453, "num_tokens": 18379514.0, "step": 9955 }, { "entropy": 5.6530413150787355, "epoch": 0.8367989918084436, "grad_norm": 0.90625, "learning_rate": 0.0004936320325536589, "loss": 5.2659, "mean_token_accuracy": 0.1756296768784523, "num_tokens": 18388854.0, "step": 9960 }, { "entropy": 5.690809488296509, "epoch": 0.8372190716236085, "grad_norm": 0.96875, "learning_rate": 0.0004936249564367729, "loss": 5.4482, "mean_token_accuracy": 0.1744495376944542, "num_tokens": 18397806.0, "step": 9965 }, { "entropy": 5.587341928482056, "epoch": 0.8376391514387733, "grad_norm": 1.0078125, "learning_rate": 0.0004936178764470583, "loss": 5.3033, "mean_token_accuracy": 0.17813027948141097, "num_tokens": 18406645.0, "step": 9970 }, { "entropy": 5.598927116394043, "epoch": 0.8380592312539382, "grad_norm": 0.984375, "learning_rate": 0.0004936107925846405, "loss": 5.3166, "mean_token_accuracy": 0.17652271538972855, "num_tokens": 18415730.0, "step": 9975 }, { "entropy": 5.701291227340699, "epoch": 0.8384793110691031, "grad_norm": 0.98828125, "learning_rate": 0.0004936037048496452, "loss": 5.4449, "mean_token_accuracy": 0.17364403009414672, "num_tokens": 18424638.0, "step": 9980 }, { "entropy": 5.641800451278686, "epoch": 0.838899390884268, "grad_norm": 0.96484375, "learning_rate": 0.0004935966132421977, "loss": 5.4767, "mean_token_accuracy": 0.1686299055814743, "num_tokens": 18434090.0, "step": 9985 }, { "entropy": 5.585489130020141, "epoch": 0.8393194706994329, "grad_norm": 0.984375, "learning_rate": 0.0004935895177624239, "loss": 5.3101, "mean_token_accuracy": 0.17633936703205108, "num_tokens": 18442965.0, "step": 9990 }, { "entropy": 5.753833341598511, "epoch": 0.8397395505145978, "grad_norm": 1.09375, "learning_rate": 0.0004935824184104493, "loss": 5.3258, "mean_token_accuracy": 0.18311144560575485, "num_tokens": 18451553.0, "step": 9995 }, { "entropy": 5.610658264160156, "epoch": 0.8401596303297627, "grad_norm": 1.015625, "learning_rate": 0.0004935753151863997, "loss": 5.4077, "mean_token_accuracy": 0.17045740485191346, "num_tokens": 18461325.0, "step": 10000 }, { "entropy": 5.673952293395996, "epoch": 0.8405797101449275, "grad_norm": 1.03125, "learning_rate": 0.0004935682080904009, "loss": 5.392, "mean_token_accuracy": 0.18152427077293395, "num_tokens": 18469977.0, "step": 10005 }, { "entropy": 5.7100663661956785, "epoch": 0.8409997899600924, "grad_norm": 0.96484375, "learning_rate": 0.0004935610971225789, "loss": 5.3463, "mean_token_accuracy": 0.17433822005987168, "num_tokens": 18479534.0, "step": 10010 }, { "entropy": 5.677320671081543, "epoch": 0.8414198697752573, "grad_norm": 1.0625, "learning_rate": 0.0004935539822830597, "loss": 5.4543, "mean_token_accuracy": 0.16725490391254424, "num_tokens": 18488800.0, "step": 10015 }, { "entropy": 5.686785411834717, "epoch": 0.8418399495904222, "grad_norm": 1.09375, "learning_rate": 0.000493546863571969, "loss": 5.4162, "mean_token_accuracy": 0.1751448079943657, "num_tokens": 18498083.0, "step": 10020 }, { "entropy": 5.667457914352417, "epoch": 0.8422600294055871, "grad_norm": 0.96875, "learning_rate": 0.0004935397409894333, "loss": 5.3822, "mean_token_accuracy": 0.16698685437440872, "num_tokens": 18508265.0, "step": 10025 }, { "entropy": 5.683035039901734, "epoch": 0.842680109220752, "grad_norm": 0.98828125, "learning_rate": 0.0004935326145355787, "loss": 5.4065, "mean_token_accuracy": 0.17130510360002518, "num_tokens": 18517283.0, "step": 10030 }, { "entropy": 5.704162645339966, "epoch": 0.8431001890359168, "grad_norm": 0.96875, "learning_rate": 0.0004935254842105311, "loss": 5.4392, "mean_token_accuracy": 0.17427613139152526, "num_tokens": 18526482.0, "step": 10035 }, { "entropy": 5.5832741260528564, "epoch": 0.8435202688510817, "grad_norm": 1.0234375, "learning_rate": 0.0004935183500144173, "loss": 5.2462, "mean_token_accuracy": 0.19439249634742736, "num_tokens": 18536150.0, "step": 10040 }, { "entropy": 5.761940956115723, "epoch": 0.8439403486662466, "grad_norm": 0.96484375, "learning_rate": 0.0004935112119473634, "loss": 5.4701, "mean_token_accuracy": 0.16816117018461227, "num_tokens": 18545168.0, "step": 10045 }, { "entropy": 5.713206243515015, "epoch": 0.8443604284814115, "grad_norm": 1.015625, "learning_rate": 0.0004935040700094959, "loss": 5.4048, "mean_token_accuracy": 0.1740398332476616, "num_tokens": 18553363.0, "step": 10050 }, { "entropy": 5.632670116424561, "epoch": 0.8447805082965764, "grad_norm": 0.98828125, "learning_rate": 0.0004934969242009412, "loss": 5.335, "mean_token_accuracy": 0.18240262418985367, "num_tokens": 18562546.0, "step": 10055 }, { "entropy": 5.643794059753418, "epoch": 0.8452005881117413, "grad_norm": 0.91015625, "learning_rate": 0.0004934897745218262, "loss": 5.412, "mean_token_accuracy": 0.1717964932322502, "num_tokens": 18572149.0, "step": 10060 }, { "entropy": 5.635669326782226, "epoch": 0.8456206679269062, "grad_norm": 1.0078125, "learning_rate": 0.0004934826209722772, "loss": 5.2815, "mean_token_accuracy": 0.18115414083003997, "num_tokens": 18580842.0, "step": 10065 }, { "entropy": 5.675256872177124, "epoch": 0.8460407477420709, "grad_norm": 1.0078125, "learning_rate": 0.0004934754635524211, "loss": 5.3866, "mean_token_accuracy": 0.17737181186676027, "num_tokens": 18589765.0, "step": 10070 }, { "entropy": 5.686821413040161, "epoch": 0.8464608275572358, "grad_norm": 0.953125, "learning_rate": 0.0004934683022623847, "loss": 5.3894, "mean_token_accuracy": 0.17366087436676025, "num_tokens": 18599532.0, "step": 10075 }, { "entropy": 5.601221370697021, "epoch": 0.8468809073724007, "grad_norm": 0.99609375, "learning_rate": 0.0004934611371022947, "loss": 5.3019, "mean_token_accuracy": 0.17929604947566985, "num_tokens": 18608438.0, "step": 10080 }, { "entropy": 5.69543571472168, "epoch": 0.8473009871875656, "grad_norm": 1.125, "learning_rate": 0.0004934539680722783, "loss": 5.4566, "mean_token_accuracy": 0.17062439769506454, "num_tokens": 18617313.0, "step": 10085 }, { "entropy": 5.641908073425293, "epoch": 0.8477210670027305, "grad_norm": 1.015625, "learning_rate": 0.0004934467951724622, "loss": 5.2895, "mean_token_accuracy": 0.18004074692726135, "num_tokens": 18625880.0, "step": 10090 }, { "entropy": 5.646237468719482, "epoch": 0.8481411468178954, "grad_norm": 0.87109375, "learning_rate": 0.0004934396184029737, "loss": 5.3739, "mean_token_accuracy": 0.17587161511182786, "num_tokens": 18635727.0, "step": 10095 }, { "entropy": 5.7043040752410885, "epoch": 0.8485612266330603, "grad_norm": 0.94921875, "learning_rate": 0.0004934324377639398, "loss": 5.4194, "mean_token_accuracy": 0.17132848203182222, "num_tokens": 18645619.0, "step": 10100 }, { "entropy": 5.650476789474487, "epoch": 0.8489813064482251, "grad_norm": 1.09375, "learning_rate": 0.0004934252532554878, "loss": 5.3306, "mean_token_accuracy": 0.1743270680308342, "num_tokens": 18654901.0, "step": 10105 }, { "entropy": 5.79214334487915, "epoch": 0.84940138626339, "grad_norm": 1.421875, "learning_rate": 0.0004934180648777449, "loss": 5.5602, "mean_token_accuracy": 0.16714076846837997, "num_tokens": 18664523.0, "step": 10110 }, { "entropy": 5.732706499099732, "epoch": 0.8498214660785549, "grad_norm": 1.0, "learning_rate": 0.0004934108726308384, "loss": 5.4103, "mean_token_accuracy": 0.16757257729768754, "num_tokens": 18673685.0, "step": 10115 }, { "entropy": 5.653525066375733, "epoch": 0.8502415458937198, "grad_norm": 0.9609375, "learning_rate": 0.0004934036765148958, "loss": 5.3871, "mean_token_accuracy": 0.16773250848054885, "num_tokens": 18682889.0, "step": 10120 }, { "entropy": 5.715424919128418, "epoch": 0.8506616257088847, "grad_norm": 0.921875, "learning_rate": 0.0004933964765300446, "loss": 5.4234, "mean_token_accuracy": 0.16933207511901854, "num_tokens": 18692978.0, "step": 10125 }, { "entropy": 5.673181962966919, "epoch": 0.8510817055240496, "grad_norm": 1.0, "learning_rate": 0.000493389272676412, "loss": 5.3329, "mean_token_accuracy": 0.18106912821531296, "num_tokens": 18701846.0, "step": 10130 }, { "entropy": 5.6389687061309814, "epoch": 0.8515017853392145, "grad_norm": 0.98828125, "learning_rate": 0.0004933820649541262, "loss": 5.3758, "mean_token_accuracy": 0.18400225341320037, "num_tokens": 18711492.0, "step": 10135 }, { "entropy": 5.616599369049072, "epoch": 0.8519218651543793, "grad_norm": 1.03125, "learning_rate": 0.0004933748533633145, "loss": 5.3019, "mean_token_accuracy": 0.18677648305892944, "num_tokens": 18720407.0, "step": 10140 }, { "entropy": 5.609155321121216, "epoch": 0.8523419449695442, "grad_norm": 0.91015625, "learning_rate": 0.0004933676379041045, "loss": 5.363, "mean_token_accuracy": 0.17978433668613433, "num_tokens": 18729968.0, "step": 10145 }, { "entropy": 5.71057243347168, "epoch": 0.8527620247847091, "grad_norm": 0.9140625, "learning_rate": 0.0004933604185766245, "loss": 5.4657, "mean_token_accuracy": 0.1697831556200981, "num_tokens": 18739525.0, "step": 10150 }, { "entropy": 5.706951570510864, "epoch": 0.853182104599874, "grad_norm": 1.03125, "learning_rate": 0.0004933531953810019, "loss": 5.3499, "mean_token_accuracy": 0.1800963342189789, "num_tokens": 18749087.0, "step": 10155 }, { "entropy": 5.738570928573608, "epoch": 0.8536021844150389, "grad_norm": 0.97265625, "learning_rate": 0.0004933459683173652, "loss": 5.4, "mean_token_accuracy": 0.17840160429477692, "num_tokens": 18758174.0, "step": 10160 }, { "entropy": 5.732457017898559, "epoch": 0.8540222642302038, "grad_norm": 1.0625, "learning_rate": 0.0004933387373858418, "loss": 5.4135, "mean_token_accuracy": 0.17395816147327423, "num_tokens": 18767679.0, "step": 10165 }, { "entropy": 5.6596081256866455, "epoch": 0.8544423440453687, "grad_norm": 0.9453125, "learning_rate": 0.0004933315025865602, "loss": 5.3518, "mean_token_accuracy": 0.17111423760652542, "num_tokens": 18776749.0, "step": 10170 }, { "entropy": 5.720188093185425, "epoch": 0.8548624238605335, "grad_norm": 1.015625, "learning_rate": 0.0004933242639196485, "loss": 5.5198, "mean_token_accuracy": 0.16034067794680595, "num_tokens": 18786313.0, "step": 10175 }, { "entropy": 5.790711116790772, "epoch": 0.8552825036756984, "grad_norm": 0.99609375, "learning_rate": 0.0004933170213852348, "loss": 5.413, "mean_token_accuracy": 0.16812869757413865, "num_tokens": 18795340.0, "step": 10180 }, { "entropy": 5.658047342300415, "epoch": 0.8557025834908633, "grad_norm": 0.984375, "learning_rate": 0.0004933097749834476, "loss": 5.3361, "mean_token_accuracy": 0.1710760474205017, "num_tokens": 18804114.0, "step": 10185 }, { "entropy": 5.664382362365723, "epoch": 0.8561226633060282, "grad_norm": 0.92578125, "learning_rate": 0.000493302524714415, "loss": 5.3399, "mean_token_accuracy": 0.1744011804461479, "num_tokens": 18813797.0, "step": 10190 }, { "entropy": 5.689053630828857, "epoch": 0.856542743121193, "grad_norm": 1.0078125, "learning_rate": 0.0004932952705782657, "loss": 5.4006, "mean_token_accuracy": 0.17014651298522948, "num_tokens": 18822410.0, "step": 10195 }, { "entropy": 5.629875612258911, "epoch": 0.856962822936358, "grad_norm": 0.94140625, "learning_rate": 0.000493288012575128, "loss": 5.344, "mean_token_accuracy": 0.18000112026929854, "num_tokens": 18832091.0, "step": 10200 }, { "entropy": 5.665528726577759, "epoch": 0.8573829027515227, "grad_norm": 0.9140625, "learning_rate": 0.0004932807507051307, "loss": 5.3696, "mean_token_accuracy": 0.1682669073343277, "num_tokens": 18841298.0, "step": 10205 }, { "entropy": 5.631195068359375, "epoch": 0.8578029825666876, "grad_norm": 1.078125, "learning_rate": 0.0004932734849684022, "loss": 5.3376, "mean_token_accuracy": 0.1766723573207855, "num_tokens": 18849683.0, "step": 10210 }, { "entropy": 5.644782638549804, "epoch": 0.8582230623818525, "grad_norm": 1.0703125, "learning_rate": 0.0004932662153650712, "loss": 5.2853, "mean_token_accuracy": 0.17846843749284744, "num_tokens": 18858832.0, "step": 10215 }, { "entropy": 5.577565240859985, "epoch": 0.8586431421970174, "grad_norm": 1.0390625, "learning_rate": 0.0004932589418952668, "loss": 5.3166, "mean_token_accuracy": 0.17870389074087142, "num_tokens": 18867652.0, "step": 10220 }, { "entropy": 5.691090250015259, "epoch": 0.8590632220121823, "grad_norm": 0.9453125, "learning_rate": 0.0004932516645591175, "loss": 5.3969, "mean_token_accuracy": 0.17297440618276597, "num_tokens": 18877282.0, "step": 10225 }, { "entropy": 5.701138544082641, "epoch": 0.8594833018273472, "grad_norm": 0.96875, "learning_rate": 0.0004932443833567524, "loss": 5.5111, "mean_token_accuracy": 0.17179336547851562, "num_tokens": 18886565.0, "step": 10230 }, { "entropy": 5.716158056259156, "epoch": 0.8599033816425121, "grad_norm": 0.96875, "learning_rate": 0.0004932370982883003, "loss": 5.4324, "mean_token_accuracy": 0.17571771889925003, "num_tokens": 18896440.0, "step": 10235 }, { "entropy": 5.717927169799805, "epoch": 0.8603234614576769, "grad_norm": 0.9609375, "learning_rate": 0.0004932298093538905, "loss": 5.4729, "mean_token_accuracy": 0.17269988656044005, "num_tokens": 18906246.0, "step": 10240 }, { "entropy": 5.629774570465088, "epoch": 0.8607435412728418, "grad_norm": 0.9609375, "learning_rate": 0.000493222516553652, "loss": 5.3792, "mean_token_accuracy": 0.1674125984311104, "num_tokens": 18915108.0, "step": 10245 }, { "entropy": 5.680619382858277, "epoch": 0.8611636210880067, "grad_norm": 1.0390625, "learning_rate": 0.0004932152198877139, "loss": 5.3707, "mean_token_accuracy": 0.17601692378520967, "num_tokens": 18923664.0, "step": 10250 }, { "entropy": 5.6753013134002686, "epoch": 0.8615837009031716, "grad_norm": 0.9609375, "learning_rate": 0.0004932079193562057, "loss": 5.4758, "mean_token_accuracy": 0.1755344331264496, "num_tokens": 18933496.0, "step": 10255 }, { "entropy": 5.648006105422974, "epoch": 0.8620037807183365, "grad_norm": 0.97265625, "learning_rate": 0.0004932006149592564, "loss": 5.3267, "mean_token_accuracy": 0.1777518406510353, "num_tokens": 18942222.0, "step": 10260 }, { "entropy": 5.727283143997193, "epoch": 0.8624238605335014, "grad_norm": 0.96484375, "learning_rate": 0.0004931933066969957, "loss": 5.358, "mean_token_accuracy": 0.17596026062965392, "num_tokens": 18952057.0, "step": 10265 }, { "entropy": 5.678538227081299, "epoch": 0.8628439403486663, "grad_norm": 0.97265625, "learning_rate": 0.0004931859945695528, "loss": 5.4122, "mean_token_accuracy": 0.16999841332435608, "num_tokens": 18961664.0, "step": 10270 }, { "entropy": 5.55914340019226, "epoch": 0.8632640201638311, "grad_norm": 1.0234375, "learning_rate": 0.0004931786785770575, "loss": 5.1972, "mean_token_accuracy": 0.19336993843317032, "num_tokens": 18969900.0, "step": 10275 }, { "entropy": 5.68212857246399, "epoch": 0.863684099978996, "grad_norm": 1.0078125, "learning_rate": 0.0004931713587196392, "loss": 5.4862, "mean_token_accuracy": 0.16879372000694276, "num_tokens": 18979286.0, "step": 10280 }, { "entropy": 5.765684461593628, "epoch": 0.8641041797941609, "grad_norm": 0.93359375, "learning_rate": 0.0004931640349974275, "loss": 5.3922, "mean_token_accuracy": 0.17158469334244728, "num_tokens": 18987553.0, "step": 10285 }, { "entropy": 5.713182687759399, "epoch": 0.8645242596093258, "grad_norm": 0.9765625, "learning_rate": 0.0004931567074105524, "loss": 5.4559, "mean_token_accuracy": 0.1700283855199814, "num_tokens": 18996354.0, "step": 10290 }, { "entropy": 5.617878055572509, "epoch": 0.8649443394244907, "grad_norm": 0.96875, "learning_rate": 0.0004931493759591435, "loss": 5.3414, "mean_token_accuracy": 0.17417115867137908, "num_tokens": 19005150.0, "step": 10295 }, { "entropy": 5.703291225433349, "epoch": 0.8653644192396556, "grad_norm": 0.96484375, "learning_rate": 0.0004931420406433308, "loss": 5.3743, "mean_token_accuracy": 0.16856130808591843, "num_tokens": 19014572.0, "step": 10300 }, { "entropy": 5.656052541732788, "epoch": 0.8657844990548205, "grad_norm": 1.03125, "learning_rate": 0.000493134701463244, "loss": 5.2214, "mean_token_accuracy": 0.1826751485466957, "num_tokens": 19023462.0, "step": 10305 }, { "entropy": 5.603452110290528, "epoch": 0.8662045788699853, "grad_norm": 0.9921875, "learning_rate": 0.0004931273584190135, "loss": 5.3087, "mean_token_accuracy": 0.18033095747232436, "num_tokens": 19032460.0, "step": 10310 }, { "entropy": 5.622628927230835, "epoch": 0.8666246586851502, "grad_norm": 0.97265625, "learning_rate": 0.0004931200115107691, "loss": 5.3639, "mean_token_accuracy": 0.17935606837272644, "num_tokens": 19041734.0, "step": 10315 }, { "entropy": 5.665598297119141, "epoch": 0.867044738500315, "grad_norm": 1.03125, "learning_rate": 0.000493112660738641, "loss": 5.3308, "mean_token_accuracy": 0.17009156942367554, "num_tokens": 19050867.0, "step": 10320 }, { "entropy": 5.628075218200683, "epoch": 0.86746481831548, "grad_norm": 1.0234375, "learning_rate": 0.0004931053061027594, "loss": 5.3359, "mean_token_accuracy": 0.17076550424098969, "num_tokens": 19060518.0, "step": 10325 }, { "entropy": 5.653892850875854, "epoch": 0.8678848981306448, "grad_norm": 1.0078125, "learning_rate": 0.0004930979476032546, "loss": 5.3318, "mean_token_accuracy": 0.1787944793701172, "num_tokens": 19069588.0, "step": 10330 }, { "entropy": 5.6787598609924315, "epoch": 0.8683049779458097, "grad_norm": 1.0078125, "learning_rate": 0.000493090585240257, "loss": 5.3741, "mean_token_accuracy": 0.16588689833879472, "num_tokens": 19079060.0, "step": 10335 }, { "entropy": 5.634321069717407, "epoch": 0.8687250577609746, "grad_norm": 0.9296875, "learning_rate": 0.0004930832190138969, "loss": 5.2979, "mean_token_accuracy": 0.17264353334903718, "num_tokens": 19087721.0, "step": 10340 }, { "entropy": 5.63018684387207, "epoch": 0.8691451375761394, "grad_norm": 0.9765625, "learning_rate": 0.000493075848924305, "loss": 5.3449, "mean_token_accuracy": 0.1799220323562622, "num_tokens": 19096800.0, "step": 10345 }, { "entropy": 5.673685550689697, "epoch": 0.8695652173913043, "grad_norm": 0.91796875, "learning_rate": 0.0004930684749716117, "loss": 5.4094, "mean_token_accuracy": 0.17281698882579805, "num_tokens": 19106774.0, "step": 10350 }, { "entropy": 5.688310146331787, "epoch": 0.8699852972064692, "grad_norm": 0.87890625, "learning_rate": 0.0004930610971559476, "loss": 5.3351, "mean_token_accuracy": 0.1781072050333023, "num_tokens": 19116413.0, "step": 10355 }, { "entropy": 5.695785903930664, "epoch": 0.8704053770216341, "grad_norm": 1.0234375, "learning_rate": 0.0004930537154774436, "loss": 5.3729, "mean_token_accuracy": 0.17046389281749724, "num_tokens": 19125363.0, "step": 10360 }, { "entropy": 5.711740732192993, "epoch": 0.870825456836799, "grad_norm": 0.94921875, "learning_rate": 0.0004930463299362302, "loss": 5.4541, "mean_token_accuracy": 0.16621775925159454, "num_tokens": 19135461.0, "step": 10365 }, { "entropy": 5.695140600204468, "epoch": 0.8712455366519639, "grad_norm": 1.03125, "learning_rate": 0.0004930389405324383, "loss": 5.3204, "mean_token_accuracy": 0.18364957422018052, "num_tokens": 19144085.0, "step": 10370 }, { "entropy": 5.679112386703491, "epoch": 0.8716656164671287, "grad_norm": 1.0078125, "learning_rate": 0.0004930315472661987, "loss": 5.3457, "mean_token_accuracy": 0.17913586497306824, "num_tokens": 19153291.0, "step": 10375 }, { "entropy": 5.632014751434326, "epoch": 0.8720856962822936, "grad_norm": 0.94140625, "learning_rate": 0.0004930241501376428, "loss": 5.3594, "mean_token_accuracy": 0.1725214645266533, "num_tokens": 19163514.0, "step": 10380 }, { "entropy": 5.540747404098511, "epoch": 0.8725057760974585, "grad_norm": 0.99609375, "learning_rate": 0.0004930167491469013, "loss": 5.246, "mean_token_accuracy": 0.18007472455501555, "num_tokens": 19172103.0, "step": 10385 }, { "entropy": 5.644805955886841, "epoch": 0.8729258559126234, "grad_norm": 1.125, "learning_rate": 0.0004930093442941053, "loss": 5.3266, "mean_token_accuracy": 0.17474670708179474, "num_tokens": 19180893.0, "step": 10390 }, { "entropy": 5.617472839355469, "epoch": 0.8733459357277883, "grad_norm": 0.96875, "learning_rate": 0.0004930019355793858, "loss": 5.2578, "mean_token_accuracy": 0.1790870323777199, "num_tokens": 19190495.0, "step": 10395 }, { "entropy": 5.602152919769287, "epoch": 0.8737660155429532, "grad_norm": 1.046875, "learning_rate": 0.0004929945230028746, "loss": 5.3481, "mean_token_accuracy": 0.18140414208173752, "num_tokens": 19198988.0, "step": 10400 }, { "entropy": 5.59614930152893, "epoch": 0.8741860953581181, "grad_norm": 0.96484375, "learning_rate": 0.0004929871065647024, "loss": 5.2564, "mean_token_accuracy": 0.181388983130455, "num_tokens": 19208014.0, "step": 10405 }, { "entropy": 5.710901403427124, "epoch": 0.8746061751732829, "grad_norm": 0.94921875, "learning_rate": 0.0004929796862650011, "loss": 5.4532, "mean_token_accuracy": 0.17245894074440002, "num_tokens": 19218220.0, "step": 10410 }, { "entropy": 5.594465065002441, "epoch": 0.8750262549884478, "grad_norm": 1.140625, "learning_rate": 0.0004929722621039018, "loss": 5.3443, "mean_token_accuracy": 0.17253897339105606, "num_tokens": 19227176.0, "step": 10415 }, { "entropy": 5.604946041107178, "epoch": 0.8754463348036127, "grad_norm": 1.0, "learning_rate": 0.0004929648340815362, "loss": 5.3778, "mean_token_accuracy": 0.169038288295269, "num_tokens": 19236085.0, "step": 10420 }, { "entropy": 5.690009498596192, "epoch": 0.8758664146187776, "grad_norm": 0.875, "learning_rate": 0.0004929574021980355, "loss": 5.4141, "mean_token_accuracy": 0.17079854011535645, "num_tokens": 19246671.0, "step": 10425 }, { "entropy": 5.636815977096558, "epoch": 0.8762864944339425, "grad_norm": 0.90234375, "learning_rate": 0.0004929499664535319, "loss": 5.3285, "mean_token_accuracy": 0.17400482743978501, "num_tokens": 19256321.0, "step": 10430 }, { "entropy": 5.626712036132813, "epoch": 0.8767065742491074, "grad_norm": 1.03125, "learning_rate": 0.0004929425268481569, "loss": 5.2847, "mean_token_accuracy": 0.1827862948179245, "num_tokens": 19265518.0, "step": 10435 }, { "entropy": 5.6520655155181885, "epoch": 0.8771266540642723, "grad_norm": 1.0234375, "learning_rate": 0.0004929350833820422, "loss": 5.294, "mean_token_accuracy": 0.17669767588377, "num_tokens": 19274120.0, "step": 10440 }, { "entropy": 5.650471496582031, "epoch": 0.877546733879437, "grad_norm": 0.98828125, "learning_rate": 0.0004929276360553197, "loss": 5.3501, "mean_token_accuracy": 0.17820055037736893, "num_tokens": 19284377.0, "step": 10445 }, { "entropy": 5.599103164672852, "epoch": 0.8779668136946019, "grad_norm": 0.9140625, "learning_rate": 0.0004929201848681213, "loss": 5.2371, "mean_token_accuracy": 0.1824801653623581, "num_tokens": 19293326.0, "step": 10450 }, { "entropy": 5.627392721176148, "epoch": 0.8783868935097668, "grad_norm": 0.98828125, "learning_rate": 0.0004929127298205792, "loss": 5.2929, "mean_token_accuracy": 0.18438757508993148, "num_tokens": 19302086.0, "step": 10455 }, { "entropy": 5.686365413665771, "epoch": 0.8788069733249317, "grad_norm": 1.1171875, "learning_rate": 0.0004929052709128251, "loss": 5.3198, "mean_token_accuracy": 0.18251776546239853, "num_tokens": 19310124.0, "step": 10460 }, { "entropy": 5.600974988937378, "epoch": 0.8792270531400966, "grad_norm": 0.953125, "learning_rate": 0.0004928978081449914, "loss": 5.3312, "mean_token_accuracy": 0.17491378784179687, "num_tokens": 19321269.0, "step": 10465 }, { "entropy": 5.636794519424439, "epoch": 0.8796471329552615, "grad_norm": 1.0078125, "learning_rate": 0.0004928903415172103, "loss": 5.3578, "mean_token_accuracy": 0.17981308251619338, "num_tokens": 19330390.0, "step": 10470 }, { "entropy": 5.671741247177124, "epoch": 0.8800672127704264, "grad_norm": 0.95703125, "learning_rate": 0.000492882871029614, "loss": 5.3451, "mean_token_accuracy": 0.17643130719661712, "num_tokens": 19339457.0, "step": 10475 }, { "entropy": 5.66839599609375, "epoch": 0.8804872925855912, "grad_norm": 0.94140625, "learning_rate": 0.0004928753966823348, "loss": 5.4182, "mean_token_accuracy": 0.17329920828342438, "num_tokens": 19348710.0, "step": 10480 }, { "entropy": 5.710315990447998, "epoch": 0.8809073724007561, "grad_norm": 0.9609375, "learning_rate": 0.0004928679184755051, "loss": 5.4529, "mean_token_accuracy": 0.1797290176153183, "num_tokens": 19357215.0, "step": 10485 }, { "entropy": 5.646717691421509, "epoch": 0.881327452215921, "grad_norm": 1.0859375, "learning_rate": 0.0004928604364092574, "loss": 5.3764, "mean_token_accuracy": 0.17588409632444382, "num_tokens": 19366043.0, "step": 10490 }, { "entropy": 5.684279441833496, "epoch": 0.8817475320310859, "grad_norm": 1.03125, "learning_rate": 0.0004928529504837243, "loss": 5.4411, "mean_token_accuracy": 0.17388026416301727, "num_tokens": 19375468.0, "step": 10495 }, { "entropy": 5.740813875198365, "epoch": 0.8821676118462508, "grad_norm": 1.015625, "learning_rate": 0.0004928454606990383, "loss": 5.3035, "mean_token_accuracy": 0.17651602178812026, "num_tokens": 19384467.0, "step": 10500 }, { "entropy": 5.634351110458374, "epoch": 0.8825876916614157, "grad_norm": 1.03125, "learning_rate": 0.0004928379670553322, "loss": 5.3621, "mean_token_accuracy": 0.17722720056772232, "num_tokens": 19393618.0, "step": 10505 }, { "entropy": 5.713768482208252, "epoch": 0.8830077714765806, "grad_norm": 0.98046875, "learning_rate": 0.0004928304695527387, "loss": 5.42, "mean_token_accuracy": 0.16624423265457153, "num_tokens": 19402921.0, "step": 10510 }, { "entropy": 5.715269470214844, "epoch": 0.8834278512917454, "grad_norm": 1.0546875, "learning_rate": 0.0004928229681913905, "loss": 5.3922, "mean_token_accuracy": 0.1776364266872406, "num_tokens": 19412048.0, "step": 10515 }, { "entropy": 5.709134864807129, "epoch": 0.8838479311069103, "grad_norm": 1.1015625, "learning_rate": 0.0004928154629714207, "loss": 5.3853, "mean_token_accuracy": 0.17549546360969542, "num_tokens": 19420993.0, "step": 10520 }, { "entropy": 5.625003433227539, "epoch": 0.8842680109220752, "grad_norm": 0.9453125, "learning_rate": 0.000492807953892962, "loss": 5.3522, "mean_token_accuracy": 0.1715557023882866, "num_tokens": 19430145.0, "step": 10525 }, { "entropy": 5.622083139419556, "epoch": 0.8846880907372401, "grad_norm": 1.0078125, "learning_rate": 0.0004928004409561476, "loss": 5.2665, "mean_token_accuracy": 0.17825621664524077, "num_tokens": 19438918.0, "step": 10530 }, { "entropy": 5.593374490737915, "epoch": 0.885108170552405, "grad_norm": 0.93359375, "learning_rate": 0.0004927929241611106, "loss": 5.3163, "mean_token_accuracy": 0.1802013874053955, "num_tokens": 19448490.0, "step": 10535 }, { "entropy": 5.630385017395019, "epoch": 0.8855282503675699, "grad_norm": 0.97265625, "learning_rate": 0.000492785403507984, "loss": 5.3816, "mean_token_accuracy": 0.1738127812743187, "num_tokens": 19457098.0, "step": 10540 }, { "entropy": 5.6495198726654055, "epoch": 0.8859483301827347, "grad_norm": 1.0, "learning_rate": 0.0004927778789969012, "loss": 5.3635, "mean_token_accuracy": 0.1741376906633377, "num_tokens": 19466419.0, "step": 10545 }, { "entropy": 5.673911285400391, "epoch": 0.8863684099978996, "grad_norm": 1.0625, "learning_rate": 0.0004927703506279955, "loss": 5.411, "mean_token_accuracy": 0.167750808596611, "num_tokens": 19475882.0, "step": 10550 }, { "entropy": 5.7675553321838375, "epoch": 0.8867884898130645, "grad_norm": 0.98828125, "learning_rate": 0.0004927628184014, "loss": 5.4587, "mean_token_accuracy": 0.1710841953754425, "num_tokens": 19485917.0, "step": 10555 }, { "entropy": 5.712929439544678, "epoch": 0.8872085696282294, "grad_norm": 0.96484375, "learning_rate": 0.0004927552823172483, "loss": 5.3961, "mean_token_accuracy": 0.16941948533058165, "num_tokens": 19494984.0, "step": 10560 }, { "entropy": 5.692627954483032, "epoch": 0.8876286494433943, "grad_norm": 0.921875, "learning_rate": 0.000492747742375674, "loss": 5.3412, "mean_token_accuracy": 0.1776975154876709, "num_tokens": 19504087.0, "step": 10565 }, { "entropy": 5.676058101654053, "epoch": 0.8880487292585592, "grad_norm": 1.0, "learning_rate": 0.0004927401985768106, "loss": 5.3739, "mean_token_accuracy": 0.18018916249275208, "num_tokens": 19512880.0, "step": 10570 }, { "entropy": 5.616356086730957, "epoch": 0.888468809073724, "grad_norm": 1.015625, "learning_rate": 0.0004927326509207915, "loss": 5.3349, "mean_token_accuracy": 0.17871468365192414, "num_tokens": 19521723.0, "step": 10575 }, { "entropy": 5.689477491378784, "epoch": 0.8888888888888888, "grad_norm": 0.9375, "learning_rate": 0.0004927250994077508, "loss": 5.4278, "mean_token_accuracy": 0.17393076568841934, "num_tokens": 19531352.0, "step": 10580 }, { "entropy": 5.758683967590332, "epoch": 0.8893089687040537, "grad_norm": 0.94140625, "learning_rate": 0.000492717544037822, "loss": 5.49, "mean_token_accuracy": 0.1832442432641983, "num_tokens": 19540943.0, "step": 10585 }, { "entropy": 5.591853666305542, "epoch": 0.8897290485192186, "grad_norm": 0.94140625, "learning_rate": 0.000492709984811139, "loss": 5.2782, "mean_token_accuracy": 0.1818720817565918, "num_tokens": 19550527.0, "step": 10590 }, { "entropy": 5.607506847381591, "epoch": 0.8901491283343835, "grad_norm": 0.98828125, "learning_rate": 0.0004927024217278358, "loss": 5.2795, "mean_token_accuracy": 0.18710044920444488, "num_tokens": 19559746.0, "step": 10595 }, { "entropy": 5.725629425048828, "epoch": 0.8905692081495484, "grad_norm": 0.95703125, "learning_rate": 0.0004926948547880462, "loss": 5.4634, "mean_token_accuracy": 0.1625555321574211, "num_tokens": 19569286.0, "step": 10600 }, { "entropy": 5.599992036819458, "epoch": 0.8909892879647133, "grad_norm": 0.921875, "learning_rate": 0.0004926872839919044, "loss": 5.3178, "mean_token_accuracy": 0.17441273480653763, "num_tokens": 19578245.0, "step": 10605 }, { "entropy": 5.61881742477417, "epoch": 0.8914093677798782, "grad_norm": 0.9453125, "learning_rate": 0.0004926797093395446, "loss": 5.2807, "mean_token_accuracy": 0.17928027957677842, "num_tokens": 19587244.0, "step": 10610 }, { "entropy": 5.675804281234742, "epoch": 0.891829447595043, "grad_norm": 0.97265625, "learning_rate": 0.0004926721308311006, "loss": 5.3807, "mean_token_accuracy": 0.1804608568549156, "num_tokens": 19596932.0, "step": 10615 }, { "entropy": 5.745219326019287, "epoch": 0.8922495274102079, "grad_norm": 1.03125, "learning_rate": 0.0004926645484667069, "loss": 5.4645, "mean_token_accuracy": 0.17389744371175767, "num_tokens": 19606256.0, "step": 10620 }, { "entropy": 5.755998325347901, "epoch": 0.8926696072253728, "grad_norm": 1.0078125, "learning_rate": 0.0004926569622464979, "loss": 5.483, "mean_token_accuracy": 0.1706532880663872, "num_tokens": 19615726.0, "step": 10625 }, { "entropy": 5.641065979003907, "epoch": 0.8930896870405377, "grad_norm": 1.0390625, "learning_rate": 0.0004926493721706079, "loss": 5.336, "mean_token_accuracy": 0.17481352388858795, "num_tokens": 19624037.0, "step": 10630 }, { "entropy": 5.64189043045044, "epoch": 0.8935097668557026, "grad_norm": 1.09375, "learning_rate": 0.0004926417782391713, "loss": 5.3356, "mean_token_accuracy": 0.18309717029333114, "num_tokens": 19632882.0, "step": 10635 }, { "entropy": 5.701260328292847, "epoch": 0.8939298466708675, "grad_norm": 1.03125, "learning_rate": 0.0004926341804523227, "loss": 5.4513, "mean_token_accuracy": 0.17101485580205916, "num_tokens": 19642686.0, "step": 10640 }, { "entropy": 5.636244487762451, "epoch": 0.8943499264860324, "grad_norm": 1.0546875, "learning_rate": 0.0004926265788101966, "loss": 5.3582, "mean_token_accuracy": 0.17762073278427123, "num_tokens": 19651380.0, "step": 10645 }, { "entropy": 5.626670694351196, "epoch": 0.8947700063011972, "grad_norm": 0.984375, "learning_rate": 0.0004926189733129278, "loss": 5.2879, "mean_token_accuracy": 0.1763039141893387, "num_tokens": 19660136.0, "step": 10650 }, { "entropy": 5.630255508422851, "epoch": 0.8951900861163621, "grad_norm": 0.97265625, "learning_rate": 0.0004926113639606509, "loss": 5.3335, "mean_token_accuracy": 0.18763954043388367, "num_tokens": 19669146.0, "step": 10655 }, { "entropy": 5.731260824203491, "epoch": 0.895610165931527, "grad_norm": 0.9453125, "learning_rate": 0.0004926037507535008, "loss": 5.4589, "mean_token_accuracy": 0.17182486057281493, "num_tokens": 19678627.0, "step": 10660 }, { "entropy": 5.69344391822815, "epoch": 0.8960302457466919, "grad_norm": 0.96484375, "learning_rate": 0.0004925961336916122, "loss": 5.3929, "mean_token_accuracy": 0.1749602437019348, "num_tokens": 19688033.0, "step": 10665 }, { "entropy": 5.678943729400634, "epoch": 0.8964503255618568, "grad_norm": 0.99609375, "learning_rate": 0.0004925885127751202, "loss": 5.3958, "mean_token_accuracy": 0.17912110537290574, "num_tokens": 19696523.0, "step": 10670 }, { "entropy": 5.71374773979187, "epoch": 0.8968704053770217, "grad_norm": 0.953125, "learning_rate": 0.0004925808880041596, "loss": 5.3098, "mean_token_accuracy": 0.17729369103908538, "num_tokens": 19706339.0, "step": 10675 }, { "entropy": 5.706824398040771, "epoch": 0.8972904851921865, "grad_norm": 1.03125, "learning_rate": 0.0004925732593788658, "loss": 5.3648, "mean_token_accuracy": 0.17486973702907563, "num_tokens": 19714779.0, "step": 10680 }, { "entropy": 5.654337596893311, "epoch": 0.8977105650073514, "grad_norm": 0.99609375, "learning_rate": 0.0004925656268993737, "loss": 5.4284, "mean_token_accuracy": 0.17219052016735076, "num_tokens": 19723727.0, "step": 10685 }, { "entropy": 5.5989068984985355, "epoch": 0.8981306448225163, "grad_norm": 0.99609375, "learning_rate": 0.0004925579905658185, "loss": 5.3966, "mean_token_accuracy": 0.17663999646902084, "num_tokens": 19732783.0, "step": 10690 }, { "entropy": 5.684259414672852, "epoch": 0.8985507246376812, "grad_norm": 1.03125, "learning_rate": 0.0004925503503783355, "loss": 5.3575, "mean_token_accuracy": 0.17356715351343155, "num_tokens": 19741268.0, "step": 10695 }, { "entropy": 5.7133454322814945, "epoch": 0.898970804452846, "grad_norm": 0.98828125, "learning_rate": 0.0004925427063370601, "loss": 5.2861, "mean_token_accuracy": 0.17883872389793395, "num_tokens": 19751490.0, "step": 10700 }, { "entropy": 5.6417115211486815, "epoch": 0.899390884268011, "grad_norm": 0.984375, "learning_rate": 0.0004925350584421278, "loss": 5.3329, "mean_token_accuracy": 0.1776866912841797, "num_tokens": 19760487.0, "step": 10705 }, { "entropy": 5.714428424835205, "epoch": 0.8998109640831758, "grad_norm": 1.0, "learning_rate": 0.0004925274066936738, "loss": 5.2981, "mean_token_accuracy": 0.18347593098878862, "num_tokens": 19768984.0, "step": 10710 }, { "entropy": 5.635104084014893, "epoch": 0.9002310438983406, "grad_norm": 0.88671875, "learning_rate": 0.0004925197510918339, "loss": 5.2809, "mean_token_accuracy": 0.18501415103673935, "num_tokens": 19778335.0, "step": 10715 }, { "entropy": 5.648560380935669, "epoch": 0.9006511237135055, "grad_norm": 0.97265625, "learning_rate": 0.0004925120916367435, "loss": 5.4169, "mean_token_accuracy": 0.1690452665090561, "num_tokens": 19789082.0, "step": 10720 }, { "entropy": 5.579227638244629, "epoch": 0.9010712035286704, "grad_norm": 0.9375, "learning_rate": 0.0004925044283285384, "loss": 5.1668, "mean_token_accuracy": 0.1959148555994034, "num_tokens": 19797902.0, "step": 10725 }, { "entropy": 5.5749458312988285, "epoch": 0.9014912833438353, "grad_norm": 0.9765625, "learning_rate": 0.0004924967611673544, "loss": 5.3299, "mean_token_accuracy": 0.18181584328413009, "num_tokens": 19806481.0, "step": 10730 }, { "entropy": 5.515876817703247, "epoch": 0.9019113631590002, "grad_norm": 1.0078125, "learning_rate": 0.0004924890901533273, "loss": 5.23, "mean_token_accuracy": 0.1879657819867134, "num_tokens": 19815226.0, "step": 10735 }, { "entropy": 5.775722980499268, "epoch": 0.9023314429741651, "grad_norm": 1.015625, "learning_rate": 0.0004924814152865929, "loss": 5.4411, "mean_token_accuracy": 0.17111022472381593, "num_tokens": 19824577.0, "step": 10740 }, { "entropy": 5.704782867431641, "epoch": 0.90275152278933, "grad_norm": 1.0, "learning_rate": 0.0004924737365672873, "loss": 5.3492, "mean_token_accuracy": 0.16909119486808777, "num_tokens": 19832936.0, "step": 10745 }, { "entropy": 5.746354103088379, "epoch": 0.9031716026044948, "grad_norm": 0.98046875, "learning_rate": 0.0004924660539955463, "loss": 5.4814, "mean_token_accuracy": 0.17577190846204757, "num_tokens": 19841946.0, "step": 10750 }, { "entropy": 5.6899515151977536, "epoch": 0.9035916824196597, "grad_norm": 0.9296875, "learning_rate": 0.0004924583675715063, "loss": 5.3771, "mean_token_accuracy": 0.1771700993180275, "num_tokens": 19851469.0, "step": 10755 }, { "entropy": 5.7029258728027346, "epoch": 0.9040117622348246, "grad_norm": 1.109375, "learning_rate": 0.0004924506772953031, "loss": 5.4306, "mean_token_accuracy": 0.1773010015487671, "num_tokens": 19860731.0, "step": 10760 }, { "entropy": 5.667733097076416, "epoch": 0.9044318420499895, "grad_norm": 1.03125, "learning_rate": 0.0004924429831670733, "loss": 5.466, "mean_token_accuracy": 0.16822922378778457, "num_tokens": 19869717.0, "step": 10765 }, { "entropy": 5.689977073669434, "epoch": 0.9048519218651544, "grad_norm": 0.95703125, "learning_rate": 0.000492435285186953, "loss": 5.4019, "mean_token_accuracy": 0.1751111000776291, "num_tokens": 19879229.0, "step": 10770 }, { "entropy": 5.751946258544922, "epoch": 0.9052720016803193, "grad_norm": 1.0546875, "learning_rate": 0.0004924275833550785, "loss": 5.3806, "mean_token_accuracy": 0.1700134187936783, "num_tokens": 19888260.0, "step": 10775 }, { "entropy": 5.6942836284637455, "epoch": 0.9056920814954842, "grad_norm": 1.1171875, "learning_rate": 0.0004924198776715865, "loss": 5.4168, "mean_token_accuracy": 0.17697516679763795, "num_tokens": 19897070.0, "step": 10780 }, { "entropy": 5.696029090881348, "epoch": 0.906112161310649, "grad_norm": 1.0234375, "learning_rate": 0.0004924121681366132, "loss": 5.3914, "mean_token_accuracy": 0.16845138520002365, "num_tokens": 19907170.0, "step": 10785 }, { "entropy": 5.684460258483886, "epoch": 0.9065322411258139, "grad_norm": 0.921875, "learning_rate": 0.0004924044547502951, "loss": 5.3508, "mean_token_accuracy": 0.17326620221138, "num_tokens": 19917220.0, "step": 10790 }, { "entropy": 5.6519941806793215, "epoch": 0.9069523209409788, "grad_norm": 0.96484375, "learning_rate": 0.0004923967375127692, "loss": 5.4097, "mean_token_accuracy": 0.17902130484580994, "num_tokens": 19926724.0, "step": 10795 }, { "entropy": 5.768009805679322, "epoch": 0.9073724007561437, "grad_norm": 0.94921875, "learning_rate": 0.000492389016424172, "loss": 5.5174, "mean_token_accuracy": 0.16810329854488373, "num_tokens": 19936429.0, "step": 10800 }, { "entropy": 5.658706521987915, "epoch": 0.9077924805713086, "grad_norm": 0.97265625, "learning_rate": 0.0004923812914846404, "loss": 5.2875, "mean_token_accuracy": 0.18738020360469818, "num_tokens": 19945096.0, "step": 10805 }, { "entropy": 5.655237054824829, "epoch": 0.9082125603864735, "grad_norm": 1.03125, "learning_rate": 0.0004923735626943111, "loss": 5.359, "mean_token_accuracy": 0.18075794279575347, "num_tokens": 19953560.0, "step": 10810 }, { "entropy": 5.65477728843689, "epoch": 0.9086326402016384, "grad_norm": 0.9765625, "learning_rate": 0.0004923658300533211, "loss": 5.3434, "mean_token_accuracy": 0.17028914839029313, "num_tokens": 19962669.0, "step": 10815 }, { "entropy": 5.733688879013061, "epoch": 0.9090527200168032, "grad_norm": 1.0078125, "learning_rate": 0.0004923580935618073, "loss": 5.3839, "mean_token_accuracy": 0.17526840418577194, "num_tokens": 19971990.0, "step": 10820 }, { "entropy": 5.652723407745361, "epoch": 0.909472799831968, "grad_norm": 0.95703125, "learning_rate": 0.0004923503532199069, "loss": 5.3542, "mean_token_accuracy": 0.1781252607703209, "num_tokens": 19981850.0, "step": 10825 }, { "entropy": 5.692673301696777, "epoch": 0.909892879647133, "grad_norm": 0.9609375, "learning_rate": 0.0004923426090277567, "loss": 5.4176, "mean_token_accuracy": 0.16977173537015916, "num_tokens": 19991574.0, "step": 10830 }, { "entropy": 5.711677980422974, "epoch": 0.9103129594622978, "grad_norm": 0.92578125, "learning_rate": 0.0004923348609854943, "loss": 5.3754, "mean_token_accuracy": 0.17961972057819367, "num_tokens": 20001392.0, "step": 10835 }, { "entropy": 5.6754333019256595, "epoch": 0.9107330392774627, "grad_norm": 1.0234375, "learning_rate": 0.0004923271090932566, "loss": 5.4111, "mean_token_accuracy": 0.1694395825266838, "num_tokens": 20011277.0, "step": 10840 }, { "entropy": 5.628445672988891, "epoch": 0.9111531190926276, "grad_norm": 0.97265625, "learning_rate": 0.0004923193533511812, "loss": 5.3311, "mean_token_accuracy": 0.17614845037460328, "num_tokens": 20021171.0, "step": 10845 }, { "entropy": 5.787916660308838, "epoch": 0.9115731989077924, "grad_norm": 0.9921875, "learning_rate": 0.0004923115937594053, "loss": 5.3986, "mean_token_accuracy": 0.17411455661058425, "num_tokens": 20030189.0, "step": 10850 }, { "entropy": 5.716488170623779, "epoch": 0.9119932787229573, "grad_norm": 1.0234375, "learning_rate": 0.0004923038303180664, "loss": 5.3607, "mean_token_accuracy": 0.1790910094976425, "num_tokens": 20038287.0, "step": 10855 }, { "entropy": 5.659997892379761, "epoch": 0.9124133585381222, "grad_norm": 1.078125, "learning_rate": 0.000492296063027302, "loss": 5.4, "mean_token_accuracy": 0.17194890081882477, "num_tokens": 20047653.0, "step": 10860 }, { "entropy": 5.664541530609131, "epoch": 0.9128334383532871, "grad_norm": 0.94140625, "learning_rate": 0.0004922882918872498, "loss": 5.3998, "mean_token_accuracy": 0.16832661479711533, "num_tokens": 20057415.0, "step": 10865 }, { "entropy": 5.760810708999633, "epoch": 0.913253518168452, "grad_norm": 1.0546875, "learning_rate": 0.0004922805168980475, "loss": 5.4229, "mean_token_accuracy": 0.1723007947206497, "num_tokens": 20065996.0, "step": 10870 }, { "entropy": 5.667379570007324, "epoch": 0.9136735979836169, "grad_norm": 0.95703125, "learning_rate": 0.0004922727380598326, "loss": 5.3391, "mean_token_accuracy": 0.17363977879285813, "num_tokens": 20075376.0, "step": 10875 }, { "entropy": 5.643923425674439, "epoch": 0.9140936777987818, "grad_norm": 0.8984375, "learning_rate": 0.000492264955372743, "loss": 5.3902, "mean_token_accuracy": 0.16837187707424164, "num_tokens": 20084950.0, "step": 10880 }, { "entropy": 5.709530448913574, "epoch": 0.9145137576139466, "grad_norm": 1.0546875, "learning_rate": 0.0004922571688369165, "loss": 5.3685, "mean_token_accuracy": 0.1781516268849373, "num_tokens": 20094011.0, "step": 10885 }, { "entropy": 5.67348690032959, "epoch": 0.9149338374291115, "grad_norm": 1.015625, "learning_rate": 0.0004922493784524914, "loss": 5.3264, "mean_token_accuracy": 0.17602846771478653, "num_tokens": 20103037.0, "step": 10890 }, { "entropy": 5.714393281936646, "epoch": 0.9153539172442764, "grad_norm": 1.0390625, "learning_rate": 0.0004922415842196052, "loss": 5.4744, "mean_token_accuracy": 0.16465481072664262, "num_tokens": 20112727.0, "step": 10895 }, { "entropy": 5.595464420318604, "epoch": 0.9157739970594413, "grad_norm": 0.96875, "learning_rate": 0.0004922337861383963, "loss": 5.2824, "mean_token_accuracy": 0.17985836565494537, "num_tokens": 20122341.0, "step": 10900 }, { "entropy": 5.646765804290771, "epoch": 0.9161940768746062, "grad_norm": 1.078125, "learning_rate": 0.0004922259842090027, "loss": 5.2889, "mean_token_accuracy": 0.17536766231060028, "num_tokens": 20131354.0, "step": 10905 }, { "entropy": 5.696325302124023, "epoch": 0.9166141566897711, "grad_norm": 1.0234375, "learning_rate": 0.0004922181784315627, "loss": 5.3262, "mean_token_accuracy": 0.18010227829217912, "num_tokens": 20140440.0, "step": 10910 }, { "entropy": 5.639437341690064, "epoch": 0.917034236504936, "grad_norm": 0.98046875, "learning_rate": 0.0004922103688062145, "loss": 5.3263, "mean_token_accuracy": 0.1752904921770096, "num_tokens": 20149331.0, "step": 10915 }, { "entropy": 5.6195207118988035, "epoch": 0.9174543163201008, "grad_norm": 0.9453125, "learning_rate": 0.0004922025553330964, "loss": 5.2964, "mean_token_accuracy": 0.18341011852025985, "num_tokens": 20158566.0, "step": 10920 }, { "entropy": 5.697876596450806, "epoch": 0.9178743961352657, "grad_norm": 0.9609375, "learning_rate": 0.000492194738012347, "loss": 5.3874, "mean_token_accuracy": 0.17685852944850922, "num_tokens": 20168339.0, "step": 10925 }, { "entropy": 5.74979796409607, "epoch": 0.9182944759504306, "grad_norm": 0.95703125, "learning_rate": 0.0004921869168441045, "loss": 5.3959, "mean_token_accuracy": 0.17119568437337876, "num_tokens": 20177967.0, "step": 10930 }, { "entropy": 5.668965911865234, "epoch": 0.9187145557655955, "grad_norm": 0.91015625, "learning_rate": 0.0004921790918285077, "loss": 5.4004, "mean_token_accuracy": 0.17676358073949813, "num_tokens": 20187279.0, "step": 10935 }, { "entropy": 5.69212589263916, "epoch": 0.9191346355807604, "grad_norm": 1.09375, "learning_rate": 0.0004921712629656951, "loss": 5.4789, "mean_token_accuracy": 0.1859880730509758, "num_tokens": 20195324.0, "step": 10940 }, { "entropy": 5.7375670909881595, "epoch": 0.9195547153959253, "grad_norm": 1.1015625, "learning_rate": 0.0004921634302558054, "loss": 5.4088, "mean_token_accuracy": 0.1750117763876915, "num_tokens": 20204985.0, "step": 10945 }, { "entropy": 5.680584812164307, "epoch": 0.9199747952110902, "grad_norm": 1.1015625, "learning_rate": 0.0004921555936989773, "loss": 5.4359, "mean_token_accuracy": 0.17013416737318038, "num_tokens": 20214553.0, "step": 10950 }, { "entropy": 5.717116165161133, "epoch": 0.9203948750262549, "grad_norm": 0.9375, "learning_rate": 0.0004921477532953497, "loss": 5.346, "mean_token_accuracy": 0.1803850382566452, "num_tokens": 20224118.0, "step": 10955 }, { "entropy": 5.627071475982666, "epoch": 0.9208149548414198, "grad_norm": 0.94140625, "learning_rate": 0.0004921399090450616, "loss": 5.2961, "mean_token_accuracy": 0.17257336229085923, "num_tokens": 20233719.0, "step": 10960 }, { "entropy": 5.683680868148803, "epoch": 0.9212350346565847, "grad_norm": 1.0859375, "learning_rate": 0.0004921320609482517, "loss": 5.394, "mean_token_accuracy": 0.17871596813201904, "num_tokens": 20242311.0, "step": 10965 }, { "entropy": 5.724655532836914, "epoch": 0.9216551144717496, "grad_norm": 1.078125, "learning_rate": 0.0004921242090050591, "loss": 5.4436, "mean_token_accuracy": 0.1749306097626686, "num_tokens": 20252998.0, "step": 10970 }, { "entropy": 5.705359888076782, "epoch": 0.9220751942869145, "grad_norm": 0.8828125, "learning_rate": 0.000492116353215623, "loss": 5.4804, "mean_token_accuracy": 0.17137448936700822, "num_tokens": 20262456.0, "step": 10975 }, { "entropy": 5.605144882202149, "epoch": 0.9224952741020794, "grad_norm": 0.9765625, "learning_rate": 0.0004921084935800825, "loss": 5.2568, "mean_token_accuracy": 0.1819299876689911, "num_tokens": 20271516.0, "step": 10980 }, { "entropy": 5.6282915592193605, "epoch": 0.9229153539172443, "grad_norm": 1.015625, "learning_rate": 0.0004921006300985768, "loss": 5.2946, "mean_token_accuracy": 0.18205133378505706, "num_tokens": 20280373.0, "step": 10985 }, { "entropy": 5.680062294006348, "epoch": 0.9233354337324091, "grad_norm": 0.91015625, "learning_rate": 0.0004920927627712453, "loss": 5.2896, "mean_token_accuracy": 0.18193748593330383, "num_tokens": 20289426.0, "step": 10990 }, { "entropy": 5.711742544174195, "epoch": 0.923755513547574, "grad_norm": 0.98828125, "learning_rate": 0.0004920848915982273, "loss": 5.4352, "mean_token_accuracy": 0.1710486590862274, "num_tokens": 20298045.0, "step": 10995 }, { "entropy": 5.623484086990357, "epoch": 0.9241755933627389, "grad_norm": 1.0625, "learning_rate": 0.0004920770165796622, "loss": 5.3441, "mean_token_accuracy": 0.1761312246322632, "num_tokens": 20307352.0, "step": 11000 }, { "entropy": 5.646529912948608, "epoch": 0.9245956731779038, "grad_norm": 0.9140625, "learning_rate": 0.0004920691377156895, "loss": 5.3619, "mean_token_accuracy": 0.17627190798521042, "num_tokens": 20316448.0, "step": 11005 }, { "entropy": 5.741350555419922, "epoch": 0.9250157529930687, "grad_norm": 0.9921875, "learning_rate": 0.0004920612550064488, "loss": 5.4951, "mean_token_accuracy": 0.16989560574293136, "num_tokens": 20326440.0, "step": 11010 }, { "entropy": 5.673906469345093, "epoch": 0.9254358328082336, "grad_norm": 1.0078125, "learning_rate": 0.0004920533684520797, "loss": 5.2596, "mean_token_accuracy": 0.1794683814048767, "num_tokens": 20335447.0, "step": 11015 }, { "entropy": 5.671544456481934, "epoch": 0.9258559126233984, "grad_norm": 0.99609375, "learning_rate": 0.000492045478052722, "loss": 5.4318, "mean_token_accuracy": 0.1717248573899269, "num_tokens": 20344523.0, "step": 11020 }, { "entropy": 5.695364141464234, "epoch": 0.9262759924385633, "grad_norm": 0.9609375, "learning_rate": 0.0004920375838085154, "loss": 5.3793, "mean_token_accuracy": 0.17682462185621262, "num_tokens": 20354267.0, "step": 11025 }, { "entropy": 5.683427429199218, "epoch": 0.9266960722537282, "grad_norm": 0.9765625, "learning_rate": 0.0004920296857195998, "loss": 5.4421, "mean_token_accuracy": 0.17508164495229722, "num_tokens": 20364137.0, "step": 11030 }, { "entropy": 5.6386407852172855, "epoch": 0.9271161520688931, "grad_norm": 1.0703125, "learning_rate": 0.000492021783786115, "loss": 5.349, "mean_token_accuracy": 0.17995063215494156, "num_tokens": 20372583.0, "step": 11035 }, { "entropy": 5.67758994102478, "epoch": 0.927536231884058, "grad_norm": 0.9453125, "learning_rate": 0.0004920138780082011, "loss": 5.3024, "mean_token_accuracy": 0.17726746946573257, "num_tokens": 20382050.0, "step": 11040 }, { "entropy": 5.647405672073364, "epoch": 0.9279563116992229, "grad_norm": 0.9921875, "learning_rate": 0.0004920059683859981, "loss": 5.2611, "mean_token_accuracy": 0.1826525717973709, "num_tokens": 20391425.0, "step": 11045 }, { "entropy": 5.685978555679322, "epoch": 0.9283763915143878, "grad_norm": 1.0625, "learning_rate": 0.0004919980549196461, "loss": 5.4329, "mean_token_accuracy": 0.17503868341445922, "num_tokens": 20400559.0, "step": 11050 }, { "entropy": 5.667359781265259, "epoch": 0.9287964713295526, "grad_norm": 1.0, "learning_rate": 0.0004919901376092853, "loss": 5.3372, "mean_token_accuracy": 0.1793399080634117, "num_tokens": 20408985.0, "step": 11055 }, { "entropy": 5.659483909606934, "epoch": 0.9292165511447175, "grad_norm": 1.1328125, "learning_rate": 0.0004919822164550559, "loss": 5.4491, "mean_token_accuracy": 0.1629319354891777, "num_tokens": 20417855.0, "step": 11060 }, { "entropy": 5.6782361507415775, "epoch": 0.9296366309598824, "grad_norm": 0.96875, "learning_rate": 0.0004919742914570983, "loss": 5.4076, "mean_token_accuracy": 0.17369400411844255, "num_tokens": 20426191.0, "step": 11065 }, { "entropy": 5.6799122333526615, "epoch": 0.9300567107750473, "grad_norm": 0.9765625, "learning_rate": 0.000491966362615553, "loss": 5.3722, "mean_token_accuracy": 0.16640562266111375, "num_tokens": 20435592.0, "step": 11070 }, { "entropy": 5.739400577545166, "epoch": 0.9304767905902122, "grad_norm": 0.98046875, "learning_rate": 0.00049195842993056, "loss": 5.4083, "mean_token_accuracy": 0.17765518575906752, "num_tokens": 20445504.0, "step": 11075 }, { "entropy": 5.683113241195679, "epoch": 0.930896870405377, "grad_norm": 0.9140625, "learning_rate": 0.0004919504934022604, "loss": 5.3654, "mean_token_accuracy": 0.17199593484401704, "num_tokens": 20455153.0, "step": 11080 }, { "entropy": 5.664229488372802, "epoch": 0.931316950220542, "grad_norm": 0.8828125, "learning_rate": 0.0004919425530307943, "loss": 5.3306, "mean_token_accuracy": 0.17326945215463638, "num_tokens": 20465101.0, "step": 11085 }, { "entropy": 5.676412534713745, "epoch": 0.9317370300357067, "grad_norm": 0.99609375, "learning_rate": 0.0004919346088163028, "loss": 5.3994, "mean_token_accuracy": 0.17614241689443588, "num_tokens": 20474700.0, "step": 11090 }, { "entropy": 5.702908229827881, "epoch": 0.9321571098508716, "grad_norm": 0.984375, "learning_rate": 0.0004919266607589263, "loss": 5.4236, "mean_token_accuracy": 0.16938048750162124, "num_tokens": 20483945.0, "step": 11095 }, { "entropy": 5.644795513153076, "epoch": 0.9325771896660365, "grad_norm": 0.99609375, "learning_rate": 0.0004919187088588057, "loss": 5.3988, "mean_token_accuracy": 0.1764049381017685, "num_tokens": 20493307.0, "step": 11100 }, { "entropy": 5.673937082290649, "epoch": 0.9329972694812014, "grad_norm": 1.078125, "learning_rate": 0.0004919107531160819, "loss": 5.3133, "mean_token_accuracy": 0.18090127259492875, "num_tokens": 20501889.0, "step": 11105 }, { "entropy": 5.720638227462769, "epoch": 0.9334173492963663, "grad_norm": 1.0078125, "learning_rate": 0.0004919027935308957, "loss": 5.3598, "mean_token_accuracy": 0.17753426134586334, "num_tokens": 20510577.0, "step": 11110 }, { "entropy": 5.619483757019043, "epoch": 0.9338374291115312, "grad_norm": 0.984375, "learning_rate": 0.0004918948301033884, "loss": 5.3135, "mean_token_accuracy": 0.17889769226312638, "num_tokens": 20520025.0, "step": 11115 }, { "entropy": 5.647424840927124, "epoch": 0.9342575089266961, "grad_norm": 0.98046875, "learning_rate": 0.0004918868628337007, "loss": 5.371, "mean_token_accuracy": 0.17375398725271224, "num_tokens": 20528989.0, "step": 11120 }, { "entropy": 5.713294744491577, "epoch": 0.9346775887418609, "grad_norm": 0.984375, "learning_rate": 0.0004918788917219739, "loss": 5.3155, "mean_token_accuracy": 0.17422395646572114, "num_tokens": 20538328.0, "step": 11125 }, { "entropy": 5.701392555236817, "epoch": 0.9350976685570258, "grad_norm": 0.9765625, "learning_rate": 0.0004918709167683493, "loss": 5.4675, "mean_token_accuracy": 0.17042749971151352, "num_tokens": 20548069.0, "step": 11130 }, { "entropy": 5.6277083396911625, "epoch": 0.9355177483721907, "grad_norm": 0.921875, "learning_rate": 0.0004918629379729681, "loss": 5.2041, "mean_token_accuracy": 0.18889956921339035, "num_tokens": 20557128.0, "step": 11135 }, { "entropy": 5.607458400726318, "epoch": 0.9359378281873556, "grad_norm": 0.99609375, "learning_rate": 0.0004918549553359715, "loss": 5.3379, "mean_token_accuracy": 0.17913254648447036, "num_tokens": 20566352.0, "step": 11140 }, { "entropy": 5.6762834072113035, "epoch": 0.9363579080025205, "grad_norm": 1.0234375, "learning_rate": 0.0004918469688575012, "loss": 5.3676, "mean_token_accuracy": 0.1735250934958458, "num_tokens": 20575814.0, "step": 11145 }, { "entropy": 5.660510301589966, "epoch": 0.9367779878176854, "grad_norm": 0.984375, "learning_rate": 0.0004918389785376983, "loss": 5.2389, "mean_token_accuracy": 0.1822280541062355, "num_tokens": 20584715.0, "step": 11150 }, { "entropy": 5.613585424423218, "epoch": 0.9371980676328503, "grad_norm": 0.953125, "learning_rate": 0.0004918309843767047, "loss": 5.3157, "mean_token_accuracy": 0.17783235311508178, "num_tokens": 20594630.0, "step": 11155 }, { "entropy": 5.652712726593018, "epoch": 0.9376181474480151, "grad_norm": 0.98046875, "learning_rate": 0.0004918229863746618, "loss": 5.3097, "mean_token_accuracy": 0.17647896260023116, "num_tokens": 20603653.0, "step": 11160 }, { "entropy": 5.693472957611084, "epoch": 0.93803822726318, "grad_norm": 1.0, "learning_rate": 0.0004918149845317114, "loss": 5.3828, "mean_token_accuracy": 0.176460599899292, "num_tokens": 20612188.0, "step": 11165 }, { "entropy": 5.66741714477539, "epoch": 0.9384583070783449, "grad_norm": 1.09375, "learning_rate": 0.0004918069788479952, "loss": 5.3134, "mean_token_accuracy": 0.17836859524250032, "num_tokens": 20620933.0, "step": 11170 }, { "entropy": 5.6451616287231445, "epoch": 0.9388783868935098, "grad_norm": 0.99609375, "learning_rate": 0.0004917989693236549, "loss": 5.3423, "mean_token_accuracy": 0.18023978918790817, "num_tokens": 20629919.0, "step": 11175 }, { "entropy": 5.654939460754394, "epoch": 0.9392984667086747, "grad_norm": 1.0546875, "learning_rate": 0.0004917909559588326, "loss": 5.3167, "mean_token_accuracy": 0.1752507507801056, "num_tokens": 20638475.0, "step": 11180 }, { "entropy": 5.749017667770386, "epoch": 0.9397185465238396, "grad_norm": 1.0703125, "learning_rate": 0.00049178293875367, "loss": 5.4355, "mean_token_accuracy": 0.16887526959180832, "num_tokens": 20648105.0, "step": 11185 }, { "entropy": 5.650721788406372, "epoch": 0.9401386263390044, "grad_norm": 0.97265625, "learning_rate": 0.0004917749177083094, "loss": 5.334, "mean_token_accuracy": 0.17548102438449859, "num_tokens": 20657527.0, "step": 11190 }, { "entropy": 5.642503833770752, "epoch": 0.9405587061541693, "grad_norm": 1.03125, "learning_rate": 0.0004917668928228927, "loss": 5.3403, "mean_token_accuracy": 0.17788160145282744, "num_tokens": 20666375.0, "step": 11195 }, { "entropy": 5.654184818267822, "epoch": 0.9409787859693342, "grad_norm": 1.0390625, "learning_rate": 0.0004917588640975622, "loss": 5.2927, "mean_token_accuracy": 0.18204714208841324, "num_tokens": 20675350.0, "step": 11200 }, { "entropy": 5.602049160003662, "epoch": 0.941398865784499, "grad_norm": 1.0859375, "learning_rate": 0.00049175083153246, "loss": 5.2336, "mean_token_accuracy": 0.18081634789705275, "num_tokens": 20684072.0, "step": 11205 }, { "entropy": 5.6149890422821045, "epoch": 0.941818945599664, "grad_norm": 0.9765625, "learning_rate": 0.0004917427951277284, "loss": 5.3228, "mean_token_accuracy": 0.18008925169706344, "num_tokens": 20692989.0, "step": 11210 }, { "entropy": 5.68301157951355, "epoch": 0.9422390254148288, "grad_norm": 1.078125, "learning_rate": 0.0004917347548835097, "loss": 5.2906, "mean_token_accuracy": 0.18037984669208526, "num_tokens": 20701269.0, "step": 11215 }, { "entropy": 5.6879767894744875, "epoch": 0.9426591052299937, "grad_norm": 1.0, "learning_rate": 0.0004917267107999466, "loss": 5.3934, "mean_token_accuracy": 0.17140883356332778, "num_tokens": 20709739.0, "step": 11220 }, { "entropy": 5.652856111526489, "epoch": 0.9430791850451585, "grad_norm": 1.0546875, "learning_rate": 0.0004917186628771812, "loss": 5.3416, "mean_token_accuracy": 0.17723431885242463, "num_tokens": 20718950.0, "step": 11225 }, { "entropy": 5.639612436294556, "epoch": 0.9434992648603234, "grad_norm": 0.88671875, "learning_rate": 0.0004917106111153565, "loss": 5.3279, "mean_token_accuracy": 0.1770992934703827, "num_tokens": 20729469.0, "step": 11230 }, { "entropy": 5.716261482238769, "epoch": 0.9439193446754883, "grad_norm": 0.98828125, "learning_rate": 0.0004917025555146148, "loss": 5.3498, "mean_token_accuracy": 0.1811720013618469, "num_tokens": 20738231.0, "step": 11235 }, { "entropy": 5.739324712753296, "epoch": 0.9443394244906532, "grad_norm": 0.91015625, "learning_rate": 0.000491694496075099, "loss": 5.5333, "mean_token_accuracy": 0.16331195831298828, "num_tokens": 20748578.0, "step": 11240 }, { "entropy": 5.743290853500366, "epoch": 0.9447595043058181, "grad_norm": 1.0078125, "learning_rate": 0.0004916864327969517, "loss": 5.4694, "mean_token_accuracy": 0.16192684173583985, "num_tokens": 20759284.0, "step": 11245 }, { "entropy": 5.732106304168701, "epoch": 0.945179584120983, "grad_norm": 0.98828125, "learning_rate": 0.0004916783656803158, "loss": 5.4025, "mean_token_accuracy": 0.17940037101507186, "num_tokens": 20768186.0, "step": 11250 }, { "entropy": 5.629659414291382, "epoch": 0.9455996639361479, "grad_norm": 1.03125, "learning_rate": 0.0004916702947253342, "loss": 5.254, "mean_token_accuracy": 0.18592254370450972, "num_tokens": 20776711.0, "step": 11255 }, { "entropy": 5.65093960762024, "epoch": 0.9460197437513127, "grad_norm": 1.03125, "learning_rate": 0.0004916622199321501, "loss": 5.3393, "mean_token_accuracy": 0.17673785835504532, "num_tokens": 20785154.0, "step": 11260 }, { "entropy": 5.702999019622803, "epoch": 0.9464398235664776, "grad_norm": 1.1953125, "learning_rate": 0.0004916541413009062, "loss": 5.2943, "mean_token_accuracy": 0.1821382761001587, "num_tokens": 20794114.0, "step": 11265 }, { "entropy": 5.678973388671875, "epoch": 0.9468599033816425, "grad_norm": 1.03125, "learning_rate": 0.0004916460588317458, "loss": 5.3681, "mean_token_accuracy": 0.18067531138658524, "num_tokens": 20803892.0, "step": 11270 }, { "entropy": 5.612544918060303, "epoch": 0.9472799831968074, "grad_norm": 0.890625, "learning_rate": 0.0004916379725248118, "loss": 5.2788, "mean_token_accuracy": 0.18359356075525285, "num_tokens": 20812892.0, "step": 11275 }, { "entropy": 5.6894134998321535, "epoch": 0.9477000630119723, "grad_norm": 1.0, "learning_rate": 0.0004916298823802479, "loss": 5.341, "mean_token_accuracy": 0.17472478598356248, "num_tokens": 20821934.0, "step": 11280 }, { "entropy": 5.623543453216553, "epoch": 0.9481201428271372, "grad_norm": 1.0078125, "learning_rate": 0.0004916217883981971, "loss": 5.2553, "mean_token_accuracy": 0.179837603867054, "num_tokens": 20830100.0, "step": 11285 }, { "entropy": 5.641614484786987, "epoch": 0.9485402226423021, "grad_norm": 1.03125, "learning_rate": 0.0004916136905788029, "loss": 5.338, "mean_token_accuracy": 0.1801730141043663, "num_tokens": 20839890.0, "step": 11290 }, { "entropy": 5.7246862888336185, "epoch": 0.9489603024574669, "grad_norm": 1.1171875, "learning_rate": 0.0004916055889222087, "loss": 5.4509, "mean_token_accuracy": 0.16322606652975083, "num_tokens": 20848670.0, "step": 11295 }, { "entropy": 5.675036525726318, "epoch": 0.9493803822726318, "grad_norm": 1.015625, "learning_rate": 0.000491597483428558, "loss": 5.3036, "mean_token_accuracy": 0.17835162729024887, "num_tokens": 20857291.0, "step": 11300 }, { "entropy": 5.565452003479004, "epoch": 0.9498004620877967, "grad_norm": 1.046875, "learning_rate": 0.0004915893740979944, "loss": 5.2757, "mean_token_accuracy": 0.18624432384967804, "num_tokens": 20865341.0, "step": 11305 }, { "entropy": 5.6904114246368405, "epoch": 0.9502205419029616, "grad_norm": 0.89453125, "learning_rate": 0.0004915812609306617, "loss": 5.3872, "mean_token_accuracy": 0.17745151966810227, "num_tokens": 20875194.0, "step": 11310 }, { "entropy": 5.69218077659607, "epoch": 0.9506406217181265, "grad_norm": 1.0078125, "learning_rate": 0.0004915731439267034, "loss": 5.3142, "mean_token_accuracy": 0.18038685172796248, "num_tokens": 20884831.0, "step": 11315 }, { "entropy": 5.590726280212403, "epoch": 0.9510607015332914, "grad_norm": 0.96484375, "learning_rate": 0.0004915650230862634, "loss": 5.1948, "mean_token_accuracy": 0.1872086152434349, "num_tokens": 20893790.0, "step": 11320 }, { "entropy": 5.633654308319092, "epoch": 0.9514807813484563, "grad_norm": 0.98828125, "learning_rate": 0.0004915568984094854, "loss": 5.3178, "mean_token_accuracy": 0.17930887192487716, "num_tokens": 20902175.0, "step": 11325 }, { "entropy": 5.7570061683654785, "epoch": 0.951900861163621, "grad_norm": 1.046875, "learning_rate": 0.0004915487698965136, "loss": 5.4596, "mean_token_accuracy": 0.16509962379932402, "num_tokens": 20911484.0, "step": 11330 }, { "entropy": 5.739006423950196, "epoch": 0.952320940978786, "grad_norm": 0.9453125, "learning_rate": 0.0004915406375474917, "loss": 5.3914, "mean_token_accuracy": 0.17322561144828796, "num_tokens": 20920916.0, "step": 11335 }, { "entropy": 5.6644574165344235, "epoch": 0.9527410207939508, "grad_norm": 1.03125, "learning_rate": 0.000491532501362564, "loss": 5.4193, "mean_token_accuracy": 0.17596694231033325, "num_tokens": 20930219.0, "step": 11340 }, { "entropy": 5.62013874053955, "epoch": 0.9531611006091157, "grad_norm": 1.0, "learning_rate": 0.0004915243613418745, "loss": 5.2619, "mean_token_accuracy": 0.17849457859992982, "num_tokens": 20939591.0, "step": 11345 }, { "entropy": 5.714457321166992, "epoch": 0.9535811804242806, "grad_norm": 1.0234375, "learning_rate": 0.0004915162174855675, "loss": 5.4206, "mean_token_accuracy": 0.17033613920211793, "num_tokens": 20950035.0, "step": 11350 }, { "entropy": 5.666524171829224, "epoch": 0.9540012602394455, "grad_norm": 0.96484375, "learning_rate": 0.0004915080697937872, "loss": 5.3352, "mean_token_accuracy": 0.17541322708129883, "num_tokens": 20959168.0, "step": 11355 }, { "entropy": 5.617508268356323, "epoch": 0.9544213400546103, "grad_norm": 1.03125, "learning_rate": 0.0004914999182666779, "loss": 5.2468, "mean_token_accuracy": 0.18561951816082, "num_tokens": 20967887.0, "step": 11360 }, { "entropy": 5.735011196136474, "epoch": 0.9548414198697752, "grad_norm": 0.953125, "learning_rate": 0.0004914917629043839, "loss": 5.3584, "mean_token_accuracy": 0.17448266297578813, "num_tokens": 20977558.0, "step": 11365 }, { "entropy": 5.598196411132813, "epoch": 0.9552614996849401, "grad_norm": 1.0234375, "learning_rate": 0.00049148360370705, "loss": 5.3414, "mean_token_accuracy": 0.17673687338829042, "num_tokens": 20986118.0, "step": 11370 }, { "entropy": 5.655255603790283, "epoch": 0.955681579500105, "grad_norm": 0.9921875, "learning_rate": 0.0004914754406748204, "loss": 5.2536, "mean_token_accuracy": 0.18123535066843033, "num_tokens": 20994623.0, "step": 11375 }, { "entropy": 5.712370443344116, "epoch": 0.9561016593152699, "grad_norm": 0.875, "learning_rate": 0.00049146727380784, "loss": 5.4395, "mean_token_accuracy": 0.17601014971733092, "num_tokens": 21004193.0, "step": 11380 }, { "entropy": 5.649058437347412, "epoch": 0.9565217391304348, "grad_norm": 1.0, "learning_rate": 0.0004914591031062531, "loss": 5.275, "mean_token_accuracy": 0.18350904136896135, "num_tokens": 21013125.0, "step": 11385 }, { "entropy": 5.56644983291626, "epoch": 0.9569418189455997, "grad_norm": 0.99609375, "learning_rate": 0.0004914509285702048, "loss": 5.1701, "mean_token_accuracy": 0.1924222946166992, "num_tokens": 21021402.0, "step": 11390 }, { "entropy": 5.627006149291992, "epoch": 0.9573618987607645, "grad_norm": 1.0625, "learning_rate": 0.0004914427501998397, "loss": 5.2792, "mean_token_accuracy": 0.17939724922180175, "num_tokens": 21029639.0, "step": 11395 }, { "entropy": 5.64644079208374, "epoch": 0.9577819785759294, "grad_norm": 1.078125, "learning_rate": 0.0004914345679953027, "loss": 5.3127, "mean_token_accuracy": 0.18131378740072251, "num_tokens": 21037525.0, "step": 11400 }, { "entropy": 5.702707576751709, "epoch": 0.9582020583910943, "grad_norm": 1.046875, "learning_rate": 0.0004914263819567388, "loss": 5.4198, "mean_token_accuracy": 0.1680728331208229, "num_tokens": 21047702.0, "step": 11405 }, { "entropy": 5.72448697090149, "epoch": 0.9586221382062592, "grad_norm": 0.9609375, "learning_rate": 0.000491418192084293, "loss": 5.3037, "mean_token_accuracy": 0.1790571391582489, "num_tokens": 21056379.0, "step": 11410 }, { "entropy": 5.663040208816528, "epoch": 0.9590422180214241, "grad_norm": 0.9921875, "learning_rate": 0.0004914099983781104, "loss": 5.2916, "mean_token_accuracy": 0.18116774708032607, "num_tokens": 21065283.0, "step": 11415 }, { "entropy": 5.640819883346557, "epoch": 0.959462297836589, "grad_norm": 1.0546875, "learning_rate": 0.000491401800838336, "loss": 5.4156, "mean_token_accuracy": 0.17106927782297135, "num_tokens": 21074938.0, "step": 11420 }, { "entropy": 5.626005411148071, "epoch": 0.9598823776517539, "grad_norm": 0.9609375, "learning_rate": 0.0004913935994651153, "loss": 5.2661, "mean_token_accuracy": 0.1833704188466072, "num_tokens": 21084729.0, "step": 11425 }, { "entropy": 5.560522270202637, "epoch": 0.9603024574669187, "grad_norm": 1.0078125, "learning_rate": 0.0004913853942585932, "loss": 5.1866, "mean_token_accuracy": 0.18622586578130723, "num_tokens": 21093456.0, "step": 11430 }, { "entropy": 5.596753740310669, "epoch": 0.9607225372820836, "grad_norm": 1.0703125, "learning_rate": 0.0004913771852189155, "loss": 5.3026, "mean_token_accuracy": 0.17892259061336518, "num_tokens": 21102980.0, "step": 11435 }, { "entropy": 5.752729749679565, "epoch": 0.9611426170972485, "grad_norm": 0.9609375, "learning_rate": 0.0004913689723462271, "loss": 5.5279, "mean_token_accuracy": 0.18441201895475387, "num_tokens": 21112777.0, "step": 11440 }, { "entropy": 5.689970541000366, "epoch": 0.9615626969124134, "grad_norm": 1.1015625, "learning_rate": 0.000491360755640674, "loss": 5.4179, "mean_token_accuracy": 0.16814291030168532, "num_tokens": 21122139.0, "step": 11445 }, { "entropy": 5.662048196792602, "epoch": 0.9619827767275783, "grad_norm": 1.109375, "learning_rate": 0.0004913525351024014, "loss": 5.3053, "mean_token_accuracy": 0.17923009693622588, "num_tokens": 21131425.0, "step": 11450 }, { "entropy": 5.643110466003418, "epoch": 0.9624028565427432, "grad_norm": 0.9296875, "learning_rate": 0.0004913443107315552, "loss": 5.295, "mean_token_accuracy": 0.17333212941884996, "num_tokens": 21140784.0, "step": 11455 }, { "entropy": 5.650489091873169, "epoch": 0.962822936357908, "grad_norm": 1.03125, "learning_rate": 0.0004913360825282807, "loss": 5.3001, "mean_token_accuracy": 0.1825592339038849, "num_tokens": 21150408.0, "step": 11460 }, { "entropy": 5.642411518096924, "epoch": 0.9632430161730728, "grad_norm": 1.0078125, "learning_rate": 0.000491327850492724, "loss": 5.4027, "mean_token_accuracy": 0.17479322254657745, "num_tokens": 21158915.0, "step": 11465 }, { "entropy": 5.545637416839599, "epoch": 0.9636630959882377, "grad_norm": 1.0078125, "learning_rate": 0.0004913196146250309, "loss": 5.1864, "mean_token_accuracy": 0.18869672417640687, "num_tokens": 21167336.0, "step": 11470 }, { "entropy": 5.704313564300537, "epoch": 0.9640831758034026, "grad_norm": 0.953125, "learning_rate": 0.0004913113749253472, "loss": 5.4864, "mean_token_accuracy": 0.1758169025182724, "num_tokens": 21177499.0, "step": 11475 }, { "entropy": 5.789436292648316, "epoch": 0.9645032556185675, "grad_norm": 1.0390625, "learning_rate": 0.0004913031313938188, "loss": 5.4024, "mean_token_accuracy": 0.17208625823259355, "num_tokens": 21186961.0, "step": 11480 }, { "entropy": 5.6955101013183596, "epoch": 0.9649233354337324, "grad_norm": 0.90234375, "learning_rate": 0.0004912948840305919, "loss": 5.3024, "mean_token_accuracy": 0.18843505382537842, "num_tokens": 21196364.0, "step": 11485 }, { "entropy": 5.644700050354004, "epoch": 0.9653434152488973, "grad_norm": 0.984375, "learning_rate": 0.0004912866328358125, "loss": 5.3754, "mean_token_accuracy": 0.17554905414581298, "num_tokens": 21206376.0, "step": 11490 }, { "entropy": 5.671500587463379, "epoch": 0.9657634950640621, "grad_norm": 0.91796875, "learning_rate": 0.0004912783778096266, "loss": 5.3349, "mean_token_accuracy": 0.1828784391283989, "num_tokens": 21215889.0, "step": 11495 }, { "entropy": 5.7075950622558596, "epoch": 0.966183574879227, "grad_norm": 1.03125, "learning_rate": 0.0004912701189521808, "loss": 5.358, "mean_token_accuracy": 0.18151413798332214, "num_tokens": 21224959.0, "step": 11500 }, { "entropy": 5.7311769962310795, "epoch": 0.9666036546943919, "grad_norm": 1.0078125, "learning_rate": 0.0004912618562636211, "loss": 5.4662, "mean_token_accuracy": 0.16880833953619004, "num_tokens": 21234495.0, "step": 11505 }, { "entropy": 5.6514808177948, "epoch": 0.9670237345095568, "grad_norm": 0.91796875, "learning_rate": 0.000491253589744094, "loss": 5.3163, "mean_token_accuracy": 0.17495938539505004, "num_tokens": 21244555.0, "step": 11510 }, { "entropy": 5.738853120803833, "epoch": 0.9674438143247217, "grad_norm": 0.984375, "learning_rate": 0.0004912453193937459, "loss": 5.4387, "mean_token_accuracy": 0.17752114236354827, "num_tokens": 21254199.0, "step": 11515 }, { "entropy": 5.684632968902588, "epoch": 0.9678638941398866, "grad_norm": 0.98828125, "learning_rate": 0.0004912370452127234, "loss": 5.3596, "mean_token_accuracy": 0.17345560640096663, "num_tokens": 21262723.0, "step": 11520 }, { "entropy": 5.676412582397461, "epoch": 0.9682839739550515, "grad_norm": 1.0234375, "learning_rate": 0.0004912287672011728, "loss": 5.2716, "mean_token_accuracy": 0.1829858049750328, "num_tokens": 21271283.0, "step": 11525 }, { "entropy": 5.622737216949463, "epoch": 0.9687040537702163, "grad_norm": 0.9765625, "learning_rate": 0.0004912204853592411, "loss": 5.3172, "mean_token_accuracy": 0.19032842218875884, "num_tokens": 21279542.0, "step": 11530 }, { "entropy": 5.604928398132325, "epoch": 0.9691241335853812, "grad_norm": 1.015625, "learning_rate": 0.0004912121996870748, "loss": 5.2971, "mean_token_accuracy": 0.18236760497093202, "num_tokens": 21288678.0, "step": 11535 }, { "entropy": 5.696911954879761, "epoch": 0.9695442134005461, "grad_norm": 0.97265625, "learning_rate": 0.0004912039101848207, "loss": 5.4611, "mean_token_accuracy": 0.17298222482204437, "num_tokens": 21298982.0, "step": 11540 }, { "entropy": 5.66067361831665, "epoch": 0.969964293215711, "grad_norm": 1.015625, "learning_rate": 0.0004911956168526257, "loss": 5.3955, "mean_token_accuracy": 0.17378579676151276, "num_tokens": 21307663.0, "step": 11545 }, { "entropy": 5.667431020736695, "epoch": 0.9703843730308759, "grad_norm": 0.9609375, "learning_rate": 0.0004911873196906366, "loss": 5.3925, "mean_token_accuracy": 0.16927654594182967, "num_tokens": 21318004.0, "step": 11550 }, { "entropy": 5.5790492534637455, "epoch": 0.9708044528460408, "grad_norm": 0.9375, "learning_rate": 0.0004911790186990005, "loss": 5.2167, "mean_token_accuracy": 0.19276839643716812, "num_tokens": 21327373.0, "step": 11555 }, { "entropy": 5.624707221984863, "epoch": 0.9712245326612057, "grad_norm": 1.0546875, "learning_rate": 0.0004911707138778643, "loss": 5.3048, "mean_token_accuracy": 0.18128982037305832, "num_tokens": 21335654.0, "step": 11560 }, { "entropy": 5.70596866607666, "epoch": 0.9716446124763705, "grad_norm": 1.03125, "learning_rate": 0.0004911624052273754, "loss": 5.3675, "mean_token_accuracy": 0.17856080830097198, "num_tokens": 21344464.0, "step": 11565 }, { "entropy": 5.74373345375061, "epoch": 0.9720646922915354, "grad_norm": 0.97265625, "learning_rate": 0.0004911540927476807, "loss": 5.4562, "mean_token_accuracy": 0.17238861620426177, "num_tokens": 21354121.0, "step": 11570 }, { "entropy": 5.708244943618775, "epoch": 0.9724847721067003, "grad_norm": 1.03125, "learning_rate": 0.0004911457764389275, "loss": 5.3905, "mean_token_accuracy": 0.18018527776002885, "num_tokens": 21363395.0, "step": 11575 }, { "entropy": 5.645134735107422, "epoch": 0.9729048519218652, "grad_norm": 1.0390625, "learning_rate": 0.0004911374563012633, "loss": 5.3571, "mean_token_accuracy": 0.17058224081993104, "num_tokens": 21372126.0, "step": 11580 }, { "entropy": 5.731898832321167, "epoch": 0.97332493173703, "grad_norm": 1.0, "learning_rate": 0.0004911291323348352, "loss": 5.4266, "mean_token_accuracy": 0.17140968888998032, "num_tokens": 21380554.0, "step": 11585 }, { "entropy": 5.674185276031494, "epoch": 0.973745011552195, "grad_norm": 0.9765625, "learning_rate": 0.0004911208045397909, "loss": 5.3152, "mean_token_accuracy": 0.178539414703846, "num_tokens": 21389317.0, "step": 11590 }, { "entropy": 5.729868316650391, "epoch": 0.9741650913673598, "grad_norm": 1.1015625, "learning_rate": 0.0004911124729162778, "loss": 5.4329, "mean_token_accuracy": 0.16993758529424668, "num_tokens": 21398926.0, "step": 11595 }, { "entropy": 5.665419292449951, "epoch": 0.9745851711825246, "grad_norm": 1.046875, "learning_rate": 0.0004911041374644435, "loss": 5.2406, "mean_token_accuracy": 0.18590443730354309, "num_tokens": 21406962.0, "step": 11600 }, { "entropy": 5.660564422607422, "epoch": 0.9750052509976895, "grad_norm": 0.96875, "learning_rate": 0.0004910957981844357, "loss": 5.3385, "mean_token_accuracy": 0.1784702643752098, "num_tokens": 21415868.0, "step": 11605 }, { "entropy": 5.745399808883667, "epoch": 0.9754253308128544, "grad_norm": 1.0078125, "learning_rate": 0.0004910874550764022, "loss": 5.4583, "mean_token_accuracy": 0.17337681502103805, "num_tokens": 21424544.0, "step": 11610 }, { "entropy": 5.620577812194824, "epoch": 0.9758454106280193, "grad_norm": 1.0390625, "learning_rate": 0.0004910791081404907, "loss": 5.3235, "mean_token_accuracy": 0.1821622669696808, "num_tokens": 21433589.0, "step": 11615 }, { "entropy": 5.664074945449829, "epoch": 0.9762654904431842, "grad_norm": 1.078125, "learning_rate": 0.0004910707573768489, "loss": 5.3867, "mean_token_accuracy": 0.17547966092824935, "num_tokens": 21442084.0, "step": 11620 }, { "entropy": 5.6204887390136715, "epoch": 0.9766855702583491, "grad_norm": 1.046875, "learning_rate": 0.0004910624027856251, "loss": 5.2919, "mean_token_accuracy": 0.17549517452716829, "num_tokens": 21450962.0, "step": 11625 }, { "entropy": 5.698890447616577, "epoch": 0.977105650073514, "grad_norm": 0.984375, "learning_rate": 0.0004910540443669669, "loss": 5.3754, "mean_token_accuracy": 0.17353478074073792, "num_tokens": 21461322.0, "step": 11630 }, { "entropy": 5.668360900878906, "epoch": 0.9775257298886788, "grad_norm": 0.99609375, "learning_rate": 0.0004910456821210227, "loss": 5.3737, "mean_token_accuracy": 0.18053041100502015, "num_tokens": 21470800.0, "step": 11635 }, { "entropy": 5.628610229492187, "epoch": 0.9779458097038437, "grad_norm": 0.9296875, "learning_rate": 0.0004910373160479404, "loss": 5.2405, "mean_token_accuracy": 0.18680354207754135, "num_tokens": 21479707.0, "step": 11640 }, { "entropy": 5.662050867080689, "epoch": 0.9783658895190086, "grad_norm": 1.046875, "learning_rate": 0.0004910289461478683, "loss": 5.4125, "mean_token_accuracy": 0.17121653407812118, "num_tokens": 21489469.0, "step": 11645 }, { "entropy": 5.697856140136719, "epoch": 0.9787859693341735, "grad_norm": 1.0, "learning_rate": 0.0004910205724209547, "loss": 5.3586, "mean_token_accuracy": 0.17544171512126921, "num_tokens": 21499226.0, "step": 11650 }, { "entropy": 5.586391115188599, "epoch": 0.9792060491493384, "grad_norm": 0.94921875, "learning_rate": 0.0004910121948673478, "loss": 5.2533, "mean_token_accuracy": 0.18253547847270965, "num_tokens": 21508129.0, "step": 11655 }, { "entropy": 5.639061975479126, "epoch": 0.9796261289645033, "grad_norm": 1.0859375, "learning_rate": 0.0004910038134871962, "loss": 5.2927, "mean_token_accuracy": 0.186747707426548, "num_tokens": 21516293.0, "step": 11660 }, { "entropy": 5.7256580829620365, "epoch": 0.9800462087796681, "grad_norm": 1.0625, "learning_rate": 0.0004909954282806482, "loss": 5.4285, "mean_token_accuracy": 0.17556715309619902, "num_tokens": 21525393.0, "step": 11665 }, { "entropy": 5.605647945404053, "epoch": 0.980466288594833, "grad_norm": 0.921875, "learning_rate": 0.0004909870392478524, "loss": 5.3009, "mean_token_accuracy": 0.17743275314569473, "num_tokens": 21534585.0, "step": 11670 }, { "entropy": 5.609653759002685, "epoch": 0.9808863684099979, "grad_norm": 0.98828125, "learning_rate": 0.0004909786463889575, "loss": 5.2184, "mean_token_accuracy": 0.18412292003631592, "num_tokens": 21542947.0, "step": 11675 }, { "entropy": 5.66112699508667, "epoch": 0.9813064482251628, "grad_norm": 0.98046875, "learning_rate": 0.0004909702497041121, "loss": 5.3409, "mean_token_accuracy": 0.17706906646490098, "num_tokens": 21552168.0, "step": 11680 }, { "entropy": 5.675204563140869, "epoch": 0.9817265280403277, "grad_norm": 0.94140625, "learning_rate": 0.0004909618491934648, "loss": 5.3491, "mean_token_accuracy": 0.1800813615322113, "num_tokens": 21562131.0, "step": 11685 }, { "entropy": 5.632400417327881, "epoch": 0.9821466078554926, "grad_norm": 1.0234375, "learning_rate": 0.0004909534448571647, "loss": 5.3037, "mean_token_accuracy": 0.18071361035108566, "num_tokens": 21571363.0, "step": 11690 }, { "entropy": 5.645296096801758, "epoch": 0.9825666876706575, "grad_norm": 0.98046875, "learning_rate": 0.0004909450366953604, "loss": 5.2715, "mean_token_accuracy": 0.18449807465076445, "num_tokens": 21580754.0, "step": 11695 }, { "entropy": 5.666966485977173, "epoch": 0.9829867674858223, "grad_norm": 1.0, "learning_rate": 0.000490936624708201, "loss": 5.3949, "mean_token_accuracy": 0.17284763008356094, "num_tokens": 21590053.0, "step": 11700 }, { "entropy": 5.675799751281739, "epoch": 0.9834068473009872, "grad_norm": 0.9921875, "learning_rate": 0.0004909282088958356, "loss": 5.3223, "mean_token_accuracy": 0.18017573952674865, "num_tokens": 21598681.0, "step": 11705 }, { "entropy": 5.69394702911377, "epoch": 0.983826927116152, "grad_norm": 1.0234375, "learning_rate": 0.000490919789258413, "loss": 5.342, "mean_token_accuracy": 0.1870078831911087, "num_tokens": 21607465.0, "step": 11710 }, { "entropy": 5.680866336822509, "epoch": 0.984247006931317, "grad_norm": 0.98046875, "learning_rate": 0.0004909113657960826, "loss": 5.4085, "mean_token_accuracy": 0.17005282640457153, "num_tokens": 21617480.0, "step": 11715 }, { "entropy": 5.646340274810791, "epoch": 0.9846670867464818, "grad_norm": 1.125, "learning_rate": 0.0004909029385089935, "loss": 5.3526, "mean_token_accuracy": 0.18036698400974274, "num_tokens": 21626434.0, "step": 11720 }, { "entropy": 5.669216203689575, "epoch": 0.9850871665616467, "grad_norm": 0.99609375, "learning_rate": 0.000490894507397295, "loss": 5.3356, "mean_token_accuracy": 0.1819071114063263, "num_tokens": 21635627.0, "step": 11725 }, { "entropy": 5.706068801879883, "epoch": 0.9855072463768116, "grad_norm": 0.984375, "learning_rate": 0.0004908860724611365, "loss": 5.3356, "mean_token_accuracy": 0.17555247396230697, "num_tokens": 21644789.0, "step": 11730 }, { "entropy": 5.597310876846313, "epoch": 0.9859273261919764, "grad_norm": 0.98828125, "learning_rate": 0.0004908776337006675, "loss": 5.3253, "mean_token_accuracy": 0.1750568062067032, "num_tokens": 21653696.0, "step": 11735 }, { "entropy": 5.685029315948486, "epoch": 0.9863474060071413, "grad_norm": 1.03125, "learning_rate": 0.0004908691911160373, "loss": 5.3195, "mean_token_accuracy": 0.1678353115916252, "num_tokens": 21664420.0, "step": 11740 }, { "entropy": 5.644360589981079, "epoch": 0.9867674858223062, "grad_norm": 1.0625, "learning_rate": 0.0004908607447073954, "loss": 5.3186, "mean_token_accuracy": 0.1803464934229851, "num_tokens": 21673716.0, "step": 11745 }, { "entropy": 5.6491326808929445, "epoch": 0.9871875656374711, "grad_norm": 1.0703125, "learning_rate": 0.0004908522944748917, "loss": 5.3159, "mean_token_accuracy": 0.18487760573625564, "num_tokens": 21682860.0, "step": 11750 }, { "entropy": 5.5865403175354, "epoch": 0.987607645452636, "grad_norm": 1.0, "learning_rate": 0.0004908438404186758, "loss": 5.3091, "mean_token_accuracy": 0.18622705042362214, "num_tokens": 21691915.0, "step": 11755 }, { "entropy": 5.709643268585205, "epoch": 0.9880277252678009, "grad_norm": 1.0234375, "learning_rate": 0.0004908353825388973, "loss": 5.4251, "mean_token_accuracy": 0.17699630111455916, "num_tokens": 21701666.0, "step": 11760 }, { "entropy": 5.754273891448975, "epoch": 0.9884478050829658, "grad_norm": 1.203125, "learning_rate": 0.0004908269208357062, "loss": 5.3677, "mean_token_accuracy": 0.18144619911909105, "num_tokens": 21709267.0, "step": 11765 }, { "entropy": 5.631012916564941, "epoch": 0.9888678848981306, "grad_norm": 1.046875, "learning_rate": 0.0004908184553092523, "loss": 5.2503, "mean_token_accuracy": 0.1789179801940918, "num_tokens": 21718117.0, "step": 11770 }, { "entropy": 5.675482130050659, "epoch": 0.9892879647132955, "grad_norm": 1.0234375, "learning_rate": 0.0004908099859596856, "loss": 5.3874, "mean_token_accuracy": 0.18255705237388611, "num_tokens": 21727952.0, "step": 11775 }, { "entropy": 5.686703014373779, "epoch": 0.9897080445284604, "grad_norm": 0.95703125, "learning_rate": 0.0004908015127871561, "loss": 5.2596, "mean_token_accuracy": 0.17840789407491683, "num_tokens": 21737878.0, "step": 11780 }, { "entropy": 5.5771726131439205, "epoch": 0.9901281243436253, "grad_norm": 0.9921875, "learning_rate": 0.000490793035791814, "loss": 5.2135, "mean_token_accuracy": 0.18143144994974136, "num_tokens": 21747391.0, "step": 11785 }, { "entropy": 5.593372869491577, "epoch": 0.9905482041587902, "grad_norm": 1.03125, "learning_rate": 0.0004907845549738093, "loss": 5.24, "mean_token_accuracy": 0.1855255201458931, "num_tokens": 21756791.0, "step": 11790 }, { "entropy": 5.579425954818726, "epoch": 0.9909682839739551, "grad_norm": 1.046875, "learning_rate": 0.0004907760703332923, "loss": 5.3041, "mean_token_accuracy": 0.1782523825764656, "num_tokens": 21766020.0, "step": 11795 }, { "entropy": 5.707907199859619, "epoch": 0.99138836378912, "grad_norm": 1.0859375, "learning_rate": 0.0004907675818704134, "loss": 5.3968, "mean_token_accuracy": 0.1679917022585869, "num_tokens": 21775895.0, "step": 11800 }, { "entropy": 5.667204332351685, "epoch": 0.9918084436042848, "grad_norm": 1.0625, "learning_rate": 0.0004907590895853228, "loss": 5.3305, "mean_token_accuracy": 0.18071042597293854, "num_tokens": 21784543.0, "step": 11805 }, { "entropy": 5.641284465789795, "epoch": 0.9922285234194497, "grad_norm": 0.97265625, "learning_rate": 0.0004907505934781712, "loss": 5.373, "mean_token_accuracy": 0.17214979678392411, "num_tokens": 21793938.0, "step": 11810 }, { "entropy": 5.696785974502563, "epoch": 0.9926486032346146, "grad_norm": 0.96875, "learning_rate": 0.0004907420935491087, "loss": 5.3391, "mean_token_accuracy": 0.1736621305346489, "num_tokens": 21803641.0, "step": 11815 }, { "entropy": 5.6910813331604, "epoch": 0.9930686830497795, "grad_norm": 0.93359375, "learning_rate": 0.0004907335897982862, "loss": 5.2645, "mean_token_accuracy": 0.17967502921819686, "num_tokens": 21812542.0, "step": 11820 }, { "entropy": 5.608026647567749, "epoch": 0.9934887628649444, "grad_norm": 1.0, "learning_rate": 0.0004907250822258543, "loss": 5.3315, "mean_token_accuracy": 0.17752791941165924, "num_tokens": 21821847.0, "step": 11825 }, { "entropy": 5.746523666381836, "epoch": 0.9939088426801093, "grad_norm": 1.078125, "learning_rate": 0.0004907165708319637, "loss": 5.3854, "mean_token_accuracy": 0.17786721736192704, "num_tokens": 21830799.0, "step": 11830 }, { "entropy": 5.647748279571533, "epoch": 0.994328922495274, "grad_norm": 1.0390625, "learning_rate": 0.0004907080556167651, "loss": 5.3197, "mean_token_accuracy": 0.17962961047887802, "num_tokens": 21840202.0, "step": 11835 }, { "entropy": 5.7122509479522705, "epoch": 0.994749002310439, "grad_norm": 0.9375, "learning_rate": 0.0004906995365804093, "loss": 5.4057, "mean_token_accuracy": 0.17563900649547576, "num_tokens": 21849701.0, "step": 11840 }, { "entropy": 5.69367208480835, "epoch": 0.9951690821256038, "grad_norm": 1.0078125, "learning_rate": 0.0004906910137230472, "loss": 5.2983, "mean_token_accuracy": 0.1831248253583908, "num_tokens": 21859191.0, "step": 11845 }, { "entropy": 5.678677701950074, "epoch": 0.9955891619407687, "grad_norm": 0.9921875, "learning_rate": 0.00049068248704483, "loss": 5.2895, "mean_token_accuracy": 0.17774451822042464, "num_tokens": 21867944.0, "step": 11850 }, { "entropy": 5.632848024368286, "epoch": 0.9960092417559336, "grad_norm": 1.0078125, "learning_rate": 0.0004906739565459085, "loss": 5.3152, "mean_token_accuracy": 0.17424624115228654, "num_tokens": 21876368.0, "step": 11855 }, { "entropy": 5.751575374603272, "epoch": 0.9964293215710985, "grad_norm": 0.98046875, "learning_rate": 0.000490665422226434, "loss": 5.4233, "mean_token_accuracy": 0.17138303816318512, "num_tokens": 21885634.0, "step": 11860 }, { "entropy": 5.602681446075439, "epoch": 0.9968494013862634, "grad_norm": 1.0234375, "learning_rate": 0.0004906568840865576, "loss": 5.239, "mean_token_accuracy": 0.18423720300197602, "num_tokens": 21894315.0, "step": 11865 }, { "entropy": 5.6025186538696286, "epoch": 0.9972694812014282, "grad_norm": 1.03125, "learning_rate": 0.0004906483421264305, "loss": 5.3356, "mean_token_accuracy": 0.1830558404326439, "num_tokens": 21903342.0, "step": 11870 }, { "entropy": 5.6820799827575685, "epoch": 0.9976895610165931, "grad_norm": 0.98046875, "learning_rate": 0.000490639796346204, "loss": 5.4612, "mean_token_accuracy": 0.16753632724285125, "num_tokens": 21914158.0, "step": 11875 }, { "entropy": 5.718638610839844, "epoch": 0.998109640831758, "grad_norm": 1.0078125, "learning_rate": 0.0004906312467460297, "loss": 5.3486, "mean_token_accuracy": 0.18104548901319503, "num_tokens": 21922639.0, "step": 11880 }, { "entropy": 5.671116209030151, "epoch": 0.9985297206469229, "grad_norm": 1.0, "learning_rate": 0.0004906226933260588, "loss": 5.3139, "mean_token_accuracy": 0.17961958646774293, "num_tokens": 21931385.0, "step": 11885 }, { "entropy": 5.712464618682861, "epoch": 0.9989498004620878, "grad_norm": 0.9609375, "learning_rate": 0.0004906141360864429, "loss": 5.3357, "mean_token_accuracy": 0.1744022250175476, "num_tokens": 21940788.0, "step": 11890 }, { "entropy": 5.693852710723877, "epoch": 0.9993698802772527, "grad_norm": 0.96484375, "learning_rate": 0.0004906055750273336, "loss": 5.3605, "mean_token_accuracy": 0.17913548946380614, "num_tokens": 21950309.0, "step": 11895 }, { "entropy": 5.665466785430908, "epoch": 0.9997899600924176, "grad_norm": 1.0546875, "learning_rate": 0.0004905970101488826, "loss": 5.3565, "mean_token_accuracy": 0.18021825104951858, "num_tokens": 21959141.0, "step": 11900 }, { "entropy": 5.7541399002075195, "epoch": 1.000168031926066, "grad_norm": 1.0078125, "learning_rate": 0.0004905884414512416, "loss": 5.3852, "mean_token_accuracy": 0.18165812393029532, "num_tokens": 21966665.0, "step": 11905 }, { "entropy": 5.703131437301636, "epoch": 1.0005881117412307, "grad_norm": 0.953125, "learning_rate": 0.0004905798689345623, "loss": 5.3276, "mean_token_accuracy": 0.1824957937002182, "num_tokens": 21976728.0, "step": 11910 }, { "entropy": 5.601657056808472, "epoch": 1.0010081915563958, "grad_norm": 1.015625, "learning_rate": 0.0004905712925989968, "loss": 5.1849, "mean_token_accuracy": 0.18301486372947692, "num_tokens": 21985915.0, "step": 11915 }, { "entropy": 5.67268443107605, "epoch": 1.0014282713715605, "grad_norm": 0.86328125, "learning_rate": 0.0004905627124446967, "loss": 5.2383, "mean_token_accuracy": 0.1821852833032608, "num_tokens": 21995826.0, "step": 11920 }, { "entropy": 5.627767324447632, "epoch": 1.0018483511867255, "grad_norm": 1.03125, "learning_rate": 0.0004905541284718142, "loss": 5.1947, "mean_token_accuracy": 0.18410852551460266, "num_tokens": 22005299.0, "step": 11925 }, { "entropy": 5.6066601276397705, "epoch": 1.0022684310018903, "grad_norm": 1.0625, "learning_rate": 0.0004905455406805011, "loss": 5.2333, "mean_token_accuracy": 0.18102063089609147, "num_tokens": 22014499.0, "step": 11930 }, { "entropy": 5.689736795425415, "epoch": 1.0026885108170553, "grad_norm": 0.94921875, "learning_rate": 0.00049053694907091, "loss": 5.3729, "mean_token_accuracy": 0.1760319471359253, "num_tokens": 22024531.0, "step": 11935 }, { "entropy": 5.696668386459351, "epoch": 1.0031085906322201, "grad_norm": 0.95703125, "learning_rate": 0.0004905283536431928, "loss": 5.2797, "mean_token_accuracy": 0.180125692486763, "num_tokens": 22034036.0, "step": 11940 }, { "entropy": 5.648813676834107, "epoch": 1.003528670447385, "grad_norm": 1.0390625, "learning_rate": 0.0004905197543975017, "loss": 5.1981, "mean_token_accuracy": 0.1830651804804802, "num_tokens": 22042910.0, "step": 11945 }, { "entropy": 5.69420485496521, "epoch": 1.00394875026255, "grad_norm": 0.92578125, "learning_rate": 0.0004905111513339892, "loss": 5.2861, "mean_token_accuracy": 0.18227915614843368, "num_tokens": 22052242.0, "step": 11950 }, { "entropy": 5.6567305564880375, "epoch": 1.0043688300777147, "grad_norm": 1.03125, "learning_rate": 0.0004905025444528076, "loss": 5.2644, "mean_token_accuracy": 0.17697068005800248, "num_tokens": 22061467.0, "step": 11955 }, { "entropy": 5.559281492233277, "epoch": 1.0047889098928797, "grad_norm": 1.0, "learning_rate": 0.0004904939337541093, "loss": 5.1082, "mean_token_accuracy": 0.18907876312732697, "num_tokens": 22070300.0, "step": 11960 }, { "entropy": 5.689043378829956, "epoch": 1.0052089897080445, "grad_norm": 1.0, "learning_rate": 0.0004904853192380472, "loss": 5.2949, "mean_token_accuracy": 0.18283973634243011, "num_tokens": 22078960.0, "step": 11965 }, { "entropy": 5.659601259231567, "epoch": 1.0056290695232095, "grad_norm": 0.88671875, "learning_rate": 0.0004904767009047733, "loss": 5.2121, "mean_token_accuracy": 0.1838981106877327, "num_tokens": 22088135.0, "step": 11970 }, { "entropy": 5.6801165580749515, "epoch": 1.0060491493383743, "grad_norm": 0.94921875, "learning_rate": 0.0004904680787544408, "loss": 5.3337, "mean_token_accuracy": 0.18077797293663025, "num_tokens": 22098004.0, "step": 11975 }, { "entropy": 5.735361003875733, "epoch": 1.006469229153539, "grad_norm": 0.94921875, "learning_rate": 0.0004904594527872022, "loss": 5.3109, "mean_token_accuracy": 0.17933668792247773, "num_tokens": 22107680.0, "step": 11980 }, { "entropy": 5.667607593536377, "epoch": 1.006889308968704, "grad_norm": 0.90234375, "learning_rate": 0.0004904508230032103, "loss": 5.2939, "mean_token_accuracy": 0.18030537217855452, "num_tokens": 22118004.0, "step": 11985 }, { "entropy": 5.6297767639160154, "epoch": 1.0073093887838689, "grad_norm": 1.015625, "learning_rate": 0.000490442189402618, "loss": 5.2497, "mean_token_accuracy": 0.1915082961320877, "num_tokens": 22127825.0, "step": 11990 }, { "entropy": 5.662150526046753, "epoch": 1.007729468599034, "grad_norm": 1.046875, "learning_rate": 0.0004904335519855783, "loss": 5.1893, "mean_token_accuracy": 0.18411701321601867, "num_tokens": 22136448.0, "step": 11995 }, { "entropy": 5.627398061752319, "epoch": 1.0081495484141987, "grad_norm": 1.0078125, "learning_rate": 0.0004904249107522442, "loss": 5.2866, "mean_token_accuracy": 0.17936622649431228, "num_tokens": 22146415.0, "step": 12000 }, { "epoch": 1.0081495484141987, "eval_entropy": 5.37550251807015, "eval_loss": 5.336048603057861, "eval_mean_token_accuracy": 0.18655346389568814, "eval_num_tokens": 22146415.0, "eval_runtime": 20.9039, "eval_samples_per_second": 1787.516, "eval_steps_per_second": 223.451, "step": 12000 }, { "entropy": 5.69142255783081, "epoch": 1.0085696282293637, "grad_norm": 1.1328125, "learning_rate": 0.0004904162657027685, "loss": 5.4208, "mean_token_accuracy": 0.18247728198766708, "num_tokens": 22156327.0, "step": 12005 }, { "entropy": 5.618323469161988, "epoch": 1.0089897080445285, "grad_norm": 0.9609375, "learning_rate": 0.0004904076168373049, "loss": 5.2222, "mean_token_accuracy": 0.18440109342336655, "num_tokens": 22165677.0, "step": 12010 }, { "entropy": 5.702277994155883, "epoch": 1.0094097878596933, "grad_norm": 1.0, "learning_rate": 0.0004903989641560061, "loss": 5.3474, "mean_token_accuracy": 0.18221616595983506, "num_tokens": 22175232.0, "step": 12015 }, { "entropy": 5.701148557662964, "epoch": 1.0098298676748583, "grad_norm": 1.0703125, "learning_rate": 0.0004903903076590256, "loss": 5.2257, "mean_token_accuracy": 0.18296757191419602, "num_tokens": 22184026.0, "step": 12020 }, { "entropy": 5.555580949783325, "epoch": 1.010249947490023, "grad_norm": 1.109375, "learning_rate": 0.0004903816473465167, "loss": 5.1174, "mean_token_accuracy": 0.19558341354131697, "num_tokens": 22192020.0, "step": 12025 }, { "entropy": 5.533420181274414, "epoch": 1.010670027305188, "grad_norm": 1.0, "learning_rate": 0.0004903729832186328, "loss": 5.1182, "mean_token_accuracy": 0.19409437328577042, "num_tokens": 22200060.0, "step": 12030 }, { "entropy": 5.596082401275635, "epoch": 1.0110901071203529, "grad_norm": 0.9921875, "learning_rate": 0.0004903643152755274, "loss": 5.1719, "mean_token_accuracy": 0.18199314922094345, "num_tokens": 22208625.0, "step": 12035 }, { "entropy": 5.626865577697754, "epoch": 1.0115101869355176, "grad_norm": 0.94140625, "learning_rate": 0.0004903556435173541, "loss": 5.1519, "mean_token_accuracy": 0.1900147467851639, "num_tokens": 22217781.0, "step": 12040 }, { "entropy": 5.654424047470092, "epoch": 1.0119302667506826, "grad_norm": 1.0390625, "learning_rate": 0.0004903469679442665, "loss": 5.2743, "mean_token_accuracy": 0.1805676445364952, "num_tokens": 22226432.0, "step": 12045 }, { "entropy": 5.611930418014526, "epoch": 1.0123503465658474, "grad_norm": 1.03125, "learning_rate": 0.0004903382885564181, "loss": 5.2937, "mean_token_accuracy": 0.1801860436797142, "num_tokens": 22234811.0, "step": 12050 }, { "entropy": 5.581429290771484, "epoch": 1.0127704263810124, "grad_norm": 0.9921875, "learning_rate": 0.000490329605353963, "loss": 5.1895, "mean_token_accuracy": 0.18706519454717635, "num_tokens": 22242808.0, "step": 12055 }, { "entropy": 5.706435203552246, "epoch": 1.0131905061961772, "grad_norm": 1.046875, "learning_rate": 0.0004903209183370547, "loss": 5.2376, "mean_token_accuracy": 0.18125460147857667, "num_tokens": 22251371.0, "step": 12060 }, { "entropy": 5.732815551757812, "epoch": 1.0136105860113422, "grad_norm": 0.9765625, "learning_rate": 0.0004903122275058472, "loss": 5.3142, "mean_token_accuracy": 0.18328930288553238, "num_tokens": 22260868.0, "step": 12065 }, { "entropy": 5.570784854888916, "epoch": 1.014030665826507, "grad_norm": 0.96875, "learning_rate": 0.0004903035328604944, "loss": 5.198, "mean_token_accuracy": 0.18229980170726776, "num_tokens": 22270554.0, "step": 12070 }, { "entropy": 5.592981767654419, "epoch": 1.0144507456416718, "grad_norm": 1.1015625, "learning_rate": 0.0004902948344011506, "loss": 5.2136, "mean_token_accuracy": 0.18296387195587158, "num_tokens": 22279170.0, "step": 12075 }, { "entropy": 5.7283127784729, "epoch": 1.0148708254568368, "grad_norm": 1.0625, "learning_rate": 0.0004902861321279694, "loss": 5.3628, "mean_token_accuracy": 0.1767708644270897, "num_tokens": 22288788.0, "step": 12080 }, { "entropy": 5.617083263397217, "epoch": 1.0152909052720016, "grad_norm": 1.0390625, "learning_rate": 0.0004902774260411055, "loss": 5.1577, "mean_token_accuracy": 0.1870339885354042, "num_tokens": 22297501.0, "step": 12085 }, { "entropy": 5.600834703445434, "epoch": 1.0157109850871666, "grad_norm": 1.0546875, "learning_rate": 0.0004902687161407126, "loss": 5.1099, "mean_token_accuracy": 0.19561174958944322, "num_tokens": 22306181.0, "step": 12090 }, { "entropy": 5.589878129959106, "epoch": 1.0161310649023314, "grad_norm": 1.03125, "learning_rate": 0.0004902600024269454, "loss": 5.247, "mean_token_accuracy": 0.1891962394118309, "num_tokens": 22315762.0, "step": 12095 }, { "entropy": 5.5894256114959715, "epoch": 1.0165511447174964, "grad_norm": 1.046875, "learning_rate": 0.000490251284899958, "loss": 5.2192, "mean_token_accuracy": 0.1884337618947029, "num_tokens": 22325127.0, "step": 12100 }, { "entropy": 5.599499702453613, "epoch": 1.0169712245326612, "grad_norm": 1.0625, "learning_rate": 0.000490242563559905, "loss": 5.2982, "mean_token_accuracy": 0.18237145394086837, "num_tokens": 22334038.0, "step": 12105 }, { "entropy": 5.6348641872406, "epoch": 1.017391304347826, "grad_norm": 1.1171875, "learning_rate": 0.0004902338384069408, "loss": 5.1622, "mean_token_accuracy": 0.18557495176792144, "num_tokens": 22342658.0, "step": 12110 }, { "entropy": 5.707513093948364, "epoch": 1.017811384162991, "grad_norm": 0.98046875, "learning_rate": 0.00049022510944122, "loss": 5.3215, "mean_token_accuracy": 0.17845727652311325, "num_tokens": 22352559.0, "step": 12115 }, { "entropy": 5.7064680576324465, "epoch": 1.0182314639781558, "grad_norm": 1.0, "learning_rate": 0.0004902163766628972, "loss": 5.2377, "mean_token_accuracy": 0.18466448336839675, "num_tokens": 22361455.0, "step": 12120 }, { "entropy": 5.698122215270996, "epoch": 1.0186515437933208, "grad_norm": 1.0078125, "learning_rate": 0.0004902076400721271, "loss": 5.2449, "mean_token_accuracy": 0.1822131395339966, "num_tokens": 22371163.0, "step": 12125 }, { "entropy": 5.708817052841186, "epoch": 1.0190716236084856, "grad_norm": 1.0390625, "learning_rate": 0.0004901988996690645, "loss": 5.2641, "mean_token_accuracy": 0.18752802163362503, "num_tokens": 22379975.0, "step": 12130 }, { "entropy": 5.704422235488892, "epoch": 1.0194917034236506, "grad_norm": 1.03125, "learning_rate": 0.0004901901554538641, "loss": 5.2997, "mean_token_accuracy": 0.18199055194854735, "num_tokens": 22389657.0, "step": 12135 }, { "entropy": 5.596950101852417, "epoch": 1.0199117832388154, "grad_norm": 1.078125, "learning_rate": 0.000490181407426681, "loss": 5.1336, "mean_token_accuracy": 0.1886232778429985, "num_tokens": 22398320.0, "step": 12140 }, { "entropy": 5.672134017944336, "epoch": 1.0203318630539802, "grad_norm": 1.03125, "learning_rate": 0.0004901726555876701, "loss": 5.3333, "mean_token_accuracy": 0.1786001890897751, "num_tokens": 22406634.0, "step": 12145 }, { "entropy": 5.690277433395385, "epoch": 1.0207519428691452, "grad_norm": 1.0078125, "learning_rate": 0.0004901638999369862, "loss": 5.3667, "mean_token_accuracy": 0.1786521553993225, "num_tokens": 22415939.0, "step": 12150 }, { "entropy": 5.656166887283325, "epoch": 1.02117202268431, "grad_norm": 0.99609375, "learning_rate": 0.0004901551404747847, "loss": 5.3114, "mean_token_accuracy": 0.17751660645008088, "num_tokens": 22425256.0, "step": 12155 }, { "entropy": 5.690919494628906, "epoch": 1.021592102499475, "grad_norm": 0.94921875, "learning_rate": 0.0004901463772012209, "loss": 5.3401, "mean_token_accuracy": 0.17431362271308898, "num_tokens": 22434750.0, "step": 12160 }, { "entropy": 5.666771507263183, "epoch": 1.0220121823146397, "grad_norm": 1.0546875, "learning_rate": 0.0004901376101164495, "loss": 5.2506, "mean_token_accuracy": 0.17747422456741332, "num_tokens": 22443426.0, "step": 12165 }, { "entropy": 5.643777084350586, "epoch": 1.0224322621298048, "grad_norm": 1.0859375, "learning_rate": 0.0004901288392206263, "loss": 5.2509, "mean_token_accuracy": 0.17817339301109314, "num_tokens": 22452778.0, "step": 12170 }, { "entropy": 5.617178344726563, "epoch": 1.0228523419449695, "grad_norm": 1.0, "learning_rate": 0.0004901200645139064, "loss": 5.2099, "mean_token_accuracy": 0.186665578186512, "num_tokens": 22462864.0, "step": 12175 }, { "entropy": 5.6265661239624025, "epoch": 1.0232724217601343, "grad_norm": 1.03125, "learning_rate": 0.0004901112859964454, "loss": 5.2704, "mean_token_accuracy": 0.1804193764925003, "num_tokens": 22472849.0, "step": 12180 }, { "entropy": 5.6163983821868895, "epoch": 1.0236925015752993, "grad_norm": 1.0234375, "learning_rate": 0.0004901025036683987, "loss": 5.1967, "mean_token_accuracy": 0.1820622056722641, "num_tokens": 22481693.0, "step": 12185 }, { "entropy": 5.64280071258545, "epoch": 1.0241125813904641, "grad_norm": 0.90625, "learning_rate": 0.0004900937175299219, "loss": 5.2234, "mean_token_accuracy": 0.1829400032758713, "num_tokens": 22490934.0, "step": 12190 }, { "entropy": 5.637413740158081, "epoch": 1.0245326612056291, "grad_norm": 1.09375, "learning_rate": 0.0004900849275811707, "loss": 5.2524, "mean_token_accuracy": 0.18099325895309448, "num_tokens": 22500457.0, "step": 12195 }, { "entropy": 5.635241460800171, "epoch": 1.024952741020794, "grad_norm": 1.1015625, "learning_rate": 0.0004900761338223007, "loss": 5.175, "mean_token_accuracy": 0.1825869172811508, "num_tokens": 22509641.0, "step": 12200 }, { "entropy": 5.607939767837524, "epoch": 1.025372820835959, "grad_norm": 1.015625, "learning_rate": 0.0004900673362534677, "loss": 5.1404, "mean_token_accuracy": 0.19269167333841325, "num_tokens": 22518616.0, "step": 12205 }, { "entropy": 5.650340366363525, "epoch": 1.0257929006511237, "grad_norm": 0.9609375, "learning_rate": 0.0004900585348748277, "loss": 5.2776, "mean_token_accuracy": 0.18812544792890548, "num_tokens": 22527599.0, "step": 12210 }, { "entropy": 5.590425205230713, "epoch": 1.0262129804662885, "grad_norm": 1.015625, "learning_rate": 0.0004900497296865365, "loss": 5.283, "mean_token_accuracy": 0.17123017311096192, "num_tokens": 22537399.0, "step": 12215 }, { "entropy": 5.8351599216461185, "epoch": 1.0266330602814535, "grad_norm": 0.92578125, "learning_rate": 0.0004900409206887499, "loss": 5.4532, "mean_token_accuracy": 0.17509985864162445, "num_tokens": 22546746.0, "step": 12220 }, { "entropy": 5.716632556915283, "epoch": 1.0270531400966183, "grad_norm": 0.9765625, "learning_rate": 0.0004900321078816243, "loss": 5.247, "mean_token_accuracy": 0.18805406093597413, "num_tokens": 22555735.0, "step": 12225 }, { "entropy": 5.659763336181641, "epoch": 1.0274732199117833, "grad_norm": 0.94921875, "learning_rate": 0.0004900232912653156, "loss": 5.2466, "mean_token_accuracy": 0.18256295025348662, "num_tokens": 22565010.0, "step": 12230 }, { "entropy": 5.639549398422242, "epoch": 1.027893299726948, "grad_norm": 1.0, "learning_rate": 0.00049001447083998, "loss": 5.2395, "mean_token_accuracy": 0.18333375900983812, "num_tokens": 22573565.0, "step": 12235 }, { "entropy": 5.671342229843139, "epoch": 1.028313379542113, "grad_norm": 1.0, "learning_rate": 0.0004900056466057737, "loss": 5.245, "mean_token_accuracy": 0.18329312205314635, "num_tokens": 22582549.0, "step": 12240 }, { "entropy": 5.628865051269531, "epoch": 1.028733459357278, "grad_norm": 1.078125, "learning_rate": 0.0004899968185628531, "loss": 5.2754, "mean_token_accuracy": 0.17815867960453033, "num_tokens": 22592112.0, "step": 12245 }, { "entropy": 5.559484243392944, "epoch": 1.0291535391724427, "grad_norm": 1.03125, "learning_rate": 0.0004899879867113746, "loss": 5.1258, "mean_token_accuracy": 0.19046328067779542, "num_tokens": 22600581.0, "step": 12250 }, { "entropy": 5.697178173065185, "epoch": 1.0295736189876077, "grad_norm": 1.046875, "learning_rate": 0.0004899791510514945, "loss": 5.332, "mean_token_accuracy": 0.17589432001113892, "num_tokens": 22610822.0, "step": 12255 }, { "entropy": 5.636245203018189, "epoch": 1.0299936988027725, "grad_norm": 0.984375, "learning_rate": 0.0004899703115833696, "loss": 5.348, "mean_token_accuracy": 0.17800653576850892, "num_tokens": 22619484.0, "step": 12260 }, { "entropy": 5.641297340393066, "epoch": 1.0304137786179375, "grad_norm": 1.0078125, "learning_rate": 0.0004899614683071563, "loss": 5.179, "mean_token_accuracy": 0.18685067892074586, "num_tokens": 22629038.0, "step": 12265 }, { "entropy": 5.686434698104859, "epoch": 1.0308338584331023, "grad_norm": 0.93359375, "learning_rate": 0.0004899526212230112, "loss": 5.2814, "mean_token_accuracy": 0.17130620330572127, "num_tokens": 22638619.0, "step": 12270 }, { "entropy": 5.599054336547852, "epoch": 1.0312539382482673, "grad_norm": 0.96875, "learning_rate": 0.0004899437703310912, "loss": 5.2536, "mean_token_accuracy": 0.18062396645545958, "num_tokens": 22648065.0, "step": 12275 }, { "entropy": 5.704594135284424, "epoch": 1.031674018063432, "grad_norm": 1.0, "learning_rate": 0.0004899349156315529, "loss": 5.3098, "mean_token_accuracy": 0.17907833456993102, "num_tokens": 22658107.0, "step": 12280 }, { "entropy": 5.657734394073486, "epoch": 1.0320940978785969, "grad_norm": 1.09375, "learning_rate": 0.0004899260571245533, "loss": 5.2209, "mean_token_accuracy": 0.18165767043828965, "num_tokens": 22667103.0, "step": 12285 }, { "entropy": 5.6223057270050045, "epoch": 1.0325141776937619, "grad_norm": 1.015625, "learning_rate": 0.0004899171948102492, "loss": 5.1904, "mean_token_accuracy": 0.18377615660429, "num_tokens": 22676792.0, "step": 12290 }, { "entropy": 5.591983604431152, "epoch": 1.0329342575089266, "grad_norm": 1.0625, "learning_rate": 0.0004899083286887977, "loss": 5.2163, "mean_token_accuracy": 0.1818735346198082, "num_tokens": 22685344.0, "step": 12295 }, { "entropy": 5.682312154769898, "epoch": 1.0333543373240917, "grad_norm": 1.03125, "learning_rate": 0.0004898994587603559, "loss": 5.2675, "mean_token_accuracy": 0.1800654262304306, "num_tokens": 22694387.0, "step": 12300 }, { "entropy": 5.627408504486084, "epoch": 1.0337744171392564, "grad_norm": 1.0078125, "learning_rate": 0.0004898905850250807, "loss": 5.3082, "mean_token_accuracy": 0.18107833415269853, "num_tokens": 22704203.0, "step": 12305 }, { "entropy": 5.691462087631225, "epoch": 1.0341944969544214, "grad_norm": 1.0234375, "learning_rate": 0.0004898817074831295, "loss": 5.3521, "mean_token_accuracy": 0.17909369319677354, "num_tokens": 22713518.0, "step": 12310 }, { "entropy": 5.740213871002197, "epoch": 1.0346145767695862, "grad_norm": 0.984375, "learning_rate": 0.0004898728261346595, "loss": 5.3444, "mean_token_accuracy": 0.1762990452349186, "num_tokens": 22722997.0, "step": 12315 }, { "entropy": 5.693411779403687, "epoch": 1.035034656584751, "grad_norm": 0.98046875, "learning_rate": 0.000489863940979828, "loss": 5.2903, "mean_token_accuracy": 0.18066558986902237, "num_tokens": 22732385.0, "step": 12320 }, { "entropy": 5.574344778060913, "epoch": 1.035454736399916, "grad_norm": 1.015625, "learning_rate": 0.0004898550520187925, "loss": 5.1878, "mean_token_accuracy": 0.18785546571016312, "num_tokens": 22741148.0, "step": 12325 }, { "entropy": 5.601341485977173, "epoch": 1.0358748162150808, "grad_norm": 0.9609375, "learning_rate": 0.0004898461592517103, "loss": 5.164, "mean_token_accuracy": 0.18661797046661377, "num_tokens": 22750239.0, "step": 12330 }, { "entropy": 5.69515962600708, "epoch": 1.0362948960302458, "grad_norm": 1.0390625, "learning_rate": 0.0004898372626787391, "loss": 5.3313, "mean_token_accuracy": 0.17663245052099227, "num_tokens": 22759290.0, "step": 12335 }, { "entropy": 5.761694145202637, "epoch": 1.0367149758454106, "grad_norm": 1.0234375, "learning_rate": 0.0004898283623000364, "loss": 5.3437, "mean_token_accuracy": 0.17481304705142975, "num_tokens": 22768450.0, "step": 12340 }, { "entropy": 5.652455234527588, "epoch": 1.0371350556605754, "grad_norm": 0.953125, "learning_rate": 0.0004898194581157598, "loss": 5.2144, "mean_token_accuracy": 0.17939345836639403, "num_tokens": 22777711.0, "step": 12345 }, { "entropy": 5.666118192672729, "epoch": 1.0375551354757404, "grad_norm": 1.0859375, "learning_rate": 0.0004898105501260671, "loss": 5.2789, "mean_token_accuracy": 0.18181382417678832, "num_tokens": 22787153.0, "step": 12350 }, { "entropy": 5.689254426956177, "epoch": 1.0379752152909052, "grad_norm": 0.875, "learning_rate": 0.0004898016383311163, "loss": 5.2775, "mean_token_accuracy": 0.18912196010351182, "num_tokens": 22797125.0, "step": 12355 }, { "entropy": 5.673904895782471, "epoch": 1.0383952951060702, "grad_norm": 0.96484375, "learning_rate": 0.000489792722731065, "loss": 5.2801, "mean_token_accuracy": 0.17991742342710496, "num_tokens": 22806478.0, "step": 12360 }, { "entropy": 5.628547191619873, "epoch": 1.038815374921235, "grad_norm": 1.0625, "learning_rate": 0.0004897838033260712, "loss": 5.2994, "mean_token_accuracy": 0.1691963866353035, "num_tokens": 22815375.0, "step": 12365 }, { "entropy": 5.669758462905884, "epoch": 1.0392354547364, "grad_norm": 1.0703125, "learning_rate": 0.0004897748801162929, "loss": 5.272, "mean_token_accuracy": 0.18344809263944625, "num_tokens": 22824401.0, "step": 12370 }, { "entropy": 5.68695559501648, "epoch": 1.0396555345515648, "grad_norm": 1.0390625, "learning_rate": 0.0004897659531018882, "loss": 5.3726, "mean_token_accuracy": 0.179783833026886, "num_tokens": 22833933.0, "step": 12375 }, { "entropy": 5.64669017791748, "epoch": 1.0400756143667296, "grad_norm": 1.078125, "learning_rate": 0.0004897570222830152, "loss": 5.2447, "mean_token_accuracy": 0.1805283918976784, "num_tokens": 22843779.0, "step": 12380 }, { "entropy": 5.753574800491333, "epoch": 1.0404956941818946, "grad_norm": 1.078125, "learning_rate": 0.0004897480876598322, "loss": 5.3407, "mean_token_accuracy": 0.17583997100591658, "num_tokens": 22852951.0, "step": 12385 }, { "entropy": 5.71806788444519, "epoch": 1.0409157739970594, "grad_norm": 1.0859375, "learning_rate": 0.0004897391492324974, "loss": 5.3423, "mean_token_accuracy": 0.1749775752425194, "num_tokens": 22861398.0, "step": 12390 }, { "entropy": 5.675067138671875, "epoch": 1.0413358538122244, "grad_norm": 0.984375, "learning_rate": 0.0004897302070011691, "loss": 5.2215, "mean_token_accuracy": 0.18783215582370758, "num_tokens": 22870518.0, "step": 12395 }, { "entropy": 5.636955738067627, "epoch": 1.0417559336273892, "grad_norm": 1.0625, "learning_rate": 0.0004897212609660058, "loss": 5.2974, "mean_token_accuracy": 0.1764320120215416, "num_tokens": 22879389.0, "step": 12400 }, { "entropy": 5.639626121520996, "epoch": 1.0421760134425542, "grad_norm": 1.0625, "learning_rate": 0.0004897123111271659, "loss": 5.2939, "mean_token_accuracy": 0.18479203730821608, "num_tokens": 22888977.0, "step": 12405 }, { "entropy": 5.739208984375, "epoch": 1.042596093257719, "grad_norm": 1.0546875, "learning_rate": 0.0004897033574848079, "loss": 5.2931, "mean_token_accuracy": 0.1832874000072479, "num_tokens": 22898446.0, "step": 12410 }, { "entropy": 5.621999073028564, "epoch": 1.0430161730728837, "grad_norm": 0.94140625, "learning_rate": 0.0004896944000390907, "loss": 5.307, "mean_token_accuracy": 0.18112864345312119, "num_tokens": 22908044.0, "step": 12415 }, { "entropy": 5.7017865657806395, "epoch": 1.0434362528880488, "grad_norm": 1.1328125, "learning_rate": 0.0004896854387901725, "loss": 5.3585, "mean_token_accuracy": 0.1737068012356758, "num_tokens": 22917330.0, "step": 12420 }, { "entropy": 5.726221799850464, "epoch": 1.0438563327032135, "grad_norm": 1.03125, "learning_rate": 0.0004896764737382124, "loss": 5.298, "mean_token_accuracy": 0.1884210541844368, "num_tokens": 22927160.0, "step": 12425 }, { "entropy": 5.719885396957397, "epoch": 1.0442764125183785, "grad_norm": 0.92578125, "learning_rate": 0.0004896675048833691, "loss": 5.2668, "mean_token_accuracy": 0.1786972314119339, "num_tokens": 22936755.0, "step": 12430 }, { "entropy": 5.6939490795135494, "epoch": 1.0446964923335433, "grad_norm": 1.0859375, "learning_rate": 0.0004896585322258014, "loss": 5.2595, "mean_token_accuracy": 0.18076882511377335, "num_tokens": 22945699.0, "step": 12435 }, { "entropy": 5.639131689071656, "epoch": 1.0451165721487083, "grad_norm": 1.1171875, "learning_rate": 0.0004896495557656685, "loss": 5.2371, "mean_token_accuracy": 0.1891343578696251, "num_tokens": 22954001.0, "step": 12440 }, { "entropy": 5.7185853004455565, "epoch": 1.0455366519638731, "grad_norm": 0.9453125, "learning_rate": 0.0004896405755031293, "loss": 5.3262, "mean_token_accuracy": 0.1782376989722252, "num_tokens": 22963805.0, "step": 12445 }, { "entropy": 5.630977725982666, "epoch": 1.045956731779038, "grad_norm": 0.95703125, "learning_rate": 0.0004896315914383427, "loss": 5.2581, "mean_token_accuracy": 0.17448056638240814, "num_tokens": 22973542.0, "step": 12450 }, { "entropy": 5.604122972488403, "epoch": 1.046376811594203, "grad_norm": 0.98828125, "learning_rate": 0.0004896226035714679, "loss": 5.1621, "mean_token_accuracy": 0.18821897059679032, "num_tokens": 22982417.0, "step": 12455 }, { "entropy": 5.632601356506347, "epoch": 1.0467968914093677, "grad_norm": 1.0859375, "learning_rate": 0.0004896136119026642, "loss": 5.2619, "mean_token_accuracy": 0.18409162163734435, "num_tokens": 22992879.0, "step": 12460 }, { "entropy": 5.609059047698975, "epoch": 1.0472169712245327, "grad_norm": 1.09375, "learning_rate": 0.0004896046164320911, "loss": 5.164, "mean_token_accuracy": 0.18929794877767564, "num_tokens": 23001344.0, "step": 12465 }, { "entropy": 5.602516317367554, "epoch": 1.0476370510396975, "grad_norm": 1.0234375, "learning_rate": 0.0004895956171599075, "loss": 5.2066, "mean_token_accuracy": 0.18921121060848237, "num_tokens": 23010007.0, "step": 12470 }, { "entropy": 5.660155773162842, "epoch": 1.0480571308548625, "grad_norm": 1.0234375, "learning_rate": 0.0004895866140862731, "loss": 5.3261, "mean_token_accuracy": 0.1807284966111183, "num_tokens": 23019120.0, "step": 12475 }, { "entropy": 5.685138940811157, "epoch": 1.0484772106700273, "grad_norm": 0.95703125, "learning_rate": 0.0004895776072113473, "loss": 5.2997, "mean_token_accuracy": 0.18290154486894608, "num_tokens": 23028562.0, "step": 12480 }, { "entropy": 5.689487171173096, "epoch": 1.048897290485192, "grad_norm": 0.98046875, "learning_rate": 0.0004895685965352898, "loss": 5.2524, "mean_token_accuracy": 0.18178534507751465, "num_tokens": 23037687.0, "step": 12485 }, { "entropy": 5.689279413223266, "epoch": 1.049317370300357, "grad_norm": 0.94921875, "learning_rate": 0.0004895595820582601, "loss": 5.2394, "mean_token_accuracy": 0.18193750232458114, "num_tokens": 23047475.0, "step": 12490 }, { "entropy": 5.615953207015991, "epoch": 1.0497374501155219, "grad_norm": 1.0, "learning_rate": 0.0004895505637804177, "loss": 5.2742, "mean_token_accuracy": 0.1787284329533577, "num_tokens": 23057475.0, "step": 12495 }, { "entropy": 5.601930332183838, "epoch": 1.050157529930687, "grad_norm": 1.09375, "learning_rate": 0.0004895415417019227, "loss": 5.2557, "mean_token_accuracy": 0.1793588936328888, "num_tokens": 23066419.0, "step": 12500 }, { "entropy": 5.736211252212525, "epoch": 1.0505776097458517, "grad_norm": 1.015625, "learning_rate": 0.0004895325158229346, "loss": 5.3072, "mean_token_accuracy": 0.17959971725940704, "num_tokens": 23075516.0, "step": 12505 }, { "entropy": 5.6330945014953615, "epoch": 1.0509976895610167, "grad_norm": 1.0390625, "learning_rate": 0.0004895234861436136, "loss": 5.196, "mean_token_accuracy": 0.18888653665781022, "num_tokens": 23084132.0, "step": 12510 }, { "entropy": 5.686839485168457, "epoch": 1.0514177693761815, "grad_norm": 1.0078125, "learning_rate": 0.0004895144526641194, "loss": 5.2508, "mean_token_accuracy": 0.17812662422657013, "num_tokens": 23093958.0, "step": 12515 }, { "entropy": 5.741023063659668, "epoch": 1.0518378491913463, "grad_norm": 1.0234375, "learning_rate": 0.0004895054153846123, "loss": 5.2861, "mean_token_accuracy": 0.1793720692396164, "num_tokens": 23103524.0, "step": 12520 }, { "entropy": 5.689534616470337, "epoch": 1.0522579290065113, "grad_norm": 1.03125, "learning_rate": 0.0004894963743052521, "loss": 5.2272, "mean_token_accuracy": 0.1796150654554367, "num_tokens": 23112445.0, "step": 12525 }, { "entropy": 5.677579021453857, "epoch": 1.052678008821676, "grad_norm": 0.99609375, "learning_rate": 0.0004894873294261991, "loss": 5.2907, "mean_token_accuracy": 0.18264670968055724, "num_tokens": 23121299.0, "step": 12530 }, { "entropy": 5.736908388137818, "epoch": 1.053098088636841, "grad_norm": 1.1015625, "learning_rate": 0.0004894782807476134, "loss": 5.2801, "mean_token_accuracy": 0.17894915342330933, "num_tokens": 23130260.0, "step": 12535 }, { "entropy": 5.694188642501831, "epoch": 1.0535181684520059, "grad_norm": 1.046875, "learning_rate": 0.0004894692282696555, "loss": 5.2146, "mean_token_accuracy": 0.1868406981229782, "num_tokens": 23139335.0, "step": 12540 }, { "entropy": 5.622735214233399, "epoch": 1.0539382482671709, "grad_norm": 1.1015625, "learning_rate": 0.0004894601719924857, "loss": 5.224, "mean_token_accuracy": 0.18853606134653092, "num_tokens": 23149299.0, "step": 12545 }, { "entropy": 5.571608686447144, "epoch": 1.0543583280823356, "grad_norm": 0.96484375, "learning_rate": 0.0004894511119162644, "loss": 5.1973, "mean_token_accuracy": 0.19267328977584838, "num_tokens": 23158651.0, "step": 12550 }, { "entropy": 5.684165382385254, "epoch": 1.0547784078975004, "grad_norm": 1.0078125, "learning_rate": 0.000489442048041152, "loss": 5.2801, "mean_token_accuracy": 0.1786165028810501, "num_tokens": 23167629.0, "step": 12555 }, { "entropy": 5.6584742069244385, "epoch": 1.0551984877126654, "grad_norm": 0.984375, "learning_rate": 0.0004894329803673092, "loss": 5.2767, "mean_token_accuracy": 0.17648475021123886, "num_tokens": 23177026.0, "step": 12560 }, { "entropy": 5.644589805603028, "epoch": 1.0556185675278302, "grad_norm": 1.21875, "learning_rate": 0.0004894239088948964, "loss": 5.2249, "mean_token_accuracy": 0.18399962037801743, "num_tokens": 23185297.0, "step": 12565 }, { "entropy": 5.598917818069458, "epoch": 1.0560386473429952, "grad_norm": 1.03125, "learning_rate": 0.0004894148336240747, "loss": 5.2347, "mean_token_accuracy": 0.18499256074428558, "num_tokens": 23194804.0, "step": 12570 }, { "entropy": 5.688305282592774, "epoch": 1.05645872715816, "grad_norm": 0.96484375, "learning_rate": 0.0004894057545550045, "loss": 5.286, "mean_token_accuracy": 0.17913931906223296, "num_tokens": 23205063.0, "step": 12575 }, { "entropy": 5.633170032501221, "epoch": 1.056878806973325, "grad_norm": 1.0390625, "learning_rate": 0.0004893966716878467, "loss": 5.2143, "mean_token_accuracy": 0.17792848199605943, "num_tokens": 23215038.0, "step": 12580 }, { "entropy": 5.749055671691894, "epoch": 1.0572988867884898, "grad_norm": 1.0, "learning_rate": 0.0004893875850227624, "loss": 5.3636, "mean_token_accuracy": 0.17411137372255325, "num_tokens": 23223530.0, "step": 12585 }, { "entropy": 5.680791807174683, "epoch": 1.0577189666036546, "grad_norm": 1.078125, "learning_rate": 0.0004893784945599124, "loss": 5.2914, "mean_token_accuracy": 0.18603131920099258, "num_tokens": 23232547.0, "step": 12590 }, { "entropy": 5.646617841720581, "epoch": 1.0581390464188196, "grad_norm": 1.0390625, "learning_rate": 0.0004893694002994577, "loss": 5.3395, "mean_token_accuracy": 0.18207189291715623, "num_tokens": 23241305.0, "step": 12595 }, { "entropy": 5.812775707244873, "epoch": 1.0585591262339844, "grad_norm": 1.125, "learning_rate": 0.0004893603022415595, "loss": 5.3596, "mean_token_accuracy": 0.18028753846883774, "num_tokens": 23250708.0, "step": 12600 }, { "entropy": 5.747534465789795, "epoch": 1.0589792060491494, "grad_norm": 1.125, "learning_rate": 0.0004893512003863788, "loss": 5.2754, "mean_token_accuracy": 0.17291081994771956, "num_tokens": 23260161.0, "step": 12605 }, { "entropy": 5.627267742156983, "epoch": 1.0593992858643142, "grad_norm": 0.98046875, "learning_rate": 0.0004893420947340771, "loss": 5.1898, "mean_token_accuracy": 0.17830443382263184, "num_tokens": 23268932.0, "step": 12610 }, { "entropy": 5.617309808731079, "epoch": 1.0598193656794792, "grad_norm": 1.0078125, "learning_rate": 0.0004893329852848155, "loss": 5.2802, "mean_token_accuracy": 0.1815733030438423, "num_tokens": 23277741.0, "step": 12615 }, { "entropy": 5.6404839038848875, "epoch": 1.060239445494644, "grad_norm": 1.03125, "learning_rate": 0.0004893238720387555, "loss": 5.2737, "mean_token_accuracy": 0.18266890645027162, "num_tokens": 23286982.0, "step": 12620 }, { "entropy": 5.608418369293213, "epoch": 1.0606595253098088, "grad_norm": 0.99609375, "learning_rate": 0.0004893147549960584, "loss": 5.1969, "mean_token_accuracy": 0.18518164306879042, "num_tokens": 23296902.0, "step": 12625 }, { "entropy": 5.61023325920105, "epoch": 1.0610796051249738, "grad_norm": 1.0390625, "learning_rate": 0.0004893056341568857, "loss": 5.2231, "mean_token_accuracy": 0.18583046942949294, "num_tokens": 23305443.0, "step": 12630 }, { "entropy": 5.602107429504395, "epoch": 1.0614996849401386, "grad_norm": 1.0234375, "learning_rate": 0.0004892965095213992, "loss": 5.1788, "mean_token_accuracy": 0.1890096038579941, "num_tokens": 23315420.0, "step": 12635 }, { "entropy": 5.717545795440674, "epoch": 1.0619197647553036, "grad_norm": 1.046875, "learning_rate": 0.0004892873810897604, "loss": 5.2681, "mean_token_accuracy": 0.1774292603135109, "num_tokens": 23324540.0, "step": 12640 }, { "entropy": 5.685351085662842, "epoch": 1.0623398445704684, "grad_norm": 1.03125, "learning_rate": 0.0004892782488621308, "loss": 5.2382, "mean_token_accuracy": 0.18667006939649583, "num_tokens": 23334282.0, "step": 12645 }, { "entropy": 5.646767520904541, "epoch": 1.0627599243856332, "grad_norm": 1.0, "learning_rate": 0.0004892691128386725, "loss": 5.2274, "mean_token_accuracy": 0.18286852389574051, "num_tokens": 23342836.0, "step": 12650 }, { "entropy": 5.645523881912231, "epoch": 1.0631800042007982, "grad_norm": 1.0, "learning_rate": 0.0004892599730195471, "loss": 5.22, "mean_token_accuracy": 0.18985742926597596, "num_tokens": 23351863.0, "step": 12655 }, { "entropy": 5.710491800308228, "epoch": 1.063600084015963, "grad_norm": 1.078125, "learning_rate": 0.0004892508294049167, "loss": 5.3603, "mean_token_accuracy": 0.17996528297662734, "num_tokens": 23361788.0, "step": 12660 }, { "entropy": 5.675336170196533, "epoch": 1.064020163831128, "grad_norm": 1.0234375, "learning_rate": 0.0004892416819949431, "loss": 5.2188, "mean_token_accuracy": 0.17850940972566604, "num_tokens": 23370175.0, "step": 12665 }, { "entropy": 5.595318698883057, "epoch": 1.0644402436462927, "grad_norm": 1.0078125, "learning_rate": 0.0004892325307897886, "loss": 5.2502, "mean_token_accuracy": 0.1822288915514946, "num_tokens": 23378835.0, "step": 12670 }, { "entropy": 5.635321950912475, "epoch": 1.0648603234614578, "grad_norm": 0.91796875, "learning_rate": 0.0004892233757896149, "loss": 5.2491, "mean_token_accuracy": 0.18298966884613038, "num_tokens": 23389390.0, "step": 12675 }, { "entropy": 5.637257242202759, "epoch": 1.0652804032766225, "grad_norm": 0.95703125, "learning_rate": 0.0004892142169945845, "loss": 5.2534, "mean_token_accuracy": 0.18063441514968873, "num_tokens": 23398802.0, "step": 12680 }, { "entropy": 5.6366626739501955, "epoch": 1.0657004830917876, "grad_norm": 1.0625, "learning_rate": 0.0004892050544048596, "loss": 5.2291, "mean_token_accuracy": 0.18350041657686234, "num_tokens": 23407731.0, "step": 12685 }, { "entropy": 5.669007110595703, "epoch": 1.0661205629069523, "grad_norm": 0.94140625, "learning_rate": 0.0004891958880206024, "loss": 5.2895, "mean_token_accuracy": 0.1780366614460945, "num_tokens": 23417046.0, "step": 12690 }, { "entropy": 5.624680662155152, "epoch": 1.0665406427221171, "grad_norm": 1.078125, "learning_rate": 0.0004891867178419753, "loss": 5.2612, "mean_token_accuracy": 0.18198800683021546, "num_tokens": 23426107.0, "step": 12695 }, { "entropy": 5.680970287322998, "epoch": 1.0669607225372821, "grad_norm": 0.9609375, "learning_rate": 0.0004891775438691408, "loss": 5.3087, "mean_token_accuracy": 0.18102641701698302, "num_tokens": 23435523.0, "step": 12700 }, { "entropy": 5.700118589401245, "epoch": 1.067380802352447, "grad_norm": 1.359375, "learning_rate": 0.0004891683661022615, "loss": 5.252, "mean_token_accuracy": 0.185978502035141, "num_tokens": 23444185.0, "step": 12705 }, { "entropy": 5.747912788391114, "epoch": 1.067800882167612, "grad_norm": 1.03125, "learning_rate": 0.0004891591845414997, "loss": 5.4483, "mean_token_accuracy": 0.1628721162676811, "num_tokens": 23454100.0, "step": 12710 }, { "entropy": 5.724557876586914, "epoch": 1.0682209619827767, "grad_norm": 1.046875, "learning_rate": 0.0004891499991870184, "loss": 5.3177, "mean_token_accuracy": 0.17514833956956863, "num_tokens": 23463415.0, "step": 12715 }, { "entropy": 5.695041370391846, "epoch": 1.0686410417979415, "grad_norm": 1.0390625, "learning_rate": 0.00048914081003898, "loss": 5.23, "mean_token_accuracy": 0.17918239384889603, "num_tokens": 23471515.0, "step": 12720 }, { "entropy": 5.721878433227539, "epoch": 1.0690611216131065, "grad_norm": 1.0546875, "learning_rate": 0.0004891316170975475, "loss": 5.2935, "mean_token_accuracy": 0.17834519147872924, "num_tokens": 23481696.0, "step": 12725 }, { "entropy": 5.715478372573853, "epoch": 1.0694812014282713, "grad_norm": 1.1875, "learning_rate": 0.0004891224203628836, "loss": 5.2461, "mean_token_accuracy": 0.18814520686864852, "num_tokens": 23490714.0, "step": 12730 }, { "entropy": 5.582677364349365, "epoch": 1.0699012812434363, "grad_norm": 0.953125, "learning_rate": 0.0004891132198351514, "loss": 5.2108, "mean_token_accuracy": 0.1884111285209656, "num_tokens": 23500368.0, "step": 12735 }, { "entropy": 5.532419013977051, "epoch": 1.070321361058601, "grad_norm": 1.0234375, "learning_rate": 0.0004891040155145137, "loss": 5.1803, "mean_token_accuracy": 0.18773214966058732, "num_tokens": 23508857.0, "step": 12740 }, { "entropy": 5.598638916015625, "epoch": 1.070741440873766, "grad_norm": 1.015625, "learning_rate": 0.0004890948074011335, "loss": 5.1657, "mean_token_accuracy": 0.19128426015377045, "num_tokens": 23518128.0, "step": 12745 }, { "entropy": 5.698208332061768, "epoch": 1.071161520688931, "grad_norm": 1.046875, "learning_rate": 0.0004890855954951741, "loss": 5.2683, "mean_token_accuracy": 0.188133105635643, "num_tokens": 23527292.0, "step": 12750 }, { "entropy": 5.694556379318238, "epoch": 1.0715816005040957, "grad_norm": 0.9921875, "learning_rate": 0.0004890763797967987, "loss": 5.2489, "mean_token_accuracy": 0.18290500491857528, "num_tokens": 23535694.0, "step": 12755 }, { "entropy": 5.65405421257019, "epoch": 1.0720016803192607, "grad_norm": 1.0, "learning_rate": 0.0004890671603061704, "loss": 5.2517, "mean_token_accuracy": 0.18395259380340576, "num_tokens": 23544766.0, "step": 12760 }, { "entropy": 5.640880823135376, "epoch": 1.0724217601344255, "grad_norm": 0.9921875, "learning_rate": 0.0004890579370234526, "loss": 5.2314, "mean_token_accuracy": 0.18854832649230957, "num_tokens": 23554037.0, "step": 12765 }, { "entropy": 5.713632726669312, "epoch": 1.0728418399495905, "grad_norm": 1.0546875, "learning_rate": 0.0004890487099488086, "loss": 5.2981, "mean_token_accuracy": 0.17728416472673417, "num_tokens": 23562282.0, "step": 12770 }, { "entropy": 5.761063957214356, "epoch": 1.0732619197647553, "grad_norm": 0.98828125, "learning_rate": 0.000489039479082402, "loss": 5.3426, "mean_token_accuracy": 0.17374907284975052, "num_tokens": 23571955.0, "step": 12775 }, { "entropy": 5.659833192825317, "epoch": 1.0736819995799203, "grad_norm": 0.94140625, "learning_rate": 0.0004890302444243962, "loss": 5.2394, "mean_token_accuracy": 0.17800159603357316, "num_tokens": 23580996.0, "step": 12780 }, { "entropy": 5.675001716613769, "epoch": 1.074102079395085, "grad_norm": 1.0390625, "learning_rate": 0.0004890210059749549, "loss": 5.3372, "mean_token_accuracy": 0.17432284504175186, "num_tokens": 23589618.0, "step": 12785 }, { "entropy": 5.628401374816894, "epoch": 1.0745221592102498, "grad_norm": 1.0546875, "learning_rate": 0.0004890117637342416, "loss": 5.1777, "mean_token_accuracy": 0.1831621617078781, "num_tokens": 23599574.0, "step": 12790 }, { "entropy": 5.66285514831543, "epoch": 1.0749422390254149, "grad_norm": 1.03125, "learning_rate": 0.0004890025177024202, "loss": 5.2415, "mean_token_accuracy": 0.1792875424027443, "num_tokens": 23609205.0, "step": 12795 }, { "entropy": 5.615373754501343, "epoch": 1.0753623188405796, "grad_norm": 1.1015625, "learning_rate": 0.0004889932678796543, "loss": 5.2695, "mean_token_accuracy": 0.1729859620332718, "num_tokens": 23617554.0, "step": 12800 }, { "entropy": 5.699828147888184, "epoch": 1.0757823986557447, "grad_norm": 1.046875, "learning_rate": 0.0004889840142661078, "loss": 5.3311, "mean_token_accuracy": 0.1775299936532974, "num_tokens": 23626757.0, "step": 12805 }, { "entropy": 5.708516836166382, "epoch": 1.0762024784709094, "grad_norm": 1.046875, "learning_rate": 0.0004889747568619447, "loss": 5.2693, "mean_token_accuracy": 0.18425986766815186, "num_tokens": 23636111.0, "step": 12810 }, { "entropy": 5.6856849670410154, "epoch": 1.0766225582860744, "grad_norm": 1.1328125, "learning_rate": 0.0004889654956673291, "loss": 5.2758, "mean_token_accuracy": 0.1837363764643669, "num_tokens": 23644579.0, "step": 12815 }, { "entropy": 5.680776500701905, "epoch": 1.0770426381012392, "grad_norm": 1.03125, "learning_rate": 0.0004889562306824248, "loss": 5.184, "mean_token_accuracy": 0.18426006585359572, "num_tokens": 23653263.0, "step": 12820 }, { "entropy": 5.544320678710937, "epoch": 1.077462717916404, "grad_norm": 0.98828125, "learning_rate": 0.000488946961907396, "loss": 5.1403, "mean_token_accuracy": 0.19812086820602418, "num_tokens": 23662529.0, "step": 12825 }, { "entropy": 5.574382972717285, "epoch": 1.077882797731569, "grad_norm": 1.0, "learning_rate": 0.0004889376893424071, "loss": 5.1741, "mean_token_accuracy": 0.18910592794418335, "num_tokens": 23671491.0, "step": 12830 }, { "entropy": 5.615490579605103, "epoch": 1.0783028775467338, "grad_norm": 1.0078125, "learning_rate": 0.0004889284129876221, "loss": 5.1743, "mean_token_accuracy": 0.18766126036643982, "num_tokens": 23680121.0, "step": 12835 }, { "entropy": 5.612439584732056, "epoch": 1.0787229573618988, "grad_norm": 1.1015625, "learning_rate": 0.0004889191328432054, "loss": 5.2362, "mean_token_accuracy": 0.18087808787822723, "num_tokens": 23689008.0, "step": 12840 }, { "entropy": 5.660691928863526, "epoch": 1.0791430371770636, "grad_norm": 0.91015625, "learning_rate": 0.0004889098489093215, "loss": 5.2576, "mean_token_accuracy": 0.17944198548793794, "num_tokens": 23698551.0, "step": 12845 }, { "entropy": 5.752760744094848, "epoch": 1.0795631169922286, "grad_norm": 1.03125, "learning_rate": 0.0004889005611861347, "loss": 5.4146, "mean_token_accuracy": 0.17980343997478485, "num_tokens": 23707438.0, "step": 12850 }, { "entropy": 5.622406911849976, "epoch": 1.0799831968073934, "grad_norm": 1.0703125, "learning_rate": 0.0004888912696738096, "loss": 5.2835, "mean_token_accuracy": 0.18134992569684982, "num_tokens": 23715822.0, "step": 12855 }, { "entropy": 5.668863964080811, "epoch": 1.0804032766225582, "grad_norm": 1.03125, "learning_rate": 0.0004888819743725108, "loss": 5.2949, "mean_token_accuracy": 0.1811855435371399, "num_tokens": 23725426.0, "step": 12860 }, { "entropy": 5.73712477684021, "epoch": 1.0808233564377232, "grad_norm": 0.96875, "learning_rate": 0.000488872675282403, "loss": 5.2788, "mean_token_accuracy": 0.17887916266918183, "num_tokens": 23735092.0, "step": 12865 }, { "entropy": 5.6691062450408936, "epoch": 1.081243436252888, "grad_norm": 0.99609375, "learning_rate": 0.0004888633724036509, "loss": 5.2697, "mean_token_accuracy": 0.18183707296848298, "num_tokens": 23744255.0, "step": 12870 }, { "entropy": 5.597784852981567, "epoch": 1.081663516068053, "grad_norm": 1.09375, "learning_rate": 0.0004888540657364192, "loss": 5.1049, "mean_token_accuracy": 0.19642930924892427, "num_tokens": 23752978.0, "step": 12875 }, { "entropy": 5.6465291500091555, "epoch": 1.0820835958832178, "grad_norm": 1.125, "learning_rate": 0.0004888447552808729, "loss": 5.205, "mean_token_accuracy": 0.1904713034629822, "num_tokens": 23761051.0, "step": 12880 }, { "entropy": 5.7192590713500975, "epoch": 1.0825036756983828, "grad_norm": 1.0234375, "learning_rate": 0.0004888354410371768, "loss": 5.3285, "mean_token_accuracy": 0.17311316579580308, "num_tokens": 23770818.0, "step": 12885 }, { "entropy": 5.762405490875244, "epoch": 1.0829237555135476, "grad_norm": 0.96484375, "learning_rate": 0.000488826123005496, "loss": 5.3356, "mean_token_accuracy": 0.1816746786236763, "num_tokens": 23780597.0, "step": 12890 }, { "entropy": 5.637387371063232, "epoch": 1.0833438353287124, "grad_norm": 0.984375, "learning_rate": 0.0004888168011859957, "loss": 5.1716, "mean_token_accuracy": 0.1854577124118805, "num_tokens": 23790119.0, "step": 12895 }, { "entropy": 5.638812685012818, "epoch": 1.0837639151438774, "grad_norm": 0.9375, "learning_rate": 0.0004888074755788407, "loss": 5.2413, "mean_token_accuracy": 0.18185414075851442, "num_tokens": 23798972.0, "step": 12900 }, { "entropy": 5.599403333663941, "epoch": 1.0841839949590422, "grad_norm": 0.97265625, "learning_rate": 0.0004887981461841963, "loss": 5.2173, "mean_token_accuracy": 0.19349830895662307, "num_tokens": 23808685.0, "step": 12905 }, { "entropy": 5.688136005401612, "epoch": 1.0846040747742072, "grad_norm": 1.046875, "learning_rate": 0.0004887888130022279, "loss": 5.2133, "mean_token_accuracy": 0.18391136527061464, "num_tokens": 23817721.0, "step": 12910 }, { "entropy": 5.551941633224487, "epoch": 1.085024154589372, "grad_norm": 0.9765625, "learning_rate": 0.0004887794760331008, "loss": 5.1915, "mean_token_accuracy": 0.19003793001174926, "num_tokens": 23826892.0, "step": 12915 }, { "entropy": 5.623990154266357, "epoch": 1.085444234404537, "grad_norm": 1.046875, "learning_rate": 0.0004887701352769804, "loss": 5.1402, "mean_token_accuracy": 0.19249491393566132, "num_tokens": 23835717.0, "step": 12920 }, { "entropy": 5.6494200229644775, "epoch": 1.0858643142197018, "grad_norm": 0.99609375, "learning_rate": 0.000488760790734032, "loss": 5.2201, "mean_token_accuracy": 0.18874411135911942, "num_tokens": 23845814.0, "step": 12925 }, { "entropy": 5.664129161834717, "epoch": 1.0862843940348665, "grad_norm": 1.1875, "learning_rate": 0.0004887514424044214, "loss": 5.2218, "mean_token_accuracy": 0.17604484856128694, "num_tokens": 23854779.0, "step": 12930 }, { "entropy": 5.600961828231812, "epoch": 1.0867044738500315, "grad_norm": 1.0859375, "learning_rate": 0.000488742090288314, "loss": 5.2668, "mean_token_accuracy": 0.18102030903100969, "num_tokens": 23863533.0, "step": 12935 }, { "entropy": 5.642888736724854, "epoch": 1.0871245536651963, "grad_norm": 1.015625, "learning_rate": 0.0004887327343858755, "loss": 5.2947, "mean_token_accuracy": 0.18080966174602509, "num_tokens": 23872725.0, "step": 12940 }, { "entropy": 5.66283769607544, "epoch": 1.0875446334803613, "grad_norm": 1.03125, "learning_rate": 0.0004887233746972717, "loss": 5.2856, "mean_token_accuracy": 0.17893132269382478, "num_tokens": 23881799.0, "step": 12945 }, { "entropy": 5.633523321151733, "epoch": 1.0879647132955261, "grad_norm": 1.03125, "learning_rate": 0.0004887140112226684, "loss": 5.3057, "mean_token_accuracy": 0.17848242670297623, "num_tokens": 23890628.0, "step": 12950 }, { "entropy": 5.621290779113769, "epoch": 1.088384793110691, "grad_norm": 1.0390625, "learning_rate": 0.0004887046439622314, "loss": 5.2758, "mean_token_accuracy": 0.18959250599145888, "num_tokens": 23899968.0, "step": 12955 }, { "entropy": 5.703247499465943, "epoch": 1.088804872925856, "grad_norm": 1.0390625, "learning_rate": 0.0004886952729161267, "loss": 5.1582, "mean_token_accuracy": 0.18921613544225693, "num_tokens": 23908634.0, "step": 12960 }, { "entropy": 5.717560148239135, "epoch": 1.0892249527410207, "grad_norm": 1.0, "learning_rate": 0.0004886858980845202, "loss": 5.3037, "mean_token_accuracy": 0.17998072355985642, "num_tokens": 23917925.0, "step": 12965 }, { "entropy": 5.622332811355591, "epoch": 1.0896450325561857, "grad_norm": 0.9609375, "learning_rate": 0.0004886765194675782, "loss": 5.2151, "mean_token_accuracy": 0.18772554099559785, "num_tokens": 23927173.0, "step": 12970 }, { "entropy": 5.653474044799805, "epoch": 1.0900651123713505, "grad_norm": 1.0625, "learning_rate": 0.0004886671370654665, "loss": 5.1647, "mean_token_accuracy": 0.19048593193292618, "num_tokens": 23936258.0, "step": 12975 }, { "entropy": 5.6524711608886715, "epoch": 1.0904851921865155, "grad_norm": 1.1171875, "learning_rate": 0.0004886577508783516, "loss": 5.1604, "mean_token_accuracy": 0.194679893553257, "num_tokens": 23944215.0, "step": 12980 }, { "entropy": 5.681831789016724, "epoch": 1.0909052720016803, "grad_norm": 1.0390625, "learning_rate": 0.0004886483609063997, "loss": 5.2351, "mean_token_accuracy": 0.17767837196588515, "num_tokens": 23953151.0, "step": 12985 }, { "entropy": 5.601357555389404, "epoch": 1.0913253518168453, "grad_norm": 1.015625, "learning_rate": 0.0004886389671497769, "loss": 5.2156, "mean_token_accuracy": 0.19118448495864868, "num_tokens": 23962919.0, "step": 12990 }, { "entropy": 5.671156120300293, "epoch": 1.09174543163201, "grad_norm": 1.0, "learning_rate": 0.00048862956960865, "loss": 5.2469, "mean_token_accuracy": 0.18204855918884277, "num_tokens": 23971900.0, "step": 12995 }, { "entropy": 5.666143989562988, "epoch": 1.0921655114471749, "grad_norm": 0.9453125, "learning_rate": 0.0004886201682831852, "loss": 5.2078, "mean_token_accuracy": 0.18578559309244155, "num_tokens": 23980945.0, "step": 13000 }, { "entropy": 5.62028660774231, "epoch": 1.09258559126234, "grad_norm": 0.9609375, "learning_rate": 0.0004886107631735491, "loss": 5.1766, "mean_token_accuracy": 0.18489760309457778, "num_tokens": 23990460.0, "step": 13005 }, { "entropy": 5.718626022338867, "epoch": 1.0930056710775047, "grad_norm": 1.0859375, "learning_rate": 0.0004886013542799083, "loss": 5.3431, "mean_token_accuracy": 0.1721617564558983, "num_tokens": 23999925.0, "step": 13010 }, { "entropy": 5.601929426193237, "epoch": 1.0934257508926697, "grad_norm": 0.96484375, "learning_rate": 0.0004885919416024296, "loss": 5.1934, "mean_token_accuracy": 0.18379901647567748, "num_tokens": 24009039.0, "step": 13015 }, { "entropy": 5.690941286087036, "epoch": 1.0938458307078345, "grad_norm": 1.0078125, "learning_rate": 0.0004885825251412796, "loss": 5.2396, "mean_token_accuracy": 0.18521216064691542, "num_tokens": 24017725.0, "step": 13020 }, { "entropy": 5.698583364486694, "epoch": 1.0942659105229993, "grad_norm": 1.390625, "learning_rate": 0.0004885731048966252, "loss": 5.2452, "mean_token_accuracy": 0.1829244911670685, "num_tokens": 24027158.0, "step": 13025 }, { "entropy": 5.656570768356323, "epoch": 1.0946859903381643, "grad_norm": 1.09375, "learning_rate": 0.0004885636808686331, "loss": 5.3166, "mean_token_accuracy": 0.18431147187948227, "num_tokens": 24037224.0, "step": 13030 }, { "entropy": 5.669613027572632, "epoch": 1.095106070153329, "grad_norm": 1.0625, "learning_rate": 0.0004885542530574705, "loss": 5.2836, "mean_token_accuracy": 0.17715646624565123, "num_tokens": 24046097.0, "step": 13035 }, { "entropy": 5.617032241821289, "epoch": 1.095526149968494, "grad_norm": 0.9453125, "learning_rate": 0.0004885448214633042, "loss": 5.1544, "mean_token_accuracy": 0.18388025164604188, "num_tokens": 24055270.0, "step": 13040 }, { "entropy": 5.688759613037109, "epoch": 1.0959462297836589, "grad_norm": 1.0, "learning_rate": 0.0004885353860863013, "loss": 5.3386, "mean_token_accuracy": 0.1729425773024559, "num_tokens": 24064995.0, "step": 13045 }, { "entropy": 5.768623542785645, "epoch": 1.0963663095988239, "grad_norm": 1.1015625, "learning_rate": 0.000488525946926629, "loss": 5.3591, "mean_token_accuracy": 0.17546359300613404, "num_tokens": 24075523.0, "step": 13050 }, { "entropy": 5.693236255645752, "epoch": 1.0967863894139886, "grad_norm": 0.96875, "learning_rate": 0.0004885165039844545, "loss": 5.2419, "mean_token_accuracy": 0.18383643329143523, "num_tokens": 24084933.0, "step": 13055 }, { "entropy": 5.682478952407837, "epoch": 1.0972064692291534, "grad_norm": 0.96875, "learning_rate": 0.0004885070572599452, "loss": 5.2709, "mean_token_accuracy": 0.1794643297791481, "num_tokens": 24093964.0, "step": 13060 }, { "entropy": 5.660529851913452, "epoch": 1.0976265490443184, "grad_norm": 1.078125, "learning_rate": 0.0004884976067532681, "loss": 5.226, "mean_token_accuracy": 0.16944489181041716, "num_tokens": 24103951.0, "step": 13065 }, { "entropy": 5.632370138168335, "epoch": 1.0980466288594832, "grad_norm": 0.9765625, "learning_rate": 0.000488488152464591, "loss": 5.3132, "mean_token_accuracy": 0.17879901379346846, "num_tokens": 24113392.0, "step": 13070 }, { "entropy": 5.585694885253906, "epoch": 1.0984667086746482, "grad_norm": 1.03125, "learning_rate": 0.0004884786943940812, "loss": 5.1958, "mean_token_accuracy": 0.18187503069639205, "num_tokens": 24123165.0, "step": 13075 }, { "entropy": 5.65133810043335, "epoch": 1.098886788489813, "grad_norm": 1.0, "learning_rate": 0.0004884692325419063, "loss": 5.235, "mean_token_accuracy": 0.18105321526527404, "num_tokens": 24132176.0, "step": 13080 }, { "entropy": 5.7093912124633786, "epoch": 1.099306868304978, "grad_norm": 1.03125, "learning_rate": 0.0004884597669082336, "loss": 5.298, "mean_token_accuracy": 0.1792781189084053, "num_tokens": 24141737.0, "step": 13085 }, { "entropy": 5.674421787261963, "epoch": 1.0997269481201428, "grad_norm": 1.1171875, "learning_rate": 0.0004884502974932313, "loss": 5.2207, "mean_token_accuracy": 0.18564613312482833, "num_tokens": 24150477.0, "step": 13090 }, { "entropy": 5.734474849700928, "epoch": 1.1001470279353076, "grad_norm": 1.1015625, "learning_rate": 0.0004884408242970668, "loss": 5.3137, "mean_token_accuracy": 0.17890911549329758, "num_tokens": 24158739.0, "step": 13095 }, { "entropy": 5.619977521896362, "epoch": 1.1005671077504726, "grad_norm": 1.0234375, "learning_rate": 0.0004884313473199081, "loss": 5.1796, "mean_token_accuracy": 0.1910802960395813, "num_tokens": 24167511.0, "step": 13100 }, { "entropy": 5.608010721206665, "epoch": 1.1009871875656374, "grad_norm": 1.0546875, "learning_rate": 0.0004884218665619229, "loss": 5.1676, "mean_token_accuracy": 0.18546599000692368, "num_tokens": 24176413.0, "step": 13105 }, { "entropy": 5.639752197265625, "epoch": 1.1014072673808024, "grad_norm": 1.0390625, "learning_rate": 0.0004884123820232792, "loss": 5.1343, "mean_token_accuracy": 0.19571121484041215, "num_tokens": 24185135.0, "step": 13110 }, { "entropy": 5.655673170089722, "epoch": 1.1018273471959672, "grad_norm": 1.0625, "learning_rate": 0.0004884028937041451, "loss": 5.198, "mean_token_accuracy": 0.18560341000556946, "num_tokens": 24193273.0, "step": 13115 }, { "entropy": 5.7045598983764645, "epoch": 1.1022474270111322, "grad_norm": 1.109375, "learning_rate": 0.0004883934016046886, "loss": 5.2699, "mean_token_accuracy": 0.17633774280548095, "num_tokens": 24202509.0, "step": 13120 }, { "entropy": 5.707277393341064, "epoch": 1.102667506826297, "grad_norm": 1.0234375, "learning_rate": 0.000488383905725078, "loss": 5.2772, "mean_token_accuracy": 0.17653931975364684, "num_tokens": 24212644.0, "step": 13125 }, { "entropy": 5.6511146068573, "epoch": 1.1030875866414618, "grad_norm": 1.03125, "learning_rate": 0.0004883744060654811, "loss": 5.1682, "mean_token_accuracy": 0.1851295381784439, "num_tokens": 24221838.0, "step": 13130 }, { "entropy": 5.622931909561157, "epoch": 1.1035076664566268, "grad_norm": 1.015625, "learning_rate": 0.0004883649026260667, "loss": 5.2416, "mean_token_accuracy": 0.18858914375305175, "num_tokens": 24230987.0, "step": 13135 }, { "entropy": 5.656198358535766, "epoch": 1.1039277462717916, "grad_norm": 0.97265625, "learning_rate": 0.0004883553954070028, "loss": 5.1957, "mean_token_accuracy": 0.1842139780521393, "num_tokens": 24240523.0, "step": 13140 }, { "entropy": 5.685358142852783, "epoch": 1.1043478260869566, "grad_norm": 0.984375, "learning_rate": 0.000488345884408458, "loss": 5.2916, "mean_token_accuracy": 0.18604642897844315, "num_tokens": 24249799.0, "step": 13145 }, { "entropy": 5.645019388198852, "epoch": 1.1047679059021214, "grad_norm": 1.015625, "learning_rate": 0.0004883363696306007, "loss": 5.2388, "mean_token_accuracy": 0.1856672450900078, "num_tokens": 24259361.0, "step": 13150 }, { "entropy": 5.675960350036621, "epoch": 1.1051879857172864, "grad_norm": 1.0859375, "learning_rate": 0.0004883268510735995, "loss": 5.1993, "mean_token_accuracy": 0.1820151388645172, "num_tokens": 24268010.0, "step": 13155 }, { "entropy": 5.579957008361816, "epoch": 1.1056080655324512, "grad_norm": 1.03125, "learning_rate": 0.0004883173287376229, "loss": 5.2548, "mean_token_accuracy": 0.179135662317276, "num_tokens": 24277416.0, "step": 13160 }, { "entropy": 5.7374327182769775, "epoch": 1.106028145347616, "grad_norm": 0.9453125, "learning_rate": 0.0004883078026228397, "loss": 5.3284, "mean_token_accuracy": 0.17996440529823304, "num_tokens": 24286185.0, "step": 13165 }, { "entropy": 5.693727350234985, "epoch": 1.106448225162781, "grad_norm": 0.9453125, "learning_rate": 0.0004882982727294187, "loss": 5.1797, "mean_token_accuracy": 0.18091154098510742, "num_tokens": 24295382.0, "step": 13170 }, { "entropy": 5.649933910369873, "epoch": 1.1068683049779457, "grad_norm": 1.0546875, "learning_rate": 0.0004882887390575284, "loss": 5.2065, "mean_token_accuracy": 0.18659734576940537, "num_tokens": 24305197.0, "step": 13175 }, { "entropy": 5.699779605865478, "epoch": 1.1072883847931108, "grad_norm": 1.015625, "learning_rate": 0.0004882792016073381, "loss": 5.2949, "mean_token_accuracy": 0.17045092582702637, "num_tokens": 24314149.0, "step": 13180 }, { "entropy": 5.670999765396118, "epoch": 1.1077084646082755, "grad_norm": 1.0859375, "learning_rate": 0.00048826966037901655, "loss": 5.2222, "mean_token_accuracy": 0.18388027548789979, "num_tokens": 24323737.0, "step": 13185 }, { "entropy": 5.618364143371582, "epoch": 1.1081285444234406, "grad_norm": 1.0390625, "learning_rate": 0.00048826011537273276, "loss": 5.2051, "mean_token_accuracy": 0.1819797158241272, "num_tokens": 24332853.0, "step": 13190 }, { "entropy": 5.648488569259643, "epoch": 1.1085486242386053, "grad_norm": 0.96484375, "learning_rate": 0.0004882505665886558, "loss": 5.3193, "mean_token_accuracy": 0.17565063536167144, "num_tokens": 24342632.0, "step": 13195 }, { "entropy": 5.596490716934204, "epoch": 1.1089687040537701, "grad_norm": 0.95703125, "learning_rate": 0.00048824101402695493, "loss": 5.1726, "mean_token_accuracy": 0.18641633838415145, "num_tokens": 24351659.0, "step": 13200 }, { "entropy": 5.601254653930664, "epoch": 1.1093887838689351, "grad_norm": 1.0703125, "learning_rate": 0.0004882314576877993, "loss": 5.1979, "mean_token_accuracy": 0.18101346790790557, "num_tokens": 24360938.0, "step": 13205 }, { "entropy": 5.669651031494141, "epoch": 1.1098088636841, "grad_norm": 1.0, "learning_rate": 0.0004882218975713581, "loss": 5.2958, "mean_token_accuracy": 0.18428930640220642, "num_tokens": 24369603.0, "step": 13210 }, { "entropy": 5.614552736282349, "epoch": 1.110228943499265, "grad_norm": 1.1640625, "learning_rate": 0.0004882123336778009, "loss": 5.2342, "mean_token_accuracy": 0.18002720624208451, "num_tokens": 24377605.0, "step": 13215 }, { "entropy": 5.715504503250122, "epoch": 1.1106490233144297, "grad_norm": 1.0390625, "learning_rate": 0.0004882027660072969, "loss": 5.2674, "mean_token_accuracy": 0.17885927706956864, "num_tokens": 24386930.0, "step": 13220 }, { "entropy": 5.659976768493652, "epoch": 1.1110691031295947, "grad_norm": 1.1796875, "learning_rate": 0.0004881931945600157, "loss": 5.2207, "mean_token_accuracy": 0.19023172706365585, "num_tokens": 24396473.0, "step": 13225 }, { "entropy": 5.6731767654418945, "epoch": 1.1114891829447595, "grad_norm": 0.9609375, "learning_rate": 0.0004881836193361269, "loss": 5.3024, "mean_token_accuracy": 0.18500218242406846, "num_tokens": 24405461.0, "step": 13230 }, { "entropy": 5.6734254360198975, "epoch": 1.1119092627599243, "grad_norm": 0.99609375, "learning_rate": 0.0004881740403358, "loss": 5.2418, "mean_token_accuracy": 0.18769099116325377, "num_tokens": 24414138.0, "step": 13235 }, { "entropy": 5.665589189529419, "epoch": 1.1123293425750893, "grad_norm": 0.953125, "learning_rate": 0.00048816445755920474, "loss": 5.2598, "mean_token_accuracy": 0.17860634624958038, "num_tokens": 24423386.0, "step": 13240 }, { "entropy": 5.670639133453369, "epoch": 1.112749422390254, "grad_norm": 1.0234375, "learning_rate": 0.0004881548710065109, "loss": 5.2347, "mean_token_accuracy": 0.1841973140835762, "num_tokens": 24433637.0, "step": 13245 }, { "entropy": 5.673031187057495, "epoch": 1.113169502205419, "grad_norm": 0.953125, "learning_rate": 0.0004881452806778883, "loss": 5.3014, "mean_token_accuracy": 0.17894951850175858, "num_tokens": 24443677.0, "step": 13250 }, { "entropy": 5.654498672485351, "epoch": 1.113589582020584, "grad_norm": 1.109375, "learning_rate": 0.00048813568657350676, "loss": 5.2058, "mean_token_accuracy": 0.18656252324581146, "num_tokens": 24452317.0, "step": 13255 }, { "entropy": 5.658048677444458, "epoch": 1.1140096618357487, "grad_norm": 1.09375, "learning_rate": 0.0004881260886935363, "loss": 5.2362, "mean_token_accuracy": 0.1817552089691162, "num_tokens": 24460626.0, "step": 13260 }, { "entropy": 5.713985395431519, "epoch": 1.1144297416509137, "grad_norm": 1.234375, "learning_rate": 0.00048811648703814693, "loss": 5.3048, "mean_token_accuracy": 0.17176585644483566, "num_tokens": 24469583.0, "step": 13265 }, { "entropy": 5.655399179458618, "epoch": 1.1148498214660785, "grad_norm": 1.0078125, "learning_rate": 0.0004881068816075087, "loss": 5.2348, "mean_token_accuracy": 0.1799187481403351, "num_tokens": 24478811.0, "step": 13270 }, { "entropy": 5.646001625061035, "epoch": 1.1152699012812435, "grad_norm": 1.0546875, "learning_rate": 0.00048809727240179193, "loss": 5.2778, "mean_token_accuracy": 0.18360942751169204, "num_tokens": 24487818.0, "step": 13275 }, { "entropy": 5.654885768890381, "epoch": 1.1156899810964083, "grad_norm": 1.0390625, "learning_rate": 0.0004880876594211665, "loss": 5.2633, "mean_token_accuracy": 0.18129506558179856, "num_tokens": 24497087.0, "step": 13280 }, { "entropy": 5.762274217605591, "epoch": 1.1161100609115733, "grad_norm": 1.15625, "learning_rate": 0.00048807804266580304, "loss": 5.2049, "mean_token_accuracy": 0.17759926319122316, "num_tokens": 24505347.0, "step": 13285 }, { "entropy": 5.7171532154083256, "epoch": 1.116530140726738, "grad_norm": 1.078125, "learning_rate": 0.0004880684221358717, "loss": 5.2158, "mean_token_accuracy": 0.18222595155239105, "num_tokens": 24514732.0, "step": 13290 }, { "entropy": 5.693452453613281, "epoch": 1.116950220541903, "grad_norm": 1.0390625, "learning_rate": 0.00048805879783154305, "loss": 5.2611, "mean_token_accuracy": 0.18072494268417358, "num_tokens": 24523295.0, "step": 13295 }, { "entropy": 5.652671384811401, "epoch": 1.1173703003570679, "grad_norm": 1.015625, "learning_rate": 0.00048804916975298744, "loss": 5.1683, "mean_token_accuracy": 0.18520602583885193, "num_tokens": 24532415.0, "step": 13300 }, { "entropy": 5.701218938827514, "epoch": 1.1177903801722326, "grad_norm": 1.03125, "learning_rate": 0.0004880395379003755, "loss": 5.2989, "mean_token_accuracy": 0.18368625789880752, "num_tokens": 24541856.0, "step": 13305 }, { "entropy": 5.653467607498169, "epoch": 1.1182104599873977, "grad_norm": 1.0859375, "learning_rate": 0.00048802990227387797, "loss": 5.3029, "mean_token_accuracy": 0.17603146731853486, "num_tokens": 24550982.0, "step": 13310 }, { "entropy": 5.708571863174439, "epoch": 1.1186305398025624, "grad_norm": 0.9375, "learning_rate": 0.00048802026287366525, "loss": 5.3698, "mean_token_accuracy": 0.17326470464468002, "num_tokens": 24561176.0, "step": 13315 }, { "entropy": 5.716219568252564, "epoch": 1.1190506196177274, "grad_norm": 0.9765625, "learning_rate": 0.00048801061969990834, "loss": 5.2399, "mean_token_accuracy": 0.17708809971809386, "num_tokens": 24570741.0, "step": 13320 }, { "entropy": 5.668814468383789, "epoch": 1.1194706994328922, "grad_norm": 0.95703125, "learning_rate": 0.00048800097275277795, "loss": 5.2386, "mean_token_accuracy": 0.1835561990737915, "num_tokens": 24580175.0, "step": 13325 }, { "entropy": 5.694848728179932, "epoch": 1.119890779248057, "grad_norm": 1.03125, "learning_rate": 0.000487991322032445, "loss": 5.2421, "mean_token_accuracy": 0.18930458426475524, "num_tokens": 24588754.0, "step": 13330 }, { "entropy": 5.768158197402954, "epoch": 1.120310859063222, "grad_norm": 1.0234375, "learning_rate": 0.0004879816675390805, "loss": 5.4214, "mean_token_accuracy": 0.17500364333391188, "num_tokens": 24599429.0, "step": 13335 }, { "entropy": 5.630630683898926, "epoch": 1.1207309388783868, "grad_norm": 0.9921875, "learning_rate": 0.00048797200927285547, "loss": 5.1485, "mean_token_accuracy": 0.191367506980896, "num_tokens": 24608767.0, "step": 13340 }, { "entropy": 5.658508014678955, "epoch": 1.1211510186935518, "grad_norm": 1.078125, "learning_rate": 0.0004879623472339409, "loss": 5.3061, "mean_token_accuracy": 0.18395798653364182, "num_tokens": 24618232.0, "step": 13345 }, { "entropy": 5.703488397598266, "epoch": 1.1215710985087166, "grad_norm": 1.0625, "learning_rate": 0.000487952681422508, "loss": 5.2115, "mean_token_accuracy": 0.18733716905117034, "num_tokens": 24626986.0, "step": 13350 }, { "entropy": 5.5935609340667725, "epoch": 1.1219911783238816, "grad_norm": 1.078125, "learning_rate": 0.000487943011838728, "loss": 5.0953, "mean_token_accuracy": 0.1933867812156677, "num_tokens": 24635283.0, "step": 13355 }, { "entropy": 5.560376167297363, "epoch": 1.1224112581390464, "grad_norm": 1.0859375, "learning_rate": 0.0004879333384827722, "loss": 5.1993, "mean_token_accuracy": 0.18064828217029572, "num_tokens": 24644451.0, "step": 13360 }, { "entropy": 5.738787651062012, "epoch": 1.1228313379542114, "grad_norm": 1.046875, "learning_rate": 0.0004879236613548119, "loss": 5.3299, "mean_token_accuracy": 0.17765877395868301, "num_tokens": 24654811.0, "step": 13365 }, { "entropy": 5.710661268234253, "epoch": 1.1232514177693762, "grad_norm": 0.99609375, "learning_rate": 0.0004879139804550187, "loss": 5.2732, "mean_token_accuracy": 0.18363934457302095, "num_tokens": 24663712.0, "step": 13370 }, { "entropy": 5.690995979309082, "epoch": 1.123671497584541, "grad_norm": 1.0390625, "learning_rate": 0.00048790429578356387, "loss": 5.3495, "mean_token_accuracy": 0.17719237953424455, "num_tokens": 24672518.0, "step": 13375 }, { "entropy": 5.672314453125, "epoch": 1.124091577399706, "grad_norm": 0.99609375, "learning_rate": 0.00048789460734061915, "loss": 5.2395, "mean_token_accuracy": 0.18729058057069778, "num_tokens": 24681900.0, "step": 13380 }, { "entropy": 5.680240869522095, "epoch": 1.1245116572148708, "grad_norm": 0.9921875, "learning_rate": 0.0004878849151263561, "loss": 5.247, "mean_token_accuracy": 0.18325935900211335, "num_tokens": 24691760.0, "step": 13385 }, { "entropy": 5.707732820510865, "epoch": 1.1249317370300358, "grad_norm": 1.09375, "learning_rate": 0.0004878752191409463, "loss": 5.204, "mean_token_accuracy": 0.1889537826180458, "num_tokens": 24700742.0, "step": 13390 }, { "entropy": 5.660908460617065, "epoch": 1.1253518168452006, "grad_norm": 0.9453125, "learning_rate": 0.0004878655193845616, "loss": 5.2693, "mean_token_accuracy": 0.18033764511346817, "num_tokens": 24709329.0, "step": 13395 }, { "entropy": 5.679986619949341, "epoch": 1.1257718966603654, "grad_norm": 1.03125, "learning_rate": 0.00048785581585737394, "loss": 5.4078, "mean_token_accuracy": 0.1735921934247017, "num_tokens": 24718475.0, "step": 13400 }, { "entropy": 5.734420919418335, "epoch": 1.1261919764755304, "grad_norm": 0.96875, "learning_rate": 0.000487846108559555, "loss": 5.2537, "mean_token_accuracy": 0.19340488016605378, "num_tokens": 24727817.0, "step": 13405 }, { "entropy": 5.682028293609619, "epoch": 1.1266120562906952, "grad_norm": 1.1328125, "learning_rate": 0.00048783639749127694, "loss": 5.2446, "mean_token_accuracy": 0.1802604839205742, "num_tokens": 24737057.0, "step": 13410 }, { "entropy": 5.728585052490234, "epoch": 1.1270321361058602, "grad_norm": 1.0390625, "learning_rate": 0.0004878266826527116, "loss": 5.2955, "mean_token_accuracy": 0.17693672478199005, "num_tokens": 24746016.0, "step": 13415 }, { "entropy": 5.744471168518066, "epoch": 1.127452215921025, "grad_norm": 1.078125, "learning_rate": 0.00048781696404403126, "loss": 5.287, "mean_token_accuracy": 0.17760882079601287, "num_tokens": 24755978.0, "step": 13420 }, { "entropy": 5.6333362579345705, "epoch": 1.12787229573619, "grad_norm": 0.96875, "learning_rate": 0.00048780724166540794, "loss": 5.1868, "mean_token_accuracy": 0.18159880489110947, "num_tokens": 24765255.0, "step": 13425 }, { "entropy": 5.625966501235962, "epoch": 1.1282923755513548, "grad_norm": 0.96875, "learning_rate": 0.0004877975155170139, "loss": 5.26, "mean_token_accuracy": 0.1774269625544548, "num_tokens": 24774339.0, "step": 13430 }, { "entropy": 5.673966026306152, "epoch": 1.1287124553665198, "grad_norm": 1.0859375, "learning_rate": 0.0004877877855990215, "loss": 5.2606, "mean_token_accuracy": 0.1784887433052063, "num_tokens": 24783236.0, "step": 13435 }, { "entropy": 5.624089002609253, "epoch": 1.1291325351816845, "grad_norm": 0.91796875, "learning_rate": 0.000487778051911603, "loss": 5.1747, "mean_token_accuracy": 0.1849935159087181, "num_tokens": 24792168.0, "step": 13440 }, { "entropy": 5.7314207553863525, "epoch": 1.1295526149968493, "grad_norm": 1.0703125, "learning_rate": 0.0004877683144549308, "loss": 5.3273, "mean_token_accuracy": 0.17823956906795502, "num_tokens": 24800843.0, "step": 13445 }, { "entropy": 5.68920521736145, "epoch": 1.1299726948120143, "grad_norm": 1.0546875, "learning_rate": 0.00048775857322917753, "loss": 5.2068, "mean_token_accuracy": 0.17774172574281694, "num_tokens": 24810475.0, "step": 13450 }, { "entropy": 5.629417800903321, "epoch": 1.1303927746271791, "grad_norm": 0.98828125, "learning_rate": 0.0004877488282345158, "loss": 5.2749, "mean_token_accuracy": 0.17738532274961472, "num_tokens": 24820486.0, "step": 13455 }, { "entropy": 5.714861440658569, "epoch": 1.1308128544423441, "grad_norm": 0.97265625, "learning_rate": 0.000487739079471118, "loss": 5.3452, "mean_token_accuracy": 0.18117614984512329, "num_tokens": 24830243.0, "step": 13460 }, { "entropy": 5.671858119964599, "epoch": 1.131232934257509, "grad_norm": 1.0859375, "learning_rate": 0.000487729326939157, "loss": 5.2567, "mean_token_accuracy": 0.17802006602287293, "num_tokens": 24839090.0, "step": 13465 }, { "entropy": 5.641973829269409, "epoch": 1.1316530140726737, "grad_norm": 1.0546875, "learning_rate": 0.00048771957063880553, "loss": 5.2456, "mean_token_accuracy": 0.17685411423444747, "num_tokens": 24847933.0, "step": 13470 }, { "entropy": 5.701412487030029, "epoch": 1.1320730938878387, "grad_norm": 1.140625, "learning_rate": 0.0004877098105702363, "loss": 5.2282, "mean_token_accuracy": 0.1925026521086693, "num_tokens": 24857037.0, "step": 13475 }, { "entropy": 5.557268524169922, "epoch": 1.1324931737030035, "grad_norm": 1.0390625, "learning_rate": 0.00048770004673362243, "loss": 5.0701, "mean_token_accuracy": 0.19654955565929413, "num_tokens": 24866042.0, "step": 13480 }, { "entropy": 5.548870515823364, "epoch": 1.1329132535181685, "grad_norm": 1.0390625, "learning_rate": 0.00048769027912913673, "loss": 5.042, "mean_token_accuracy": 0.1975183829665184, "num_tokens": 24873735.0, "step": 13485 }, { "entropy": 5.5484272003173825, "epoch": 1.1333333333333333, "grad_norm": 1.015625, "learning_rate": 0.0004876805077569522, "loss": 5.1312, "mean_token_accuracy": 0.18720841109752656, "num_tokens": 24882277.0, "step": 13490 }, { "entropy": 5.5885995388031, "epoch": 1.133753413148498, "grad_norm": 1.0546875, "learning_rate": 0.00048767073261724204, "loss": 5.2449, "mean_token_accuracy": 0.18127612471580506, "num_tokens": 24891354.0, "step": 13495 }, { "entropy": 5.698721837997437, "epoch": 1.134173492963663, "grad_norm": 1.0390625, "learning_rate": 0.0004876609537101793, "loss": 5.2295, "mean_token_accuracy": 0.1827969342470169, "num_tokens": 24899887.0, "step": 13500 }, { "entropy": 5.767540693283081, "epoch": 1.1345935727788279, "grad_norm": 1.078125, "learning_rate": 0.0004876511710359374, "loss": 5.2505, "mean_token_accuracy": 0.18438805490732194, "num_tokens": 24908616.0, "step": 13505 }, { "entropy": 5.694990539550782, "epoch": 1.135013652593993, "grad_norm": 1.1171875, "learning_rate": 0.00048764138459468935, "loss": 5.2875, "mean_token_accuracy": 0.18279025405645372, "num_tokens": 24917864.0, "step": 13510 }, { "entropy": 5.763183450698852, "epoch": 1.1354337324091577, "grad_norm": 0.921875, "learning_rate": 0.00048763159438660876, "loss": 5.3047, "mean_token_accuracy": 0.18350074142217637, "num_tokens": 24927864.0, "step": 13515 }, { "entropy": 5.595635509490966, "epoch": 1.1358538122243227, "grad_norm": 1.03125, "learning_rate": 0.00048762180041186893, "loss": 5.2086, "mean_token_accuracy": 0.18730085641145705, "num_tokens": 24937146.0, "step": 13520 }, { "entropy": 5.668976640701294, "epoch": 1.1362738920394875, "grad_norm": 1.0625, "learning_rate": 0.0004876120026706434, "loss": 5.2948, "mean_token_accuracy": 0.1853949770331383, "num_tokens": 24945694.0, "step": 13525 }, { "entropy": 5.652399349212646, "epoch": 1.1366939718546525, "grad_norm": 0.9609375, "learning_rate": 0.0004876022011631057, "loss": 5.1888, "mean_token_accuracy": 0.1877109855413437, "num_tokens": 24955325.0, "step": 13530 }, { "entropy": 5.631319904327393, "epoch": 1.1371140516698173, "grad_norm": 0.953125, "learning_rate": 0.0004875923958894295, "loss": 5.0891, "mean_token_accuracy": 0.18686340153217315, "num_tokens": 24964028.0, "step": 13535 }, { "entropy": 5.67464246749878, "epoch": 1.137534131484982, "grad_norm": 1.0390625, "learning_rate": 0.00048758258684978846, "loss": 5.2492, "mean_token_accuracy": 0.1864520639181137, "num_tokens": 24972923.0, "step": 13540 }, { "entropy": 5.668635845184326, "epoch": 1.137954211300147, "grad_norm": 1.0625, "learning_rate": 0.00048757277404435636, "loss": 5.154, "mean_token_accuracy": 0.1896499752998352, "num_tokens": 24982156.0, "step": 13545 }, { "entropy": 5.604721546173096, "epoch": 1.1383742911153119, "grad_norm": 1.03125, "learning_rate": 0.000487562957473307, "loss": 5.2057, "mean_token_accuracy": 0.18223409801721574, "num_tokens": 24991616.0, "step": 13550 }, { "entropy": 5.603752851486206, "epoch": 1.1387943709304769, "grad_norm": 1.0, "learning_rate": 0.0004875531371368144, "loss": 5.2643, "mean_token_accuracy": 0.18362347334623336, "num_tokens": 25001140.0, "step": 13555 }, { "entropy": 5.635473251342773, "epoch": 1.1392144507456416, "grad_norm": 1.03125, "learning_rate": 0.00048754331303505236, "loss": 5.18, "mean_token_accuracy": 0.18545341789722442, "num_tokens": 25010863.0, "step": 13560 }, { "entropy": 5.703922080993652, "epoch": 1.1396345305608064, "grad_norm": 1.125, "learning_rate": 0.00048753348516819496, "loss": 5.2766, "mean_token_accuracy": 0.1830003872513771, "num_tokens": 25019770.0, "step": 13565 }, { "entropy": 5.76104884147644, "epoch": 1.1400546103759714, "grad_norm": 0.9921875, "learning_rate": 0.0004875236535364163, "loss": 5.3171, "mean_token_accuracy": 0.17561974972486497, "num_tokens": 25029900.0, "step": 13570 }, { "entropy": 5.730456686019897, "epoch": 1.1404746901911362, "grad_norm": 0.984375, "learning_rate": 0.0004875138181398906, "loss": 5.2764, "mean_token_accuracy": 0.17939021140336991, "num_tokens": 25039428.0, "step": 13575 }, { "entropy": 5.670626068115235, "epoch": 1.1408947700063012, "grad_norm": 1.015625, "learning_rate": 0.000487503978978792, "loss": 5.2619, "mean_token_accuracy": 0.17901408821344375, "num_tokens": 25049145.0, "step": 13580 }, { "entropy": 5.6930016040802, "epoch": 1.141314849821466, "grad_norm": 1.0234375, "learning_rate": 0.00048749413605329487, "loss": 5.2767, "mean_token_accuracy": 0.1844818651676178, "num_tokens": 25058772.0, "step": 13585 }, { "entropy": 5.65727128982544, "epoch": 1.141734929636631, "grad_norm": 1.0703125, "learning_rate": 0.00048748428936357346, "loss": 5.2076, "mean_token_accuracy": 0.18715782165527345, "num_tokens": 25067249.0, "step": 13590 }, { "entropy": 5.603106737136841, "epoch": 1.1421550094517958, "grad_norm": 1.0390625, "learning_rate": 0.0004874744389098024, "loss": 5.1778, "mean_token_accuracy": 0.17866151928901672, "num_tokens": 25076893.0, "step": 13595 }, { "entropy": 5.5808816909790036, "epoch": 1.1425750892669608, "grad_norm": 1.0703125, "learning_rate": 0.0004874645846921559, "loss": 5.1565, "mean_token_accuracy": 0.1969090536236763, "num_tokens": 25086238.0, "step": 13600 }, { "entropy": 5.653136682510376, "epoch": 1.1429951690821256, "grad_norm": 0.96484375, "learning_rate": 0.00048745472671080884, "loss": 5.1952, "mean_token_accuracy": 0.17972956895828246, "num_tokens": 25095334.0, "step": 13605 }, { "entropy": 5.666208982467651, "epoch": 1.1434152488972904, "grad_norm": 1.015625, "learning_rate": 0.00048744486496593565, "loss": 5.1923, "mean_token_accuracy": 0.1845288097858429, "num_tokens": 25104136.0, "step": 13610 }, { "entropy": 5.670278072357178, "epoch": 1.1438353287124554, "grad_norm": 1.109375, "learning_rate": 0.000487434999457711, "loss": 5.1784, "mean_token_accuracy": 0.19145991206169127, "num_tokens": 25112629.0, "step": 13615 }, { "entropy": 5.662566757202148, "epoch": 1.1442554085276202, "grad_norm": 1.078125, "learning_rate": 0.0004874251301863098, "loss": 5.2279, "mean_token_accuracy": 0.18024315536022187, "num_tokens": 25121014.0, "step": 13620 }, { "entropy": 5.6031898021697994, "epoch": 1.1446754883427852, "grad_norm": 1.0078125, "learning_rate": 0.00048741525715190675, "loss": 5.2495, "mean_token_accuracy": 0.18202711194753646, "num_tokens": 25130097.0, "step": 13625 }, { "entropy": 5.675847053527832, "epoch": 1.14509556815795, "grad_norm": 1.03125, "learning_rate": 0.0004874053803546769, "loss": 5.2692, "mean_token_accuracy": 0.18346379250288009, "num_tokens": 25139065.0, "step": 13630 }, { "entropy": 5.674046373367309, "epoch": 1.1455156479731148, "grad_norm": 1.09375, "learning_rate": 0.000487395499794795, "loss": 5.256, "mean_token_accuracy": 0.1828017920255661, "num_tokens": 25148852.0, "step": 13635 }, { "entropy": 5.6175562858581545, "epoch": 1.1459357277882798, "grad_norm": 1.0859375, "learning_rate": 0.0004873856154724362, "loss": 5.1349, "mean_token_accuracy": 0.1938551276922226, "num_tokens": 25157580.0, "step": 13640 }, { "entropy": 5.7053426742553714, "epoch": 1.1463558076034446, "grad_norm": 1.0625, "learning_rate": 0.0004873757273877756, "loss": 5.2319, "mean_token_accuracy": 0.18188530057668686, "num_tokens": 25166243.0, "step": 13645 }, { "entropy": 5.633433103561401, "epoch": 1.1467758874186096, "grad_norm": 1.0625, "learning_rate": 0.00048736583554098836, "loss": 5.2495, "mean_token_accuracy": 0.1773426726460457, "num_tokens": 25174674.0, "step": 13650 }, { "entropy": 5.579121351242065, "epoch": 1.1471959672337744, "grad_norm": 1.0625, "learning_rate": 0.00048735593993224973, "loss": 5.1702, "mean_token_accuracy": 0.19095555394887925, "num_tokens": 25183892.0, "step": 13655 }, { "entropy": 5.601590204238891, "epoch": 1.1476160470489394, "grad_norm": 1.046875, "learning_rate": 0.00048734604056173495, "loss": 5.2048, "mean_token_accuracy": 0.1855093717575073, "num_tokens": 25192731.0, "step": 13660 }, { "entropy": 5.676291370391846, "epoch": 1.1480361268641042, "grad_norm": 1.03125, "learning_rate": 0.00048733613742961933, "loss": 5.2998, "mean_token_accuracy": 0.18106140047311783, "num_tokens": 25201280.0, "step": 13665 }, { "entropy": 5.663060045242309, "epoch": 1.1484562066792692, "grad_norm": 1.109375, "learning_rate": 0.00048732623053607846, "loss": 5.1861, "mean_token_accuracy": 0.18693942725658416, "num_tokens": 25209929.0, "step": 13670 }, { "entropy": 5.644407796859741, "epoch": 1.148876286494434, "grad_norm": 1.0390625, "learning_rate": 0.0004873163198812877, "loss": 5.1259, "mean_token_accuracy": 0.1932801976799965, "num_tokens": 25218583.0, "step": 13675 }, { "entropy": 5.705209589004516, "epoch": 1.1492963663095987, "grad_norm": 0.96484375, "learning_rate": 0.0004873064054654227, "loss": 5.3184, "mean_token_accuracy": 0.17877951115369797, "num_tokens": 25228949.0, "step": 13680 }, { "entropy": 5.660206937789917, "epoch": 1.1497164461247638, "grad_norm": 1.0625, "learning_rate": 0.00048729648728865904, "loss": 5.1621, "mean_token_accuracy": 0.19778074622154235, "num_tokens": 25238603.0, "step": 13685 }, { "entropy": 5.655360555648803, "epoch": 1.1501365259399285, "grad_norm": 1.0625, "learning_rate": 0.00048728656535117237, "loss": 5.2919, "mean_token_accuracy": 0.17583213448524476, "num_tokens": 25248265.0, "step": 13690 }, { "entropy": 5.651636743545533, "epoch": 1.1505566057550936, "grad_norm": 1.0078125, "learning_rate": 0.0004872766396531386, "loss": 5.2608, "mean_token_accuracy": 0.18153744339942932, "num_tokens": 25258195.0, "step": 13695 }, { "entropy": 5.7246545314788815, "epoch": 1.1509766855702583, "grad_norm": 1.0234375, "learning_rate": 0.00048726671019473335, "loss": 5.2089, "mean_token_accuracy": 0.18699081987142563, "num_tokens": 25267886.0, "step": 13700 }, { "entropy": 5.712115097045898, "epoch": 1.1513967653854231, "grad_norm": 1.078125, "learning_rate": 0.00048725677697613267, "loss": 5.2551, "mean_token_accuracy": 0.18418315351009368, "num_tokens": 25277304.0, "step": 13705 }, { "entropy": 5.677088880538941, "epoch": 1.1518168452005881, "grad_norm": 1.0078125, "learning_rate": 0.0004872468399975125, "loss": 5.2623, "mean_token_accuracy": 0.1757863402366638, "num_tokens": 25286771.0, "step": 13710 }, { "entropy": 5.7187107563018795, "epoch": 1.152236925015753, "grad_norm": 1.09375, "learning_rate": 0.00048723689925904884, "loss": 5.3138, "mean_token_accuracy": 0.18047995120286942, "num_tokens": 25296018.0, "step": 13715 }, { "entropy": 5.6641716957092285, "epoch": 1.152657004830918, "grad_norm": 1.1875, "learning_rate": 0.0004872269547609179, "loss": 5.2628, "mean_token_accuracy": 0.19101088643074035, "num_tokens": 25305737.0, "step": 13720 }, { "entropy": 5.590297889709473, "epoch": 1.1530770846460827, "grad_norm": 1.0625, "learning_rate": 0.0004872170065032956, "loss": 5.0968, "mean_token_accuracy": 0.18746337294578552, "num_tokens": 25314625.0, "step": 13725 }, { "entropy": 5.653110456466675, "epoch": 1.1534971644612477, "grad_norm": 1.0859375, "learning_rate": 0.0004872070544863584, "loss": 5.2293, "mean_token_accuracy": 0.18832913637161255, "num_tokens": 25323453.0, "step": 13730 }, { "entropy": 5.667890024185181, "epoch": 1.1539172442764125, "grad_norm": 0.9296875, "learning_rate": 0.0004871970987102824, "loss": 5.2525, "mean_token_accuracy": 0.18474705517292023, "num_tokens": 25333236.0, "step": 13735 }, { "entropy": 5.658923721313476, "epoch": 1.1543373240915775, "grad_norm": 1.0703125, "learning_rate": 0.0004871871391752442, "loss": 5.1421, "mean_token_accuracy": 0.18851940780878068, "num_tokens": 25341993.0, "step": 13740 }, { "entropy": 5.7106139183044435, "epoch": 1.1547574039067423, "grad_norm": 1.0, "learning_rate": 0.00048717717588141993, "loss": 5.1856, "mean_token_accuracy": 0.18306624591350557, "num_tokens": 25350695.0, "step": 13745 }, { "entropy": 5.690318775177002, "epoch": 1.155177483721907, "grad_norm": 1.15625, "learning_rate": 0.0004871672088289863, "loss": 5.2383, "mean_token_accuracy": 0.1834632784128189, "num_tokens": 25359044.0, "step": 13750 }, { "entropy": 5.656288290023804, "epoch": 1.155597563537072, "grad_norm": 0.98828125, "learning_rate": 0.00048715723801811986, "loss": 5.2464, "mean_token_accuracy": 0.18718164265155793, "num_tokens": 25367959.0, "step": 13755 }, { "entropy": 5.689774894714356, "epoch": 1.156017643352237, "grad_norm": 1.0625, "learning_rate": 0.00048714726344899716, "loss": 5.2785, "mean_token_accuracy": 0.18345650136470795, "num_tokens": 25376968.0, "step": 13760 }, { "entropy": 5.587648582458496, "epoch": 1.156437723167402, "grad_norm": 1.078125, "learning_rate": 0.0004871372851217949, "loss": 5.154, "mean_token_accuracy": 0.18699706792831422, "num_tokens": 25385381.0, "step": 13765 }, { "entropy": 5.653797578811646, "epoch": 1.1568578029825667, "grad_norm": 0.9921875, "learning_rate": 0.0004871273030366899, "loss": 5.2549, "mean_token_accuracy": 0.1830378443002701, "num_tokens": 25394647.0, "step": 13770 }, { "entropy": 5.6360091209411625, "epoch": 1.1572778827977315, "grad_norm": 1.046875, "learning_rate": 0.0004871173171938589, "loss": 5.2205, "mean_token_accuracy": 0.190428164601326, "num_tokens": 25403973.0, "step": 13775 }, { "entropy": 5.641447877883911, "epoch": 1.1576979626128965, "grad_norm": 0.96875, "learning_rate": 0.0004871073275934789, "loss": 5.1966, "mean_token_accuracy": 0.18467400819063187, "num_tokens": 25412319.0, "step": 13780 }, { "entropy": 5.635476350784302, "epoch": 1.1581180424280613, "grad_norm": 1.1015625, "learning_rate": 0.00048709733423572685, "loss": 5.2345, "mean_token_accuracy": 0.17975110709667205, "num_tokens": 25420558.0, "step": 13785 }, { "entropy": 5.606858444213867, "epoch": 1.1585381222432263, "grad_norm": 1.1171875, "learning_rate": 0.00048708733712077973, "loss": 5.1696, "mean_token_accuracy": 0.19106279760599137, "num_tokens": 25429258.0, "step": 13790 }, { "entropy": 5.642301034927368, "epoch": 1.158958202058391, "grad_norm": 1.0078125, "learning_rate": 0.0004870773362488146, "loss": 5.1335, "mean_token_accuracy": 0.1902405098080635, "num_tokens": 25438005.0, "step": 13795 }, { "entropy": 5.639985609054565, "epoch": 1.159378281873556, "grad_norm": 1.0546875, "learning_rate": 0.0004870673316200087, "loss": 5.1726, "mean_token_accuracy": 0.1865493282675743, "num_tokens": 25447120.0, "step": 13800 }, { "entropy": 5.6110520362854, "epoch": 1.1597983616887209, "grad_norm": 0.97265625, "learning_rate": 0.0004870573232345392, "loss": 5.1486, "mean_token_accuracy": 0.1866797223687172, "num_tokens": 25456216.0, "step": 13805 }, { "entropy": 5.789475679397583, "epoch": 1.1602184415038856, "grad_norm": 0.94921875, "learning_rate": 0.0004870473110925834, "loss": 5.4167, "mean_token_accuracy": 0.1789159968495369, "num_tokens": 25466456.0, "step": 13810 }, { "entropy": 5.609800004959107, "epoch": 1.1606385213190507, "grad_norm": 1.0390625, "learning_rate": 0.0004870372951943187, "loss": 5.089, "mean_token_accuracy": 0.19656192660331726, "num_tokens": 25475217.0, "step": 13815 }, { "entropy": 5.691216373443604, "epoch": 1.1610586011342154, "grad_norm": 1.03125, "learning_rate": 0.00048702727553992243, "loss": 5.3606, "mean_token_accuracy": 0.17178623229265214, "num_tokens": 25484617.0, "step": 13820 }, { "entropy": 5.6387592315673825, "epoch": 1.1614786809493804, "grad_norm": 1.15625, "learning_rate": 0.00048701725212957223, "loss": 5.1802, "mean_token_accuracy": 0.1895785689353943, "num_tokens": 25493936.0, "step": 13825 }, { "entropy": 5.62312159538269, "epoch": 1.1618987607645452, "grad_norm": 1.0703125, "learning_rate": 0.0004870072249634455, "loss": 5.1455, "mean_token_accuracy": 0.19129602909088134, "num_tokens": 25502306.0, "step": 13830 }, { "entropy": 5.5922346115112305, "epoch": 1.1623188405797102, "grad_norm": 1.0703125, "learning_rate": 0.00048699719404172006, "loss": 5.1968, "mean_token_accuracy": 0.1844888299703598, "num_tokens": 25511247.0, "step": 13835 }, { "entropy": 5.6161332607269285, "epoch": 1.162738920394875, "grad_norm": 1.0078125, "learning_rate": 0.00048698715936457344, "loss": 5.2631, "mean_token_accuracy": 0.17933199405670167, "num_tokens": 25520482.0, "step": 13840 }, { "entropy": 5.621447467803955, "epoch": 1.1631590002100398, "grad_norm": 0.94921875, "learning_rate": 0.00048697712093218336, "loss": 5.1487, "mean_token_accuracy": 0.18286369144916534, "num_tokens": 25529854.0, "step": 13845 }, { "entropy": 5.621363544464112, "epoch": 1.1635790800252048, "grad_norm": 0.99609375, "learning_rate": 0.0004869670787447279, "loss": 5.1056, "mean_token_accuracy": 0.18813402503728865, "num_tokens": 25538251.0, "step": 13850 }, { "entropy": 5.610999059677124, "epoch": 1.1639991598403696, "grad_norm": 0.99609375, "learning_rate": 0.0004869570328023846, "loss": 5.1916, "mean_token_accuracy": 0.1843186005949974, "num_tokens": 25546889.0, "step": 13855 }, { "entropy": 5.615938520431518, "epoch": 1.1644192396555346, "grad_norm": 0.93359375, "learning_rate": 0.00048694698310533177, "loss": 5.2336, "mean_token_accuracy": 0.18064400255680085, "num_tokens": 25557040.0, "step": 13860 }, { "entropy": 5.713138818740845, "epoch": 1.1648393194706994, "grad_norm": 1.0390625, "learning_rate": 0.0004869369296537472, "loss": 5.3911, "mean_token_accuracy": 0.1721922531723976, "num_tokens": 25565798.0, "step": 13865 }, { "entropy": 5.7594860076904295, "epoch": 1.1652593992858642, "grad_norm": 1.0078125, "learning_rate": 0.0004869268724478091, "loss": 5.2341, "mean_token_accuracy": 0.18672980070114137, "num_tokens": 25575039.0, "step": 13870 }, { "entropy": 5.738545560836792, "epoch": 1.1656794791010292, "grad_norm": 1.0390625, "learning_rate": 0.00048691681148769545, "loss": 5.2427, "mean_token_accuracy": 0.18275733292102814, "num_tokens": 25584635.0, "step": 13875 }, { "entropy": 5.571552896499634, "epoch": 1.166099558916194, "grad_norm": 1.1015625, "learning_rate": 0.0004869067467735847, "loss": 5.1801, "mean_token_accuracy": 0.18628969341516494, "num_tokens": 25593736.0, "step": 13880 }, { "entropy": 5.637985897064209, "epoch": 1.166519638731359, "grad_norm": 0.9609375, "learning_rate": 0.0004868966783056551, "loss": 5.1414, "mean_token_accuracy": 0.1976246416568756, "num_tokens": 25602685.0, "step": 13885 }, { "entropy": 5.6373748779296875, "epoch": 1.1669397185465238, "grad_norm": 1.0859375, "learning_rate": 0.00048688660608408484, "loss": 5.2173, "mean_token_accuracy": 0.18443208038806916, "num_tokens": 25610690.0, "step": 13890 }, { "entropy": 5.571537446975708, "epoch": 1.1673597983616888, "grad_norm": 0.9765625, "learning_rate": 0.00048687653010905254, "loss": 5.1052, "mean_token_accuracy": 0.191065913438797, "num_tokens": 25619805.0, "step": 13895 }, { "entropy": 5.682378625869751, "epoch": 1.1677798781768536, "grad_norm": 0.98046875, "learning_rate": 0.00048686645038073664, "loss": 5.3021, "mean_token_accuracy": 0.18269888311624527, "num_tokens": 25629447.0, "step": 13900 }, { "entropy": 5.614893770217895, "epoch": 1.1681999579920186, "grad_norm": 1.0625, "learning_rate": 0.00048685636689931554, "loss": 5.179, "mean_token_accuracy": 0.187445530295372, "num_tokens": 25638619.0, "step": 13905 }, { "entropy": 5.702981233596802, "epoch": 1.1686200378071834, "grad_norm": 1.0703125, "learning_rate": 0.00048684627966496803, "loss": 5.2479, "mean_token_accuracy": 0.1858130842447281, "num_tokens": 25648255.0, "step": 13910 }, { "entropy": 5.642744064331055, "epoch": 1.1690401176223482, "grad_norm": 1.0234375, "learning_rate": 0.00048683618867787284, "loss": 5.2686, "mean_token_accuracy": 0.18554134666919708, "num_tokens": 25657881.0, "step": 13915 }, { "entropy": 5.697892856597901, "epoch": 1.1694601974375132, "grad_norm": 1.21875, "learning_rate": 0.0004868260939382086, "loss": 5.3134, "mean_token_accuracy": 0.17519297897815705, "num_tokens": 25666773.0, "step": 13920 }, { "entropy": 5.649929189682007, "epoch": 1.169880277252678, "grad_norm": 1.0859375, "learning_rate": 0.0004868159954461542, "loss": 5.2056, "mean_token_accuracy": 0.18639975935220718, "num_tokens": 25675152.0, "step": 13925 }, { "entropy": 5.717081594467163, "epoch": 1.170300357067843, "grad_norm": 0.9765625, "learning_rate": 0.00048680589320188847, "loss": 5.2951, "mean_token_accuracy": 0.1795559599995613, "num_tokens": 25684962.0, "step": 13930 }, { "entropy": 5.662706089019776, "epoch": 1.1707204368830078, "grad_norm": 1.046875, "learning_rate": 0.0004867957872055904, "loss": 5.2163, "mean_token_accuracy": 0.18223697245121, "num_tokens": 25693782.0, "step": 13935 }, { "entropy": 5.607689046859742, "epoch": 1.1711405166981725, "grad_norm": 1.0625, "learning_rate": 0.00048678567745743905, "loss": 5.1804, "mean_token_accuracy": 0.19192726612091066, "num_tokens": 25703081.0, "step": 13940 }, { "entropy": 5.590435361862182, "epoch": 1.1715605965133375, "grad_norm": 1.015625, "learning_rate": 0.0004867755639576135, "loss": 5.1913, "mean_token_accuracy": 0.18899434804916382, "num_tokens": 25711628.0, "step": 13945 }, { "entropy": 5.595601320266724, "epoch": 1.1719806763285023, "grad_norm": 1.0, "learning_rate": 0.0004867654467062928, "loss": 5.2261, "mean_token_accuracy": 0.1866099566221237, "num_tokens": 25720676.0, "step": 13950 }, { "entropy": 5.612931919097901, "epoch": 1.1724007561436673, "grad_norm": 1.046875, "learning_rate": 0.00048675532570365633, "loss": 5.1816, "mean_token_accuracy": 0.1908836469054222, "num_tokens": 25729920.0, "step": 13955 }, { "entropy": 5.625372123718262, "epoch": 1.1728208359588321, "grad_norm": 1.0234375, "learning_rate": 0.00048674520094988327, "loss": 5.1621, "mean_token_accuracy": 0.18939254730939864, "num_tokens": 25739745.0, "step": 13960 }, { "entropy": 5.6231465339660645, "epoch": 1.1732409157739971, "grad_norm": 1.09375, "learning_rate": 0.00048673507244515303, "loss": 5.1768, "mean_token_accuracy": 0.18547391146421432, "num_tokens": 25748636.0, "step": 13965 }, { "entropy": 5.715896320343018, "epoch": 1.173660995589162, "grad_norm": 1.0, "learning_rate": 0.000486724940189645, "loss": 5.3189, "mean_token_accuracy": 0.1826425760984421, "num_tokens": 25758393.0, "step": 13970 }, { "entropy": 5.689463138580322, "epoch": 1.174081075404327, "grad_norm": 0.953125, "learning_rate": 0.0004867148041835386, "loss": 5.291, "mean_token_accuracy": 0.17766545563936234, "num_tokens": 25768520.0, "step": 13975 }, { "entropy": 5.5998283386230465, "epoch": 1.1745011552194917, "grad_norm": 1.078125, "learning_rate": 0.0004867046644270136, "loss": 5.1121, "mean_token_accuracy": 0.19138467013835908, "num_tokens": 25777168.0, "step": 13980 }, { "entropy": 5.77685580253601, "epoch": 1.1749212350346565, "grad_norm": 0.99609375, "learning_rate": 0.0004866945209202494, "loss": 5.4172, "mean_token_accuracy": 0.16987072974443435, "num_tokens": 25787042.0, "step": 13985 }, { "entropy": 5.710710287094116, "epoch": 1.1753413148498215, "grad_norm": 0.9609375, "learning_rate": 0.0004866843736634258, "loss": 5.2834, "mean_token_accuracy": 0.1831536442041397, "num_tokens": 25796784.0, "step": 13990 }, { "entropy": 5.71886944770813, "epoch": 1.1757613946649863, "grad_norm": 1.0546875, "learning_rate": 0.0004866742226567225, "loss": 5.3215, "mean_token_accuracy": 0.17402513176202775, "num_tokens": 25806285.0, "step": 13995 }, { "entropy": 5.665318536758423, "epoch": 1.1761814744801513, "grad_norm": 1.1015625, "learning_rate": 0.00048666406790031936, "loss": 5.1745, "mean_token_accuracy": 0.18109488785266875, "num_tokens": 25814889.0, "step": 14000 }, { "entropy": 5.639062881469727, "epoch": 1.176601554295316, "grad_norm": 0.9921875, "learning_rate": 0.0004866539093943962, "loss": 5.2069, "mean_token_accuracy": 0.1859121948480606, "num_tokens": 25824551.0, "step": 14005 }, { "entropy": 5.714960432052612, "epoch": 1.1770216341104809, "grad_norm": 0.98046875, "learning_rate": 0.00048664374713913304, "loss": 5.2321, "mean_token_accuracy": 0.18473626375198365, "num_tokens": 25834482.0, "step": 14010 }, { "entropy": 5.712164449691772, "epoch": 1.177441713925646, "grad_norm": 1.15625, "learning_rate": 0.0004866335811347099, "loss": 5.2943, "mean_token_accuracy": 0.18292463719844818, "num_tokens": 25843274.0, "step": 14015 }, { "entropy": 5.729008960723877, "epoch": 1.1778617937408107, "grad_norm": 1.0546875, "learning_rate": 0.00048662341138130683, "loss": 5.2897, "mean_token_accuracy": 0.17183826267719268, "num_tokens": 25852482.0, "step": 14020 }, { "entropy": 5.70672607421875, "epoch": 1.1782818735559757, "grad_norm": 1.03125, "learning_rate": 0.00048661323787910405, "loss": 5.2571, "mean_token_accuracy": 0.18058744668960572, "num_tokens": 25862657.0, "step": 14025 }, { "entropy": 5.633672618865967, "epoch": 1.1787019533711405, "grad_norm": 0.94140625, "learning_rate": 0.0004866030606282817, "loss": 5.21, "mean_token_accuracy": 0.19001703560352326, "num_tokens": 25871492.0, "step": 14030 }, { "entropy": 5.6546038627624515, "epoch": 1.1791220331863055, "grad_norm": 1.1015625, "learning_rate": 0.00048659287962902006, "loss": 5.2149, "mean_token_accuracy": 0.1857159912586212, "num_tokens": 25880979.0, "step": 14035 }, { "entropy": 5.673685026168823, "epoch": 1.1795421130014703, "grad_norm": 0.96875, "learning_rate": 0.00048658269488149945, "loss": 5.198, "mean_token_accuracy": 0.18017589747905732, "num_tokens": 25891060.0, "step": 14040 }, { "entropy": 5.787826442718506, "epoch": 1.1799621928166353, "grad_norm": 1.0390625, "learning_rate": 0.0004865725063859005, "loss": 5.338, "mean_token_accuracy": 0.18180553764104843, "num_tokens": 25900421.0, "step": 14045 }, { "entropy": 5.668501281738282, "epoch": 1.1803822726318, "grad_norm": 0.96484375, "learning_rate": 0.00048656231414240345, "loss": 5.2168, "mean_token_accuracy": 0.18123113214969636, "num_tokens": 25909614.0, "step": 14050 }, { "entropy": 5.591706132888794, "epoch": 1.1808023524469649, "grad_norm": 1.0, "learning_rate": 0.000486552118151189, "loss": 5.2658, "mean_token_accuracy": 0.17698712199926375, "num_tokens": 25919324.0, "step": 14055 }, { "entropy": 5.672393751144409, "epoch": 1.1812224322621299, "grad_norm": 0.96875, "learning_rate": 0.00048654191841243763, "loss": 5.2675, "mean_token_accuracy": 0.1860215961933136, "num_tokens": 25928818.0, "step": 14060 }, { "entropy": 5.7141131401062015, "epoch": 1.1816425120772946, "grad_norm": 0.9921875, "learning_rate": 0.0004865317149263301, "loss": 5.3126, "mean_token_accuracy": 0.1787165328860283, "num_tokens": 25938148.0, "step": 14065 }, { "entropy": 5.6664868831634525, "epoch": 1.1820625918924597, "grad_norm": 1.078125, "learning_rate": 0.0004865215076930473, "loss": 5.2239, "mean_token_accuracy": 0.18717748969793319, "num_tokens": 25947210.0, "step": 14070 }, { "entropy": 5.655594778060913, "epoch": 1.1824826717076244, "grad_norm": 0.96484375, "learning_rate": 0.0004865112967127697, "loss": 5.2061, "mean_token_accuracy": 0.188488145172596, "num_tokens": 25955949.0, "step": 14075 }, { "entropy": 5.579589414596557, "epoch": 1.1829027515227892, "grad_norm": 1.015625, "learning_rate": 0.0004865010819856786, "loss": 5.1472, "mean_token_accuracy": 0.1813472792506218, "num_tokens": 25964193.0, "step": 14080 }, { "entropy": 5.615473890304566, "epoch": 1.1833228313379542, "grad_norm": 0.9609375, "learning_rate": 0.0004864908635119546, "loss": 5.2268, "mean_token_accuracy": 0.18226004391908646, "num_tokens": 25973141.0, "step": 14085 }, { "entropy": 5.691499376296997, "epoch": 1.183742911153119, "grad_norm": 1.109375, "learning_rate": 0.0004864806412917788, "loss": 5.2999, "mean_token_accuracy": 0.1838969483971596, "num_tokens": 25982650.0, "step": 14090 }, { "entropy": 5.734250688552857, "epoch": 1.184162990968284, "grad_norm": 1.1484375, "learning_rate": 0.0004864704153253325, "loss": 5.2945, "mean_token_accuracy": 0.17772632837295532, "num_tokens": 25992096.0, "step": 14095 }, { "entropy": 5.724607753753662, "epoch": 1.1845830707834488, "grad_norm": 1.0703125, "learning_rate": 0.00048646018561279665, "loss": 5.2701, "mean_token_accuracy": 0.18178205341100692, "num_tokens": 26002063.0, "step": 14100 }, { "entropy": 5.558489036560059, "epoch": 1.1850031505986138, "grad_norm": 1.0234375, "learning_rate": 0.00048644995215435245, "loss": 5.133, "mean_token_accuracy": 0.19223445802927017, "num_tokens": 26010716.0, "step": 14105 }, { "entropy": 5.6479597091674805, "epoch": 1.1854232304137786, "grad_norm": 1.125, "learning_rate": 0.0004864397149501812, "loss": 5.1949, "mean_token_accuracy": 0.18594390600919725, "num_tokens": 26019136.0, "step": 14110 }, { "entropy": 5.6586041927337645, "epoch": 1.1858433102289434, "grad_norm": 1.015625, "learning_rate": 0.00048642947400046434, "loss": 5.224, "mean_token_accuracy": 0.19260908663272858, "num_tokens": 26028029.0, "step": 14115 }, { "entropy": 5.75810055732727, "epoch": 1.1862633900441084, "grad_norm": 1.015625, "learning_rate": 0.00048641922930538325, "loss": 5.3572, "mean_token_accuracy": 0.17536600083112716, "num_tokens": 26038025.0, "step": 14120 }, { "entropy": 5.69813494682312, "epoch": 1.1866834698592732, "grad_norm": 1.0, "learning_rate": 0.0004864089808651193, "loss": 5.344, "mean_token_accuracy": 0.1677533730864525, "num_tokens": 26048427.0, "step": 14125 }, { "entropy": 5.65303406715393, "epoch": 1.1871035496744382, "grad_norm": 1.015625, "learning_rate": 0.0004863987286798541, "loss": 5.143, "mean_token_accuracy": 0.18616390079259873, "num_tokens": 26057682.0, "step": 14130 }, { "entropy": 5.622858953475952, "epoch": 1.187523629489603, "grad_norm": 1.09375, "learning_rate": 0.0004863884727497693, "loss": 5.2231, "mean_token_accuracy": 0.18667658269405366, "num_tokens": 26066562.0, "step": 14135 }, { "entropy": 5.592396926879883, "epoch": 1.187943709304768, "grad_norm": 1.0859375, "learning_rate": 0.0004863782130750466, "loss": 5.1511, "mean_token_accuracy": 0.18460778892040253, "num_tokens": 26075633.0, "step": 14140 }, { "entropy": 5.6819360733032225, "epoch": 1.1883637891199328, "grad_norm": 1.03125, "learning_rate": 0.00048636794965586764, "loss": 5.2997, "mean_token_accuracy": 0.1789734736084938, "num_tokens": 26085160.0, "step": 14145 }, { "entropy": 5.66538519859314, "epoch": 1.1887838689350976, "grad_norm": 1.03125, "learning_rate": 0.00048635768249241434, "loss": 5.1771, "mean_token_accuracy": 0.18609512001276016, "num_tokens": 26094157.0, "step": 14150 }, { "entropy": 5.744089412689209, "epoch": 1.1892039487502626, "grad_norm": 1.0390625, "learning_rate": 0.0004863474115848685, "loss": 5.3159, "mean_token_accuracy": 0.1863292083144188, "num_tokens": 26104459.0, "step": 14155 }, { "entropy": 5.610680103302002, "epoch": 1.1896240285654274, "grad_norm": 1.015625, "learning_rate": 0.00048633713693341214, "loss": 5.2293, "mean_token_accuracy": 0.18353819698095322, "num_tokens": 26114468.0, "step": 14160 }, { "entropy": 5.6227161407470705, "epoch": 1.1900441083805924, "grad_norm": 1.09375, "learning_rate": 0.00048632685853822714, "loss": 5.2178, "mean_token_accuracy": 0.17873319685459138, "num_tokens": 26123408.0, "step": 14165 }, { "entropy": 5.667204809188843, "epoch": 1.1904641881957572, "grad_norm": 1.0546875, "learning_rate": 0.0004863165763994957, "loss": 5.2402, "mean_token_accuracy": 0.17876092046499253, "num_tokens": 26132692.0, "step": 14170 }, { "entropy": 5.758384275436401, "epoch": 1.190884268010922, "grad_norm": 1.125, "learning_rate": 0.0004863062905173999, "loss": 5.3813, "mean_token_accuracy": 0.17815937995910644, "num_tokens": 26142259.0, "step": 14175 }, { "entropy": 5.681025505065918, "epoch": 1.191304347826087, "grad_norm": 0.97265625, "learning_rate": 0.000486296000892122, "loss": 5.228, "mean_token_accuracy": 0.1825854390859604, "num_tokens": 26151782.0, "step": 14180 }, { "entropy": 5.604825973510742, "epoch": 1.1917244276412517, "grad_norm": 0.9921875, "learning_rate": 0.00048628570752384424, "loss": 5.0831, "mean_token_accuracy": 0.1915929928421974, "num_tokens": 26160449.0, "step": 14185 }, { "entropy": 5.777124977111816, "epoch": 1.1921445074564168, "grad_norm": 1.046875, "learning_rate": 0.00048627541041274897, "loss": 5.3375, "mean_token_accuracy": 0.1733965367078781, "num_tokens": 26169764.0, "step": 14190 }, { "entropy": 5.676273393630981, "epoch": 1.1925645872715815, "grad_norm": 1.0078125, "learning_rate": 0.00048626510955901854, "loss": 5.1832, "mean_token_accuracy": 0.1855803370475769, "num_tokens": 26178759.0, "step": 14195 }, { "entropy": 5.707294940948486, "epoch": 1.1929846670867466, "grad_norm": 1.078125, "learning_rate": 0.0004862548049628356, "loss": 5.3071, "mean_token_accuracy": 0.18505449295043946, "num_tokens": 26187904.0, "step": 14200 }, { "entropy": 5.716597843170166, "epoch": 1.1934047469019113, "grad_norm": 0.99609375, "learning_rate": 0.0004862444966243824, "loss": 5.2306, "mean_token_accuracy": 0.18754901885986328, "num_tokens": 26196563.0, "step": 14205 }, { "entropy": 5.783038663864136, "epoch": 1.1938248267170763, "grad_norm": 0.94921875, "learning_rate": 0.0004862341845438419, "loss": 5.257, "mean_token_accuracy": 0.18142412453889847, "num_tokens": 26206573.0, "step": 14210 }, { "entropy": 5.635410594940185, "epoch": 1.1942449065322411, "grad_norm": 1.0859375, "learning_rate": 0.00048622386872139645, "loss": 5.1633, "mean_token_accuracy": 0.1847258046269417, "num_tokens": 26215308.0, "step": 14215 }, { "entropy": 5.570873022079468, "epoch": 1.194664986347406, "grad_norm": 1.0078125, "learning_rate": 0.000486213549157229, "loss": 5.2275, "mean_token_accuracy": 0.18310608714818954, "num_tokens": 26224379.0, "step": 14220 }, { "entropy": 5.640047311782837, "epoch": 1.195085066162571, "grad_norm": 1.0390625, "learning_rate": 0.0004862032258515222, "loss": 5.2111, "mean_token_accuracy": 0.1891263708472252, "num_tokens": 26233620.0, "step": 14225 }, { "entropy": 5.717768812179566, "epoch": 1.1955051459777357, "grad_norm": 1.046875, "learning_rate": 0.0004861928988044592, "loss": 5.3, "mean_token_accuracy": 0.17701064348220824, "num_tokens": 26242556.0, "step": 14230 }, { "entropy": 5.708515977859497, "epoch": 1.1959252257929007, "grad_norm": 0.98828125, "learning_rate": 0.0004861825680162226, "loss": 5.2665, "mean_token_accuracy": 0.1848612532019615, "num_tokens": 26251561.0, "step": 14235 }, { "entropy": 5.640319061279297, "epoch": 1.1963453056080655, "grad_norm": 1.03125, "learning_rate": 0.00048617223348699546, "loss": 5.1874, "mean_token_accuracy": 0.18436055332422258, "num_tokens": 26261115.0, "step": 14240 }, { "entropy": 5.713002729415893, "epoch": 1.1967653854232303, "grad_norm": 0.98046875, "learning_rate": 0.0004861618952169611, "loss": 5.3288, "mean_token_accuracy": 0.18686943575739862, "num_tokens": 26271165.0, "step": 14245 }, { "entropy": 5.650120067596435, "epoch": 1.1971854652383953, "grad_norm": 0.984375, "learning_rate": 0.0004861515532063025, "loss": 5.2939, "mean_token_accuracy": 0.17982505708932878, "num_tokens": 26280822.0, "step": 14250 }, { "entropy": 5.662411069869995, "epoch": 1.19760554505356, "grad_norm": 1.078125, "learning_rate": 0.00048614120745520275, "loss": 5.2109, "mean_token_accuracy": 0.18372425585985183, "num_tokens": 26288747.0, "step": 14255 }, { "entropy": 5.698615026473999, "epoch": 1.198025624868725, "grad_norm": 1.0078125, "learning_rate": 0.00048613085796384524, "loss": 5.2517, "mean_token_accuracy": 0.18029409199953078, "num_tokens": 26298387.0, "step": 14260 }, { "entropy": 5.646587991714478, "epoch": 1.19844570468389, "grad_norm": 1.0703125, "learning_rate": 0.00048612050473241335, "loss": 5.1474, "mean_token_accuracy": 0.18792334496974944, "num_tokens": 26307016.0, "step": 14265 }, { "entropy": 5.713901662826538, "epoch": 1.198865784499055, "grad_norm": 1.0234375, "learning_rate": 0.0004861101477610905, "loss": 5.2702, "mean_token_accuracy": 0.17869615852832793, "num_tokens": 26316296.0, "step": 14270 }, { "entropy": 5.665866470336914, "epoch": 1.1992858643142197, "grad_norm": 0.98046875, "learning_rate": 0.00048609978705006, "loss": 5.2605, "mean_token_accuracy": 0.18256956934928895, "num_tokens": 26325525.0, "step": 14275 }, { "entropy": 5.629416847229004, "epoch": 1.1997059441293847, "grad_norm": 1.0859375, "learning_rate": 0.0004860894225995055, "loss": 5.1322, "mean_token_accuracy": 0.18909634202718734, "num_tokens": 26334195.0, "step": 14280 }, { "entropy": 5.655806255340576, "epoch": 1.2001260239445495, "grad_norm": 0.9921875, "learning_rate": 0.00048607905440961054, "loss": 5.2606, "mean_token_accuracy": 0.18612580001354218, "num_tokens": 26343933.0, "step": 14285 }, { "entropy": 5.657006645202637, "epoch": 1.2005461037597143, "grad_norm": 1.1796875, "learning_rate": 0.00048606868248055887, "loss": 5.2199, "mean_token_accuracy": 0.18329950124025346, "num_tokens": 26353455.0, "step": 14290 }, { "entropy": 5.7338886737823485, "epoch": 1.2009661835748793, "grad_norm": 1.03125, "learning_rate": 0.0004860583068125341, "loss": 5.2306, "mean_token_accuracy": 0.1863962933421135, "num_tokens": 26362662.0, "step": 14295 }, { "entropy": 5.682442951202392, "epoch": 1.201386263390044, "grad_norm": 1.0703125, "learning_rate": 0.0004860479274057202, "loss": 5.227, "mean_token_accuracy": 0.1782439038157463, "num_tokens": 26371536.0, "step": 14300 }, { "entropy": 5.699567461013794, "epoch": 1.201806343205209, "grad_norm": 1.0234375, "learning_rate": 0.00048603754426030087, "loss": 5.3084, "mean_token_accuracy": 0.18461299538612366, "num_tokens": 26381925.0, "step": 14305 }, { "entropy": 5.659120988845825, "epoch": 1.2022264230203739, "grad_norm": 1.0703125, "learning_rate": 0.00048602715737646016, "loss": 5.1737, "mean_token_accuracy": 0.18752854615449904, "num_tokens": 26391111.0, "step": 14310 }, { "entropy": 5.761658239364624, "epoch": 1.2026465028355386, "grad_norm": 0.953125, "learning_rate": 0.00048601676675438197, "loss": 5.3326, "mean_token_accuracy": 0.1687532737851143, "num_tokens": 26401667.0, "step": 14315 }, { "entropy": 5.6375326156616214, "epoch": 1.2030665826507037, "grad_norm": 1.015625, "learning_rate": 0.00048600637239425045, "loss": 5.1604, "mean_token_accuracy": 0.19208646863698958, "num_tokens": 26411261.0, "step": 14320 }, { "entropy": 5.652923250198365, "epoch": 1.2034866624658684, "grad_norm": 1.078125, "learning_rate": 0.00048599597429624966, "loss": 5.3011, "mean_token_accuracy": 0.17783199548721312, "num_tokens": 26419808.0, "step": 14325 }, { "entropy": 5.668402910232544, "epoch": 1.2039067422810334, "grad_norm": 1.1015625, "learning_rate": 0.00048598557246056385, "loss": 5.2187, "mean_token_accuracy": 0.1881200224161148, "num_tokens": 26429160.0, "step": 14330 }, { "entropy": 5.671262502670288, "epoch": 1.2043268220961982, "grad_norm": 1.0234375, "learning_rate": 0.00048597516688737727, "loss": 5.1742, "mean_token_accuracy": 0.1853098079562187, "num_tokens": 26437675.0, "step": 14335 }, { "entropy": 5.697401618957519, "epoch": 1.2047469019113632, "grad_norm": 1.0703125, "learning_rate": 0.00048596475757687425, "loss": 5.2198, "mean_token_accuracy": 0.18341862857341767, "num_tokens": 26446317.0, "step": 14340 }, { "entropy": 5.69052677154541, "epoch": 1.205166981726528, "grad_norm": 0.98828125, "learning_rate": 0.00048595434452923915, "loss": 5.2728, "mean_token_accuracy": 0.1880559504032135, "num_tokens": 26456183.0, "step": 14345 }, { "entropy": 5.677026081085205, "epoch": 1.205587061541693, "grad_norm": 1.0390625, "learning_rate": 0.00048594392774465656, "loss": 5.2374, "mean_token_accuracy": 0.17911425828933716, "num_tokens": 26466324.0, "step": 14350 }, { "entropy": 5.654966402053833, "epoch": 1.2060071413568578, "grad_norm": 1.0546875, "learning_rate": 0.00048593350722331074, "loss": 5.2211, "mean_token_accuracy": 0.18575226366519929, "num_tokens": 26475560.0, "step": 14355 }, { "entropy": 5.634671831130982, "epoch": 1.2064272211720226, "grad_norm": 1.1328125, "learning_rate": 0.00048592308296538654, "loss": 5.2188, "mean_token_accuracy": 0.18697748184204102, "num_tokens": 26484955.0, "step": 14360 }, { "entropy": 5.647588777542114, "epoch": 1.2068473009871876, "grad_norm": 1.0234375, "learning_rate": 0.0004859126549710686, "loss": 5.1878, "mean_token_accuracy": 0.19262006878852844, "num_tokens": 26494306.0, "step": 14365 }, { "entropy": 5.624613475799561, "epoch": 1.2072673808023524, "grad_norm": 1.0859375, "learning_rate": 0.00048590222324054153, "loss": 5.1703, "mean_token_accuracy": 0.18993528336286544, "num_tokens": 26503871.0, "step": 14370 }, { "entropy": 5.727626705169678, "epoch": 1.2076874606175174, "grad_norm": 1.0546875, "learning_rate": 0.0004858917877739901, "loss": 5.2977, "mean_token_accuracy": 0.1798119768500328, "num_tokens": 26511929.0, "step": 14375 }, { "entropy": 5.702956581115723, "epoch": 1.2081075404326822, "grad_norm": 1.203125, "learning_rate": 0.0004858813485715994, "loss": 5.2672, "mean_token_accuracy": 0.17639320343732834, "num_tokens": 26520469.0, "step": 14380 }, { "entropy": 5.6741899967193605, "epoch": 1.208527620247847, "grad_norm": 1.03125, "learning_rate": 0.0004858709056335541, "loss": 5.2393, "mean_token_accuracy": 0.183968648314476, "num_tokens": 26530102.0, "step": 14385 }, { "entropy": 5.662025117874146, "epoch": 1.208947700063012, "grad_norm": 0.99609375, "learning_rate": 0.00048586045896003926, "loss": 5.2386, "mean_token_accuracy": 0.18364233523607254, "num_tokens": 26538705.0, "step": 14390 }, { "entropy": 5.686083030700684, "epoch": 1.2093677798781768, "grad_norm": 1.0703125, "learning_rate": 0.0004858500085512401, "loss": 5.3288, "mean_token_accuracy": 0.17981955111026765, "num_tokens": 26548315.0, "step": 14395 }, { "entropy": 5.684038305282593, "epoch": 1.2097878596933418, "grad_norm": 1.078125, "learning_rate": 0.00048583955440734144, "loss": 5.1718, "mean_token_accuracy": 0.18542635291814805, "num_tokens": 26556412.0, "step": 14400 }, { "entropy": 5.721052074432373, "epoch": 1.2102079395085066, "grad_norm": 0.9453125, "learning_rate": 0.00048582909652852873, "loss": 5.3325, "mean_token_accuracy": 0.17493930757045745, "num_tokens": 26566146.0, "step": 14405 }, { "entropy": 5.652407789230347, "epoch": 1.2106280193236716, "grad_norm": 1.0625, "learning_rate": 0.0004858186349149871, "loss": 5.2219, "mean_token_accuracy": 0.1897057741880417, "num_tokens": 26576019.0, "step": 14410 }, { "entropy": 5.57247142791748, "epoch": 1.2110480991388364, "grad_norm": 1.0390625, "learning_rate": 0.000485808169566902, "loss": 5.0956, "mean_token_accuracy": 0.18700592070817948, "num_tokens": 26585461.0, "step": 14415 }, { "entropy": 5.582096529006958, "epoch": 1.2114681789540014, "grad_norm": 1.0625, "learning_rate": 0.00048579770048445863, "loss": 5.1317, "mean_token_accuracy": 0.20506960898637772, "num_tokens": 26594021.0, "step": 14420 }, { "entropy": 5.722401714324951, "epoch": 1.2118882587691662, "grad_norm": 0.96875, "learning_rate": 0.00048578722766784253, "loss": 5.2715, "mean_token_accuracy": 0.18513525426387786, "num_tokens": 26602712.0, "step": 14425 }, { "entropy": 5.609136343002319, "epoch": 1.212308338584331, "grad_norm": 1.15625, "learning_rate": 0.00048577675111723925, "loss": 5.0014, "mean_token_accuracy": 0.1986709341406822, "num_tokens": 26610970.0, "step": 14430 }, { "entropy": 5.674383687973022, "epoch": 1.212728418399496, "grad_norm": 1.0, "learning_rate": 0.00048576627083283435, "loss": 5.2513, "mean_token_accuracy": 0.1869391143321991, "num_tokens": 26619840.0, "step": 14435 }, { "entropy": 5.638332033157349, "epoch": 1.2131484982146608, "grad_norm": 1.0078125, "learning_rate": 0.0004857557868148136, "loss": 5.1501, "mean_token_accuracy": 0.19292452484369277, "num_tokens": 26629271.0, "step": 14440 }, { "entropy": 5.635064649581909, "epoch": 1.2135685780298258, "grad_norm": 1.203125, "learning_rate": 0.0004857452990633625, "loss": 5.2122, "mean_token_accuracy": 0.18105288296937944, "num_tokens": 26638610.0, "step": 14445 }, { "entropy": 5.760031414031983, "epoch": 1.2139886578449905, "grad_norm": 1.09375, "learning_rate": 0.00048573480757866695, "loss": 5.3284, "mean_token_accuracy": 0.18001771569252015, "num_tokens": 26648504.0, "step": 14450 }, { "entropy": 5.662951946258545, "epoch": 1.2144087376601553, "grad_norm": 1.0390625, "learning_rate": 0.00048572431236091284, "loss": 5.2384, "mean_token_accuracy": 0.18161435425281525, "num_tokens": 26658084.0, "step": 14455 }, { "entropy": 5.648608875274658, "epoch": 1.2148288174753203, "grad_norm": 1.03125, "learning_rate": 0.00048571381341028604, "loss": 5.3195, "mean_token_accuracy": 0.1801130771636963, "num_tokens": 26666933.0, "step": 14460 }, { "entropy": 5.678757095336914, "epoch": 1.2152488972904851, "grad_norm": 1.171875, "learning_rate": 0.0004857033107269725, "loss": 5.201, "mean_token_accuracy": 0.18683169335126876, "num_tokens": 26675049.0, "step": 14465 }, { "entropy": 5.604978752136231, "epoch": 1.2156689771056501, "grad_norm": 1.1171875, "learning_rate": 0.00048569280431115823, "loss": 5.251, "mean_token_accuracy": 0.18504755795001984, "num_tokens": 26684223.0, "step": 14470 }, { "entropy": 5.688118124008179, "epoch": 1.216089056920815, "grad_norm": 1.078125, "learning_rate": 0.0004856822941630296, "loss": 5.2148, "mean_token_accuracy": 0.18037988245487213, "num_tokens": 26693605.0, "step": 14475 }, { "entropy": 5.7239954471588135, "epoch": 1.2165091367359797, "grad_norm": 1.03125, "learning_rate": 0.00048567178028277255, "loss": 5.2769, "mean_token_accuracy": 0.18577805310487747, "num_tokens": 26702829.0, "step": 14480 }, { "entropy": 5.7303508758544925, "epoch": 1.2169292165511447, "grad_norm": 0.9921875, "learning_rate": 0.0004856612626705733, "loss": 5.3101, "mean_token_accuracy": 0.17937374264001846, "num_tokens": 26712466.0, "step": 14485 }, { "entropy": 5.662670469284057, "epoch": 1.2173492963663095, "grad_norm": 1.0390625, "learning_rate": 0.0004856507413266183, "loss": 5.1774, "mean_token_accuracy": 0.19302290230989455, "num_tokens": 26721730.0, "step": 14490 }, { "entropy": 5.649547147750854, "epoch": 1.2177693761814745, "grad_norm": 1.09375, "learning_rate": 0.000485640216251094, "loss": 5.2812, "mean_token_accuracy": 0.18129193633794785, "num_tokens": 26731017.0, "step": 14495 }, { "entropy": 5.6918542861938475, "epoch": 1.2181894559966393, "grad_norm": 1.0859375, "learning_rate": 0.00048562968744418665, "loss": 5.2567, "mean_token_accuracy": 0.1789872959256172, "num_tokens": 26739588.0, "step": 14500 }, { "entropy": 5.757008409500122, "epoch": 1.2186095358118043, "grad_norm": 1.0546875, "learning_rate": 0.0004856191549060828, "loss": 5.3325, "mean_token_accuracy": 0.17744818031787873, "num_tokens": 26748889.0, "step": 14505 }, { "entropy": 5.744517374038696, "epoch": 1.219029615626969, "grad_norm": 1.0390625, "learning_rate": 0.00048560861863696913, "loss": 5.2841, "mean_token_accuracy": 0.1818558990955353, "num_tokens": 26757979.0, "step": 14510 }, { "entropy": 5.672915410995484, "epoch": 1.219449695442134, "grad_norm": 0.98828125, "learning_rate": 0.0004855980786370322, "loss": 5.217, "mean_token_accuracy": 0.18610319942235948, "num_tokens": 26767225.0, "step": 14515 }, { "entropy": 5.644172096252442, "epoch": 1.219869775257299, "grad_norm": 1.0234375, "learning_rate": 0.0004855875349064588, "loss": 5.1646, "mean_token_accuracy": 0.18708633184432982, "num_tokens": 26776289.0, "step": 14520 }, { "entropy": 5.755469179153442, "epoch": 1.2202898550724637, "grad_norm": 1.125, "learning_rate": 0.0004855769874454356, "loss": 5.2716, "mean_token_accuracy": 0.18372897058725357, "num_tokens": 26785631.0, "step": 14525 }, { "entropy": 5.697255420684814, "epoch": 1.2207099348876287, "grad_norm": 1.09375, "learning_rate": 0.0004855664362541495, "loss": 5.2556, "mean_token_accuracy": 0.17897605001926423, "num_tokens": 26795285.0, "step": 14530 }, { "entropy": 5.616041994094848, "epoch": 1.2211300147027935, "grad_norm": 1.078125, "learning_rate": 0.00048555588133278744, "loss": 5.2046, "mean_token_accuracy": 0.17997522354125978, "num_tokens": 26804584.0, "step": 14535 }, { "entropy": 5.569559955596924, "epoch": 1.2215500945179585, "grad_norm": 1.1171875, "learning_rate": 0.0004855453226815363, "loss": 5.1106, "mean_token_accuracy": 0.1836683601140976, "num_tokens": 26814354.0, "step": 14540 }, { "entropy": 5.587725067138672, "epoch": 1.2219701743331233, "grad_norm": 0.9609375, "learning_rate": 0.00048553476030058326, "loss": 5.1222, "mean_token_accuracy": 0.19063053727149964, "num_tokens": 26824274.0, "step": 14545 }, { "entropy": 5.619691467285156, "epoch": 1.222390254148288, "grad_norm": 1.0546875, "learning_rate": 0.00048552419419011536, "loss": 5.2478, "mean_token_accuracy": 0.18361906111240386, "num_tokens": 26833155.0, "step": 14550 }, { "entropy": 5.665100240707398, "epoch": 1.222810333963453, "grad_norm": 1.0390625, "learning_rate": 0.0004855136243503196, "loss": 5.2006, "mean_token_accuracy": 0.1821645513176918, "num_tokens": 26842545.0, "step": 14555 }, { "entropy": 5.705448484420776, "epoch": 1.2232304137786179, "grad_norm": 0.9296875, "learning_rate": 0.00048550305078138363, "loss": 5.2634, "mean_token_accuracy": 0.17856968939304352, "num_tokens": 26851772.0, "step": 14560 }, { "entropy": 5.629469919204712, "epoch": 1.2236504935937829, "grad_norm": 1.0234375, "learning_rate": 0.00048549247348349435, "loss": 5.1494, "mean_token_accuracy": 0.184813891351223, "num_tokens": 26860884.0, "step": 14565 }, { "entropy": 5.653168058395385, "epoch": 1.2240705734089476, "grad_norm": 1.0546875, "learning_rate": 0.00048548189245683934, "loss": 5.2655, "mean_token_accuracy": 0.1878492698073387, "num_tokens": 26869435.0, "step": 14570 }, { "entropy": 5.624099826812744, "epoch": 1.2244906532241127, "grad_norm": 1.0390625, "learning_rate": 0.00048547130770160596, "loss": 5.1737, "mean_token_accuracy": 0.183485546708107, "num_tokens": 26878852.0, "step": 14575 }, { "entropy": 5.664286470413208, "epoch": 1.2249107330392774, "grad_norm": 1.0625, "learning_rate": 0.0004854607192179817, "loss": 5.1533, "mean_token_accuracy": 0.18998856693506241, "num_tokens": 26887532.0, "step": 14580 }, { "entropy": 5.8016410827636715, "epoch": 1.2253308128544425, "grad_norm": 1.0390625, "learning_rate": 0.0004854501270061543, "loss": 5.3683, "mean_token_accuracy": 0.18002112656831742, "num_tokens": 26897459.0, "step": 14585 }, { "entropy": 5.640872526168823, "epoch": 1.2257508926696072, "grad_norm": 1.1171875, "learning_rate": 0.00048543953106631115, "loss": 5.1407, "mean_token_accuracy": 0.1936166688799858, "num_tokens": 26907156.0, "step": 14590 }, { "entropy": 5.713595676422119, "epoch": 1.226170972484772, "grad_norm": 1.0546875, "learning_rate": 0.0004854289313986401, "loss": 5.2366, "mean_token_accuracy": 0.18461982607841493, "num_tokens": 26915764.0, "step": 14595 }, { "entropy": 5.578314256668091, "epoch": 1.226591052299937, "grad_norm": 1.015625, "learning_rate": 0.0004854183280033289, "loss": 5.1182, "mean_token_accuracy": 0.18726677149534227, "num_tokens": 26924166.0, "step": 14600 }, { "entropy": 5.67516016960144, "epoch": 1.2270111321151018, "grad_norm": 1.0859375, "learning_rate": 0.0004854077208805654, "loss": 5.3371, "mean_token_accuracy": 0.17056439965963363, "num_tokens": 26933546.0, "step": 14605 }, { "entropy": 5.6956147193908695, "epoch": 1.2274312119302668, "grad_norm": 0.94140625, "learning_rate": 0.0004853971100305374, "loss": 5.2654, "mean_token_accuracy": 0.18477360159158707, "num_tokens": 26943213.0, "step": 14610 }, { "entropy": 5.697147417068481, "epoch": 1.2278512917454316, "grad_norm": 0.94140625, "learning_rate": 0.000485386495453433, "loss": 5.227, "mean_token_accuracy": 0.18982611894607543, "num_tokens": 26952968.0, "step": 14615 }, { "entropy": 5.650878953933716, "epoch": 1.2282713715605964, "grad_norm": 1.015625, "learning_rate": 0.00048537587714944007, "loss": 5.1941, "mean_token_accuracy": 0.1892140194773674, "num_tokens": 26962230.0, "step": 14620 }, { "entropy": 5.635179615020752, "epoch": 1.2286914513757614, "grad_norm": 1.1171875, "learning_rate": 0.0004853652551187469, "loss": 5.2783, "mean_token_accuracy": 0.1888777494430542, "num_tokens": 26970985.0, "step": 14625 }, { "entropy": 5.711835193634033, "epoch": 1.2291115311909262, "grad_norm": 0.95703125, "learning_rate": 0.00048535462936154147, "loss": 5.2866, "mean_token_accuracy": 0.18420315831899642, "num_tokens": 26981138.0, "step": 14630 }, { "entropy": 5.647654056549072, "epoch": 1.2295316110060912, "grad_norm": 1.0, "learning_rate": 0.0004853439998780122, "loss": 5.1539, "mean_token_accuracy": 0.18696757405996323, "num_tokens": 26990158.0, "step": 14635 }, { "entropy": 5.666933250427246, "epoch": 1.229951690821256, "grad_norm": 0.9921875, "learning_rate": 0.0004853333666683472, "loss": 5.304, "mean_token_accuracy": 0.17528288215398788, "num_tokens": 26998889.0, "step": 14640 }, { "entropy": 5.653009271621704, "epoch": 1.230371770636421, "grad_norm": 1.0078125, "learning_rate": 0.00048532272973273496, "loss": 5.2164, "mean_token_accuracy": 0.18172599524259567, "num_tokens": 27008912.0, "step": 14645 }, { "entropy": 5.66656265258789, "epoch": 1.2307918504515858, "grad_norm": 1.0546875, "learning_rate": 0.00048531208907136384, "loss": 5.1307, "mean_token_accuracy": 0.20003609210252762, "num_tokens": 27017573.0, "step": 14650 }, { "entropy": 5.637627172470093, "epoch": 1.2312119302667508, "grad_norm": 1.0078125, "learning_rate": 0.00048530144468442236, "loss": 5.2062, "mean_token_accuracy": 0.18131556063890458, "num_tokens": 27027205.0, "step": 14655 }, { "entropy": 5.644493913650512, "epoch": 1.2316320100819156, "grad_norm": 1.1015625, "learning_rate": 0.00048529079657209906, "loss": 5.1473, "mean_token_accuracy": 0.19096727818250656, "num_tokens": 27035882.0, "step": 14660 }, { "entropy": 5.619836807250977, "epoch": 1.2320520898970804, "grad_norm": 1.0234375, "learning_rate": 0.0004852801447345826, "loss": 5.2366, "mean_token_accuracy": 0.1910835549235344, "num_tokens": 27044761.0, "step": 14665 }, { "entropy": 5.682107639312744, "epoch": 1.2324721697122454, "grad_norm": 1.015625, "learning_rate": 0.0004852694891720617, "loss": 5.2518, "mean_token_accuracy": 0.18330902755260467, "num_tokens": 27054149.0, "step": 14670 }, { "entropy": 5.696134471893311, "epoch": 1.2328922495274102, "grad_norm": 0.98046875, "learning_rate": 0.000485258829884725, "loss": 5.28, "mean_token_accuracy": 0.18425629884004593, "num_tokens": 27063145.0, "step": 14675 }, { "entropy": 5.7195472240448, "epoch": 1.2333123293425752, "grad_norm": 1.109375, "learning_rate": 0.0004852481668727614, "loss": 5.2502, "mean_token_accuracy": 0.1807716965675354, "num_tokens": 27072378.0, "step": 14680 }, { "entropy": 5.599211978912353, "epoch": 1.23373240915774, "grad_norm": 1.0625, "learning_rate": 0.00048523750013635986, "loss": 5.1355, "mean_token_accuracy": 0.18781913220882415, "num_tokens": 27082241.0, "step": 14685 }, { "entropy": 5.610976982116699, "epoch": 1.2341524889729047, "grad_norm": 0.9765625, "learning_rate": 0.0004852268296757092, "loss": 5.1431, "mean_token_accuracy": 0.18622777462005616, "num_tokens": 27091488.0, "step": 14690 }, { "entropy": 5.708965158462524, "epoch": 1.2345725687880698, "grad_norm": 1.09375, "learning_rate": 0.0004852161554909985, "loss": 5.1976, "mean_token_accuracy": 0.18661137521266938, "num_tokens": 27100378.0, "step": 14695 }, { "entropy": 5.666726589202881, "epoch": 1.2349926486032345, "grad_norm": 1.0, "learning_rate": 0.00048520547758241686, "loss": 5.2113, "mean_token_accuracy": 0.18408560007810593, "num_tokens": 27110341.0, "step": 14700 }, { "entropy": 5.671677970886231, "epoch": 1.2354127284183996, "grad_norm": 1.03125, "learning_rate": 0.00048519479595015343, "loss": 5.1772, "mean_token_accuracy": 0.18427809625864028, "num_tokens": 27119381.0, "step": 14705 }, { "entropy": 5.6315773010253904, "epoch": 1.2358328082335643, "grad_norm": 0.96484375, "learning_rate": 0.00048518411059439746, "loss": 5.2598, "mean_token_accuracy": 0.17573754340410233, "num_tokens": 27129167.0, "step": 14710 }, { "entropy": 5.678117513656616, "epoch": 1.2362528880487293, "grad_norm": 1.09375, "learning_rate": 0.00048517342151533813, "loss": 5.2651, "mean_token_accuracy": 0.1768067240715027, "num_tokens": 27138479.0, "step": 14715 }, { "entropy": 5.691605710983277, "epoch": 1.2366729678638941, "grad_norm": 1.0546875, "learning_rate": 0.0004851627287131649, "loss": 5.1353, "mean_token_accuracy": 0.18740508258342742, "num_tokens": 27147197.0, "step": 14720 }, { "entropy": 5.575682544708252, "epoch": 1.2370930476790591, "grad_norm": 1.0546875, "learning_rate": 0.0004851520321880672, "loss": 5.1931, "mean_token_accuracy": 0.1900464877486229, "num_tokens": 27155854.0, "step": 14725 }, { "entropy": 5.639089298248291, "epoch": 1.237513127494224, "grad_norm": 1.046875, "learning_rate": 0.0004851413319402344, "loss": 5.1552, "mean_token_accuracy": 0.18309604823589326, "num_tokens": 27165069.0, "step": 14730 }, { "entropy": 5.707401132583618, "epoch": 1.2379332073093887, "grad_norm": 1.1015625, "learning_rate": 0.0004851306279698561, "loss": 5.191, "mean_token_accuracy": 0.18097084760665894, "num_tokens": 27174070.0, "step": 14735 }, { "entropy": 5.732707118988037, "epoch": 1.2383532871245537, "grad_norm": 1.0546875, "learning_rate": 0.0004851199202771219, "loss": 5.2636, "mean_token_accuracy": 0.18812942504882812, "num_tokens": 27182903.0, "step": 14740 }, { "entropy": 5.657004070281983, "epoch": 1.2387733669397185, "grad_norm": 1.09375, "learning_rate": 0.0004851092088622216, "loss": 5.1592, "mean_token_accuracy": 0.1928549587726593, "num_tokens": 27192747.0, "step": 14745 }, { "entropy": 5.703258228302002, "epoch": 1.2391934467548835, "grad_norm": 1.09375, "learning_rate": 0.0004850984937253448, "loss": 5.209, "mean_token_accuracy": 0.1915891721844673, "num_tokens": 27201657.0, "step": 14750 }, { "entropy": 5.680203771591186, "epoch": 1.2396135265700483, "grad_norm": 0.9921875, "learning_rate": 0.0004850877748666814, "loss": 5.2253, "mean_token_accuracy": 0.18461125642061232, "num_tokens": 27211794.0, "step": 14755 }, { "entropy": 5.614065790176392, "epoch": 1.240033606385213, "grad_norm": 1.0, "learning_rate": 0.00048507705228642117, "loss": 5.1927, "mean_token_accuracy": 0.18003716915845872, "num_tokens": 27221852.0, "step": 14760 }, { "entropy": 5.624101209640503, "epoch": 1.240453686200378, "grad_norm": 0.99609375, "learning_rate": 0.0004850663259847542, "loss": 5.2415, "mean_token_accuracy": 0.18276151716709138, "num_tokens": 27231558.0, "step": 14765 }, { "entropy": 5.6573535919189455, "epoch": 1.240873766015543, "grad_norm": 0.9453125, "learning_rate": 0.00048505559596187037, "loss": 5.2166, "mean_token_accuracy": 0.18167485445737838, "num_tokens": 27241053.0, "step": 14770 }, { "entropy": 5.622683143615722, "epoch": 1.241293845830708, "grad_norm": 1.0703125, "learning_rate": 0.0004850448622179599, "loss": 5.104, "mean_token_accuracy": 0.19088426381349563, "num_tokens": 27249770.0, "step": 14775 }, { "entropy": 5.745169830322266, "epoch": 1.2417139256458727, "grad_norm": 1.1015625, "learning_rate": 0.0004850341247532128, "loss": 5.3479, "mean_token_accuracy": 0.17926825284957887, "num_tokens": 27258883.0, "step": 14780 }, { "entropy": 5.714787006378174, "epoch": 1.2421340054610377, "grad_norm": 0.98828125, "learning_rate": 0.0004850233835678194, "loss": 5.2513, "mean_token_accuracy": 0.18446033149957658, "num_tokens": 27268056.0, "step": 14785 }, { "entropy": 5.685917234420776, "epoch": 1.2425540852762025, "grad_norm": 1.0546875, "learning_rate": 0.0004850126386619699, "loss": 5.1289, "mean_token_accuracy": 0.19529231637716293, "num_tokens": 27276965.0, "step": 14790 }, { "entropy": 5.656106281280517, "epoch": 1.2429741650913673, "grad_norm": 0.94921875, "learning_rate": 0.0004850018900358545, "loss": 5.1866, "mean_token_accuracy": 0.1916363701224327, "num_tokens": 27286173.0, "step": 14795 }, { "entropy": 5.6285277843475345, "epoch": 1.2433942449065323, "grad_norm": 1.0390625, "learning_rate": 0.00048499113768966386, "loss": 5.1941, "mean_token_accuracy": 0.1881430432200432, "num_tokens": 27294863.0, "step": 14800 }, { "entropy": 5.675506162643432, "epoch": 1.243814324721697, "grad_norm": 1.0546875, "learning_rate": 0.0004849803816235884, "loss": 5.2397, "mean_token_accuracy": 0.182273006439209, "num_tokens": 27304427.0, "step": 14805 }, { "entropy": 5.7278337478637695, "epoch": 1.244234404536862, "grad_norm": 1.03125, "learning_rate": 0.0004849696218378185, "loss": 5.2738, "mean_token_accuracy": 0.18515864610671998, "num_tokens": 27313716.0, "step": 14810 }, { "entropy": 5.714139461517334, "epoch": 1.2446544843520269, "grad_norm": 1.15625, "learning_rate": 0.0004849588583325449, "loss": 5.1877, "mean_token_accuracy": 0.1935681253671646, "num_tokens": 27322342.0, "step": 14815 }, { "entropy": 5.685880279541015, "epoch": 1.2450745641671919, "grad_norm": 0.98046875, "learning_rate": 0.0004849480911079583, "loss": 5.2699, "mean_token_accuracy": 0.17245246171951295, "num_tokens": 27331892.0, "step": 14820 }, { "entropy": 5.706261062622071, "epoch": 1.2454946439823567, "grad_norm": 1.1328125, "learning_rate": 0.0004849373201642493, "loss": 5.2511, "mean_token_accuracy": 0.17408859729766846, "num_tokens": 27340428.0, "step": 14825 }, { "entropy": 5.674344444274903, "epoch": 1.2459147237975214, "grad_norm": 0.98828125, "learning_rate": 0.0004849265455016088, "loss": 5.24, "mean_token_accuracy": 0.18267205804586412, "num_tokens": 27349224.0, "step": 14830 }, { "entropy": 5.663537263870239, "epoch": 1.2463348036126864, "grad_norm": 1.0546875, "learning_rate": 0.0004849157671202277, "loss": 5.2201, "mean_token_accuracy": 0.19008608758449555, "num_tokens": 27357480.0, "step": 14835 }, { "entropy": 5.626085519790649, "epoch": 1.2467548834278512, "grad_norm": 1.0234375, "learning_rate": 0.0004849049850202968, "loss": 5.1306, "mean_token_accuracy": 0.18790293186903, "num_tokens": 27366732.0, "step": 14840 }, { "entropy": 5.664501762390136, "epoch": 1.2471749632430162, "grad_norm": 1.046875, "learning_rate": 0.0004848941992020072, "loss": 5.2438, "mean_token_accuracy": 0.17996540665626526, "num_tokens": 27375834.0, "step": 14845 }, { "entropy": 5.722063302993774, "epoch": 1.247595043058181, "grad_norm": 1.09375, "learning_rate": 0.0004848834096655499, "loss": 5.2242, "mean_token_accuracy": 0.18336610794067382, "num_tokens": 27385311.0, "step": 14850 }, { "entropy": 5.6893871307373045, "epoch": 1.2480151228733458, "grad_norm": 0.99609375, "learning_rate": 0.00048487261641111607, "loss": 5.2689, "mean_token_accuracy": 0.18398506790399552, "num_tokens": 27394587.0, "step": 14855 }, { "entropy": 5.567498970031738, "epoch": 1.2484352026885108, "grad_norm": 1.03125, "learning_rate": 0.000484861819438897, "loss": 5.1342, "mean_token_accuracy": 0.1845276400446892, "num_tokens": 27403316.0, "step": 14860 }, { "entropy": 5.6082133769989015, "epoch": 1.2488552825036756, "grad_norm": 1.1640625, "learning_rate": 0.0004848510187490838, "loss": 5.2098, "mean_token_accuracy": 0.18957970440387725, "num_tokens": 27412709.0, "step": 14865 }, { "entropy": 5.6727558135986325, "epoch": 1.2492753623188406, "grad_norm": 1.1171875, "learning_rate": 0.0004848402143418679, "loss": 5.2554, "mean_token_accuracy": 0.18320296257734298, "num_tokens": 27422004.0, "step": 14870 }, { "entropy": 5.680542135238648, "epoch": 1.2496954421340054, "grad_norm": 1.046875, "learning_rate": 0.00048482940621744053, "loss": 5.2831, "mean_token_accuracy": 0.1786421850323677, "num_tokens": 27431931.0, "step": 14875 }, { "entropy": 5.618782424926758, "epoch": 1.2501155219491704, "grad_norm": 0.9453125, "learning_rate": 0.0004848185943759934, "loss": 5.1164, "mean_token_accuracy": 0.19321394115686416, "num_tokens": 27441527.0, "step": 14880 }, { "entropy": 5.681323432922364, "epoch": 1.2505356017643352, "grad_norm": 1.109375, "learning_rate": 0.00048480777881771786, "loss": 5.2437, "mean_token_accuracy": 0.18861683458089828, "num_tokens": 27449964.0, "step": 14885 }, { "entropy": 5.663643932342529, "epoch": 1.2509556815795002, "grad_norm": 1.078125, "learning_rate": 0.0004847969595428056, "loss": 5.2665, "mean_token_accuracy": 0.17827805429697036, "num_tokens": 27459044.0, "step": 14890 }, { "entropy": 5.641737079620361, "epoch": 1.251375761394665, "grad_norm": 1.046875, "learning_rate": 0.00048478613655144817, "loss": 5.2431, "mean_token_accuracy": 0.18376678824424744, "num_tokens": 27467644.0, "step": 14895 }, { "entropy": 5.70604305267334, "epoch": 1.2517958412098298, "grad_norm": 1.015625, "learning_rate": 0.0004847753098438374, "loss": 5.2775, "mean_token_accuracy": 0.17858496457338333, "num_tokens": 27476899.0, "step": 14900 }, { "entropy": 5.658094692230224, "epoch": 1.2522159210249948, "grad_norm": 1.046875, "learning_rate": 0.000484764479420165, "loss": 5.1659, "mean_token_accuracy": 0.18931404054164885, "num_tokens": 27485167.0, "step": 14905 }, { "entropy": 5.707379245758057, "epoch": 1.2526360008401596, "grad_norm": 1.109375, "learning_rate": 0.00048475364528062287, "loss": 5.2041, "mean_token_accuracy": 0.1846518412232399, "num_tokens": 27493986.0, "step": 14910 }, { "entropy": 5.7029248714447025, "epoch": 1.2530560806553246, "grad_norm": 1.015625, "learning_rate": 0.0004847428074254029, "loss": 5.2395, "mean_token_accuracy": 0.19285746067762374, "num_tokens": 27503896.0, "step": 14915 }, { "entropy": 5.652810955047608, "epoch": 1.2534761604704894, "grad_norm": 1.046875, "learning_rate": 0.00048473196585469713, "loss": 5.211, "mean_token_accuracy": 0.18419021517038345, "num_tokens": 27513485.0, "step": 14920 }, { "entropy": 5.728656578063965, "epoch": 1.2538962402856542, "grad_norm": 1.140625, "learning_rate": 0.00048472112056869763, "loss": 5.2717, "mean_token_accuracy": 0.18179314583539963, "num_tokens": 27523164.0, "step": 14925 }, { "entropy": 5.740363311767578, "epoch": 1.2543163201008192, "grad_norm": 1.03125, "learning_rate": 0.0004847102715675964, "loss": 5.217, "mean_token_accuracy": 0.1826660603284836, "num_tokens": 27531387.0, "step": 14930 }, { "entropy": 5.620765018463135, "epoch": 1.254736399915984, "grad_norm": 1.03125, "learning_rate": 0.0004846994188515857, "loss": 5.2225, "mean_token_accuracy": 0.1886933147907257, "num_tokens": 27541754.0, "step": 14935 }, { "entropy": 5.718434810638428, "epoch": 1.255156479731149, "grad_norm": 1.0, "learning_rate": 0.0004846885624208578, "loss": 5.281, "mean_token_accuracy": 0.17799268811941146, "num_tokens": 27551458.0, "step": 14940 }, { "entropy": 5.679688501358032, "epoch": 1.2555765595463138, "grad_norm": 1.015625, "learning_rate": 0.000484677702275605, "loss": 5.2015, "mean_token_accuracy": 0.18932418078184127, "num_tokens": 27560797.0, "step": 14945 }, { "entropy": 5.729085302352905, "epoch": 1.2559966393614788, "grad_norm": 1.0390625, "learning_rate": 0.00048466683841601963, "loss": 5.1966, "mean_token_accuracy": 0.1864927589893341, "num_tokens": 27570166.0, "step": 14950 }, { "entropy": 5.597444629669189, "epoch": 1.2564167191766435, "grad_norm": 1.046875, "learning_rate": 0.00048465597084229416, "loss": 5.1137, "mean_token_accuracy": 0.19255895167589188, "num_tokens": 27579411.0, "step": 14955 }, { "entropy": 5.69745626449585, "epoch": 1.2568367989918086, "grad_norm": 1.1015625, "learning_rate": 0.0004846450995546212, "loss": 5.3359, "mean_token_accuracy": 0.1801409661769867, "num_tokens": 27589124.0, "step": 14960 }, { "entropy": 5.713966274261475, "epoch": 1.2572568788069733, "grad_norm": 1.0546875, "learning_rate": 0.0004846342245531932, "loss": 5.3089, "mean_token_accuracy": 0.1796952649950981, "num_tokens": 27598664.0, "step": 14965 }, { "entropy": 5.737641191482544, "epoch": 1.2576769586221381, "grad_norm": 1.0234375, "learning_rate": 0.0004846233458382029, "loss": 5.2237, "mean_token_accuracy": 0.1873997762799263, "num_tokens": 27607189.0, "step": 14970 }, { "entropy": 5.692186403274536, "epoch": 1.2580970384373031, "grad_norm": 1.0234375, "learning_rate": 0.00048461246340984293, "loss": 5.2671, "mean_token_accuracy": 0.18366713374853133, "num_tokens": 27616415.0, "step": 14975 }, { "entropy": 5.677560234069825, "epoch": 1.258517118252468, "grad_norm": 1.0390625, "learning_rate": 0.0004846015772683061, "loss": 5.246, "mean_token_accuracy": 0.18738396018743514, "num_tokens": 27624492.0, "step": 14980 }, { "entropy": 5.595950984954834, "epoch": 1.258937198067633, "grad_norm": 0.97265625, "learning_rate": 0.00048459068741378526, "loss": 5.1543, "mean_token_accuracy": 0.18182894438505173, "num_tokens": 27634243.0, "step": 14985 }, { "entropy": 5.674468851089477, "epoch": 1.2593572778827977, "grad_norm": 0.97265625, "learning_rate": 0.0004845797938464734, "loss": 5.2384, "mean_token_accuracy": 0.18295591771602632, "num_tokens": 27642887.0, "step": 14990 }, { "entropy": 5.718436670303345, "epoch": 1.2597773576979625, "grad_norm": 1.0234375, "learning_rate": 0.0004845688965665633, "loss": 5.2614, "mean_token_accuracy": 0.18309373408555984, "num_tokens": 27652524.0, "step": 14995 }, { "entropy": 5.643162250518799, "epoch": 1.2601974375131275, "grad_norm": 1.125, "learning_rate": 0.00048455799557424814, "loss": 5.1092, "mean_token_accuracy": 0.19538451135158538, "num_tokens": 27661306.0, "step": 15000 }, { "epoch": 1.2601974375131275, "eval_entropy": 5.473019234451689, "eval_loss": 5.278746128082275, "eval_mean_token_accuracy": 0.19182138981585123, "eval_num_tokens": 27661306.0, "eval_runtime": 20.982, "eval_samples_per_second": 1780.864, "eval_steps_per_second": 222.62, "step": 15000 }, { "entropy": 5.6969993114471436, "epoch": 1.2606175173282923, "grad_norm": 0.97265625, "learning_rate": 0.0004845470908697209, "loss": 5.3112, "mean_token_accuracy": 0.18793802857398986, "num_tokens": 27671728.0, "step": 15005 }, { "entropy": 5.603932857513428, "epoch": 1.2610375971434573, "grad_norm": 0.92578125, "learning_rate": 0.000484536182453175, "loss": 5.113, "mean_token_accuracy": 0.18951596468687057, "num_tokens": 27680740.0, "step": 15010 }, { "entropy": 5.630282068252564, "epoch": 1.261457676958622, "grad_norm": 0.95703125, "learning_rate": 0.0004845252703248035, "loss": 5.1937, "mean_token_accuracy": 0.18728238344192505, "num_tokens": 27689865.0, "step": 15015 }, { "entropy": 5.664612102508545, "epoch": 1.2618777567737869, "grad_norm": 1.03125, "learning_rate": 0.0004845143544847997, "loss": 5.2414, "mean_token_accuracy": 0.18425808548927308, "num_tokens": 27700366.0, "step": 15020 }, { "entropy": 5.688190650939942, "epoch": 1.262297836588952, "grad_norm": 1.125, "learning_rate": 0.00048450343493335697, "loss": 5.1151, "mean_token_accuracy": 0.19027119874954224, "num_tokens": 27708893.0, "step": 15025 }, { "entropy": 5.603574228286743, "epoch": 1.262717916404117, "grad_norm": 1.1328125, "learning_rate": 0.0004844925116706688, "loss": 5.1573, "mean_token_accuracy": 0.18225290030241012, "num_tokens": 27717494.0, "step": 15030 }, { "entropy": 5.5542198657989506, "epoch": 1.2631379962192817, "grad_norm": 1.0546875, "learning_rate": 0.00048448158469692866, "loss": 5.068, "mean_token_accuracy": 0.1993619903922081, "num_tokens": 27726487.0, "step": 15035 }, { "entropy": 5.740460538864136, "epoch": 1.2635580760344465, "grad_norm": 1.0625, "learning_rate": 0.0004844706540123301, "loss": 5.3068, "mean_token_accuracy": 0.180335333943367, "num_tokens": 27736602.0, "step": 15040 }, { "entropy": 5.736536836624145, "epoch": 1.2639781558496115, "grad_norm": 1.09375, "learning_rate": 0.00048445971961706675, "loss": 5.3051, "mean_token_accuracy": 0.1801508918404579, "num_tokens": 27746322.0, "step": 15045 }, { "entropy": 5.652089834213257, "epoch": 1.2643982356647763, "grad_norm": 1.125, "learning_rate": 0.0004844487815113323, "loss": 5.1738, "mean_token_accuracy": 0.19577944725751878, "num_tokens": 27754941.0, "step": 15050 }, { "entropy": 5.655590343475342, "epoch": 1.2648183154799413, "grad_norm": 1.1484375, "learning_rate": 0.0004844378396953206, "loss": 5.2223, "mean_token_accuracy": 0.1852713868021965, "num_tokens": 27763941.0, "step": 15055 }, { "entropy": 5.679401063919068, "epoch": 1.265238395295106, "grad_norm": 1.078125, "learning_rate": 0.00048442689416922536, "loss": 5.2575, "mean_token_accuracy": 0.18722603768110274, "num_tokens": 27773087.0, "step": 15060 }, { "entropy": 5.61571364402771, "epoch": 1.2656584751102709, "grad_norm": 1.0546875, "learning_rate": 0.00048441594493324057, "loss": 5.0825, "mean_token_accuracy": 0.19577680379152299, "num_tokens": 27782648.0, "step": 15065 }, { "entropy": 5.6810108661651615, "epoch": 1.2660785549254359, "grad_norm": 1.0390625, "learning_rate": 0.00048440499198756015, "loss": 5.2883, "mean_token_accuracy": 0.18132796585559846, "num_tokens": 27791567.0, "step": 15070 }, { "entropy": 5.640935182571411, "epoch": 1.2664986347406006, "grad_norm": 1.0703125, "learning_rate": 0.00048439403533237816, "loss": 5.2772, "mean_token_accuracy": 0.18109726160764694, "num_tokens": 27801397.0, "step": 15075 }, { "entropy": 5.741448831558228, "epoch": 1.2669187145557657, "grad_norm": 0.96875, "learning_rate": 0.0004843830749678886, "loss": 5.2633, "mean_token_accuracy": 0.18804392367601394, "num_tokens": 27810831.0, "step": 15080 }, { "entropy": 5.623208665847779, "epoch": 1.2673387943709304, "grad_norm": 1.0859375, "learning_rate": 0.0004843721108942856, "loss": 5.1855, "mean_token_accuracy": 0.18844966292381288, "num_tokens": 27819591.0, "step": 15085 }, { "entropy": 5.622630929946899, "epoch": 1.2677588741860952, "grad_norm": 1.015625, "learning_rate": 0.0004843611431117636, "loss": 5.1892, "mean_token_accuracy": 0.19289564788341523, "num_tokens": 27828614.0, "step": 15090 }, { "entropy": 5.674425935745239, "epoch": 1.2681789540012602, "grad_norm": 1.0859375, "learning_rate": 0.0004843501716205167, "loss": 5.2203, "mean_token_accuracy": 0.18590681850910187, "num_tokens": 27837549.0, "step": 15095 }, { "entropy": 5.701784706115722, "epoch": 1.2685990338164252, "grad_norm": 1.0078125, "learning_rate": 0.0004843391964207393, "loss": 5.2276, "mean_token_accuracy": 0.18296049833297728, "num_tokens": 27846678.0, "step": 15100 }, { "entropy": 5.718660879135132, "epoch": 1.26901911363159, "grad_norm": 0.98828125, "learning_rate": 0.0004843282175126258, "loss": 5.2443, "mean_token_accuracy": 0.18651428520679475, "num_tokens": 27855734.0, "step": 15105 }, { "entropy": 5.6848588466644285, "epoch": 1.2694391934467548, "grad_norm": 1.0078125, "learning_rate": 0.00048431723489637086, "loss": 5.1873, "mean_token_accuracy": 0.1850312739610672, "num_tokens": 27865111.0, "step": 15110 }, { "entropy": 5.716093683242798, "epoch": 1.2698592732619198, "grad_norm": 1.0703125, "learning_rate": 0.00048430624857216876, "loss": 5.2037, "mean_token_accuracy": 0.18562792241573334, "num_tokens": 27874495.0, "step": 15115 }, { "entropy": 5.6487141132354735, "epoch": 1.2702793530770846, "grad_norm": 1.046875, "learning_rate": 0.0004842952585402143, "loss": 5.231, "mean_token_accuracy": 0.18664893507957458, "num_tokens": 27884531.0, "step": 15120 }, { "entropy": 5.6162103652954105, "epoch": 1.2706994328922496, "grad_norm": 1.1484375, "learning_rate": 0.000484284264800702, "loss": 5.1196, "mean_token_accuracy": 0.19320788234472275, "num_tokens": 27893463.0, "step": 15125 }, { "entropy": 5.710645771026611, "epoch": 1.2711195127074144, "grad_norm": 1.0078125, "learning_rate": 0.00048427326735382687, "loss": 5.2397, "mean_token_accuracy": 0.1843972235918045, "num_tokens": 27903015.0, "step": 15130 }, { "entropy": 5.686969041824341, "epoch": 1.2715395925225792, "grad_norm": 1.0546875, "learning_rate": 0.0004842622661997834, "loss": 5.2277, "mean_token_accuracy": 0.18325212448835373, "num_tokens": 27912207.0, "step": 15135 }, { "entropy": 5.685573863983154, "epoch": 1.2719596723377442, "grad_norm": 0.98046875, "learning_rate": 0.0004842512613387668, "loss": 5.2287, "mean_token_accuracy": 0.1802036225795746, "num_tokens": 27921566.0, "step": 15140 }, { "entropy": 5.652072381973267, "epoch": 1.272379752152909, "grad_norm": 1.0078125, "learning_rate": 0.0004842402527709718, "loss": 5.174, "mean_token_accuracy": 0.19068079590797424, "num_tokens": 27930633.0, "step": 15145 }, { "entropy": 5.730960178375244, "epoch": 1.272799831968074, "grad_norm": 0.96875, "learning_rate": 0.0004842292404965934, "loss": 5.2799, "mean_token_accuracy": 0.1801493212580681, "num_tokens": 27939887.0, "step": 15150 }, { "entropy": 5.71314811706543, "epoch": 1.2732199117832388, "grad_norm": 1.15625, "learning_rate": 0.0004842182245158268, "loss": 5.288, "mean_token_accuracy": 0.18862022012472152, "num_tokens": 27949090.0, "step": 15155 }, { "entropy": 5.609988880157471, "epoch": 1.2736399915984036, "grad_norm": 0.93359375, "learning_rate": 0.00048420720482886715, "loss": 5.1097, "mean_token_accuracy": 0.19717749804258347, "num_tokens": 27958141.0, "step": 15160 }, { "entropy": 5.598087310791016, "epoch": 1.2740600714135686, "grad_norm": 0.98828125, "learning_rate": 0.0004841961814359095, "loss": 5.165, "mean_token_accuracy": 0.18704772889614105, "num_tokens": 27967780.0, "step": 15165 }, { "entropy": 5.703890037536621, "epoch": 1.2744801512287336, "grad_norm": 1.1640625, "learning_rate": 0.00048418515433714917, "loss": 5.274, "mean_token_accuracy": 0.1796239972114563, "num_tokens": 27976243.0, "step": 15170 }, { "entropy": 5.66547384262085, "epoch": 1.2749002310438984, "grad_norm": 1.0859375, "learning_rate": 0.0004841741235327817, "loss": 5.1376, "mean_token_accuracy": 0.19057952463626862, "num_tokens": 27985874.0, "step": 15175 }, { "entropy": 5.766933870315552, "epoch": 1.2753203108590632, "grad_norm": 1.0390625, "learning_rate": 0.00048416308902300215, "loss": 5.3709, "mean_token_accuracy": 0.1746917337179184, "num_tokens": 27995111.0, "step": 15180 }, { "entropy": 5.635549926757813, "epoch": 1.2757403906742282, "grad_norm": 1.0546875, "learning_rate": 0.0004841520508080063, "loss": 5.1715, "mean_token_accuracy": 0.1852971687912941, "num_tokens": 28003948.0, "step": 15185 }, { "entropy": 5.618925619125366, "epoch": 1.276160470489393, "grad_norm": 1.1484375, "learning_rate": 0.00048414100888798957, "loss": 5.2011, "mean_token_accuracy": 0.18496257215738296, "num_tokens": 28012941.0, "step": 15190 }, { "entropy": 5.594309568405151, "epoch": 1.276580550304558, "grad_norm": 1.0234375, "learning_rate": 0.0004841299632631475, "loss": 5.1803, "mean_token_accuracy": 0.18394773155450822, "num_tokens": 28022195.0, "step": 15195 }, { "entropy": 5.661182498931884, "epoch": 1.2770006301197228, "grad_norm": 1.0703125, "learning_rate": 0.0004841189139336759, "loss": 5.1274, "mean_token_accuracy": 0.18883874267339706, "num_tokens": 28031446.0, "step": 15200 }, { "entropy": 5.674383640289307, "epoch": 1.2774207099348875, "grad_norm": 1.09375, "learning_rate": 0.0004841078608997703, "loss": 5.128, "mean_token_accuracy": 0.1933099776506424, "num_tokens": 28040906.0, "step": 15205 }, { "entropy": 5.632932567596436, "epoch": 1.2778407897500526, "grad_norm": 1.0625, "learning_rate": 0.0004840968041616267, "loss": 5.1603, "mean_token_accuracy": 0.1895108014345169, "num_tokens": 28049848.0, "step": 15210 }, { "entropy": 5.647669458389283, "epoch": 1.2782608695652173, "grad_norm": 1.109375, "learning_rate": 0.00048408574371944094, "loss": 5.167, "mean_token_accuracy": 0.18454947471618652, "num_tokens": 28058276.0, "step": 15215 }, { "entropy": 5.696973991394043, "epoch": 1.2786809493803823, "grad_norm": 0.96875, "learning_rate": 0.0004840746795734088, "loss": 5.2594, "mean_token_accuracy": 0.1869572103023529, "num_tokens": 28068185.0, "step": 15220 }, { "entropy": 5.748417234420776, "epoch": 1.2791010291955471, "grad_norm": 0.99609375, "learning_rate": 0.0004840636117237264, "loss": 5.32, "mean_token_accuracy": 0.1800918310880661, "num_tokens": 28077532.0, "step": 15225 }, { "entropy": 5.632634019851684, "epoch": 1.279521109010712, "grad_norm": 1.015625, "learning_rate": 0.0004840525401705897, "loss": 5.1649, "mean_token_accuracy": 0.18294108510017396, "num_tokens": 28087593.0, "step": 15230 }, { "entropy": 5.666968250274659, "epoch": 1.279941188825877, "grad_norm": 1.1484375, "learning_rate": 0.00048404146491419503, "loss": 5.1459, "mean_token_accuracy": 0.1934385895729065, "num_tokens": 28096256.0, "step": 15235 }, { "entropy": 5.689238262176514, "epoch": 1.2803612686410417, "grad_norm": 1.078125, "learning_rate": 0.00048403038595473837, "loss": 5.1728, "mean_token_accuracy": 0.1915496289730072, "num_tokens": 28105048.0, "step": 15240 }, { "entropy": 5.65830979347229, "epoch": 1.2807813484562067, "grad_norm": 1.0078125, "learning_rate": 0.000484019303292416, "loss": 5.2496, "mean_token_accuracy": 0.17233385294675826, "num_tokens": 28114330.0, "step": 15245 }, { "entropy": 5.656398582458496, "epoch": 1.2812014282713715, "grad_norm": 1.1015625, "learning_rate": 0.00048400821692742434, "loss": 5.1684, "mean_token_accuracy": 0.1907978519797325, "num_tokens": 28123147.0, "step": 15250 }, { "entropy": 5.680694580078125, "epoch": 1.2816215080865365, "grad_norm": 1.046875, "learning_rate": 0.00048399712685995983, "loss": 5.2847, "mean_token_accuracy": 0.18714991062879563, "num_tokens": 28132477.0, "step": 15255 }, { "entropy": 5.6503098487854, "epoch": 1.2820415879017013, "grad_norm": 1.1953125, "learning_rate": 0.00048398603309021877, "loss": 5.2757, "mean_token_accuracy": 0.18028116077184678, "num_tokens": 28141350.0, "step": 15260 }, { "entropy": 5.7183387756347654, "epoch": 1.2824616677168663, "grad_norm": 1.1953125, "learning_rate": 0.0004839749356183978, "loss": 5.2207, "mean_token_accuracy": 0.18802748173475264, "num_tokens": 28149522.0, "step": 15265 }, { "entropy": 5.680771160125732, "epoch": 1.282881747532031, "grad_norm": 0.98046875, "learning_rate": 0.0004839638344446933, "loss": 5.3013, "mean_token_accuracy": 0.18413702845573426, "num_tokens": 28159646.0, "step": 15270 }, { "entropy": 5.723370599746704, "epoch": 1.283301827347196, "grad_norm": 1.0, "learning_rate": 0.0004839527295693023, "loss": 5.2181, "mean_token_accuracy": 0.19183022528886795, "num_tokens": 28168408.0, "step": 15275 }, { "entropy": 5.692142629623413, "epoch": 1.283721907162361, "grad_norm": 1.109375, "learning_rate": 0.0004839416209924211, "loss": 5.2415, "mean_token_accuracy": 0.1824629411101341, "num_tokens": 28177744.0, "step": 15280 }, { "entropy": 5.711873197555542, "epoch": 1.2841419869775257, "grad_norm": 1.03125, "learning_rate": 0.00048393050871424676, "loss": 5.2787, "mean_token_accuracy": 0.1824249029159546, "num_tokens": 28186811.0, "step": 15285 }, { "entropy": 5.684221172332764, "epoch": 1.2845620667926907, "grad_norm": 1.0390625, "learning_rate": 0.000483919392734976, "loss": 5.2745, "mean_token_accuracy": 0.17752473503351213, "num_tokens": 28197052.0, "step": 15290 }, { "entropy": 5.638186073303222, "epoch": 1.2849821466078555, "grad_norm": 1.0390625, "learning_rate": 0.0004839082730548058, "loss": 5.13, "mean_token_accuracy": 0.1966354802250862, "num_tokens": 28206000.0, "step": 15295 }, { "entropy": 5.6376889705657955, "epoch": 1.2854022264230203, "grad_norm": 1.0625, "learning_rate": 0.0004838971496739331, "loss": 5.0888, "mean_token_accuracy": 0.1873488038778305, "num_tokens": 28214679.0, "step": 15300 }, { "entropy": 5.601489067077637, "epoch": 1.2858223062381853, "grad_norm": 0.921875, "learning_rate": 0.000483886022592555, "loss": 5.2222, "mean_token_accuracy": 0.18094058334827423, "num_tokens": 28223890.0, "step": 15305 }, { "entropy": 5.592539644241333, "epoch": 1.28624238605335, "grad_norm": 1.0234375, "learning_rate": 0.0004838748918108685, "loss": 5.161, "mean_token_accuracy": 0.18488988578319548, "num_tokens": 28232422.0, "step": 15310 }, { "entropy": 5.675677347183227, "epoch": 1.286662465868515, "grad_norm": 0.9765625, "learning_rate": 0.00048386375732907083, "loss": 5.2042, "mean_token_accuracy": 0.1888676643371582, "num_tokens": 28242079.0, "step": 15315 }, { "entropy": 5.791743993759155, "epoch": 1.2870825456836799, "grad_norm": 0.96875, "learning_rate": 0.00048385261914735936, "loss": 5.3878, "mean_token_accuracy": 0.17836733758449555, "num_tokens": 28252510.0, "step": 15320 }, { "entropy": 5.763525915145874, "epoch": 1.2875026254988446, "grad_norm": 1.0078125, "learning_rate": 0.00048384147726593125, "loss": 5.3013, "mean_token_accuracy": 0.18291713744401933, "num_tokens": 28261348.0, "step": 15325 }, { "entropy": 5.716344356536865, "epoch": 1.2879227053140097, "grad_norm": 1.03125, "learning_rate": 0.0004838303316849839, "loss": 5.1971, "mean_token_accuracy": 0.18333942890167237, "num_tokens": 28270739.0, "step": 15330 }, { "entropy": 5.6824675559997555, "epoch": 1.2883427851291747, "grad_norm": 1.0546875, "learning_rate": 0.00048381918240471473, "loss": 5.2446, "mean_token_accuracy": 0.17459220737218856, "num_tokens": 28279370.0, "step": 15335 }, { "entropy": 5.679429435729981, "epoch": 1.2887628649443394, "grad_norm": 1.09375, "learning_rate": 0.00048380802942532124, "loss": 5.1977, "mean_token_accuracy": 0.18270488679409028, "num_tokens": 28287955.0, "step": 15340 }, { "entropy": 5.604205369949341, "epoch": 1.2891829447595042, "grad_norm": 1.015625, "learning_rate": 0.00048379687274700107, "loss": 5.1358, "mean_token_accuracy": 0.19691127240657808, "num_tokens": 28296832.0, "step": 15345 }, { "entropy": 5.613079977035523, "epoch": 1.2896030245746692, "grad_norm": 0.98046875, "learning_rate": 0.00048378571236995185, "loss": 5.1686, "mean_token_accuracy": 0.1864880785346031, "num_tokens": 28305778.0, "step": 15350 }, { "entropy": 5.6916910171508786, "epoch": 1.290023104389834, "grad_norm": 1.0625, "learning_rate": 0.00048377454829437124, "loss": 5.2347, "mean_token_accuracy": 0.17607714831829072, "num_tokens": 28314615.0, "step": 15355 }, { "entropy": 5.741583061218262, "epoch": 1.290443184204999, "grad_norm": 0.984375, "learning_rate": 0.0004837633805204569, "loss": 5.2817, "mean_token_accuracy": 0.18190003037452698, "num_tokens": 28324478.0, "step": 15360 }, { "entropy": 5.683787488937378, "epoch": 1.2908632640201638, "grad_norm": 1.0234375, "learning_rate": 0.0004837522090484069, "loss": 5.2375, "mean_token_accuracy": 0.18227873146533966, "num_tokens": 28333532.0, "step": 15365 }, { "entropy": 5.710633373260498, "epoch": 1.2912833438353286, "grad_norm": 1.0, "learning_rate": 0.00048374103387841894, "loss": 5.2074, "mean_token_accuracy": 0.18370541632175447, "num_tokens": 28343723.0, "step": 15370 }, { "entropy": 5.683213424682617, "epoch": 1.2917034236504936, "grad_norm": 1.1015625, "learning_rate": 0.00048372985501069106, "loss": 5.1924, "mean_token_accuracy": 0.18435461521148683, "num_tokens": 28351992.0, "step": 15375 }, { "entropy": 5.665538263320923, "epoch": 1.2921235034656584, "grad_norm": 1.0546875, "learning_rate": 0.0004837186724454213, "loss": 5.1906, "mean_token_accuracy": 0.19026575684547425, "num_tokens": 28361141.0, "step": 15380 }, { "entropy": 5.649229955673218, "epoch": 1.2925435832808234, "grad_norm": 1.0234375, "learning_rate": 0.0004837074861828077, "loss": 5.1732, "mean_token_accuracy": 0.1884308263659477, "num_tokens": 28370339.0, "step": 15385 }, { "entropy": 5.681397438049316, "epoch": 1.2929636630959882, "grad_norm": 1.046875, "learning_rate": 0.0004836962962230485, "loss": 5.2957, "mean_token_accuracy": 0.18279809206724168, "num_tokens": 28379242.0, "step": 15390 }, { "entropy": 5.686970520019531, "epoch": 1.293383742911153, "grad_norm": 1.078125, "learning_rate": 0.0004836851025663418, "loss": 5.1641, "mean_token_accuracy": 0.19510417878627778, "num_tokens": 28388864.0, "step": 15395 }, { "entropy": 5.776041698455811, "epoch": 1.293803822726318, "grad_norm": 1.0546875, "learning_rate": 0.000483673905212886, "loss": 5.2867, "mean_token_accuracy": 0.18392356038093566, "num_tokens": 28398000.0, "step": 15400 }, { "entropy": 5.57850980758667, "epoch": 1.294223902541483, "grad_norm": 1.109375, "learning_rate": 0.0004836627041628794, "loss": 5.2124, "mean_token_accuracy": 0.1882634460926056, "num_tokens": 28407652.0, "step": 15405 }, { "entropy": 5.717814111709595, "epoch": 1.2946439823566478, "grad_norm": 0.96875, "learning_rate": 0.0004836514994165205, "loss": 5.2752, "mean_token_accuracy": 0.18100124597549438, "num_tokens": 28417694.0, "step": 15410 }, { "entropy": 5.731750440597534, "epoch": 1.2950640621718126, "grad_norm": 1.0625, "learning_rate": 0.00048364029097400777, "loss": 5.2132, "mean_token_accuracy": 0.18829178661108018, "num_tokens": 28426928.0, "step": 15415 }, { "entropy": 5.687573623657227, "epoch": 1.2954841419869776, "grad_norm": 1.0234375, "learning_rate": 0.00048362907883553956, "loss": 5.2317, "mean_token_accuracy": 0.1788952261209488, "num_tokens": 28436176.0, "step": 15420 }, { "entropy": 5.709789419174195, "epoch": 1.2959042218021424, "grad_norm": 1.171875, "learning_rate": 0.00048361786300131477, "loss": 5.3287, "mean_token_accuracy": 0.17810456901788713, "num_tokens": 28445277.0, "step": 15425 }, { "entropy": 5.707483005523682, "epoch": 1.2963243016173074, "grad_norm": 1.078125, "learning_rate": 0.0004836066434715319, "loss": 5.201, "mean_token_accuracy": 0.1838946148753166, "num_tokens": 28453959.0, "step": 15430 }, { "entropy": 5.692173194885254, "epoch": 1.2967443814324722, "grad_norm": 1.0, "learning_rate": 0.0004835954202463898, "loss": 5.295, "mean_token_accuracy": 0.18051771968603134, "num_tokens": 28463780.0, "step": 15435 }, { "entropy": 5.636995935440064, "epoch": 1.297164461247637, "grad_norm": 1.0078125, "learning_rate": 0.0004835841933260872, "loss": 5.1401, "mean_token_accuracy": 0.18210506737232207, "num_tokens": 28473299.0, "step": 15440 }, { "entropy": 5.6300867080688475, "epoch": 1.297584541062802, "grad_norm": 1.09375, "learning_rate": 0.00048357296271082305, "loss": 5.1889, "mean_token_accuracy": 0.18463243693113326, "num_tokens": 28481859.0, "step": 15445 }, { "entropy": 5.715458393096924, "epoch": 1.2980046208779668, "grad_norm": 1.015625, "learning_rate": 0.00048356172840079625, "loss": 5.253, "mean_token_accuracy": 0.1836497738957405, "num_tokens": 28491034.0, "step": 15450 }, { "entropy": 5.710042095184326, "epoch": 1.2984247006931318, "grad_norm": 1.125, "learning_rate": 0.0004835504903962058, "loss": 5.1753, "mean_token_accuracy": 0.18293989896774293, "num_tokens": 28499829.0, "step": 15455 }, { "entropy": 5.654258728027344, "epoch": 1.2988447805082965, "grad_norm": 1.15625, "learning_rate": 0.00048353924869725084, "loss": 5.158, "mean_token_accuracy": 0.19599681794643403, "num_tokens": 28508188.0, "step": 15460 }, { "entropy": 5.607231760025025, "epoch": 1.2992648603234613, "grad_norm": 1.1640625, "learning_rate": 0.0004835280033041305, "loss": 5.0757, "mean_token_accuracy": 0.19048233777284623, "num_tokens": 28516509.0, "step": 15465 }, { "entropy": 5.638155221939087, "epoch": 1.2996849401386263, "grad_norm": 1.0703125, "learning_rate": 0.0004835167542170439, "loss": 5.2828, "mean_token_accuracy": 0.18272791802883148, "num_tokens": 28526457.0, "step": 15470 }, { "entropy": 5.6887125968933105, "epoch": 1.3001050199537914, "grad_norm": 0.97265625, "learning_rate": 0.0004835055014361904, "loss": 5.2105, "mean_token_accuracy": 0.1833998218178749, "num_tokens": 28536149.0, "step": 15475 }, { "entropy": 5.764506387710571, "epoch": 1.3005250997689561, "grad_norm": 1.03125, "learning_rate": 0.00048349424496176924, "loss": 5.2724, "mean_token_accuracy": 0.18414745032787322, "num_tokens": 28545486.0, "step": 15480 }, { "entropy": 5.698614454269409, "epoch": 1.300945179584121, "grad_norm": 0.99609375, "learning_rate": 0.00048348298479397996, "loss": 5.1438, "mean_token_accuracy": 0.1875000685453415, "num_tokens": 28554555.0, "step": 15485 }, { "entropy": 5.589125299453736, "epoch": 1.301365259399286, "grad_norm": 1.125, "learning_rate": 0.00048347172093302196, "loss": 5.1926, "mean_token_accuracy": 0.19449746161699294, "num_tokens": 28563387.0, "step": 15490 }, { "entropy": 5.636162424087525, "epoch": 1.3017853392144507, "grad_norm": 0.9609375, "learning_rate": 0.00048346045337909475, "loss": 5.1878, "mean_token_accuracy": 0.19001368433237076, "num_tokens": 28573437.0, "step": 15495 }, { "entropy": 5.622128820419311, "epoch": 1.3022054190296157, "grad_norm": 1.03125, "learning_rate": 0.000483449182132398, "loss": 5.1393, "mean_token_accuracy": 0.19385597109794617, "num_tokens": 28583362.0, "step": 15500 }, { "entropy": 5.816249942779541, "epoch": 1.3026254988447805, "grad_norm": 1.0078125, "learning_rate": 0.00048343790719313124, "loss": 5.3212, "mean_token_accuracy": 0.18051309287548065, "num_tokens": 28593201.0, "step": 15505 }, { "entropy": 5.68142991065979, "epoch": 1.3030455786599453, "grad_norm": 0.99609375, "learning_rate": 0.00048342662856149427, "loss": 5.2255, "mean_token_accuracy": 0.17898503988981246, "num_tokens": 28602486.0, "step": 15510 }, { "entropy": 5.635588598251343, "epoch": 1.3034656584751103, "grad_norm": 1.0625, "learning_rate": 0.000483415346237687, "loss": 5.2398, "mean_token_accuracy": 0.18370794355869294, "num_tokens": 28611643.0, "step": 15515 }, { "entropy": 5.722039413452149, "epoch": 1.303885738290275, "grad_norm": 1.046875, "learning_rate": 0.0004834040602219091, "loss": 5.2764, "mean_token_accuracy": 0.18739972859621049, "num_tokens": 28620545.0, "step": 15520 }, { "entropy": 5.666640424728394, "epoch": 1.30430581810544, "grad_norm": 0.98828125, "learning_rate": 0.00048339277051436067, "loss": 5.2207, "mean_token_accuracy": 0.18616312146186828, "num_tokens": 28630024.0, "step": 15525 }, { "entropy": 5.762605762481689, "epoch": 1.304725897920605, "grad_norm": 1.1015625, "learning_rate": 0.0004833814771152415, "loss": 5.2547, "mean_token_accuracy": 0.19027598202228546, "num_tokens": 28638995.0, "step": 15530 }, { "entropy": 5.697500371932984, "epoch": 1.3051459777357697, "grad_norm": 1.0703125, "learning_rate": 0.00048337018002475184, "loss": 5.2307, "mean_token_accuracy": 0.18560848534107208, "num_tokens": 28647833.0, "step": 15535 }, { "entropy": 5.601126480102539, "epoch": 1.3055660575509347, "grad_norm": 1.0546875, "learning_rate": 0.0004833588792430917, "loss": 5.119, "mean_token_accuracy": 0.19391625076532365, "num_tokens": 28657441.0, "step": 15540 }, { "entropy": 5.655535364151001, "epoch": 1.3059861373660997, "grad_norm": 1.1015625, "learning_rate": 0.0004833475747704614, "loss": 5.2438, "mean_token_accuracy": 0.18754034340381623, "num_tokens": 28666666.0, "step": 15545 }, { "entropy": 5.667781162261963, "epoch": 1.3064062171812645, "grad_norm": 0.96484375, "learning_rate": 0.000483336266607061, "loss": 5.2349, "mean_token_accuracy": 0.18742330819368364, "num_tokens": 28676770.0, "step": 15550 }, { "entropy": 5.695110750198364, "epoch": 1.3068262969964293, "grad_norm": 1.0703125, "learning_rate": 0.00048332495475309097, "loss": 5.1511, "mean_token_accuracy": 0.18711930364370347, "num_tokens": 28685610.0, "step": 15555 }, { "entropy": 5.709208297729492, "epoch": 1.3072463768115943, "grad_norm": 1.0078125, "learning_rate": 0.00048331363920875155, "loss": 5.2564, "mean_token_accuracy": 0.1860833615064621, "num_tokens": 28695082.0, "step": 15560 }, { "entropy": 5.638410568237305, "epoch": 1.307666456626759, "grad_norm": 1.0078125, "learning_rate": 0.00048330231997424335, "loss": 5.1926, "mean_token_accuracy": 0.1873560816049576, "num_tokens": 28704006.0, "step": 15565 }, { "entropy": 5.67619104385376, "epoch": 1.308086536441924, "grad_norm": 1.015625, "learning_rate": 0.0004832909970497668, "loss": 5.2001, "mean_token_accuracy": 0.18815808594226838, "num_tokens": 28713665.0, "step": 15570 }, { "entropy": 5.667042827606201, "epoch": 1.3085066162570889, "grad_norm": 0.984375, "learning_rate": 0.00048327967043552245, "loss": 5.1711, "mean_token_accuracy": 0.18995516747236252, "num_tokens": 28722920.0, "step": 15575 }, { "entropy": 5.624366569519043, "epoch": 1.3089266960722536, "grad_norm": 1.015625, "learning_rate": 0.00048326834013171107, "loss": 5.1147, "mean_token_accuracy": 0.1942248374223709, "num_tokens": 28731689.0, "step": 15580 }, { "entropy": 5.663785982131958, "epoch": 1.3093467758874187, "grad_norm": 1.078125, "learning_rate": 0.0004832570061385332, "loss": 5.232, "mean_token_accuracy": 0.19268742054700852, "num_tokens": 28741308.0, "step": 15585 }, { "entropy": 5.6395485401153564, "epoch": 1.3097668557025834, "grad_norm": 1.0859375, "learning_rate": 0.0004832456684561898, "loss": 5.204, "mean_token_accuracy": 0.18955718725919724, "num_tokens": 28750190.0, "step": 15590 }, { "entropy": 5.680215978622437, "epoch": 1.3101869355177485, "grad_norm": 1.0625, "learning_rate": 0.0004832343270848815, "loss": 5.2777, "mean_token_accuracy": 0.17860866338014603, "num_tokens": 28759588.0, "step": 15595 }, { "entropy": 5.695086526870727, "epoch": 1.3106070153329132, "grad_norm": 1.0390625, "learning_rate": 0.00048322298202480935, "loss": 5.2856, "mean_token_accuracy": 0.18582217693328856, "num_tokens": 28768800.0, "step": 15600 }, { "entropy": 5.711812305450439, "epoch": 1.311027095148078, "grad_norm": 0.9765625, "learning_rate": 0.00048321163327617433, "loss": 5.2129, "mean_token_accuracy": 0.18443716317415237, "num_tokens": 28778108.0, "step": 15605 }, { "entropy": 5.726698589324951, "epoch": 1.311447174963243, "grad_norm": 1.015625, "learning_rate": 0.0004832002808391775, "loss": 5.1786, "mean_token_accuracy": 0.18866728246212006, "num_tokens": 28787202.0, "step": 15610 }, { "entropy": 5.6856794357299805, "epoch": 1.3118672547784078, "grad_norm": 1.015625, "learning_rate": 0.0004831889247140198, "loss": 5.2167, "mean_token_accuracy": 0.18493859171867372, "num_tokens": 28797482.0, "step": 15615 }, { "entropy": 5.624531507492065, "epoch": 1.3122873345935728, "grad_norm": 1.015625, "learning_rate": 0.00048317756490090253, "loss": 5.1615, "mean_token_accuracy": 0.18613650798797607, "num_tokens": 28805872.0, "step": 15620 }, { "entropy": 5.671494817733764, "epoch": 1.3127074144087376, "grad_norm": 1.0234375, "learning_rate": 0.00048316620140002685, "loss": 5.2728, "mean_token_accuracy": 0.17938014715909958, "num_tokens": 28814836.0, "step": 15625 }, { "entropy": 5.740354824066162, "epoch": 1.3131274942239024, "grad_norm": 1.03125, "learning_rate": 0.0004831548342115942, "loss": 5.2637, "mean_token_accuracy": 0.17944078296422958, "num_tokens": 28824727.0, "step": 15630 }, { "entropy": 5.758591842651367, "epoch": 1.3135475740390674, "grad_norm": 1.15625, "learning_rate": 0.00048314346333580576, "loss": 5.3631, "mean_token_accuracy": 0.17876532375812532, "num_tokens": 28833848.0, "step": 15635 }, { "entropy": 5.653533267974853, "epoch": 1.3139676538542324, "grad_norm": 1.2109375, "learning_rate": 0.0004831320887728631, "loss": 5.1237, "mean_token_accuracy": 0.1950199633836746, "num_tokens": 28842198.0, "step": 15640 }, { "entropy": 5.668334197998047, "epoch": 1.3143877336693972, "grad_norm": 1.046875, "learning_rate": 0.0004831207105229676, "loss": 5.2219, "mean_token_accuracy": 0.18556759208440782, "num_tokens": 28851804.0, "step": 15645 }, { "entropy": 5.569966840744018, "epoch": 1.314807813484562, "grad_norm": 1.09375, "learning_rate": 0.00048310932858632087, "loss": 5.1195, "mean_token_accuracy": 0.18943312466144563, "num_tokens": 28860181.0, "step": 15650 }, { "entropy": 5.620699739456176, "epoch": 1.315227893299727, "grad_norm": 1.0234375, "learning_rate": 0.00048309794296312467, "loss": 5.1893, "mean_token_accuracy": 0.1921105682849884, "num_tokens": 28869945.0, "step": 15655 }, { "entropy": 5.684640026092529, "epoch": 1.3156479731148918, "grad_norm": 1.0546875, "learning_rate": 0.00048308655365358053, "loss": 5.2573, "mean_token_accuracy": 0.18720219135284424, "num_tokens": 28880343.0, "step": 15660 }, { "entropy": 5.799919986724854, "epoch": 1.3160680529300568, "grad_norm": 1.2421875, "learning_rate": 0.00048307516065789017, "loss": 5.3152, "mean_token_accuracy": 0.18171356320381166, "num_tokens": 28889441.0, "step": 15665 }, { "entropy": 5.734494638442993, "epoch": 1.3164881327452216, "grad_norm": 1.09375, "learning_rate": 0.00048306376397625546, "loss": 5.2628, "mean_token_accuracy": 0.1802811473608017, "num_tokens": 28898154.0, "step": 15670 }, { "entropy": 5.6840392589569095, "epoch": 1.3169082125603864, "grad_norm": 1.0234375, "learning_rate": 0.00048305236360887834, "loss": 5.2313, "mean_token_accuracy": 0.18511337786912918, "num_tokens": 28908359.0, "step": 15675 }, { "entropy": 5.641195774078369, "epoch": 1.3173282923755514, "grad_norm": 1.0703125, "learning_rate": 0.00048304095955596074, "loss": 5.2406, "mean_token_accuracy": 0.17897201031446458, "num_tokens": 28918416.0, "step": 15680 }, { "entropy": 5.741735887527466, "epoch": 1.3177483721907162, "grad_norm": 1.0234375, "learning_rate": 0.0004830295518177047, "loss": 5.1497, "mean_token_accuracy": 0.1944534122943878, "num_tokens": 28927412.0, "step": 15685 }, { "entropy": 5.658163118362427, "epoch": 1.3181684520058812, "grad_norm": 1.0390625, "learning_rate": 0.00048301814039431227, "loss": 5.2163, "mean_token_accuracy": 0.18869040459394454, "num_tokens": 28936106.0, "step": 15690 }, { "entropy": 5.6517298221588135, "epoch": 1.318588531821046, "grad_norm": 1.109375, "learning_rate": 0.00048300672528598553, "loss": 5.2424, "mean_token_accuracy": 0.18627324402332307, "num_tokens": 28945197.0, "step": 15695 }, { "entropy": 5.753924036026001, "epoch": 1.3190086116362107, "grad_norm": 0.96484375, "learning_rate": 0.0004829953064929268, "loss": 5.2697, "mean_token_accuracy": 0.18008612394332885, "num_tokens": 28954278.0, "step": 15700 }, { "entropy": 5.798672914505005, "epoch": 1.3194286914513758, "grad_norm": 1.0625, "learning_rate": 0.0004829838840153383, "loss": 5.3337, "mean_token_accuracy": 0.17929676324129104, "num_tokens": 28963101.0, "step": 15705 }, { "entropy": 5.65045657157898, "epoch": 1.3198487712665408, "grad_norm": 1.09375, "learning_rate": 0.0004829724578534224, "loss": 5.201, "mean_token_accuracy": 0.1792924165725708, "num_tokens": 28972063.0, "step": 15710 }, { "entropy": 5.662495374679565, "epoch": 1.3202688510817056, "grad_norm": 1.0625, "learning_rate": 0.00048296102800738153, "loss": 5.1584, "mean_token_accuracy": 0.18861245810985566, "num_tokens": 28981617.0, "step": 15715 }, { "entropy": 5.713338375091553, "epoch": 1.3206889308968703, "grad_norm": 1.046875, "learning_rate": 0.00048294959447741807, "loss": 5.175, "mean_token_accuracy": 0.1867440566420555, "num_tokens": 28989442.0, "step": 15720 }, { "entropy": 5.658263874053955, "epoch": 1.3211090107120353, "grad_norm": 1.0625, "learning_rate": 0.00048293815726373467, "loss": 5.1689, "mean_token_accuracy": 0.19235741794109346, "num_tokens": 28999104.0, "step": 15725 }, { "entropy": 5.666537237167359, "epoch": 1.3215290905272001, "grad_norm": 1.0625, "learning_rate": 0.00048292671636653386, "loss": 5.2417, "mean_token_accuracy": 0.18254392594099045, "num_tokens": 29008645.0, "step": 15730 }, { "entropy": 5.683541631698608, "epoch": 1.3219491703423651, "grad_norm": 1.1015625, "learning_rate": 0.0004829152717860184, "loss": 5.2195, "mean_token_accuracy": 0.18260193318128587, "num_tokens": 29018655.0, "step": 15735 }, { "entropy": 5.717009496688843, "epoch": 1.32236925015753, "grad_norm": 1.046875, "learning_rate": 0.00048290382352239087, "loss": 5.2219, "mean_token_accuracy": 0.18736556321382522, "num_tokens": 29027109.0, "step": 15740 }, { "entropy": 5.635298728942871, "epoch": 1.3227893299726947, "grad_norm": 1.0859375, "learning_rate": 0.00048289237157585424, "loss": 5.0509, "mean_token_accuracy": 0.2018749251961708, "num_tokens": 29035535.0, "step": 15745 }, { "entropy": 5.608642959594727, "epoch": 1.3232094097878597, "grad_norm": 1.0234375, "learning_rate": 0.0004828809159466112, "loss": 5.2021, "mean_token_accuracy": 0.18306633979082107, "num_tokens": 29044723.0, "step": 15750 }, { "entropy": 5.691537714004516, "epoch": 1.3236294896030245, "grad_norm": 1.1328125, "learning_rate": 0.0004828694566348648, "loss": 5.3617, "mean_token_accuracy": 0.17423819303512572, "num_tokens": 29053636.0, "step": 15755 }, { "entropy": 5.753398752212524, "epoch": 1.3240495694181895, "grad_norm": 1.0546875, "learning_rate": 0.00048285799364081806, "loss": 5.2446, "mean_token_accuracy": 0.1801772251725197, "num_tokens": 29062940.0, "step": 15760 }, { "entropy": 5.66590051651001, "epoch": 1.3244696492333543, "grad_norm": 0.9921875, "learning_rate": 0.00048284652696467404, "loss": 5.1594, "mean_token_accuracy": 0.19004925787448884, "num_tokens": 29072159.0, "step": 15765 }, { "entropy": 5.7182670593261715, "epoch": 1.324889729048519, "grad_norm": 1.0234375, "learning_rate": 0.00048283505660663575, "loss": 5.2409, "mean_token_accuracy": 0.18952523916959763, "num_tokens": 29081544.0, "step": 15770 }, { "entropy": 5.680987405776977, "epoch": 1.325309808863684, "grad_norm": 1.0234375, "learning_rate": 0.0004828235825669064, "loss": 5.2043, "mean_token_accuracy": 0.18984013050794601, "num_tokens": 29090710.0, "step": 15775 }, { "entropy": 5.696029758453369, "epoch": 1.325729888678849, "grad_norm": 1.0859375, "learning_rate": 0.00048281210484568937, "loss": 5.22, "mean_token_accuracy": 0.19033609181642533, "num_tokens": 29098988.0, "step": 15780 }, { "entropy": 5.61902174949646, "epoch": 1.326149968494014, "grad_norm": 1.0390625, "learning_rate": 0.00048280062344318794, "loss": 5.247, "mean_token_accuracy": 0.1763183817267418, "num_tokens": 29108926.0, "step": 15785 }, { "entropy": 5.6475972652435305, "epoch": 1.3265700483091787, "grad_norm": 1.1328125, "learning_rate": 0.0004827891383596054, "loss": 5.1725, "mean_token_accuracy": 0.19037580490112305, "num_tokens": 29118065.0, "step": 15790 }, { "entropy": 5.653292989730835, "epoch": 1.3269901281243437, "grad_norm": 1.109375, "learning_rate": 0.00048277764959514524, "loss": 5.1407, "mean_token_accuracy": 0.18769484162330627, "num_tokens": 29127030.0, "step": 15795 }, { "entropy": 5.751645517349243, "epoch": 1.3274102079395085, "grad_norm": 0.98828125, "learning_rate": 0.0004827661571500111, "loss": 5.2748, "mean_token_accuracy": 0.1802245259284973, "num_tokens": 29137200.0, "step": 15800 }, { "entropy": 5.70492639541626, "epoch": 1.3278302877546735, "grad_norm": 0.9609375, "learning_rate": 0.00048275466102440644, "loss": 5.2376, "mean_token_accuracy": 0.1885538801550865, "num_tokens": 29147029.0, "step": 15805 }, { "entropy": 5.65048885345459, "epoch": 1.3282503675698383, "grad_norm": 1.1328125, "learning_rate": 0.00048274316121853494, "loss": 5.1526, "mean_token_accuracy": 0.19232604205608367, "num_tokens": 29155675.0, "step": 15810 }, { "entropy": 5.71104097366333, "epoch": 1.328670447385003, "grad_norm": 1.0703125, "learning_rate": 0.00048273165773260023, "loss": 5.1993, "mean_token_accuracy": 0.18791833370923997, "num_tokens": 29164730.0, "step": 15815 }, { "entropy": 5.6307838439941404, "epoch": 1.329090527200168, "grad_norm": 1.125, "learning_rate": 0.0004827201505668063, "loss": 5.2002, "mean_token_accuracy": 0.18692273944616317, "num_tokens": 29173074.0, "step": 15820 }, { "entropy": 5.733736324310303, "epoch": 1.3295106070153329, "grad_norm": 1.0, "learning_rate": 0.0004827086397213568, "loss": 5.3054, "mean_token_accuracy": 0.18535079509019853, "num_tokens": 29182175.0, "step": 15825 }, { "entropy": 5.904061985015869, "epoch": 1.3299306868304979, "grad_norm": 1.0625, "learning_rate": 0.0004826971251964557, "loss": 5.5144, "mean_token_accuracy": 0.1743567131459713, "num_tokens": 29192910.0, "step": 15830 }, { "entropy": 5.644248056411743, "epoch": 1.3303507666456627, "grad_norm": 0.98046875, "learning_rate": 0.000482685606992307, "loss": 5.1588, "mean_token_accuracy": 0.18839340656995773, "num_tokens": 29201969.0, "step": 15835 }, { "entropy": 5.773070001602173, "epoch": 1.3307708464608274, "grad_norm": 1.0703125, "learning_rate": 0.00048267408510911463, "loss": 5.2958, "mean_token_accuracy": 0.18461482375860214, "num_tokens": 29210475.0, "step": 15840 }, { "entropy": 5.657936668395996, "epoch": 1.3311909262759924, "grad_norm": 1.0390625, "learning_rate": 0.0004826625595470829, "loss": 5.1754, "mean_token_accuracy": 0.18231599926948547, "num_tokens": 29222586.0, "step": 15845 }, { "entropy": 5.682246160507202, "epoch": 1.3316110060911575, "grad_norm": 1.015625, "learning_rate": 0.00048265103030641575, "loss": 5.2424, "mean_token_accuracy": 0.18216193914413453, "num_tokens": 29231503.0, "step": 15850 }, { "entropy": 5.649673223495483, "epoch": 1.3320310859063222, "grad_norm": 1.1015625, "learning_rate": 0.0004826394973873176, "loss": 5.2021, "mean_token_accuracy": 0.1840102419257164, "num_tokens": 29241534.0, "step": 15855 }, { "entropy": 5.664816951751709, "epoch": 1.332451165721487, "grad_norm": 1.0078125, "learning_rate": 0.00048262796078999266, "loss": 5.1982, "mean_token_accuracy": 0.18470583409070968, "num_tokens": 29250381.0, "step": 15860 }, { "entropy": 5.678180980682373, "epoch": 1.332871245536652, "grad_norm": 1.09375, "learning_rate": 0.0004826164205146453, "loss": 5.2839, "mean_token_accuracy": 0.181551893055439, "num_tokens": 29259205.0, "step": 15865 }, { "entropy": 5.66503324508667, "epoch": 1.3332913253518168, "grad_norm": 1.0703125, "learning_rate": 0.00048260487656147995, "loss": 5.1581, "mean_token_accuracy": 0.1904314160346985, "num_tokens": 29267723.0, "step": 15870 }, { "entropy": 5.630204105377198, "epoch": 1.3337114051669818, "grad_norm": 1.09375, "learning_rate": 0.00048259332893070106, "loss": 5.1719, "mean_token_accuracy": 0.18981162458658218, "num_tokens": 29277102.0, "step": 15875 }, { "entropy": 5.686960029602051, "epoch": 1.3341314849821466, "grad_norm": 1.1171875, "learning_rate": 0.0004825817776225133, "loss": 5.1573, "mean_token_accuracy": 0.18834829926490784, "num_tokens": 29286484.0, "step": 15880 }, { "entropy": 5.700734853744507, "epoch": 1.3345515647973114, "grad_norm": 1.03125, "learning_rate": 0.00048257022263712123, "loss": 5.2555, "mean_token_accuracy": 0.19244832545518875, "num_tokens": 29296528.0, "step": 15885 }, { "entropy": 5.597277116775513, "epoch": 1.3349716446124764, "grad_norm": 1.0859375, "learning_rate": 0.00048255866397472954, "loss": 5.117, "mean_token_accuracy": 0.1913746953010559, "num_tokens": 29305283.0, "step": 15890 }, { "entropy": 5.631728458404541, "epoch": 1.3353917244276412, "grad_norm": 1.1875, "learning_rate": 0.000482547101635543, "loss": 5.1181, "mean_token_accuracy": 0.18797454088926316, "num_tokens": 29315088.0, "step": 15895 }, { "entropy": 5.662004566192627, "epoch": 1.3358118042428062, "grad_norm": 1.0390625, "learning_rate": 0.00048253553561976645, "loss": 5.1541, "mean_token_accuracy": 0.19282459318637848, "num_tokens": 29323793.0, "step": 15900 }, { "entropy": 5.662420415878296, "epoch": 1.336231884057971, "grad_norm": 0.9140625, "learning_rate": 0.0004825239659276047, "loss": 5.2048, "mean_token_accuracy": 0.18351283222436904, "num_tokens": 29334015.0, "step": 15905 }, { "entropy": 5.743709421157837, "epoch": 1.3366519638731358, "grad_norm": 1.0625, "learning_rate": 0.0004825123925592628, "loss": 5.3188, "mean_token_accuracy": 0.18209717720746993, "num_tokens": 29343221.0, "step": 15910 }, { "entropy": 5.675715541839599, "epoch": 1.3370720436883008, "grad_norm": 1.015625, "learning_rate": 0.00048250081551494574, "loss": 5.1646, "mean_token_accuracy": 0.1875445067882538, "num_tokens": 29352261.0, "step": 15915 }, { "entropy": 5.681710433959961, "epoch": 1.3374921235034656, "grad_norm": 1.03125, "learning_rate": 0.0004824892347948586, "loss": 5.2685, "mean_token_accuracy": 0.18324420899152755, "num_tokens": 29362138.0, "step": 15920 }, { "entropy": 5.672230958938599, "epoch": 1.3379122033186306, "grad_norm": 1.0390625, "learning_rate": 0.0004824776503992064, "loss": 5.1729, "mean_token_accuracy": 0.18980335295200348, "num_tokens": 29371234.0, "step": 15925 }, { "entropy": 5.654215383529663, "epoch": 1.3383322831337954, "grad_norm": 1.1015625, "learning_rate": 0.0004824660623281945, "loss": 5.2142, "mean_token_accuracy": 0.1929275706410408, "num_tokens": 29380371.0, "step": 15930 }, { "entropy": 5.740076732635498, "epoch": 1.3387523629489604, "grad_norm": 1.046875, "learning_rate": 0.00048245447058202815, "loss": 5.333, "mean_token_accuracy": 0.18076390027999878, "num_tokens": 29389230.0, "step": 15935 }, { "entropy": 5.7338865280151365, "epoch": 1.3391724427641252, "grad_norm": 1.03125, "learning_rate": 0.0004824428751609126, "loss": 5.2181, "mean_token_accuracy": 0.18634677529335023, "num_tokens": 29398753.0, "step": 15940 }, { "entropy": 5.67572979927063, "epoch": 1.3395925225792902, "grad_norm": 1.140625, "learning_rate": 0.00048243127606505343, "loss": 5.1979, "mean_token_accuracy": 0.19060231000185013, "num_tokens": 29407487.0, "step": 15945 }, { "entropy": 5.578230571746826, "epoch": 1.340012602394455, "grad_norm": 1.1015625, "learning_rate": 0.000482419673294656, "loss": 5.1895, "mean_token_accuracy": 0.18391541242599488, "num_tokens": 29416140.0, "step": 15950 }, { "entropy": 5.621850061416626, "epoch": 1.3404326822096198, "grad_norm": 1.0625, "learning_rate": 0.0004824080668499259, "loss": 5.2012, "mean_token_accuracy": 0.18630782663822174, "num_tokens": 29424763.0, "step": 15955 }, { "entropy": 5.774163293838501, "epoch": 1.3408527620247848, "grad_norm": 1.1953125, "learning_rate": 0.00048239645673106855, "loss": 5.2115, "mean_token_accuracy": 0.18540870845317842, "num_tokens": 29434589.0, "step": 15960 }, { "entropy": 5.70867166519165, "epoch": 1.3412728418399495, "grad_norm": 1.0625, "learning_rate": 0.00048238484293828995, "loss": 5.2085, "mean_token_accuracy": 0.18407559841871263, "num_tokens": 29443549.0, "step": 15965 }, { "entropy": 5.671496868133545, "epoch": 1.3416929216551146, "grad_norm": 1.0703125, "learning_rate": 0.0004823732254717955, "loss": 5.2342, "mean_token_accuracy": 0.18288592547178267, "num_tokens": 29452457.0, "step": 15970 }, { "entropy": 5.663273906707763, "epoch": 1.3421130014702793, "grad_norm": 1.0546875, "learning_rate": 0.0004823616043317912, "loss": 5.2084, "mean_token_accuracy": 0.1833146706223488, "num_tokens": 29461238.0, "step": 15975 }, { "entropy": 5.730727338790894, "epoch": 1.3425330812854441, "grad_norm": 0.95703125, "learning_rate": 0.00048234997951848284, "loss": 5.2622, "mean_token_accuracy": 0.18041736483573914, "num_tokens": 29471170.0, "step": 15980 }, { "entropy": 5.77214674949646, "epoch": 1.3429531611006091, "grad_norm": 0.97265625, "learning_rate": 0.0004823383510320764, "loss": 5.2657, "mean_token_accuracy": 0.17841576486825944, "num_tokens": 29481017.0, "step": 15985 }, { "entropy": 5.746063661575318, "epoch": 1.343373240915774, "grad_norm": 1.0546875, "learning_rate": 0.00048232671887277786, "loss": 5.2171, "mean_token_accuracy": 0.19310958236455916, "num_tokens": 29489809.0, "step": 15990 }, { "entropy": 5.629018402099609, "epoch": 1.343793320730939, "grad_norm": 1.0625, "learning_rate": 0.00048231508304079313, "loss": 5.2279, "mean_token_accuracy": 0.18908899575471877, "num_tokens": 29499499.0, "step": 15995 }, { "entropy": 5.678432941436768, "epoch": 1.3442134005461037, "grad_norm": 1.0703125, "learning_rate": 0.00048230344353632855, "loss": 5.1884, "mean_token_accuracy": 0.18890270888805388, "num_tokens": 29508526.0, "step": 16000 }, { "entropy": 5.6884765625, "epoch": 1.3446334803612685, "grad_norm": 1.109375, "learning_rate": 0.0004822918003595902, "loss": 5.1397, "mean_token_accuracy": 0.18622955083847045, "num_tokens": 29517516.0, "step": 16005 }, { "entropy": 5.710375165939331, "epoch": 1.3450535601764335, "grad_norm": 1.0390625, "learning_rate": 0.0004822801535107843, "loss": 5.2308, "mean_token_accuracy": 0.18227750658988953, "num_tokens": 29526949.0, "step": 16010 }, { "entropy": 5.635251998901367, "epoch": 1.3454736399915985, "grad_norm": 1.0625, "learning_rate": 0.0004822685029901173, "loss": 5.1558, "mean_token_accuracy": 0.1851478785276413, "num_tokens": 29536696.0, "step": 16015 }, { "entropy": 5.644952821731567, "epoch": 1.3458937198067633, "grad_norm": 1.0625, "learning_rate": 0.0004822568487977954, "loss": 5.2426, "mean_token_accuracy": 0.19276821464300156, "num_tokens": 29545672.0, "step": 16020 }, { "entropy": 5.688053846359253, "epoch": 1.346313799621928, "grad_norm": 1.09375, "learning_rate": 0.00048224519093402517, "loss": 5.2505, "mean_token_accuracy": 0.18563836216926574, "num_tokens": 29554888.0, "step": 16025 }, { "entropy": 5.671356439590454, "epoch": 1.346733879437093, "grad_norm": 1.0234375, "learning_rate": 0.00048223352939901317, "loss": 5.1919, "mean_token_accuracy": 0.18930904865264891, "num_tokens": 29564798.0, "step": 16030 }, { "entropy": 5.68819580078125, "epoch": 1.347153959252258, "grad_norm": 0.96875, "learning_rate": 0.0004822218641929658, "loss": 5.2157, "mean_token_accuracy": 0.18981837034225463, "num_tokens": 29574802.0, "step": 16035 }, { "entropy": 5.726577425003052, "epoch": 1.347574039067423, "grad_norm": 1.0390625, "learning_rate": 0.0004822101953160899, "loss": 5.2136, "mean_token_accuracy": 0.1801066979765892, "num_tokens": 29583056.0, "step": 16040 }, { "entropy": 5.649437618255615, "epoch": 1.3479941188825877, "grad_norm": 1.1328125, "learning_rate": 0.000482198522768592, "loss": 5.1926, "mean_token_accuracy": 0.17975838482379913, "num_tokens": 29591935.0, "step": 16045 }, { "entropy": 5.587256622314453, "epoch": 1.3484141986977525, "grad_norm": 1.015625, "learning_rate": 0.00048218684655067907, "loss": 5.1416, "mean_token_accuracy": 0.19269577264785767, "num_tokens": 29600812.0, "step": 16050 }, { "entropy": 5.729511404037476, "epoch": 1.3488342785129175, "grad_norm": 1.03125, "learning_rate": 0.0004821751666625577, "loss": 5.2368, "mean_token_accuracy": 0.18645123690366744, "num_tokens": 29610735.0, "step": 16055 }, { "entropy": 5.6917768001556395, "epoch": 1.3492543583280823, "grad_norm": 1.0234375, "learning_rate": 0.00048216348310443506, "loss": 5.1733, "mean_token_accuracy": 0.18030295372009278, "num_tokens": 29620295.0, "step": 16060 }, { "entropy": 5.64300947189331, "epoch": 1.3496744381432473, "grad_norm": 1.0703125, "learning_rate": 0.00048215179587651795, "loss": 5.0562, "mean_token_accuracy": 0.20104290097951888, "num_tokens": 29628214.0, "step": 16065 }, { "entropy": 5.646235370635987, "epoch": 1.350094517958412, "grad_norm": 1.0703125, "learning_rate": 0.0004821401049790134, "loss": 5.1797, "mean_token_accuracy": 0.19180000722408294, "num_tokens": 29636598.0, "step": 16070 }, { "entropy": 5.668827772140503, "epoch": 1.3505145977735769, "grad_norm": 0.9921875, "learning_rate": 0.0004821284104121286, "loss": 5.1577, "mean_token_accuracy": 0.19004821181297302, "num_tokens": 29646052.0, "step": 16075 }, { "entropy": 5.663706254959107, "epoch": 1.3509346775887419, "grad_norm": 1.1015625, "learning_rate": 0.00048211671217607066, "loss": 5.1914, "mean_token_accuracy": 0.18023791760206223, "num_tokens": 29655310.0, "step": 16080 }, { "entropy": 5.714651298522949, "epoch": 1.3513547574039069, "grad_norm": 1.140625, "learning_rate": 0.0004821050102710468, "loss": 5.2159, "mean_token_accuracy": 0.18527817279100417, "num_tokens": 29664020.0, "step": 16085 }, { "entropy": 5.634983015060425, "epoch": 1.3517748372190717, "grad_norm": 1.0546875, "learning_rate": 0.00048209330469726433, "loss": 5.2637, "mean_token_accuracy": 0.18105848133563995, "num_tokens": 29672416.0, "step": 16090 }, { "entropy": 5.692638969421386, "epoch": 1.3521949170342364, "grad_norm": 0.984375, "learning_rate": 0.00048208159545493057, "loss": 5.1572, "mean_token_accuracy": 0.19302482455968856, "num_tokens": 29681148.0, "step": 16095 }, { "entropy": 5.666637563705445, "epoch": 1.3526149968494015, "grad_norm": 0.984375, "learning_rate": 0.0004820698825442531, "loss": 5.1384, "mean_token_accuracy": 0.19141075015068054, "num_tokens": 29689089.0, "step": 16100 }, { "entropy": 5.678461360931396, "epoch": 1.3530350766645662, "grad_norm": 0.96484375, "learning_rate": 0.00048205816596543914, "loss": 5.2703, "mean_token_accuracy": 0.1827550783753395, "num_tokens": 29697704.0, "step": 16105 }, { "entropy": 5.657993602752685, "epoch": 1.3534551564797312, "grad_norm": 1.0234375, "learning_rate": 0.00048204644571869646, "loss": 5.2667, "mean_token_accuracy": 0.17853583693504332, "num_tokens": 29706966.0, "step": 16110 }, { "entropy": 5.636247205734253, "epoch": 1.353875236294896, "grad_norm": 1.078125, "learning_rate": 0.0004820347218042326, "loss": 5.1744, "mean_token_accuracy": 0.18644336014986038, "num_tokens": 29715817.0, "step": 16115 }, { "entropy": 5.686433887481689, "epoch": 1.3542953161100608, "grad_norm": 1.0859375, "learning_rate": 0.0004820229942222553, "loss": 5.2685, "mean_token_accuracy": 0.18254653215408326, "num_tokens": 29725500.0, "step": 16120 }, { "entropy": 5.6613428592681885, "epoch": 1.3547153959252258, "grad_norm": 1.046875, "learning_rate": 0.00048201126297297214, "loss": 5.1836, "mean_token_accuracy": 0.1885615035891533, "num_tokens": 29734774.0, "step": 16125 }, { "entropy": 5.67340087890625, "epoch": 1.3551354757403906, "grad_norm": 0.88671875, "learning_rate": 0.0004819995280565911, "loss": 5.1615, "mean_token_accuracy": 0.19176916033029556, "num_tokens": 29744667.0, "step": 16130 }, { "entropy": 5.773205041885376, "epoch": 1.3555555555555556, "grad_norm": 1.0703125, "learning_rate": 0.00048198778947332, "loss": 5.2628, "mean_token_accuracy": 0.18847485035657882, "num_tokens": 29753644.0, "step": 16135 }, { "entropy": 5.767695903778076, "epoch": 1.3559756353707204, "grad_norm": 1.0859375, "learning_rate": 0.0004819760472233668, "loss": 5.1889, "mean_token_accuracy": 0.19281279742717744, "num_tokens": 29762977.0, "step": 16140 }, { "entropy": 5.651463031768799, "epoch": 1.3563957151858852, "grad_norm": 1.0, "learning_rate": 0.00048196430130693956, "loss": 5.1912, "mean_token_accuracy": 0.19013935774564744, "num_tokens": 29772221.0, "step": 16145 }, { "entropy": 5.6568115234375, "epoch": 1.3568157950010502, "grad_norm": 1.1015625, "learning_rate": 0.00048195255172424627, "loss": 5.1613, "mean_token_accuracy": 0.19426402300596238, "num_tokens": 29781240.0, "step": 16150 }, { "entropy": 5.685576248168945, "epoch": 1.3572358748162152, "grad_norm": 1.0859375, "learning_rate": 0.00048194079847549507, "loss": 5.1591, "mean_token_accuracy": 0.1893797904253006, "num_tokens": 29790330.0, "step": 16155 }, { "entropy": 5.759364128112793, "epoch": 1.35765595463138, "grad_norm": 1.0078125, "learning_rate": 0.0004819290415608942, "loss": 5.275, "mean_token_accuracy": 0.18803547769784928, "num_tokens": 29800945.0, "step": 16160 }, { "entropy": 5.748047876358032, "epoch": 1.3580760344465448, "grad_norm": 1.0390625, "learning_rate": 0.0004819172809806519, "loss": 5.3266, "mean_token_accuracy": 0.17912647426128386, "num_tokens": 29810391.0, "step": 16165 }, { "entropy": 5.6772714138031, "epoch": 1.3584961142617098, "grad_norm": 1.0625, "learning_rate": 0.00048190551673497645, "loss": 5.1899, "mean_token_accuracy": 0.1841709315776825, "num_tokens": 29819511.0, "step": 16170 }, { "entropy": 5.665785121917724, "epoch": 1.3589161940768746, "grad_norm": 1.109375, "learning_rate": 0.0004818937488240764, "loss": 5.2326, "mean_token_accuracy": 0.1885146364569664, "num_tokens": 29828313.0, "step": 16175 }, { "entropy": 5.6614518642425535, "epoch": 1.3593362738920396, "grad_norm": 1.015625, "learning_rate": 0.00048188197724816014, "loss": 5.1305, "mean_token_accuracy": 0.1909303680062294, "num_tokens": 29837940.0, "step": 16180 }, { "entropy": 5.686138677597046, "epoch": 1.3597563537072044, "grad_norm": 0.9765625, "learning_rate": 0.00048187020200743613, "loss": 5.1042, "mean_token_accuracy": 0.19251737147569656, "num_tokens": 29846799.0, "step": 16185 }, { "entropy": 5.711656951904297, "epoch": 1.3601764335223692, "grad_norm": 0.9296875, "learning_rate": 0.000481858423102113, "loss": 5.2512, "mean_token_accuracy": 0.18040336966514586, "num_tokens": 29856263.0, "step": 16190 }, { "entropy": 5.646722412109375, "epoch": 1.3605965133375342, "grad_norm": 1.09375, "learning_rate": 0.0004818466405323994, "loss": 5.1787, "mean_token_accuracy": 0.1831099048256874, "num_tokens": 29864335.0, "step": 16195 }, { "entropy": 5.716898965835571, "epoch": 1.361016593152699, "grad_norm": 1.1640625, "learning_rate": 0.00048183485429850417, "loss": 5.2292, "mean_token_accuracy": 0.1823573648929596, "num_tokens": 29873466.0, "step": 16200 }, { "entropy": 5.608238315582275, "epoch": 1.361436672967864, "grad_norm": 1.171875, "learning_rate": 0.0004818230644006359, "loss": 5.1915, "mean_token_accuracy": 0.19537708014249802, "num_tokens": 29883051.0, "step": 16205 }, { "entropy": 5.667512893676758, "epoch": 1.3618567527830288, "grad_norm": 1.1015625, "learning_rate": 0.0004818112708390036, "loss": 5.1402, "mean_token_accuracy": 0.18765533119440078, "num_tokens": 29891823.0, "step": 16210 }, { "entropy": 5.664647626876831, "epoch": 1.3622768325981935, "grad_norm": 1.1328125, "learning_rate": 0.0004817994736138162, "loss": 5.1635, "mean_token_accuracy": 0.19000998884439468, "num_tokens": 29900735.0, "step": 16215 }, { "entropy": 5.680288505554199, "epoch": 1.3626969124133586, "grad_norm": 1.015625, "learning_rate": 0.0004817876727252824, "loss": 5.2356, "mean_token_accuracy": 0.19040718525648118, "num_tokens": 29910345.0, "step": 16220 }, { "entropy": 5.6547447681427006, "epoch": 1.3631169922285233, "grad_norm": 1.125, "learning_rate": 0.00048177586817361166, "loss": 5.1914, "mean_token_accuracy": 0.1873699352145195, "num_tokens": 29919650.0, "step": 16225 }, { "entropy": 5.761954259872437, "epoch": 1.3635370720436883, "grad_norm": 1.046875, "learning_rate": 0.0004817640599590128, "loss": 5.2384, "mean_token_accuracy": 0.18756576776504516, "num_tokens": 29928851.0, "step": 16230 }, { "entropy": 5.777277898788452, "epoch": 1.3639571518588531, "grad_norm": 0.9765625, "learning_rate": 0.00048175224808169506, "loss": 5.3031, "mean_token_accuracy": 0.18052376806735992, "num_tokens": 29939146.0, "step": 16235 }, { "entropy": 5.697104597091675, "epoch": 1.3643772316740181, "grad_norm": 1.0390625, "learning_rate": 0.00048174043254186775, "loss": 5.174, "mean_token_accuracy": 0.18092049062252044, "num_tokens": 29947556.0, "step": 16240 }, { "entropy": 5.669161653518676, "epoch": 1.364797311489183, "grad_norm": 0.9453125, "learning_rate": 0.0004817286133397401, "loss": 5.2284, "mean_token_accuracy": 0.18422749936580657, "num_tokens": 29957319.0, "step": 16245 }, { "entropy": 5.712401914596557, "epoch": 1.365217391304348, "grad_norm": 1.0546875, "learning_rate": 0.0004817167904755216, "loss": 5.2441, "mean_token_accuracy": 0.18295372128486634, "num_tokens": 29966697.0, "step": 16250 }, { "entropy": 5.714666652679443, "epoch": 1.3656374711195127, "grad_norm": 0.99609375, "learning_rate": 0.00048170496394942154, "loss": 5.2258, "mean_token_accuracy": 0.18755841255187988, "num_tokens": 29975103.0, "step": 16255 }, { "entropy": 5.620734214782715, "epoch": 1.3660575509346775, "grad_norm": 0.98828125, "learning_rate": 0.00048169313376164943, "loss": 5.1508, "mean_token_accuracy": 0.1868949458003044, "num_tokens": 29984865.0, "step": 16260 }, { "entropy": 5.693135118484497, "epoch": 1.3664776307498425, "grad_norm": 1.0625, "learning_rate": 0.00048168129991241497, "loss": 5.1728, "mean_token_accuracy": 0.1872917354106903, "num_tokens": 29994376.0, "step": 16265 }, { "entropy": 5.819542264938354, "epoch": 1.3668977105650073, "grad_norm": 1.046875, "learning_rate": 0.0004816694624019277, "loss": 5.3818, "mean_token_accuracy": 0.18195104598999023, "num_tokens": 30004846.0, "step": 16270 }, { "entropy": 5.687378931045532, "epoch": 1.3673177903801723, "grad_norm": 1.03125, "learning_rate": 0.00048165762123039723, "loss": 5.1933, "mean_token_accuracy": 0.19059983491897584, "num_tokens": 30014083.0, "step": 16275 }, { "entropy": 5.66285548210144, "epoch": 1.367737870195337, "grad_norm": 1.046875, "learning_rate": 0.00048164577639803354, "loss": 5.165, "mean_token_accuracy": 0.19134429544210435, "num_tokens": 30023606.0, "step": 16280 }, { "entropy": 5.59691047668457, "epoch": 1.3681579500105019, "grad_norm": 1.0, "learning_rate": 0.0004816339279050463, "loss": 5.1649, "mean_token_accuracy": 0.17939123064279555, "num_tokens": 30033657.0, "step": 16285 }, { "entropy": 5.630356025695801, "epoch": 1.368578029825667, "grad_norm": 1.0546875, "learning_rate": 0.00048162207575164537, "loss": 5.2046, "mean_token_accuracy": 0.18993112593889236, "num_tokens": 30043230.0, "step": 16290 }, { "entropy": 5.703529310226441, "epoch": 1.3689981096408317, "grad_norm": 0.95703125, "learning_rate": 0.00048161021993804075, "loss": 5.2373, "mean_token_accuracy": 0.18676026314496993, "num_tokens": 30054457.0, "step": 16295 }, { "entropy": 5.655398178100586, "epoch": 1.3694181894559967, "grad_norm": 1.0625, "learning_rate": 0.00048159836046444255, "loss": 5.1047, "mean_token_accuracy": 0.19381102919578552, "num_tokens": 30062912.0, "step": 16300 }, { "entropy": 5.6848639965057375, "epoch": 1.3698382692711615, "grad_norm": 1.125, "learning_rate": 0.0004815864973310607, "loss": 5.2476, "mean_token_accuracy": 0.18270398080348968, "num_tokens": 30071340.0, "step": 16305 }, { "entropy": 5.776359891891479, "epoch": 1.3702583490863263, "grad_norm": 1.0703125, "learning_rate": 0.00048157463053810553, "loss": 5.2977, "mean_token_accuracy": 0.18175773024559022, "num_tokens": 30080334.0, "step": 16310 }, { "entropy": 5.682469034194947, "epoch": 1.3706784289014913, "grad_norm": 1.15625, "learning_rate": 0.00048156276008578706, "loss": 5.1909, "mean_token_accuracy": 0.19033408761024476, "num_tokens": 30089391.0, "step": 16315 }, { "entropy": 5.6369740009307865, "epoch": 1.3710985087166563, "grad_norm": 0.96875, "learning_rate": 0.0004815508859743157, "loss": 5.1379, "mean_token_accuracy": 0.18886044174432753, "num_tokens": 30099027.0, "step": 16320 }, { "entropy": 5.648059892654419, "epoch": 1.371518588531821, "grad_norm": 1.109375, "learning_rate": 0.0004815390082039017, "loss": 5.1487, "mean_token_accuracy": 0.19226816296577454, "num_tokens": 30108088.0, "step": 16325 }, { "entropy": 5.62946515083313, "epoch": 1.3719386683469859, "grad_norm": 0.9765625, "learning_rate": 0.00048152712677475556, "loss": 5.1289, "mean_token_accuracy": 0.19087375849485397, "num_tokens": 30117768.0, "step": 16330 }, { "entropy": 5.696799039840698, "epoch": 1.3723587481621509, "grad_norm": 1.078125, "learning_rate": 0.00048151524168708773, "loss": 5.2588, "mean_token_accuracy": 0.18289670199155808, "num_tokens": 30126364.0, "step": 16335 }, { "entropy": 5.659416389465332, "epoch": 1.3727788279773157, "grad_norm": 1.0859375, "learning_rate": 0.00048150335294110867, "loss": 5.1912, "mean_token_accuracy": 0.1897234857082367, "num_tokens": 30135365.0, "step": 16340 }, { "entropy": 5.726896190643311, "epoch": 1.3731989077924807, "grad_norm": 1.25, "learning_rate": 0.00048149146053702915, "loss": 5.2602, "mean_token_accuracy": 0.19675673246383668, "num_tokens": 30145542.0, "step": 16345 }, { "entropy": 5.711710262298584, "epoch": 1.3736189876076454, "grad_norm": 1.125, "learning_rate": 0.0004814795644750597, "loss": 5.2755, "mean_token_accuracy": 0.18046986162662507, "num_tokens": 30154100.0, "step": 16350 }, { "entropy": 5.665477514266968, "epoch": 1.3740390674228102, "grad_norm": 1.0859375, "learning_rate": 0.00048146766475541105, "loss": 5.1811, "mean_token_accuracy": 0.18969687670469285, "num_tokens": 30162647.0, "step": 16355 }, { "entropy": 5.808218050003052, "epoch": 1.3744591472379752, "grad_norm": 1.34375, "learning_rate": 0.00048145576137829406, "loss": 5.3332, "mean_token_accuracy": 0.18027455359697342, "num_tokens": 30172518.0, "step": 16360 }, { "entropy": 5.712655782699585, "epoch": 1.37487922705314, "grad_norm": 0.97265625, "learning_rate": 0.0004814438543439195, "loss": 5.2487, "mean_token_accuracy": 0.1859870120882988, "num_tokens": 30183124.0, "step": 16365 }, { "entropy": 5.6926047801971436, "epoch": 1.375299306868305, "grad_norm": 1.046875, "learning_rate": 0.0004814319436524984, "loss": 5.2015, "mean_token_accuracy": 0.18640538454055786, "num_tokens": 30191861.0, "step": 16370 }, { "entropy": 5.600358867645264, "epoch": 1.3757193866834698, "grad_norm": 1.1875, "learning_rate": 0.00048142002930424174, "loss": 5.1071, "mean_token_accuracy": 0.19114326685667038, "num_tokens": 30200308.0, "step": 16375 }, { "entropy": 5.704294586181641, "epoch": 1.3761394664986346, "grad_norm": 1.0234375, "learning_rate": 0.0004814081112993605, "loss": 5.2086, "mean_token_accuracy": 0.19368355721235275, "num_tokens": 30209380.0, "step": 16380 }, { "entropy": 5.744526529312134, "epoch": 1.3765595463137996, "grad_norm": 1.015625, "learning_rate": 0.0004813961896380659, "loss": 5.2865, "mean_token_accuracy": 0.18788620680570603, "num_tokens": 30218549.0, "step": 16385 }, { "entropy": 5.726917123794555, "epoch": 1.3769796261289646, "grad_norm": 1.0078125, "learning_rate": 0.0004813842643205691, "loss": 5.2257, "mean_token_accuracy": 0.1853763610124588, "num_tokens": 30228119.0, "step": 16390 }, { "entropy": 5.730572271347046, "epoch": 1.3773997059441294, "grad_norm": 1.0078125, "learning_rate": 0.0004813723353470813, "loss": 5.2041, "mean_token_accuracy": 0.1828337773680687, "num_tokens": 30236765.0, "step": 16395 }, { "entropy": 5.682282447814941, "epoch": 1.3778197857592942, "grad_norm": 1.0390625, "learning_rate": 0.0004813604027178139, "loss": 5.1523, "mean_token_accuracy": 0.18526762425899507, "num_tokens": 30246089.0, "step": 16400 }, { "entropy": 5.690469121932983, "epoch": 1.3782398655744592, "grad_norm": 1.0625, "learning_rate": 0.00048134846643297817, "loss": 5.2567, "mean_token_accuracy": 0.18135033100843428, "num_tokens": 30255806.0, "step": 16405 }, { "entropy": 5.731490898132324, "epoch": 1.378659945389624, "grad_norm": 0.96875, "learning_rate": 0.0004813365264927856, "loss": 5.3288, "mean_token_accuracy": 0.1714249700307846, "num_tokens": 30267112.0, "step": 16410 }, { "entropy": 5.720319080352783, "epoch": 1.379080025204789, "grad_norm": 1.015625, "learning_rate": 0.0004813245828974477, "loss": 5.1902, "mean_token_accuracy": 0.186105976998806, "num_tokens": 30276168.0, "step": 16415 }, { "entropy": 5.647353410720825, "epoch": 1.3795001050199538, "grad_norm": 1.1015625, "learning_rate": 0.0004813126356471761, "loss": 5.2158, "mean_token_accuracy": 0.1868085652589798, "num_tokens": 30285723.0, "step": 16420 }, { "entropy": 5.776714181900024, "epoch": 1.3799201848351186, "grad_norm": 1.078125, "learning_rate": 0.0004813006847421824, "loss": 5.2814, "mean_token_accuracy": 0.18424659371376037, "num_tokens": 30294790.0, "step": 16425 }, { "entropy": 5.696031808853149, "epoch": 1.3803402646502836, "grad_norm": 1.0, "learning_rate": 0.0004812887301826783, "loss": 5.2056, "mean_token_accuracy": 0.18647325783967972, "num_tokens": 30303439.0, "step": 16430 }, { "entropy": 5.668688249588013, "epoch": 1.3807603444654484, "grad_norm": 1.0625, "learning_rate": 0.0004812767719688755, "loss": 5.1797, "mean_token_accuracy": 0.1818026915192604, "num_tokens": 30312493.0, "step": 16435 }, { "entropy": 5.7417408466339115, "epoch": 1.3811804242806134, "grad_norm": 1.0390625, "learning_rate": 0.0004812648101009859, "loss": 5.2173, "mean_token_accuracy": 0.19054559618234634, "num_tokens": 30321637.0, "step": 16440 }, { "entropy": 5.755763244628906, "epoch": 1.3816005040957782, "grad_norm": 0.96875, "learning_rate": 0.0004812528445792215, "loss": 5.3438, "mean_token_accuracy": 0.17596334517002105, "num_tokens": 30330730.0, "step": 16445 }, { "entropy": 5.627185344696045, "epoch": 1.382020583910943, "grad_norm": 1.0859375, "learning_rate": 0.00048124087540379407, "loss": 5.1903, "mean_token_accuracy": 0.18773747831583024, "num_tokens": 30339568.0, "step": 16450 }, { "entropy": 5.6316142082214355, "epoch": 1.382440663726108, "grad_norm": 1.1171875, "learning_rate": 0.00048122890257491573, "loss": 5.2198, "mean_token_accuracy": 0.174595382809639, "num_tokens": 30349225.0, "step": 16455 }, { "entropy": 5.70872130393982, "epoch": 1.382860743541273, "grad_norm": 0.9765625, "learning_rate": 0.00048121692609279866, "loss": 5.2191, "mean_token_accuracy": 0.19126648157835008, "num_tokens": 30358804.0, "step": 16460 }, { "entropy": 5.6989562034606935, "epoch": 1.3832808233564378, "grad_norm": 0.98828125, "learning_rate": 0.0004812049459576549, "loss": 5.2878, "mean_token_accuracy": 0.18524790108203887, "num_tokens": 30368490.0, "step": 16465 }, { "entropy": 5.744047784805298, "epoch": 1.3837009031716025, "grad_norm": 1.0390625, "learning_rate": 0.0004811929621696966, "loss": 5.1754, "mean_token_accuracy": 0.18408805429935454, "num_tokens": 30377117.0, "step": 16470 }, { "entropy": 5.583626079559326, "epoch": 1.3841209829867676, "grad_norm": 1.140625, "learning_rate": 0.00048118097472913627, "loss": 5.0836, "mean_token_accuracy": 0.1945900395512581, "num_tokens": 30385151.0, "step": 16475 }, { "entropy": 5.5814605236053465, "epoch": 1.3845410628019323, "grad_norm": 1.0625, "learning_rate": 0.0004811689836361861, "loss": 5.1139, "mean_token_accuracy": 0.19011847227811812, "num_tokens": 30394837.0, "step": 16480 }, { "entropy": 5.676617765426636, "epoch": 1.3849611426170974, "grad_norm": 1.0859375, "learning_rate": 0.0004811569888910585, "loss": 5.1747, "mean_token_accuracy": 0.19189244508743286, "num_tokens": 30403507.0, "step": 16485 }, { "entropy": 5.6457610607147215, "epoch": 1.3853812224322621, "grad_norm": 1.046875, "learning_rate": 0.0004811449904939661, "loss": 5.1639, "mean_token_accuracy": 0.19110038578510286, "num_tokens": 30412941.0, "step": 16490 }, { "entropy": 5.6220416069030765, "epoch": 1.385801302247427, "grad_norm": 1.0703125, "learning_rate": 0.00048113298844512127, "loss": 5.1765, "mean_token_accuracy": 0.19164388626813889, "num_tokens": 30421823.0, "step": 16495 }, { "entropy": 5.679866313934326, "epoch": 1.386221382062592, "grad_norm": 0.99609375, "learning_rate": 0.0004811209827447367, "loss": 5.2445, "mean_token_accuracy": 0.18197256922721863, "num_tokens": 30431901.0, "step": 16500 }, { "entropy": 5.704894161224365, "epoch": 1.3866414618777567, "grad_norm": 0.99609375, "learning_rate": 0.00048110897339302504, "loss": 5.1954, "mean_token_accuracy": 0.1848170652985573, "num_tokens": 30442037.0, "step": 16505 }, { "entropy": 5.715596866607666, "epoch": 1.3870615416929217, "grad_norm": 1.1171875, "learning_rate": 0.00048109696039019915, "loss": 5.1661, "mean_token_accuracy": 0.19520570188760758, "num_tokens": 30451189.0, "step": 16510 }, { "entropy": 5.6763568878173825, "epoch": 1.3874816215080865, "grad_norm": 1.0546875, "learning_rate": 0.0004810849437364716, "loss": 5.2393, "mean_token_accuracy": 0.18380953520536422, "num_tokens": 30460214.0, "step": 16515 }, { "entropy": 5.720759439468384, "epoch": 1.3879017013232513, "grad_norm": 1.0234375, "learning_rate": 0.00048107292343205546, "loss": 5.254, "mean_token_accuracy": 0.18680346459150315, "num_tokens": 30469936.0, "step": 16520 }, { "entropy": 5.68984088897705, "epoch": 1.3883217811384163, "grad_norm": 1.0234375, "learning_rate": 0.0004810608994771636, "loss": 5.1928, "mean_token_accuracy": 0.1872539833188057, "num_tokens": 30479282.0, "step": 16525 }, { "entropy": 5.7390711307525635, "epoch": 1.388741860953581, "grad_norm": 0.99609375, "learning_rate": 0.000481048871872009, "loss": 5.2345, "mean_token_accuracy": 0.18669211268424987, "num_tokens": 30487839.0, "step": 16530 }, { "entropy": 5.736377954483032, "epoch": 1.389161940768746, "grad_norm": 1.09375, "learning_rate": 0.00048103684061680463, "loss": 5.2794, "mean_token_accuracy": 0.18340542167425156, "num_tokens": 30497327.0, "step": 16535 }, { "entropy": 5.670967674255371, "epoch": 1.389582020583911, "grad_norm": 1.03125, "learning_rate": 0.00048102480571176384, "loss": 5.1725, "mean_token_accuracy": 0.19103943854570388, "num_tokens": 30506996.0, "step": 16540 }, { "entropy": 5.674752998352051, "epoch": 1.390002100399076, "grad_norm": 0.9921875, "learning_rate": 0.0004810127671570997, "loss": 5.1221, "mean_token_accuracy": 0.1949089929461479, "num_tokens": 30515627.0, "step": 16545 }, { "entropy": 5.6866960525512695, "epoch": 1.3904221802142407, "grad_norm": 0.91796875, "learning_rate": 0.00048100072495302544, "loss": 5.2362, "mean_token_accuracy": 0.1817348152399063, "num_tokens": 30525858.0, "step": 16550 }, { "entropy": 5.588730049133301, "epoch": 1.3908422600294057, "grad_norm": 1.0078125, "learning_rate": 0.0004809886790997544, "loss": 5.1546, "mean_token_accuracy": 0.1901202067732811, "num_tokens": 30536331.0, "step": 16555 }, { "entropy": 5.689899682998657, "epoch": 1.3912623398445705, "grad_norm": 0.97265625, "learning_rate": 0.0004809766295975, "loss": 5.1929, "mean_token_accuracy": 0.19101303666830063, "num_tokens": 30545329.0, "step": 16560 }, { "entropy": 5.639544916152954, "epoch": 1.3916824196597353, "grad_norm": 1.0078125, "learning_rate": 0.0004809645764464757, "loss": 5.1509, "mean_token_accuracy": 0.1832982763648033, "num_tokens": 30554357.0, "step": 16565 }, { "entropy": 5.690026140213012, "epoch": 1.3921024994749003, "grad_norm": 1.0390625, "learning_rate": 0.00048095251964689494, "loss": 5.3162, "mean_token_accuracy": 0.18092724233865737, "num_tokens": 30563548.0, "step": 16570 }, { "entropy": 5.6918659687042235, "epoch": 1.392522579290065, "grad_norm": 1.0546875, "learning_rate": 0.00048094045919897134, "loss": 5.2075, "mean_token_accuracy": 0.18539494425058364, "num_tokens": 30572844.0, "step": 16575 }, { "entropy": 5.65005054473877, "epoch": 1.39294265910523, "grad_norm": 1.0390625, "learning_rate": 0.0004809283951029185, "loss": 5.1369, "mean_token_accuracy": 0.19091492444276809, "num_tokens": 30580930.0, "step": 16580 }, { "entropy": 5.69403600692749, "epoch": 1.3933627389203949, "grad_norm": 1.0, "learning_rate": 0.0004809163273589503, "loss": 5.134, "mean_token_accuracy": 0.1900659143924713, "num_tokens": 30589917.0, "step": 16585 }, { "entropy": 5.671267509460449, "epoch": 1.3937828187355596, "grad_norm": 1.015625, "learning_rate": 0.00048090425596728035, "loss": 5.2294, "mean_token_accuracy": 0.18690518736839296, "num_tokens": 30599282.0, "step": 16590 }, { "entropy": 5.706273031234741, "epoch": 1.3942028985507247, "grad_norm": 1.1015625, "learning_rate": 0.00048089218092812254, "loss": 5.2171, "mean_token_accuracy": 0.18142667412757874, "num_tokens": 30608244.0, "step": 16595 }, { "entropy": 5.750440502166748, "epoch": 1.3946229783658894, "grad_norm": 1.1015625, "learning_rate": 0.00048088010224169064, "loss": 5.3357, "mean_token_accuracy": 0.1832943454384804, "num_tokens": 30617340.0, "step": 16600 }, { "entropy": 5.784826993942261, "epoch": 1.3950430581810545, "grad_norm": 1.078125, "learning_rate": 0.00048086801990819886, "loss": 5.2664, "mean_token_accuracy": 0.17451538890600204, "num_tokens": 30626244.0, "step": 16605 }, { "entropy": 5.71404128074646, "epoch": 1.3954631379962192, "grad_norm": 1.1015625, "learning_rate": 0.00048085593392786113, "loss": 5.2544, "mean_token_accuracy": 0.18758676797151566, "num_tokens": 30635279.0, "step": 16610 }, { "entropy": 5.747651672363281, "epoch": 1.395883217811384, "grad_norm": 0.9765625, "learning_rate": 0.0004808438443008915, "loss": 5.3748, "mean_token_accuracy": 0.17949276715517043, "num_tokens": 30645790.0, "step": 16615 }, { "entropy": 5.699119615554809, "epoch": 1.396303297626549, "grad_norm": 1.015625, "learning_rate": 0.0004808317510275041, "loss": 5.2189, "mean_token_accuracy": 0.1780444011092186, "num_tokens": 30654497.0, "step": 16620 }, { "entropy": 5.732566070556641, "epoch": 1.396723377441714, "grad_norm": 1.0078125, "learning_rate": 0.0004808196541079133, "loss": 5.2802, "mean_token_accuracy": 0.18494397699832915, "num_tokens": 30663760.0, "step": 16625 }, { "entropy": 5.693180799484253, "epoch": 1.3971434572568788, "grad_norm": 1.109375, "learning_rate": 0.00048080755354233326, "loss": 5.2824, "mean_token_accuracy": 0.1861307740211487, "num_tokens": 30674263.0, "step": 16630 }, { "entropy": 5.6682196140289305, "epoch": 1.3975635370720436, "grad_norm": 1.09375, "learning_rate": 0.0004807954493309784, "loss": 5.1573, "mean_token_accuracy": 0.18905293494462966, "num_tokens": 30683501.0, "step": 16635 }, { "entropy": 5.709077405929565, "epoch": 1.3979836168872086, "grad_norm": 1.140625, "learning_rate": 0.00048078334147406314, "loss": 5.1599, "mean_token_accuracy": 0.19806634038686752, "num_tokens": 30691917.0, "step": 16640 }, { "entropy": 5.67047815322876, "epoch": 1.3984036967023734, "grad_norm": 1.125, "learning_rate": 0.00048077122997180197, "loss": 5.2215, "mean_token_accuracy": 0.18722850531339646, "num_tokens": 30701753.0, "step": 16645 }, { "entropy": 5.610556745529175, "epoch": 1.3988237765175384, "grad_norm": 1.078125, "learning_rate": 0.0004807591148244093, "loss": 5.1876, "mean_token_accuracy": 0.18866357952356339, "num_tokens": 30710878.0, "step": 16650 }, { "entropy": 5.637104606628418, "epoch": 1.3992438563327032, "grad_norm": 0.9765625, "learning_rate": 0.0004807469960321, "loss": 5.1032, "mean_token_accuracy": 0.19152369797229768, "num_tokens": 30719372.0, "step": 16655 }, { "entropy": 5.721623516082763, "epoch": 1.399663936147868, "grad_norm": 1.0078125, "learning_rate": 0.00048073487359508854, "loss": 5.2758, "mean_token_accuracy": 0.18131039142608643, "num_tokens": 30728529.0, "step": 16660 }, { "entropy": 5.716696882247925, "epoch": 1.400084015963033, "grad_norm": 1.078125, "learning_rate": 0.00048072274751358976, "loss": 5.1966, "mean_token_accuracy": 0.19203683733940125, "num_tokens": 30737704.0, "step": 16665 }, { "entropy": 5.672592210769653, "epoch": 1.4005040957781978, "grad_norm": 0.9921875, "learning_rate": 0.00048071061778781843, "loss": 5.1692, "mean_token_accuracy": 0.18804021030664445, "num_tokens": 30747836.0, "step": 16670 }, { "entropy": 5.636957120895386, "epoch": 1.4009241755933628, "grad_norm": 0.9765625, "learning_rate": 0.0004806984844179894, "loss": 5.2443, "mean_token_accuracy": 0.18261843025684357, "num_tokens": 30757881.0, "step": 16675 }, { "entropy": 5.716638612747192, "epoch": 1.4013442554085276, "grad_norm": 1.0625, "learning_rate": 0.00048068634740431774, "loss": 5.2483, "mean_token_accuracy": 0.17776536494493483, "num_tokens": 30767592.0, "step": 16680 }, { "entropy": 5.670879268646241, "epoch": 1.4017643352236924, "grad_norm": 1.2109375, "learning_rate": 0.0004806742067470182, "loss": 5.2344, "mean_token_accuracy": 0.1914249375462532, "num_tokens": 30776633.0, "step": 16685 }, { "entropy": 5.709184074401856, "epoch": 1.4021844150388574, "grad_norm": 1.09375, "learning_rate": 0.00048066206244630613, "loss": 5.1622, "mean_token_accuracy": 0.18570322543382645, "num_tokens": 30785195.0, "step": 16690 }, { "entropy": 5.648166179656982, "epoch": 1.4026044948540224, "grad_norm": 1.03125, "learning_rate": 0.00048064991450239643, "loss": 5.173, "mean_token_accuracy": 0.18653524518013, "num_tokens": 30794397.0, "step": 16695 }, { "entropy": 5.782354164123535, "epoch": 1.4030245746691872, "grad_norm": 1.0625, "learning_rate": 0.00048063776291550444, "loss": 5.3387, "mean_token_accuracy": 0.17594273537397384, "num_tokens": 30803312.0, "step": 16700 }, { "entropy": 5.713446187973022, "epoch": 1.403444654484352, "grad_norm": 1.078125, "learning_rate": 0.00048062560768584537, "loss": 5.2243, "mean_token_accuracy": 0.19707240015268326, "num_tokens": 30812519.0, "step": 16705 }, { "entropy": 5.663441753387451, "epoch": 1.403864734299517, "grad_norm": 1.0390625, "learning_rate": 0.00048061344881363444, "loss": 5.1807, "mean_token_accuracy": 0.19155540466308593, "num_tokens": 30821558.0, "step": 16710 }, { "entropy": 5.735174226760864, "epoch": 1.4042848141146818, "grad_norm": 1.015625, "learning_rate": 0.0004806012862990873, "loss": 5.2135, "mean_token_accuracy": 0.18326453417539595, "num_tokens": 30831521.0, "step": 16715 }, { "entropy": 5.721960067749023, "epoch": 1.4047048939298468, "grad_norm": 1.0, "learning_rate": 0.00048058912014241914, "loss": 5.1878, "mean_token_accuracy": 0.18546638935804366, "num_tokens": 30841191.0, "step": 16720 }, { "entropy": 5.717618131637574, "epoch": 1.4051249737450116, "grad_norm": 1.1171875, "learning_rate": 0.0004805769503438456, "loss": 5.2886, "mean_token_accuracy": 0.18482558876276017, "num_tokens": 30850556.0, "step": 16725 }, { "entropy": 5.69296293258667, "epoch": 1.4055450535601763, "grad_norm": 1.125, "learning_rate": 0.00048056477690358227, "loss": 5.1936, "mean_token_accuracy": 0.19293549209833144, "num_tokens": 30859410.0, "step": 16730 }, { "entropy": 5.795737361907959, "epoch": 1.4059651333753413, "grad_norm": 1.1796875, "learning_rate": 0.0004805525998218447, "loss": 5.2208, "mean_token_accuracy": 0.18415172547101974, "num_tokens": 30868048.0, "step": 16735 }, { "entropy": 5.700779962539673, "epoch": 1.4063852131905061, "grad_norm": 1.140625, "learning_rate": 0.00048054041909884873, "loss": 5.2378, "mean_token_accuracy": 0.18689762949943542, "num_tokens": 30876785.0, "step": 16740 }, { "entropy": 5.754944515228272, "epoch": 1.4068052930056711, "grad_norm": 1.140625, "learning_rate": 0.00048052823473481007, "loss": 5.2928, "mean_token_accuracy": 0.1823737531900406, "num_tokens": 30886158.0, "step": 16745 }, { "entropy": 5.746382093429565, "epoch": 1.407225372820836, "grad_norm": 1.0703125, "learning_rate": 0.00048051604672994446, "loss": 5.1748, "mean_token_accuracy": 0.1862805724143982, "num_tokens": 30895283.0, "step": 16750 }, { "entropy": 5.676848983764648, "epoch": 1.4076454526360007, "grad_norm": 0.984375, "learning_rate": 0.00048050385508446804, "loss": 5.1827, "mean_token_accuracy": 0.1910940334200859, "num_tokens": 30905514.0, "step": 16755 }, { "entropy": 5.683502435684204, "epoch": 1.4080655324511657, "grad_norm": 0.94921875, "learning_rate": 0.00048049165979859655, "loss": 5.1139, "mean_token_accuracy": 0.20378468781709672, "num_tokens": 30914794.0, "step": 16760 }, { "entropy": 5.654257583618164, "epoch": 1.4084856122663307, "grad_norm": 1.0, "learning_rate": 0.00048047946087254615, "loss": 5.1604, "mean_token_accuracy": 0.1820305034518242, "num_tokens": 30923823.0, "step": 16765 }, { "entropy": 5.657008981704712, "epoch": 1.4089056920814955, "grad_norm": 1.1328125, "learning_rate": 0.00048046725830653295, "loss": 5.2355, "mean_token_accuracy": 0.18716273605823516, "num_tokens": 30932738.0, "step": 16770 }, { "entropy": 5.682144641876221, "epoch": 1.4093257718966603, "grad_norm": 1.0859375, "learning_rate": 0.00048045505210077304, "loss": 5.2735, "mean_token_accuracy": 0.17982773035764693, "num_tokens": 30942302.0, "step": 16775 }, { "entropy": 5.714016580581665, "epoch": 1.4097458517118253, "grad_norm": 1.15625, "learning_rate": 0.0004804428422554826, "loss": 5.1542, "mean_token_accuracy": 0.19103640466928482, "num_tokens": 30951662.0, "step": 16780 }, { "entropy": 5.703213548660278, "epoch": 1.41016593152699, "grad_norm": 1.0390625, "learning_rate": 0.0004804306287708782, "loss": 5.2068, "mean_token_accuracy": 0.1919228583574295, "num_tokens": 30960475.0, "step": 16785 }, { "entropy": 5.625126647949219, "epoch": 1.410586011342155, "grad_norm": 1.0546875, "learning_rate": 0.00048041841164717574, "loss": 5.0487, "mean_token_accuracy": 0.19472707509994508, "num_tokens": 30969075.0, "step": 16790 }, { "entropy": 5.646886253356934, "epoch": 1.41100609115732, "grad_norm": 1.1875, "learning_rate": 0.0004804061908845921, "loss": 5.1193, "mean_token_accuracy": 0.19407358765602112, "num_tokens": 30978030.0, "step": 16795 }, { "entropy": 5.658605098724365, "epoch": 1.4114261709724847, "grad_norm": 1.1953125, "learning_rate": 0.00048039396648334346, "loss": 5.0984, "mean_token_accuracy": 0.19156255424022675, "num_tokens": 30985639.0, "step": 16800 }, { "entropy": 5.686256742477417, "epoch": 1.4118462507876497, "grad_norm": 1.03125, "learning_rate": 0.0004803817384436465, "loss": 5.2137, "mean_token_accuracy": 0.19015444815158844, "num_tokens": 30994811.0, "step": 16805 }, { "entropy": 5.72760796546936, "epoch": 1.4122663306028145, "grad_norm": 1.2265625, "learning_rate": 0.0004803695067657178, "loss": 5.1934, "mean_token_accuracy": 0.18725541234016418, "num_tokens": 31003813.0, "step": 16810 }, { "entropy": 5.681657505035401, "epoch": 1.4126864104179795, "grad_norm": 1.0625, "learning_rate": 0.000480357271449774, "loss": 5.1607, "mean_token_accuracy": 0.19405942559242248, "num_tokens": 31012488.0, "step": 16815 }, { "entropy": 5.650327348709107, "epoch": 1.4131064902331443, "grad_norm": 1.078125, "learning_rate": 0.0004803450324960318, "loss": 5.1736, "mean_token_accuracy": 0.18883284479379653, "num_tokens": 31021089.0, "step": 16820 }, { "entropy": 5.702486848831176, "epoch": 1.413526570048309, "grad_norm": 1.0859375, "learning_rate": 0.00048033278990470825, "loss": 5.1954, "mean_token_accuracy": 0.18749319463968278, "num_tokens": 31029903.0, "step": 16825 }, { "entropy": 5.688267612457276, "epoch": 1.413946649863474, "grad_norm": 1.0078125, "learning_rate": 0.00048032054367601996, "loss": 5.1979, "mean_token_accuracy": 0.1839518740773201, "num_tokens": 31039207.0, "step": 16830 }, { "entropy": 5.687576341629028, "epoch": 1.414366729678639, "grad_norm": 1.078125, "learning_rate": 0.00048030829381018396, "loss": 5.2219, "mean_token_accuracy": 0.18286307007074357, "num_tokens": 31048190.0, "step": 16835 }, { "entropy": 5.722398376464843, "epoch": 1.4147868094938039, "grad_norm": 1.0625, "learning_rate": 0.0004802960403074173, "loss": 5.2754, "mean_token_accuracy": 0.19086665362119676, "num_tokens": 31058769.0, "step": 16840 }, { "entropy": 5.717297744750977, "epoch": 1.4152068893089687, "grad_norm": 1.0703125, "learning_rate": 0.00048028378316793705, "loss": 5.2618, "mean_token_accuracy": 0.18488862365484238, "num_tokens": 31066830.0, "step": 16845 }, { "entropy": 5.777028560638428, "epoch": 1.4156269691241337, "grad_norm": 0.99609375, "learning_rate": 0.0004802715223919602, "loss": 5.2911, "mean_token_accuracy": 0.18663453608751296, "num_tokens": 31077205.0, "step": 16850 }, { "entropy": 5.750258255004883, "epoch": 1.4160470489392984, "grad_norm": 1.03125, "learning_rate": 0.00048025925797970403, "loss": 5.2018, "mean_token_accuracy": 0.19433569461107253, "num_tokens": 31087327.0, "step": 16855 }, { "entropy": 5.674712705612182, "epoch": 1.4164671287544635, "grad_norm": 1.0078125, "learning_rate": 0.00048024698993138587, "loss": 5.1636, "mean_token_accuracy": 0.1901378110051155, "num_tokens": 31096501.0, "step": 16860 }, { "entropy": 5.739978456497193, "epoch": 1.4168872085696282, "grad_norm": 1.078125, "learning_rate": 0.00048023471824722294, "loss": 5.3408, "mean_token_accuracy": 0.17992465794086457, "num_tokens": 31105949.0, "step": 16865 }, { "entropy": 5.735897445678711, "epoch": 1.417307288384793, "grad_norm": 1.1171875, "learning_rate": 0.00048022244292743256, "loss": 5.2339, "mean_token_accuracy": 0.17636758387088775, "num_tokens": 31115482.0, "step": 16870 }, { "entropy": 5.745817995071411, "epoch": 1.417727368199958, "grad_norm": 1.046875, "learning_rate": 0.00048021016397223234, "loss": 5.2045, "mean_token_accuracy": 0.18776648193597795, "num_tokens": 31124758.0, "step": 16875 }, { "entropy": 5.6385456085205075, "epoch": 1.4181474480151228, "grad_norm": 1.1171875, "learning_rate": 0.00048019788138183977, "loss": 5.0566, "mean_token_accuracy": 0.20038048774003983, "num_tokens": 31134114.0, "step": 16880 }, { "entropy": 5.590808391571045, "epoch": 1.4185675278302878, "grad_norm": 1.09375, "learning_rate": 0.00048018559515647244, "loss": 5.1143, "mean_token_accuracy": 0.1906316876411438, "num_tokens": 31142667.0, "step": 16885 }, { "entropy": 5.672430086135864, "epoch": 1.4189876076454526, "grad_norm": 1.03125, "learning_rate": 0.00048017330529634785, "loss": 5.2219, "mean_token_accuracy": 0.1828984424471855, "num_tokens": 31152105.0, "step": 16890 }, { "entropy": 5.72483115196228, "epoch": 1.4194076874606174, "grad_norm": 1.1015625, "learning_rate": 0.00048016101180168376, "loss": 5.2022, "mean_token_accuracy": 0.19421773701906203, "num_tokens": 31160277.0, "step": 16895 }, { "entropy": 5.7965850830078125, "epoch": 1.4198277672757824, "grad_norm": 1.0, "learning_rate": 0.00048014871467269804, "loss": 5.3801, "mean_token_accuracy": 0.17789032757282258, "num_tokens": 31170677.0, "step": 16900 }, { "entropy": 5.688093852996826, "epoch": 1.4202478470909472, "grad_norm": 1.0234375, "learning_rate": 0.00048013641390960856, "loss": 5.1942, "mean_token_accuracy": 0.18097881227731705, "num_tokens": 31179298.0, "step": 16905 }, { "entropy": 5.714810943603515, "epoch": 1.4206679269061122, "grad_norm": 1.046875, "learning_rate": 0.0004801241095126331, "loss": 5.2111, "mean_token_accuracy": 0.1831005573272705, "num_tokens": 31188547.0, "step": 16910 }, { "entropy": 5.693942213058472, "epoch": 1.421088006721277, "grad_norm": 1.0390625, "learning_rate": 0.0004801118014819896, "loss": 5.213, "mean_token_accuracy": 0.18663998395204545, "num_tokens": 31197680.0, "step": 16915 }, { "entropy": 5.67077603340149, "epoch": 1.421508086536442, "grad_norm": 1.1171875, "learning_rate": 0.0004800994898178962, "loss": 5.1639, "mean_token_accuracy": 0.19014709442853928, "num_tokens": 31206351.0, "step": 16920 }, { "entropy": 5.713833236694336, "epoch": 1.4219281663516068, "grad_norm": 1.03125, "learning_rate": 0.0004800871745205708, "loss": 5.3525, "mean_token_accuracy": 0.17853694260120392, "num_tokens": 31216478.0, "step": 16925 }, { "entropy": 5.787515020370483, "epoch": 1.4223482461667718, "grad_norm": 1.0625, "learning_rate": 0.00048007485559023195, "loss": 5.3068, "mean_token_accuracy": 0.1799660935997963, "num_tokens": 31225920.0, "step": 16930 }, { "entropy": 5.672777557373047, "epoch": 1.4227683259819366, "grad_norm": 1.0703125, "learning_rate": 0.0004800625330270975, "loss": 5.1774, "mean_token_accuracy": 0.18758865147829057, "num_tokens": 31235061.0, "step": 16935 }, { "entropy": 5.613970756530762, "epoch": 1.4231884057971014, "grad_norm": 1.1015625, "learning_rate": 0.0004800502068313859, "loss": 5.1608, "mean_token_accuracy": 0.1931985855102539, "num_tokens": 31243448.0, "step": 16940 }, { "entropy": 5.727277946472168, "epoch": 1.4236084856122664, "grad_norm": 1.1875, "learning_rate": 0.0004800378770033154, "loss": 5.2586, "mean_token_accuracy": 0.18508874028921127, "num_tokens": 31252569.0, "step": 16945 }, { "entropy": 5.693600845336914, "epoch": 1.4240285654274312, "grad_norm": 1.0, "learning_rate": 0.0004800255435431046, "loss": 5.1502, "mean_token_accuracy": 0.19388384222984315, "num_tokens": 31261905.0, "step": 16950 }, { "entropy": 5.630561828613281, "epoch": 1.4244486452425962, "grad_norm": 1.0, "learning_rate": 0.00048001320645097177, "loss": 5.1304, "mean_token_accuracy": 0.1940467670559883, "num_tokens": 31271203.0, "step": 16955 }, { "entropy": 5.663483905792236, "epoch": 1.424868725057761, "grad_norm": 1.0390625, "learning_rate": 0.00048000086572713566, "loss": 5.1438, "mean_token_accuracy": 0.19396023750305175, "num_tokens": 31279812.0, "step": 16960 }, { "entropy": 5.617663431167602, "epoch": 1.4252888048729258, "grad_norm": 1.0859375, "learning_rate": 0.0004799885213718147, "loss": 5.1695, "mean_token_accuracy": 0.18534606248140334, "num_tokens": 31289615.0, "step": 16965 }, { "entropy": 5.63858380317688, "epoch": 1.4257088846880908, "grad_norm": 1.0703125, "learning_rate": 0.00047997617338522763, "loss": 5.1448, "mean_token_accuracy": 0.19058055579662322, "num_tokens": 31298947.0, "step": 16970 }, { "entropy": 5.634194278717041, "epoch": 1.4261289645032555, "grad_norm": 1.21875, "learning_rate": 0.00047996382176759324, "loss": 5.1249, "mean_token_accuracy": 0.18965020924806594, "num_tokens": 31307465.0, "step": 16975 }, { "entropy": 5.636101198196411, "epoch": 1.4265490443184206, "grad_norm": 1.03125, "learning_rate": 0.0004799514665191303, "loss": 5.2282, "mean_token_accuracy": 0.18445250540971755, "num_tokens": 31317682.0, "step": 16980 }, { "entropy": 5.704889011383057, "epoch": 1.4269691241335853, "grad_norm": 1.1875, "learning_rate": 0.0004799391076400576, "loss": 5.2396, "mean_token_accuracy": 0.1820172920823097, "num_tokens": 31326113.0, "step": 16985 }, { "entropy": 5.770142030715943, "epoch": 1.4273892039487501, "grad_norm": 1.0859375, "learning_rate": 0.00047992674513059415, "loss": 5.2698, "mean_token_accuracy": 0.18564314246177674, "num_tokens": 31335263.0, "step": 16990 }, { "entropy": 5.697316455841064, "epoch": 1.4278092837639151, "grad_norm": 1.0859375, "learning_rate": 0.00047991437899095896, "loss": 5.2009, "mean_token_accuracy": 0.19329051226377486, "num_tokens": 31344503.0, "step": 16995 }, { "entropy": 5.686694574356079, "epoch": 1.4282293635790801, "grad_norm": 1.125, "learning_rate": 0.00047990200922137105, "loss": 5.2454, "mean_token_accuracy": 0.18646418154239655, "num_tokens": 31354530.0, "step": 17000 }, { "entropy": 5.649638223648071, "epoch": 1.428649443394245, "grad_norm": 1.078125, "learning_rate": 0.0004798896358220496, "loss": 5.0922, "mean_token_accuracy": 0.19509213864803315, "num_tokens": 31362761.0, "step": 17005 }, { "entropy": 5.609728384017944, "epoch": 1.4290695232094097, "grad_norm": 1.0078125, "learning_rate": 0.0004798772587932137, "loss": 5.0939, "mean_token_accuracy": 0.18893544226884842, "num_tokens": 31372933.0, "step": 17010 }, { "entropy": 5.742751121520996, "epoch": 1.4294896030245747, "grad_norm": 1.0234375, "learning_rate": 0.0004798648781350826, "loss": 5.2775, "mean_token_accuracy": 0.18596864938735963, "num_tokens": 31382651.0, "step": 17015 }, { "entropy": 5.695306062698364, "epoch": 1.4299096828397395, "grad_norm": 0.95703125, "learning_rate": 0.0004798524938478758, "loss": 5.2471, "mean_token_accuracy": 0.18174055814743043, "num_tokens": 31392272.0, "step": 17020 }, { "entropy": 5.740943193435669, "epoch": 1.4303297626549045, "grad_norm": 1.0625, "learning_rate": 0.0004798401059318124, "loss": 5.1698, "mean_token_accuracy": 0.18745066374540328, "num_tokens": 31400684.0, "step": 17025 }, { "entropy": 5.696807527542115, "epoch": 1.4307498424700693, "grad_norm": 1.0625, "learning_rate": 0.0004798277143871122, "loss": 5.1501, "mean_token_accuracy": 0.19832424372434615, "num_tokens": 31409082.0, "step": 17030 }, { "entropy": 5.595523929595947, "epoch": 1.431169922285234, "grad_norm": 1.0390625, "learning_rate": 0.0004798153192139944, "loss": 5.1124, "mean_token_accuracy": 0.19344846457242965, "num_tokens": 31417415.0, "step": 17035 }, { "entropy": 5.690292453765869, "epoch": 1.431590002100399, "grad_norm": 1.0234375, "learning_rate": 0.0004798029204126786, "loss": 5.2528, "mean_token_accuracy": 0.19225176721811293, "num_tokens": 31427510.0, "step": 17040 }, { "entropy": 5.688080263137818, "epoch": 1.432010081915564, "grad_norm": 1.0703125, "learning_rate": 0.0004797905179833847, "loss": 5.1296, "mean_token_accuracy": 0.18983213752508163, "num_tokens": 31436187.0, "step": 17045 }, { "entropy": 5.666804075241089, "epoch": 1.432430161730729, "grad_norm": 1.1015625, "learning_rate": 0.0004797781119263321, "loss": 5.143, "mean_token_accuracy": 0.18700398057699202, "num_tokens": 31445179.0, "step": 17050 }, { "entropy": 5.70466046333313, "epoch": 1.4328502415458937, "grad_norm": 1.09375, "learning_rate": 0.0004797657022417408, "loss": 5.2142, "mean_token_accuracy": 0.1858804851770401, "num_tokens": 31454434.0, "step": 17055 }, { "entropy": 5.6890199184417725, "epoch": 1.4332703213610585, "grad_norm": 1.1640625, "learning_rate": 0.00047975328892983045, "loss": 5.2167, "mean_token_accuracy": 0.18929940611124038, "num_tokens": 31464202.0, "step": 17060 }, { "entropy": 5.615249872207642, "epoch": 1.4336904011762235, "grad_norm": 1.0, "learning_rate": 0.00047974087199082095, "loss": 5.107, "mean_token_accuracy": 0.19815172106027604, "num_tokens": 31473158.0, "step": 17065 }, { "entropy": 5.621609020233154, "epoch": 1.4341104809913885, "grad_norm": 1.09375, "learning_rate": 0.00047972845142493244, "loss": 5.146, "mean_token_accuracy": 0.18635354936122894, "num_tokens": 31482643.0, "step": 17070 }, { "entropy": 5.6963695049285885, "epoch": 1.4345305608065533, "grad_norm": 1.0234375, "learning_rate": 0.0004797160272323848, "loss": 5.1972, "mean_token_accuracy": 0.1903929516673088, "num_tokens": 31492080.0, "step": 17075 }, { "entropy": 5.670864725112915, "epoch": 1.434950640621718, "grad_norm": 1.1171875, "learning_rate": 0.00047970359941339815, "loss": 5.1465, "mean_token_accuracy": 0.19422874897718428, "num_tokens": 31501990.0, "step": 17080 }, { "entropy": 5.665297794342041, "epoch": 1.435370720436883, "grad_norm": 1.0703125, "learning_rate": 0.0004796911679681926, "loss": 5.2301, "mean_token_accuracy": 0.18239501863718033, "num_tokens": 31510548.0, "step": 17085 }, { "entropy": 5.715258550643921, "epoch": 1.4357908002520479, "grad_norm": 1.125, "learning_rate": 0.00047967873289698847, "loss": 5.197, "mean_token_accuracy": 0.18592957705259322, "num_tokens": 31518695.0, "step": 17090 }, { "entropy": 5.8106156349182125, "epoch": 1.4362108800672129, "grad_norm": 1.0703125, "learning_rate": 0.00047966629420000595, "loss": 5.3124, "mean_token_accuracy": 0.18350915610790253, "num_tokens": 31528021.0, "step": 17095 }, { "entropy": 5.815104818344116, "epoch": 1.4366309598823777, "grad_norm": 1.1640625, "learning_rate": 0.0004796538518774654, "loss": 5.2945, "mean_token_accuracy": 0.1877882570028305, "num_tokens": 31537786.0, "step": 17100 }, { "entropy": 5.698870658874512, "epoch": 1.4370510396975424, "grad_norm": 1.140625, "learning_rate": 0.00047964140592958725, "loss": 5.2376, "mean_token_accuracy": 0.18914994299411775, "num_tokens": 31548006.0, "step": 17105 }, { "entropy": 5.712019777297973, "epoch": 1.4374711195127075, "grad_norm": 1.1484375, "learning_rate": 0.000479628956356592, "loss": 5.1845, "mean_token_accuracy": 0.18605631738901138, "num_tokens": 31557042.0, "step": 17110 }, { "entropy": 5.720201921463013, "epoch": 1.4378911993278722, "grad_norm": 1.03125, "learning_rate": 0.0004796165031587001, "loss": 5.2087, "mean_token_accuracy": 0.18433595597743987, "num_tokens": 31566661.0, "step": 17115 }, { "entropy": 5.724954128265381, "epoch": 1.4383112791430372, "grad_norm": 1.1328125, "learning_rate": 0.0004796040463361323, "loss": 5.1764, "mean_token_accuracy": 0.2017846956849098, "num_tokens": 31575724.0, "step": 17120 }, { "entropy": 5.737515115737915, "epoch": 1.438731358958202, "grad_norm": 1.046875, "learning_rate": 0.0004795915858891091, "loss": 5.2401, "mean_token_accuracy": 0.18774639070034027, "num_tokens": 31585068.0, "step": 17125 }, { "entropy": 5.72348918914795, "epoch": 1.4391514387733668, "grad_norm": 1.03125, "learning_rate": 0.0004795791218178514, "loss": 5.303, "mean_token_accuracy": 0.18364842385053634, "num_tokens": 31594629.0, "step": 17130 }, { "entropy": 5.677643728256226, "epoch": 1.4395715185885318, "grad_norm": 1.0625, "learning_rate": 0.00047956665412257984, "loss": 5.1624, "mean_token_accuracy": 0.19037005454301834, "num_tokens": 31603469.0, "step": 17135 }, { "entropy": 5.695523071289062, "epoch": 1.4399915984036968, "grad_norm": 1.1484375, "learning_rate": 0.00047955418280351526, "loss": 5.1347, "mean_token_accuracy": 0.19423467069864273, "num_tokens": 31611674.0, "step": 17140 }, { "entropy": 5.809403038024902, "epoch": 1.4404116782188616, "grad_norm": 1.0703125, "learning_rate": 0.0004795417078608788, "loss": 5.3764, "mean_token_accuracy": 0.17637077122926711, "num_tokens": 31621863.0, "step": 17145 }, { "entropy": 5.764920473098755, "epoch": 1.4408317580340264, "grad_norm": 1.1171875, "learning_rate": 0.00047952922929489126, "loss": 5.2358, "mean_token_accuracy": 0.18378381729125975, "num_tokens": 31630968.0, "step": 17150 }, { "entropy": 5.696118783950806, "epoch": 1.4412518378491914, "grad_norm": 1.0546875, "learning_rate": 0.00047951674710577366, "loss": 5.2366, "mean_token_accuracy": 0.18363456279039383, "num_tokens": 31640643.0, "step": 17155 }, { "entropy": 5.606297492980957, "epoch": 1.4416719176643562, "grad_norm": 1.0703125, "learning_rate": 0.00047950426129374723, "loss": 5.1521, "mean_token_accuracy": 0.19056872576475142, "num_tokens": 31648941.0, "step": 17160 }, { "entropy": 5.6977440357208256, "epoch": 1.4420919974795212, "grad_norm": 1.03125, "learning_rate": 0.00047949177185903314, "loss": 5.2351, "mean_token_accuracy": 0.19023771733045577, "num_tokens": 31658019.0, "step": 17165 }, { "entropy": 5.734630489349366, "epoch": 1.442512077294686, "grad_norm": 0.99609375, "learning_rate": 0.0004794792788018526, "loss": 5.2585, "mean_token_accuracy": 0.17784494757652283, "num_tokens": 31668050.0, "step": 17170 }, { "entropy": 5.678135967254638, "epoch": 1.4429321571098508, "grad_norm": 1.171875, "learning_rate": 0.000479466782122427, "loss": 5.1114, "mean_token_accuracy": 0.1903393715620041, "num_tokens": 31676727.0, "step": 17175 }, { "entropy": 5.688620853424072, "epoch": 1.4433522369250158, "grad_norm": 1.03125, "learning_rate": 0.00047945428182097756, "loss": 5.2272, "mean_token_accuracy": 0.18040607422590255, "num_tokens": 31686205.0, "step": 17180 }, { "entropy": 5.7261570453643795, "epoch": 1.4437723167401806, "grad_norm": 1.0546875, "learning_rate": 0.00047944177789772583, "loss": 5.2675, "mean_token_accuracy": 0.18105460703372955, "num_tokens": 31695521.0, "step": 17185 }, { "entropy": 5.78768138885498, "epoch": 1.4441923965553456, "grad_norm": 0.9765625, "learning_rate": 0.0004794292703528932, "loss": 5.2886, "mean_token_accuracy": 0.17410886883735657, "num_tokens": 31706606.0, "step": 17190 }, { "entropy": 5.760098600387574, "epoch": 1.4446124763705104, "grad_norm": 1.046875, "learning_rate": 0.00047941675918670133, "loss": 5.3671, "mean_token_accuracy": 0.17306947112083435, "num_tokens": 31716881.0, "step": 17195 }, { "entropy": 5.682158613204956, "epoch": 1.4450325561856752, "grad_norm": 1.0234375, "learning_rate": 0.0004794042443993719, "loss": 5.1599, "mean_token_accuracy": 0.18410923779010774, "num_tokens": 31725878.0, "step": 17200 }, { "entropy": 5.659201383590698, "epoch": 1.4454526360008402, "grad_norm": 1.046875, "learning_rate": 0.0004793917259911265, "loss": 5.2272, "mean_token_accuracy": 0.18736049383878708, "num_tokens": 31735033.0, "step": 17205 }, { "entropy": 5.647129344940185, "epoch": 1.445872715816005, "grad_norm": 1.0078125, "learning_rate": 0.0004793792039621869, "loss": 5.2032, "mean_token_accuracy": 0.18861926794052125, "num_tokens": 31744887.0, "step": 17210 }, { "entropy": 5.7151007652282715, "epoch": 1.44629279563117, "grad_norm": 1.03125, "learning_rate": 0.00047936667831277504, "loss": 5.257, "mean_token_accuracy": 0.18106930553913117, "num_tokens": 31754137.0, "step": 17215 }, { "entropy": 5.7163918018341064, "epoch": 1.4467128754463348, "grad_norm": 1.140625, "learning_rate": 0.0004793541490431126, "loss": 5.0896, "mean_token_accuracy": 0.1964927777647972, "num_tokens": 31763394.0, "step": 17220 }, { "entropy": 5.712170743942261, "epoch": 1.4471329552614998, "grad_norm": 1.03125, "learning_rate": 0.0004793416161534216, "loss": 5.1963, "mean_token_accuracy": 0.1857247307896614, "num_tokens": 31771905.0, "step": 17225 }, { "entropy": 5.558425951004028, "epoch": 1.4475530350766646, "grad_norm": 1.078125, "learning_rate": 0.00047932907964392423, "loss": 5.0451, "mean_token_accuracy": 0.20380705893039702, "num_tokens": 31780788.0, "step": 17230 }, { "entropy": 5.7161338329315186, "epoch": 1.4479731148918296, "grad_norm": 1.03125, "learning_rate": 0.00047931653951484234, "loss": 5.211, "mean_token_accuracy": 0.18486355990171432, "num_tokens": 31790198.0, "step": 17235 }, { "entropy": 5.688244247436524, "epoch": 1.4483931947069943, "grad_norm": 1.0625, "learning_rate": 0.00047930399576639815, "loss": 5.2086, "mean_token_accuracy": 0.1912611246109009, "num_tokens": 31799396.0, "step": 17240 }, { "entropy": 5.626449298858643, "epoch": 1.4488132745221591, "grad_norm": 1.0703125, "learning_rate": 0.00047929144839881386, "loss": 5.0636, "mean_token_accuracy": 0.19987558126449584, "num_tokens": 31807680.0, "step": 17245 }, { "entropy": 5.752202129364013, "epoch": 1.4492333543373241, "grad_norm": 0.99609375, "learning_rate": 0.00047927889741231186, "loss": 5.2105, "mean_token_accuracy": 0.18495881259441377, "num_tokens": 31817406.0, "step": 17250 }, { "entropy": 5.697268342971801, "epoch": 1.449653434152489, "grad_norm": 1.125, "learning_rate": 0.00047926634280711435, "loss": 5.1926, "mean_token_accuracy": 0.18381931781768798, "num_tokens": 31826518.0, "step": 17255 }, { "entropy": 5.683557033538818, "epoch": 1.450073513967654, "grad_norm": 1.0625, "learning_rate": 0.0004792537845834437, "loss": 5.2792, "mean_token_accuracy": 0.1792782410979271, "num_tokens": 31835538.0, "step": 17260 }, { "entropy": 5.643603038787842, "epoch": 1.4504935937828187, "grad_norm": 1.0546875, "learning_rate": 0.0004792412227415224, "loss": 5.1342, "mean_token_accuracy": 0.19183077961206435, "num_tokens": 31844899.0, "step": 17265 }, { "entropy": 5.690053939819336, "epoch": 1.4509136735979835, "grad_norm": 1.03125, "learning_rate": 0.00047922865728157314, "loss": 5.1824, "mean_token_accuracy": 0.1946403980255127, "num_tokens": 31854322.0, "step": 17270 }, { "entropy": 5.658100318908692, "epoch": 1.4513337534131485, "grad_norm": 1.09375, "learning_rate": 0.0004792160882038183, "loss": 5.1506, "mean_token_accuracy": 0.18531603813171388, "num_tokens": 31863657.0, "step": 17275 }, { "entropy": 5.65094747543335, "epoch": 1.4517538332283133, "grad_norm": 1.0703125, "learning_rate": 0.0004792035155084806, "loss": 5.1376, "mean_token_accuracy": 0.19246282428503036, "num_tokens": 31873468.0, "step": 17280 }, { "entropy": 5.662827491760254, "epoch": 1.4521739130434783, "grad_norm": 1.0390625, "learning_rate": 0.00047919093919578283, "loss": 5.2267, "mean_token_accuracy": 0.18955521434545516, "num_tokens": 31882391.0, "step": 17285 }, { "entropy": 5.666268444061279, "epoch": 1.452593992858643, "grad_norm": 1.078125, "learning_rate": 0.0004791783592659476, "loss": 5.2489, "mean_token_accuracy": 0.1839091420173645, "num_tokens": 31891370.0, "step": 17290 }, { "entropy": 5.6363461971282955, "epoch": 1.4530140726738079, "grad_norm": 1.1015625, "learning_rate": 0.000479165775719198, "loss": 5.1901, "mean_token_accuracy": 0.18755106925964354, "num_tokens": 31900688.0, "step": 17295 }, { "entropy": 5.713179063796997, "epoch": 1.453434152488973, "grad_norm": 1.5078125, "learning_rate": 0.00047915318855575674, "loss": 5.189, "mean_token_accuracy": 0.1955808088183403, "num_tokens": 31909359.0, "step": 17300 }, { "entropy": 5.671366548538208, "epoch": 1.453854232304138, "grad_norm": 1.0625, "learning_rate": 0.00047914059777584686, "loss": 5.1775, "mean_token_accuracy": 0.19081505089998246, "num_tokens": 31918529.0, "step": 17305 }, { "entropy": 5.64331088066101, "epoch": 1.4542743121193027, "grad_norm": 1.078125, "learning_rate": 0.00047912800337969144, "loss": 5.2467, "mean_token_accuracy": 0.18120260685682296, "num_tokens": 31928310.0, "step": 17310 }, { "entropy": 5.667942237854004, "epoch": 1.4546943919344675, "grad_norm": 1.0703125, "learning_rate": 0.00047911540536751355, "loss": 5.1627, "mean_token_accuracy": 0.18576472699642183, "num_tokens": 31937077.0, "step": 17315 }, { "entropy": 5.7339723110198975, "epoch": 1.4551144717496325, "grad_norm": 1.0703125, "learning_rate": 0.0004791028037395363, "loss": 5.219, "mean_token_accuracy": 0.17893830984830855, "num_tokens": 31946023.0, "step": 17320 }, { "entropy": 5.640270328521728, "epoch": 1.4555345515647973, "grad_norm": 1.0234375, "learning_rate": 0.00047909019849598305, "loss": 5.0897, "mean_token_accuracy": 0.19406481981277465, "num_tokens": 31954741.0, "step": 17325 }, { "entropy": 5.6529233932495115, "epoch": 1.4559546313799623, "grad_norm": 1.078125, "learning_rate": 0.00047907758963707696, "loss": 5.1707, "mean_token_accuracy": 0.18677785694599153, "num_tokens": 31963516.0, "step": 17330 }, { "entropy": 5.6786895275115965, "epoch": 1.456374711195127, "grad_norm": 1.1328125, "learning_rate": 0.00047906497716304153, "loss": 5.1897, "mean_token_accuracy": 0.19184095561504363, "num_tokens": 31971917.0, "step": 17335 }, { "entropy": 5.743244886398315, "epoch": 1.4567947910102919, "grad_norm": 1.1796875, "learning_rate": 0.0004790523610741001, "loss": 5.2654, "mean_token_accuracy": 0.18295784890651703, "num_tokens": 31980718.0, "step": 17340 }, { "entropy": 5.748479890823364, "epoch": 1.4572148708254569, "grad_norm": 1.1640625, "learning_rate": 0.00047903974137047614, "loss": 5.1897, "mean_token_accuracy": 0.1894355446100235, "num_tokens": 31988664.0, "step": 17345 }, { "entropy": 5.696847915649414, "epoch": 1.4576349506406217, "grad_norm": 0.984375, "learning_rate": 0.00047902711805239325, "loss": 5.2584, "mean_token_accuracy": 0.1823319375514984, "num_tokens": 31998415.0, "step": 17350 }, { "entropy": 5.7451536655426025, "epoch": 1.4580550304557867, "grad_norm": 1.015625, "learning_rate": 0.00047901449112007494, "loss": 5.2722, "mean_token_accuracy": 0.18601103574037553, "num_tokens": 32007915.0, "step": 17355 }, { "entropy": 5.676834440231323, "epoch": 1.4584751102709514, "grad_norm": 1.0234375, "learning_rate": 0.00047900186057374514, "loss": 5.1933, "mean_token_accuracy": 0.18653995394706727, "num_tokens": 32016582.0, "step": 17360 }, { "entropy": 5.639830446243286, "epoch": 1.4588951900861162, "grad_norm": 1.09375, "learning_rate": 0.00047898922641362724, "loss": 5.1905, "mean_token_accuracy": 0.18718859404325486, "num_tokens": 32026008.0, "step": 17365 }, { "entropy": 5.742193222045898, "epoch": 1.4593152699012812, "grad_norm": 1.1171875, "learning_rate": 0.0004789765886399453, "loss": 5.2375, "mean_token_accuracy": 0.18520684987306596, "num_tokens": 32034554.0, "step": 17370 }, { "entropy": 5.80055799484253, "epoch": 1.4597353497164463, "grad_norm": 1.140625, "learning_rate": 0.00047896394725292313, "loss": 5.2248, "mean_token_accuracy": 0.19242112934589387, "num_tokens": 32044003.0, "step": 17375 }, { "entropy": 5.66378116607666, "epoch": 1.460155429531611, "grad_norm": 1.0625, "learning_rate": 0.00047895130225278473, "loss": 5.2011, "mean_token_accuracy": 0.18860233277082444, "num_tokens": 32053753.0, "step": 17380 }, { "entropy": 5.609521675109863, "epoch": 1.4605755093467758, "grad_norm": 1.1484375, "learning_rate": 0.0004789386536397539, "loss": 5.2122, "mean_token_accuracy": 0.1838676691055298, "num_tokens": 32062459.0, "step": 17385 }, { "entropy": 5.758156156539917, "epoch": 1.4609955891619408, "grad_norm": 1.078125, "learning_rate": 0.0004789260014140549, "loss": 5.2812, "mean_token_accuracy": 0.18510547280311584, "num_tokens": 32072544.0, "step": 17390 }, { "entropy": 5.767707633972168, "epoch": 1.4614156689771056, "grad_norm": 1.0703125, "learning_rate": 0.00047891334557591177, "loss": 5.231, "mean_token_accuracy": 0.18288177251815796, "num_tokens": 32082015.0, "step": 17395 }, { "entropy": 5.665274047851563, "epoch": 1.4618357487922706, "grad_norm": 1.0859375, "learning_rate": 0.0004789006861255488, "loss": 5.1551, "mean_token_accuracy": 0.1949882447719574, "num_tokens": 32091622.0, "step": 17400 }, { "entropy": 5.749509191513061, "epoch": 1.4622558286074354, "grad_norm": 1.0234375, "learning_rate": 0.0004788880230631901, "loss": 5.3217, "mean_token_accuracy": 0.18173176497220994, "num_tokens": 32102716.0, "step": 17405 }, { "entropy": 5.737289476394653, "epoch": 1.4626759084226002, "grad_norm": 1.09375, "learning_rate": 0.00047887535638906005, "loss": 5.1199, "mean_token_accuracy": 0.19971857815980912, "num_tokens": 32111051.0, "step": 17410 }, { "entropy": 5.626353073120117, "epoch": 1.4630959882377652, "grad_norm": 1.0546875, "learning_rate": 0.000478862686103383, "loss": 5.1214, "mean_token_accuracy": 0.1939268797636032, "num_tokens": 32119781.0, "step": 17415 }, { "entropy": 5.677955675125122, "epoch": 1.46351606805293, "grad_norm": 1.1484375, "learning_rate": 0.00047885001220638354, "loss": 5.2164, "mean_token_accuracy": 0.18863945603370666, "num_tokens": 32128849.0, "step": 17420 }, { "entropy": 5.782941198348999, "epoch": 1.463936147868095, "grad_norm": 1.0078125, "learning_rate": 0.00047883733469828604, "loss": 5.2514, "mean_token_accuracy": 0.19107764065265656, "num_tokens": 32138046.0, "step": 17425 }, { "entropy": 5.834447813034058, "epoch": 1.4643562276832598, "grad_norm": 0.98828125, "learning_rate": 0.00047882465357931516, "loss": 5.2986, "mean_token_accuracy": 0.17931941598653794, "num_tokens": 32147994.0, "step": 17430 }, { "entropy": 5.7761908054351805, "epoch": 1.4647763074984246, "grad_norm": 1.109375, "learning_rate": 0.0004788119688496954, "loss": 5.2636, "mean_token_accuracy": 0.18317876905202865, "num_tokens": 32156835.0, "step": 17435 }, { "entropy": 5.69421353340149, "epoch": 1.4651963873135896, "grad_norm": 0.98828125, "learning_rate": 0.0004787992805096516, "loss": 5.1855, "mean_token_accuracy": 0.18656348884105683, "num_tokens": 32166751.0, "step": 17440 }, { "entropy": 5.7025329113006595, "epoch": 1.4656164671287546, "grad_norm": 1.109375, "learning_rate": 0.00047878658855940855, "loss": 5.2849, "mean_token_accuracy": 0.18085859566926957, "num_tokens": 32175705.0, "step": 17445 }, { "entropy": 5.800773668289184, "epoch": 1.4660365469439194, "grad_norm": 0.98828125, "learning_rate": 0.0004787738929991909, "loss": 5.3198, "mean_token_accuracy": 0.17974041700363158, "num_tokens": 32185404.0, "step": 17450 }, { "entropy": 5.732823705673217, "epoch": 1.4664566267590842, "grad_norm": 1.0859375, "learning_rate": 0.00047876119382922374, "loss": 5.1896, "mean_token_accuracy": 0.1902810275554657, "num_tokens": 32194054.0, "step": 17455 }, { "entropy": 5.677874517440796, "epoch": 1.4668767065742492, "grad_norm": 1.0, "learning_rate": 0.00047874849104973194, "loss": 5.2675, "mean_token_accuracy": 0.17640230059623718, "num_tokens": 32204080.0, "step": 17460 }, { "entropy": 5.7024627208709715, "epoch": 1.467296786389414, "grad_norm": 1.0703125, "learning_rate": 0.00047873578466094054, "loss": 5.1839, "mean_token_accuracy": 0.18210013210773468, "num_tokens": 32213279.0, "step": 17465 }, { "entropy": 5.675989627838135, "epoch": 1.467716866204579, "grad_norm": 1.2109375, "learning_rate": 0.0004787230746630746, "loss": 5.1927, "mean_token_accuracy": 0.1882736086845398, "num_tokens": 32221668.0, "step": 17470 }, { "entropy": 5.682435321807861, "epoch": 1.4681369460197438, "grad_norm": 1.140625, "learning_rate": 0.0004787103610563593, "loss": 5.1304, "mean_token_accuracy": 0.1912645921111107, "num_tokens": 32229683.0, "step": 17475 }, { "entropy": 5.692967176437378, "epoch": 1.4685570258349085, "grad_norm": 1.03125, "learning_rate": 0.00047869764384101993, "loss": 5.2022, "mean_token_accuracy": 0.18859823495149614, "num_tokens": 32238948.0, "step": 17480 }, { "entropy": 5.755647420883179, "epoch": 1.4689771056500736, "grad_norm": 1.1484375, "learning_rate": 0.00047868492301728164, "loss": 5.2163, "mean_token_accuracy": 0.18520476073026657, "num_tokens": 32248079.0, "step": 17485 }, { "entropy": 5.669157934188843, "epoch": 1.4693971854652383, "grad_norm": 1.0859375, "learning_rate": 0.00047867219858536975, "loss": 5.0592, "mean_token_accuracy": 0.20037250071763993, "num_tokens": 32256413.0, "step": 17490 }, { "entropy": 5.704810285568238, "epoch": 1.4698172652804034, "grad_norm": 1.0703125, "learning_rate": 0.0004786594705455098, "loss": 5.2119, "mean_token_accuracy": 0.1800605535507202, "num_tokens": 32265954.0, "step": 17495 }, { "entropy": 5.6476171016693115, "epoch": 1.4702373450955681, "grad_norm": 1.078125, "learning_rate": 0.0004786467388979272, "loss": 5.1263, "mean_token_accuracy": 0.19340334087610245, "num_tokens": 32273817.0, "step": 17500 }, { "entropy": 5.629765796661377, "epoch": 1.470657424910733, "grad_norm": 1.046875, "learning_rate": 0.00047863400364284744, "loss": 5.1714, "mean_token_accuracy": 0.19489692896604538, "num_tokens": 32283025.0, "step": 17505 }, { "entropy": 5.674668693542481, "epoch": 1.471077504725898, "grad_norm": 1.0546875, "learning_rate": 0.00047862126478049623, "loss": 5.1858, "mean_token_accuracy": 0.18881264925003052, "num_tokens": 32292321.0, "step": 17510 }, { "entropy": 5.797921562194825, "epoch": 1.4714975845410627, "grad_norm": 0.95703125, "learning_rate": 0.00047860852231109915, "loss": 5.274, "mean_token_accuracy": 0.17467544674873353, "num_tokens": 32302203.0, "step": 17515 }, { "entropy": 5.635599660873413, "epoch": 1.4719176643562277, "grad_norm": 1.1015625, "learning_rate": 0.0004785957762348819, "loss": 5.1082, "mean_token_accuracy": 0.1901296705007553, "num_tokens": 32310893.0, "step": 17520 }, { "entropy": 5.618343782424927, "epoch": 1.4723377441713925, "grad_norm": 1.0390625, "learning_rate": 0.0004785830265520703, "loss": 5.1456, "mean_token_accuracy": 0.19215014427900315, "num_tokens": 32320320.0, "step": 17525 }, { "entropy": 5.622067403793335, "epoch": 1.4727578239865575, "grad_norm": 1.0390625, "learning_rate": 0.00047857027326289023, "loss": 5.0794, "mean_token_accuracy": 0.19937957674264908, "num_tokens": 32329196.0, "step": 17530 }, { "entropy": 5.684448623657227, "epoch": 1.4731779038017223, "grad_norm": 0.99609375, "learning_rate": 0.00047855751636756763, "loss": 5.1986, "mean_token_accuracy": 0.18534134775400163, "num_tokens": 32338529.0, "step": 17535 }, { "entropy": 5.726581287384033, "epoch": 1.4735979836168873, "grad_norm": 1.1328125, "learning_rate": 0.0004785447558663284, "loss": 5.2162, "mean_token_accuracy": 0.1909865155816078, "num_tokens": 32347114.0, "step": 17540 }, { "entropy": 5.781416368484497, "epoch": 1.474018063432052, "grad_norm": 1.1171875, "learning_rate": 0.00047853199175939865, "loss": 5.3446, "mean_token_accuracy": 0.18164856731891632, "num_tokens": 32356765.0, "step": 17545 }, { "entropy": 5.801598644256591, "epoch": 1.474438143247217, "grad_norm": 1.1875, "learning_rate": 0.0004785192240470045, "loss": 5.3291, "mean_token_accuracy": 0.18008331656455995, "num_tokens": 32366175.0, "step": 17550 }, { "entropy": 5.69089937210083, "epoch": 1.474858223062382, "grad_norm": 1.0, "learning_rate": 0.000478506452729372, "loss": 5.1058, "mean_token_accuracy": 0.1885230913758278, "num_tokens": 32375063.0, "step": 17555 }, { "entropy": 5.680359268188477, "epoch": 1.4752783028775467, "grad_norm": 1.1484375, "learning_rate": 0.00047849367780672755, "loss": 5.191, "mean_token_accuracy": 0.1903966560959816, "num_tokens": 32384596.0, "step": 17560 }, { "entropy": 5.656247854232788, "epoch": 1.4756983826927117, "grad_norm": 1.0234375, "learning_rate": 0.0004784808992792974, "loss": 5.1474, "mean_token_accuracy": 0.1929773524403572, "num_tokens": 32393489.0, "step": 17565 }, { "entropy": 5.709002351760864, "epoch": 1.4761184625078765, "grad_norm": 1.0859375, "learning_rate": 0.0004784681171473079, "loss": 5.1268, "mean_token_accuracy": 0.1930202931165695, "num_tokens": 32402192.0, "step": 17570 }, { "entropy": 5.748080682754517, "epoch": 1.4765385423230413, "grad_norm": 1.1484375, "learning_rate": 0.00047845533141098543, "loss": 5.226, "mean_token_accuracy": 0.18090004920959474, "num_tokens": 32411317.0, "step": 17575 }, { "entropy": 5.7867354393005375, "epoch": 1.4769586221382063, "grad_norm": 1.09375, "learning_rate": 0.0004784425420705565, "loss": 5.277, "mean_token_accuracy": 0.17942063212394715, "num_tokens": 32420308.0, "step": 17580 }, { "entropy": 5.664358186721802, "epoch": 1.477378701953371, "grad_norm": 0.984375, "learning_rate": 0.0004784297491262477, "loss": 5.2136, "mean_token_accuracy": 0.19021391570568086, "num_tokens": 32429532.0, "step": 17585 }, { "entropy": 5.6970508098602295, "epoch": 1.477798781768536, "grad_norm": 1.0625, "learning_rate": 0.0004784169525782858, "loss": 5.1927, "mean_token_accuracy": 0.187297785282135, "num_tokens": 32439382.0, "step": 17590 }, { "entropy": 5.710856533050537, "epoch": 1.4782188615837009, "grad_norm": 1.0703125, "learning_rate": 0.0004784041524268971, "loss": 5.1839, "mean_token_accuracy": 0.19514687955379487, "num_tokens": 32447893.0, "step": 17595 }, { "entropy": 5.669908761978149, "epoch": 1.4786389413988656, "grad_norm": 1.28125, "learning_rate": 0.00047839134867230874, "loss": 5.1771, "mean_token_accuracy": 0.19171685427427293, "num_tokens": 32457770.0, "step": 17600 }, { "entropy": 5.755254554748535, "epoch": 1.4790590212140307, "grad_norm": 1.0546875, "learning_rate": 0.00047837854131474726, "loss": 5.2792, "mean_token_accuracy": 0.18135013580322265, "num_tokens": 32467247.0, "step": 17605 }, { "entropy": 5.768255376815796, "epoch": 1.4794791010291957, "grad_norm": 1.0390625, "learning_rate": 0.00047836573035443976, "loss": 5.2582, "mean_token_accuracy": 0.18924690634012223, "num_tokens": 32477453.0, "step": 17610 }, { "entropy": 5.771643543243409, "epoch": 1.4798991808443605, "grad_norm": 1.203125, "learning_rate": 0.00047835291579161293, "loss": 5.2266, "mean_token_accuracy": 0.1941353812813759, "num_tokens": 32486278.0, "step": 17615 }, { "entropy": 5.668265676498413, "epoch": 1.4803192606595252, "grad_norm": 1.125, "learning_rate": 0.0004783400976264941, "loss": 5.1605, "mean_token_accuracy": 0.19295942932367324, "num_tokens": 32495523.0, "step": 17620 }, { "entropy": 5.713512992858886, "epoch": 1.4807393404746902, "grad_norm": 1.1171875, "learning_rate": 0.00047832727585930997, "loss": 5.2057, "mean_token_accuracy": 0.18736464977264405, "num_tokens": 32504952.0, "step": 17625 }, { "entropy": 5.680152511596679, "epoch": 1.481159420289855, "grad_norm": 1.078125, "learning_rate": 0.0004783144504902879, "loss": 5.1853, "mean_token_accuracy": 0.1853386342525482, "num_tokens": 32515620.0, "step": 17630 }, { "entropy": 5.643487501144409, "epoch": 1.48157950010502, "grad_norm": 1.1171875, "learning_rate": 0.000478301621519655, "loss": 5.1464, "mean_token_accuracy": 0.19343707710504532, "num_tokens": 32524549.0, "step": 17635 }, { "entropy": 5.661568355560303, "epoch": 1.4819995799201848, "grad_norm": 1.15625, "learning_rate": 0.0004782887889476386, "loss": 5.0439, "mean_token_accuracy": 0.20409162640571593, "num_tokens": 32533043.0, "step": 17640 }, { "entropy": 5.703748559951782, "epoch": 1.4824196597353496, "grad_norm": 1.0625, "learning_rate": 0.000478275952774466, "loss": 5.1652, "mean_token_accuracy": 0.19010685831308366, "num_tokens": 32541679.0, "step": 17645 }, { "entropy": 5.735295677185059, "epoch": 1.4828397395505146, "grad_norm": 1.0390625, "learning_rate": 0.0004782631130003646, "loss": 5.2788, "mean_token_accuracy": 0.1889811173081398, "num_tokens": 32550922.0, "step": 17650 }, { "entropy": 5.724912881851196, "epoch": 1.4832598193656794, "grad_norm": 1.125, "learning_rate": 0.0004782502696255617, "loss": 5.2615, "mean_token_accuracy": 0.18291771709918975, "num_tokens": 32560063.0, "step": 17655 }, { "entropy": 5.6248459815979, "epoch": 1.4836798991808444, "grad_norm": 1.1015625, "learning_rate": 0.00047823742265028495, "loss": 5.1557, "mean_token_accuracy": 0.19118309319019317, "num_tokens": 32569476.0, "step": 17660 }, { "entropy": 5.710929679870605, "epoch": 1.4840999789960092, "grad_norm": 1.0234375, "learning_rate": 0.000478224572074762, "loss": 5.2088, "mean_token_accuracy": 0.19493364095687865, "num_tokens": 32578552.0, "step": 17665 }, { "entropy": 5.729469585418701, "epoch": 1.484520058811174, "grad_norm": 1.1015625, "learning_rate": 0.0004782117178992203, "loss": 5.2066, "mean_token_accuracy": 0.18812828063964843, "num_tokens": 32589074.0, "step": 17670 }, { "entropy": 5.719221782684326, "epoch": 1.484940138626339, "grad_norm": 1.0234375, "learning_rate": 0.0004781988601238878, "loss": 5.235, "mean_token_accuracy": 0.1889791488647461, "num_tokens": 32599288.0, "step": 17675 }, { "entropy": 5.769744062423706, "epoch": 1.485360218441504, "grad_norm": 1.0078125, "learning_rate": 0.000478185998748992, "loss": 5.2843, "mean_token_accuracy": 0.18057381808757783, "num_tokens": 32609430.0, "step": 17680 }, { "entropy": 5.647144603729248, "epoch": 1.4857802982566688, "grad_norm": 1.09375, "learning_rate": 0.00047817313377476083, "loss": 5.1433, "mean_token_accuracy": 0.18725276589393616, "num_tokens": 32617763.0, "step": 17685 }, { "entropy": 5.657654428482056, "epoch": 1.4862003780718336, "grad_norm": 1.0390625, "learning_rate": 0.00047816026520142234, "loss": 5.2227, "mean_token_accuracy": 0.1810248777270317, "num_tokens": 32627465.0, "step": 17690 }, { "entropy": 5.770398139953613, "epoch": 1.4866204578869986, "grad_norm": 1.078125, "learning_rate": 0.0004781473930292043, "loss": 5.1256, "mean_token_accuracy": 0.20110053122043609, "num_tokens": 32635984.0, "step": 17695 }, { "entropy": 5.617955160140991, "epoch": 1.4870405377021634, "grad_norm": 1.1328125, "learning_rate": 0.0004781345172583348, "loss": 5.0725, "mean_token_accuracy": 0.1978319212794304, "num_tokens": 32644346.0, "step": 17700 }, { "entropy": 5.646724033355713, "epoch": 1.4874606175173284, "grad_norm": 1.0, "learning_rate": 0.00047812163788904196, "loss": 5.2051, "mean_token_accuracy": 0.18413894921541213, "num_tokens": 32654118.0, "step": 17705 }, { "entropy": 5.727961730957031, "epoch": 1.4878806973324932, "grad_norm": 1.015625, "learning_rate": 0.00047810875492155386, "loss": 5.2092, "mean_token_accuracy": 0.18924564868211746, "num_tokens": 32664258.0, "step": 17710 }, { "entropy": 5.7081732749938965, "epoch": 1.488300777147658, "grad_norm": 1.0703125, "learning_rate": 0.0004780958683560987, "loss": 5.27, "mean_token_accuracy": 0.17974067181348802, "num_tokens": 32673672.0, "step": 17715 }, { "entropy": 5.708933782577515, "epoch": 1.488720856962823, "grad_norm": 1.0234375, "learning_rate": 0.0004780829781929049, "loss": 5.2502, "mean_token_accuracy": 0.17554962486028672, "num_tokens": 32682901.0, "step": 17720 }, { "entropy": 5.760427761077881, "epoch": 1.4891409367779878, "grad_norm": 1.1015625, "learning_rate": 0.0004780700844322007, "loss": 5.1757, "mean_token_accuracy": 0.19681486934423448, "num_tokens": 32691384.0, "step": 17725 }, { "entropy": 5.687942886352539, "epoch": 1.4895610165931528, "grad_norm": 1.171875, "learning_rate": 0.00047805718707421446, "loss": 5.2096, "mean_token_accuracy": 0.18906741291284562, "num_tokens": 32700758.0, "step": 17730 }, { "entropy": 5.751689720153808, "epoch": 1.4899810964083176, "grad_norm": 1.15625, "learning_rate": 0.00047804428611917475, "loss": 5.3245, "mean_token_accuracy": 0.18217762261629106, "num_tokens": 32709676.0, "step": 17735 }, { "entropy": 5.731592178344727, "epoch": 1.4904011762234823, "grad_norm": 1.09375, "learning_rate": 0.00047803138156731, "loss": 5.2108, "mean_token_accuracy": 0.18470272272825242, "num_tokens": 32718102.0, "step": 17740 }, { "entropy": 5.7573596954345705, "epoch": 1.4908212560386473, "grad_norm": 1.0625, "learning_rate": 0.00047801847341884897, "loss": 5.2099, "mean_token_accuracy": 0.18768994510173798, "num_tokens": 32727356.0, "step": 17745 }, { "entropy": 5.714266777038574, "epoch": 1.4912413358538124, "grad_norm": 1.1015625, "learning_rate": 0.0004780055616740202, "loss": 5.1889, "mean_token_accuracy": 0.1872628942131996, "num_tokens": 32736605.0, "step": 17750 }, { "entropy": 5.6698919296264645, "epoch": 1.4916614156689771, "grad_norm": 1.0390625, "learning_rate": 0.0004779926463330524, "loss": 5.1316, "mean_token_accuracy": 0.18657617568969725, "num_tokens": 32745573.0, "step": 17755 }, { "entropy": 5.677543830871582, "epoch": 1.492081495484142, "grad_norm": 1.09375, "learning_rate": 0.0004779797273961744, "loss": 5.1956, "mean_token_accuracy": 0.19196071922779084, "num_tokens": 32755695.0, "step": 17760 }, { "entropy": 5.664437484741211, "epoch": 1.492501575299307, "grad_norm": 1.1953125, "learning_rate": 0.0004779668048636151, "loss": 5.1314, "mean_token_accuracy": 0.18969690799713135, "num_tokens": 32763570.0, "step": 17765 }, { "entropy": 5.6956212520599365, "epoch": 1.4929216551144717, "grad_norm": 1.1875, "learning_rate": 0.00047795387873560336, "loss": 5.2183, "mean_token_accuracy": 0.17812351286411285, "num_tokens": 32772006.0, "step": 17770 }, { "entropy": 5.689272689819336, "epoch": 1.4933417349296367, "grad_norm": 1.171875, "learning_rate": 0.0004779409490123681, "loss": 5.1779, "mean_token_accuracy": 0.18634060323238372, "num_tokens": 32781080.0, "step": 17775 }, { "entropy": 5.642456531524658, "epoch": 1.4937618147448015, "grad_norm": 1.1796875, "learning_rate": 0.0004779280156941384, "loss": 5.1325, "mean_token_accuracy": 0.186872436106205, "num_tokens": 32789880.0, "step": 17780 }, { "entropy": 5.733442592620849, "epoch": 1.4941818945599663, "grad_norm": 1.0859375, "learning_rate": 0.00047791507878114354, "loss": 5.1759, "mean_token_accuracy": 0.18839626312255858, "num_tokens": 32799222.0, "step": 17785 }, { "entropy": 5.664855098724365, "epoch": 1.4946019743751313, "grad_norm": 1.0, "learning_rate": 0.0004779021382736124, "loss": 5.1643, "mean_token_accuracy": 0.1854260191321373, "num_tokens": 32808945.0, "step": 17790 }, { "entropy": 5.622431659698487, "epoch": 1.495022054190296, "grad_norm": 0.96484375, "learning_rate": 0.0004778891941717745, "loss": 5.0866, "mean_token_accuracy": 0.19672405421733857, "num_tokens": 32818386.0, "step": 17795 }, { "entropy": 5.60933403968811, "epoch": 1.495442134005461, "grad_norm": 1.0703125, "learning_rate": 0.0004778762464758589, "loss": 5.1678, "mean_token_accuracy": 0.17985923290252687, "num_tokens": 32828364.0, "step": 17800 }, { "entropy": 5.773142576217651, "epoch": 1.495862213820626, "grad_norm": 0.97265625, "learning_rate": 0.00047786329518609505, "loss": 5.2626, "mean_token_accuracy": 0.18685206770896912, "num_tokens": 32837399.0, "step": 17805 }, { "entropy": 5.698681306838989, "epoch": 1.4962822936357907, "grad_norm": 1.0546875, "learning_rate": 0.00047785034030271243, "loss": 5.0943, "mean_token_accuracy": 0.2052606776356697, "num_tokens": 32846111.0, "step": 17810 }, { "entropy": 5.6565797328948975, "epoch": 1.4967023734509557, "grad_norm": 1.1015625, "learning_rate": 0.0004778373818259404, "loss": 5.0382, "mean_token_accuracy": 0.20204033255577086, "num_tokens": 32855839.0, "step": 17815 }, { "entropy": 5.756921720504761, "epoch": 1.4971224532661207, "grad_norm": 1.1328125, "learning_rate": 0.00047782441975600866, "loss": 5.2987, "mean_token_accuracy": 0.18595268279314042, "num_tokens": 32865946.0, "step": 17820 }, { "entropy": 5.751388740539551, "epoch": 1.4975425330812855, "grad_norm": 1.0390625, "learning_rate": 0.0004778114540931468, "loss": 5.2804, "mean_token_accuracy": 0.18458262383937835, "num_tokens": 32875310.0, "step": 17825 }, { "entropy": 5.681033515930176, "epoch": 1.4979626128964503, "grad_norm": 1.1171875, "learning_rate": 0.00047779848483758445, "loss": 5.2329, "mean_token_accuracy": 0.18733306378126144, "num_tokens": 32885315.0, "step": 17830 }, { "entropy": 5.702258920669555, "epoch": 1.4983826927116153, "grad_norm": 1.1171875, "learning_rate": 0.00047778551198955133, "loss": 5.2002, "mean_token_accuracy": 0.1926373064517975, "num_tokens": 32894055.0, "step": 17835 }, { "entropy": 5.724915170669556, "epoch": 1.49880277252678, "grad_norm": 1.0703125, "learning_rate": 0.0004777725355492773, "loss": 5.2169, "mean_token_accuracy": 0.19602440297603607, "num_tokens": 32903030.0, "step": 17840 }, { "entropy": 5.714483785629272, "epoch": 1.499222852341945, "grad_norm": 1.1015625, "learning_rate": 0.0004777595555169922, "loss": 5.1583, "mean_token_accuracy": 0.18896477669477463, "num_tokens": 32911562.0, "step": 17845 }, { "entropy": 5.717610549926758, "epoch": 1.4996429321571099, "grad_norm": 1.0703125, "learning_rate": 0.000477746571892926, "loss": 5.2577, "mean_token_accuracy": 0.18056833148002624, "num_tokens": 32920376.0, "step": 17850 }, { "entropy": 5.726553821563721, "epoch": 1.5000630119722747, "grad_norm": 1.1328125, "learning_rate": 0.0004777335846773087, "loss": 5.1985, "mean_token_accuracy": 0.18284859210252763, "num_tokens": 32929374.0, "step": 17855 }, { "entropy": 5.62431902885437, "epoch": 1.5004830917874397, "grad_norm": 1.09375, "learning_rate": 0.00047772059387037025, "loss": 5.1191, "mean_token_accuracy": 0.18503511548042298, "num_tokens": 32938695.0, "step": 17860 }, { "entropy": 5.757184457778931, "epoch": 1.5009031716026044, "grad_norm": 1.078125, "learning_rate": 0.0004777075994723409, "loss": 5.1928, "mean_token_accuracy": 0.1902647390961647, "num_tokens": 32947725.0, "step": 17865 }, { "entropy": 5.734492826461792, "epoch": 1.5013232514177695, "grad_norm": 1.1484375, "learning_rate": 0.00047769460148345085, "loss": 5.1872, "mean_token_accuracy": 0.18539869040250778, "num_tokens": 32957017.0, "step": 17870 }, { "entropy": 5.699925899505615, "epoch": 1.5017433312329342, "grad_norm": 1.046875, "learning_rate": 0.0004776815999039303, "loss": 5.1606, "mean_token_accuracy": 0.18995736837387084, "num_tokens": 32965944.0, "step": 17875 }, { "entropy": 5.671454524993896, "epoch": 1.502163411048099, "grad_norm": 1.0234375, "learning_rate": 0.0004776685947340096, "loss": 5.1778, "mean_token_accuracy": 0.19463064819574355, "num_tokens": 32975368.0, "step": 17880 }, { "entropy": 5.7139819145202635, "epoch": 1.502583490863264, "grad_norm": 1.1171875, "learning_rate": 0.0004776555859739191, "loss": 5.2295, "mean_token_accuracy": 0.18614423871040345, "num_tokens": 32984603.0, "step": 17885 }, { "entropy": 5.734739446640015, "epoch": 1.503003570678429, "grad_norm": 1.0625, "learning_rate": 0.00047764257362388913, "loss": 5.1857, "mean_token_accuracy": 0.18691615462303163, "num_tokens": 32993621.0, "step": 17890 }, { "entropy": 5.663558578491211, "epoch": 1.5034236504935938, "grad_norm": 1.125, "learning_rate": 0.0004776295576841504, "loss": 5.194, "mean_token_accuracy": 0.19059872031211852, "num_tokens": 33002637.0, "step": 17895 }, { "entropy": 5.662126588821411, "epoch": 1.5038437303087586, "grad_norm": 1.0859375, "learning_rate": 0.00047761653815493337, "loss": 5.1196, "mean_token_accuracy": 0.2037479043006897, "num_tokens": 33011964.0, "step": 17900 }, { "entropy": 5.69144458770752, "epoch": 1.5042638101239234, "grad_norm": 1.2109375, "learning_rate": 0.00047760351503646877, "loss": 5.1874, "mean_token_accuracy": 0.18670964986085892, "num_tokens": 33020626.0, "step": 17905 }, { "entropy": 5.705088186264038, "epoch": 1.5046838899390884, "grad_norm": 1.0859375, "learning_rate": 0.0004775904883289871, "loss": 5.1741, "mean_token_accuracy": 0.1892619326710701, "num_tokens": 33029212.0, "step": 17910 }, { "entropy": 5.694438457489014, "epoch": 1.5051039697542534, "grad_norm": 1.015625, "learning_rate": 0.00047757745803271936, "loss": 5.2224, "mean_token_accuracy": 0.1858616441488266, "num_tokens": 33038893.0, "step": 17915 }, { "entropy": 5.696289110183716, "epoch": 1.5055240495694182, "grad_norm": 1.03125, "learning_rate": 0.0004775644241478962, "loss": 5.1949, "mean_token_accuracy": 0.1846272110939026, "num_tokens": 33048058.0, "step": 17920 }, { "entropy": 5.6542076587677, "epoch": 1.505944129384583, "grad_norm": 1.0546875, "learning_rate": 0.00047755138667474864, "loss": 5.1231, "mean_token_accuracy": 0.19427444487810136, "num_tokens": 33057106.0, "step": 17925 }, { "entropy": 5.668252897262573, "epoch": 1.506364209199748, "grad_norm": 1.171875, "learning_rate": 0.0004775383456135075, "loss": 5.2626, "mean_token_accuracy": 0.183236962556839, "num_tokens": 33066400.0, "step": 17930 }, { "entropy": 5.698903226852417, "epoch": 1.5067842890149128, "grad_norm": 1.0234375, "learning_rate": 0.0004775253009644038, "loss": 5.1148, "mean_token_accuracy": 0.19779697805643082, "num_tokens": 33075357.0, "step": 17935 }, { "entropy": 5.758629655838012, "epoch": 1.5072043688300778, "grad_norm": 1.0390625, "learning_rate": 0.00047751225272766885, "loss": 5.2187, "mean_token_accuracy": 0.1916794091463089, "num_tokens": 33085707.0, "step": 17940 }, { "entropy": 5.8011332035064695, "epoch": 1.5076244486452426, "grad_norm": 1.078125, "learning_rate": 0.0004774992009035335, "loss": 5.3102, "mean_token_accuracy": 0.18529380410909652, "num_tokens": 33095825.0, "step": 17945 }, { "entropy": 5.658703708648682, "epoch": 1.5080445284604074, "grad_norm": 1.0, "learning_rate": 0.0004774861454922291, "loss": 5.1194, "mean_token_accuracy": 0.1949077233672142, "num_tokens": 33105130.0, "step": 17950 }, { "entropy": 5.694227409362793, "epoch": 1.5084646082755724, "grad_norm": 1.140625, "learning_rate": 0.0004774730864939869, "loss": 5.1675, "mean_token_accuracy": 0.19098176509141923, "num_tokens": 33113226.0, "step": 17955 }, { "entropy": 5.74734354019165, "epoch": 1.5088846880907374, "grad_norm": 1.0390625, "learning_rate": 0.00047746002390903824, "loss": 5.1527, "mean_token_accuracy": 0.19166601002216338, "num_tokens": 33120824.0, "step": 17960 }, { "entropy": 5.7338663101196286, "epoch": 1.5093047679059022, "grad_norm": 1.2109375, "learning_rate": 0.0004774469577376145, "loss": 5.1474, "mean_token_accuracy": 0.19656829833984374, "num_tokens": 33129503.0, "step": 17965 }, { "entropy": 5.55996470451355, "epoch": 1.509724847721067, "grad_norm": 0.953125, "learning_rate": 0.00047743388797994715, "loss": 5.0679, "mean_token_accuracy": 0.19400005638599396, "num_tokens": 33138838.0, "step": 17970 }, { "entropy": 5.633906507492066, "epoch": 1.5101449275362318, "grad_norm": 1.125, "learning_rate": 0.00047742081463626767, "loss": 5.1781, "mean_token_accuracy": 0.19286924302577974, "num_tokens": 33148142.0, "step": 17975 }, { "entropy": 5.700980806350708, "epoch": 1.5105650073513968, "grad_norm": 1.140625, "learning_rate": 0.0004774077377068078, "loss": 5.1869, "mean_token_accuracy": 0.19072088599205017, "num_tokens": 33156750.0, "step": 17980 }, { "entropy": 5.7820343494415285, "epoch": 1.5109850871665618, "grad_norm": 1.0625, "learning_rate": 0.000477394657191799, "loss": 5.3116, "mean_token_accuracy": 0.1828122228384018, "num_tokens": 33166511.0, "step": 17985 }, { "entropy": 5.728284406661987, "epoch": 1.5114051669817266, "grad_norm": 1.0546875, "learning_rate": 0.00047738157309147307, "loss": 5.2693, "mean_token_accuracy": 0.19000640213489534, "num_tokens": 33175812.0, "step": 17990 }, { "entropy": 5.643117666244507, "epoch": 1.5118252467968913, "grad_norm": 1.046875, "learning_rate": 0.00047736848540606174, "loss": 5.1309, "mean_token_accuracy": 0.1884799987077713, "num_tokens": 33185201.0, "step": 17995 }, { "entropy": 5.664025831222534, "epoch": 1.5122453266120561, "grad_norm": 1.078125, "learning_rate": 0.000477355394135797, "loss": 5.1151, "mean_token_accuracy": 0.19276682883501053, "num_tokens": 33195151.0, "step": 18000 }, { "epoch": 1.5122453266120561, "eval_entropy": 5.4309860528395895, "eval_loss": 5.242064952850342, "eval_mean_token_accuracy": 0.19504030572908054, "eval_num_tokens": 33195151.0, "eval_runtime": 21.073, "eval_samples_per_second": 1773.166, "eval_steps_per_second": 221.658, "step": 18000 } ], "logging_steps": 5, "max_steps": 119020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7195522584084480.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }