{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7561436672967864, "eval_steps": 3000, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742606925964356, "epoch": 0.0004200798151648813, "grad_norm": 5.21875, "learning_rate": 2e-06, "loss": 10.7358, "mean_token_accuracy": 0.0, "num_tokens": 8348.0, "step": 5 }, { "entropy": 10.74260492324829, "epoch": 0.0008401596303297626, "grad_norm": 5.15625, "learning_rate": 4.5e-06, "loss": 10.7547, "mean_token_accuracy": 0.0, "num_tokens": 17465.0, "step": 10 }, { "entropy": 10.742631721496583, "epoch": 0.001260239445494644, "grad_norm": 5.25, "learning_rate": 7e-06, "loss": 10.7247, "mean_token_accuracy": 0.00010341261513531208, "num_tokens": 26627.0, "step": 15 }, { "entropy": 10.742714214324952, "epoch": 0.0016803192606595252, "grad_norm": 4.96875, "learning_rate": 9.5e-06, "loss": 10.6807, "mean_token_accuracy": 0.0, "num_tokens": 36069.0, "step": 20 }, { "entropy": 10.742774486541748, "epoch": 0.002100399075824407, "grad_norm": 4.96875, "learning_rate": 1.2e-05, "loss": 10.564, "mean_token_accuracy": 0.0009151221020147204, "num_tokens": 44967.0, "step": 25 }, { "entropy": 10.742547607421875, "epoch": 0.002520478890989288, "grad_norm": 3.8125, "learning_rate": 1.4500000000000002e-05, "loss": 10.4843, "mean_token_accuracy": 0.0172414593398571, "num_tokens": 55132.0, "step": 30 }, { "entropy": 10.741770172119141, "epoch": 0.0029405587061541692, "grad_norm": 3.1875, "learning_rate": 1.7000000000000003e-05, "loss": 10.3322, "mean_token_accuracy": 0.044619453698396684, "num_tokens": 65141.0, "step": 35 }, { "entropy": 10.739381885528564, "epoch": 0.0033606385213190504, "grad_norm": 2.484375, "learning_rate": 1.95e-05, "loss": 10.2048, "mean_token_accuracy": 0.04063304513692856, "num_tokens": 74007.0, "step": 40 }, { "entropy": 10.735391807556152, "epoch": 0.003780718336483932, "grad_norm": 2.203125, "learning_rate": 2.2e-05, "loss": 10.1027, "mean_token_accuracy": 0.04380051270127296, "num_tokens": 83736.0, "step": 45 }, { "entropy": 10.731560325622558, "epoch": 0.004200798151648814, "grad_norm": 2.03125, "learning_rate": 2.4500000000000003e-05, "loss": 10.0024, "mean_token_accuracy": 0.04462047629058361, "num_tokens": 92525.0, "step": 50 }, { "entropy": 10.729215049743653, "epoch": 0.004620877966813695, "grad_norm": 2.046875, "learning_rate": 2.7e-05, "loss": 9.9462, "mean_token_accuracy": 0.042681990377604964, "num_tokens": 102015.0, "step": 55 }, { "entropy": 10.728453350067138, "epoch": 0.005040957781978576, "grad_norm": 1.7890625, "learning_rate": 2.95e-05, "loss": 9.9154, "mean_token_accuracy": 0.03954915180802345, "num_tokens": 110887.0, "step": 60 }, { "entropy": 10.727616500854491, "epoch": 0.005461037597143457, "grad_norm": 1.8828125, "learning_rate": 3.2e-05, "loss": 9.8453, "mean_token_accuracy": 0.04232911877334118, "num_tokens": 120442.0, "step": 65 }, { "entropy": 10.726141738891602, "epoch": 0.0058811174123083385, "grad_norm": 1.9609375, "learning_rate": 3.4500000000000005e-05, "loss": 9.7509, "mean_token_accuracy": 0.041194649040699007, "num_tokens": 129297.0, "step": 70 }, { "entropy": 10.723711013793945, "epoch": 0.00630119722747322, "grad_norm": 1.8828125, "learning_rate": 3.7e-05, "loss": 9.7015, "mean_token_accuracy": 0.04228766188025475, "num_tokens": 138305.0, "step": 75 }, { "entropy": 10.719814491271972, "epoch": 0.006721277042638101, "grad_norm": 1.96875, "learning_rate": 3.95e-05, "loss": 9.6499, "mean_token_accuracy": 0.04200226049870252, "num_tokens": 147640.0, "step": 80 }, { "entropy": 10.714290428161622, "epoch": 0.007141356857802983, "grad_norm": 1.8515625, "learning_rate": 4.2000000000000004e-05, "loss": 9.576, "mean_token_accuracy": 0.04255363866686821, "num_tokens": 157633.0, "step": 85 }, { "entropy": 10.707357215881348, "epoch": 0.007561436672967864, "grad_norm": 1.671875, "learning_rate": 4.45e-05, "loss": 9.5382, "mean_token_accuracy": 0.03800953794270754, "num_tokens": 167984.0, "step": 90 }, { "entropy": 10.699947547912597, "epoch": 0.007981516488132745, "grad_norm": 1.7421875, "learning_rate": 4.7000000000000004e-05, "loss": 9.4351, "mean_token_accuracy": 0.04883353523910046, "num_tokens": 176984.0, "step": 95 }, { "entropy": 10.683709812164306, "epoch": 0.008401596303297627, "grad_norm": 1.890625, "learning_rate": 4.9500000000000004e-05, "loss": 9.3133, "mean_token_accuracy": 0.051684480533003806, "num_tokens": 185931.0, "step": 100 }, { "entropy": 10.665494632720947, "epoch": 0.008821676118462508, "grad_norm": 1.859375, "learning_rate": 5.2e-05, "loss": 9.2723, "mean_token_accuracy": 0.05058838985860348, "num_tokens": 195065.0, "step": 105 }, { "entropy": 10.650426483154297, "epoch": 0.00924175593362739, "grad_norm": 1.703125, "learning_rate": 5.45e-05, "loss": 9.1345, "mean_token_accuracy": 0.05380081832408905, "num_tokens": 203687.0, "step": 110 }, { "entropy": 10.613165855407715, "epoch": 0.00966183574879227, "grad_norm": 1.6484375, "learning_rate": 5.7e-05, "loss": 9.0467, "mean_token_accuracy": 0.057396522164344786, "num_tokens": 212847.0, "step": 115 }, { "entropy": 10.554168796539306, "epoch": 0.010081915563957152, "grad_norm": 1.6875, "learning_rate": 5.9499999999999996e-05, "loss": 8.93, "mean_token_accuracy": 0.05599412247538567, "num_tokens": 222593.0, "step": 120 }, { "entropy": 10.50309362411499, "epoch": 0.010501995379122032, "grad_norm": 1.6875, "learning_rate": 6.2e-05, "loss": 8.7842, "mean_token_accuracy": 0.054633737355470655, "num_tokens": 231174.0, "step": 125 }, { "entropy": 10.446444129943847, "epoch": 0.010922075194286915, "grad_norm": 1.5546875, "learning_rate": 6.450000000000001e-05, "loss": 8.6507, "mean_token_accuracy": 0.05882068388164043, "num_tokens": 239833.0, "step": 130 }, { "entropy": 10.371571159362793, "epoch": 0.011342155009451797, "grad_norm": 1.53125, "learning_rate": 6.7e-05, "loss": 8.62, "mean_token_accuracy": 0.05638743191957474, "num_tokens": 248794.0, "step": 135 }, { "entropy": 10.297250938415527, "epoch": 0.011762234824616677, "grad_norm": 1.4375, "learning_rate": 6.950000000000001e-05, "loss": 8.5299, "mean_token_accuracy": 0.056220804899930955, "num_tokens": 257123.0, "step": 140 }, { "entropy": 10.228730010986329, "epoch": 0.012182314639781559, "grad_norm": 1.453125, "learning_rate": 7.2e-05, "loss": 8.2842, "mean_token_accuracy": 0.05619280487298965, "num_tokens": 266088.0, "step": 145 }, { "entropy": 10.08653745651245, "epoch": 0.01260239445494644, "grad_norm": 1.21875, "learning_rate": 7.45e-05, "loss": 8.3619, "mean_token_accuracy": 0.0516346599906683, "num_tokens": 276074.0, "step": 150 }, { "entropy": 9.963776969909668, "epoch": 0.013022474270111321, "grad_norm": 1.171875, "learning_rate": 7.7e-05, "loss": 8.1944, "mean_token_accuracy": 0.054025283083319664, "num_tokens": 285280.0, "step": 155 }, { "entropy": 9.805997848510742, "epoch": 0.013442554085276202, "grad_norm": 1.171875, "learning_rate": 7.950000000000001e-05, "loss": 8.151, "mean_token_accuracy": 0.052671706303954124, "num_tokens": 296115.0, "step": 160 }, { "entropy": 9.606755542755128, "epoch": 0.013862633900441084, "grad_norm": 0.99609375, "learning_rate": 8.2e-05, "loss": 7.9584, "mean_token_accuracy": 0.05575060956180096, "num_tokens": 305483.0, "step": 165 }, { "entropy": 9.449717140197754, "epoch": 0.014282713715605966, "grad_norm": 0.93359375, "learning_rate": 8.450000000000001e-05, "loss": 7.9165, "mean_token_accuracy": 0.058218777552247046, "num_tokens": 314000.0, "step": 170 }, { "entropy": 9.167982482910157, "epoch": 0.014702793530770846, "grad_norm": 1.1953125, "learning_rate": 8.7e-05, "loss": 7.8517, "mean_token_accuracy": 0.062257979065179825, "num_tokens": 323667.0, "step": 175 }, { "entropy": 8.951386070251464, "epoch": 0.015122873345935728, "grad_norm": 0.9296875, "learning_rate": 8.95e-05, "loss": 7.8029, "mean_token_accuracy": 0.06150264739990234, "num_tokens": 332695.0, "step": 180 }, { "entropy": 8.776250171661378, "epoch": 0.015542953161100609, "grad_norm": 0.9609375, "learning_rate": 9.2e-05, "loss": 7.643, "mean_token_accuracy": 0.05887415409088135, "num_tokens": 342428.0, "step": 185 }, { "entropy": 8.602806949615479, "epoch": 0.01596303297626549, "grad_norm": 0.79296875, "learning_rate": 9.45e-05, "loss": 7.7106, "mean_token_accuracy": 0.06374814324080944, "num_tokens": 353587.0, "step": 190 }, { "entropy": 8.474033164978028, "epoch": 0.01638311279143037, "grad_norm": 0.93359375, "learning_rate": 9.7e-05, "loss": 7.6401, "mean_token_accuracy": 0.06406850814819336, "num_tokens": 362997.0, "step": 195 }, { "entropy": 8.364265060424804, "epoch": 0.016803192606595255, "grad_norm": 0.95703125, "learning_rate": 9.95e-05, "loss": 7.6617, "mean_token_accuracy": 0.06993534453213215, "num_tokens": 372346.0, "step": 200 }, { "entropy": 8.375140285491943, "epoch": 0.017223272421760135, "grad_norm": 1.0, "learning_rate": 0.000102, "loss": 7.5334, "mean_token_accuracy": 0.06646758764982223, "num_tokens": 381575.0, "step": 205 }, { "entropy": 8.26815767288208, "epoch": 0.017643352236925015, "grad_norm": 0.90625, "learning_rate": 0.00010449999999999999, "loss": 7.5902, "mean_token_accuracy": 0.07085754275321961, "num_tokens": 390706.0, "step": 210 }, { "entropy": 8.218460845947266, "epoch": 0.018063432052089896, "grad_norm": 0.828125, "learning_rate": 0.000107, "loss": 7.5876, "mean_token_accuracy": 0.07221915200352669, "num_tokens": 400000.0, "step": 215 }, { "entropy": 8.139337062835693, "epoch": 0.01848351186725478, "grad_norm": 0.85546875, "learning_rate": 0.0001095, "loss": 7.5295, "mean_token_accuracy": 0.07644539698958397, "num_tokens": 409447.0, "step": 220 }, { "entropy": 8.122040271759033, "epoch": 0.01890359168241966, "grad_norm": 1.1328125, "learning_rate": 0.000112, "loss": 7.5068, "mean_token_accuracy": 0.07519292533397674, "num_tokens": 418417.0, "step": 225 }, { "entropy": 8.067694330215454, "epoch": 0.01932367149758454, "grad_norm": 0.9609375, "learning_rate": 0.0001145, "loss": 7.4664, "mean_token_accuracy": 0.07503528967499733, "num_tokens": 427619.0, "step": 230 }, { "entropy": 8.071773529052734, "epoch": 0.019743751312749424, "grad_norm": 0.96484375, "learning_rate": 0.00011700000000000001, "loss": 7.5131, "mean_token_accuracy": 0.07185145244002342, "num_tokens": 437931.0, "step": 235 }, { "entropy": 8.109980726242066, "epoch": 0.020163831127914304, "grad_norm": 0.9609375, "learning_rate": 0.00011949999999999999, "loss": 7.552, "mean_token_accuracy": 0.07611973807215691, "num_tokens": 447595.0, "step": 240 }, { "entropy": 8.026875400543213, "epoch": 0.020583910943079185, "grad_norm": 0.94921875, "learning_rate": 0.000122, "loss": 7.4164, "mean_token_accuracy": 0.07035953775048256, "num_tokens": 457062.0, "step": 245 }, { "entropy": 8.063331604003906, "epoch": 0.021003990758244065, "grad_norm": 1.015625, "learning_rate": 0.0001245, "loss": 7.5166, "mean_token_accuracy": 0.07237975299358368, "num_tokens": 466191.0, "step": 250 }, { "entropy": 8.050399017333984, "epoch": 0.02142407057340895, "grad_norm": 1.2734375, "learning_rate": 0.000127, "loss": 7.4443, "mean_token_accuracy": 0.07492763809859752, "num_tokens": 475693.0, "step": 255 }, { "entropy": 8.024266242980957, "epoch": 0.02184415038857383, "grad_norm": 1.0234375, "learning_rate": 0.0001295, "loss": 7.4691, "mean_token_accuracy": 0.07379123903810977, "num_tokens": 485173.0, "step": 260 }, { "entropy": 7.993921422958374, "epoch": 0.02226423020373871, "grad_norm": 0.99609375, "learning_rate": 0.000132, "loss": 7.3863, "mean_token_accuracy": 0.08008474782109261, "num_tokens": 493985.0, "step": 265 }, { "entropy": 7.907951974868775, "epoch": 0.022684310018903593, "grad_norm": 1.125, "learning_rate": 0.00013450000000000002, "loss": 7.4036, "mean_token_accuracy": 0.07586845718324184, "num_tokens": 502837.0, "step": 270 }, { "entropy": 7.981403732299805, "epoch": 0.023104389834068473, "grad_norm": 0.91015625, "learning_rate": 0.00013700000000000002, "loss": 7.3605, "mean_token_accuracy": 0.07924394458532333, "num_tokens": 511503.0, "step": 275 }, { "entropy": 7.977783203125, "epoch": 0.023524469649233354, "grad_norm": 0.92578125, "learning_rate": 0.0001395, "loss": 7.5335, "mean_token_accuracy": 0.0751778606325388, "num_tokens": 521499.0, "step": 280 }, { "entropy": 7.871473217010498, "epoch": 0.023944549464398234, "grad_norm": 1.0703125, "learning_rate": 0.00014199999999999998, "loss": 7.2955, "mean_token_accuracy": 0.0799000546336174, "num_tokens": 530067.0, "step": 285 }, { "entropy": 7.885423564910889, "epoch": 0.024364629279563118, "grad_norm": 0.921875, "learning_rate": 0.0001445, "loss": 7.2851, "mean_token_accuracy": 0.08089336939156055, "num_tokens": 538559.0, "step": 290 }, { "entropy": 7.956486988067627, "epoch": 0.024784709094728, "grad_norm": 1.0078125, "learning_rate": 0.000147, "loss": 7.4858, "mean_token_accuracy": 0.07482350952923297, "num_tokens": 547288.0, "step": 295 }, { "entropy": 7.870783424377441, "epoch": 0.02520478890989288, "grad_norm": 0.8828125, "learning_rate": 0.0001495, "loss": 7.3589, "mean_token_accuracy": 0.07514288201928139, "num_tokens": 557269.0, "step": 300 }, { "entropy": 7.939627742767334, "epoch": 0.025624868725057762, "grad_norm": 0.96484375, "learning_rate": 0.000152, "loss": 7.3914, "mean_token_accuracy": 0.07472754344344139, "num_tokens": 567280.0, "step": 305 }, { "entropy": 7.828274822235107, "epoch": 0.026044948540222643, "grad_norm": 0.91796875, "learning_rate": 0.00015450000000000001, "loss": 7.2341, "mean_token_accuracy": 0.07823858335614205, "num_tokens": 576609.0, "step": 310 }, { "entropy": 7.761577320098877, "epoch": 0.026465028355387523, "grad_norm": 1.046875, "learning_rate": 0.000157, "loss": 7.1336, "mean_token_accuracy": 0.08791142702102661, "num_tokens": 586053.0, "step": 315 }, { "entropy": 7.695616436004639, "epoch": 0.026885108170552403, "grad_norm": 0.94921875, "learning_rate": 0.0001595, "loss": 7.3339, "mean_token_accuracy": 0.08298731297254562, "num_tokens": 594649.0, "step": 320 }, { "entropy": 7.869348049163818, "epoch": 0.027305187985717287, "grad_norm": 1.109375, "learning_rate": 0.000162, "loss": 7.2862, "mean_token_accuracy": 0.07372522614896297, "num_tokens": 603445.0, "step": 325 }, { "entropy": 7.86638765335083, "epoch": 0.027725267800882167, "grad_norm": 1.0625, "learning_rate": 0.00016450000000000001, "loss": 7.3613, "mean_token_accuracy": 0.07848134562373162, "num_tokens": 613611.0, "step": 330 }, { "entropy": 7.971248960494995, "epoch": 0.028145347616047048, "grad_norm": 1.0703125, "learning_rate": 0.00016700000000000002, "loss": 7.5217, "mean_token_accuracy": 0.07931054159998893, "num_tokens": 623024.0, "step": 335 }, { "entropy": 7.725814580917358, "epoch": 0.02856542743121193, "grad_norm": 1.2734375, "learning_rate": 0.00016950000000000003, "loss": 7.225, "mean_token_accuracy": 0.08345521688461303, "num_tokens": 631624.0, "step": 340 }, { "entropy": 7.762637519836426, "epoch": 0.028985507246376812, "grad_norm": 1.0078125, "learning_rate": 0.00017199999999999998, "loss": 7.1844, "mean_token_accuracy": 0.08410112038254738, "num_tokens": 640473.0, "step": 345 }, { "entropy": 7.841788578033447, "epoch": 0.029405587061541692, "grad_norm": 1.0625, "learning_rate": 0.00017449999999999999, "loss": 7.3409, "mean_token_accuracy": 0.08037517666816711, "num_tokens": 649692.0, "step": 350 }, { "entropy": 7.800195980072021, "epoch": 0.029825666876706573, "grad_norm": 1.0390625, "learning_rate": 0.000177, "loss": 7.2995, "mean_token_accuracy": 0.08097823038697242, "num_tokens": 658236.0, "step": 355 }, { "entropy": 7.668969297409058, "epoch": 0.030245746691871456, "grad_norm": 1.0859375, "learning_rate": 0.0001795, "loss": 7.0948, "mean_token_accuracy": 0.08619136661291123, "num_tokens": 667175.0, "step": 360 }, { "entropy": 7.798488330841065, "epoch": 0.030665826507036337, "grad_norm": 1.125, "learning_rate": 0.000182, "loss": 7.3842, "mean_token_accuracy": 0.07823293879628182, "num_tokens": 676456.0, "step": 365 }, { "entropy": 7.812319660186768, "epoch": 0.031085906322201217, "grad_norm": 0.9765625, "learning_rate": 0.0001845, "loss": 7.3503, "mean_token_accuracy": 0.07726633399724961, "num_tokens": 686881.0, "step": 370 }, { "entropy": 7.688674831390381, "epoch": 0.0315059861373661, "grad_norm": 1.0234375, "learning_rate": 0.000187, "loss": 7.1373, "mean_token_accuracy": 0.0819906547665596, "num_tokens": 696045.0, "step": 375 }, { "entropy": 7.655067443847656, "epoch": 0.03192606595253098, "grad_norm": 1.1484375, "learning_rate": 0.0001895, "loss": 7.1112, "mean_token_accuracy": 0.08879919424653053, "num_tokens": 704729.0, "step": 380 }, { "entropy": 7.4980494499206545, "epoch": 0.032346145767695865, "grad_norm": 0.953125, "learning_rate": 0.000192, "loss": 7.1679, "mean_token_accuracy": 0.07921729236841202, "num_tokens": 714331.0, "step": 385 }, { "entropy": 7.735121536254883, "epoch": 0.03276622558286074, "grad_norm": 1.0625, "learning_rate": 0.0001945, "loss": 7.1229, "mean_token_accuracy": 0.08520057946443557, "num_tokens": 722788.0, "step": 390 }, { "entropy": 7.683975791931152, "epoch": 0.033186305398025626, "grad_norm": 1.2421875, "learning_rate": 0.00019700000000000002, "loss": 7.1944, "mean_token_accuracy": 0.08690556064248085, "num_tokens": 731417.0, "step": 395 }, { "entropy": 7.576824569702149, "epoch": 0.03360638521319051, "grad_norm": 0.9140625, "learning_rate": 0.00019950000000000002, "loss": 7.1549, "mean_token_accuracy": 0.08151165619492531, "num_tokens": 741034.0, "step": 400 }, { "entropy": 7.698281908035279, "epoch": 0.034026465028355386, "grad_norm": 0.9453125, "learning_rate": 0.000202, "loss": 7.156, "mean_token_accuracy": 0.08484743162989616, "num_tokens": 749596.0, "step": 405 }, { "entropy": 7.556124067306518, "epoch": 0.03444654484352027, "grad_norm": 0.921875, "learning_rate": 0.00020449999999999998, "loss": 7.1145, "mean_token_accuracy": 0.08153974264860153, "num_tokens": 758931.0, "step": 410 }, { "entropy": 7.533982944488526, "epoch": 0.03486662465868515, "grad_norm": 1.0390625, "learning_rate": 0.000207, "loss": 7.0206, "mean_token_accuracy": 0.09019657000899314, "num_tokens": 767534.0, "step": 415 }, { "entropy": 7.6061821460723875, "epoch": 0.03528670447385003, "grad_norm": 1.078125, "learning_rate": 0.0002095, "loss": 7.0789, "mean_token_accuracy": 0.08290171101689339, "num_tokens": 776456.0, "step": 420 }, { "entropy": 7.5107566833496096, "epoch": 0.035706784289014915, "grad_norm": 1.0078125, "learning_rate": 0.000212, "loss": 7.1362, "mean_token_accuracy": 0.08152465149760246, "num_tokens": 786172.0, "step": 425 }, { "entropy": 7.553678846359253, "epoch": 0.03612686410417979, "grad_norm": 0.97265625, "learning_rate": 0.0002145, "loss": 7.0139, "mean_token_accuracy": 0.09106989204883575, "num_tokens": 795081.0, "step": 430 }, { "entropy": 7.604944372177124, "epoch": 0.036546943919344675, "grad_norm": 1.03125, "learning_rate": 0.00021700000000000002, "loss": 7.0628, "mean_token_accuracy": 0.08461785838007926, "num_tokens": 804259.0, "step": 435 }, { "entropy": 7.534902191162109, "epoch": 0.03696702373450956, "grad_norm": 1.109375, "learning_rate": 0.0002195, "loss": 7.0873, "mean_token_accuracy": 0.08283074498176575, "num_tokens": 813463.0, "step": 440 }, { "entropy": 7.502531671524048, "epoch": 0.037387103549674436, "grad_norm": 1.046875, "learning_rate": 0.000222, "loss": 7.0035, "mean_token_accuracy": 0.09452007561922074, "num_tokens": 823029.0, "step": 445 }, { "entropy": 7.486780834197998, "epoch": 0.03780718336483932, "grad_norm": 1.015625, "learning_rate": 0.0002245, "loss": 7.0727, "mean_token_accuracy": 0.08529324010014534, "num_tokens": 832902.0, "step": 450 }, { "entropy": 7.476432847976684, "epoch": 0.0382272631800042, "grad_norm": 1.0, "learning_rate": 0.00022700000000000002, "loss": 7.0158, "mean_token_accuracy": 0.08854726403951645, "num_tokens": 842162.0, "step": 455 }, { "entropy": 7.52789797782898, "epoch": 0.03864734299516908, "grad_norm": 1.0625, "learning_rate": 0.00022950000000000002, "loss": 7.0493, "mean_token_accuracy": 0.08622511699795724, "num_tokens": 852328.0, "step": 460 }, { "entropy": 7.449561357498169, "epoch": 0.039067422810333964, "grad_norm": 1.046875, "learning_rate": 0.00023200000000000003, "loss": 7.0104, "mean_token_accuracy": 0.09133929386734962, "num_tokens": 860929.0, "step": 465 }, { "entropy": 7.458409357070923, "epoch": 0.03948750262549885, "grad_norm": 1.1015625, "learning_rate": 0.00023449999999999998, "loss": 7.0901, "mean_token_accuracy": 0.08522843271493911, "num_tokens": 869144.0, "step": 470 }, { "entropy": 7.584603118896484, "epoch": 0.039907582440663725, "grad_norm": 1.1484375, "learning_rate": 0.000237, "loss": 7.03, "mean_token_accuracy": 0.09454337358474732, "num_tokens": 877447.0, "step": 475 }, { "entropy": 7.431310081481934, "epoch": 0.04032766225582861, "grad_norm": 0.99609375, "learning_rate": 0.0002395, "loss": 6.9871, "mean_token_accuracy": 0.08733554184436798, "num_tokens": 887020.0, "step": 480 }, { "entropy": 7.453667879104614, "epoch": 0.040747742070993485, "grad_norm": 1.171875, "learning_rate": 0.000242, "loss": 7.0323, "mean_token_accuracy": 0.08681000843644142, "num_tokens": 895937.0, "step": 485 }, { "entropy": 7.41835618019104, "epoch": 0.04116782188615837, "grad_norm": 1.0234375, "learning_rate": 0.0002445, "loss": 7.0366, "mean_token_accuracy": 0.08261745497584343, "num_tokens": 905446.0, "step": 490 }, { "entropy": 7.464281463623047, "epoch": 0.04158790170132325, "grad_norm": 1.078125, "learning_rate": 0.000247, "loss": 6.9289, "mean_token_accuracy": 0.09576694294810295, "num_tokens": 914547.0, "step": 495 }, { "entropy": 7.421106290817261, "epoch": 0.04200798151648813, "grad_norm": 1.0703125, "learning_rate": 0.0002495, "loss": 6.9377, "mean_token_accuracy": 0.0962467186152935, "num_tokens": 922900.0, "step": 500 }, { "entropy": 7.401471900939941, "epoch": 0.042428061331653014, "grad_norm": 1.1484375, "learning_rate": 0.000252, "loss": 6.9572, "mean_token_accuracy": 0.09509932994842529, "num_tokens": 930876.0, "step": 505 }, { "entropy": 7.342588901519775, "epoch": 0.0428481411468179, "grad_norm": 0.98828125, "learning_rate": 0.0002545, "loss": 7.0021, "mean_token_accuracy": 0.09231638312339782, "num_tokens": 939871.0, "step": 510 }, { "entropy": 7.44086856842041, "epoch": 0.043268220961982774, "grad_norm": 1.1875, "learning_rate": 0.000257, "loss": 6.988, "mean_token_accuracy": 0.09245615154504776, "num_tokens": 948673.0, "step": 515 }, { "entropy": 7.274595832824707, "epoch": 0.04368830077714766, "grad_norm": 1.015625, "learning_rate": 0.0002595, "loss": 6.9409, "mean_token_accuracy": 0.08984568417072296, "num_tokens": 957603.0, "step": 520 }, { "entropy": 7.436605787277221, "epoch": 0.04410838059231254, "grad_norm": 1.1015625, "learning_rate": 0.000262, "loss": 7.0062, "mean_token_accuracy": 0.08319340422749519, "num_tokens": 967731.0, "step": 525 }, { "entropy": 7.435907888412475, "epoch": 0.04452846040747742, "grad_norm": 1.140625, "learning_rate": 0.00026450000000000003, "loss": 7.0032, "mean_token_accuracy": 0.09049810692667962, "num_tokens": 977427.0, "step": 530 }, { "entropy": 7.3634380340576175, "epoch": 0.0449485402226423, "grad_norm": 1.125, "learning_rate": 0.00026700000000000004, "loss": 6.9827, "mean_token_accuracy": 0.0860845424234867, "num_tokens": 986758.0, "step": 535 }, { "entropy": 7.425018453598023, "epoch": 0.045368620037807186, "grad_norm": 1.2578125, "learning_rate": 0.00026950000000000005, "loss": 6.9738, "mean_token_accuracy": 0.09986243322491646, "num_tokens": 996377.0, "step": 540 }, { "entropy": 7.333861589431763, "epoch": 0.04578869985297206, "grad_norm": 1.0859375, "learning_rate": 0.00027200000000000005, "loss": 7.0222, "mean_token_accuracy": 0.08520096391439438, "num_tokens": 1006483.0, "step": 545 }, { "entropy": 7.269639205932617, "epoch": 0.04620877966813695, "grad_norm": 0.984375, "learning_rate": 0.0002745, "loss": 6.9248, "mean_token_accuracy": 0.091129120439291, "num_tokens": 1016132.0, "step": 550 }, { "entropy": 7.3355879306793215, "epoch": 0.04662885948330183, "grad_norm": 1.171875, "learning_rate": 0.000277, "loss": 6.8796, "mean_token_accuracy": 0.09489664137363434, "num_tokens": 1024970.0, "step": 555 }, { "entropy": 7.3572368144989015, "epoch": 0.04704893929846671, "grad_norm": 0.96484375, "learning_rate": 0.0002795, "loss": 6.9525, "mean_token_accuracy": 0.09272714778780937, "num_tokens": 1034335.0, "step": 560 }, { "entropy": 7.423572063446045, "epoch": 0.04746901911363159, "grad_norm": 1.015625, "learning_rate": 0.00028199999999999997, "loss": 7.0075, "mean_token_accuracy": 0.09945140630006791, "num_tokens": 1043954.0, "step": 565 }, { "entropy": 7.319319725036621, "epoch": 0.04788909892879647, "grad_norm": 1.0234375, "learning_rate": 0.0002845, "loss": 6.9431, "mean_token_accuracy": 0.09524357318878174, "num_tokens": 1053554.0, "step": 570 }, { "entropy": 7.376662826538086, "epoch": 0.04830917874396135, "grad_norm": 1.0078125, "learning_rate": 0.000287, "loss": 6.8893, "mean_token_accuracy": 0.0956316351890564, "num_tokens": 1062008.0, "step": 575 }, { "entropy": 7.246560859680176, "epoch": 0.048729258559126236, "grad_norm": 1.1171875, "learning_rate": 0.0002895, "loss": 6.9602, "mean_token_accuracy": 0.09502239599823951, "num_tokens": 1070740.0, "step": 580 }, { "entropy": 7.361734390258789, "epoch": 0.04914933837429111, "grad_norm": 1.203125, "learning_rate": 0.000292, "loss": 6.9451, "mean_token_accuracy": 0.09238593950867653, "num_tokens": 1079681.0, "step": 585 }, { "entropy": 7.294089078903198, "epoch": 0.049569418189456, "grad_norm": 1.015625, "learning_rate": 0.0002945, "loss": 6.8326, "mean_token_accuracy": 0.09609337821602822, "num_tokens": 1088979.0, "step": 590 }, { "entropy": 7.192009592056275, "epoch": 0.04998949800462088, "grad_norm": 1.1171875, "learning_rate": 0.000297, "loss": 6.8381, "mean_token_accuracy": 0.09695586860179901, "num_tokens": 1097870.0, "step": 595 }, { "entropy": 7.285109043121338, "epoch": 0.05040957781978576, "grad_norm": 1.109375, "learning_rate": 0.0002995, "loss": 6.9361, "mean_token_accuracy": 0.09410082027316094, "num_tokens": 1107948.0, "step": 600 }, { "entropy": 7.2816235542297365, "epoch": 0.05082965763495064, "grad_norm": 1.109375, "learning_rate": 0.000302, "loss": 6.856, "mean_token_accuracy": 0.09758619442582131, "num_tokens": 1117032.0, "step": 605 }, { "entropy": 7.1946680545806885, "epoch": 0.051249737450115525, "grad_norm": 1.0078125, "learning_rate": 0.0003045, "loss": 6.8323, "mean_token_accuracy": 0.09758584424853325, "num_tokens": 1127834.0, "step": 610 }, { "entropy": 7.325930643081665, "epoch": 0.0516698172652804, "grad_norm": 1.234375, "learning_rate": 0.000307, "loss": 6.9314, "mean_token_accuracy": 0.10701763778924941, "num_tokens": 1137382.0, "step": 615 }, { "entropy": 7.191529178619385, "epoch": 0.052089897080445285, "grad_norm": 1.0546875, "learning_rate": 0.0003095, "loss": 6.7726, "mean_token_accuracy": 0.1016211412847042, "num_tokens": 1146095.0, "step": 620 }, { "entropy": 7.197086191177368, "epoch": 0.05250997689561017, "grad_norm": 1.0234375, "learning_rate": 0.000312, "loss": 6.8164, "mean_token_accuracy": 0.09977484568953514, "num_tokens": 1154981.0, "step": 625 }, { "entropy": 7.111207914352417, "epoch": 0.052930056710775046, "grad_norm": 1.203125, "learning_rate": 0.0003145, "loss": 6.822, "mean_token_accuracy": 0.09889646545052529, "num_tokens": 1164939.0, "step": 630 }, { "entropy": 7.286598014831543, "epoch": 0.05335013652593993, "grad_norm": 1.046875, "learning_rate": 0.000317, "loss": 6.9423, "mean_token_accuracy": 0.0905054323375225, "num_tokens": 1174991.0, "step": 635 }, { "entropy": 7.268424129486084, "epoch": 0.05377021634110481, "grad_norm": 0.98046875, "learning_rate": 0.0003195, "loss": 6.9893, "mean_token_accuracy": 0.09030458927154542, "num_tokens": 1184885.0, "step": 640 }, { "entropy": 7.25072751045227, "epoch": 0.05419029615626969, "grad_norm": 1.1640625, "learning_rate": 0.000322, "loss": 6.8843, "mean_token_accuracy": 0.09418094158172607, "num_tokens": 1193637.0, "step": 645 }, { "entropy": 7.144441413879394, "epoch": 0.054610375971434574, "grad_norm": 1.1328125, "learning_rate": 0.00032450000000000003, "loss": 6.6712, "mean_token_accuracy": 0.10373484939336777, "num_tokens": 1202188.0, "step": 650 }, { "entropy": 7.2327552318573, "epoch": 0.05503045578659945, "grad_norm": 1.1484375, "learning_rate": 0.00032700000000000003, "loss": 6.8046, "mean_token_accuracy": 0.09572408124804496, "num_tokens": 1210768.0, "step": 655 }, { "entropy": 7.196833848953247, "epoch": 0.055450535601764335, "grad_norm": 1.1171875, "learning_rate": 0.00032950000000000004, "loss": 6.8024, "mean_token_accuracy": 0.09782998114824296, "num_tokens": 1219819.0, "step": 660 }, { "entropy": 7.211909484863281, "epoch": 0.05587061541692922, "grad_norm": 0.91796875, "learning_rate": 0.00033200000000000005, "loss": 6.8553, "mean_token_accuracy": 0.09061138033866882, "num_tokens": 1229703.0, "step": 665 }, { "entropy": 7.242569494247436, "epoch": 0.056290695232094096, "grad_norm": 1.1796875, "learning_rate": 0.00033450000000000005, "loss": 6.8929, "mean_token_accuracy": 0.09304608702659607, "num_tokens": 1238942.0, "step": 670 }, { "entropy": 7.276552438735962, "epoch": 0.05671077504725898, "grad_norm": 1.015625, "learning_rate": 0.000337, "loss": 6.9316, "mean_token_accuracy": 0.09855509251356125, "num_tokens": 1248943.0, "step": 675 }, { "entropy": 7.130473899841308, "epoch": 0.05713085486242386, "grad_norm": 1.015625, "learning_rate": 0.0003395, "loss": 6.8196, "mean_token_accuracy": 0.09641827270388603, "num_tokens": 1257761.0, "step": 680 }, { "entropy": 7.069635629653931, "epoch": 0.05755093467758874, "grad_norm": 1.1328125, "learning_rate": 0.000342, "loss": 6.7531, "mean_token_accuracy": 0.09635655134916306, "num_tokens": 1267216.0, "step": 685 }, { "entropy": 7.244167423248291, "epoch": 0.057971014492753624, "grad_norm": 1.0703125, "learning_rate": 0.00034449999999999997, "loss": 6.8517, "mean_token_accuracy": 0.09775793552398682, "num_tokens": 1277210.0, "step": 690 }, { "entropy": 7.151098155975342, "epoch": 0.05839109430791851, "grad_norm": 1.078125, "learning_rate": 0.000347, "loss": 6.7848, "mean_token_accuracy": 0.09209914952516556, "num_tokens": 1285310.0, "step": 695 }, { "entropy": 7.133235788345337, "epoch": 0.058811174123083385, "grad_norm": 1.1015625, "learning_rate": 0.0003495, "loss": 6.7884, "mean_token_accuracy": 0.0997276745736599, "num_tokens": 1294421.0, "step": 700 }, { "entropy": 7.089715480804443, "epoch": 0.05923125393824827, "grad_norm": 1.0078125, "learning_rate": 0.000352, "loss": 6.6149, "mean_token_accuracy": 0.10670206919312478, "num_tokens": 1303281.0, "step": 705 }, { "entropy": 7.096017217636108, "epoch": 0.059651333753413145, "grad_norm": 1.3046875, "learning_rate": 0.0003545, "loss": 6.7841, "mean_token_accuracy": 0.1047137551009655, "num_tokens": 1312280.0, "step": 710 }, { "entropy": 7.01336669921875, "epoch": 0.06007141356857803, "grad_norm": 1.0390625, "learning_rate": 0.000357, "loss": 6.7519, "mean_token_accuracy": 0.09830996096134186, "num_tokens": 1321243.0, "step": 715 }, { "entropy": 7.150788021087647, "epoch": 0.06049149338374291, "grad_norm": 1.0234375, "learning_rate": 0.0003595, "loss": 6.8411, "mean_token_accuracy": 0.0983475923538208, "num_tokens": 1330324.0, "step": 720 }, { "entropy": 7.074830770492554, "epoch": 0.06091157319890779, "grad_norm": 1.140625, "learning_rate": 0.000362, "loss": 6.6865, "mean_token_accuracy": 0.1045832097530365, "num_tokens": 1339485.0, "step": 725 }, { "entropy": 7.180077934265137, "epoch": 0.06133165301407267, "grad_norm": 1.2578125, "learning_rate": 0.0003645, "loss": 6.8327, "mean_token_accuracy": 0.09178336262702942, "num_tokens": 1348640.0, "step": 730 }, { "entropy": 7.070912313461304, "epoch": 0.06175173282923756, "grad_norm": 1.203125, "learning_rate": 0.000367, "loss": 6.7313, "mean_token_accuracy": 0.10252036228775978, "num_tokens": 1357581.0, "step": 735 }, { "entropy": 7.097622108459473, "epoch": 0.062171812644402434, "grad_norm": 1.171875, "learning_rate": 0.0003695, "loss": 6.7976, "mean_token_accuracy": 0.09888288527727127, "num_tokens": 1367883.0, "step": 740 }, { "entropy": 7.072182083129883, "epoch": 0.06259189245956731, "grad_norm": 1.078125, "learning_rate": 0.000372, "loss": 6.7536, "mean_token_accuracy": 0.09760352596640587, "num_tokens": 1376936.0, "step": 745 }, { "entropy": 6.975026559829712, "epoch": 0.0630119722747322, "grad_norm": 1.15625, "learning_rate": 0.0003745, "loss": 6.6653, "mean_token_accuracy": 0.10172178596258163, "num_tokens": 1386359.0, "step": 750 }, { "entropy": 7.0470263957977295, "epoch": 0.06343205208989708, "grad_norm": 1.0234375, "learning_rate": 0.000377, "loss": 6.7205, "mean_token_accuracy": 0.10334330797195435, "num_tokens": 1395223.0, "step": 755 }, { "entropy": 7.237481212615966, "epoch": 0.06385213190506196, "grad_norm": 0.9375, "learning_rate": 0.0003795, "loss": 6.8854, "mean_token_accuracy": 0.09526007026433944, "num_tokens": 1404917.0, "step": 760 }, { "entropy": 7.060393810272217, "epoch": 0.06427221172022685, "grad_norm": 1.109375, "learning_rate": 0.000382, "loss": 6.7712, "mean_token_accuracy": 0.10844952017068862, "num_tokens": 1413348.0, "step": 765 }, { "entropy": 7.010181617736817, "epoch": 0.06469229153539173, "grad_norm": 1.109375, "learning_rate": 0.0003845, "loss": 6.751, "mean_token_accuracy": 0.0988110676407814, "num_tokens": 1421726.0, "step": 770 }, { "entropy": 7.068030214309692, "epoch": 0.0651123713505566, "grad_norm": 1.015625, "learning_rate": 0.00038700000000000003, "loss": 6.7626, "mean_token_accuracy": 0.10152493417263031, "num_tokens": 1430686.0, "step": 775 }, { "entropy": 7.124918842315674, "epoch": 0.06553245116572148, "grad_norm": 1.1015625, "learning_rate": 0.00038950000000000003, "loss": 6.7567, "mean_token_accuracy": 0.10261558443307876, "num_tokens": 1439499.0, "step": 780 }, { "entropy": 7.08576397895813, "epoch": 0.06595253098088637, "grad_norm": 1.1953125, "learning_rate": 0.00039200000000000004, "loss": 6.7308, "mean_token_accuracy": 0.10436978489160538, "num_tokens": 1448220.0, "step": 785 }, { "entropy": 6.918930721282959, "epoch": 0.06637261079605125, "grad_norm": 1.0, "learning_rate": 0.00039450000000000005, "loss": 6.7623, "mean_token_accuracy": 0.09306630715727807, "num_tokens": 1458217.0, "step": 790 }, { "entropy": 7.050667333602905, "epoch": 0.06679269061121614, "grad_norm": 1.0703125, "learning_rate": 0.00039700000000000005, "loss": 6.6615, "mean_token_accuracy": 0.10148273557424545, "num_tokens": 1467422.0, "step": 795 }, { "entropy": 7.04574761390686, "epoch": 0.06721277042638102, "grad_norm": 1.03125, "learning_rate": 0.0003995, "loss": 6.6428, "mean_token_accuracy": 0.10174536257982254, "num_tokens": 1476152.0, "step": 800 }, { "entropy": 6.920849370956421, "epoch": 0.06763285024154589, "grad_norm": 1.140625, "learning_rate": 0.000402, "loss": 6.7303, "mean_token_accuracy": 0.09813930094242096, "num_tokens": 1485248.0, "step": 805 }, { "entropy": 7.021937704086303, "epoch": 0.06805293005671077, "grad_norm": 1.09375, "learning_rate": 0.0004045, "loss": 6.6965, "mean_token_accuracy": 0.10005066767334939, "num_tokens": 1494248.0, "step": 810 }, { "entropy": 7.009239387512207, "epoch": 0.06847300987187566, "grad_norm": 1.078125, "learning_rate": 0.00040699999999999997, "loss": 6.7988, "mean_token_accuracy": 0.10206111744046212, "num_tokens": 1503565.0, "step": 815 }, { "entropy": 7.153907108306885, "epoch": 0.06889308968704054, "grad_norm": 1.046875, "learning_rate": 0.0004095, "loss": 6.8967, "mean_token_accuracy": 0.09253153279423713, "num_tokens": 1513227.0, "step": 820 }, { "entropy": 7.081949377059937, "epoch": 0.06931316950220542, "grad_norm": 1.0625, "learning_rate": 0.000412, "loss": 6.6785, "mean_token_accuracy": 0.10418465957045556, "num_tokens": 1522312.0, "step": 825 }, { "entropy": 6.934855031967163, "epoch": 0.0697332493173703, "grad_norm": 1.09375, "learning_rate": 0.0004145, "loss": 6.6359, "mean_token_accuracy": 0.1031254269182682, "num_tokens": 1531720.0, "step": 830 }, { "entropy": 6.970464134216309, "epoch": 0.07015332913253518, "grad_norm": 1.09375, "learning_rate": 0.000417, "loss": 6.7192, "mean_token_accuracy": 0.09493932947516441, "num_tokens": 1541238.0, "step": 835 }, { "entropy": 7.103578281402588, "epoch": 0.07057340894770006, "grad_norm": 1.1015625, "learning_rate": 0.0004195, "loss": 6.8114, "mean_token_accuracy": 0.0987453043460846, "num_tokens": 1550875.0, "step": 840 }, { "entropy": 6.948361873626709, "epoch": 0.07099348876286495, "grad_norm": 1.0234375, "learning_rate": 0.000422, "loss": 6.7522, "mean_token_accuracy": 0.10080962181091309, "num_tokens": 1560287.0, "step": 845 }, { "entropy": 6.981166744232178, "epoch": 0.07141356857802983, "grad_norm": 1.0546875, "learning_rate": 0.0004245, "loss": 6.6378, "mean_token_accuracy": 0.10372715294361115, "num_tokens": 1569043.0, "step": 850 }, { "entropy": 6.902826881408691, "epoch": 0.07183364839319471, "grad_norm": 1.0546875, "learning_rate": 0.000427, "loss": 6.6697, "mean_token_accuracy": 0.10197147876024246, "num_tokens": 1578112.0, "step": 855 }, { "entropy": 6.874331331253051, "epoch": 0.07225372820835958, "grad_norm": 1.1015625, "learning_rate": 0.0004295, "loss": 6.5725, "mean_token_accuracy": 0.1078405149281025, "num_tokens": 1586587.0, "step": 860 }, { "entropy": 7.059461355209351, "epoch": 0.07267380802352447, "grad_norm": 1.078125, "learning_rate": 0.000432, "loss": 6.7397, "mean_token_accuracy": 0.09989926218986511, "num_tokens": 1595585.0, "step": 865 }, { "entropy": 6.951946210861206, "epoch": 0.07309388783868935, "grad_norm": 1.09375, "learning_rate": 0.0004345, "loss": 6.6946, "mean_token_accuracy": 0.10353797450661659, "num_tokens": 1605355.0, "step": 870 }, { "entropy": 6.944614362716675, "epoch": 0.07351396765385423, "grad_norm": 1.1328125, "learning_rate": 0.000437, "loss": 6.7108, "mean_token_accuracy": 0.09883329644799232, "num_tokens": 1613637.0, "step": 875 }, { "entropy": 6.975859832763672, "epoch": 0.07393404746901912, "grad_norm": 1.109375, "learning_rate": 0.0004395, "loss": 6.6703, "mean_token_accuracy": 0.10343916267156601, "num_tokens": 1622731.0, "step": 880 }, { "entropy": 7.003747940063477, "epoch": 0.074354127284184, "grad_norm": 1.0390625, "learning_rate": 0.000442, "loss": 6.6373, "mean_token_accuracy": 0.10040950924158096, "num_tokens": 1632098.0, "step": 885 }, { "entropy": 6.826285457611084, "epoch": 0.07477420709934887, "grad_norm": 0.96484375, "learning_rate": 0.0004445, "loss": 6.6454, "mean_token_accuracy": 0.09755287617444992, "num_tokens": 1641259.0, "step": 890 }, { "entropy": 7.0150947093963625, "epoch": 0.07519428691451376, "grad_norm": 1.1875, "learning_rate": 0.000447, "loss": 6.7262, "mean_token_accuracy": 0.09560549557209015, "num_tokens": 1651362.0, "step": 895 }, { "entropy": 6.897852563858033, "epoch": 0.07561436672967864, "grad_norm": 1.171875, "learning_rate": 0.00044950000000000003, "loss": 6.6487, "mean_token_accuracy": 0.10112505033612251, "num_tokens": 1660190.0, "step": 900 }, { "entropy": 6.90705189704895, "epoch": 0.07603444654484352, "grad_norm": 1.1953125, "learning_rate": 0.00045200000000000004, "loss": 6.663, "mean_token_accuracy": 0.10142350941896439, "num_tokens": 1669020.0, "step": 905 }, { "entropy": 6.973592853546142, "epoch": 0.0764545263600084, "grad_norm": 1.140625, "learning_rate": 0.00045450000000000004, "loss": 6.6861, "mean_token_accuracy": 0.1048488400876522, "num_tokens": 1678158.0, "step": 910 }, { "entropy": 6.985338020324707, "epoch": 0.07687460617517328, "grad_norm": 1.1328125, "learning_rate": 0.00045700000000000005, "loss": 6.7084, "mean_token_accuracy": 0.10136276260018348, "num_tokens": 1687481.0, "step": 915 }, { "entropy": 6.876794004440308, "epoch": 0.07729468599033816, "grad_norm": 1.140625, "learning_rate": 0.00045950000000000006, "loss": 6.6666, "mean_token_accuracy": 0.10845559537410736, "num_tokens": 1696782.0, "step": 920 }, { "entropy": 6.932897567749023, "epoch": 0.07771476580550304, "grad_norm": 1.0390625, "learning_rate": 0.000462, "loss": 6.6725, "mean_token_accuracy": 0.10497085899114608, "num_tokens": 1706153.0, "step": 925 }, { "entropy": 6.9077776908874515, "epoch": 0.07813484562066793, "grad_norm": 1.0078125, "learning_rate": 0.0004645, "loss": 6.6889, "mean_token_accuracy": 0.10281107649207115, "num_tokens": 1715585.0, "step": 930 }, { "entropy": 7.106683778762817, "epoch": 0.07855492543583281, "grad_norm": 1.3359375, "learning_rate": 0.000467, "loss": 6.8042, "mean_token_accuracy": 0.10099845305085182, "num_tokens": 1724857.0, "step": 935 }, { "entropy": 6.858903789520264, "epoch": 0.0789750052509977, "grad_norm": 1.15625, "learning_rate": 0.0004695, "loss": 6.6175, "mean_token_accuracy": 0.10900806412100791, "num_tokens": 1733528.0, "step": 940 }, { "entropy": 7.006282758712769, "epoch": 0.07939508506616257, "grad_norm": 0.9140625, "learning_rate": 0.000472, "loss": 6.7383, "mean_token_accuracy": 0.10379872918128967, "num_tokens": 1742953.0, "step": 945 }, { "entropy": 6.92790584564209, "epoch": 0.07981516488132745, "grad_norm": 1.1015625, "learning_rate": 0.0004745, "loss": 6.6988, "mean_token_accuracy": 0.10636084228754043, "num_tokens": 1752155.0, "step": 950 }, { "entropy": 6.911950254440308, "epoch": 0.08023524469649233, "grad_norm": 1.171875, "learning_rate": 0.000477, "loss": 6.5687, "mean_token_accuracy": 0.10838210806250573, "num_tokens": 1760562.0, "step": 955 }, { "entropy": 6.83457088470459, "epoch": 0.08065532451165722, "grad_norm": 1.1796875, "learning_rate": 0.0004795, "loss": 6.5891, "mean_token_accuracy": 0.10088410004973411, "num_tokens": 1769631.0, "step": 960 }, { "entropy": 6.914610385894775, "epoch": 0.0810754043268221, "grad_norm": 1.21875, "learning_rate": 0.000482, "loss": 6.6346, "mean_token_accuracy": 0.10217849463224411, "num_tokens": 1779080.0, "step": 965 }, { "entropy": 6.8898755550384525, "epoch": 0.08149548414198697, "grad_norm": 1.296875, "learning_rate": 0.0004845, "loss": 6.6271, "mean_token_accuracy": 0.10570115596055984, "num_tokens": 1787830.0, "step": 970 }, { "entropy": 6.751455068588257, "epoch": 0.08191556395715185, "grad_norm": 1.125, "learning_rate": 0.000487, "loss": 6.5346, "mean_token_accuracy": 0.10223312452435493, "num_tokens": 1796998.0, "step": 975 }, { "entropy": 6.8943780899047855, "epoch": 0.08233564377231674, "grad_norm": 1.0625, "learning_rate": 0.0004895, "loss": 6.6202, "mean_token_accuracy": 0.10597362667322159, "num_tokens": 1806194.0, "step": 980 }, { "entropy": 6.700069093704224, "epoch": 0.08275572358748162, "grad_norm": 0.9609375, "learning_rate": 0.000492, "loss": 6.5072, "mean_token_accuracy": 0.10932167768478393, "num_tokens": 1815751.0, "step": 985 }, { "entropy": 6.749313592910767, "epoch": 0.0831758034026465, "grad_norm": 0.953125, "learning_rate": 0.0004945, "loss": 6.5857, "mean_token_accuracy": 0.10682184919714928, "num_tokens": 1825379.0, "step": 990 }, { "entropy": 6.845586490631104, "epoch": 0.08359588321781139, "grad_norm": 1.1328125, "learning_rate": 0.000497, "loss": 6.5541, "mean_token_accuracy": 0.10507402196526527, "num_tokens": 1834158.0, "step": 995 }, { "entropy": 6.844553852081299, "epoch": 0.08401596303297626, "grad_norm": 1.0625, "learning_rate": 0.0004995, "loss": 6.5161, "mean_token_accuracy": 0.10857650190591812, "num_tokens": 1842724.0, "step": 1000 }, { "entropy": 6.795124101638794, "epoch": 0.08443604284814114, "grad_norm": 1.046875, "learning_rate": 0.000499999998724557, "loss": 6.5362, "mean_token_accuracy": 0.10392995700240135, "num_tokens": 1852485.0, "step": 1005 }, { "entropy": 6.765092468261718, "epoch": 0.08485612266330603, "grad_norm": 1.109375, "learning_rate": 0.0004999999935430703, "loss": 6.575, "mean_token_accuracy": 0.10723726153373718, "num_tokens": 1861303.0, "step": 1010 }, { "entropy": 6.745694637298584, "epoch": 0.08527620247847091, "grad_norm": 1.125, "learning_rate": 0.0004999999843758243, "loss": 6.5409, "mean_token_accuracy": 0.1151320680975914, "num_tokens": 1870859.0, "step": 1015 }, { "entropy": 6.8996889114379885, "epoch": 0.0856962822936358, "grad_norm": 1.0078125, "learning_rate": 0.0004999999712228196, "loss": 6.7032, "mean_token_accuracy": 0.10041022300720215, "num_tokens": 1880295.0, "step": 1020 }, { "entropy": 6.899116802215576, "epoch": 0.08611636210880068, "grad_norm": 1.09375, "learning_rate": 0.0004999999540840562, "loss": 6.6176, "mean_token_accuracy": 0.10147540494799615, "num_tokens": 1889193.0, "step": 1025 }, { "entropy": 6.797919845581054, "epoch": 0.08653644192396555, "grad_norm": 1.0625, "learning_rate": 0.0004999999329595345, "loss": 6.709, "mean_token_accuracy": 0.09875654354691506, "num_tokens": 1899437.0, "step": 1030 }, { "entropy": 6.910034608840943, "epoch": 0.08695652173913043, "grad_norm": 1.03125, "learning_rate": 0.0004999999078492548, "loss": 6.6032, "mean_token_accuracy": 0.10777303576469421, "num_tokens": 1907882.0, "step": 1035 }, { "entropy": 6.728742361068726, "epoch": 0.08737660155429532, "grad_norm": 0.9375, "learning_rate": 0.0004999998787532176, "loss": 6.5131, "mean_token_accuracy": 0.1080910786986351, "num_tokens": 1916872.0, "step": 1040 }, { "entropy": 6.86653618812561, "epoch": 0.0877966813694602, "grad_norm": 1.0625, "learning_rate": 0.0004999998456714234, "loss": 6.6681, "mean_token_accuracy": 0.1074354499578476, "num_tokens": 1926636.0, "step": 1045 }, { "entropy": 6.773524904251099, "epoch": 0.08821676118462508, "grad_norm": 1.1640625, "learning_rate": 0.0004999998086038729, "loss": 6.5697, "mean_token_accuracy": 0.108617003262043, "num_tokens": 1935962.0, "step": 1050 }, { "entropy": 6.809631824493408, "epoch": 0.08863684099978995, "grad_norm": 1.078125, "learning_rate": 0.0004999997675505665, "loss": 6.5493, "mean_token_accuracy": 0.10353536382317544, "num_tokens": 1944600.0, "step": 1055 }, { "entropy": 6.8208941459655765, "epoch": 0.08905692081495484, "grad_norm": 1.015625, "learning_rate": 0.0004999997225115052, "loss": 6.7156, "mean_token_accuracy": 0.10389059409499168, "num_tokens": 1954234.0, "step": 1060 }, { "entropy": 6.95792784690857, "epoch": 0.08947700063011972, "grad_norm": 1.0625, "learning_rate": 0.0004999996734866896, "loss": 6.677, "mean_token_accuracy": 0.10057736709713935, "num_tokens": 1964499.0, "step": 1065 }, { "entropy": 6.662513589859008, "epoch": 0.0898970804452846, "grad_norm": 1.1640625, "learning_rate": 0.0004999996204761206, "loss": 6.3883, "mean_token_accuracy": 0.11360553354024887, "num_tokens": 1973635.0, "step": 1070 }, { "entropy": 6.745052719116211, "epoch": 0.09031716026044949, "grad_norm": 0.95703125, "learning_rate": 0.0004999995634797993, "loss": 6.5278, "mean_token_accuracy": 0.1087425634264946, "num_tokens": 1983509.0, "step": 1075 }, { "entropy": 6.769761800765991, "epoch": 0.09073724007561437, "grad_norm": 1.1484375, "learning_rate": 0.0004999995024977265, "loss": 6.5385, "mean_token_accuracy": 0.11216638460755349, "num_tokens": 1992336.0, "step": 1080 }, { "entropy": 6.855973386764527, "epoch": 0.09115731989077924, "grad_norm": 0.99609375, "learning_rate": 0.0004999994375299034, "loss": 6.5509, "mean_token_accuracy": 0.1137130968272686, "num_tokens": 2001931.0, "step": 1085 }, { "entropy": 6.615939617156982, "epoch": 0.09157739970594413, "grad_norm": 0.98828125, "learning_rate": 0.000499999368576331, "loss": 6.4174, "mean_token_accuracy": 0.11283476129174233, "num_tokens": 2010935.0, "step": 1090 }, { "entropy": 6.7152961730957035, "epoch": 0.09199747952110901, "grad_norm": 1.109375, "learning_rate": 0.0004999992956370109, "loss": 6.4684, "mean_token_accuracy": 0.11342488676309585, "num_tokens": 2020587.0, "step": 1095 }, { "entropy": 6.688837385177612, "epoch": 0.0924175593362739, "grad_norm": 1.046875, "learning_rate": 0.000499999218711944, "loss": 6.5046, "mean_token_accuracy": 0.10743609666824341, "num_tokens": 2029743.0, "step": 1100 }, { "entropy": 6.771305274963379, "epoch": 0.09283763915143878, "grad_norm": 1.1484375, "learning_rate": 0.0004999991378011317, "loss": 6.5286, "mean_token_accuracy": 0.11453117504715919, "num_tokens": 2038468.0, "step": 1105 }, { "entropy": 6.67022180557251, "epoch": 0.09325771896660366, "grad_norm": 1.046875, "learning_rate": 0.0004999990529045757, "loss": 6.4451, "mean_token_accuracy": 0.11554965823888778, "num_tokens": 2047456.0, "step": 1110 }, { "entropy": 6.870058679580689, "epoch": 0.09367779878176853, "grad_norm": 0.9765625, "learning_rate": 0.0004999989640222771, "loss": 6.7458, "mean_token_accuracy": 0.09942527562379837, "num_tokens": 2056691.0, "step": 1115 }, { "entropy": 6.829685544967651, "epoch": 0.09409787859693342, "grad_norm": 1.03125, "learning_rate": 0.000499998871154238, "loss": 6.5487, "mean_token_accuracy": 0.10888865366578102, "num_tokens": 2066068.0, "step": 1120 }, { "entropy": 6.725253868103027, "epoch": 0.0945179584120983, "grad_norm": 1.015625, "learning_rate": 0.0004999987743004597, "loss": 6.4837, "mean_token_accuracy": 0.11379996240139008, "num_tokens": 2075113.0, "step": 1125 }, { "entropy": 6.7777934074401855, "epoch": 0.09493803822726318, "grad_norm": 0.9609375, "learning_rate": 0.0004999986734609438, "loss": 6.6044, "mean_token_accuracy": 0.11070828661322593, "num_tokens": 2084557.0, "step": 1130 }, { "entropy": 6.817347526550293, "epoch": 0.09535811804242807, "grad_norm": 1.0625, "learning_rate": 0.0004999985686356923, "loss": 6.497, "mean_token_accuracy": 0.10584703534841537, "num_tokens": 2093424.0, "step": 1135 }, { "entropy": 6.7462608337402346, "epoch": 0.09577819785759294, "grad_norm": 1.03125, "learning_rate": 0.000499998459824707, "loss": 6.6329, "mean_token_accuracy": 0.10303654298186302, "num_tokens": 2103066.0, "step": 1140 }, { "entropy": 6.799277830123901, "epoch": 0.09619827767275782, "grad_norm": 1.046875, "learning_rate": 0.00049999834702799, "loss": 6.5085, "mean_token_accuracy": 0.11131441742181777, "num_tokens": 2112447.0, "step": 1145 }, { "entropy": 6.711055421829224, "epoch": 0.0966183574879227, "grad_norm": 0.9375, "learning_rate": 0.0004999982302455431, "loss": 6.52, "mean_token_accuracy": 0.11281892731785774, "num_tokens": 2121949.0, "step": 1150 }, { "entropy": 6.780323314666748, "epoch": 0.09703843730308759, "grad_norm": 1.015625, "learning_rate": 0.0004999981094773683, "loss": 6.4157, "mean_token_accuracy": 0.1144998162984848, "num_tokens": 2130464.0, "step": 1155 }, { "entropy": 6.697625207901001, "epoch": 0.09745851711825247, "grad_norm": 1.140625, "learning_rate": 0.000499997984723468, "loss": 6.5921, "mean_token_accuracy": 0.1068018026649952, "num_tokens": 2139577.0, "step": 1160 }, { "entropy": 6.569090557098389, "epoch": 0.09787859693341736, "grad_norm": 0.96484375, "learning_rate": 0.0004999978559838441, "loss": 6.3121, "mean_token_accuracy": 0.11300956755876541, "num_tokens": 2147919.0, "step": 1165 }, { "entropy": 6.716167974472046, "epoch": 0.09829867674858223, "grad_norm": 1.0390625, "learning_rate": 0.0004999977232584991, "loss": 6.4791, "mean_token_accuracy": 0.11262017637491226, "num_tokens": 2156936.0, "step": 1170 }, { "entropy": 6.6336616516113285, "epoch": 0.09871875656374711, "grad_norm": 1.0859375, "learning_rate": 0.0004999975865474354, "loss": 6.5492, "mean_token_accuracy": 0.10994603037834168, "num_tokens": 2165362.0, "step": 1175 }, { "entropy": 6.719806575775147, "epoch": 0.099138836378912, "grad_norm": 1.1796875, "learning_rate": 0.0004999974458506551, "loss": 6.4705, "mean_token_accuracy": 0.11214353889226913, "num_tokens": 2173665.0, "step": 1180 }, { "entropy": 6.786266422271728, "epoch": 0.09955891619407688, "grad_norm": 1.2578125, "learning_rate": 0.000499997301168161, "loss": 6.4531, "mean_token_accuracy": 0.11377902403473854, "num_tokens": 2182222.0, "step": 1185 }, { "entropy": 6.670177459716797, "epoch": 0.09997899600924176, "grad_norm": 0.9609375, "learning_rate": 0.0004999971524999556, "loss": 6.528, "mean_token_accuracy": 0.11228533461689949, "num_tokens": 2192358.0, "step": 1190 }, { "entropy": 6.779563045501709, "epoch": 0.10039907582440663, "grad_norm": 1.03125, "learning_rate": 0.0004999969998460414, "loss": 6.5039, "mean_token_accuracy": 0.10956505164504052, "num_tokens": 2201889.0, "step": 1195 }, { "entropy": 6.6560157299041744, "epoch": 0.10081915563957151, "grad_norm": 1.3359375, "learning_rate": 0.0004999968432064213, "loss": 6.5232, "mean_token_accuracy": 0.11500915959477424, "num_tokens": 2211810.0, "step": 1200 }, { "entropy": 6.652071762084961, "epoch": 0.1012392354547364, "grad_norm": 0.921875, "learning_rate": 0.0004999966825810979, "loss": 6.4474, "mean_token_accuracy": 0.11259665861725807, "num_tokens": 2221123.0, "step": 1205 }, { "entropy": 6.634405040740967, "epoch": 0.10165931526990128, "grad_norm": 1.0703125, "learning_rate": 0.0004999965179700742, "loss": 6.402, "mean_token_accuracy": 0.1181789293885231, "num_tokens": 2230129.0, "step": 1210 }, { "entropy": 6.625933122634888, "epoch": 0.10207939508506617, "grad_norm": 1.03125, "learning_rate": 0.000499996349373353, "loss": 6.4624, "mean_token_accuracy": 0.11246607527136802, "num_tokens": 2239929.0, "step": 1215 }, { "entropy": 6.709180927276611, "epoch": 0.10249947490023105, "grad_norm": 1.0390625, "learning_rate": 0.0004999961767909374, "loss": 6.4292, "mean_token_accuracy": 0.11479318514466286, "num_tokens": 2248078.0, "step": 1220 }, { "entropy": 6.59263162612915, "epoch": 0.10291955471539592, "grad_norm": 1.0625, "learning_rate": 0.0004999960002228303, "loss": 6.5262, "mean_token_accuracy": 0.11000767946243287, "num_tokens": 2256975.0, "step": 1225 }, { "entropy": 6.708470964431763, "epoch": 0.1033396345305608, "grad_norm": 1.15625, "learning_rate": 0.0004999958196690349, "loss": 6.3792, "mean_token_accuracy": 0.11624118462204933, "num_tokens": 2265797.0, "step": 1230 }, { "entropy": 6.645881128311157, "epoch": 0.10375971434572569, "grad_norm": 1.0234375, "learning_rate": 0.0004999956351295545, "loss": 6.4736, "mean_token_accuracy": 0.1176276110112667, "num_tokens": 2274099.0, "step": 1235 }, { "entropy": 6.599815797805786, "epoch": 0.10417979416089057, "grad_norm": 1.03125, "learning_rate": 0.0004999954466043922, "loss": 6.3853, "mean_token_accuracy": 0.11810432821512222, "num_tokens": 2282360.0, "step": 1240 }, { "entropy": 6.57668776512146, "epoch": 0.10459987397605545, "grad_norm": 0.96875, "learning_rate": 0.0004999952540935514, "loss": 6.4891, "mean_token_accuracy": 0.11048517748713493, "num_tokens": 2292714.0, "step": 1245 }, { "entropy": 6.675060033798218, "epoch": 0.10501995379122034, "grad_norm": 1.0859375, "learning_rate": 0.0004999950575970356, "loss": 6.4361, "mean_token_accuracy": 0.11576245203614235, "num_tokens": 2301633.0, "step": 1250 }, { "entropy": 6.642887592315674, "epoch": 0.10544003360638521, "grad_norm": 1.0234375, "learning_rate": 0.0004999948571148482, "loss": 6.3931, "mean_token_accuracy": 0.12049147412180901, "num_tokens": 2310067.0, "step": 1255 }, { "entropy": 6.610925579071045, "epoch": 0.10586011342155009, "grad_norm": 1.046875, "learning_rate": 0.0004999946526469927, "loss": 6.4927, "mean_token_accuracy": 0.11412879601120948, "num_tokens": 2320090.0, "step": 1260 }, { "entropy": 6.649963521957398, "epoch": 0.10628019323671498, "grad_norm": 1.03125, "learning_rate": 0.0004999944441934728, "loss": 6.4451, "mean_token_accuracy": 0.11852803751826287, "num_tokens": 2329255.0, "step": 1265 }, { "entropy": 6.678138732910156, "epoch": 0.10670027305187986, "grad_norm": 1.109375, "learning_rate": 0.0004999942317542922, "loss": 6.5261, "mean_token_accuracy": 0.11407028958201408, "num_tokens": 2339535.0, "step": 1270 }, { "entropy": 6.635104560852051, "epoch": 0.10712035286704474, "grad_norm": 1.0546875, "learning_rate": 0.0004999940153294546, "loss": 6.425, "mean_token_accuracy": 0.11798783987760544, "num_tokens": 2348948.0, "step": 1275 }, { "entropy": 6.629437446594238, "epoch": 0.10754043268220961, "grad_norm": 0.99609375, "learning_rate": 0.000499993794918964, "loss": 6.4518, "mean_token_accuracy": 0.10851866900920867, "num_tokens": 2359141.0, "step": 1280 }, { "entropy": 6.612447357177734, "epoch": 0.1079605124973745, "grad_norm": 1.1875, "learning_rate": 0.0004999935705228241, "loss": 6.5007, "mean_token_accuracy": 0.10988411605358124, "num_tokens": 2368906.0, "step": 1285 }, { "entropy": 6.720192527770996, "epoch": 0.10838059231253938, "grad_norm": 1.15625, "learning_rate": 0.0004999933421410389, "loss": 6.4756, "mean_token_accuracy": 0.11632761880755424, "num_tokens": 2377029.0, "step": 1290 }, { "entropy": 6.682251882553101, "epoch": 0.10880067212770426, "grad_norm": 0.84765625, "learning_rate": 0.0004999931097736125, "loss": 6.5226, "mean_token_accuracy": 0.10841714516282082, "num_tokens": 2387088.0, "step": 1295 }, { "entropy": 6.616416501998901, "epoch": 0.10922075194286915, "grad_norm": 1.015625, "learning_rate": 0.0004999928734205492, "loss": 6.4358, "mean_token_accuracy": 0.11085559725761414, "num_tokens": 2395596.0, "step": 1300 }, { "entropy": 6.630216932296753, "epoch": 0.10964083175803403, "grad_norm": 1.09375, "learning_rate": 0.0004999926330818528, "loss": 6.4278, "mean_token_accuracy": 0.11868382543325424, "num_tokens": 2404506.0, "step": 1305 }, { "entropy": 6.615355587005615, "epoch": 0.1100609115731989, "grad_norm": 1.109375, "learning_rate": 0.0004999923887575278, "loss": 6.4742, "mean_token_accuracy": 0.11464583277702331, "num_tokens": 2414342.0, "step": 1310 }, { "entropy": 6.68165545463562, "epoch": 0.11048099138836379, "grad_norm": 1.0859375, "learning_rate": 0.0004999921404475785, "loss": 6.4271, "mean_token_accuracy": 0.11960532069206238, "num_tokens": 2423076.0, "step": 1315 }, { "entropy": 6.567938899993896, "epoch": 0.11090107120352867, "grad_norm": 0.91796875, "learning_rate": 0.0004999918881520093, "loss": 6.3809, "mean_token_accuracy": 0.1204459622502327, "num_tokens": 2432492.0, "step": 1320 }, { "entropy": 6.610611057281494, "epoch": 0.11132115101869355, "grad_norm": 0.96875, "learning_rate": 0.0004999916318708246, "loss": 6.3447, "mean_token_accuracy": 0.1213211365044117, "num_tokens": 2441916.0, "step": 1325 }, { "entropy": 6.550094270706177, "epoch": 0.11174123083385844, "grad_norm": 1.1015625, "learning_rate": 0.0004999913716040291, "loss": 6.4, "mean_token_accuracy": 0.11803905665874481, "num_tokens": 2450932.0, "step": 1330 }, { "entropy": 6.5825268745422365, "epoch": 0.11216131064902331, "grad_norm": 1.0859375, "learning_rate": 0.0004999911073516272, "loss": 6.4156, "mean_token_accuracy": 0.11501810997724533, "num_tokens": 2460058.0, "step": 1335 }, { "entropy": 6.541036558151245, "epoch": 0.11258139046418819, "grad_norm": 0.98046875, "learning_rate": 0.0004999908391136237, "loss": 6.3486, "mean_token_accuracy": 0.11862518936395645, "num_tokens": 2469607.0, "step": 1340 }, { "entropy": 6.54659481048584, "epoch": 0.11300147027935308, "grad_norm": 1.09375, "learning_rate": 0.0004999905668900234, "loss": 6.4037, "mean_token_accuracy": 0.11429757624864578, "num_tokens": 2478345.0, "step": 1345 }, { "entropy": 6.665723133087158, "epoch": 0.11342155009451796, "grad_norm": 1.15625, "learning_rate": 0.000499990290680831, "loss": 6.3362, "mean_token_accuracy": 0.11939993128180504, "num_tokens": 2486662.0, "step": 1350 }, { "entropy": 6.539735174179077, "epoch": 0.11384162990968284, "grad_norm": 1.0859375, "learning_rate": 0.0004999900104860516, "loss": 6.4496, "mean_token_accuracy": 0.11450904607772827, "num_tokens": 2495392.0, "step": 1355 }, { "entropy": 6.640576314926148, "epoch": 0.11426170972484773, "grad_norm": 1.0546875, "learning_rate": 0.0004999897263056898, "loss": 6.4824, "mean_token_accuracy": 0.11427311152219773, "num_tokens": 2505254.0, "step": 1360 }, { "entropy": 6.6059410572052, "epoch": 0.1146817895400126, "grad_norm": 1.0, "learning_rate": 0.000499989438139751, "loss": 6.2902, "mean_token_accuracy": 0.12163057401776314, "num_tokens": 2514096.0, "step": 1365 }, { "entropy": 6.572102785110474, "epoch": 0.11510186935517748, "grad_norm": 0.9453125, "learning_rate": 0.0004999891459882401, "loss": 6.3036, "mean_token_accuracy": 0.12106614261865616, "num_tokens": 2523635.0, "step": 1370 }, { "entropy": 6.518535518646241, "epoch": 0.11552194917034236, "grad_norm": 0.99609375, "learning_rate": 0.0004999888498511624, "loss": 6.3872, "mean_token_accuracy": 0.117999816685915, "num_tokens": 2532528.0, "step": 1375 }, { "entropy": 6.522701168060303, "epoch": 0.11594202898550725, "grad_norm": 1.0625, "learning_rate": 0.0004999885497285229, "loss": 6.3026, "mean_token_accuracy": 0.11809839084744453, "num_tokens": 2541893.0, "step": 1380 }, { "entropy": 6.516852188110351, "epoch": 0.11636210880067213, "grad_norm": 0.99609375, "learning_rate": 0.0004999882456203273, "loss": 6.3627, "mean_token_accuracy": 0.11867272853851318, "num_tokens": 2551551.0, "step": 1385 }, { "entropy": 6.592957019805908, "epoch": 0.11678218861583702, "grad_norm": 1.1171875, "learning_rate": 0.0004999879375265806, "loss": 6.314, "mean_token_accuracy": 0.1192450650036335, "num_tokens": 2560183.0, "step": 1390 }, { "entropy": 6.526823472976685, "epoch": 0.11720226843100189, "grad_norm": 1.1484375, "learning_rate": 0.0004999876254472886, "loss": 6.2065, "mean_token_accuracy": 0.127345572412014, "num_tokens": 2568697.0, "step": 1395 }, { "entropy": 6.488171815872192, "epoch": 0.11762234824616677, "grad_norm": 0.97265625, "learning_rate": 0.0004999873093824565, "loss": 6.4136, "mean_token_accuracy": 0.1172497920691967, "num_tokens": 2578151.0, "step": 1400 }, { "entropy": 6.697162342071533, "epoch": 0.11804242806133165, "grad_norm": 1.1171875, "learning_rate": 0.0004999869893320902, "loss": 6.5415, "mean_token_accuracy": 0.11695929765701293, "num_tokens": 2585901.0, "step": 1405 }, { "entropy": 6.558137512207031, "epoch": 0.11846250787649654, "grad_norm": 1.0234375, "learning_rate": 0.0004999866652961952, "loss": 6.3565, "mean_token_accuracy": 0.11195311546325684, "num_tokens": 2595655.0, "step": 1410 }, { "entropy": 6.547592639923096, "epoch": 0.11888258769166142, "grad_norm": 0.984375, "learning_rate": 0.0004999863372747773, "loss": 6.3241, "mean_token_accuracy": 0.1137452982366085, "num_tokens": 2604949.0, "step": 1415 }, { "entropy": 6.549184036254883, "epoch": 0.11930266750682629, "grad_norm": 1.125, "learning_rate": 0.0004999860052678423, "loss": 6.3987, "mean_token_accuracy": 0.12182095795869827, "num_tokens": 2614260.0, "step": 1420 }, { "entropy": 6.533220100402832, "epoch": 0.11972274732199117, "grad_norm": 1.046875, "learning_rate": 0.0004999856692753959, "loss": 6.3846, "mean_token_accuracy": 0.11606933474540711, "num_tokens": 2623740.0, "step": 1425 }, { "entropy": 6.56026554107666, "epoch": 0.12014282713715606, "grad_norm": 1.0390625, "learning_rate": 0.0004999853292974444, "loss": 6.2829, "mean_token_accuracy": 0.1191012591123581, "num_tokens": 2631998.0, "step": 1430 }, { "entropy": 6.436700010299683, "epoch": 0.12056290695232094, "grad_norm": 0.96875, "learning_rate": 0.0004999849853339936, "loss": 6.4441, "mean_token_accuracy": 0.12089451104402542, "num_tokens": 2641169.0, "step": 1435 }, { "entropy": 6.6503981590271, "epoch": 0.12098298676748583, "grad_norm": 0.9140625, "learning_rate": 0.0004999846373850497, "loss": 6.2726, "mean_token_accuracy": 0.12328374907374381, "num_tokens": 2650576.0, "step": 1440 }, { "entropy": 6.504758834838867, "epoch": 0.12140306658265071, "grad_norm": 1.0234375, "learning_rate": 0.0004999842854506186, "loss": 6.3597, "mean_token_accuracy": 0.11508475914597512, "num_tokens": 2660817.0, "step": 1445 }, { "entropy": 6.454709720611572, "epoch": 0.12182314639781558, "grad_norm": 1.0859375, "learning_rate": 0.0004999839295307069, "loss": 6.317, "mean_token_accuracy": 0.11818674132227898, "num_tokens": 2669338.0, "step": 1450 }, { "entropy": 6.5724732875823975, "epoch": 0.12224322621298046, "grad_norm": 1.078125, "learning_rate": 0.0004999835696253206, "loss": 6.3698, "mean_token_accuracy": 0.11763316094875335, "num_tokens": 2679108.0, "step": 1455 }, { "entropy": 6.542471504211425, "epoch": 0.12266330602814535, "grad_norm": 0.9453125, "learning_rate": 0.0004999832057344664, "loss": 6.3312, "mean_token_accuracy": 0.11857884675264359, "num_tokens": 2688126.0, "step": 1460 }, { "entropy": 6.3690132141113285, "epoch": 0.12308338584331023, "grad_norm": 1.0390625, "learning_rate": 0.0004999828378581504, "loss": 6.2827, "mean_token_accuracy": 0.12631092369556426, "num_tokens": 2697245.0, "step": 1465 }, { "entropy": 6.5668089389801025, "epoch": 0.12350346565847511, "grad_norm": 1.046875, "learning_rate": 0.0004999824659963793, "loss": 6.3543, "mean_token_accuracy": 0.12048940360546112, "num_tokens": 2705934.0, "step": 1470 }, { "entropy": 6.516648006439209, "epoch": 0.12392354547364, "grad_norm": 1.125, "learning_rate": 0.0004999820901491598, "loss": 6.2753, "mean_token_accuracy": 0.12523386031389236, "num_tokens": 2714367.0, "step": 1475 }, { "entropy": 6.416815328598022, "epoch": 0.12434362528880487, "grad_norm": 1.0390625, "learning_rate": 0.0004999817103164983, "loss": 6.3117, "mean_token_accuracy": 0.12113343179225922, "num_tokens": 2724366.0, "step": 1480 }, { "entropy": 6.518594264984131, "epoch": 0.12476370510396975, "grad_norm": 0.953125, "learning_rate": 0.0004999813264984017, "loss": 6.3262, "mean_token_accuracy": 0.11913523152470588, "num_tokens": 2733980.0, "step": 1485 }, { "entropy": 6.520108652114868, "epoch": 0.12518378491913462, "grad_norm": 1.0234375, "learning_rate": 0.0004999809386948767, "loss": 6.3232, "mean_token_accuracy": 0.11875561475753785, "num_tokens": 2744013.0, "step": 1490 }, { "entropy": 6.4508843421936035, "epoch": 0.12560386473429952, "grad_norm": 1.1640625, "learning_rate": 0.0004999805469059302, "loss": 6.3917, "mean_token_accuracy": 0.1202739343047142, "num_tokens": 2753385.0, "step": 1495 }, { "entropy": 6.467165565490722, "epoch": 0.1260239445494644, "grad_norm": 1.03125, "learning_rate": 0.0004999801511315693, "loss": 6.2443, "mean_token_accuracy": 0.11950960382819176, "num_tokens": 2762875.0, "step": 1500 }, { "entropy": 6.561000490188599, "epoch": 0.1264440243646293, "grad_norm": 1.0234375, "learning_rate": 0.0004999797513718007, "loss": 6.3133, "mean_token_accuracy": 0.12554540634155273, "num_tokens": 2772182.0, "step": 1505 }, { "entropy": 6.398244476318359, "epoch": 0.12686410417979416, "grad_norm": 1.0234375, "learning_rate": 0.0004999793476266317, "loss": 6.2652, "mean_token_accuracy": 0.12494927272200584, "num_tokens": 2780814.0, "step": 1510 }, { "entropy": 6.759689664840698, "epoch": 0.12728418399495905, "grad_norm": 1.0234375, "learning_rate": 0.0004999789398960695, "loss": 6.5371, "mean_token_accuracy": 0.120218076556921, "num_tokens": 2791104.0, "step": 1515 }, { "entropy": 6.380699729919433, "epoch": 0.12770426381012392, "grad_norm": 0.9921875, "learning_rate": 0.0004999785281801212, "loss": 6.2392, "mean_token_accuracy": 0.12141881808638573, "num_tokens": 2800081.0, "step": 1520 }, { "entropy": 6.502162122726441, "epoch": 0.1281243436252888, "grad_norm": 1.0703125, "learning_rate": 0.000499978112478794, "loss": 6.3645, "mean_token_accuracy": 0.11820052862167359, "num_tokens": 2809096.0, "step": 1525 }, { "entropy": 6.559705686569214, "epoch": 0.1285444234404537, "grad_norm": 1.0, "learning_rate": 0.0004999776927920955, "loss": 6.3324, "mean_token_accuracy": 0.12376131415367127, "num_tokens": 2818857.0, "step": 1530 }, { "entropy": 6.478033876419067, "epoch": 0.12896450325561856, "grad_norm": 1.0703125, "learning_rate": 0.000499977269120033, "loss": 6.3924, "mean_token_accuracy": 0.11640017554163933, "num_tokens": 2829332.0, "step": 1535 }, { "entropy": 6.471277475357056, "epoch": 0.12938458307078346, "grad_norm": 0.9453125, "learning_rate": 0.000499976841462614, "loss": 6.3118, "mean_token_accuracy": 0.11578154116868973, "num_tokens": 2839193.0, "step": 1540 }, { "entropy": 6.515983152389526, "epoch": 0.12980466288594833, "grad_norm": 0.94921875, "learning_rate": 0.000499976409819846, "loss": 6.3126, "mean_token_accuracy": 0.1165178470313549, "num_tokens": 2848535.0, "step": 1545 }, { "entropy": 6.329218864440918, "epoch": 0.1302247427011132, "grad_norm": 0.9609375, "learning_rate": 0.0004999759741917369, "loss": 6.2119, "mean_token_accuracy": 0.12768493369221687, "num_tokens": 2858090.0, "step": 1550 }, { "entropy": 6.4847986698150635, "epoch": 0.1306448225162781, "grad_norm": 1.1640625, "learning_rate": 0.0004999755345782941, "loss": 6.3672, "mean_token_accuracy": 0.1186487466096878, "num_tokens": 2866984.0, "step": 1555 }, { "entropy": 6.419411611557007, "epoch": 0.13106490233144297, "grad_norm": 0.89453125, "learning_rate": 0.0004999750909795256, "loss": 6.1757, "mean_token_accuracy": 0.1280258044600487, "num_tokens": 2876550.0, "step": 1560 }, { "entropy": 6.461032104492188, "epoch": 0.13148498214660786, "grad_norm": 0.98046875, "learning_rate": 0.0004999746433954394, "loss": 6.2774, "mean_token_accuracy": 0.1213872842490673, "num_tokens": 2885782.0, "step": 1565 }, { "entropy": 6.447916793823242, "epoch": 0.13190506196177273, "grad_norm": 1.0, "learning_rate": 0.000499974191826043, "loss": 6.2448, "mean_token_accuracy": 0.13687582612037658, "num_tokens": 2894807.0, "step": 1570 }, { "entropy": 6.439778518676758, "epoch": 0.1323251417769376, "grad_norm": 1.171875, "learning_rate": 0.0004999737362713448, "loss": 6.2925, "mean_token_accuracy": 0.1238982230424881, "num_tokens": 2904076.0, "step": 1575 }, { "entropy": 6.471430492401123, "epoch": 0.1327452215921025, "grad_norm": 1.0390625, "learning_rate": 0.0004999732767313527, "loss": 6.2033, "mean_token_accuracy": 0.1205870471894741, "num_tokens": 2913761.0, "step": 1580 }, { "entropy": 6.509069633483887, "epoch": 0.13316530140726737, "grad_norm": 1.0546875, "learning_rate": 0.0004999728132060746, "loss": 6.4228, "mean_token_accuracy": 0.12286271527409554, "num_tokens": 2922848.0, "step": 1585 }, { "entropy": 6.5165454864501955, "epoch": 0.13358538122243227, "grad_norm": 0.953125, "learning_rate": 0.0004999723456955192, "loss": 6.3079, "mean_token_accuracy": 0.11906806230545045, "num_tokens": 2932718.0, "step": 1590 }, { "entropy": 6.353040504455566, "epoch": 0.13400546103759714, "grad_norm": 0.9765625, "learning_rate": 0.0004999718741996945, "loss": 6.2648, "mean_token_accuracy": 0.12362491562962533, "num_tokens": 2942686.0, "step": 1595 }, { "entropy": 6.480581188201905, "epoch": 0.13442554085276204, "grad_norm": 0.98046875, "learning_rate": 0.000499971398718609, "loss": 6.2304, "mean_token_accuracy": 0.12233746945858001, "num_tokens": 2952096.0, "step": 1600 }, { "entropy": 6.41249566078186, "epoch": 0.1348456206679269, "grad_norm": 1.0234375, "learning_rate": 0.0004999709192522708, "loss": 6.3139, "mean_token_accuracy": 0.12512291446328164, "num_tokens": 2960660.0, "step": 1605 }, { "entropy": 6.536613845825196, "epoch": 0.13526570048309178, "grad_norm": 0.91796875, "learning_rate": 0.0004999704358006887, "loss": 6.3118, "mean_token_accuracy": 0.12129077091813087, "num_tokens": 2969834.0, "step": 1610 }, { "entropy": 6.4085368633270265, "epoch": 0.13568578029825668, "grad_norm": 1.09375, "learning_rate": 0.0004999699483638712, "loss": 6.2906, "mean_token_accuracy": 0.12232841104269028, "num_tokens": 2979023.0, "step": 1615 }, { "entropy": 6.476312971115112, "epoch": 0.13610586011342155, "grad_norm": 1.015625, "learning_rate": 0.0004999694569418269, "loss": 6.2964, "mean_token_accuracy": 0.12233099341392517, "num_tokens": 2988083.0, "step": 1620 }, { "entropy": 6.359239149093628, "epoch": 0.13652593992858644, "grad_norm": 0.9921875, "learning_rate": 0.0004999689615345645, "loss": 6.2196, "mean_token_accuracy": 0.12490532472729683, "num_tokens": 2997240.0, "step": 1625 }, { "entropy": 6.505274820327759, "epoch": 0.1369460197437513, "grad_norm": 1.0859375, "learning_rate": 0.0004999684621420928, "loss": 6.2805, "mean_token_accuracy": 0.12174654453992843, "num_tokens": 3007077.0, "step": 1630 }, { "entropy": 6.501539659500122, "epoch": 0.13736609955891618, "grad_norm": 1.0078125, "learning_rate": 0.0004999679587644205, "loss": 6.3282, "mean_token_accuracy": 0.11869422942399979, "num_tokens": 3015821.0, "step": 1635 }, { "entropy": 6.434766483306885, "epoch": 0.13778617937408108, "grad_norm": 1.046875, "learning_rate": 0.0004999674514015568, "loss": 6.2508, "mean_token_accuracy": 0.1246812529861927, "num_tokens": 3025858.0, "step": 1640 }, { "entropy": 6.406217813491821, "epoch": 0.13820625918924595, "grad_norm": 0.98046875, "learning_rate": 0.0004999669400535105, "loss": 6.2132, "mean_token_accuracy": 0.12023203670978547, "num_tokens": 3035537.0, "step": 1645 }, { "entropy": 6.359542560577393, "epoch": 0.13862633900441085, "grad_norm": 1.140625, "learning_rate": 0.0004999664247202907, "loss": 6.152, "mean_token_accuracy": 0.12406394928693772, "num_tokens": 3044204.0, "step": 1650 }, { "entropy": 6.404636430740356, "epoch": 0.13904641881957572, "grad_norm": 1.03125, "learning_rate": 0.0004999659054019066, "loss": 6.2994, "mean_token_accuracy": 0.12448503151535988, "num_tokens": 3053111.0, "step": 1655 }, { "entropy": 6.443476963043213, "epoch": 0.1394664986347406, "grad_norm": 1.0390625, "learning_rate": 0.0004999653820983673, "loss": 6.2201, "mean_token_accuracy": 0.12843194082379342, "num_tokens": 3062456.0, "step": 1660 }, { "entropy": 6.356498098373413, "epoch": 0.13988657844990549, "grad_norm": 0.98828125, "learning_rate": 0.000499964854809682, "loss": 6.2579, "mean_token_accuracy": 0.12453076243400574, "num_tokens": 3071132.0, "step": 1665 }, { "entropy": 6.388091611862182, "epoch": 0.14030665826507036, "grad_norm": 0.98046875, "learning_rate": 0.0004999643235358602, "loss": 6.2078, "mean_token_accuracy": 0.12833356559276582, "num_tokens": 3080892.0, "step": 1670 }, { "entropy": 6.392906522750854, "epoch": 0.14072673808023525, "grad_norm": 1.015625, "learning_rate": 0.0004999637882769112, "loss": 6.1429, "mean_token_accuracy": 0.12803655937314035, "num_tokens": 3089874.0, "step": 1675 }, { "entropy": 6.369514799118042, "epoch": 0.14114681789540012, "grad_norm": 0.91796875, "learning_rate": 0.0004999632490328447, "loss": 6.2814, "mean_token_accuracy": 0.12487674206495285, "num_tokens": 3099535.0, "step": 1680 }, { "entropy": 6.432224130630493, "epoch": 0.14156689771056502, "grad_norm": 0.984375, "learning_rate": 0.0004999627058036699, "loss": 6.24, "mean_token_accuracy": 0.12075779214501381, "num_tokens": 3108772.0, "step": 1685 }, { "entropy": 6.430401134490967, "epoch": 0.1419869775257299, "grad_norm": 1.0234375, "learning_rate": 0.0004999621585893966, "loss": 6.2696, "mean_token_accuracy": 0.11704754754900933, "num_tokens": 3118333.0, "step": 1690 }, { "entropy": 6.450057506561279, "epoch": 0.14240705734089476, "grad_norm": 1.0625, "learning_rate": 0.0004999616073900346, "loss": 6.3013, "mean_token_accuracy": 0.12180939391255378, "num_tokens": 3127356.0, "step": 1695 }, { "entropy": 6.412153673171997, "epoch": 0.14282713715605966, "grad_norm": 1.0859375, "learning_rate": 0.0004999610522055935, "loss": 6.2662, "mean_token_accuracy": 0.1200573742389679, "num_tokens": 3136859.0, "step": 1700 }, { "entropy": 6.451931762695312, "epoch": 0.14324721697122453, "grad_norm": 0.9921875, "learning_rate": 0.0004999604930360832, "loss": 6.2945, "mean_token_accuracy": 0.12161469012498856, "num_tokens": 3146607.0, "step": 1705 }, { "entropy": 6.3816108226776125, "epoch": 0.14366729678638943, "grad_norm": 0.95703125, "learning_rate": 0.0004999599298815136, "loss": 6.2364, "mean_token_accuracy": 0.12764545828104018, "num_tokens": 3156327.0, "step": 1710 }, { "entropy": 6.309280204772949, "epoch": 0.1440873766015543, "grad_norm": 1.5390625, "learning_rate": 0.0004999593627418947, "loss": 6.177, "mean_token_accuracy": 0.13247063681483268, "num_tokens": 3165559.0, "step": 1715 }, { "entropy": 6.405248212814331, "epoch": 0.14450745641671917, "grad_norm": 1.0625, "learning_rate": 0.0004999587916172365, "loss": 6.2704, "mean_token_accuracy": 0.1183898076415062, "num_tokens": 3173850.0, "step": 1720 }, { "entropy": 6.435620069503784, "epoch": 0.14492753623188406, "grad_norm": 1.0078125, "learning_rate": 0.0004999582165075492, "loss": 6.22, "mean_token_accuracy": 0.11956866905093193, "num_tokens": 3182838.0, "step": 1725 }, { "entropy": 6.2884269714355465, "epoch": 0.14534761604704893, "grad_norm": 1.0234375, "learning_rate": 0.0004999576374128429, "loss": 6.202, "mean_token_accuracy": 0.1219302274286747, "num_tokens": 3191692.0, "step": 1730 }, { "entropy": 6.500776195526123, "epoch": 0.14576769586221383, "grad_norm": 1.0703125, "learning_rate": 0.0004999570543331279, "loss": 6.226, "mean_token_accuracy": 0.1263854332268238, "num_tokens": 3200069.0, "step": 1735 }, { "entropy": 6.411444854736328, "epoch": 0.1461877756773787, "grad_norm": 1.140625, "learning_rate": 0.0004999564672684145, "loss": 6.3228, "mean_token_accuracy": 0.12090336456894875, "num_tokens": 3209653.0, "step": 1740 }, { "entropy": 6.448664712905884, "epoch": 0.14660785549254357, "grad_norm": 1.03125, "learning_rate": 0.0004999558762187131, "loss": 6.1938, "mean_token_accuracy": 0.12701231315732003, "num_tokens": 3218313.0, "step": 1745 }, { "entropy": 6.32896614074707, "epoch": 0.14702793530770847, "grad_norm": 1.015625, "learning_rate": 0.0004999552811840342, "loss": 6.1297, "mean_token_accuracy": 0.12769370079040526, "num_tokens": 3227525.0, "step": 1750 }, { "entropy": 6.335414171218872, "epoch": 0.14744801512287334, "grad_norm": 0.94921875, "learning_rate": 0.0004999546821643884, "loss": 6.2408, "mean_token_accuracy": 0.12636618986725806, "num_tokens": 3237022.0, "step": 1755 }, { "entropy": 6.317769384384155, "epoch": 0.14786809493803824, "grad_norm": 0.9921875, "learning_rate": 0.0004999540791597861, "loss": 6.1464, "mean_token_accuracy": 0.12537204548716546, "num_tokens": 3246605.0, "step": 1760 }, { "entropy": 6.258312511444092, "epoch": 0.1482881747532031, "grad_norm": 1.03125, "learning_rate": 0.0004999534721702383, "loss": 6.0956, "mean_token_accuracy": 0.13141294568777084, "num_tokens": 3255587.0, "step": 1765 }, { "entropy": 6.364277791976929, "epoch": 0.148708254568368, "grad_norm": 1.0234375, "learning_rate": 0.0004999528611957553, "loss": 6.1968, "mean_token_accuracy": 0.1267327442765236, "num_tokens": 3265669.0, "step": 1770 }, { "entropy": 6.433037424087525, "epoch": 0.14912833438353287, "grad_norm": 1.078125, "learning_rate": 0.0004999522462363485, "loss": 6.1795, "mean_token_accuracy": 0.12822128161787988, "num_tokens": 3275013.0, "step": 1775 }, { "entropy": 6.372742748260498, "epoch": 0.14954841419869774, "grad_norm": 0.91796875, "learning_rate": 0.0004999516272920283, "loss": 6.2775, "mean_token_accuracy": 0.12774404734373093, "num_tokens": 3284723.0, "step": 1780 }, { "entropy": 6.256136322021485, "epoch": 0.14996849401386264, "grad_norm": 0.96484375, "learning_rate": 0.000499951004362806, "loss": 6.1087, "mean_token_accuracy": 0.13196263536810876, "num_tokens": 3293860.0, "step": 1785 }, { "entropy": 6.278848552703858, "epoch": 0.1503885738290275, "grad_norm": 0.9765625, "learning_rate": 0.0004999503774486924, "loss": 6.1623, "mean_token_accuracy": 0.13007338494062423, "num_tokens": 3303158.0, "step": 1790 }, { "entropy": 6.253765487670899, "epoch": 0.1508086536441924, "grad_norm": 0.96484375, "learning_rate": 0.0004999497465496987, "loss": 6.1083, "mean_token_accuracy": 0.1231241799890995, "num_tokens": 3313068.0, "step": 1795 }, { "entropy": 6.319281959533692, "epoch": 0.15122873345935728, "grad_norm": 1.0390625, "learning_rate": 0.000499949111665836, "loss": 6.1761, "mean_token_accuracy": 0.12510209009051323, "num_tokens": 3321885.0, "step": 1800 }, { "entropy": 6.368197298049926, "epoch": 0.15164881327452215, "grad_norm": 1.015625, "learning_rate": 0.0004999484727971158, "loss": 6.1707, "mean_token_accuracy": 0.12798358947038652, "num_tokens": 3330924.0, "step": 1805 }, { "entropy": 6.339307403564453, "epoch": 0.15206889308968705, "grad_norm": 1.0625, "learning_rate": 0.000499947829943549, "loss": 6.1964, "mean_token_accuracy": 0.12618306949734687, "num_tokens": 3340070.0, "step": 1810 }, { "entropy": 6.394219160079956, "epoch": 0.15248897290485192, "grad_norm": 0.984375, "learning_rate": 0.0004999471831051474, "loss": 6.1922, "mean_token_accuracy": 0.13684661015868188, "num_tokens": 3349870.0, "step": 1815 }, { "entropy": 6.330759143829345, "epoch": 0.1529090527200168, "grad_norm": 0.94921875, "learning_rate": 0.0004999465322819222, "loss": 6.2371, "mean_token_accuracy": 0.12111249193549156, "num_tokens": 3359573.0, "step": 1820 }, { "entropy": 6.372816276550293, "epoch": 0.15332913253518168, "grad_norm": 1.046875, "learning_rate": 0.0004999458774738851, "loss": 6.1732, "mean_token_accuracy": 0.13470285311341285, "num_tokens": 3368577.0, "step": 1825 }, { "entropy": 6.352361059188842, "epoch": 0.15374921235034655, "grad_norm": 1.078125, "learning_rate": 0.0004999452186810476, "loss": 6.1469, "mean_token_accuracy": 0.13113251850008964, "num_tokens": 3377801.0, "step": 1830 }, { "entropy": 6.3680521011352536, "epoch": 0.15416929216551145, "grad_norm": 1.046875, "learning_rate": 0.0004999445559034214, "loss": 6.1995, "mean_token_accuracy": 0.12895982414484025, "num_tokens": 3386666.0, "step": 1835 }, { "entropy": 6.443807363510132, "epoch": 0.15458937198067632, "grad_norm": 0.97265625, "learning_rate": 0.0004999438891410181, "loss": 6.3344, "mean_token_accuracy": 0.12429568618535995, "num_tokens": 3396086.0, "step": 1840 }, { "entropy": 6.371559190750122, "epoch": 0.15500945179584122, "grad_norm": 1.0234375, "learning_rate": 0.0004999432183938496, "loss": 6.2503, "mean_token_accuracy": 0.1258139818906784, "num_tokens": 3404894.0, "step": 1845 }, { "entropy": 6.40411787033081, "epoch": 0.1554295316110061, "grad_norm": 1.015625, "learning_rate": 0.0004999425436619279, "loss": 6.2301, "mean_token_accuracy": 0.1250107169151306, "num_tokens": 3414172.0, "step": 1850 }, { "entropy": 6.4263053894042965, "epoch": 0.15584961142617096, "grad_norm": 0.9375, "learning_rate": 0.000499941864945265, "loss": 6.2069, "mean_token_accuracy": 0.12341500893235206, "num_tokens": 3423409.0, "step": 1855 }, { "entropy": 6.2579625129699705, "epoch": 0.15626969124133586, "grad_norm": 0.99609375, "learning_rate": 0.0004999411822438726, "loss": 6.1554, "mean_token_accuracy": 0.12717969343066216, "num_tokens": 3433047.0, "step": 1860 }, { "entropy": 6.4037513256073, "epoch": 0.15668977105650073, "grad_norm": 1.078125, "learning_rate": 0.000499940495557763, "loss": 6.1468, "mean_token_accuracy": 0.12783457711338997, "num_tokens": 3442490.0, "step": 1865 }, { "entropy": 6.303406810760498, "epoch": 0.15710985087166562, "grad_norm": 0.9921875, "learning_rate": 0.0004999398048869485, "loss": 6.2099, "mean_token_accuracy": 0.129954195022583, "num_tokens": 3451804.0, "step": 1870 }, { "entropy": 6.385490417480469, "epoch": 0.1575299306868305, "grad_norm": 0.984375, "learning_rate": 0.000499939110231441, "loss": 6.199, "mean_token_accuracy": 0.1304432988166809, "num_tokens": 3461481.0, "step": 1875 }, { "entropy": 6.364220190048218, "epoch": 0.1579500105019954, "grad_norm": 1.0234375, "learning_rate": 0.0004999384115912531, "loss": 6.2449, "mean_token_accuracy": 0.13135363310575485, "num_tokens": 3471798.0, "step": 1880 }, { "entropy": 6.247316694259643, "epoch": 0.15837009031716026, "grad_norm": 0.96875, "learning_rate": 0.000499937708966397, "loss": 6.1296, "mean_token_accuracy": 0.12637364491820335, "num_tokens": 3481386.0, "step": 1885 }, { "entropy": 6.332306051254273, "epoch": 0.15879017013232513, "grad_norm": 0.97265625, "learning_rate": 0.0004999370023568853, "loss": 6.127, "mean_token_accuracy": 0.1316571466624737, "num_tokens": 3489981.0, "step": 1890 }, { "entropy": 6.299954462051391, "epoch": 0.15921024994749003, "grad_norm": 1.03125, "learning_rate": 0.0004999362917627304, "loss": 6.1227, "mean_token_accuracy": 0.1305247150361538, "num_tokens": 3498551.0, "step": 1895 }, { "entropy": 6.316105461120605, "epoch": 0.1596303297626549, "grad_norm": 1.046875, "learning_rate": 0.0004999355771839448, "loss": 6.0979, "mean_token_accuracy": 0.12954429015517235, "num_tokens": 3507921.0, "step": 1900 }, { "entropy": 6.470440483093261, "epoch": 0.1600504095778198, "grad_norm": 1.078125, "learning_rate": 0.0004999348586205414, "loss": 6.2729, "mean_token_accuracy": 0.13220328316092492, "num_tokens": 3517570.0, "step": 1905 }, { "entropy": 6.38808388710022, "epoch": 0.16047048939298467, "grad_norm": 1.0703125, "learning_rate": 0.0004999341360725327, "loss": 6.2438, "mean_token_accuracy": 0.123927091807127, "num_tokens": 3526774.0, "step": 1910 }, { "entropy": 6.285849714279175, "epoch": 0.16089056920814954, "grad_norm": 1.03125, "learning_rate": 0.0004999334095399317, "loss": 6.1859, "mean_token_accuracy": 0.1361298866569996, "num_tokens": 3535319.0, "step": 1915 }, { "entropy": 6.249746656417846, "epoch": 0.16131064902331443, "grad_norm": 0.98828125, "learning_rate": 0.0004999326790227512, "loss": 6.1605, "mean_token_accuracy": 0.1271871216595173, "num_tokens": 3544468.0, "step": 1920 }, { "entropy": 6.217294788360595, "epoch": 0.1617307288384793, "grad_norm": 0.9140625, "learning_rate": 0.0004999319445210041, "loss": 6.0261, "mean_token_accuracy": 0.1361843690276146, "num_tokens": 3553529.0, "step": 1925 }, { "entropy": 6.290815734863282, "epoch": 0.1621508086536442, "grad_norm": 0.96875, "learning_rate": 0.0004999312060347034, "loss": 6.1011, "mean_token_accuracy": 0.13233864828944206, "num_tokens": 3563053.0, "step": 1930 }, { "entropy": 6.224975728988648, "epoch": 0.16257088846880907, "grad_norm": 0.953125, "learning_rate": 0.0004999304635638621, "loss": 6.0288, "mean_token_accuracy": 0.1342104844748974, "num_tokens": 3571877.0, "step": 1935 }, { "entropy": 6.233099460601807, "epoch": 0.16299096828397394, "grad_norm": 0.92578125, "learning_rate": 0.0004999297171084935, "loss": 6.091, "mean_token_accuracy": 0.13373700231313707, "num_tokens": 3581496.0, "step": 1940 }, { "entropy": 6.324843549728394, "epoch": 0.16341104809913884, "grad_norm": 0.98828125, "learning_rate": 0.0004999289666686109, "loss": 6.1071, "mean_token_accuracy": 0.1308230109512806, "num_tokens": 3590752.0, "step": 1945 }, { "entropy": 6.129473495483398, "epoch": 0.1638311279143037, "grad_norm": 0.98046875, "learning_rate": 0.0004999282122442274, "loss": 6.1072, "mean_token_accuracy": 0.1328013814985752, "num_tokens": 3599885.0, "step": 1950 }, { "entropy": 6.387533235549927, "epoch": 0.1642512077294686, "grad_norm": 0.9296875, "learning_rate": 0.0004999274538353564, "loss": 6.1968, "mean_token_accuracy": 0.12293331325054169, "num_tokens": 3610039.0, "step": 1955 }, { "entropy": 6.2677867889404295, "epoch": 0.16467128754463348, "grad_norm": 1.015625, "learning_rate": 0.0004999266914420114, "loss": 6.1123, "mean_token_accuracy": 0.12491545528173446, "num_tokens": 3619954.0, "step": 1960 }, { "entropy": 6.291842746734619, "epoch": 0.16509136735979837, "grad_norm": 1.0078125, "learning_rate": 0.000499925925064206, "loss": 6.0646, "mean_token_accuracy": 0.13617814630270003, "num_tokens": 3628164.0, "step": 1965 }, { "entropy": 6.377547359466552, "epoch": 0.16551144717496324, "grad_norm": 1.046875, "learning_rate": 0.0004999251547019535, "loss": 6.2126, "mean_token_accuracy": 0.13370679765939714, "num_tokens": 3636778.0, "step": 1970 }, { "entropy": 6.318364191055298, "epoch": 0.16593152699012811, "grad_norm": 0.9609375, "learning_rate": 0.0004999243803552678, "loss": 6.1666, "mean_token_accuracy": 0.13474627435207367, "num_tokens": 3647046.0, "step": 1975 }, { "entropy": 6.2661604404449465, "epoch": 0.166351606805293, "grad_norm": 1.03125, "learning_rate": 0.0004999236020241625, "loss": 6.0969, "mean_token_accuracy": 0.1302388660609722, "num_tokens": 3656130.0, "step": 1980 }, { "entropy": 6.294794940948487, "epoch": 0.16677168662045788, "grad_norm": 0.9921875, "learning_rate": 0.0004999228197086514, "loss": 6.1791, "mean_token_accuracy": 0.12147556319832802, "num_tokens": 3666145.0, "step": 1985 }, { "entropy": 6.308886766433716, "epoch": 0.16719176643562278, "grad_norm": 0.88671875, "learning_rate": 0.0004999220334087484, "loss": 6.2221, "mean_token_accuracy": 0.12820759564638137, "num_tokens": 3676722.0, "step": 1990 }, { "entropy": 6.34148588180542, "epoch": 0.16761184625078765, "grad_norm": 1.0, "learning_rate": 0.0004999212431244673, "loss": 6.1977, "mean_token_accuracy": 0.1265730917453766, "num_tokens": 3685880.0, "step": 1995 }, { "entropy": 6.220745372772217, "epoch": 0.16803192606595252, "grad_norm": 0.98828125, "learning_rate": 0.0004999204488558222, "loss": 6.0332, "mean_token_accuracy": 0.13368572890758515, "num_tokens": 3695167.0, "step": 2000 }, { "entropy": 6.279938268661499, "epoch": 0.16845200588111742, "grad_norm": 0.96875, "learning_rate": 0.0004999196506028273, "loss": 6.1455, "mean_token_accuracy": 0.12803823873400688, "num_tokens": 3703700.0, "step": 2005 }, { "entropy": 6.340878582000732, "epoch": 0.1688720856962823, "grad_norm": 1.0390625, "learning_rate": 0.0004999188483654965, "loss": 6.0938, "mean_token_accuracy": 0.12776080071926116, "num_tokens": 3712825.0, "step": 2010 }, { "entropy": 6.229676914215088, "epoch": 0.16929216551144718, "grad_norm": 0.9453125, "learning_rate": 0.0004999180421438442, "loss": 6.0447, "mean_token_accuracy": 0.13442618474364282, "num_tokens": 3721807.0, "step": 2015 }, { "entropy": 6.3377564430236815, "epoch": 0.16971224532661205, "grad_norm": 1.0625, "learning_rate": 0.0004999172319378846, "loss": 6.2308, "mean_token_accuracy": 0.12342165559530258, "num_tokens": 3730502.0, "step": 2020 }, { "entropy": 6.334515047073364, "epoch": 0.17013232514177692, "grad_norm": 0.98828125, "learning_rate": 0.0004999164177476319, "loss": 6.1138, "mean_token_accuracy": 0.13388336971402168, "num_tokens": 3739696.0, "step": 2025 }, { "entropy": 6.170955038070678, "epoch": 0.17055240495694182, "grad_norm": 1.0625, "learning_rate": 0.0004999155995731009, "loss": 6.1168, "mean_token_accuracy": 0.1329979881644249, "num_tokens": 3748675.0, "step": 2030 }, { "entropy": 6.440923643112183, "epoch": 0.1709724847721067, "grad_norm": 1.0234375, "learning_rate": 0.0004999147774143057, "loss": 6.1895, "mean_token_accuracy": 0.12849014177918433, "num_tokens": 3757714.0, "step": 2035 }, { "entropy": 6.217456531524658, "epoch": 0.1713925645872716, "grad_norm": 1.0, "learning_rate": 0.000499913951271261, "loss": 6.0181, "mean_token_accuracy": 0.13668849244713782, "num_tokens": 3767589.0, "step": 2040 }, { "entropy": 6.216994047164917, "epoch": 0.17181264440243646, "grad_norm": 1.125, "learning_rate": 0.0004999131211439816, "loss": 6.1246, "mean_token_accuracy": 0.13397686704993247, "num_tokens": 3777261.0, "step": 2045 }, { "entropy": 6.3198566913604735, "epoch": 0.17223272421760136, "grad_norm": 1.015625, "learning_rate": 0.000499912287032482, "loss": 6.0738, "mean_token_accuracy": 0.13602124899625778, "num_tokens": 3786658.0, "step": 2050 }, { "entropy": 6.19984622001648, "epoch": 0.17265280403276623, "grad_norm": 1.0703125, "learning_rate": 0.000499911448936777, "loss": 6.0669, "mean_token_accuracy": 0.14067015573382377, "num_tokens": 3794977.0, "step": 2055 }, { "entropy": 6.179085731506348, "epoch": 0.1730728838479311, "grad_norm": 0.93359375, "learning_rate": 0.0004999106068568816, "loss": 6.1457, "mean_token_accuracy": 0.12947675883769988, "num_tokens": 3805138.0, "step": 2060 }, { "entropy": 6.279845762252807, "epoch": 0.173492963663096, "grad_norm": 1.015625, "learning_rate": 0.0004999097607928106, "loss": 6.0911, "mean_token_accuracy": 0.13879665359854698, "num_tokens": 3814444.0, "step": 2065 }, { "entropy": 6.212150764465332, "epoch": 0.17391304347826086, "grad_norm": 0.984375, "learning_rate": 0.0004999089107445788, "loss": 6.0398, "mean_token_accuracy": 0.13306153938174248, "num_tokens": 3822859.0, "step": 2070 }, { "entropy": 6.133330869674682, "epoch": 0.17433312329342576, "grad_norm": 0.9140625, "learning_rate": 0.0004999080567122016, "loss": 6.0707, "mean_token_accuracy": 0.13198764845728875, "num_tokens": 3833159.0, "step": 2075 }, { "entropy": 6.295455646514893, "epoch": 0.17475320310859063, "grad_norm": 1.015625, "learning_rate": 0.0004999071986956941, "loss": 6.0856, "mean_token_accuracy": 0.13797224685549736, "num_tokens": 3842136.0, "step": 2080 }, { "entropy": 6.208657741546631, "epoch": 0.1751732829237555, "grad_norm": 1.0234375, "learning_rate": 0.0004999063366950713, "loss": 6.1499, "mean_token_accuracy": 0.12877421900629998, "num_tokens": 3851406.0, "step": 2085 }, { "entropy": 6.217505025863647, "epoch": 0.1755933627389204, "grad_norm": 1.0078125, "learning_rate": 0.0004999054707103486, "loss": 6.0713, "mean_token_accuracy": 0.1279774695634842, "num_tokens": 3861061.0, "step": 2090 }, { "entropy": 6.265169095993042, "epoch": 0.17601344255408527, "grad_norm": 1.0234375, "learning_rate": 0.0004999046007415412, "loss": 6.0378, "mean_token_accuracy": 0.12900712937116623, "num_tokens": 3870357.0, "step": 2095 }, { "entropy": 6.2917054176330565, "epoch": 0.17643352236925017, "grad_norm": 1.0, "learning_rate": 0.0004999037267886646, "loss": 6.0715, "mean_token_accuracy": 0.13141706436872483, "num_tokens": 3879393.0, "step": 2100 }, { "entropy": 6.180794954299927, "epoch": 0.17685360218441504, "grad_norm": 1.046875, "learning_rate": 0.0004999028488517343, "loss": 6.0832, "mean_token_accuracy": 0.13525146320462228, "num_tokens": 3888030.0, "step": 2105 }, { "entropy": 6.266747093200683, "epoch": 0.1772736819995799, "grad_norm": 1.0234375, "learning_rate": 0.0004999019669307659, "loss": 6.0788, "mean_token_accuracy": 0.1376435212790966, "num_tokens": 3897430.0, "step": 2110 }, { "entropy": 6.238908100128174, "epoch": 0.1776937618147448, "grad_norm": 0.9296875, "learning_rate": 0.0004999010810257749, "loss": 6.0977, "mean_token_accuracy": 0.12719068825244903, "num_tokens": 3907711.0, "step": 2115 }, { "entropy": 6.189173746109009, "epoch": 0.17811384162990967, "grad_norm": 0.9765625, "learning_rate": 0.0004999001911367771, "loss": 6.0411, "mean_token_accuracy": 0.13638337776064874, "num_tokens": 3915816.0, "step": 2120 }, { "entropy": 6.22648811340332, "epoch": 0.17853392144507457, "grad_norm": 0.96484375, "learning_rate": 0.0004998992972637883, "loss": 6.1538, "mean_token_accuracy": 0.12582943066954613, "num_tokens": 3925162.0, "step": 2125 }, { "entropy": 6.284874153137207, "epoch": 0.17895400126023944, "grad_norm": 0.94921875, "learning_rate": 0.0004998983994068242, "loss": 6.0395, "mean_token_accuracy": 0.13122835606336594, "num_tokens": 3934476.0, "step": 2130 }, { "entropy": 6.186276054382324, "epoch": 0.17937408107540434, "grad_norm": 0.93359375, "learning_rate": 0.0004998974975659006, "loss": 6.0907, "mean_token_accuracy": 0.1297646477818489, "num_tokens": 3943501.0, "step": 2135 }, { "entropy": 6.205726194381714, "epoch": 0.1797941608905692, "grad_norm": 0.96484375, "learning_rate": 0.0004998965917410338, "loss": 6.0816, "mean_token_accuracy": 0.12778471410274506, "num_tokens": 3953663.0, "step": 2140 }, { "entropy": 6.211074018478394, "epoch": 0.18021424070573408, "grad_norm": 1.0078125, "learning_rate": 0.0004998956819322397, "loss": 6.0495, "mean_token_accuracy": 0.13608243688941002, "num_tokens": 3962634.0, "step": 2145 }, { "entropy": 6.177238512039184, "epoch": 0.18063432052089898, "grad_norm": 0.94921875, "learning_rate": 0.0004998947681395343, "loss": 6.052, "mean_token_accuracy": 0.13605224341154099, "num_tokens": 3972496.0, "step": 2150 }, { "entropy": 6.390697908401489, "epoch": 0.18105440033606385, "grad_norm": 1.03125, "learning_rate": 0.000499893850362934, "loss": 6.2977, "mean_token_accuracy": 0.12441082820296287, "num_tokens": 3980724.0, "step": 2155 }, { "entropy": 6.262918901443482, "epoch": 0.18147448015122875, "grad_norm": 0.96875, "learning_rate": 0.0004998929286024548, "loss": 6.1304, "mean_token_accuracy": 0.1300631955265999, "num_tokens": 3989842.0, "step": 2160 }, { "entropy": 6.230935716629029, "epoch": 0.18189455996639362, "grad_norm": 1.109375, "learning_rate": 0.0004998920028581133, "loss": 6.0378, "mean_token_accuracy": 0.14167480319738388, "num_tokens": 3998534.0, "step": 2165 }, { "entropy": 6.241239356994629, "epoch": 0.18231463978155849, "grad_norm": 0.9765625, "learning_rate": 0.0004998910731299258, "loss": 6.0631, "mean_token_accuracy": 0.13066420927643776, "num_tokens": 4007677.0, "step": 2170 }, { "entropy": 6.19789605140686, "epoch": 0.18273471959672338, "grad_norm": 1.0234375, "learning_rate": 0.0004998901394179085, "loss": 6.1007, "mean_token_accuracy": 0.12627347633242608, "num_tokens": 4016347.0, "step": 2175 }, { "entropy": 6.198655843734741, "epoch": 0.18315479941188825, "grad_norm": 1.046875, "learning_rate": 0.0004998892017220784, "loss": 5.9767, "mean_token_accuracy": 0.14088783264160157, "num_tokens": 4025199.0, "step": 2180 }, { "entropy": 6.262273931503296, "epoch": 0.18357487922705315, "grad_norm": 1.0859375, "learning_rate": 0.0004998882600424519, "loss": 6.0603, "mean_token_accuracy": 0.1286892294883728, "num_tokens": 4033933.0, "step": 2185 }, { "entropy": 6.162368822097778, "epoch": 0.18399495904221802, "grad_norm": 0.9609375, "learning_rate": 0.0004998873143790455, "loss": 5.9753, "mean_token_accuracy": 0.1438771367073059, "num_tokens": 4042891.0, "step": 2190 }, { "entropy": 6.274066638946533, "epoch": 0.1844150388573829, "grad_norm": 0.9609375, "learning_rate": 0.0004998863647318763, "loss": 6.1041, "mean_token_accuracy": 0.13264708146452903, "num_tokens": 4051123.0, "step": 2195 }, { "entropy": 6.144877004623413, "epoch": 0.1848351186725478, "grad_norm": 1.046875, "learning_rate": 0.0004998854111009608, "loss": 6.0715, "mean_token_accuracy": 0.12865814492106437, "num_tokens": 4060025.0, "step": 2200 }, { "entropy": 6.182585954666138, "epoch": 0.18525519848771266, "grad_norm": 0.90625, "learning_rate": 0.0004998844534863161, "loss": 5.991, "mean_token_accuracy": 0.1295328378677368, "num_tokens": 4069363.0, "step": 2205 }, { "entropy": 6.241155099868775, "epoch": 0.18567527830287756, "grad_norm": 0.99609375, "learning_rate": 0.0004998834918879592, "loss": 6.1376, "mean_token_accuracy": 0.133307021856308, "num_tokens": 4078855.0, "step": 2210 }, { "entropy": 6.206245565414429, "epoch": 0.18609535811804243, "grad_norm": 0.9453125, "learning_rate": 0.000499882526305907, "loss": 6.0804, "mean_token_accuracy": 0.12953457087278367, "num_tokens": 4087801.0, "step": 2215 }, { "entropy": 6.248236179351807, "epoch": 0.18651543793320732, "grad_norm": 0.91796875, "learning_rate": 0.0004998815567401765, "loss": 6.0926, "mean_token_accuracy": 0.1376325160264969, "num_tokens": 4096949.0, "step": 2220 }, { "entropy": 6.279425954818725, "epoch": 0.1869355177483722, "grad_norm": 1.03125, "learning_rate": 0.0004998805831907851, "loss": 6.0617, "mean_token_accuracy": 0.13082574903964997, "num_tokens": 4105399.0, "step": 2225 }, { "entropy": 6.169968605041504, "epoch": 0.18735559756353706, "grad_norm": 1.0078125, "learning_rate": 0.0004998796056577501, "loss": 6.0071, "mean_token_accuracy": 0.12926321402192115, "num_tokens": 4113873.0, "step": 2230 }, { "entropy": 6.154512643814087, "epoch": 0.18777567737870196, "grad_norm": 0.90625, "learning_rate": 0.0004998786241410886, "loss": 6.0586, "mean_token_accuracy": 0.13699585050344468, "num_tokens": 4123528.0, "step": 2235 }, { "entropy": 6.2988721370697025, "epoch": 0.18819575719386683, "grad_norm": 0.9140625, "learning_rate": 0.000499877638640818, "loss": 6.0699, "mean_token_accuracy": 0.13017342165112494, "num_tokens": 4133370.0, "step": 2240 }, { "entropy": 6.184452104568481, "epoch": 0.18861583700903173, "grad_norm": 0.94140625, "learning_rate": 0.000499876649156956, "loss": 5.9844, "mean_token_accuracy": 0.13666255846619607, "num_tokens": 4142370.0, "step": 2245 }, { "entropy": 6.133312082290649, "epoch": 0.1890359168241966, "grad_norm": 0.96875, "learning_rate": 0.0004998756556895196, "loss": 6.0725, "mean_token_accuracy": 0.1354515865445137, "num_tokens": 4152367.0, "step": 2250 }, { "entropy": 6.21663122177124, "epoch": 0.18945599663936147, "grad_norm": 1.0078125, "learning_rate": 0.000499874658238527, "loss": 6.0625, "mean_token_accuracy": 0.13495326191186904, "num_tokens": 4161126.0, "step": 2255 }, { "entropy": 6.186970901489258, "epoch": 0.18987607645452637, "grad_norm": 1.0078125, "learning_rate": 0.0004998736568039957, "loss": 5.9748, "mean_token_accuracy": 0.13723411411046982, "num_tokens": 4169910.0, "step": 2260 }, { "entropy": 6.1857301712036135, "epoch": 0.19029615626969124, "grad_norm": 0.9921875, "learning_rate": 0.0004998726513859432, "loss": 6.1067, "mean_token_accuracy": 0.12761787325143814, "num_tokens": 4179893.0, "step": 2265 }, { "entropy": 6.308238935470581, "epoch": 0.19071623608485613, "grad_norm": 0.9140625, "learning_rate": 0.0004998716419843875, "loss": 6.12, "mean_token_accuracy": 0.13745217099785806, "num_tokens": 4190065.0, "step": 2270 }, { "entropy": 6.090948486328125, "epoch": 0.191136315900021, "grad_norm": 1.015625, "learning_rate": 0.0004998706285993465, "loss": 6.0313, "mean_token_accuracy": 0.1420229621231556, "num_tokens": 4198395.0, "step": 2275 }, { "entropy": 6.282499647140503, "epoch": 0.19155639571518587, "grad_norm": 0.9453125, "learning_rate": 0.0004998696112308381, "loss": 6.0533, "mean_token_accuracy": 0.1310360386967659, "num_tokens": 4207555.0, "step": 2280 }, { "entropy": 6.088230180740356, "epoch": 0.19197647553035077, "grad_norm": 0.9296875, "learning_rate": 0.0004998685898788803, "loss": 5.9946, "mean_token_accuracy": 0.13536595478653907, "num_tokens": 4216533.0, "step": 2285 }, { "entropy": 6.274929618835449, "epoch": 0.19239655534551564, "grad_norm": 1.0390625, "learning_rate": 0.0004998675645434914, "loss": 6.1095, "mean_token_accuracy": 0.13767784610390663, "num_tokens": 4225575.0, "step": 2290 }, { "entropy": 6.153714513778686, "epoch": 0.19281663516068054, "grad_norm": 1.0234375, "learning_rate": 0.0004998665352246891, "loss": 5.8958, "mean_token_accuracy": 0.14245088025927544, "num_tokens": 4234306.0, "step": 2295 }, { "entropy": 6.08680305480957, "epoch": 0.1932367149758454, "grad_norm": 0.9609375, "learning_rate": 0.0004998655019224921, "loss": 6.0823, "mean_token_accuracy": 0.1359329827129841, "num_tokens": 4243998.0, "step": 2300 }, { "entropy": 6.237053394317627, "epoch": 0.19365679479101028, "grad_norm": 0.98828125, "learning_rate": 0.0004998644646369185, "loss": 5.9776, "mean_token_accuracy": 0.13352483361959458, "num_tokens": 4253653.0, "step": 2305 }, { "entropy": 6.139167737960816, "epoch": 0.19407687460617518, "grad_norm": 0.98828125, "learning_rate": 0.0004998634233679865, "loss": 6.0652, "mean_token_accuracy": 0.1278400629758835, "num_tokens": 4263305.0, "step": 2310 }, { "entropy": 6.127392339706421, "epoch": 0.19449695442134005, "grad_norm": 1.0078125, "learning_rate": 0.000499862378115715, "loss": 5.9342, "mean_token_accuracy": 0.14543856382369996, "num_tokens": 4272212.0, "step": 2315 }, { "entropy": 6.305202007293701, "epoch": 0.19491703423650494, "grad_norm": 1.0625, "learning_rate": 0.0004998613288801221, "loss": 6.1375, "mean_token_accuracy": 0.13151465207338334, "num_tokens": 4281445.0, "step": 2320 }, { "entropy": 6.2177956104278564, "epoch": 0.1953371140516698, "grad_norm": 0.9609375, "learning_rate": 0.0004998602756612267, "loss": 6.055, "mean_token_accuracy": 0.1372949168086052, "num_tokens": 4290938.0, "step": 2325 }, { "entropy": 6.175972557067871, "epoch": 0.1957571938668347, "grad_norm": 0.9765625, "learning_rate": 0.0004998592184590471, "loss": 6.0786, "mean_token_accuracy": 0.13233636021614076, "num_tokens": 4300022.0, "step": 2330 }, { "entropy": 6.134920358657837, "epoch": 0.19617727368199958, "grad_norm": 1.0, "learning_rate": 0.0004998581572736024, "loss": 5.9674, "mean_token_accuracy": 0.1363460712134838, "num_tokens": 4308910.0, "step": 2335 }, { "entropy": 6.092206907272339, "epoch": 0.19659735349716445, "grad_norm": 0.93359375, "learning_rate": 0.0004998570921049112, "loss": 5.9454, "mean_token_accuracy": 0.13969452679157257, "num_tokens": 4317136.0, "step": 2340 }, { "entropy": 6.112558746337891, "epoch": 0.19701743331232935, "grad_norm": 1.046875, "learning_rate": 0.0004998560229529924, "loss": 5.9993, "mean_token_accuracy": 0.1428337089717388, "num_tokens": 4326163.0, "step": 2345 }, { "entropy": 6.308993816375732, "epoch": 0.19743751312749422, "grad_norm": 0.97265625, "learning_rate": 0.0004998549498178649, "loss": 6.1402, "mean_token_accuracy": 0.13658420667052268, "num_tokens": 4335837.0, "step": 2350 }, { "entropy": 6.216946363449097, "epoch": 0.19785759294265912, "grad_norm": 1.09375, "learning_rate": 0.0004998538726995477, "loss": 6.0561, "mean_token_accuracy": 0.1374947391450405, "num_tokens": 4345108.0, "step": 2355 }, { "entropy": 6.217574787139893, "epoch": 0.198277672757824, "grad_norm": 0.953125, "learning_rate": 0.00049985279159806, "loss": 6.0722, "mean_token_accuracy": 0.1334306165575981, "num_tokens": 4353761.0, "step": 2360 }, { "entropy": 6.1630774974823, "epoch": 0.19869775257298886, "grad_norm": 0.99609375, "learning_rate": 0.0004998517065134208, "loss": 6.0354, "mean_token_accuracy": 0.13587109968066216, "num_tokens": 4363244.0, "step": 2365 }, { "entropy": 6.205533790588379, "epoch": 0.19911783238815375, "grad_norm": 0.92578125, "learning_rate": 0.0004998506174456494, "loss": 6.0386, "mean_token_accuracy": 0.13257589265704156, "num_tokens": 4373034.0, "step": 2370 }, { "entropy": 6.200410652160644, "epoch": 0.19953791220331862, "grad_norm": 0.90625, "learning_rate": 0.0004998495243947653, "loss": 5.9816, "mean_token_accuracy": 0.13029902279376984, "num_tokens": 4382554.0, "step": 2375 }, { "entropy": 6.191087865829468, "epoch": 0.19995799201848352, "grad_norm": 1.03125, "learning_rate": 0.0004998484273607875, "loss": 5.9843, "mean_token_accuracy": 0.14299238696694375, "num_tokens": 4391001.0, "step": 2380 }, { "entropy": 6.023518228530884, "epoch": 0.2003780718336484, "grad_norm": 0.9140625, "learning_rate": 0.0004998473263437356, "loss": 5.9141, "mean_token_accuracy": 0.13673870489001275, "num_tokens": 4400632.0, "step": 2385 }, { "entropy": 6.105119514465332, "epoch": 0.20079815164881326, "grad_norm": 0.97265625, "learning_rate": 0.000499846221343629, "loss": 6.0095, "mean_token_accuracy": 0.12952324375510216, "num_tokens": 4409565.0, "step": 2390 }, { "entropy": 6.128167533874512, "epoch": 0.20121823146397816, "grad_norm": 1.0234375, "learning_rate": 0.0004998451123604875, "loss": 5.944, "mean_token_accuracy": 0.14282809123396872, "num_tokens": 4418384.0, "step": 2395 }, { "entropy": 6.1983355522155765, "epoch": 0.20163831127914303, "grad_norm": 1.0546875, "learning_rate": 0.0004998439993943306, "loss": 6.0692, "mean_token_accuracy": 0.1389256276190281, "num_tokens": 4427581.0, "step": 2400 }, { "entropy": 6.267655086517334, "epoch": 0.20205839109430793, "grad_norm": 1.0078125, "learning_rate": 0.0004998428824451779, "loss": 6.0521, "mean_token_accuracy": 0.1341543450951576, "num_tokens": 4436572.0, "step": 2405 }, { "entropy": 6.1763083934783936, "epoch": 0.2024784709094728, "grad_norm": 1.0078125, "learning_rate": 0.0004998417615130495, "loss": 6.055, "mean_token_accuracy": 0.13537125810980796, "num_tokens": 4445230.0, "step": 2410 }, { "entropy": 6.247248315811158, "epoch": 0.2028985507246377, "grad_norm": 0.98046875, "learning_rate": 0.0004998406365979649, "loss": 6.1134, "mean_token_accuracy": 0.13383878991007805, "num_tokens": 4454251.0, "step": 2415 }, { "entropy": 6.136447811126709, "epoch": 0.20331863053980256, "grad_norm": 0.9375, "learning_rate": 0.0004998395076999443, "loss": 5.9699, "mean_token_accuracy": 0.13695907220244408, "num_tokens": 4463949.0, "step": 2420 }, { "entropy": 6.227413558959961, "epoch": 0.20373871035496743, "grad_norm": 1.03125, "learning_rate": 0.0004998383748190076, "loss": 6.1649, "mean_token_accuracy": 0.12917085587978364, "num_tokens": 4473373.0, "step": 2425 }, { "entropy": 6.249214363098145, "epoch": 0.20415879017013233, "grad_norm": 1.0234375, "learning_rate": 0.0004998372379551748, "loss": 5.9842, "mean_token_accuracy": 0.1414948470890522, "num_tokens": 4482303.0, "step": 2430 }, { "entropy": 6.117572832107544, "epoch": 0.2045788699852972, "grad_norm": 0.9765625, "learning_rate": 0.0004998360971084663, "loss": 5.9567, "mean_token_accuracy": 0.1317524030804634, "num_tokens": 4491214.0, "step": 2435 }, { "entropy": 6.057681226730347, "epoch": 0.2049989498004621, "grad_norm": 0.97265625, "learning_rate": 0.0004998349522789019, "loss": 5.8856, "mean_token_accuracy": 0.14377139806747435, "num_tokens": 4500099.0, "step": 2440 }, { "entropy": 6.115459060668945, "epoch": 0.20541902961562697, "grad_norm": 0.96875, "learning_rate": 0.0004998338034665021, "loss": 5.9692, "mean_token_accuracy": 0.1437109664082527, "num_tokens": 4509893.0, "step": 2445 }, { "entropy": 6.08744249343872, "epoch": 0.20583910943079184, "grad_norm": 0.98828125, "learning_rate": 0.0004998326506712872, "loss": 5.9375, "mean_token_accuracy": 0.13774847760796546, "num_tokens": 4518606.0, "step": 2450 }, { "entropy": 6.11673412322998, "epoch": 0.20625918924595674, "grad_norm": 0.99609375, "learning_rate": 0.0004998314938932778, "loss": 6.0218, "mean_token_accuracy": 0.14001012295484544, "num_tokens": 4528392.0, "step": 2455 }, { "entropy": 6.221143388748169, "epoch": 0.2066792690611216, "grad_norm": 0.96875, "learning_rate": 0.0004998303331324943, "loss": 5.9923, "mean_token_accuracy": 0.13821439668536187, "num_tokens": 4536983.0, "step": 2460 }, { "entropy": 6.041988134384155, "epoch": 0.2070993488762865, "grad_norm": 0.96875, "learning_rate": 0.0004998291683889571, "loss": 5.9145, "mean_token_accuracy": 0.1391140677034855, "num_tokens": 4544967.0, "step": 2465 }, { "entropy": 6.134957313537598, "epoch": 0.20751942869145137, "grad_norm": 1.0234375, "learning_rate": 0.000499827999662687, "loss": 5.9727, "mean_token_accuracy": 0.13200750946998596, "num_tokens": 4554646.0, "step": 2470 }, { "entropy": 6.192252588272095, "epoch": 0.20793950850661624, "grad_norm": 0.9453125, "learning_rate": 0.0004998268269537046, "loss": 5.9954, "mean_token_accuracy": 0.1370847873389721, "num_tokens": 4564040.0, "step": 2475 }, { "entropy": 6.091167068481445, "epoch": 0.20835958832178114, "grad_norm": 0.96875, "learning_rate": 0.0004998256502620308, "loss": 6.0187, "mean_token_accuracy": 0.14094985872507096, "num_tokens": 4573758.0, "step": 2480 }, { "entropy": 6.206011056900024, "epoch": 0.208779668136946, "grad_norm": 0.92578125, "learning_rate": 0.0004998244695876864, "loss": 6.0452, "mean_token_accuracy": 0.13380730673670768, "num_tokens": 4582097.0, "step": 2485 }, { "entropy": 6.0949585914611815, "epoch": 0.2091997479521109, "grad_norm": 1.015625, "learning_rate": 0.0004998232849306921, "loss": 6.0055, "mean_token_accuracy": 0.13993047401309014, "num_tokens": 4590687.0, "step": 2490 }, { "entropy": 6.1933338165283205, "epoch": 0.20961982776727578, "grad_norm": 0.9765625, "learning_rate": 0.0004998220962910693, "loss": 5.9965, "mean_token_accuracy": 0.13453714549541473, "num_tokens": 4599497.0, "step": 2495 }, { "entropy": 6.101396179199218, "epoch": 0.21003990758244068, "grad_norm": 1.0390625, "learning_rate": 0.0004998209036688386, "loss": 5.9532, "mean_token_accuracy": 0.13716981932520866, "num_tokens": 4607958.0, "step": 2500 }, { "entropy": 6.216299772262573, "epoch": 0.21045998739760555, "grad_norm": 0.96484375, "learning_rate": 0.0004998197070640216, "loss": 6.0812, "mean_token_accuracy": 0.1314453199505806, "num_tokens": 4617515.0, "step": 2505 }, { "entropy": 6.2111225605010985, "epoch": 0.21088006721277042, "grad_norm": 0.9765625, "learning_rate": 0.0004998185064766391, "loss": 5.9892, "mean_token_accuracy": 0.135587390512228, "num_tokens": 4627037.0, "step": 2510 }, { "entropy": 6.083059787750244, "epoch": 0.21130014702793531, "grad_norm": 0.91015625, "learning_rate": 0.0004998173019067127, "loss": 5.9864, "mean_token_accuracy": 0.13536423593759536, "num_tokens": 4637393.0, "step": 2515 }, { "entropy": 6.111885261535645, "epoch": 0.21172022684310018, "grad_norm": 0.98828125, "learning_rate": 0.0004998160933542633, "loss": 6.0252, "mean_token_accuracy": 0.12426691725850106, "num_tokens": 4646832.0, "step": 2520 }, { "entropy": 6.200415229797363, "epoch": 0.21214030665826508, "grad_norm": 1.0703125, "learning_rate": 0.0004998148808193128, "loss": 6.0364, "mean_token_accuracy": 0.1378290109336376, "num_tokens": 4655719.0, "step": 2525 }, { "entropy": 6.140298128128052, "epoch": 0.21256038647342995, "grad_norm": 0.953125, "learning_rate": 0.0004998136643018823, "loss": 5.9978, "mean_token_accuracy": 0.1409161224961281, "num_tokens": 4665364.0, "step": 2530 }, { "entropy": 6.113859462738037, "epoch": 0.21298046628859482, "grad_norm": 1.0234375, "learning_rate": 0.0004998124438019935, "loss": 5.9707, "mean_token_accuracy": 0.13255369514226914, "num_tokens": 4674760.0, "step": 2535 }, { "entropy": 6.032169342041016, "epoch": 0.21340054610375972, "grad_norm": 0.9375, "learning_rate": 0.0004998112193196681, "loss": 5.8954, "mean_token_accuracy": 0.1398087151348591, "num_tokens": 4683900.0, "step": 2540 }, { "entropy": 6.009505701065064, "epoch": 0.2138206259189246, "grad_norm": 0.98046875, "learning_rate": 0.0004998099908549277, "loss": 5.9487, "mean_token_accuracy": 0.1326383799314499, "num_tokens": 4693915.0, "step": 2545 }, { "entropy": 6.048102998733521, "epoch": 0.2142407057340895, "grad_norm": 0.98046875, "learning_rate": 0.000499808758407794, "loss": 5.7948, "mean_token_accuracy": 0.1494914174079895, "num_tokens": 4703102.0, "step": 2550 }, { "entropy": 6.130202150344848, "epoch": 0.21466078554925436, "grad_norm": 0.96875, "learning_rate": 0.0004998075219782889, "loss": 6.0201, "mean_token_accuracy": 0.13604088351130486, "num_tokens": 4712925.0, "step": 2555 }, { "entropy": 6.086578845977783, "epoch": 0.21508086536441923, "grad_norm": 1.0078125, "learning_rate": 0.0004998062815664344, "loss": 5.9508, "mean_token_accuracy": 0.13391971811652184, "num_tokens": 4722641.0, "step": 2560 }, { "entropy": 6.060202693939209, "epoch": 0.21550094517958412, "grad_norm": 0.9375, "learning_rate": 0.0004998050371722524, "loss": 6.028, "mean_token_accuracy": 0.13827937468886375, "num_tokens": 4732603.0, "step": 2565 }, { "entropy": 6.060051965713501, "epoch": 0.215921024994749, "grad_norm": 0.90625, "learning_rate": 0.0004998037887957649, "loss": 5.8655, "mean_token_accuracy": 0.1426350235939026, "num_tokens": 4742644.0, "step": 2570 }, { "entropy": 6.2458967685699465, "epoch": 0.2163411048099139, "grad_norm": 0.9765625, "learning_rate": 0.0004998025364369939, "loss": 6.1759, "mean_token_accuracy": 0.1332129217684269, "num_tokens": 4751482.0, "step": 2575 }, { "entropy": 6.246464967727661, "epoch": 0.21676118462507876, "grad_norm": 1.03125, "learning_rate": 0.0004998012800959619, "loss": 6.0435, "mean_token_accuracy": 0.13494925051927567, "num_tokens": 4760593.0, "step": 2580 }, { "entropy": 6.139482402801514, "epoch": 0.21718126444024366, "grad_norm": 1.046875, "learning_rate": 0.0004998000197726909, "loss": 6.041, "mean_token_accuracy": 0.14071242287755012, "num_tokens": 4769294.0, "step": 2585 }, { "entropy": 6.151182079315186, "epoch": 0.21760134425540853, "grad_norm": 0.87890625, "learning_rate": 0.0004997987554672033, "loss": 5.9433, "mean_token_accuracy": 0.13458855599164962, "num_tokens": 4779239.0, "step": 2590 }, { "entropy": 6.153560495376587, "epoch": 0.2180214240705734, "grad_norm": 0.921875, "learning_rate": 0.0004997974871795215, "loss": 6.0165, "mean_token_accuracy": 0.13904761373996735, "num_tokens": 4788211.0, "step": 2595 }, { "entropy": 6.1266923427581785, "epoch": 0.2184415038857383, "grad_norm": 0.87109375, "learning_rate": 0.000499796214909668, "loss": 5.9707, "mean_token_accuracy": 0.14307306259870528, "num_tokens": 4797921.0, "step": 2600 }, { "entropy": 6.151721715927124, "epoch": 0.21886158370090317, "grad_norm": 0.97265625, "learning_rate": 0.0004997949386576653, "loss": 5.9792, "mean_token_accuracy": 0.1372672997415066, "num_tokens": 4807772.0, "step": 2605 }, { "entropy": 5.999966764450074, "epoch": 0.21928166351606806, "grad_norm": 0.9375, "learning_rate": 0.000499793658423536, "loss": 6.0037, "mean_token_accuracy": 0.13394766226410865, "num_tokens": 4817999.0, "step": 2610 }, { "entropy": 6.197027158737183, "epoch": 0.21970174333123293, "grad_norm": 1.0625, "learning_rate": 0.0004997923742073028, "loss": 5.9552, "mean_token_accuracy": 0.14477612674236298, "num_tokens": 4826679.0, "step": 2615 }, { "entropy": 6.0403674125671385, "epoch": 0.2201218231463978, "grad_norm": 1.015625, "learning_rate": 0.0004997910860089884, "loss": 5.9647, "mean_token_accuracy": 0.13903913348913194, "num_tokens": 4834998.0, "step": 2620 }, { "entropy": 6.119702100753784, "epoch": 0.2205419029615627, "grad_norm": 1.0234375, "learning_rate": 0.0004997897938286156, "loss": 5.9173, "mean_token_accuracy": 0.13934070989489555, "num_tokens": 4843635.0, "step": 2625 }, { "entropy": 6.135205316543579, "epoch": 0.22096198277672757, "grad_norm": 1.0859375, "learning_rate": 0.0004997884976662075, "loss": 6.0334, "mean_token_accuracy": 0.13847846239805223, "num_tokens": 4852027.0, "step": 2630 }, { "entropy": 6.115947484970093, "epoch": 0.22138206259189247, "grad_norm": 1.0390625, "learning_rate": 0.0004997871975217868, "loss": 5.9555, "mean_token_accuracy": 0.1428781971335411, "num_tokens": 4861244.0, "step": 2635 }, { "entropy": 6.043252468109131, "epoch": 0.22180214240705734, "grad_norm": 0.95703125, "learning_rate": 0.0004997858933953768, "loss": 5.8579, "mean_token_accuracy": 0.14281281381845473, "num_tokens": 4869902.0, "step": 2640 }, { "entropy": 6.012739181518555, "epoch": 0.2222222222222222, "grad_norm": 0.95703125, "learning_rate": 0.0004997845852870004, "loss": 5.8421, "mean_token_accuracy": 0.1463296964764595, "num_tokens": 4878502.0, "step": 2645 }, { "entropy": 6.089871215820312, "epoch": 0.2226423020373871, "grad_norm": 0.9765625, "learning_rate": 0.0004997832731966806, "loss": 5.9032, "mean_token_accuracy": 0.14714645445346833, "num_tokens": 4888348.0, "step": 2650 }, { "entropy": 6.06225700378418, "epoch": 0.22306238185255198, "grad_norm": 1.015625, "learning_rate": 0.0004997819571244411, "loss": 5.972, "mean_token_accuracy": 0.1450254276394844, "num_tokens": 4897302.0, "step": 2655 }, { "entropy": 6.0446860790252686, "epoch": 0.22348246166771688, "grad_norm": 1.0, "learning_rate": 0.0004997806370703049, "loss": 5.9876, "mean_token_accuracy": 0.14430617392063141, "num_tokens": 4907078.0, "step": 2660 }, { "entropy": 6.057806348800659, "epoch": 0.22390254148288175, "grad_norm": 0.8671875, "learning_rate": 0.0004997793130342954, "loss": 5.8272, "mean_token_accuracy": 0.1456086441874504, "num_tokens": 4917489.0, "step": 2665 }, { "entropy": 5.973814630508423, "epoch": 0.22432262129804661, "grad_norm": 0.9765625, "learning_rate": 0.0004997779850164363, "loss": 5.9156, "mean_token_accuracy": 0.140571466088295, "num_tokens": 4927073.0, "step": 2670 }, { "entropy": 6.177860355377197, "epoch": 0.2247427011132115, "grad_norm": 0.98828125, "learning_rate": 0.0004997766530167508, "loss": 6.019, "mean_token_accuracy": 0.1344543881714344, "num_tokens": 4935464.0, "step": 2675 }, { "entropy": 6.22092981338501, "epoch": 0.22516278092837638, "grad_norm": 1.0078125, "learning_rate": 0.0004997753170352627, "loss": 6.0914, "mean_token_accuracy": 0.13605839386582375, "num_tokens": 4944718.0, "step": 2680 }, { "entropy": 6.105925226211548, "epoch": 0.22558286074354128, "grad_norm": 1.03125, "learning_rate": 0.0004997739770719955, "loss": 5.9844, "mean_token_accuracy": 0.13587288782000542, "num_tokens": 4954223.0, "step": 2685 }, { "entropy": 6.107930469512939, "epoch": 0.22600294055870615, "grad_norm": 0.921875, "learning_rate": 0.000499772633126973, "loss": 6.0132, "mean_token_accuracy": 0.13594387769699096, "num_tokens": 4963371.0, "step": 2690 }, { "entropy": 6.04271125793457, "epoch": 0.22642302037387105, "grad_norm": 0.98046875, "learning_rate": 0.0004997712852002192, "loss": 5.8679, "mean_token_accuracy": 0.1471228800714016, "num_tokens": 4972973.0, "step": 2695 }, { "entropy": 6.086397647857666, "epoch": 0.22684310018903592, "grad_norm": 1.0234375, "learning_rate": 0.0004997699332917578, "loss": 6.1119, "mean_token_accuracy": 0.12916670590639115, "num_tokens": 4982808.0, "step": 2700 }, { "entropy": 6.201492786407471, "epoch": 0.2272631800042008, "grad_norm": 0.94140625, "learning_rate": 0.0004997685774016127, "loss": 5.9896, "mean_token_accuracy": 0.13685485795140268, "num_tokens": 4992427.0, "step": 2705 }, { "entropy": 6.162964010238648, "epoch": 0.22768325981936569, "grad_norm": 0.84375, "learning_rate": 0.000499767217529808, "loss": 6.1604, "mean_token_accuracy": 0.12921097874641418, "num_tokens": 5003562.0, "step": 2710 }, { "entropy": 6.098525857925415, "epoch": 0.22810333963453056, "grad_norm": 0.890625, "learning_rate": 0.0004997658536763678, "loss": 5.8638, "mean_token_accuracy": 0.1451013281941414, "num_tokens": 5013429.0, "step": 2715 }, { "entropy": 6.117339611053467, "epoch": 0.22852341944969545, "grad_norm": 0.953125, "learning_rate": 0.0004997644858413163, "loss": 6.0022, "mean_token_accuracy": 0.14247513711452484, "num_tokens": 5022045.0, "step": 2720 }, { "entropy": 6.008642053604126, "epoch": 0.22894349926486032, "grad_norm": 0.88671875, "learning_rate": 0.0004997631140246775, "loss": 5.8287, "mean_token_accuracy": 0.14408515840768815, "num_tokens": 5032260.0, "step": 2725 }, { "entropy": 6.021863174438477, "epoch": 0.2293635790800252, "grad_norm": 0.9453125, "learning_rate": 0.000499761738226476, "loss": 5.8626, "mean_token_accuracy": 0.14258013665676117, "num_tokens": 5041688.0, "step": 2730 }, { "entropy": 6.056025457382202, "epoch": 0.2297836588951901, "grad_norm": 0.9765625, "learning_rate": 0.000499760358446736, "loss": 5.9702, "mean_token_accuracy": 0.13718490228056907, "num_tokens": 5051005.0, "step": 2735 }, { "entropy": 6.152891635894775, "epoch": 0.23020373871035496, "grad_norm": 0.96484375, "learning_rate": 0.000499758974685482, "loss": 5.9147, "mean_token_accuracy": 0.13967233374714852, "num_tokens": 5060084.0, "step": 2740 }, { "entropy": 6.059838390350341, "epoch": 0.23062381852551986, "grad_norm": 1.0859375, "learning_rate": 0.0004997575869427385, "loss": 5.9122, "mean_token_accuracy": 0.14734914749860764, "num_tokens": 5069081.0, "step": 2745 }, { "entropy": 6.0928624153137205, "epoch": 0.23104389834068473, "grad_norm": 0.9609375, "learning_rate": 0.00049975619521853, "loss": 5.9121, "mean_token_accuracy": 0.13845374211668968, "num_tokens": 5078597.0, "step": 2750 }, { "entropy": 6.052087306976318, "epoch": 0.2314639781558496, "grad_norm": 0.953125, "learning_rate": 0.0004997547995128814, "loss": 5.9554, "mean_token_accuracy": 0.14530446976423264, "num_tokens": 5087607.0, "step": 2755 }, { "entropy": 6.094136476516724, "epoch": 0.2318840579710145, "grad_norm": 1.078125, "learning_rate": 0.0004997533998258171, "loss": 5.9424, "mean_token_accuracy": 0.14329736083745956, "num_tokens": 5097412.0, "step": 2760 }, { "entropy": 6.16567211151123, "epoch": 0.23230413778617937, "grad_norm": 0.984375, "learning_rate": 0.0004997519961573622, "loss": 6.0152, "mean_token_accuracy": 0.13348544016480446, "num_tokens": 5105817.0, "step": 2765 }, { "entropy": 6.226717376708985, "epoch": 0.23272421760134426, "grad_norm": 1.0625, "learning_rate": 0.0004997505885075414, "loss": 6.0522, "mean_token_accuracy": 0.13480133637785913, "num_tokens": 5114958.0, "step": 2770 }, { "entropy": 6.084324312210083, "epoch": 0.23314429741650913, "grad_norm": 0.9609375, "learning_rate": 0.0004997491768763795, "loss": 5.9898, "mean_token_accuracy": 0.13868246227502823, "num_tokens": 5123728.0, "step": 2775 }, { "entropy": 6.100927209854126, "epoch": 0.23356437723167403, "grad_norm": 0.9921875, "learning_rate": 0.0004997477612639018, "loss": 6.0218, "mean_token_accuracy": 0.13395264372229576, "num_tokens": 5134099.0, "step": 2780 }, { "entropy": 6.162116241455078, "epoch": 0.2339844570468389, "grad_norm": 1.0, "learning_rate": 0.0004997463416701332, "loss": 6.0325, "mean_token_accuracy": 0.13172747194766998, "num_tokens": 5142934.0, "step": 2785 }, { "entropy": 6.000607919692993, "epoch": 0.23440453686200377, "grad_norm": 0.99609375, "learning_rate": 0.0004997449180950989, "loss": 5.8681, "mean_token_accuracy": 0.15649961084127426, "num_tokens": 5151835.0, "step": 2790 }, { "entropy": 6.038245487213135, "epoch": 0.23482461667716867, "grad_norm": 0.9140625, "learning_rate": 0.0004997434905388241, "loss": 5.921, "mean_token_accuracy": 0.1477814018726349, "num_tokens": 5161136.0, "step": 2795 }, { "entropy": 6.029763174057007, "epoch": 0.23524469649233354, "grad_norm": 0.921875, "learning_rate": 0.000499742059001334, "loss": 5.8684, "mean_token_accuracy": 0.14450337663292884, "num_tokens": 5170741.0, "step": 2800 }, { "entropy": 6.046102046966553, "epoch": 0.23566477630749844, "grad_norm": 0.9921875, "learning_rate": 0.0004997406234826541, "loss": 5.9001, "mean_token_accuracy": 0.14729267880320548, "num_tokens": 5180549.0, "step": 2805 }, { "entropy": 5.980107164382934, "epoch": 0.2360848561226633, "grad_norm": 0.88671875, "learning_rate": 0.0004997391839828098, "loss": 5.8667, "mean_token_accuracy": 0.14962306916713713, "num_tokens": 5189486.0, "step": 2810 }, { "entropy": 6.044159746170044, "epoch": 0.23650493593782818, "grad_norm": 0.96484375, "learning_rate": 0.0004997377405018266, "loss": 5.9303, "mean_token_accuracy": 0.13750530928373336, "num_tokens": 5198525.0, "step": 2815 }, { "entropy": 6.075648498535156, "epoch": 0.23692501575299307, "grad_norm": 0.99609375, "learning_rate": 0.00049973629303973, "loss": 5.9734, "mean_token_accuracy": 0.14086321070790292, "num_tokens": 5207124.0, "step": 2820 }, { "entropy": 5.964286422729492, "epoch": 0.23734509556815794, "grad_norm": 0.8984375, "learning_rate": 0.0004997348415965457, "loss": 5.8079, "mean_token_accuracy": 0.14603810012340546, "num_tokens": 5216529.0, "step": 2825 }, { "entropy": 6.12622709274292, "epoch": 0.23776517538332284, "grad_norm": 1.03125, "learning_rate": 0.0004997333861722995, "loss": 5.9402, "mean_token_accuracy": 0.14331007972359658, "num_tokens": 5225796.0, "step": 2830 }, { "entropy": 6.085462188720703, "epoch": 0.2381852551984877, "grad_norm": 1.0703125, "learning_rate": 0.000499731926767017, "loss": 5.9732, "mean_token_accuracy": 0.14003979936242103, "num_tokens": 5233876.0, "step": 2835 }, { "entropy": 6.016348743438721, "epoch": 0.23860533501365258, "grad_norm": 0.9375, "learning_rate": 0.0004997304633807242, "loss": 5.9695, "mean_token_accuracy": 0.13823127001523972, "num_tokens": 5244782.0, "step": 2840 }, { "entropy": 6.077929925918579, "epoch": 0.23902541482881748, "grad_norm": 0.99609375, "learning_rate": 0.0004997289960134468, "loss": 5.8993, "mean_token_accuracy": 0.14192162305116654, "num_tokens": 5253453.0, "step": 2845 }, { "entropy": 6.049857330322266, "epoch": 0.23944549464398235, "grad_norm": 1.0546875, "learning_rate": 0.0004997275246652111, "loss": 5.9414, "mean_token_accuracy": 0.14183279648423194, "num_tokens": 5262355.0, "step": 2850 }, { "entropy": 6.019342088699341, "epoch": 0.23986557445914725, "grad_norm": 1.0, "learning_rate": 0.000499726049336043, "loss": 5.8652, "mean_token_accuracy": 0.14227822795510292, "num_tokens": 5271959.0, "step": 2855 }, { "entropy": 6.045290803909301, "epoch": 0.24028565427431212, "grad_norm": 1.0546875, "learning_rate": 0.0004997245700259686, "loss": 5.8938, "mean_token_accuracy": 0.14394148513674737, "num_tokens": 5281393.0, "step": 2860 }, { "entropy": 6.126777935028076, "epoch": 0.240705734089477, "grad_norm": 0.921875, "learning_rate": 0.0004997230867350141, "loss": 6.0153, "mean_token_accuracy": 0.13795892894268036, "num_tokens": 5290979.0, "step": 2865 }, { "entropy": 6.170654964447022, "epoch": 0.24112581390464188, "grad_norm": 0.9921875, "learning_rate": 0.0004997215994632059, "loss": 5.9662, "mean_token_accuracy": 0.1420626498758793, "num_tokens": 5300263.0, "step": 2870 }, { "entropy": 6.098070096969605, "epoch": 0.24154589371980675, "grad_norm": 0.94921875, "learning_rate": 0.0004997201082105704, "loss": 5.9973, "mean_token_accuracy": 0.1376795694231987, "num_tokens": 5309522.0, "step": 2875 }, { "entropy": 6.09854941368103, "epoch": 0.24196597353497165, "grad_norm": 1.03125, "learning_rate": 0.0004997186129771338, "loss": 5.9906, "mean_token_accuracy": 0.1443823680281639, "num_tokens": 5319770.0, "step": 2880 }, { "entropy": 6.159392309188843, "epoch": 0.24238605335013652, "grad_norm": 1.015625, "learning_rate": 0.0004997171137629226, "loss": 5.9994, "mean_token_accuracy": 0.14119460731744765, "num_tokens": 5328400.0, "step": 2885 }, { "entropy": 6.00137939453125, "epoch": 0.24280613316530142, "grad_norm": 1.03125, "learning_rate": 0.0004997156105679636, "loss": 5.8054, "mean_token_accuracy": 0.15445883423089982, "num_tokens": 5336338.0, "step": 2890 }, { "entropy": 5.9904273509979244, "epoch": 0.2432262129804663, "grad_norm": 0.97265625, "learning_rate": 0.0004997141033922832, "loss": 5.8983, "mean_token_accuracy": 0.1381608746945858, "num_tokens": 5345391.0, "step": 2895 }, { "entropy": 6.080091238021851, "epoch": 0.24364629279563116, "grad_norm": 0.9921875, "learning_rate": 0.0004997125922359081, "loss": 5.9345, "mean_token_accuracy": 0.13472433462738992, "num_tokens": 5354709.0, "step": 2900 }, { "entropy": 6.0483152866363525, "epoch": 0.24406637261079606, "grad_norm": 1.0, "learning_rate": 0.0004997110770988652, "loss": 5.8441, "mean_token_accuracy": 0.14647466093301773, "num_tokens": 5363738.0, "step": 2905 }, { "entropy": 6.065390634536743, "epoch": 0.24448645242596093, "grad_norm": 1.078125, "learning_rate": 0.0004997095579811813, "loss": 5.9742, "mean_token_accuracy": 0.14132302552461623, "num_tokens": 5373583.0, "step": 2910 }, { "entropy": 6.1408384323120115, "epoch": 0.24490653224112582, "grad_norm": 0.875, "learning_rate": 0.0004997080348828833, "loss": 6.0104, "mean_token_accuracy": 0.14406906738877295, "num_tokens": 5383486.0, "step": 2915 }, { "entropy": 6.012083101272583, "epoch": 0.2453266120562907, "grad_norm": 1.0390625, "learning_rate": 0.0004997065078039981, "loss": 5.9283, "mean_token_accuracy": 0.13883504942059516, "num_tokens": 5391974.0, "step": 2920 }, { "entropy": 6.098450088500977, "epoch": 0.24574669187145556, "grad_norm": 1.03125, "learning_rate": 0.0004997049767445529, "loss": 5.9688, "mean_token_accuracy": 0.13587900176644324, "num_tokens": 5400882.0, "step": 2925 }, { "entropy": 6.1687455654144285, "epoch": 0.24616677168662046, "grad_norm": 0.96484375, "learning_rate": 0.0004997034417045746, "loss": 5.9199, "mean_token_accuracy": 0.13755179792642594, "num_tokens": 5410538.0, "step": 2930 }, { "entropy": 6.019326400756836, "epoch": 0.24658685150178533, "grad_norm": 0.99609375, "learning_rate": 0.0004997019026840907, "loss": 5.8134, "mean_token_accuracy": 0.14420632421970367, "num_tokens": 5419406.0, "step": 2935 }, { "entropy": 5.9686970710754395, "epoch": 0.24700693131695023, "grad_norm": 0.98046875, "learning_rate": 0.0004997003596831282, "loss": 5.941, "mean_token_accuracy": 0.13971618413925171, "num_tokens": 5428817.0, "step": 2940 }, { "entropy": 6.097631120681763, "epoch": 0.2474270111321151, "grad_norm": 0.98828125, "learning_rate": 0.0004996988127017145, "loss": 5.9448, "mean_token_accuracy": 0.13872243240475654, "num_tokens": 5438277.0, "step": 2945 }, { "entropy": 6.047083616256714, "epoch": 0.24784709094728, "grad_norm": 1.0234375, "learning_rate": 0.0004996972617398772, "loss": 5.974, "mean_token_accuracy": 0.13909853398799896, "num_tokens": 5447440.0, "step": 2950 }, { "entropy": 6.065885257720947, "epoch": 0.24826717076244487, "grad_norm": 0.98828125, "learning_rate": 0.0004996957067976435, "loss": 5.9005, "mean_token_accuracy": 0.13819090723991395, "num_tokens": 5455988.0, "step": 2955 }, { "entropy": 6.079396390914917, "epoch": 0.24868725057760974, "grad_norm": 0.96875, "learning_rate": 0.0004996941478750411, "loss": 5.895, "mean_token_accuracy": 0.14170320481061935, "num_tokens": 5464996.0, "step": 2960 }, { "entropy": 6.131442737579346, "epoch": 0.24910733039277463, "grad_norm": 0.9140625, "learning_rate": 0.0004996925849720975, "loss": 6.0433, "mean_token_accuracy": 0.13297844752669336, "num_tokens": 5474174.0, "step": 2965 }, { "entropy": 6.144496154785156, "epoch": 0.2495274102079395, "grad_norm": 1.0390625, "learning_rate": 0.0004996910180888405, "loss": 5.928, "mean_token_accuracy": 0.14379495605826378, "num_tokens": 5482838.0, "step": 2970 }, { "entropy": 6.089239263534546, "epoch": 0.2499474900231044, "grad_norm": 0.9609375, "learning_rate": 0.0004996894472252977, "loss": 5.9339, "mean_token_accuracy": 0.1420593172311783, "num_tokens": 5491616.0, "step": 2975 }, { "entropy": 5.992457008361816, "epoch": 0.25036756983826924, "grad_norm": 0.94921875, "learning_rate": 0.0004996878723814973, "loss": 5.9265, "mean_token_accuracy": 0.13892921283841134, "num_tokens": 5500942.0, "step": 2980 }, { "entropy": 6.117427587509155, "epoch": 0.25078764965343414, "grad_norm": 0.94921875, "learning_rate": 0.0004996862935574667, "loss": 5.8788, "mean_token_accuracy": 0.13912170454859735, "num_tokens": 5510078.0, "step": 2985 }, { "entropy": 5.943054437637329, "epoch": 0.25120772946859904, "grad_norm": 0.94140625, "learning_rate": 0.0004996847107532342, "loss": 5.9134, "mean_token_accuracy": 0.14340257570147513, "num_tokens": 5518924.0, "step": 2990 }, { "entropy": 6.108536148071289, "epoch": 0.25162780928376394, "grad_norm": 0.93359375, "learning_rate": 0.0004996831239688277, "loss": 5.9216, "mean_token_accuracy": 0.13749035373330115, "num_tokens": 5527385.0, "step": 2995 }, { "entropy": 5.977105903625488, "epoch": 0.2520478890989288, "grad_norm": 0.95703125, "learning_rate": 0.0004996815332042754, "loss": 5.766, "mean_token_accuracy": 0.15047305673360825, "num_tokens": 5536781.0, "step": 3000 }, { "epoch": 0.2520478890989288, "eval_entropy": 5.7445289912557636, "eval_loss": 5.931798458099365, "eval_mean_token_accuracy": 0.1480788363722414, "eval_num_tokens": 5536781.0, "eval_runtime": 21.0325, "eval_samples_per_second": 1776.586, "eval_steps_per_second": 222.085, "step": 3000 }, { "entropy": 6.008361387252807, "epoch": 0.2524679689140937, "grad_norm": 0.96484375, "learning_rate": 0.0004996799384596054, "loss": 5.9477, "mean_token_accuracy": 0.14386533573269844, "num_tokens": 5545893.0, "step": 3005 }, { "entropy": 6.112303066253662, "epoch": 0.2528880487292586, "grad_norm": 0.90625, "learning_rate": 0.0004996783397348461, "loss": 5.9152, "mean_token_accuracy": 0.13690555915236474, "num_tokens": 5555818.0, "step": 3010 }, { "entropy": 6.042035245895386, "epoch": 0.2533081285444234, "grad_norm": 0.8671875, "learning_rate": 0.0004996767370300256, "loss": 5.8717, "mean_token_accuracy": 0.14453656524419783, "num_tokens": 5565331.0, "step": 3015 }, { "entropy": 6.081929445266724, "epoch": 0.2537282083595883, "grad_norm": 1.0, "learning_rate": 0.0004996751303451724, "loss": 5.8599, "mean_token_accuracy": 0.14481035768985748, "num_tokens": 5574003.0, "step": 3020 }, { "entropy": 5.977067756652832, "epoch": 0.2541482881747532, "grad_norm": 0.9765625, "learning_rate": 0.0004996735196803149, "loss": 5.7815, "mean_token_accuracy": 0.15307400673627852, "num_tokens": 5582517.0, "step": 3025 }, { "entropy": 6.072621822357178, "epoch": 0.2545683679899181, "grad_norm": 0.875, "learning_rate": 0.0004996719050354818, "loss": 5.9948, "mean_token_accuracy": 0.13989571258425712, "num_tokens": 5591952.0, "step": 3030 }, { "entropy": 6.03379979133606, "epoch": 0.25498844780508295, "grad_norm": 0.953125, "learning_rate": 0.0004996702864107015, "loss": 5.8913, "mean_token_accuracy": 0.14787303507328034, "num_tokens": 5601460.0, "step": 3035 }, { "entropy": 6.189465713500977, "epoch": 0.25540852762024785, "grad_norm": 0.98828125, "learning_rate": 0.0004996686638060028, "loss": 6.0052, "mean_token_accuracy": 0.13520606160163878, "num_tokens": 5610776.0, "step": 3040 }, { "entropy": 6.085352611541748, "epoch": 0.25582860743541275, "grad_norm": 0.91015625, "learning_rate": 0.0004996670372214144, "loss": 5.9054, "mean_token_accuracy": 0.14562050476670266, "num_tokens": 5619627.0, "step": 3045 }, { "entropy": 5.9095056533813475, "epoch": 0.2562486872505776, "grad_norm": 0.87890625, "learning_rate": 0.0004996654066569651, "loss": 5.7872, "mean_token_accuracy": 0.14956104382872581, "num_tokens": 5628969.0, "step": 3050 }, { "entropy": 5.998289918899536, "epoch": 0.2566687670657425, "grad_norm": 0.9921875, "learning_rate": 0.0004996637721126839, "loss": 5.8501, "mean_token_accuracy": 0.14419863522052764, "num_tokens": 5638629.0, "step": 3055 }, { "entropy": 6.084632110595703, "epoch": 0.2570888468809074, "grad_norm": 1.046875, "learning_rate": 0.0004996621335885996, "loss": 5.9249, "mean_token_accuracy": 0.13865133970975876, "num_tokens": 5647571.0, "step": 3060 }, { "entropy": 6.059264850616455, "epoch": 0.2575089266960722, "grad_norm": 1.21875, "learning_rate": 0.0004996604910847413, "loss": 5.8418, "mean_token_accuracy": 0.1548224687576294, "num_tokens": 5656709.0, "step": 3065 }, { "entropy": 6.037788724899292, "epoch": 0.2579290065112371, "grad_norm": 0.96484375, "learning_rate": 0.000499658844601138, "loss": 6.0136, "mean_token_accuracy": 0.14061269238591195, "num_tokens": 5665714.0, "step": 3070 }, { "entropy": 6.112887382507324, "epoch": 0.258349086326402, "grad_norm": 0.91796875, "learning_rate": 0.000499657194137819, "loss": 5.9813, "mean_token_accuracy": 0.1434816040098667, "num_tokens": 5675854.0, "step": 3075 }, { "entropy": 6.10079174041748, "epoch": 0.2587691661415669, "grad_norm": 0.96875, "learning_rate": 0.0004996555396948136, "loss": 5.8062, "mean_token_accuracy": 0.14445895925164223, "num_tokens": 5685690.0, "step": 3080 }, { "entropy": 6.008033037185669, "epoch": 0.25918924595673176, "grad_norm": 0.88671875, "learning_rate": 0.0004996538812721509, "loss": 5.8654, "mean_token_accuracy": 0.14993129372596742, "num_tokens": 5695766.0, "step": 3085 }, { "entropy": 6.072084999084472, "epoch": 0.25960932577189666, "grad_norm": 1.046875, "learning_rate": 0.0004996522188698603, "loss": 5.8982, "mean_token_accuracy": 0.14610292240977288, "num_tokens": 5704365.0, "step": 3090 }, { "entropy": 6.0555907726287845, "epoch": 0.26002940558706156, "grad_norm": 1.125, "learning_rate": 0.0004996505524879714, "loss": 6.0101, "mean_token_accuracy": 0.14055205136537552, "num_tokens": 5713345.0, "step": 3095 }, { "entropy": 6.035314083099365, "epoch": 0.2604494854022264, "grad_norm": 0.91796875, "learning_rate": 0.0004996488821265137, "loss": 5.816, "mean_token_accuracy": 0.14724740535020828, "num_tokens": 5722907.0, "step": 3100 }, { "entropy": 6.007513093948364, "epoch": 0.2608695652173913, "grad_norm": 0.98828125, "learning_rate": 0.0004996472077855166, "loss": 5.8596, "mean_token_accuracy": 0.1498942032456398, "num_tokens": 5731589.0, "step": 3105 }, { "entropy": 5.998636054992676, "epoch": 0.2612896450325562, "grad_norm": 0.984375, "learning_rate": 0.00049964552946501, "loss": 5.8476, "mean_token_accuracy": 0.1439466342329979, "num_tokens": 5739922.0, "step": 3110 }, { "entropy": 5.9389458179473875, "epoch": 0.2617097248477211, "grad_norm": 0.96484375, "learning_rate": 0.0004996438471650235, "loss": 5.7675, "mean_token_accuracy": 0.15062671899795532, "num_tokens": 5749206.0, "step": 3115 }, { "entropy": 6.008351278305054, "epoch": 0.26212980466288593, "grad_norm": 0.92578125, "learning_rate": 0.0004996421608855869, "loss": 5.8288, "mean_token_accuracy": 0.15271472856402396, "num_tokens": 5758803.0, "step": 3120 }, { "entropy": 6.044885444641113, "epoch": 0.26254988447805083, "grad_norm": 0.96875, "learning_rate": 0.0004996404706267301, "loss": 5.9065, "mean_token_accuracy": 0.13532925099134446, "num_tokens": 5768368.0, "step": 3125 }, { "entropy": 5.958721733093261, "epoch": 0.26296996429321573, "grad_norm": 1.0625, "learning_rate": 0.000499638776388483, "loss": 5.7648, "mean_token_accuracy": 0.1534928262233734, "num_tokens": 5776707.0, "step": 3130 }, { "entropy": 5.986162996292114, "epoch": 0.26339004410838057, "grad_norm": 0.97265625, "learning_rate": 0.0004996370781708757, "loss": 5.9532, "mean_token_accuracy": 0.13491747826337813, "num_tokens": 5787037.0, "step": 3135 }, { "entropy": 6.018689870834351, "epoch": 0.26381012392354547, "grad_norm": 0.875, "learning_rate": 0.0004996353759739382, "loss": 5.9005, "mean_token_accuracy": 0.14967331141233445, "num_tokens": 5796630.0, "step": 3140 }, { "entropy": 5.985601377487183, "epoch": 0.26423020373871037, "grad_norm": 1.015625, "learning_rate": 0.0004996336697977007, "loss": 5.8974, "mean_token_accuracy": 0.14190822690725327, "num_tokens": 5806402.0, "step": 3145 }, { "entropy": 5.99180235862732, "epoch": 0.2646502835538752, "grad_norm": 0.98828125, "learning_rate": 0.0004996319596421933, "loss": 5.853, "mean_token_accuracy": 0.14679677560925483, "num_tokens": 5815742.0, "step": 3150 }, { "entropy": 6.00025954246521, "epoch": 0.2650703633690401, "grad_norm": 0.90625, "learning_rate": 0.0004996302455074466, "loss": 5.8679, "mean_token_accuracy": 0.14232094436883927, "num_tokens": 5824915.0, "step": 3155 }, { "entropy": 6.032740592956543, "epoch": 0.265490443184205, "grad_norm": 0.921875, "learning_rate": 0.0004996285273934906, "loss": 5.8901, "mean_token_accuracy": 0.14556412398815155, "num_tokens": 5834978.0, "step": 3160 }, { "entropy": 6.078465604782105, "epoch": 0.2659105229993699, "grad_norm": 0.87890625, "learning_rate": 0.000499626805300356, "loss": 6.0439, "mean_token_accuracy": 0.14277126342058183, "num_tokens": 5845684.0, "step": 3165 }, { "entropy": 6.094513893127441, "epoch": 0.26633060281453474, "grad_norm": 0.97265625, "learning_rate": 0.0004996250792280732, "loss": 5.9226, "mean_token_accuracy": 0.13814914003014564, "num_tokens": 5854905.0, "step": 3170 }, { "entropy": 6.054658889770508, "epoch": 0.26675068262969964, "grad_norm": 1.03125, "learning_rate": 0.0004996233491766727, "loss": 5.934, "mean_token_accuracy": 0.14257717728614808, "num_tokens": 5863654.0, "step": 3175 }, { "entropy": 6.036546421051026, "epoch": 0.26717076244486454, "grad_norm": 1.03125, "learning_rate": 0.0004996216151461854, "loss": 5.9289, "mean_token_accuracy": 0.14137156009674073, "num_tokens": 5872442.0, "step": 3180 }, { "entropy": 6.089460277557373, "epoch": 0.2675908422600294, "grad_norm": 0.9609375, "learning_rate": 0.0004996198771366417, "loss": 5.8594, "mean_token_accuracy": 0.14687168076634408, "num_tokens": 5882372.0, "step": 3185 }, { "entropy": 5.836459922790527, "epoch": 0.2680109220751943, "grad_norm": 0.98828125, "learning_rate": 0.0004996181351480726, "loss": 5.6727, "mean_token_accuracy": 0.15421667248010634, "num_tokens": 5891113.0, "step": 3190 }, { "entropy": 5.909378480911255, "epoch": 0.2684310018903592, "grad_norm": 0.94140625, "learning_rate": 0.0004996163891805089, "loss": 5.9167, "mean_token_accuracy": 0.14929258525371553, "num_tokens": 5899582.0, "step": 3195 }, { "entropy": 6.088847398757935, "epoch": 0.2688510817055241, "grad_norm": 0.94921875, "learning_rate": 0.0004996146392339815, "loss": 5.8788, "mean_token_accuracy": 0.137289460003376, "num_tokens": 5908938.0, "step": 3200 }, { "entropy": 6.025485897064209, "epoch": 0.2692711615206889, "grad_norm": 0.9453125, "learning_rate": 0.0004996128853085215, "loss": 5.8462, "mean_token_accuracy": 0.14703118950128555, "num_tokens": 5918055.0, "step": 3205 }, { "entropy": 6.024847555160522, "epoch": 0.2696912413358538, "grad_norm": 0.921875, "learning_rate": 0.0004996111274041598, "loss": 5.8169, "mean_token_accuracy": 0.14159609079360963, "num_tokens": 5926744.0, "step": 3210 }, { "entropy": 6.007894611358642, "epoch": 0.2701113211510187, "grad_norm": 0.87109375, "learning_rate": 0.0004996093655209277, "loss": 5.9028, "mean_token_accuracy": 0.1412175938487053, "num_tokens": 5936521.0, "step": 3215 }, { "entropy": 6.093644618988037, "epoch": 0.27053140096618356, "grad_norm": 0.98828125, "learning_rate": 0.0004996075996588563, "loss": 5.9689, "mean_token_accuracy": 0.1381188787519932, "num_tokens": 5945010.0, "step": 3220 }, { "entropy": 6.014964437484741, "epoch": 0.27095148078134845, "grad_norm": 0.9296875, "learning_rate": 0.000499605829817977, "loss": 5.8629, "mean_token_accuracy": 0.15120311975479125, "num_tokens": 5953766.0, "step": 3225 }, { "entropy": 5.982144498825074, "epoch": 0.27137156059651335, "grad_norm": 0.90234375, "learning_rate": 0.000499604055998321, "loss": 5.8001, "mean_token_accuracy": 0.14623286202549934, "num_tokens": 5962168.0, "step": 3230 }, { "entropy": 5.941414022445679, "epoch": 0.2717916404116782, "grad_norm": 0.890625, "learning_rate": 0.0004996022781999198, "loss": 5.8249, "mean_token_accuracy": 0.14706685170531272, "num_tokens": 5971627.0, "step": 3235 }, { "entropy": 6.00689377784729, "epoch": 0.2722117202268431, "grad_norm": 0.97265625, "learning_rate": 0.000499600496422805, "loss": 5.8993, "mean_token_accuracy": 0.14405820965766908, "num_tokens": 5981775.0, "step": 3240 }, { "entropy": 5.973731327056885, "epoch": 0.272631800042008, "grad_norm": 0.9453125, "learning_rate": 0.000499598710667008, "loss": 5.838, "mean_token_accuracy": 0.1444271594285965, "num_tokens": 5991097.0, "step": 3245 }, { "entropy": 5.973551654815674, "epoch": 0.2730518798571729, "grad_norm": 1.0234375, "learning_rate": 0.0004995969209325604, "loss": 5.8988, "mean_token_accuracy": 0.14417145103216172, "num_tokens": 5999517.0, "step": 3250 }, { "entropy": 5.939422225952148, "epoch": 0.2734719596723377, "grad_norm": 0.953125, "learning_rate": 0.0004995951272194941, "loss": 5.8778, "mean_token_accuracy": 0.139290714263916, "num_tokens": 6008545.0, "step": 3255 }, { "entropy": 6.07567138671875, "epoch": 0.2738920394875026, "grad_norm": 0.9765625, "learning_rate": 0.0004995933295278407, "loss": 5.8603, "mean_token_accuracy": 0.14346815124154091, "num_tokens": 6017366.0, "step": 3260 }, { "entropy": 5.989615488052368, "epoch": 0.2743121193026675, "grad_norm": 1.046875, "learning_rate": 0.0004995915278576321, "loss": 5.8024, "mean_token_accuracy": 0.14921536892652512, "num_tokens": 6025597.0, "step": 3265 }, { "entropy": 5.995965671539307, "epoch": 0.27473219911783237, "grad_norm": 0.87890625, "learning_rate": 0.0004995897222089004, "loss": 5.9055, "mean_token_accuracy": 0.1438031278550625, "num_tokens": 6034239.0, "step": 3270 }, { "entropy": 6.17506217956543, "epoch": 0.27515227893299726, "grad_norm": 0.953125, "learning_rate": 0.0004995879125816772, "loss": 5.9388, "mean_token_accuracy": 0.14314718097448348, "num_tokens": 6043837.0, "step": 3275 }, { "entropy": 5.962472629547119, "epoch": 0.27557235874816216, "grad_norm": 0.87109375, "learning_rate": 0.0004995860989759949, "loss": 5.8709, "mean_token_accuracy": 0.14632273614406585, "num_tokens": 6053217.0, "step": 3280 }, { "entropy": 6.029792261123657, "epoch": 0.27599243856332706, "grad_norm": 1.0078125, "learning_rate": 0.0004995842813918855, "loss": 5.8948, "mean_token_accuracy": 0.1460642173886299, "num_tokens": 6061553.0, "step": 3285 }, { "entropy": 5.981232643127441, "epoch": 0.2764125183784919, "grad_norm": 1.046875, "learning_rate": 0.0004995824598293812, "loss": 5.7712, "mean_token_accuracy": 0.1501307800412178, "num_tokens": 6070080.0, "step": 3290 }, { "entropy": 6.045267486572266, "epoch": 0.2768325981936568, "grad_norm": 0.89453125, "learning_rate": 0.0004995806342885142, "loss": 5.9245, "mean_token_accuracy": 0.14930349588394165, "num_tokens": 6078438.0, "step": 3295 }, { "entropy": 6.0462220191955565, "epoch": 0.2772526780088217, "grad_norm": 1.0078125, "learning_rate": 0.000499578804769317, "loss": 5.9092, "mean_token_accuracy": 0.13776859119534493, "num_tokens": 6087794.0, "step": 3300 }, { "entropy": 6.104273176193237, "epoch": 0.27767275782398654, "grad_norm": 0.90234375, "learning_rate": 0.0004995769712718218, "loss": 5.9152, "mean_token_accuracy": 0.14523780345916748, "num_tokens": 6096709.0, "step": 3305 }, { "entropy": 5.998883199691773, "epoch": 0.27809283763915144, "grad_norm": 1.0, "learning_rate": 0.0004995751337960613, "loss": 5.8495, "mean_token_accuracy": 0.14268894568085672, "num_tokens": 6105866.0, "step": 3310 }, { "entropy": 6.001236534118652, "epoch": 0.27851291745431633, "grad_norm": 0.953125, "learning_rate": 0.0004995732923420679, "loss": 5.8071, "mean_token_accuracy": 0.15081177204847335, "num_tokens": 6114882.0, "step": 3315 }, { "entropy": 5.930415248870849, "epoch": 0.2789329972694812, "grad_norm": 0.9453125, "learning_rate": 0.0004995714469098743, "loss": 5.7725, "mean_token_accuracy": 0.14834588766098022, "num_tokens": 6123978.0, "step": 3320 }, { "entropy": 5.966728734970093, "epoch": 0.2793530770846461, "grad_norm": 0.93359375, "learning_rate": 0.000499569597499513, "loss": 5.9104, "mean_token_accuracy": 0.1466206818819046, "num_tokens": 6133246.0, "step": 3325 }, { "entropy": 5.988458681106567, "epoch": 0.27977315689981097, "grad_norm": 0.8671875, "learning_rate": 0.0004995677441110172, "loss": 5.7702, "mean_token_accuracy": 0.14939837008714676, "num_tokens": 6142865.0, "step": 3330 }, { "entropy": 6.014625930786133, "epoch": 0.28019323671497587, "grad_norm": 0.94140625, "learning_rate": 0.0004995658867444192, "loss": 5.8654, "mean_token_accuracy": 0.13881808668375015, "num_tokens": 6152492.0, "step": 3335 }, { "entropy": 5.975307273864746, "epoch": 0.2806133165301407, "grad_norm": 1.0078125, "learning_rate": 0.0004995640253997523, "loss": 5.8652, "mean_token_accuracy": 0.1395415373146534, "num_tokens": 6161953.0, "step": 3340 }, { "entropy": 5.848208713531494, "epoch": 0.2810333963453056, "grad_norm": 0.86328125, "learning_rate": 0.0004995621600770492, "loss": 5.7285, "mean_token_accuracy": 0.1502986840903759, "num_tokens": 6171467.0, "step": 3345 }, { "entropy": 5.9759973049163815, "epoch": 0.2814534761604705, "grad_norm": 0.87890625, "learning_rate": 0.0004995602907763431, "loss": 5.8103, "mean_token_accuracy": 0.1470308281481266, "num_tokens": 6180646.0, "step": 3350 }, { "entropy": 5.981297445297241, "epoch": 0.28187355597563535, "grad_norm": 1.0234375, "learning_rate": 0.0004995584174976672, "loss": 5.8029, "mean_token_accuracy": 0.14213321059942247, "num_tokens": 6189832.0, "step": 3355 }, { "entropy": 5.966393995285034, "epoch": 0.28229363579080025, "grad_norm": 0.95703125, "learning_rate": 0.0004995565402410544, "loss": 5.7274, "mean_token_accuracy": 0.1558822512626648, "num_tokens": 6198339.0, "step": 3360 }, { "entropy": 5.935036706924438, "epoch": 0.28271371560596514, "grad_norm": 1.0859375, "learning_rate": 0.0004995546590065383, "loss": 5.8126, "mean_token_accuracy": 0.14656742215156554, "num_tokens": 6207564.0, "step": 3365 }, { "entropy": 6.000332260131836, "epoch": 0.28313379542113004, "grad_norm": 0.98828125, "learning_rate": 0.0004995527737941518, "loss": 5.8581, "mean_token_accuracy": 0.14725540429353715, "num_tokens": 6216056.0, "step": 3370 }, { "entropy": 5.969868230819702, "epoch": 0.2835538752362949, "grad_norm": 0.9609375, "learning_rate": 0.0004995508846039287, "loss": 5.8259, "mean_token_accuracy": 0.1441423200070858, "num_tokens": 6225573.0, "step": 3375 }, { "entropy": 6.054820203781128, "epoch": 0.2839739550514598, "grad_norm": 0.93359375, "learning_rate": 0.0004995489914359023, "loss": 5.9519, "mean_token_accuracy": 0.13889921978116035, "num_tokens": 6235057.0, "step": 3380 }, { "entropy": 6.0446230411529545, "epoch": 0.2843940348666247, "grad_norm": 0.98046875, "learning_rate": 0.0004995470942901061, "loss": 5.8635, "mean_token_accuracy": 0.1436339296400547, "num_tokens": 6244164.0, "step": 3385 }, { "entropy": 6.036704730987549, "epoch": 0.2848141146817895, "grad_norm": 1.0, "learning_rate": 0.0004995451931665738, "loss": 5.8685, "mean_token_accuracy": 0.14183638542890548, "num_tokens": 6253095.0, "step": 3390 }, { "entropy": 5.9995965480804445, "epoch": 0.2852341944969544, "grad_norm": 0.94921875, "learning_rate": 0.000499543288065339, "loss": 5.817, "mean_token_accuracy": 0.14616027921438218, "num_tokens": 6261134.0, "step": 3395 }, { "entropy": 5.918176984786987, "epoch": 0.2856542743121193, "grad_norm": 1.015625, "learning_rate": 0.0004995413789864354, "loss": 5.8093, "mean_token_accuracy": 0.15111583173274995, "num_tokens": 6270384.0, "step": 3400 }, { "entropy": 5.925231647491455, "epoch": 0.28607435412728416, "grad_norm": 0.90234375, "learning_rate": 0.0004995394659298971, "loss": 5.7581, "mean_token_accuracy": 0.15247000753879547, "num_tokens": 6279702.0, "step": 3405 }, { "entropy": 5.9355387687683105, "epoch": 0.28649443394244906, "grad_norm": 0.90625, "learning_rate": 0.0004995375488957576, "loss": 5.8087, "mean_token_accuracy": 0.14355491399765014, "num_tokens": 6288297.0, "step": 3410 }, { "entropy": 5.953091335296631, "epoch": 0.28691451375761395, "grad_norm": 0.95703125, "learning_rate": 0.000499535627884051, "loss": 5.8943, "mean_token_accuracy": 0.13816075548529624, "num_tokens": 6297288.0, "step": 3415 }, { "entropy": 6.1151526927947994, "epoch": 0.28733459357277885, "grad_norm": 0.93359375, "learning_rate": 0.0004995337028948115, "loss": 5.912, "mean_token_accuracy": 0.13960782587528228, "num_tokens": 6306719.0, "step": 3420 }, { "entropy": 5.956048154830933, "epoch": 0.2877546733879437, "grad_norm": 0.9609375, "learning_rate": 0.0004995317739280731, "loss": 5.7384, "mean_token_accuracy": 0.15413220077753068, "num_tokens": 6316639.0, "step": 3425 }, { "entropy": 5.9882111072540285, "epoch": 0.2881747532031086, "grad_norm": 0.9375, "learning_rate": 0.0004995298409838699, "loss": 5.8729, "mean_token_accuracy": 0.14296835884451867, "num_tokens": 6326879.0, "step": 3430 }, { "entropy": 5.922442245483398, "epoch": 0.2885948330182735, "grad_norm": 0.90234375, "learning_rate": 0.000499527904062236, "loss": 5.7735, "mean_token_accuracy": 0.15226557850837708, "num_tokens": 6335729.0, "step": 3435 }, { "entropy": 5.973740720748902, "epoch": 0.28901491283343833, "grad_norm": 0.89453125, "learning_rate": 0.0004995259631632061, "loss": 5.8537, "mean_token_accuracy": 0.1386033460497856, "num_tokens": 6345154.0, "step": 3440 }, { "entropy": 5.9747546195983885, "epoch": 0.28943499264860323, "grad_norm": 0.9609375, "learning_rate": 0.0004995240182868143, "loss": 5.8072, "mean_token_accuracy": 0.14772575795650483, "num_tokens": 6354309.0, "step": 3445 }, { "entropy": 5.879770755767822, "epoch": 0.2898550724637681, "grad_norm": 0.89453125, "learning_rate": 0.0004995220694330951, "loss": 5.764, "mean_token_accuracy": 0.14814788177609445, "num_tokens": 6363389.0, "step": 3450 }, { "entropy": 5.928126335144043, "epoch": 0.290275152278933, "grad_norm": 0.921875, "learning_rate": 0.0004995201166020832, "loss": 5.8394, "mean_token_accuracy": 0.1423036128282547, "num_tokens": 6372475.0, "step": 3455 }, { "entropy": 6.01046404838562, "epoch": 0.29069523209409787, "grad_norm": 1.015625, "learning_rate": 0.000499518159793813, "loss": 5.7909, "mean_token_accuracy": 0.15391181409358978, "num_tokens": 6380906.0, "step": 3460 }, { "entropy": 5.901024436950683, "epoch": 0.29111531190926276, "grad_norm": 0.984375, "learning_rate": 0.000499516199008319, "loss": 5.7893, "mean_token_accuracy": 0.147665573656559, "num_tokens": 6390085.0, "step": 3465 }, { "entropy": 6.005919504165649, "epoch": 0.29153539172442766, "grad_norm": 1.0, "learning_rate": 0.0004995142342456364, "loss": 5.8587, "mean_token_accuracy": 0.14177713990211488, "num_tokens": 6399441.0, "step": 3470 }, { "entropy": 6.037836742401123, "epoch": 0.2919554715395925, "grad_norm": 0.95703125, "learning_rate": 0.0004995122655057997, "loss": 5.9277, "mean_token_accuracy": 0.14434729218482972, "num_tokens": 6408995.0, "step": 3475 }, { "entropy": 5.8759626865386965, "epoch": 0.2923755513547574, "grad_norm": 0.9453125, "learning_rate": 0.0004995102927888437, "loss": 5.6769, "mean_token_accuracy": 0.15346557945013045, "num_tokens": 6418080.0, "step": 3480 }, { "entropy": 5.980447435379029, "epoch": 0.2927956311699223, "grad_norm": 1.0390625, "learning_rate": 0.0004995083160948036, "loss": 5.8654, "mean_token_accuracy": 0.14365637302398682, "num_tokens": 6426732.0, "step": 3485 }, { "entropy": 5.918527126312256, "epoch": 0.29321571098508714, "grad_norm": 0.953125, "learning_rate": 0.0004995063354237141, "loss": 5.8601, "mean_token_accuracy": 0.14886348843574523, "num_tokens": 6435957.0, "step": 3490 }, { "entropy": 5.965629720687867, "epoch": 0.29363579080025204, "grad_norm": 1.1015625, "learning_rate": 0.0004995043507756107, "loss": 5.807, "mean_token_accuracy": 0.14377646446228026, "num_tokens": 6445642.0, "step": 3495 }, { "entropy": 5.966208457946777, "epoch": 0.29405587061541694, "grad_norm": 1.0859375, "learning_rate": 0.0004995023621505282, "loss": 5.8468, "mean_token_accuracy": 0.14531085640192032, "num_tokens": 6454664.0, "step": 3500 }, { "entropy": 5.846572160720825, "epoch": 0.29447595043058183, "grad_norm": 0.96484375, "learning_rate": 0.000499500369548502, "loss": 5.7718, "mean_token_accuracy": 0.14744968637824057, "num_tokens": 6463224.0, "step": 3505 }, { "entropy": 6.10300350189209, "epoch": 0.2948960302457467, "grad_norm": 0.90625, "learning_rate": 0.0004994983729695674, "loss": 5.9886, "mean_token_accuracy": 0.13981593102216722, "num_tokens": 6473112.0, "step": 3510 }, { "entropy": 5.991326189041137, "epoch": 0.2953161100609116, "grad_norm": 1.046875, "learning_rate": 0.0004994963724137595, "loss": 5.834, "mean_token_accuracy": 0.14485643282532693, "num_tokens": 6482062.0, "step": 3515 }, { "entropy": 5.928696584701538, "epoch": 0.29573618987607647, "grad_norm": 1.0625, "learning_rate": 0.0004994943678811142, "loss": 5.8362, "mean_token_accuracy": 0.1416163809597492, "num_tokens": 6490568.0, "step": 3520 }, { "entropy": 5.993920183181762, "epoch": 0.2961562696912413, "grad_norm": 0.9296875, "learning_rate": 0.0004994923593716667, "loss": 5.8772, "mean_token_accuracy": 0.14611808955669403, "num_tokens": 6500815.0, "step": 3525 }, { "entropy": 5.930905771255493, "epoch": 0.2965763495064062, "grad_norm": 0.94921875, "learning_rate": 0.0004994903468854527, "loss": 5.7544, "mean_token_accuracy": 0.15672436058521272, "num_tokens": 6509529.0, "step": 3530 }, { "entropy": 5.8914727687835695, "epoch": 0.2969964293215711, "grad_norm": 0.9609375, "learning_rate": 0.0004994883304225077, "loss": 5.8141, "mean_token_accuracy": 0.1436660371720791, "num_tokens": 6517934.0, "step": 3535 }, { "entropy": 6.048480892181397, "epoch": 0.297416509136736, "grad_norm": 0.90234375, "learning_rate": 0.0004994863099828675, "loss": 5.7902, "mean_token_accuracy": 0.14704177230596543, "num_tokens": 6526098.0, "step": 3540 }, { "entropy": 5.920773935317993, "epoch": 0.29783658895190085, "grad_norm": 0.953125, "learning_rate": 0.000499484285566568, "loss": 5.8221, "mean_token_accuracy": 0.14378595799207688, "num_tokens": 6535831.0, "step": 3545 }, { "entropy": 5.922514152526856, "epoch": 0.29825666876706575, "grad_norm": 0.859375, "learning_rate": 0.0004994822571736449, "loss": 5.7254, "mean_token_accuracy": 0.1482064038515091, "num_tokens": 6545704.0, "step": 3550 }, { "entropy": 5.899800491333008, "epoch": 0.29867674858223064, "grad_norm": 1.046875, "learning_rate": 0.0004994802248041342, "loss": 5.7535, "mean_token_accuracy": 0.14916675686836242, "num_tokens": 6554423.0, "step": 3555 }, { "entropy": 5.932198619842529, "epoch": 0.2990968283973955, "grad_norm": 0.96875, "learning_rate": 0.000499478188458072, "loss": 5.8022, "mean_token_accuracy": 0.14890404120087625, "num_tokens": 6563989.0, "step": 3560 }, { "entropy": 5.968116617202758, "epoch": 0.2995169082125604, "grad_norm": 1.1171875, "learning_rate": 0.0004994761481354943, "loss": 5.9483, "mean_token_accuracy": 0.1441567473113537, "num_tokens": 6572745.0, "step": 3565 }, { "entropy": 6.137206792831421, "epoch": 0.2999369880277253, "grad_norm": 0.99609375, "learning_rate": 0.0004994741038364371, "loss": 5.9343, "mean_token_accuracy": 0.142555071413517, "num_tokens": 6581723.0, "step": 3570 }, { "entropy": 5.88220705986023, "epoch": 0.3003570678428901, "grad_norm": 0.96875, "learning_rate": 0.0004994720555609369, "loss": 5.6659, "mean_token_accuracy": 0.1542235180735588, "num_tokens": 6590342.0, "step": 3575 }, { "entropy": 5.829970359802246, "epoch": 0.300777147658055, "grad_norm": 1.03125, "learning_rate": 0.0004994700033090297, "loss": 5.7501, "mean_token_accuracy": 0.1582304283976555, "num_tokens": 6599206.0, "step": 3580 }, { "entropy": 6.041889762878418, "epoch": 0.3011972274732199, "grad_norm": 1.0703125, "learning_rate": 0.000499467947080752, "loss": 6.0318, "mean_token_accuracy": 0.13561916202306748, "num_tokens": 6608947.0, "step": 3585 }, { "entropy": 6.06544942855835, "epoch": 0.3016173072883848, "grad_norm": 0.9765625, "learning_rate": 0.0004994658868761402, "loss": 5.8283, "mean_token_accuracy": 0.15170362889766692, "num_tokens": 6618378.0, "step": 3590 }, { "entropy": 5.914470624923706, "epoch": 0.30203738710354966, "grad_norm": 1.0078125, "learning_rate": 0.0004994638226952307, "loss": 5.8836, "mean_token_accuracy": 0.14195557832717895, "num_tokens": 6627527.0, "step": 3595 }, { "entropy": 5.982400751113891, "epoch": 0.30245746691871456, "grad_norm": 0.98828125, "learning_rate": 0.0004994617545380604, "loss": 5.8286, "mean_token_accuracy": 0.14527858346700667, "num_tokens": 6636964.0, "step": 3600 }, { "entropy": 5.908453559875488, "epoch": 0.30287754673387945, "grad_norm": 1.03125, "learning_rate": 0.0004994596824046656, "loss": 5.7718, "mean_token_accuracy": 0.14911266565322875, "num_tokens": 6646074.0, "step": 3605 }, { "entropy": 5.99076018333435, "epoch": 0.3032976265490443, "grad_norm": 0.9296875, "learning_rate": 0.000499457606295083, "loss": 5.8447, "mean_token_accuracy": 0.14240661412477493, "num_tokens": 6655027.0, "step": 3610 }, { "entropy": 5.808787536621094, "epoch": 0.3037177063642092, "grad_norm": 1.0390625, "learning_rate": 0.0004994555262093495, "loss": 5.6321, "mean_token_accuracy": 0.1570141136646271, "num_tokens": 6663747.0, "step": 3615 }, { "entropy": 6.046371412277222, "epoch": 0.3041377861793741, "grad_norm": 0.99609375, "learning_rate": 0.000499453442147502, "loss": 5.9593, "mean_token_accuracy": 0.1389522023499012, "num_tokens": 6672922.0, "step": 3620 }, { "entropy": 5.9334362030029295, "epoch": 0.304557865994539, "grad_norm": 0.9296875, "learning_rate": 0.0004994513541095773, "loss": 5.7735, "mean_token_accuracy": 0.15685406178236008, "num_tokens": 6682233.0, "step": 3625 }, { "entropy": 5.922385549545288, "epoch": 0.30497794580970383, "grad_norm": 0.97265625, "learning_rate": 0.0004994492620956126, "loss": 5.8112, "mean_token_accuracy": 0.15047757476568221, "num_tokens": 6691593.0, "step": 3630 }, { "entropy": 5.917299842834472, "epoch": 0.30539802562486873, "grad_norm": 0.91796875, "learning_rate": 0.0004994471661056445, "loss": 5.8207, "mean_token_accuracy": 0.15176298022270202, "num_tokens": 6701318.0, "step": 3635 }, { "entropy": 6.031417036056519, "epoch": 0.3058181054400336, "grad_norm": 0.9140625, "learning_rate": 0.0004994450661397106, "loss": 5.8199, "mean_token_accuracy": 0.1515482097864151, "num_tokens": 6710059.0, "step": 3640 }, { "entropy": 6.035120582580566, "epoch": 0.30623818525519847, "grad_norm": 0.921875, "learning_rate": 0.000499442962197848, "loss": 5.9111, "mean_token_accuracy": 0.14002010971307755, "num_tokens": 6719811.0, "step": 3645 }, { "entropy": 5.872648668289185, "epoch": 0.30665826507036337, "grad_norm": 0.93359375, "learning_rate": 0.0004994408542800937, "loss": 5.7991, "mean_token_accuracy": 0.15095670521259308, "num_tokens": 6728789.0, "step": 3650 }, { "entropy": 5.943379068374634, "epoch": 0.30707834488552826, "grad_norm": 0.984375, "learning_rate": 0.0004994387423864855, "loss": 5.7834, "mean_token_accuracy": 0.1460746333003044, "num_tokens": 6737706.0, "step": 3655 }, { "entropy": 5.941844272613525, "epoch": 0.3074984247006931, "grad_norm": 0.98828125, "learning_rate": 0.0004994366265170603, "loss": 5.7446, "mean_token_accuracy": 0.16055794954299926, "num_tokens": 6746861.0, "step": 3660 }, { "entropy": 6.028618669509887, "epoch": 0.307918504515858, "grad_norm": 1.0390625, "learning_rate": 0.0004994345066718558, "loss": 5.916, "mean_token_accuracy": 0.14116688221693038, "num_tokens": 6755242.0, "step": 3665 }, { "entropy": 6.008127069473266, "epoch": 0.3083385843310229, "grad_norm": 0.9609375, "learning_rate": 0.0004994323828509098, "loss": 5.8727, "mean_token_accuracy": 0.14286566898226738, "num_tokens": 6764549.0, "step": 3670 }, { "entropy": 5.929146242141724, "epoch": 0.3087586641461878, "grad_norm": 1.0390625, "learning_rate": 0.0004994302550542596, "loss": 5.8471, "mean_token_accuracy": 0.1538454920053482, "num_tokens": 6774123.0, "step": 3675 }, { "entropy": 5.80585126876831, "epoch": 0.30917874396135264, "grad_norm": 1.046875, "learning_rate": 0.000499428123281943, "loss": 5.6317, "mean_token_accuracy": 0.1558361306786537, "num_tokens": 6782922.0, "step": 3680 }, { "entropy": 5.925417232513428, "epoch": 0.30959882377651754, "grad_norm": 0.94921875, "learning_rate": 0.0004994259875339978, "loss": 5.8838, "mean_token_accuracy": 0.14831040799617767, "num_tokens": 6792042.0, "step": 3685 }, { "entropy": 6.067014694213867, "epoch": 0.31001890359168244, "grad_norm": 1.0234375, "learning_rate": 0.0004994238478104617, "loss": 5.872, "mean_token_accuracy": 0.14466599076986314, "num_tokens": 6800994.0, "step": 3690 }, { "entropy": 5.913062810897827, "epoch": 0.3104389834068473, "grad_norm": 0.9375, "learning_rate": 0.0004994217041113727, "loss": 5.8012, "mean_token_accuracy": 0.15395486801862718, "num_tokens": 6809938.0, "step": 3695 }, { "entropy": 6.028704833984375, "epoch": 0.3108590632220122, "grad_norm": 0.8828125, "learning_rate": 0.0004994195564367688, "loss": 5.9148, "mean_token_accuracy": 0.14361433312296867, "num_tokens": 6820289.0, "step": 3700 }, { "entropy": 5.998479652404785, "epoch": 0.3112791430371771, "grad_norm": 1.0078125, "learning_rate": 0.0004994174047866882, "loss": 5.7538, "mean_token_accuracy": 0.15162525251507758, "num_tokens": 6830068.0, "step": 3705 }, { "entropy": 5.830403566360474, "epoch": 0.3116992228523419, "grad_norm": 0.98828125, "learning_rate": 0.0004994152491611686, "loss": 5.7916, "mean_token_accuracy": 0.14659319072961807, "num_tokens": 6838591.0, "step": 3710 }, { "entropy": 5.838834381103515, "epoch": 0.3121193026675068, "grad_norm": 0.94921875, "learning_rate": 0.0004994130895602485, "loss": 5.7583, "mean_token_accuracy": 0.14570422172546388, "num_tokens": 6847796.0, "step": 3715 }, { "entropy": 6.02327971458435, "epoch": 0.3125393824826717, "grad_norm": 0.92578125, "learning_rate": 0.000499410925983966, "loss": 5.8457, "mean_token_accuracy": 0.14952262938022615, "num_tokens": 6856585.0, "step": 3720 }, { "entropy": 5.887494659423828, "epoch": 0.3129594622978366, "grad_norm": 1.015625, "learning_rate": 0.0004994087584323596, "loss": 5.7583, "mean_token_accuracy": 0.15517981797456742, "num_tokens": 6865757.0, "step": 3725 }, { "entropy": 5.853988265991211, "epoch": 0.31337954211300145, "grad_norm": 0.90625, "learning_rate": 0.0004994065869054676, "loss": 5.796, "mean_token_accuracy": 0.1451224982738495, "num_tokens": 6875371.0, "step": 3730 }, { "entropy": 6.02379846572876, "epoch": 0.31379962192816635, "grad_norm": 1.03125, "learning_rate": 0.0004994044114033283, "loss": 5.8687, "mean_token_accuracy": 0.1440061092376709, "num_tokens": 6884050.0, "step": 3735 }, { "entropy": 6.026759815216065, "epoch": 0.31421970174333125, "grad_norm": 1.0859375, "learning_rate": 0.0004994022319259806, "loss": 5.8372, "mean_token_accuracy": 0.14598554819822313, "num_tokens": 6893079.0, "step": 3740 }, { "entropy": 5.911620283126831, "epoch": 0.3146397815584961, "grad_norm": 0.984375, "learning_rate": 0.0004994000484734629, "loss": 5.9136, "mean_token_accuracy": 0.15156169682741166, "num_tokens": 6903100.0, "step": 3745 }, { "entropy": 5.923766088485718, "epoch": 0.315059861373661, "grad_norm": 0.875, "learning_rate": 0.0004993978610458137, "loss": 5.7654, "mean_token_accuracy": 0.15068738907575607, "num_tokens": 6912164.0, "step": 3750 }, { "entropy": 5.878131437301636, "epoch": 0.3154799411888259, "grad_norm": 0.9296875, "learning_rate": 0.0004993956696430721, "loss": 5.7781, "mean_token_accuracy": 0.1453731819987297, "num_tokens": 6921183.0, "step": 3755 }, { "entropy": 5.950732278823852, "epoch": 0.3159000210039908, "grad_norm": 0.9296875, "learning_rate": 0.0004993934742652768, "loss": 5.8422, "mean_token_accuracy": 0.14924204498529434, "num_tokens": 6931325.0, "step": 3760 }, { "entropy": 5.98630108833313, "epoch": 0.3163201008191556, "grad_norm": 0.9140625, "learning_rate": 0.0004993912749124665, "loss": 5.7579, "mean_token_accuracy": 0.15365685075521468, "num_tokens": 6940234.0, "step": 3765 }, { "entropy": 5.933948040008545, "epoch": 0.3167401806343205, "grad_norm": 0.92578125, "learning_rate": 0.0004993890715846804, "loss": 5.8442, "mean_token_accuracy": 0.1472316324710846, "num_tokens": 6949067.0, "step": 3770 }, { "entropy": 5.98266453742981, "epoch": 0.3171602604494854, "grad_norm": 0.9375, "learning_rate": 0.0004993868642819574, "loss": 5.8092, "mean_token_accuracy": 0.14614944905042648, "num_tokens": 6959085.0, "step": 3775 }, { "entropy": 5.905980443954467, "epoch": 0.31758034026465026, "grad_norm": 1.015625, "learning_rate": 0.0004993846530043367, "loss": 5.8539, "mean_token_accuracy": 0.14434425979852678, "num_tokens": 6967392.0, "step": 3780 }, { "entropy": 5.910531997680664, "epoch": 0.31800042007981516, "grad_norm": 1.09375, "learning_rate": 0.0004993824377518574, "loss": 5.7851, "mean_token_accuracy": 0.1514693483710289, "num_tokens": 6976369.0, "step": 3785 }, { "entropy": 5.976119804382324, "epoch": 0.31842049989498006, "grad_norm": 0.94140625, "learning_rate": 0.0004993802185245587, "loss": 5.8013, "mean_token_accuracy": 0.14934585690498353, "num_tokens": 6985889.0, "step": 3790 }, { "entropy": 5.925661182403564, "epoch": 0.3188405797101449, "grad_norm": 0.9921875, "learning_rate": 0.00049937799532248, "loss": 5.8359, "mean_token_accuracy": 0.13918048441410064, "num_tokens": 6995396.0, "step": 3795 }, { "entropy": 6.0729657173156735, "epoch": 0.3192606595253098, "grad_norm": 0.921875, "learning_rate": 0.0004993757681456607, "loss": 5.8718, "mean_token_accuracy": 0.1478106528520584, "num_tokens": 7004666.0, "step": 3800 }, { "entropy": 5.967416000366211, "epoch": 0.3196807393404747, "grad_norm": 0.87890625, "learning_rate": 0.0004993735369941401, "loss": 5.8998, "mean_token_accuracy": 0.14525311812758446, "num_tokens": 7014608.0, "step": 3805 }, { "entropy": 5.966092729568482, "epoch": 0.3201008191556396, "grad_norm": 0.91015625, "learning_rate": 0.0004993713018679579, "loss": 5.7888, "mean_token_accuracy": 0.14646613076329232, "num_tokens": 7023671.0, "step": 3810 }, { "entropy": 5.904713773727417, "epoch": 0.32052089897080444, "grad_norm": 0.8984375, "learning_rate": 0.0004993690627671536, "loss": 5.8148, "mean_token_accuracy": 0.1434755489230156, "num_tokens": 7033786.0, "step": 3815 }, { "entropy": 5.907800912857056, "epoch": 0.32094097878596933, "grad_norm": 0.9609375, "learning_rate": 0.0004993668196917669, "loss": 5.7268, "mean_token_accuracy": 0.15316082686185836, "num_tokens": 7042162.0, "step": 3820 }, { "entropy": 5.994227170944214, "epoch": 0.32136105860113423, "grad_norm": 0.9140625, "learning_rate": 0.0004993645726418375, "loss": 5.8618, "mean_token_accuracy": 0.15052291825413705, "num_tokens": 7051903.0, "step": 3825 }, { "entropy": 5.900808525085449, "epoch": 0.3217811384162991, "grad_norm": 0.96875, "learning_rate": 0.0004993623216174053, "loss": 5.7121, "mean_token_accuracy": 0.161135034263134, "num_tokens": 7060229.0, "step": 3830 }, { "entropy": 5.845855093002319, "epoch": 0.32220121823146397, "grad_norm": 0.99609375, "learning_rate": 0.00049936006661851, "loss": 5.7989, "mean_token_accuracy": 0.1526742696762085, "num_tokens": 7069040.0, "step": 3835 }, { "entropy": 5.919027471542359, "epoch": 0.32262129804662887, "grad_norm": 1.03125, "learning_rate": 0.0004993578076451917, "loss": 5.6805, "mean_token_accuracy": 0.15347311198711394, "num_tokens": 7078409.0, "step": 3840 }, { "entropy": 5.853667831420898, "epoch": 0.32304137786179377, "grad_norm": 0.9453125, "learning_rate": 0.0004993555446974903, "loss": 5.765, "mean_token_accuracy": 0.14782839864492417, "num_tokens": 7087983.0, "step": 3845 }, { "entropy": 5.853893089294433, "epoch": 0.3234614576769586, "grad_norm": 1.0234375, "learning_rate": 0.000499353277775446, "loss": 5.7182, "mean_token_accuracy": 0.1580560803413391, "num_tokens": 7097277.0, "step": 3850 }, { "entropy": 5.87832407951355, "epoch": 0.3238815374921235, "grad_norm": 1.0859375, "learning_rate": 0.0004993510068790989, "loss": 5.6187, "mean_token_accuracy": 0.16494725197553634, "num_tokens": 7105918.0, "step": 3855 }, { "entropy": 5.8204621315002445, "epoch": 0.3243016173072884, "grad_norm": 0.92578125, "learning_rate": 0.0004993487320084892, "loss": 5.6885, "mean_token_accuracy": 0.1581684559583664, "num_tokens": 7115049.0, "step": 3860 }, { "entropy": 5.950232267379761, "epoch": 0.32472169712245325, "grad_norm": 0.921875, "learning_rate": 0.0004993464531636573, "loss": 5.7875, "mean_token_accuracy": 0.1498127706348896, "num_tokens": 7124862.0, "step": 3865 }, { "entropy": 5.82954216003418, "epoch": 0.32514177693761814, "grad_norm": 0.984375, "learning_rate": 0.0004993441703446435, "loss": 5.6777, "mean_token_accuracy": 0.1620057240128517, "num_tokens": 7133280.0, "step": 3870 }, { "entropy": 5.929150485992432, "epoch": 0.32556185675278304, "grad_norm": 0.9921875, "learning_rate": 0.0004993418835514882, "loss": 5.8773, "mean_token_accuracy": 0.14564588218927382, "num_tokens": 7142446.0, "step": 3875 }, { "entropy": 5.9440654754638675, "epoch": 0.3259819365679479, "grad_norm": 0.875, "learning_rate": 0.0004993395927842321, "loss": 5.7755, "mean_token_accuracy": 0.14392856359481812, "num_tokens": 7152143.0, "step": 3880 }, { "entropy": 6.021526956558228, "epoch": 0.3264020163831128, "grad_norm": 0.98046875, "learning_rate": 0.0004993372980429155, "loss": 5.8501, "mean_token_accuracy": 0.14762358814477922, "num_tokens": 7162046.0, "step": 3885 }, { "entropy": 5.937510824203491, "epoch": 0.3268220961982777, "grad_norm": 0.95703125, "learning_rate": 0.0004993349993275792, "loss": 5.7358, "mean_token_accuracy": 0.1501179426908493, "num_tokens": 7171557.0, "step": 3890 }, { "entropy": 5.722299528121948, "epoch": 0.3272421760134426, "grad_norm": 0.86328125, "learning_rate": 0.0004993326966382639, "loss": 5.6455, "mean_token_accuracy": 0.15715345591306687, "num_tokens": 7180927.0, "step": 3895 }, { "entropy": 5.841052865982055, "epoch": 0.3276622558286074, "grad_norm": 1.0546875, "learning_rate": 0.0004993303899750104, "loss": 5.728, "mean_token_accuracy": 0.15390928834676743, "num_tokens": 7189552.0, "step": 3900 }, { "entropy": 5.984076976776123, "epoch": 0.3280823356437723, "grad_norm": 1.015625, "learning_rate": 0.0004993280793378595, "loss": 5.7447, "mean_token_accuracy": 0.14799359515309335, "num_tokens": 7197857.0, "step": 3905 }, { "entropy": 5.883258295059204, "epoch": 0.3285024154589372, "grad_norm": 0.9921875, "learning_rate": 0.0004993257647268522, "loss": 5.7153, "mean_token_accuracy": 0.15892730355262757, "num_tokens": 7206785.0, "step": 3910 }, { "entropy": 5.8749652862548825, "epoch": 0.32892249527410206, "grad_norm": 0.9140625, "learning_rate": 0.0004993234461420295, "loss": 5.8032, "mean_token_accuracy": 0.1540107510983944, "num_tokens": 7216360.0, "step": 3915 }, { "entropy": 5.903149938583374, "epoch": 0.32934257508926695, "grad_norm": 1.015625, "learning_rate": 0.0004993211235834326, "loss": 5.6111, "mean_token_accuracy": 0.1713676080107689, "num_tokens": 7224890.0, "step": 3920 }, { "entropy": 5.803111982345581, "epoch": 0.32976265490443185, "grad_norm": 1.078125, "learning_rate": 0.0004993187970511023, "loss": 5.6647, "mean_token_accuracy": 0.17485086023807525, "num_tokens": 7234442.0, "step": 3925 }, { "entropy": 5.873620986938477, "epoch": 0.33018273471959675, "grad_norm": 0.94921875, "learning_rate": 0.0004993164665450801, "loss": 5.8228, "mean_token_accuracy": 0.15156899392604828, "num_tokens": 7244023.0, "step": 3930 }, { "entropy": 5.843383169174194, "epoch": 0.3306028145347616, "grad_norm": 0.9140625, "learning_rate": 0.0004993141320654072, "loss": 5.6665, "mean_token_accuracy": 0.15884078443050384, "num_tokens": 7253548.0, "step": 3935 }, { "entropy": 5.8344789981842045, "epoch": 0.3310228943499265, "grad_norm": 0.9609375, "learning_rate": 0.000499311793612125, "loss": 5.7347, "mean_token_accuracy": 0.15194563269615174, "num_tokens": 7262962.0, "step": 3940 }, { "entropy": 5.9449968338012695, "epoch": 0.3314429741650914, "grad_norm": 0.91015625, "learning_rate": 0.0004993094511852748, "loss": 5.7609, "mean_token_accuracy": 0.14924739301204681, "num_tokens": 7272234.0, "step": 3945 }, { "entropy": 5.968133401870728, "epoch": 0.33186305398025623, "grad_norm": 0.984375, "learning_rate": 0.0004993071047848983, "loss": 5.7413, "mean_token_accuracy": 0.15319221317768097, "num_tokens": 7281524.0, "step": 3950 }, { "entropy": 5.790039682388306, "epoch": 0.3322831337954211, "grad_norm": 0.98046875, "learning_rate": 0.0004993047544110368, "loss": 5.6528, "mean_token_accuracy": 0.15719158425927163, "num_tokens": 7289601.0, "step": 3955 }, { "entropy": 5.721573781967163, "epoch": 0.332703213610586, "grad_norm": 1.046875, "learning_rate": 0.0004993024000637321, "loss": 5.6074, "mean_token_accuracy": 0.16373219192028046, "num_tokens": 7298508.0, "step": 3960 }, { "entropy": 5.854639863967895, "epoch": 0.33312329342575087, "grad_norm": 0.9296875, "learning_rate": 0.0004993000417430259, "loss": 5.8333, "mean_token_accuracy": 0.14586606696248056, "num_tokens": 7309065.0, "step": 3965 }, { "entropy": 6.050255537033081, "epoch": 0.33354337324091576, "grad_norm": 0.86328125, "learning_rate": 0.00049929767944896, "loss": 5.8607, "mean_token_accuracy": 0.14968539252877236, "num_tokens": 7319669.0, "step": 3970 }, { "entropy": 5.973075866699219, "epoch": 0.33396345305608066, "grad_norm": 0.96875, "learning_rate": 0.0004992953131815761, "loss": 5.7964, "mean_token_accuracy": 0.14924187809228898, "num_tokens": 7328425.0, "step": 3975 }, { "entropy": 5.858473682403565, "epoch": 0.33438353287124556, "grad_norm": 1.0703125, "learning_rate": 0.0004992929429409164, "loss": 5.6701, "mean_token_accuracy": 0.15970652550458908, "num_tokens": 7337369.0, "step": 3980 }, { "entropy": 5.832104206085205, "epoch": 0.3348036126864104, "grad_norm": 0.92578125, "learning_rate": 0.0004992905687270225, "loss": 5.7375, "mean_token_accuracy": 0.15307654216885566, "num_tokens": 7346829.0, "step": 3985 }, { "entropy": 5.9267027378082275, "epoch": 0.3352236925015753, "grad_norm": 0.96484375, "learning_rate": 0.0004992881905399368, "loss": 5.7952, "mean_token_accuracy": 0.14916737228631974, "num_tokens": 7355976.0, "step": 3990 }, { "entropy": 5.941111850738525, "epoch": 0.3356437723167402, "grad_norm": 1.0859375, "learning_rate": 0.0004992858083797013, "loss": 5.7675, "mean_token_accuracy": 0.1473349630832672, "num_tokens": 7365210.0, "step": 3995 }, { "entropy": 5.9041369438171385, "epoch": 0.33606385213190504, "grad_norm": 1.015625, "learning_rate": 0.0004992834222463581, "loss": 5.8093, "mean_token_accuracy": 0.14046019837260246, "num_tokens": 7374175.0, "step": 4000 }, { "entropy": 5.923312139511109, "epoch": 0.33648393194706994, "grad_norm": 0.94921875, "learning_rate": 0.0004992810321399496, "loss": 5.8383, "mean_token_accuracy": 0.147621788084507, "num_tokens": 7383302.0, "step": 4005 }, { "entropy": 5.99611988067627, "epoch": 0.33690401176223483, "grad_norm": 0.96484375, "learning_rate": 0.0004992786380605182, "loss": 5.8018, "mean_token_accuracy": 0.15006497725844384, "num_tokens": 7392746.0, "step": 4010 }, { "entropy": 5.865422248840332, "epoch": 0.33732409157739973, "grad_norm": 1.0, "learning_rate": 0.0004992762400081062, "loss": 5.6537, "mean_token_accuracy": 0.1529911682009697, "num_tokens": 7401604.0, "step": 4015 }, { "entropy": 5.859767580032349, "epoch": 0.3377441713925646, "grad_norm": 0.99609375, "learning_rate": 0.0004992738379827559, "loss": 5.7575, "mean_token_accuracy": 0.15247822627425195, "num_tokens": 7410594.0, "step": 4020 }, { "entropy": 5.920141792297363, "epoch": 0.33816425120772947, "grad_norm": 0.9765625, "learning_rate": 0.0004992714319845101, "loss": 5.658, "mean_token_accuracy": 0.16050563454627992, "num_tokens": 7418831.0, "step": 4025 }, { "entropy": 5.809229993820191, "epoch": 0.33858433102289437, "grad_norm": 0.92578125, "learning_rate": 0.0004992690220134116, "loss": 5.7047, "mean_token_accuracy": 0.15451119393110274, "num_tokens": 7427731.0, "step": 4030 }, { "entropy": 5.96991548538208, "epoch": 0.3390044108380592, "grad_norm": 1.0, "learning_rate": 0.0004992666080695027, "loss": 5.8101, "mean_token_accuracy": 0.14591643139719962, "num_tokens": 7436447.0, "step": 4035 }, { "entropy": 5.9149298667907715, "epoch": 0.3394244906532241, "grad_norm": 1.015625, "learning_rate": 0.0004992641901528262, "loss": 5.7195, "mean_token_accuracy": 0.15583046823740004, "num_tokens": 7445352.0, "step": 4040 }, { "entropy": 5.908085584640503, "epoch": 0.339844570468389, "grad_norm": 0.89453125, "learning_rate": 0.0004992617682634252, "loss": 5.7887, "mean_token_accuracy": 0.1540717288851738, "num_tokens": 7454298.0, "step": 4045 }, { "entropy": 5.891385746002197, "epoch": 0.34026465028355385, "grad_norm": 0.8828125, "learning_rate": 0.0004992593424013424, "loss": 5.7978, "mean_token_accuracy": 0.15331364274024964, "num_tokens": 7463543.0, "step": 4050 }, { "entropy": 5.913450384140015, "epoch": 0.34068473009871875, "grad_norm": 0.98046875, "learning_rate": 0.0004992569125666209, "loss": 5.8148, "mean_token_accuracy": 0.14611926972866057, "num_tokens": 7472701.0, "step": 4055 }, { "entropy": 6.010456657409668, "epoch": 0.34110480991388364, "grad_norm": 0.92578125, "learning_rate": 0.0004992544787593037, "loss": 5.817, "mean_token_accuracy": 0.14246124625205994, "num_tokens": 7481123.0, "step": 4060 }, { "entropy": 5.905852317810059, "epoch": 0.34152488972904854, "grad_norm": 0.92578125, "learning_rate": 0.0004992520409794338, "loss": 5.8641, "mean_token_accuracy": 0.1508338287472725, "num_tokens": 7490439.0, "step": 4065 }, { "entropy": 5.901952314376831, "epoch": 0.3419449695442134, "grad_norm": 0.953125, "learning_rate": 0.0004992495992270544, "loss": 5.7351, "mean_token_accuracy": 0.1509379267692566, "num_tokens": 7499326.0, "step": 4070 }, { "entropy": 5.938205337524414, "epoch": 0.3423650493593783, "grad_norm": 0.92578125, "learning_rate": 0.0004992471535022089, "loss": 5.7857, "mean_token_accuracy": 0.1451237343251705, "num_tokens": 7509407.0, "step": 4075 }, { "entropy": 5.869676685333252, "epoch": 0.3427851291745432, "grad_norm": 0.92578125, "learning_rate": 0.0004992447038049405, "loss": 5.829, "mean_token_accuracy": 0.14850043952465058, "num_tokens": 7518443.0, "step": 4080 }, { "entropy": 5.861940097808838, "epoch": 0.343205208989708, "grad_norm": 1.015625, "learning_rate": 0.0004992422501352927, "loss": 5.6977, "mean_token_accuracy": 0.15755705237388612, "num_tokens": 7527609.0, "step": 4085 }, { "entropy": 5.978248167037964, "epoch": 0.3436252888048729, "grad_norm": 1.0234375, "learning_rate": 0.0004992397924933089, "loss": 5.7788, "mean_token_accuracy": 0.15250536054372787, "num_tokens": 7536890.0, "step": 4090 }, { "entropy": 5.899935388565064, "epoch": 0.3440453686200378, "grad_norm": 0.99609375, "learning_rate": 0.0004992373308790325, "loss": 5.731, "mean_token_accuracy": 0.1621832400560379, "num_tokens": 7546509.0, "step": 4095 }, { "entropy": 5.818875694274903, "epoch": 0.3444654484352027, "grad_norm": 0.9765625, "learning_rate": 0.0004992348652925074, "loss": 5.7667, "mean_token_accuracy": 0.15332106947898866, "num_tokens": 7555336.0, "step": 4100 }, { "entropy": 5.907353639602661, "epoch": 0.34488552825036756, "grad_norm": 1.046875, "learning_rate": 0.0004992323957337771, "loss": 5.7278, "mean_token_accuracy": 0.1509070634841919, "num_tokens": 7565210.0, "step": 4105 }, { "entropy": 5.929575109481812, "epoch": 0.34530560806553245, "grad_norm": 0.89453125, "learning_rate": 0.0004992299222028855, "loss": 5.8127, "mean_token_accuracy": 0.15723925679922104, "num_tokens": 7574516.0, "step": 4110 }, { "entropy": 5.839164924621582, "epoch": 0.34572568788069735, "grad_norm": 0.9921875, "learning_rate": 0.0004992274446998761, "loss": 5.6588, "mean_token_accuracy": 0.1544717237353325, "num_tokens": 7583219.0, "step": 4115 }, { "entropy": 5.916603851318359, "epoch": 0.3461457676958622, "grad_norm": 0.9765625, "learning_rate": 0.0004992249632247929, "loss": 5.902, "mean_token_accuracy": 0.14321533888578414, "num_tokens": 7592050.0, "step": 4120 }, { "entropy": 5.9809043407440186, "epoch": 0.3465658475110271, "grad_norm": 0.95703125, "learning_rate": 0.0004992224777776802, "loss": 5.732, "mean_token_accuracy": 0.1493101716041565, "num_tokens": 7600718.0, "step": 4125 }, { "entropy": 5.901517105102539, "epoch": 0.346985927326192, "grad_norm": 0.98046875, "learning_rate": 0.0004992199883585816, "loss": 5.7557, "mean_token_accuracy": 0.15382387340068818, "num_tokens": 7609191.0, "step": 4130 }, { "entropy": 5.910360288619995, "epoch": 0.34740600714135683, "grad_norm": 0.98046875, "learning_rate": 0.0004992174949675413, "loss": 5.7894, "mean_token_accuracy": 0.152114437520504, "num_tokens": 7618509.0, "step": 4135 }, { "entropy": 5.890322875976563, "epoch": 0.34782608695652173, "grad_norm": 1.03125, "learning_rate": 0.0004992149976046037, "loss": 5.7136, "mean_token_accuracy": 0.15040391087532043, "num_tokens": 7627851.0, "step": 4140 }, { "entropy": 5.837684154510498, "epoch": 0.3482461667716866, "grad_norm": 0.921875, "learning_rate": 0.0004992124962698128, "loss": 5.7584, "mean_token_accuracy": 0.15606331154704095, "num_tokens": 7636748.0, "step": 4145 }, { "entropy": 5.921899652481079, "epoch": 0.3486662465868515, "grad_norm": 1.03125, "learning_rate": 0.000499209990963213, "loss": 5.7078, "mean_token_accuracy": 0.15208663642406464, "num_tokens": 7645436.0, "step": 4150 }, { "entropy": 5.917012548446655, "epoch": 0.34908632640201637, "grad_norm": 0.98828125, "learning_rate": 0.0004992074816848487, "loss": 5.8094, "mean_token_accuracy": 0.15278877168893815, "num_tokens": 7655414.0, "step": 4155 }, { "entropy": 5.772976493835449, "epoch": 0.34950640621718126, "grad_norm": 0.98828125, "learning_rate": 0.0004992049684347642, "loss": 5.6074, "mean_token_accuracy": 0.15534141510725022, "num_tokens": 7664295.0, "step": 4160 }, { "entropy": 5.917826843261719, "epoch": 0.34992648603234616, "grad_norm": 1.0546875, "learning_rate": 0.0004992024512130042, "loss": 5.7416, "mean_token_accuracy": 0.15260617434978485, "num_tokens": 7673295.0, "step": 4165 }, { "entropy": 5.788580131530762, "epoch": 0.350346565847511, "grad_norm": 0.859375, "learning_rate": 0.0004991999300196132, "loss": 5.7469, "mean_token_accuracy": 0.15305035635828973, "num_tokens": 7682932.0, "step": 4170 }, { "entropy": 5.923834562301636, "epoch": 0.3507666456626759, "grad_norm": 1.0078125, "learning_rate": 0.0004991974048546359, "loss": 5.753, "mean_token_accuracy": 0.1500132530927658, "num_tokens": 7692105.0, "step": 4175 }, { "entropy": 5.925296068191528, "epoch": 0.3511867254778408, "grad_norm": 0.9765625, "learning_rate": 0.000499194875718117, "loss": 5.7511, "mean_token_accuracy": 0.15551865100860596, "num_tokens": 7701294.0, "step": 4180 }, { "entropy": 5.861107254028321, "epoch": 0.3516068052930057, "grad_norm": 0.92578125, "learning_rate": 0.0004991923426101013, "loss": 5.7386, "mean_token_accuracy": 0.14845747649669647, "num_tokens": 7710964.0, "step": 4185 }, { "entropy": 5.949919605255127, "epoch": 0.35202688510817054, "grad_norm": 0.953125, "learning_rate": 0.0004991898055306337, "loss": 5.8577, "mean_token_accuracy": 0.14658492356538771, "num_tokens": 7719938.0, "step": 4190 }, { "entropy": 5.951687955856324, "epoch": 0.35244696492333544, "grad_norm": 0.89453125, "learning_rate": 0.0004991872644797591, "loss": 5.7808, "mean_token_accuracy": 0.15141311138868332, "num_tokens": 7729129.0, "step": 4195 }, { "entropy": 5.855287361145019, "epoch": 0.35286704473850034, "grad_norm": 1.03125, "learning_rate": 0.0004991847194575226, "loss": 5.7901, "mean_token_accuracy": 0.14619968980550765, "num_tokens": 7738506.0, "step": 4200 }, { "entropy": 5.942954778671265, "epoch": 0.3532871245536652, "grad_norm": 0.9140625, "learning_rate": 0.0004991821704639693, "loss": 5.8959, "mean_token_accuracy": 0.14654064998030664, "num_tokens": 7749320.0, "step": 4205 }, { "entropy": 6.01116943359375, "epoch": 0.3537072043688301, "grad_norm": 1.0, "learning_rate": 0.0004991796174991443, "loss": 5.7415, "mean_token_accuracy": 0.1537883497774601, "num_tokens": 7758735.0, "step": 4210 }, { "entropy": 5.822880458831787, "epoch": 0.354127284183995, "grad_norm": 0.99609375, "learning_rate": 0.0004991770605630927, "loss": 5.7132, "mean_token_accuracy": 0.15271057039499283, "num_tokens": 7767556.0, "step": 4215 }, { "entropy": 5.818714237213134, "epoch": 0.3545473639991598, "grad_norm": 0.9765625, "learning_rate": 0.0004991744996558599, "loss": 5.7336, "mean_token_accuracy": 0.15282744243741037, "num_tokens": 7776615.0, "step": 4220 }, { "entropy": 5.915001726150512, "epoch": 0.3549674438143247, "grad_norm": 0.94921875, "learning_rate": 0.0004991719347774913, "loss": 5.7682, "mean_token_accuracy": 0.15577882081270217, "num_tokens": 7785288.0, "step": 4225 }, { "entropy": 5.918221855163575, "epoch": 0.3553875236294896, "grad_norm": 0.95703125, "learning_rate": 0.0004991693659280324, "loss": 5.6811, "mean_token_accuracy": 0.15442655980587006, "num_tokens": 7794381.0, "step": 4230 }, { "entropy": 5.821169424057007, "epoch": 0.3558076034446545, "grad_norm": 0.984375, "learning_rate": 0.0004991667931075284, "loss": 5.6546, "mean_token_accuracy": 0.15124934762716294, "num_tokens": 7803265.0, "step": 4235 }, { "entropy": 5.829122161865234, "epoch": 0.35622768325981935, "grad_norm": 0.9296875, "learning_rate": 0.0004991642163160252, "loss": 5.7671, "mean_token_accuracy": 0.15388772487640381, "num_tokens": 7812445.0, "step": 4240 }, { "entropy": 5.934730339050293, "epoch": 0.35664776307498425, "grad_norm": 0.87109375, "learning_rate": 0.0004991616355535684, "loss": 5.7542, "mean_token_accuracy": 0.15821312218904496, "num_tokens": 7822073.0, "step": 4245 }, { "entropy": 5.918817663192749, "epoch": 0.35706784289014915, "grad_norm": 0.9375, "learning_rate": 0.0004991590508202036, "loss": 5.7264, "mean_token_accuracy": 0.15280235260725022, "num_tokens": 7831193.0, "step": 4250 }, { "entropy": 5.89573392868042, "epoch": 0.357487922705314, "grad_norm": 0.9765625, "learning_rate": 0.0004991564621159766, "loss": 5.7728, "mean_token_accuracy": 0.15194582045078278, "num_tokens": 7840311.0, "step": 4255 }, { "entropy": 5.8731294631958, "epoch": 0.3579080025204789, "grad_norm": 0.9609375, "learning_rate": 0.0004991538694409334, "loss": 5.7954, "mean_token_accuracy": 0.14721263125538825, "num_tokens": 7849622.0, "step": 4260 }, { "entropy": 5.876342821121216, "epoch": 0.3583280823356438, "grad_norm": 1.03125, "learning_rate": 0.0004991512727951198, "loss": 5.7558, "mean_token_accuracy": 0.15003474354743956, "num_tokens": 7859494.0, "step": 4265 }, { "entropy": 5.9838221073150635, "epoch": 0.3587481621508087, "grad_norm": 0.93359375, "learning_rate": 0.0004991486721785818, "loss": 5.8503, "mean_token_accuracy": 0.14846469163894654, "num_tokens": 7868526.0, "step": 4270 }, { "entropy": 5.859622812271118, "epoch": 0.3591682419659735, "grad_norm": 0.99609375, "learning_rate": 0.0004991460675913655, "loss": 5.6799, "mean_token_accuracy": 0.1537486046552658, "num_tokens": 7877631.0, "step": 4275 }, { "entropy": 5.85202202796936, "epoch": 0.3595883217811384, "grad_norm": 0.96875, "learning_rate": 0.000499143459033517, "loss": 5.7338, "mean_token_accuracy": 0.15869542211294174, "num_tokens": 7886814.0, "step": 4280 }, { "entropy": 5.794212818145752, "epoch": 0.3600084015963033, "grad_norm": 0.9765625, "learning_rate": 0.0004991408465050825, "loss": 5.5727, "mean_token_accuracy": 0.1595866084098816, "num_tokens": 7896337.0, "step": 4285 }, { "entropy": 5.852896070480346, "epoch": 0.36042848141146816, "grad_norm": 0.890625, "learning_rate": 0.0004991382300061084, "loss": 5.8163, "mean_token_accuracy": 0.14354490041732787, "num_tokens": 7906071.0, "step": 4290 }, { "entropy": 5.937732839584351, "epoch": 0.36084856122663306, "grad_norm": 0.92578125, "learning_rate": 0.0004991356095366409, "loss": 5.8111, "mean_token_accuracy": 0.14974057525396348, "num_tokens": 7915003.0, "step": 4295 }, { "entropy": 5.904038953781128, "epoch": 0.36126864104179796, "grad_norm": 0.94140625, "learning_rate": 0.0004991329850967266, "loss": 5.6791, "mean_token_accuracy": 0.15475230365991594, "num_tokens": 7924408.0, "step": 4300 }, { "entropy": 5.8507331848144535, "epoch": 0.3616887208569628, "grad_norm": 0.89453125, "learning_rate": 0.0004991303566864118, "loss": 5.637, "mean_token_accuracy": 0.1542945459485054, "num_tokens": 7934717.0, "step": 4305 }, { "entropy": 5.7739667892456055, "epoch": 0.3621088006721277, "grad_norm": 0.88671875, "learning_rate": 0.0004991277243057431, "loss": 5.7101, "mean_token_accuracy": 0.1505005143582821, "num_tokens": 7944278.0, "step": 4310 }, { "entropy": 5.808600950241089, "epoch": 0.3625288804872926, "grad_norm": 0.9609375, "learning_rate": 0.0004991250879547673, "loss": 5.7235, "mean_token_accuracy": 0.1538018502295017, "num_tokens": 7953344.0, "step": 4315 }, { "entropy": 5.829892158508301, "epoch": 0.3629489603024575, "grad_norm": 0.90234375, "learning_rate": 0.0004991224476335309, "loss": 5.7448, "mean_token_accuracy": 0.149826068431139, "num_tokens": 7962869.0, "step": 4320 }, { "entropy": 5.963926601409912, "epoch": 0.36336904011762233, "grad_norm": 0.98046875, "learning_rate": 0.0004991198033420807, "loss": 5.7344, "mean_token_accuracy": 0.15306216776371, "num_tokens": 7971981.0, "step": 4325 }, { "entropy": 5.884770917892456, "epoch": 0.36378911993278723, "grad_norm": 0.91796875, "learning_rate": 0.0004991171550804636, "loss": 5.7019, "mean_token_accuracy": 0.15474960654973985, "num_tokens": 7980979.0, "step": 4330 }, { "entropy": 5.863976860046387, "epoch": 0.36420919974795213, "grad_norm": 0.9453125, "learning_rate": 0.0004991145028487266, "loss": 5.7748, "mean_token_accuracy": 0.1529791235923767, "num_tokens": 7989607.0, "step": 4335 }, { "entropy": 5.7957190990448, "epoch": 0.36462927956311697, "grad_norm": 0.91796875, "learning_rate": 0.0004991118466469165, "loss": 5.5897, "mean_token_accuracy": 0.1639975592494011, "num_tokens": 7998356.0, "step": 4340 }, { "entropy": 5.849919033050537, "epoch": 0.36504935937828187, "grad_norm": 0.9609375, "learning_rate": 0.0004991091864750805, "loss": 5.7033, "mean_token_accuracy": 0.1553362563252449, "num_tokens": 8007596.0, "step": 4345 }, { "entropy": 5.909917688369751, "epoch": 0.36546943919344677, "grad_norm": 0.94921875, "learning_rate": 0.0004991065223332655, "loss": 5.7587, "mean_token_accuracy": 0.15085091739892958, "num_tokens": 8016493.0, "step": 4350 }, { "entropy": 5.884606981277466, "epoch": 0.36588951900861166, "grad_norm": 0.984375, "learning_rate": 0.0004991038542215191, "loss": 5.7272, "mean_token_accuracy": 0.1481338232755661, "num_tokens": 8025867.0, "step": 4355 }, { "entropy": 5.814969539642334, "epoch": 0.3663095988237765, "grad_norm": 0.921875, "learning_rate": 0.0004991011821398882, "loss": 5.7464, "mean_token_accuracy": 0.15548805743455887, "num_tokens": 8036251.0, "step": 4360 }, { "entropy": 5.905033826828003, "epoch": 0.3667296786389414, "grad_norm": 1.0390625, "learning_rate": 0.0004990985060884202, "loss": 5.7024, "mean_token_accuracy": 0.1582213595509529, "num_tokens": 8045647.0, "step": 4365 }, { "entropy": 5.88990044593811, "epoch": 0.3671497584541063, "grad_norm": 0.90625, "learning_rate": 0.0004990958260671627, "loss": 5.79, "mean_token_accuracy": 0.1454270862042904, "num_tokens": 8056025.0, "step": 4370 }, { "entropy": 5.809770679473877, "epoch": 0.36756983826927114, "grad_norm": 0.98828125, "learning_rate": 0.0004990931420761629, "loss": 5.7083, "mean_token_accuracy": 0.16103482097387314, "num_tokens": 8065029.0, "step": 4375 }, { "entropy": 5.914457368850708, "epoch": 0.36798991808443604, "grad_norm": 1.015625, "learning_rate": 0.0004990904541154685, "loss": 5.6763, "mean_token_accuracy": 0.16559941172599793, "num_tokens": 8073249.0, "step": 4380 }, { "entropy": 5.894069719314575, "epoch": 0.36840999789960094, "grad_norm": 1.0, "learning_rate": 0.0004990877621851271, "loss": 5.8002, "mean_token_accuracy": 0.153408020734787, "num_tokens": 8082039.0, "step": 4385 }, { "entropy": 5.8033387660980225, "epoch": 0.3688300777147658, "grad_norm": 1.078125, "learning_rate": 0.0004990850662851863, "loss": 5.6375, "mean_token_accuracy": 0.15707656592130662, "num_tokens": 8090011.0, "step": 4390 }, { "entropy": 5.879843854904175, "epoch": 0.3692501575299307, "grad_norm": 0.95703125, "learning_rate": 0.0004990823664156941, "loss": 5.7455, "mean_token_accuracy": 0.1648575708270073, "num_tokens": 8099934.0, "step": 4395 }, { "entropy": 5.963798093795776, "epoch": 0.3696702373450956, "grad_norm": 0.97265625, "learning_rate": 0.0004990796625766981, "loss": 5.7681, "mean_token_accuracy": 0.14946894496679305, "num_tokens": 8108969.0, "step": 4400 }, { "entropy": 5.835124111175537, "epoch": 0.3700903171602605, "grad_norm": 0.98046875, "learning_rate": 0.0004990769547682462, "loss": 5.6935, "mean_token_accuracy": 0.15169232487678527, "num_tokens": 8117372.0, "step": 4405 }, { "entropy": 5.979207563400268, "epoch": 0.3705103969754253, "grad_norm": 0.9375, "learning_rate": 0.0004990742429903866, "loss": 5.8757, "mean_token_accuracy": 0.14571133852005005, "num_tokens": 8127108.0, "step": 4410 }, { "entropy": 5.961515951156616, "epoch": 0.3709304767905902, "grad_norm": 0.8984375, "learning_rate": 0.000499071527243167, "loss": 5.8507, "mean_token_accuracy": 0.14516980648040773, "num_tokens": 8137392.0, "step": 4415 }, { "entropy": 5.880073976516724, "epoch": 0.3713505566057551, "grad_norm": 0.984375, "learning_rate": 0.0004990688075266357, "loss": 5.7019, "mean_token_accuracy": 0.15986401289701463, "num_tokens": 8146257.0, "step": 4420 }, { "entropy": 5.805649709701538, "epoch": 0.37177063642091995, "grad_norm": 0.96484375, "learning_rate": 0.0004990660838408409, "loss": 5.6521, "mean_token_accuracy": 0.15721987932920456, "num_tokens": 8154952.0, "step": 4425 }, { "entropy": 5.893301391601563, "epoch": 0.37219071623608485, "grad_norm": 0.921875, "learning_rate": 0.0004990633561858308, "loss": 5.7106, "mean_token_accuracy": 0.14765800014138222, "num_tokens": 8164365.0, "step": 4430 }, { "entropy": 5.924961233139038, "epoch": 0.37261079605124975, "grad_norm": 1.0390625, "learning_rate": 0.0004990606245616537, "loss": 5.7205, "mean_token_accuracy": 0.15445269271731377, "num_tokens": 8172614.0, "step": 4435 }, { "entropy": 5.877901983261109, "epoch": 0.37303087586641465, "grad_norm": 1.0, "learning_rate": 0.0004990578889683579, "loss": 5.7888, "mean_token_accuracy": 0.150545197725296, "num_tokens": 8182445.0, "step": 4440 }, { "entropy": 5.885668134689331, "epoch": 0.3734509556815795, "grad_norm": 0.90234375, "learning_rate": 0.0004990551494059921, "loss": 5.6613, "mean_token_accuracy": 0.15747766494750975, "num_tokens": 8191871.0, "step": 4445 }, { "entropy": 5.893858480453491, "epoch": 0.3738710354967444, "grad_norm": 0.9375, "learning_rate": 0.0004990524058746047, "loss": 5.8285, "mean_token_accuracy": 0.15561486929655075, "num_tokens": 8200658.0, "step": 4450 }, { "entropy": 5.879518842697143, "epoch": 0.3742911153119093, "grad_norm": 1.015625, "learning_rate": 0.0004990496583742443, "loss": 5.7547, "mean_token_accuracy": 0.15101703256368637, "num_tokens": 8209776.0, "step": 4455 }, { "entropy": 5.868221950531006, "epoch": 0.3747111951270741, "grad_norm": 1.046875, "learning_rate": 0.0004990469069049596, "loss": 5.6747, "mean_token_accuracy": 0.15401403456926346, "num_tokens": 8219401.0, "step": 4460 }, { "entropy": 5.809508180618286, "epoch": 0.375131274942239, "grad_norm": 0.9765625, "learning_rate": 0.0004990441514667993, "loss": 5.7095, "mean_token_accuracy": 0.15698247104883195, "num_tokens": 8228762.0, "step": 4465 }, { "entropy": 5.932300424575805, "epoch": 0.3755513547574039, "grad_norm": 0.984375, "learning_rate": 0.0004990413920598121, "loss": 5.7223, "mean_token_accuracy": 0.15662275701761247, "num_tokens": 8236612.0, "step": 4470 }, { "entropy": 5.896757698059082, "epoch": 0.37597143457256876, "grad_norm": 1.0625, "learning_rate": 0.0004990386286840471, "loss": 5.7335, "mean_token_accuracy": 0.15207386016845703, "num_tokens": 8245043.0, "step": 4475 }, { "entropy": 5.995736980438233, "epoch": 0.37639151438773366, "grad_norm": 0.9921875, "learning_rate": 0.0004990358613395532, "loss": 5.8307, "mean_token_accuracy": 0.15044568330049515, "num_tokens": 8255270.0, "step": 4480 }, { "entropy": 5.938156318664551, "epoch": 0.37681159420289856, "grad_norm": 0.8828125, "learning_rate": 0.0004990330900263792, "loss": 5.7971, "mean_token_accuracy": 0.14653817862272261, "num_tokens": 8264761.0, "step": 4485 }, { "entropy": 5.8954840183258055, "epoch": 0.37723167401806346, "grad_norm": 0.9296875, "learning_rate": 0.0004990303147445745, "loss": 5.7454, "mean_token_accuracy": 0.15479619354009627, "num_tokens": 8274308.0, "step": 4490 }, { "entropy": 5.815971899032593, "epoch": 0.3776517538332283, "grad_norm": 0.96875, "learning_rate": 0.0004990275354941881, "loss": 5.6288, "mean_token_accuracy": 0.1646218091249466, "num_tokens": 8283323.0, "step": 4495 }, { "entropy": 5.893220853805542, "epoch": 0.3780718336483932, "grad_norm": 0.9296875, "learning_rate": 0.0004990247522752694, "loss": 5.9629, "mean_token_accuracy": 0.14029839560389518, "num_tokens": 8293452.0, "step": 4500 }, { "entropy": 5.897252893447876, "epoch": 0.3784919134635581, "grad_norm": 0.9453125, "learning_rate": 0.0004990219650878674, "loss": 5.6576, "mean_token_accuracy": 0.16113524734973908, "num_tokens": 8302941.0, "step": 4505 }, { "entropy": 5.781876134872436, "epoch": 0.37891199327872294, "grad_norm": 1.4140625, "learning_rate": 0.0004990191739320318, "loss": 5.6671, "mean_token_accuracy": 0.1652265876531601, "num_tokens": 8311811.0, "step": 4510 }, { "entropy": 5.76027250289917, "epoch": 0.37933207309388783, "grad_norm": 0.9296875, "learning_rate": 0.0004990163788078117, "loss": 5.5692, "mean_token_accuracy": 0.15842368602752685, "num_tokens": 8321130.0, "step": 4515 }, { "entropy": 5.842820358276367, "epoch": 0.37975215290905273, "grad_norm": 0.8984375, "learning_rate": 0.0004990135797152569, "loss": 5.6768, "mean_token_accuracy": 0.15367345213890077, "num_tokens": 8330233.0, "step": 4520 }, { "entropy": 5.770590019226074, "epoch": 0.3801722327242176, "grad_norm": 0.9609375, "learning_rate": 0.0004990107766544169, "loss": 5.6599, "mean_token_accuracy": 0.16070746779441833, "num_tokens": 8338585.0, "step": 4525 }, { "entropy": 5.844082069396973, "epoch": 0.38059231253938247, "grad_norm": 0.97265625, "learning_rate": 0.0004990079696253413, "loss": 5.7068, "mean_token_accuracy": 0.15848116278648378, "num_tokens": 8346618.0, "step": 4530 }, { "entropy": 5.902699136734009, "epoch": 0.38101239235454737, "grad_norm": 0.96484375, "learning_rate": 0.0004990051586280799, "loss": 5.6829, "mean_token_accuracy": 0.15385363698005677, "num_tokens": 8356273.0, "step": 4535 }, { "entropy": 5.847843742370605, "epoch": 0.38143247216971227, "grad_norm": 0.87890625, "learning_rate": 0.0004990023436626824, "loss": 5.674, "mean_token_accuracy": 0.15799472630023956, "num_tokens": 8366668.0, "step": 4540 }, { "entropy": 5.954341840744019, "epoch": 0.3818525519848771, "grad_norm": 1.1015625, "learning_rate": 0.0004989995247291988, "loss": 5.7933, "mean_token_accuracy": 0.15496921986341478, "num_tokens": 8375610.0, "step": 4545 }, { "entropy": 5.860501337051391, "epoch": 0.382272631800042, "grad_norm": 0.95703125, "learning_rate": 0.0004989967018276789, "loss": 5.6729, "mean_token_accuracy": 0.1558580845594406, "num_tokens": 8384455.0, "step": 4550 }, { "entropy": 5.7317808151245115, "epoch": 0.3826927116152069, "grad_norm": 0.9140625, "learning_rate": 0.0004989938749581727, "loss": 5.7105, "mean_token_accuracy": 0.14987761974334718, "num_tokens": 8393868.0, "step": 4555 }, { "entropy": 5.8618772506713865, "epoch": 0.38311279143037175, "grad_norm": 0.890625, "learning_rate": 0.0004989910441207305, "loss": 5.7312, "mean_token_accuracy": 0.15411882251501083, "num_tokens": 8402916.0, "step": 4560 }, { "entropy": 5.830321025848389, "epoch": 0.38353287124553664, "grad_norm": 1.0546875, "learning_rate": 0.0004989882093154023, "loss": 5.6485, "mean_token_accuracy": 0.1575123891234398, "num_tokens": 8411649.0, "step": 4565 }, { "entropy": 5.8616162776947025, "epoch": 0.38395295106070154, "grad_norm": 0.890625, "learning_rate": 0.0004989853705422381, "loss": 5.769, "mean_token_accuracy": 0.14645260721445083, "num_tokens": 8420393.0, "step": 4570 }, { "entropy": 5.813478136062622, "epoch": 0.38437303087586644, "grad_norm": 0.97265625, "learning_rate": 0.0004989825278012886, "loss": 5.6629, "mean_token_accuracy": 0.154879230260849, "num_tokens": 8429404.0, "step": 4575 }, { "entropy": 5.851570463180542, "epoch": 0.3847931106910313, "grad_norm": 1.078125, "learning_rate": 0.000498979681092604, "loss": 5.703, "mean_token_accuracy": 0.149764809012413, "num_tokens": 8438299.0, "step": 4580 }, { "entropy": 5.760462951660156, "epoch": 0.3852131905061962, "grad_norm": 0.88671875, "learning_rate": 0.0004989768304162345, "loss": 5.6615, "mean_token_accuracy": 0.15541962534189224, "num_tokens": 8447392.0, "step": 4585 }, { "entropy": 5.89559907913208, "epoch": 0.3856332703213611, "grad_norm": 0.90625, "learning_rate": 0.0004989739757722308, "loss": 5.7474, "mean_token_accuracy": 0.14751126170158385, "num_tokens": 8456361.0, "step": 4590 }, { "entropy": 5.852615118026733, "epoch": 0.3860533501365259, "grad_norm": 1.015625, "learning_rate": 0.0004989711171606436, "loss": 5.6747, "mean_token_accuracy": 0.15710035860538482, "num_tokens": 8465548.0, "step": 4595 }, { "entropy": 5.885403347015381, "epoch": 0.3864734299516908, "grad_norm": 0.921875, "learning_rate": 0.0004989682545815232, "loss": 5.6869, "mean_token_accuracy": 0.1525876745581627, "num_tokens": 8474454.0, "step": 4600 }, { "entropy": 5.8074538230896, "epoch": 0.3868935097668557, "grad_norm": 1.0625, "learning_rate": 0.0004989653880349207, "loss": 5.6074, "mean_token_accuracy": 0.1573283538222313, "num_tokens": 8482694.0, "step": 4605 }, { "entropy": 5.842355585098266, "epoch": 0.38731358958202056, "grad_norm": 0.99609375, "learning_rate": 0.0004989625175208864, "loss": 5.7257, "mean_token_accuracy": 0.15177675783634187, "num_tokens": 8491162.0, "step": 4610 }, { "entropy": 5.787636756896973, "epoch": 0.38773366939718545, "grad_norm": 0.9921875, "learning_rate": 0.0004989596430394717, "loss": 5.5752, "mean_token_accuracy": 0.17091956436634065, "num_tokens": 8500716.0, "step": 4615 }, { "entropy": 5.7534934997558596, "epoch": 0.38815374921235035, "grad_norm": 0.9453125, "learning_rate": 0.000498956764590727, "loss": 5.6231, "mean_token_accuracy": 0.1520329423248768, "num_tokens": 8508871.0, "step": 4620 }, { "entropy": 5.890595149993897, "epoch": 0.38857382902751525, "grad_norm": 1.03125, "learning_rate": 0.0004989538821747037, "loss": 5.8315, "mean_token_accuracy": 0.15000174939632416, "num_tokens": 8518450.0, "step": 4625 }, { "entropy": 5.941072607040406, "epoch": 0.3889939088426801, "grad_norm": 0.91015625, "learning_rate": 0.0004989509957914527, "loss": 5.7284, "mean_token_accuracy": 0.15086407959461212, "num_tokens": 8528238.0, "step": 4630 }, { "entropy": 5.794663810729981, "epoch": 0.389413988657845, "grad_norm": 0.89453125, "learning_rate": 0.0004989481054410251, "loss": 5.6258, "mean_token_accuracy": 0.1528220996260643, "num_tokens": 8537587.0, "step": 4635 }, { "entropy": 5.795312023162841, "epoch": 0.3898340684730099, "grad_norm": 0.9609375, "learning_rate": 0.0004989452111234721, "loss": 5.7462, "mean_token_accuracy": 0.1528109699487686, "num_tokens": 8547703.0, "step": 4640 }, { "entropy": 5.84535961151123, "epoch": 0.39025414828817473, "grad_norm": 1.0546875, "learning_rate": 0.000498942312838845, "loss": 5.6766, "mean_token_accuracy": 0.1572122886776924, "num_tokens": 8557001.0, "step": 4645 }, { "entropy": 5.796119689941406, "epoch": 0.3906742281033396, "grad_norm": 1.015625, "learning_rate": 0.0004989394105871952, "loss": 5.5616, "mean_token_accuracy": 0.16711176037788392, "num_tokens": 8565638.0, "step": 4650 }, { "entropy": 5.91137285232544, "epoch": 0.3910943079185045, "grad_norm": 1.046875, "learning_rate": 0.000498936504368574, "loss": 5.7305, "mean_token_accuracy": 0.15593890845775604, "num_tokens": 8574428.0, "step": 4655 }, { "entropy": 5.800365591049195, "epoch": 0.3915143877336694, "grad_norm": 0.93359375, "learning_rate": 0.0004989335941830329, "loss": 5.684, "mean_token_accuracy": 0.15439117401838304, "num_tokens": 8583157.0, "step": 4660 }, { "entropy": 5.817437553405762, "epoch": 0.39193446754883426, "grad_norm": 1.046875, "learning_rate": 0.0004989306800306236, "loss": 5.6621, "mean_token_accuracy": 0.149759341776371, "num_tokens": 8592382.0, "step": 4665 }, { "entropy": 5.7860520362854, "epoch": 0.39235454736399916, "grad_norm": 1.0078125, "learning_rate": 0.0004989277619113975, "loss": 5.6345, "mean_token_accuracy": 0.16216987669467925, "num_tokens": 8601058.0, "step": 4670 }, { "entropy": 5.875742197036743, "epoch": 0.39277462717916406, "grad_norm": 1.0234375, "learning_rate": 0.0004989248398254065, "loss": 5.7352, "mean_token_accuracy": 0.15142691284418106, "num_tokens": 8609479.0, "step": 4675 }, { "entropy": 5.859423112869263, "epoch": 0.3931947069943289, "grad_norm": 0.9296875, "learning_rate": 0.0004989219137727021, "loss": 5.7036, "mean_token_accuracy": 0.15549542009830475, "num_tokens": 8618860.0, "step": 4680 }, { "entropy": 5.81779637336731, "epoch": 0.3936147868094938, "grad_norm": 0.93359375, "learning_rate": 0.0004989189837533365, "loss": 5.6363, "mean_token_accuracy": 0.1587088495492935, "num_tokens": 8627462.0, "step": 4685 }, { "entropy": 5.924579429626465, "epoch": 0.3940348666246587, "grad_norm": 0.83203125, "learning_rate": 0.0004989160497673613, "loss": 5.8254, "mean_token_accuracy": 0.1513897880911827, "num_tokens": 8637569.0, "step": 4690 }, { "entropy": 5.850678825378418, "epoch": 0.39445494643982354, "grad_norm": 1.0546875, "learning_rate": 0.0004989131118148286, "loss": 5.6177, "mean_token_accuracy": 0.15605207085609435, "num_tokens": 8645440.0, "step": 4695 }, { "entropy": 5.835308980941773, "epoch": 0.39487502625498844, "grad_norm": 0.9453125, "learning_rate": 0.0004989101698957904, "loss": 5.7682, "mean_token_accuracy": 0.15626595616340638, "num_tokens": 8655077.0, "step": 4700 }, { "entropy": 5.830049610137939, "epoch": 0.39529510607015333, "grad_norm": 1.0390625, "learning_rate": 0.0004989072240102988, "loss": 5.6957, "mean_token_accuracy": 0.16012858897447585, "num_tokens": 8663126.0, "step": 4705 }, { "entropy": 5.901100158691406, "epoch": 0.39571518588531823, "grad_norm": 0.94921875, "learning_rate": 0.0004989042741584061, "loss": 5.6726, "mean_token_accuracy": 0.15270041525363923, "num_tokens": 8672386.0, "step": 4710 }, { "entropy": 5.7314942359924315, "epoch": 0.3961352657004831, "grad_norm": 0.91796875, "learning_rate": 0.0004989013203401645, "loss": 5.612, "mean_token_accuracy": 0.1580759972333908, "num_tokens": 8681930.0, "step": 4715 }, { "entropy": 5.797902965545655, "epoch": 0.396555345515648, "grad_norm": 0.9375, "learning_rate": 0.0004988983625556264, "loss": 5.6787, "mean_token_accuracy": 0.15581901967525483, "num_tokens": 8690993.0, "step": 4720 }, { "entropy": 5.798060894012451, "epoch": 0.39697542533081287, "grad_norm": 1.0234375, "learning_rate": 0.0004988954008048438, "loss": 5.672, "mean_token_accuracy": 0.15935962349176408, "num_tokens": 8699497.0, "step": 4725 }, { "entropy": 5.933620643615723, "epoch": 0.3973955051459777, "grad_norm": 0.9296875, "learning_rate": 0.0004988924350878697, "loss": 5.8568, "mean_token_accuracy": 0.14457278251647948, "num_tokens": 8709274.0, "step": 4730 }, { "entropy": 5.934816789627075, "epoch": 0.3978155849611426, "grad_norm": 0.96484375, "learning_rate": 0.0004988894654047563, "loss": 5.7297, "mean_token_accuracy": 0.15009873509407043, "num_tokens": 8718158.0, "step": 4735 }, { "entropy": 5.786411237716675, "epoch": 0.3982356647763075, "grad_norm": 0.93359375, "learning_rate": 0.0004988864917555562, "loss": 5.5866, "mean_token_accuracy": 0.15930677056312562, "num_tokens": 8727459.0, "step": 4740 }, { "entropy": 5.864226961135865, "epoch": 0.3986557445914724, "grad_norm": 0.9609375, "learning_rate": 0.0004988835141403224, "loss": 5.7293, "mean_token_accuracy": 0.15878916680812835, "num_tokens": 8737614.0, "step": 4745 }, { "entropy": 5.824589109420776, "epoch": 0.39907582440663725, "grad_norm": 0.9921875, "learning_rate": 0.0004988805325591073, "loss": 5.56, "mean_token_accuracy": 0.15695197582244874, "num_tokens": 8746799.0, "step": 4750 }, { "entropy": 5.8385083198547365, "epoch": 0.39949590422180214, "grad_norm": 0.96484375, "learning_rate": 0.0004988775470119639, "loss": 5.7326, "mean_token_accuracy": 0.14953183978796006, "num_tokens": 8756555.0, "step": 4755 }, { "entropy": 5.7729175090789795, "epoch": 0.39991598403696704, "grad_norm": 0.9296875, "learning_rate": 0.0004988745574989451, "loss": 5.7535, "mean_token_accuracy": 0.15938151776790618, "num_tokens": 8765849.0, "step": 4760 }, { "entropy": 5.965050411224365, "epoch": 0.4003360638521319, "grad_norm": 0.9296875, "learning_rate": 0.0004988715640201036, "loss": 5.8322, "mean_token_accuracy": 0.14530889242887496, "num_tokens": 8775713.0, "step": 4765 }, { "entropy": 5.839820480346679, "epoch": 0.4007561436672968, "grad_norm": 0.953125, "learning_rate": 0.0004988685665754928, "loss": 5.6466, "mean_token_accuracy": 0.1569948598742485, "num_tokens": 8784717.0, "step": 4770 }, { "entropy": 5.792028474807739, "epoch": 0.4011762234824617, "grad_norm": 0.98046875, "learning_rate": 0.0004988655651651656, "loss": 5.6649, "mean_token_accuracy": 0.15628512352705, "num_tokens": 8794388.0, "step": 4775 }, { "entropy": 5.755618572235107, "epoch": 0.4015963032976265, "grad_norm": 1.0078125, "learning_rate": 0.0004988625597891751, "loss": 5.6762, "mean_token_accuracy": 0.15925197303295135, "num_tokens": 8802436.0, "step": 4780 }, { "entropy": 5.85797004699707, "epoch": 0.4020163831127914, "grad_norm": 0.9140625, "learning_rate": 0.0004988595504475746, "loss": 5.6376, "mean_token_accuracy": 0.15845684409141542, "num_tokens": 8811184.0, "step": 4785 }, { "entropy": 5.920813274383545, "epoch": 0.4024364629279563, "grad_norm": 0.94921875, "learning_rate": 0.0004988565371404175, "loss": 5.7115, "mean_token_accuracy": 0.15826244726777078, "num_tokens": 8820525.0, "step": 4790 }, { "entropy": 5.790119886398315, "epoch": 0.4028565427431212, "grad_norm": 1.03125, "learning_rate": 0.0004988535198677571, "loss": 5.5798, "mean_token_accuracy": 0.16315356642007828, "num_tokens": 8828928.0, "step": 4795 }, { "entropy": 5.902295684814453, "epoch": 0.40327662255828606, "grad_norm": 1.0234375, "learning_rate": 0.0004988504986296469, "loss": 5.7884, "mean_token_accuracy": 0.1443356990814209, "num_tokens": 8838615.0, "step": 4800 }, { "entropy": 5.862144041061401, "epoch": 0.40369670237345096, "grad_norm": 0.88671875, "learning_rate": 0.0004988474734261404, "loss": 5.769, "mean_token_accuracy": 0.1485462300479412, "num_tokens": 8848709.0, "step": 4805 }, { "entropy": 5.8929126262664795, "epoch": 0.40411678218861585, "grad_norm": 0.9140625, "learning_rate": 0.0004988444442572911, "loss": 5.7251, "mean_token_accuracy": 0.14630650877952575, "num_tokens": 8858277.0, "step": 4810 }, { "entropy": 5.814572858810425, "epoch": 0.4045368620037807, "grad_norm": 0.9140625, "learning_rate": 0.0004988414111231528, "loss": 5.6716, "mean_token_accuracy": 0.15942000597715378, "num_tokens": 8868436.0, "step": 4815 }, { "entropy": 5.8521270751953125, "epoch": 0.4049569418189456, "grad_norm": 0.86328125, "learning_rate": 0.000498838374023779, "loss": 5.6738, "mean_token_accuracy": 0.15392234772443772, "num_tokens": 8877740.0, "step": 4820 }, { "entropy": 5.896619701385498, "epoch": 0.4053770216341105, "grad_norm": 0.875, "learning_rate": 0.0004988353329592239, "loss": 5.6449, "mean_token_accuracy": 0.15986622273921966, "num_tokens": 8887408.0, "step": 4825 }, { "entropy": 5.889400386810303, "epoch": 0.4057971014492754, "grad_norm": 0.984375, "learning_rate": 0.0004988322879295409, "loss": 5.8084, "mean_token_accuracy": 0.151357901096344, "num_tokens": 8897141.0, "step": 4830 }, { "entropy": 5.732660865783691, "epoch": 0.40621718126444023, "grad_norm": 0.96875, "learning_rate": 0.0004988292389347844, "loss": 5.5937, "mean_token_accuracy": 0.16834330409765244, "num_tokens": 8905747.0, "step": 4835 }, { "entropy": 5.910235500335693, "epoch": 0.40663726107960513, "grad_norm": 0.99609375, "learning_rate": 0.000498826185975008, "loss": 5.7403, "mean_token_accuracy": 0.15051692128181457, "num_tokens": 8914926.0, "step": 4840 }, { "entropy": 5.855715417861939, "epoch": 0.40705734089477, "grad_norm": 0.9375, "learning_rate": 0.0004988231290502662, "loss": 5.7351, "mean_token_accuracy": 0.15608510375022888, "num_tokens": 8923956.0, "step": 4845 }, { "entropy": 5.844746065139771, "epoch": 0.40747742070993487, "grad_norm": 0.99609375, "learning_rate": 0.0004988200681606127, "loss": 5.6105, "mean_token_accuracy": 0.15472539961338044, "num_tokens": 8932654.0, "step": 4850 }, { "entropy": 5.819759750366211, "epoch": 0.40789750052509977, "grad_norm": 0.97265625, "learning_rate": 0.000498817003306102, "loss": 5.602, "mean_token_accuracy": 0.1623125731945038, "num_tokens": 8941716.0, "step": 4855 }, { "entropy": 5.776214361190796, "epoch": 0.40831758034026466, "grad_norm": 1.0234375, "learning_rate": 0.0004988139344867884, "loss": 5.6825, "mean_token_accuracy": 0.1535426653921604, "num_tokens": 8950377.0, "step": 4860 }, { "entropy": 5.807446241378784, "epoch": 0.4087376601554295, "grad_norm": 0.984375, "learning_rate": 0.0004988108617027261, "loss": 5.6579, "mean_token_accuracy": 0.15453788191080092, "num_tokens": 8959857.0, "step": 4865 }, { "entropy": 5.781218004226685, "epoch": 0.4091577399705944, "grad_norm": 0.90234375, "learning_rate": 0.0004988077849539698, "loss": 5.5902, "mean_token_accuracy": 0.15969525128602982, "num_tokens": 8968272.0, "step": 4870 }, { "entropy": 5.820656394958496, "epoch": 0.4095778197857593, "grad_norm": 1.015625, "learning_rate": 0.0004988047042405736, "loss": 5.6674, "mean_token_accuracy": 0.15931978076696396, "num_tokens": 8977445.0, "step": 4875 }, { "entropy": 5.915397691726684, "epoch": 0.4099978996009242, "grad_norm": 0.875, "learning_rate": 0.0004988016195625924, "loss": 5.7299, "mean_token_accuracy": 0.15139664933085442, "num_tokens": 8987315.0, "step": 4880 }, { "entropy": 5.871594667434692, "epoch": 0.41041797941608904, "grad_norm": 0.90234375, "learning_rate": 0.0004987985309200807, "loss": 5.7173, "mean_token_accuracy": 0.15377188473939896, "num_tokens": 8998119.0, "step": 4885 }, { "entropy": 5.775591278076172, "epoch": 0.41083805923125394, "grad_norm": 0.97265625, "learning_rate": 0.0004987954383130934, "loss": 5.6066, "mean_token_accuracy": 0.16712582856416702, "num_tokens": 9007167.0, "step": 4890 }, { "entropy": 5.807595109939575, "epoch": 0.41125813904641884, "grad_norm": 0.9453125, "learning_rate": 0.000498792341741685, "loss": 5.6687, "mean_token_accuracy": 0.1526729181408882, "num_tokens": 9016690.0, "step": 4895 }, { "entropy": 5.874031114578247, "epoch": 0.4116782188615837, "grad_norm": 0.91015625, "learning_rate": 0.0004987892412059106, "loss": 5.758, "mean_token_accuracy": 0.15407043546438218, "num_tokens": 9026117.0, "step": 4900 }, { "entropy": 5.780725193023682, "epoch": 0.4120982986767486, "grad_norm": 0.95703125, "learning_rate": 0.0004987861367058251, "loss": 5.644, "mean_token_accuracy": 0.1559523746371269, "num_tokens": 9035754.0, "step": 4905 }, { "entropy": 5.826504945755005, "epoch": 0.4125183784919135, "grad_norm": 0.96875, "learning_rate": 0.0004987830282414833, "loss": 5.642, "mean_token_accuracy": 0.15711333677172662, "num_tokens": 9045453.0, "step": 4910 }, { "entropy": 5.873796701431274, "epoch": 0.41293845830707837, "grad_norm": 0.94921875, "learning_rate": 0.0004987799158129404, "loss": 5.7527, "mean_token_accuracy": 0.15677697360515594, "num_tokens": 9056045.0, "step": 4915 }, { "entropy": 5.820205545425415, "epoch": 0.4133585381222432, "grad_norm": 0.94921875, "learning_rate": 0.0004987767994202516, "loss": 5.6455, "mean_token_accuracy": 0.1496775045990944, "num_tokens": 9065728.0, "step": 4920 }, { "entropy": 5.809246253967285, "epoch": 0.4137786179374081, "grad_norm": 0.9375, "learning_rate": 0.0004987736790634719, "loss": 5.6661, "mean_token_accuracy": 0.15184428542852402, "num_tokens": 9075522.0, "step": 4925 }, { "entropy": 5.794481945037842, "epoch": 0.414198697752573, "grad_norm": 1.09375, "learning_rate": 0.0004987705547426568, "loss": 5.6358, "mean_token_accuracy": 0.1499626338481903, "num_tokens": 9084412.0, "step": 4930 }, { "entropy": 5.868565320968628, "epoch": 0.41461877756773785, "grad_norm": 0.88671875, "learning_rate": 0.0004987674264578615, "loss": 5.6942, "mean_token_accuracy": 0.15214097648859023, "num_tokens": 9094289.0, "step": 4935 }, { "entropy": 5.820976829528808, "epoch": 0.41503885738290275, "grad_norm": 0.9921875, "learning_rate": 0.0004987642942091414, "loss": 5.6177, "mean_token_accuracy": 0.15684758871793747, "num_tokens": 9103124.0, "step": 4940 }, { "entropy": 5.808840274810791, "epoch": 0.41545893719806765, "grad_norm": 0.8984375, "learning_rate": 0.0004987611579965523, "loss": 5.5534, "mean_token_accuracy": 0.15804969370365143, "num_tokens": 9112794.0, "step": 4945 }, { "entropy": 5.837375354766846, "epoch": 0.4158790170132325, "grad_norm": 0.8359375, "learning_rate": 0.0004987580178201492, "loss": 5.7246, "mean_token_accuracy": 0.16285934299230576, "num_tokens": 9122718.0, "step": 4950 }, { "entropy": 5.831628942489624, "epoch": 0.4162990968283974, "grad_norm": 1.0234375, "learning_rate": 0.0004987548736799882, "loss": 5.7454, "mean_token_accuracy": 0.1529500514268875, "num_tokens": 9131855.0, "step": 4955 }, { "entropy": 5.798128986358643, "epoch": 0.4167191766435623, "grad_norm": 1.0, "learning_rate": 0.0004987517255761248, "loss": 5.6019, "mean_token_accuracy": 0.1599896475672722, "num_tokens": 9141102.0, "step": 4960 }, { "entropy": 5.77801775932312, "epoch": 0.4171392564587272, "grad_norm": 1.0234375, "learning_rate": 0.0004987485735086148, "loss": 5.6601, "mean_token_accuracy": 0.16009112149477006, "num_tokens": 9150552.0, "step": 4965 }, { "entropy": 5.852486228942871, "epoch": 0.417559336273892, "grad_norm": 0.93359375, "learning_rate": 0.000498745417477514, "loss": 5.657, "mean_token_accuracy": 0.15402564853429795, "num_tokens": 9160105.0, "step": 4970 }, { "entropy": 5.779581785202026, "epoch": 0.4179794160890569, "grad_norm": 0.9296875, "learning_rate": 0.0004987422574828784, "loss": 5.6566, "mean_token_accuracy": 0.15598243325948716, "num_tokens": 9169367.0, "step": 4975 }, { "entropy": 5.786018943786621, "epoch": 0.4183994959042218, "grad_norm": 1.046875, "learning_rate": 0.0004987390935247639, "loss": 5.5264, "mean_token_accuracy": 0.16368313133716583, "num_tokens": 9177872.0, "step": 4980 }, { "entropy": 5.82407512664795, "epoch": 0.41881957571938666, "grad_norm": 1.109375, "learning_rate": 0.0004987359256032265, "loss": 5.7466, "mean_token_accuracy": 0.151212839782238, "num_tokens": 9187879.0, "step": 4985 }, { "entropy": 5.807058525085449, "epoch": 0.41923965553455156, "grad_norm": 0.8671875, "learning_rate": 0.0004987327537183225, "loss": 5.6561, "mean_token_accuracy": 0.15415959805250168, "num_tokens": 9198281.0, "step": 4990 }, { "entropy": 5.805870008468628, "epoch": 0.41965973534971646, "grad_norm": 0.89453125, "learning_rate": 0.0004987295778701078, "loss": 5.6394, "mean_token_accuracy": 0.16050323396921157, "num_tokens": 9207670.0, "step": 4995 }, { "entropy": 5.877247047424317, "epoch": 0.42007981516488135, "grad_norm": 1.046875, "learning_rate": 0.000498726398058639, "loss": 5.6482, "mean_token_accuracy": 0.16082072257995605, "num_tokens": 9216995.0, "step": 5000 }, { "entropy": 5.812716388702393, "epoch": 0.4204998949800462, "grad_norm": 0.875, "learning_rate": 0.0004987232142839723, "loss": 5.7482, "mean_token_accuracy": 0.1490781858563423, "num_tokens": 9227330.0, "step": 5005 }, { "entropy": 5.844203805923462, "epoch": 0.4209199747952111, "grad_norm": 0.91796875, "learning_rate": 0.0004987200265461638, "loss": 5.656, "mean_token_accuracy": 0.16385895162820815, "num_tokens": 9236666.0, "step": 5010 }, { "entropy": 5.85231499671936, "epoch": 0.421340054610376, "grad_norm": 0.9609375, "learning_rate": 0.0004987168348452705, "loss": 5.6595, "mean_token_accuracy": 0.16210315823554994, "num_tokens": 9246388.0, "step": 5015 }, { "entropy": 5.789185667037964, "epoch": 0.42176013442554083, "grad_norm": 0.93359375, "learning_rate": 0.0004987136391813485, "loss": 5.6096, "mean_token_accuracy": 0.16511590033769608, "num_tokens": 9255239.0, "step": 5020 }, { "entropy": 5.742922639846801, "epoch": 0.42218021424070573, "grad_norm": 0.95703125, "learning_rate": 0.0004987104395544547, "loss": 5.5924, "mean_token_accuracy": 0.15797384828329086, "num_tokens": 9264468.0, "step": 5025 }, { "entropy": 5.819699382781982, "epoch": 0.42260029405587063, "grad_norm": 0.98046875, "learning_rate": 0.0004987072359646455, "loss": 5.6607, "mean_token_accuracy": 0.16205601245164872, "num_tokens": 9274140.0, "step": 5030 }, { "entropy": 5.83985595703125, "epoch": 0.42302037387103547, "grad_norm": 0.9609375, "learning_rate": 0.0004987040284119778, "loss": 5.6327, "mean_token_accuracy": 0.1588321939110756, "num_tokens": 9283539.0, "step": 5035 }, { "entropy": 5.751109886169433, "epoch": 0.42344045368620037, "grad_norm": 1.0234375, "learning_rate": 0.0004987008168965087, "loss": 5.6403, "mean_token_accuracy": 0.1550469622015953, "num_tokens": 9292664.0, "step": 5040 }, { "entropy": 5.876785469055176, "epoch": 0.42386053350136527, "grad_norm": 0.890625, "learning_rate": 0.0004986976014182946, "loss": 5.7374, "mean_token_accuracy": 0.1531568393111229, "num_tokens": 9302814.0, "step": 5045 }, { "entropy": 5.890387535095215, "epoch": 0.42428061331653016, "grad_norm": 0.98046875, "learning_rate": 0.0004986943819773927, "loss": 5.7332, "mean_token_accuracy": 0.15649186819791794, "num_tokens": 9312654.0, "step": 5050 }, { "entropy": 5.8707475662231445, "epoch": 0.424700693131695, "grad_norm": 0.92578125, "learning_rate": 0.00049869115857386, "loss": 5.7558, "mean_token_accuracy": 0.14800945520401002, "num_tokens": 9322271.0, "step": 5055 }, { "entropy": 5.878791618347168, "epoch": 0.4251207729468599, "grad_norm": 0.86328125, "learning_rate": 0.0004986879312077536, "loss": 5.688, "mean_token_accuracy": 0.15585887283086777, "num_tokens": 9331341.0, "step": 5060 }, { "entropy": 5.796487426757812, "epoch": 0.4255408527620248, "grad_norm": 1.0078125, "learning_rate": 0.0004986846998791308, "loss": 5.6274, "mean_token_accuracy": 0.15625337660312652, "num_tokens": 9339863.0, "step": 5065 }, { "entropy": 5.72486629486084, "epoch": 0.42596093257718964, "grad_norm": 1.0078125, "learning_rate": 0.0004986814645880485, "loss": 5.5974, "mean_token_accuracy": 0.16185437515377998, "num_tokens": 9349488.0, "step": 5070 }, { "entropy": 5.7803843975067135, "epoch": 0.42638101239235454, "grad_norm": 0.89453125, "learning_rate": 0.0004986782253345645, "loss": 5.6105, "mean_token_accuracy": 0.15332376062870026, "num_tokens": 9357977.0, "step": 5075 }, { "entropy": 5.823932743072509, "epoch": 0.42680109220751944, "grad_norm": 0.92578125, "learning_rate": 0.0004986749821187358, "loss": 5.7156, "mean_token_accuracy": 0.15630935728549958, "num_tokens": 9367449.0, "step": 5080 }, { "entropy": 5.89394211769104, "epoch": 0.42722117202268434, "grad_norm": 0.97265625, "learning_rate": 0.00049867173494062, "loss": 5.7321, "mean_token_accuracy": 0.15639646500349044, "num_tokens": 9377070.0, "step": 5085 }, { "entropy": 5.765441846847534, "epoch": 0.4276412518378492, "grad_norm": 1.0, "learning_rate": 0.0004986684838002744, "loss": 5.5217, "mean_token_accuracy": 0.15419476479291916, "num_tokens": 9385881.0, "step": 5090 }, { "entropy": 5.770947122573853, "epoch": 0.4280613316530141, "grad_norm": 0.94140625, "learning_rate": 0.0004986652286977569, "loss": 5.6523, "mean_token_accuracy": 0.15255010426044463, "num_tokens": 9395159.0, "step": 5095 }, { "entropy": 5.805099630355835, "epoch": 0.428481411468179, "grad_norm": 0.91015625, "learning_rate": 0.0004986619696331252, "loss": 5.6045, "mean_token_accuracy": 0.1583484500646591, "num_tokens": 9404590.0, "step": 5100 }, { "entropy": 5.841793823242187, "epoch": 0.4289014912833438, "grad_norm": 0.8515625, "learning_rate": 0.0004986587066064367, "loss": 5.6238, "mean_token_accuracy": 0.1618543565273285, "num_tokens": 9414452.0, "step": 5105 }, { "entropy": 5.882272624969483, "epoch": 0.4293215710985087, "grad_norm": 0.96875, "learning_rate": 0.0004986554396177494, "loss": 5.7691, "mean_token_accuracy": 0.1512654058635235, "num_tokens": 9424004.0, "step": 5110 }, { "entropy": 5.826911163330078, "epoch": 0.4297416509136736, "grad_norm": 0.88671875, "learning_rate": 0.0004986521686671212, "loss": 5.6377, "mean_token_accuracy": 0.16602189987897872, "num_tokens": 9433487.0, "step": 5115 }, { "entropy": 5.761785840988159, "epoch": 0.43016173072883845, "grad_norm": 1.046875, "learning_rate": 0.00049864889375461, "loss": 5.701, "mean_token_accuracy": 0.15255770534276963, "num_tokens": 9442742.0, "step": 5120 }, { "entropy": 5.816967296600342, "epoch": 0.43058181054400335, "grad_norm": 0.8984375, "learning_rate": 0.0004986456148802738, "loss": 5.7673, "mean_token_accuracy": 0.15205237418413162, "num_tokens": 9452550.0, "step": 5125 }, { "entropy": 5.930779886245728, "epoch": 0.43100189035916825, "grad_norm": 0.94921875, "learning_rate": 0.0004986423320441707, "loss": 5.7143, "mean_token_accuracy": 0.14957663267850876, "num_tokens": 9461920.0, "step": 5130 }, { "entropy": 5.818691873550415, "epoch": 0.43142197017433315, "grad_norm": 1.0, "learning_rate": 0.0004986390452463588, "loss": 5.6211, "mean_token_accuracy": 0.15580169409513472, "num_tokens": 9470817.0, "step": 5135 }, { "entropy": 5.700370407104492, "epoch": 0.431842049989498, "grad_norm": 0.9921875, "learning_rate": 0.0004986357544868964, "loss": 5.5801, "mean_token_accuracy": 0.1596447467803955, "num_tokens": 9479936.0, "step": 5140 }, { "entropy": 5.841777086257935, "epoch": 0.4322621298046629, "grad_norm": 0.96875, "learning_rate": 0.0004986324597658418, "loss": 5.6155, "mean_token_accuracy": 0.16243926435709, "num_tokens": 9489818.0, "step": 5145 }, { "entropy": 5.728731489181518, "epoch": 0.4326822096198278, "grad_norm": 0.9375, "learning_rate": 0.0004986291610832533, "loss": 5.624, "mean_token_accuracy": 0.153781495988369, "num_tokens": 9499688.0, "step": 5150 }, { "entropy": 5.918451547622681, "epoch": 0.4331022894349926, "grad_norm": 0.99609375, "learning_rate": 0.0004986258584391892, "loss": 5.6774, "mean_token_accuracy": 0.15540721267461777, "num_tokens": 9509581.0, "step": 5155 }, { "entropy": 5.923600053787231, "epoch": 0.4335223692501575, "grad_norm": 0.96484375, "learning_rate": 0.0004986225518337084, "loss": 5.7525, "mean_token_accuracy": 0.15666318088769912, "num_tokens": 9518556.0, "step": 5160 }, { "entropy": 5.714486789703369, "epoch": 0.4339424490653224, "grad_norm": 0.91015625, "learning_rate": 0.0004986192412668692, "loss": 5.6587, "mean_token_accuracy": 0.1547637924551964, "num_tokens": 9527612.0, "step": 5165 }, { "entropy": 5.787137269973755, "epoch": 0.4343625288804873, "grad_norm": 1.03125, "learning_rate": 0.0004986159267387302, "loss": 5.5546, "mean_token_accuracy": 0.16138194501399994, "num_tokens": 9535882.0, "step": 5170 }, { "entropy": 5.797946739196777, "epoch": 0.43478260869565216, "grad_norm": 0.953125, "learning_rate": 0.0004986126082493502, "loss": 5.656, "mean_token_accuracy": 0.1613065406680107, "num_tokens": 9544799.0, "step": 5175 }, { "entropy": 5.779606723785401, "epoch": 0.43520268851081706, "grad_norm": 0.890625, "learning_rate": 0.0004986092857987881, "loss": 5.5729, "mean_token_accuracy": 0.1618928477168083, "num_tokens": 9553805.0, "step": 5180 }, { "entropy": 5.782668399810791, "epoch": 0.43562276832598196, "grad_norm": 0.94921875, "learning_rate": 0.0004986059593871026, "loss": 5.5971, "mean_token_accuracy": 0.1598972573876381, "num_tokens": 9563493.0, "step": 5185 }, { "entropy": 5.800241613388062, "epoch": 0.4360428481411468, "grad_norm": 0.89453125, "learning_rate": 0.0004986026290143527, "loss": 5.6842, "mean_token_accuracy": 0.15388598516583443, "num_tokens": 9572297.0, "step": 5190 }, { "entropy": 5.936120653152466, "epoch": 0.4364629279563117, "grad_norm": 1.0390625, "learning_rate": 0.0004985992946805973, "loss": 5.8134, "mean_token_accuracy": 0.15065453350543975, "num_tokens": 9581967.0, "step": 5195 }, { "entropy": 5.819184160232544, "epoch": 0.4368830077714766, "grad_norm": 0.8828125, "learning_rate": 0.0004985959563858955, "loss": 5.7273, "mean_token_accuracy": 0.16100031584501268, "num_tokens": 9590885.0, "step": 5200 }, { "entropy": 5.860151624679565, "epoch": 0.43730308758664144, "grad_norm": 0.953125, "learning_rate": 0.0004985926141303066, "loss": 5.6532, "mean_token_accuracy": 0.1567025899887085, "num_tokens": 9599247.0, "step": 5205 }, { "entropy": 5.818394136428833, "epoch": 0.43772316740180633, "grad_norm": 1.0546875, "learning_rate": 0.0004985892679138896, "loss": 5.571, "mean_token_accuracy": 0.16371893361210824, "num_tokens": 9608296.0, "step": 5210 }, { "entropy": 5.8166498184204105, "epoch": 0.43814324721697123, "grad_norm": 1.1015625, "learning_rate": 0.0004985859177367038, "loss": 5.6242, "mean_token_accuracy": 0.15776645839214326, "num_tokens": 9616734.0, "step": 5215 }, { "entropy": 5.83067135810852, "epoch": 0.43856332703213613, "grad_norm": 0.890625, "learning_rate": 0.0004985825635988087, "loss": 5.699, "mean_token_accuracy": 0.1571464478969574, "num_tokens": 9626246.0, "step": 5220 }, { "entropy": 5.7702131271362305, "epoch": 0.43898340684730097, "grad_norm": 0.96484375, "learning_rate": 0.0004985792055002635, "loss": 5.5794, "mean_token_accuracy": 0.16028426140546798, "num_tokens": 9634963.0, "step": 5225 }, { "entropy": 5.8400349617004395, "epoch": 0.43940348666246587, "grad_norm": 1.0078125, "learning_rate": 0.0004985758434411278, "loss": 5.6513, "mean_token_accuracy": 0.16422291100025177, "num_tokens": 9643615.0, "step": 5230 }, { "entropy": 5.810837030410767, "epoch": 0.43982356647763077, "grad_norm": 0.96484375, "learning_rate": 0.0004985724774214613, "loss": 5.6244, "mean_token_accuracy": 0.15992441177368164, "num_tokens": 9653306.0, "step": 5235 }, { "entropy": 5.767703294754028, "epoch": 0.4402436462927956, "grad_norm": 0.9453125, "learning_rate": 0.0004985691074413233, "loss": 5.6505, "mean_token_accuracy": 0.15613847076892853, "num_tokens": 9662389.0, "step": 5240 }, { "entropy": 5.753371381759644, "epoch": 0.4406637261079605, "grad_norm": 0.94921875, "learning_rate": 0.0004985657335007739, "loss": 5.6446, "mean_token_accuracy": 0.15534982979297637, "num_tokens": 9671183.0, "step": 5245 }, { "entropy": 5.836323595046997, "epoch": 0.4410838059231254, "grad_norm": 0.90234375, "learning_rate": 0.0004985623555998725, "loss": 5.6222, "mean_token_accuracy": 0.16474147886037827, "num_tokens": 9680544.0, "step": 5250 }, { "entropy": 5.819104290008545, "epoch": 0.4415038857382903, "grad_norm": 0.99609375, "learning_rate": 0.0004985589737386791, "loss": 5.6779, "mean_token_accuracy": 0.15779446437954903, "num_tokens": 9690137.0, "step": 5255 }, { "entropy": 5.74895076751709, "epoch": 0.44192396555345514, "grad_norm": 0.94921875, "learning_rate": 0.0004985555879172535, "loss": 5.6131, "mean_token_accuracy": 0.16228249818086624, "num_tokens": 9699149.0, "step": 5260 }, { "entropy": 5.830872917175293, "epoch": 0.44234404536862004, "grad_norm": 0.89453125, "learning_rate": 0.000498552198135656, "loss": 5.6857, "mean_token_accuracy": 0.16091985404491424, "num_tokens": 9709308.0, "step": 5265 }, { "entropy": 5.817913627624511, "epoch": 0.44276412518378494, "grad_norm": 1.15625, "learning_rate": 0.0004985488043939462, "loss": 5.6133, "mean_token_accuracy": 0.15377137959003448, "num_tokens": 9718462.0, "step": 5270 }, { "entropy": 5.762473201751709, "epoch": 0.4431842049989498, "grad_norm": 0.953125, "learning_rate": 0.0004985454066921846, "loss": 5.5442, "mean_token_accuracy": 0.16455349177122117, "num_tokens": 9727626.0, "step": 5275 }, { "entropy": 5.663512516021728, "epoch": 0.4436042848141147, "grad_norm": 0.91015625, "learning_rate": 0.0004985420050304312, "loss": 5.5827, "mean_token_accuracy": 0.15936666429042817, "num_tokens": 9737091.0, "step": 5280 }, { "entropy": 5.770118761062622, "epoch": 0.4440243646292796, "grad_norm": 1.015625, "learning_rate": 0.0004985385994087462, "loss": 5.6417, "mean_token_accuracy": 0.1584844209253788, "num_tokens": 9746135.0, "step": 5285 }, { "entropy": 5.844138050079346, "epoch": 0.4444444444444444, "grad_norm": 1.015625, "learning_rate": 0.0004985351898271901, "loss": 5.5853, "mean_token_accuracy": 0.1622116059064865, "num_tokens": 9754549.0, "step": 5290 }, { "entropy": 5.83607120513916, "epoch": 0.4448645242596093, "grad_norm": 0.95703125, "learning_rate": 0.0004985317762858231, "loss": 5.7065, "mean_token_accuracy": 0.1499613419175148, "num_tokens": 9764219.0, "step": 5295 }, { "entropy": 5.792026853561401, "epoch": 0.4452846040747742, "grad_norm": 0.984375, "learning_rate": 0.000498528358784706, "loss": 5.5519, "mean_token_accuracy": 0.1638228639960289, "num_tokens": 9772234.0, "step": 5300 }, { "entropy": 5.749575090408325, "epoch": 0.4457046838899391, "grad_norm": 0.9375, "learning_rate": 0.000498524937323899, "loss": 5.6106, "mean_token_accuracy": 0.16515014916658402, "num_tokens": 9781417.0, "step": 5305 }, { "entropy": 5.9356084823608395, "epoch": 0.44612476370510395, "grad_norm": 0.90625, "learning_rate": 0.0004985215119034628, "loss": 5.7505, "mean_token_accuracy": 0.14851112440228462, "num_tokens": 9791286.0, "step": 5310 }, { "entropy": 5.8016856670379635, "epoch": 0.44654484352026885, "grad_norm": 0.95703125, "learning_rate": 0.0004985180825234582, "loss": 5.7329, "mean_token_accuracy": 0.15573213249444962, "num_tokens": 9802157.0, "step": 5315 }, { "entropy": 5.89680552482605, "epoch": 0.44696492333543375, "grad_norm": 0.93359375, "learning_rate": 0.0004985146491839459, "loss": 5.7173, "mean_token_accuracy": 0.1475129798054695, "num_tokens": 9812646.0, "step": 5320 }, { "entropy": 5.870607805252075, "epoch": 0.4473850031505986, "grad_norm": 0.9765625, "learning_rate": 0.0004985112118849865, "loss": 5.7088, "mean_token_accuracy": 0.15120236873626708, "num_tokens": 9822274.0, "step": 5325 }, { "entropy": 5.753091526031494, "epoch": 0.4478050829657635, "grad_norm": 0.9609375, "learning_rate": 0.0004985077706266412, "loss": 5.5294, "mean_token_accuracy": 0.15791643261909485, "num_tokens": 9831337.0, "step": 5330 }, { "entropy": 5.79245548248291, "epoch": 0.4482251627809284, "grad_norm": 0.8828125, "learning_rate": 0.0004985043254089708, "loss": 5.6629, "mean_token_accuracy": 0.15153390020132065, "num_tokens": 9840798.0, "step": 5335 }, { "entropy": 5.723747682571411, "epoch": 0.44864524259609323, "grad_norm": 0.953125, "learning_rate": 0.0004985008762320364, "loss": 5.637, "mean_token_accuracy": 0.15859152227640153, "num_tokens": 9850117.0, "step": 5340 }, { "entropy": 5.79846601486206, "epoch": 0.4490653224112581, "grad_norm": 0.9921875, "learning_rate": 0.000498497423095899, "loss": 5.5724, "mean_token_accuracy": 0.16569938510656357, "num_tokens": 9858227.0, "step": 5345 }, { "entropy": 5.755469799041748, "epoch": 0.449485402226423, "grad_norm": 0.9140625, "learning_rate": 0.0004984939660006199, "loss": 5.6759, "mean_token_accuracy": 0.15846239179372787, "num_tokens": 9867157.0, "step": 5350 }, { "entropy": 5.7474853515625, "epoch": 0.4499054820415879, "grad_norm": 0.9453125, "learning_rate": 0.0004984905049462602, "loss": 5.5876, "mean_token_accuracy": 0.15728517472743989, "num_tokens": 9877045.0, "step": 5355 }, { "entropy": 5.918812370300293, "epoch": 0.45032556185675277, "grad_norm": 0.953125, "learning_rate": 0.0004984870399328814, "loss": 5.7228, "mean_token_accuracy": 0.15240922719240188, "num_tokens": 9886637.0, "step": 5360 }, { "entropy": 5.742618703842163, "epoch": 0.45074564167191766, "grad_norm": 0.93359375, "learning_rate": 0.0004984835709605446, "loss": 5.5883, "mean_token_accuracy": 0.16404919177293778, "num_tokens": 9895601.0, "step": 5365 }, { "entropy": 5.8194098472595215, "epoch": 0.45116572148708256, "grad_norm": 1.0, "learning_rate": 0.0004984800980293116, "loss": 5.738, "mean_token_accuracy": 0.1579892724752426, "num_tokens": 9904775.0, "step": 5370 }, { "entropy": 5.780790996551514, "epoch": 0.4515858013022474, "grad_norm": 0.9609375, "learning_rate": 0.0004984766211392435, "loss": 5.6783, "mean_token_accuracy": 0.15692917853593827, "num_tokens": 9913795.0, "step": 5375 }, { "entropy": 5.802691316604614, "epoch": 0.4520058811174123, "grad_norm": 0.90234375, "learning_rate": 0.0004984731402904024, "loss": 5.5113, "mean_token_accuracy": 0.16487460136413573, "num_tokens": 9922576.0, "step": 5380 }, { "entropy": 5.772703742980957, "epoch": 0.4524259609325772, "grad_norm": 0.93359375, "learning_rate": 0.0004984696554828496, "loss": 5.4922, "mean_token_accuracy": 0.1670244887471199, "num_tokens": 9930971.0, "step": 5385 }, { "entropy": 5.794325065612793, "epoch": 0.4528460407477421, "grad_norm": 1.0859375, "learning_rate": 0.0004984661667166468, "loss": 5.6128, "mean_token_accuracy": 0.16192587018013, "num_tokens": 9939628.0, "step": 5390 }, { "entropy": 5.7850220680236815, "epoch": 0.45326612056290694, "grad_norm": 0.9140625, "learning_rate": 0.0004984626739918561, "loss": 5.5903, "mean_token_accuracy": 0.16074153482913972, "num_tokens": 9948397.0, "step": 5395 }, { "entropy": 5.814194774627685, "epoch": 0.45368620037807184, "grad_norm": 0.87890625, "learning_rate": 0.0004984591773085391, "loss": 5.67, "mean_token_accuracy": 0.15753872096538543, "num_tokens": 9957683.0, "step": 5400 }, { "entropy": 5.814547824859619, "epoch": 0.45410628019323673, "grad_norm": 0.921875, "learning_rate": 0.0004984556766667578, "loss": 5.6587, "mean_token_accuracy": 0.1586209386587143, "num_tokens": 9966756.0, "step": 5405 }, { "entropy": 5.744683790206909, "epoch": 0.4545263600084016, "grad_norm": 0.97265625, "learning_rate": 0.0004984521720665743, "loss": 5.6532, "mean_token_accuracy": 0.16073551923036575, "num_tokens": 9976000.0, "step": 5410 }, { "entropy": 5.857652235031128, "epoch": 0.4549464398235665, "grad_norm": 0.90625, "learning_rate": 0.0004984486635080507, "loss": 5.6506, "mean_token_accuracy": 0.15694389641284942, "num_tokens": 9985509.0, "step": 5415 }, { "entropy": 5.7711996078491214, "epoch": 0.45536651963873137, "grad_norm": 0.94140625, "learning_rate": 0.0004984451509912489, "loss": 5.5899, "mean_token_accuracy": 0.1618253692984581, "num_tokens": 9994342.0, "step": 5420 }, { "entropy": 5.746224308013916, "epoch": 0.4557865994538962, "grad_norm": 0.91796875, "learning_rate": 0.0004984416345162315, "loss": 5.6478, "mean_token_accuracy": 0.15566404908895493, "num_tokens": 10004249.0, "step": 5425 }, { "entropy": 5.76487717628479, "epoch": 0.4562066792690611, "grad_norm": 1.0234375, "learning_rate": 0.0004984381140830605, "loss": 5.6061, "mean_token_accuracy": 0.16023263484239578, "num_tokens": 10012430.0, "step": 5430 }, { "entropy": 5.82148494720459, "epoch": 0.456626759084226, "grad_norm": 0.94140625, "learning_rate": 0.0004984345896917984, "loss": 5.615, "mean_token_accuracy": 0.15671578347682952, "num_tokens": 10021434.0, "step": 5435 }, { "entropy": 5.7957844734191895, "epoch": 0.4570468388993909, "grad_norm": 1.0234375, "learning_rate": 0.0004984310613425076, "loss": 5.6077, "mean_token_accuracy": 0.16273672878742218, "num_tokens": 10030473.0, "step": 5440 }, { "entropy": 5.7984706401824955, "epoch": 0.45746691871455575, "grad_norm": 1.109375, "learning_rate": 0.0004984275290352506, "loss": 5.6027, "mean_token_accuracy": 0.16592728793621064, "num_tokens": 10039057.0, "step": 5445 }, { "entropy": 5.82614917755127, "epoch": 0.45788699852972065, "grad_norm": 0.98046875, "learning_rate": 0.0004984239927700899, "loss": 5.6993, "mean_token_accuracy": 0.15564172416925431, "num_tokens": 10047998.0, "step": 5450 }, { "entropy": 5.890322923660278, "epoch": 0.45830707834488554, "grad_norm": 0.94921875, "learning_rate": 0.0004984204525470883, "loss": 5.6293, "mean_token_accuracy": 0.1547103099524975, "num_tokens": 10057479.0, "step": 5455 }, { "entropy": 5.735934209823609, "epoch": 0.4587271581600504, "grad_norm": 0.89453125, "learning_rate": 0.0004984169083663084, "loss": 5.6068, "mean_token_accuracy": 0.1534338653087616, "num_tokens": 10067754.0, "step": 5460 }, { "entropy": 5.795390987396241, "epoch": 0.4591472379752153, "grad_norm": 0.8828125, "learning_rate": 0.0004984133602278129, "loss": 5.6835, "mean_token_accuracy": 0.157898972928524, "num_tokens": 10076815.0, "step": 5465 }, { "entropy": 5.918915462493897, "epoch": 0.4595673177903802, "grad_norm": 0.95703125, "learning_rate": 0.000498409808131665, "loss": 5.6866, "mean_token_accuracy": 0.15232098400592803, "num_tokens": 10086300.0, "step": 5470 }, { "entropy": 5.7501527786254885, "epoch": 0.4599873976055451, "grad_norm": 0.8828125, "learning_rate": 0.0004984062520779272, "loss": 5.5857, "mean_token_accuracy": 0.16250389367341994, "num_tokens": 10095383.0, "step": 5475 }, { "entropy": 5.6954700469970705, "epoch": 0.4604074774207099, "grad_norm": 0.94921875, "learning_rate": 0.0004984026920666628, "loss": 5.5697, "mean_token_accuracy": 0.15912551581859588, "num_tokens": 10103971.0, "step": 5480 }, { "entropy": 5.814951801300049, "epoch": 0.4608275572358748, "grad_norm": 0.92578125, "learning_rate": 0.0004983991280979347, "loss": 5.5799, "mean_token_accuracy": 0.16145333349704744, "num_tokens": 10113028.0, "step": 5485 }, { "entropy": 5.79097695350647, "epoch": 0.4612476370510397, "grad_norm": 0.9453125, "learning_rate": 0.0004983955601718061, "loss": 5.5408, "mean_token_accuracy": 0.16365961581468583, "num_tokens": 10121890.0, "step": 5490 }, { "entropy": 5.804393863677978, "epoch": 0.46166771686620456, "grad_norm": 0.97265625, "learning_rate": 0.0004983919882883401, "loss": 5.6663, "mean_token_accuracy": 0.1603729695081711, "num_tokens": 10131655.0, "step": 5495 }, { "entropy": 5.873544406890869, "epoch": 0.46208779668136946, "grad_norm": 0.9765625, "learning_rate": 0.0004983884124476, "loss": 5.6699, "mean_token_accuracy": 0.15749045610427856, "num_tokens": 10140778.0, "step": 5500 }, { "entropy": 5.814252138137817, "epoch": 0.46250787649653435, "grad_norm": 0.92578125, "learning_rate": 0.0004983848326496494, "loss": 5.7045, "mean_token_accuracy": 0.15820754915475846, "num_tokens": 10150229.0, "step": 5505 }, { "entropy": 5.815248012542725, "epoch": 0.4629279563116992, "grad_norm": 0.98828125, "learning_rate": 0.0004983812488945513, "loss": 5.6102, "mean_token_accuracy": 0.15927310138940812, "num_tokens": 10158939.0, "step": 5510 }, { "entropy": 5.772242593765259, "epoch": 0.4633480361268641, "grad_norm": 0.984375, "learning_rate": 0.0004983776611823696, "loss": 5.6172, "mean_token_accuracy": 0.15591025203466416, "num_tokens": 10168383.0, "step": 5515 }, { "entropy": 5.762513542175293, "epoch": 0.463768115942029, "grad_norm": 0.93359375, "learning_rate": 0.0004983740695131676, "loss": 5.614, "mean_token_accuracy": 0.16522103548049927, "num_tokens": 10178678.0, "step": 5520 }, { "entropy": 5.78189172744751, "epoch": 0.4641881957571939, "grad_norm": 0.90625, "learning_rate": 0.000498370473887009, "loss": 5.5993, "mean_token_accuracy": 0.1618872195482254, "num_tokens": 10188964.0, "step": 5525 }, { "entropy": 5.797432947158813, "epoch": 0.46460827557235873, "grad_norm": 0.95703125, "learning_rate": 0.0004983668743039573, "loss": 5.626, "mean_token_accuracy": 0.16132238358259202, "num_tokens": 10198333.0, "step": 5530 }, { "entropy": 5.7464605331420895, "epoch": 0.46502835538752363, "grad_norm": 0.99609375, "learning_rate": 0.0004983632707640766, "loss": 5.6385, "mean_token_accuracy": 0.15782831460237504, "num_tokens": 10207876.0, "step": 5535 }, { "entropy": 5.7676252841949465, "epoch": 0.4654484352026885, "grad_norm": 0.95703125, "learning_rate": 0.0004983596632674306, "loss": 5.5836, "mean_token_accuracy": 0.15963911265134811, "num_tokens": 10216822.0, "step": 5540 }, { "entropy": 5.864213514328003, "epoch": 0.46586851501785337, "grad_norm": 0.93359375, "learning_rate": 0.0004983560518140831, "loss": 5.6988, "mean_token_accuracy": 0.15088534951210023, "num_tokens": 10226887.0, "step": 5545 }, { "entropy": 5.807913875579834, "epoch": 0.46628859483301827, "grad_norm": 0.96875, "learning_rate": 0.0004983524364040982, "loss": 5.5379, "mean_token_accuracy": 0.16848834306001664, "num_tokens": 10235935.0, "step": 5550 }, { "entropy": 5.751170539855957, "epoch": 0.46670867464818316, "grad_norm": 0.89453125, "learning_rate": 0.0004983488170375399, "loss": 5.5025, "mean_token_accuracy": 0.16097538769245148, "num_tokens": 10245590.0, "step": 5555 }, { "entropy": 5.752688026428222, "epoch": 0.46712875446334806, "grad_norm": 0.95703125, "learning_rate": 0.0004983451937144723, "loss": 5.5925, "mean_token_accuracy": 0.15908439457416534, "num_tokens": 10255104.0, "step": 5560 }, { "entropy": 5.625225067138672, "epoch": 0.4675488342785129, "grad_norm": 0.9140625, "learning_rate": 0.0004983415664349595, "loss": 5.4479, "mean_token_accuracy": 0.16906733959913253, "num_tokens": 10264236.0, "step": 5565 }, { "entropy": 5.791613006591797, "epoch": 0.4679689140936778, "grad_norm": 0.921875, "learning_rate": 0.0004983379351990659, "loss": 5.5634, "mean_token_accuracy": 0.16491406708955764, "num_tokens": 10273335.0, "step": 5570 }, { "entropy": 5.73756160736084, "epoch": 0.4683889939088427, "grad_norm": 0.83203125, "learning_rate": 0.0004983343000068559, "loss": 5.5392, "mean_token_accuracy": 0.16353048831224443, "num_tokens": 10282206.0, "step": 5575 }, { "entropy": 5.679240655899048, "epoch": 0.46880907372400754, "grad_norm": 1.0, "learning_rate": 0.0004983306608583937, "loss": 5.4798, "mean_token_accuracy": 0.17844018042087556, "num_tokens": 10290056.0, "step": 5580 }, { "entropy": 5.697105741500854, "epoch": 0.46922915353917244, "grad_norm": 0.92578125, "learning_rate": 0.0004983270177537438, "loss": 5.5596, "mean_token_accuracy": 0.16428319364786148, "num_tokens": 10299726.0, "step": 5585 }, { "entropy": 5.741534852981568, "epoch": 0.46964923335433734, "grad_norm": 0.99609375, "learning_rate": 0.0004983233706929708, "loss": 5.6128, "mean_token_accuracy": 0.1574200913310051, "num_tokens": 10308696.0, "step": 5590 }, { "entropy": 5.87669529914856, "epoch": 0.4700693131695022, "grad_norm": 0.92578125, "learning_rate": 0.0004983197196761392, "loss": 5.706, "mean_token_accuracy": 0.1552853010594845, "num_tokens": 10317845.0, "step": 5595 }, { "entropy": 5.774369955062866, "epoch": 0.4704893929846671, "grad_norm": 1.0546875, "learning_rate": 0.0004983160647033139, "loss": 5.5975, "mean_token_accuracy": 0.16107087433338166, "num_tokens": 10326563.0, "step": 5600 }, { "entropy": 5.75340576171875, "epoch": 0.470909472799832, "grad_norm": 0.90234375, "learning_rate": 0.0004983124057745595, "loss": 5.5791, "mean_token_accuracy": 0.15735821723937987, "num_tokens": 10335931.0, "step": 5605 }, { "entropy": 5.707799339294434, "epoch": 0.47132955261499687, "grad_norm": 0.96484375, "learning_rate": 0.0004983087428899408, "loss": 5.5773, "mean_token_accuracy": 0.15221105068922042, "num_tokens": 10344984.0, "step": 5610 }, { "entropy": 5.7647332668304445, "epoch": 0.4717496324301617, "grad_norm": 1.09375, "learning_rate": 0.0004983050760495227, "loss": 5.5966, "mean_token_accuracy": 0.1603370040655136, "num_tokens": 10353522.0, "step": 5615 }, { "entropy": 5.7834312438964846, "epoch": 0.4721697122453266, "grad_norm": 0.96875, "learning_rate": 0.0004983014052533702, "loss": 5.6121, "mean_token_accuracy": 0.15812979638576508, "num_tokens": 10363527.0, "step": 5620 }, { "entropy": 5.723613166809082, "epoch": 0.4725897920604915, "grad_norm": 0.88671875, "learning_rate": 0.0004982977305015481, "loss": 5.5439, "mean_token_accuracy": 0.15958572328090667, "num_tokens": 10372040.0, "step": 5625 }, { "entropy": 5.772522401809693, "epoch": 0.47300987187565635, "grad_norm": 0.9296875, "learning_rate": 0.0004982940517941219, "loss": 5.5227, "mean_token_accuracy": 0.16043394133448602, "num_tokens": 10381279.0, "step": 5630 }, { "entropy": 5.790616703033447, "epoch": 0.47342995169082125, "grad_norm": 1.0078125, "learning_rate": 0.0004982903691311564, "loss": 5.6984, "mean_token_accuracy": 0.15549325048923493, "num_tokens": 10390608.0, "step": 5635 }, { "entropy": 5.768335485458374, "epoch": 0.47385003150598615, "grad_norm": 0.9609375, "learning_rate": 0.0004982866825127172, "loss": 5.4862, "mean_token_accuracy": 0.16711296737194062, "num_tokens": 10399851.0, "step": 5640 }, { "entropy": 5.826428365707398, "epoch": 0.47427011132115104, "grad_norm": 0.953125, "learning_rate": 0.0004982829919388692, "loss": 5.7573, "mean_token_accuracy": 0.15294661596417428, "num_tokens": 10410425.0, "step": 5645 }, { "entropy": 5.705338096618652, "epoch": 0.4746901911363159, "grad_norm": 0.9140625, "learning_rate": 0.0004982792974096781, "loss": 5.5446, "mean_token_accuracy": 0.16691708862781524, "num_tokens": 10418783.0, "step": 5650 }, { "entropy": 5.836835145950317, "epoch": 0.4751102709514808, "grad_norm": 1.1796875, "learning_rate": 0.000498275598925209, "loss": 5.7114, "mean_token_accuracy": 0.15507804453372956, "num_tokens": 10427360.0, "step": 5655 }, { "entropy": 5.856819105148316, "epoch": 0.4755303507666457, "grad_norm": 0.94140625, "learning_rate": 0.0004982718964855277, "loss": 5.6653, "mean_token_accuracy": 0.1575305789709091, "num_tokens": 10436613.0, "step": 5660 }, { "entropy": 5.742249536514282, "epoch": 0.4759504305818105, "grad_norm": 0.98828125, "learning_rate": 0.0004982681900907, "loss": 5.7114, "mean_token_accuracy": 0.15877616107463838, "num_tokens": 10445055.0, "step": 5665 }, { "entropy": 5.744962549209594, "epoch": 0.4763705103969754, "grad_norm": 0.89453125, "learning_rate": 0.000498264479740791, "loss": 5.5379, "mean_token_accuracy": 0.16900296211242677, "num_tokens": 10454516.0, "step": 5670 }, { "entropy": 5.830320215225219, "epoch": 0.4767905902121403, "grad_norm": 0.8984375, "learning_rate": 0.0004982607654358668, "loss": 5.6596, "mean_token_accuracy": 0.15974192917346955, "num_tokens": 10463771.0, "step": 5675 }, { "entropy": 5.769126272201538, "epoch": 0.47721067002730516, "grad_norm": 0.875, "learning_rate": 0.000498257047175993, "loss": 5.5908, "mean_token_accuracy": 0.15908040702342988, "num_tokens": 10473783.0, "step": 5680 }, { "entropy": 5.78115234375, "epoch": 0.47763074984247006, "grad_norm": 0.9609375, "learning_rate": 0.0004982533249612357, "loss": 5.5629, "mean_token_accuracy": 0.16332129687070845, "num_tokens": 10483424.0, "step": 5685 }, { "entropy": 5.69402847290039, "epoch": 0.47805082965763496, "grad_norm": 0.9375, "learning_rate": 0.0004982495987916607, "loss": 5.5045, "mean_token_accuracy": 0.1687542662024498, "num_tokens": 10492536.0, "step": 5690 }, { "entropy": 5.782306718826294, "epoch": 0.47847090947279985, "grad_norm": 1.0625, "learning_rate": 0.0004982458686673339, "loss": 5.6148, "mean_token_accuracy": 0.15962855368852616, "num_tokens": 10501616.0, "step": 5695 }, { "entropy": 5.8774285316467285, "epoch": 0.4788909892879647, "grad_norm": 1.0, "learning_rate": 0.0004982421345883217, "loss": 5.6435, "mean_token_accuracy": 0.1528232589364052, "num_tokens": 10511190.0, "step": 5700 }, { "entropy": 5.737439727783203, "epoch": 0.4793110691031296, "grad_norm": 0.9609375, "learning_rate": 0.0004982383965546898, "loss": 5.5899, "mean_token_accuracy": 0.15596046000719072, "num_tokens": 10520310.0, "step": 5705 }, { "entropy": 5.806997585296631, "epoch": 0.4797311489182945, "grad_norm": 0.9765625, "learning_rate": 0.0004982346545665048, "loss": 5.563, "mean_token_accuracy": 0.16304250210523605, "num_tokens": 10528711.0, "step": 5710 }, { "entropy": 5.757972669601441, "epoch": 0.48015122873345933, "grad_norm": 1.0078125, "learning_rate": 0.0004982309086238328, "loss": 5.6498, "mean_token_accuracy": 0.15384584218263625, "num_tokens": 10538484.0, "step": 5715 }, { "entropy": 5.7749903202056885, "epoch": 0.48057130854862423, "grad_norm": 0.94140625, "learning_rate": 0.0004982271587267403, "loss": 5.5947, "mean_token_accuracy": 0.15901431441307068, "num_tokens": 10547623.0, "step": 5720 }, { "entropy": 5.7751219272613525, "epoch": 0.48099138836378913, "grad_norm": 1.015625, "learning_rate": 0.0004982234048752935, "loss": 5.5458, "mean_token_accuracy": 0.16144074499607086, "num_tokens": 10556234.0, "step": 5725 }, { "entropy": 5.856562280654908, "epoch": 0.481411468178954, "grad_norm": 0.94921875, "learning_rate": 0.000498219647069559, "loss": 5.7641, "mean_token_accuracy": 0.1533028818666935, "num_tokens": 10566308.0, "step": 5730 }, { "entropy": 5.8091706275939945, "epoch": 0.48183154799411887, "grad_norm": 1.015625, "learning_rate": 0.0004982158853096035, "loss": 5.7108, "mean_token_accuracy": 0.15445562452077866, "num_tokens": 10575212.0, "step": 5735 }, { "entropy": 5.755967473983764, "epoch": 0.48225162780928377, "grad_norm": 0.9296875, "learning_rate": 0.0004982121195954935, "loss": 5.4688, "mean_token_accuracy": 0.1693451941013336, "num_tokens": 10584590.0, "step": 5740 }, { "entropy": 5.736726951599121, "epoch": 0.48267170762444866, "grad_norm": 0.9296875, "learning_rate": 0.0004982083499272957, "loss": 5.5512, "mean_token_accuracy": 0.16557496339082717, "num_tokens": 10593997.0, "step": 5745 }, { "entropy": 5.806335926055908, "epoch": 0.4830917874396135, "grad_norm": 0.97265625, "learning_rate": 0.0004982045763050768, "loss": 5.6777, "mean_token_accuracy": 0.157341568171978, "num_tokens": 10603299.0, "step": 5750 }, { "entropy": 5.790657663345337, "epoch": 0.4835118672547784, "grad_norm": 0.953125, "learning_rate": 0.0004982007987289041, "loss": 5.5987, "mean_token_accuracy": 0.15882896780967712, "num_tokens": 10613546.0, "step": 5755 }, { "entropy": 5.743067026138306, "epoch": 0.4839319470699433, "grad_norm": 1.0234375, "learning_rate": 0.0004981970171988439, "loss": 5.5707, "mean_token_accuracy": 0.16890112310647964, "num_tokens": 10622966.0, "step": 5760 }, { "entropy": 5.773163938522339, "epoch": 0.48435202688510814, "grad_norm": 1.0859375, "learning_rate": 0.0004981932317149636, "loss": 5.6484, "mean_token_accuracy": 0.1565729409456253, "num_tokens": 10633441.0, "step": 5765 }, { "entropy": 5.843293190002441, "epoch": 0.48477210670027304, "grad_norm": 0.93359375, "learning_rate": 0.00049818944227733, "loss": 5.6374, "mean_token_accuracy": 0.15993442833423616, "num_tokens": 10643124.0, "step": 5770 }, { "entropy": 5.831496477127075, "epoch": 0.48519218651543794, "grad_norm": 0.95703125, "learning_rate": 0.0004981856488860105, "loss": 5.6117, "mean_token_accuracy": 0.1523417502641678, "num_tokens": 10652517.0, "step": 5775 }, { "entropy": 5.804540205001831, "epoch": 0.48561226633060284, "grad_norm": 0.99609375, "learning_rate": 0.0004981818515410721, "loss": 5.6591, "mean_token_accuracy": 0.1497793585062027, "num_tokens": 10663352.0, "step": 5780 }, { "entropy": 5.732200670242309, "epoch": 0.4860323461457677, "grad_norm": 0.96484375, "learning_rate": 0.0004981780502425821, "loss": 5.6688, "mean_token_accuracy": 0.15934486985206603, "num_tokens": 10672430.0, "step": 5785 }, { "entropy": 5.7780238628387455, "epoch": 0.4864524259609326, "grad_norm": 0.9140625, "learning_rate": 0.0004981742449906079, "loss": 5.6075, "mean_token_accuracy": 0.16593022048473358, "num_tokens": 10681908.0, "step": 5790 }, { "entropy": 5.821439170837403, "epoch": 0.4868725057760975, "grad_norm": 1.0078125, "learning_rate": 0.0004981704357852168, "loss": 5.6032, "mean_token_accuracy": 0.16017231941223145, "num_tokens": 10691259.0, "step": 5795 }, { "entropy": 5.739565515518189, "epoch": 0.4872925855912623, "grad_norm": 0.98046875, "learning_rate": 0.0004981666226264764, "loss": 5.5018, "mean_token_accuracy": 0.16552049070596694, "num_tokens": 10699668.0, "step": 5800 }, { "entropy": 5.741326379776001, "epoch": 0.4877126654064272, "grad_norm": 0.84765625, "learning_rate": 0.0004981628055144542, "loss": 5.5384, "mean_token_accuracy": 0.16326582431793213, "num_tokens": 10709146.0, "step": 5805 }, { "entropy": 5.826295614242554, "epoch": 0.4881327452215921, "grad_norm": 0.90234375, "learning_rate": 0.0004981589844492177, "loss": 5.6268, "mean_token_accuracy": 0.1511153683066368, "num_tokens": 10718724.0, "step": 5810 }, { "entropy": 5.774454784393311, "epoch": 0.488552825036757, "grad_norm": 0.91015625, "learning_rate": 0.0004981551594308349, "loss": 5.6002, "mean_token_accuracy": 0.16163085922598838, "num_tokens": 10728101.0, "step": 5815 }, { "entropy": 5.8604474544525145, "epoch": 0.48897290485192185, "grad_norm": 1.015625, "learning_rate": 0.0004981513304593733, "loss": 5.5894, "mean_token_accuracy": 0.16614548563957215, "num_tokens": 10736750.0, "step": 5820 }, { "entropy": 5.813880395889282, "epoch": 0.48939298466708675, "grad_norm": 0.89453125, "learning_rate": 0.0004981474975349006, "loss": 5.7934, "mean_token_accuracy": 0.15620144009590148, "num_tokens": 10746914.0, "step": 5825 }, { "entropy": 5.775779962539673, "epoch": 0.48981306448225165, "grad_norm": 1.0078125, "learning_rate": 0.000498143660657485, "loss": 5.6266, "mean_token_accuracy": 0.160403074324131, "num_tokens": 10755786.0, "step": 5830 }, { "entropy": 5.672336006164551, "epoch": 0.4902331442974165, "grad_norm": 0.90625, "learning_rate": 0.0004981398198271944, "loss": 5.512, "mean_token_accuracy": 0.16457450538873672, "num_tokens": 10764821.0, "step": 5835 }, { "entropy": 5.762319898605346, "epoch": 0.4906532241125814, "grad_norm": 0.97265625, "learning_rate": 0.0004981359750440968, "loss": 5.5981, "mean_token_accuracy": 0.15791754126548768, "num_tokens": 10773569.0, "step": 5840 }, { "entropy": 5.703838157653808, "epoch": 0.4910733039277463, "grad_norm": 0.98046875, "learning_rate": 0.0004981321263082603, "loss": 5.5547, "mean_token_accuracy": 0.15730964243412018, "num_tokens": 10782298.0, "step": 5845 }, { "entropy": 5.705076360702515, "epoch": 0.4914933837429111, "grad_norm": 0.921875, "learning_rate": 0.000498128273619753, "loss": 5.5491, "mean_token_accuracy": 0.1628515049815178, "num_tokens": 10792087.0, "step": 5850 }, { "entropy": 5.771277141571045, "epoch": 0.491913463558076, "grad_norm": 0.9140625, "learning_rate": 0.0004981244169786433, "loss": 5.6458, "mean_token_accuracy": 0.15582741051912308, "num_tokens": 10801641.0, "step": 5855 }, { "entropy": 5.861782169342041, "epoch": 0.4923335433732409, "grad_norm": 0.94140625, "learning_rate": 0.0004981205563849994, "loss": 5.7007, "mean_token_accuracy": 0.15648430287837983, "num_tokens": 10811612.0, "step": 5860 }, { "entropy": 5.788508701324463, "epoch": 0.4927536231884058, "grad_norm": 0.95703125, "learning_rate": 0.0004981166918388897, "loss": 5.5149, "mean_token_accuracy": 0.16366831362247466, "num_tokens": 10821608.0, "step": 5865 }, { "entropy": 5.720433568954467, "epoch": 0.49317370300357066, "grad_norm": 0.98828125, "learning_rate": 0.0004981128233403828, "loss": 5.4915, "mean_token_accuracy": 0.16485851109027863, "num_tokens": 10830679.0, "step": 5870 }, { "entropy": 5.718778944015503, "epoch": 0.49359378281873556, "grad_norm": 0.890625, "learning_rate": 0.000498108950889547, "loss": 5.5507, "mean_token_accuracy": 0.16066077202558518, "num_tokens": 10839669.0, "step": 5875 }, { "entropy": 5.787919759750366, "epoch": 0.49401386263390046, "grad_norm": 0.92578125, "learning_rate": 0.0004981050744864512, "loss": 5.5387, "mean_token_accuracy": 0.16012917906045915, "num_tokens": 10849666.0, "step": 5880 }, { "entropy": 5.731645965576172, "epoch": 0.4944339424490653, "grad_norm": 0.9765625, "learning_rate": 0.0004981011941311638, "loss": 5.455, "mean_token_accuracy": 0.1706133618950844, "num_tokens": 10858225.0, "step": 5885 }, { "entropy": 5.7152073860168455, "epoch": 0.4948540222642302, "grad_norm": 0.9609375, "learning_rate": 0.0004980973098237535, "loss": 5.5608, "mean_token_accuracy": 0.1573803097009659, "num_tokens": 10867466.0, "step": 5890 }, { "entropy": 5.793262910842896, "epoch": 0.4952741020793951, "grad_norm": 0.95703125, "learning_rate": 0.0004980934215642894, "loss": 5.5967, "mean_token_accuracy": 0.1668254867196083, "num_tokens": 10875850.0, "step": 5895 }, { "entropy": 5.676056289672852, "epoch": 0.49569418189456, "grad_norm": 0.98828125, "learning_rate": 0.00049808952935284, "loss": 5.5231, "mean_token_accuracy": 0.16948444843292237, "num_tokens": 10885154.0, "step": 5900 }, { "entropy": 5.739302301406861, "epoch": 0.49611426170972484, "grad_norm": 0.984375, "learning_rate": 0.0004980856331894747, "loss": 5.6296, "mean_token_accuracy": 0.16090053021907808, "num_tokens": 10894080.0, "step": 5905 }, { "entropy": 5.7569280624389645, "epoch": 0.49653434152488973, "grad_norm": 0.9453125, "learning_rate": 0.0004980817330742621, "loss": 5.6161, "mean_token_accuracy": 0.15483176559209824, "num_tokens": 10903248.0, "step": 5910 }, { "entropy": 5.768988418579101, "epoch": 0.49695442134005463, "grad_norm": 0.91015625, "learning_rate": 0.0004980778290072716, "loss": 5.5804, "mean_token_accuracy": 0.16294265836477279, "num_tokens": 10912939.0, "step": 5915 }, { "entropy": 5.777530717849731, "epoch": 0.4973745011552195, "grad_norm": 0.9765625, "learning_rate": 0.0004980739209885722, "loss": 5.6127, "mean_token_accuracy": 0.16438234001398086, "num_tokens": 10921505.0, "step": 5920 }, { "entropy": 5.802098226547241, "epoch": 0.49779458097038437, "grad_norm": 0.9375, "learning_rate": 0.0004980700090182331, "loss": 5.6819, "mean_token_accuracy": 0.16335346847772597, "num_tokens": 10931861.0, "step": 5925 }, { "entropy": 5.83542947769165, "epoch": 0.49821466078554927, "grad_norm": 0.921875, "learning_rate": 0.0004980660930963238, "loss": 5.5848, "mean_token_accuracy": 0.16074420511722565, "num_tokens": 10940810.0, "step": 5930 }, { "entropy": 5.723906135559082, "epoch": 0.4986347406007141, "grad_norm": 0.94140625, "learning_rate": 0.0004980621732229133, "loss": 5.4722, "mean_token_accuracy": 0.16402249783277512, "num_tokens": 10949514.0, "step": 5935 }, { "entropy": 5.749081373214722, "epoch": 0.499054820415879, "grad_norm": 0.96875, "learning_rate": 0.0004980582493980714, "loss": 5.6742, "mean_token_accuracy": 0.1556909427046776, "num_tokens": 10959161.0, "step": 5940 }, { "entropy": 5.750719594955444, "epoch": 0.4994749002310439, "grad_norm": 0.890625, "learning_rate": 0.0004980543216218674, "loss": 5.5569, "mean_token_accuracy": 0.17051900774240494, "num_tokens": 10968983.0, "step": 5945 }, { "entropy": 5.795907783508301, "epoch": 0.4998949800462088, "grad_norm": 0.9921875, "learning_rate": 0.0004980503898943711, "loss": 5.6755, "mean_token_accuracy": 0.16463214308023452, "num_tokens": 10978044.0, "step": 5950 }, { "entropy": 5.818535089492798, "epoch": 0.5003150598613737, "grad_norm": 0.90625, "learning_rate": 0.0004980464542156519, "loss": 5.5895, "mean_token_accuracy": 0.16786763817071915, "num_tokens": 10986980.0, "step": 5955 }, { "entropy": 5.744042301177979, "epoch": 0.5007351396765385, "grad_norm": 0.953125, "learning_rate": 0.0004980425145857796, "loss": 5.5404, "mean_token_accuracy": 0.17190210670232772, "num_tokens": 10995163.0, "step": 5960 }, { "entropy": 5.6839663028717045, "epoch": 0.5011552194917034, "grad_norm": 0.92578125, "learning_rate": 0.000498038571004824, "loss": 5.4658, "mean_token_accuracy": 0.1701178327202797, "num_tokens": 11003722.0, "step": 5965 }, { "entropy": 5.658802843093872, "epoch": 0.5015752993068683, "grad_norm": 0.93359375, "learning_rate": 0.0004980346234728549, "loss": 5.5459, "mean_token_accuracy": 0.1696319282054901, "num_tokens": 11013176.0, "step": 5970 }, { "entropy": 5.7755608558654785, "epoch": 0.5019953791220332, "grad_norm": 0.94140625, "learning_rate": 0.0004980306719899424, "loss": 5.601, "mean_token_accuracy": 0.16234323978424073, "num_tokens": 11022636.0, "step": 5975 }, { "entropy": 5.711779022216797, "epoch": 0.5024154589371981, "grad_norm": 0.96875, "learning_rate": 0.0004980267165561564, "loss": 5.5409, "mean_token_accuracy": 0.16729752868413925, "num_tokens": 11031896.0, "step": 5980 }, { "entropy": 5.725300073623657, "epoch": 0.502835538752363, "grad_norm": 0.91796875, "learning_rate": 0.0004980227571715669, "loss": 5.579, "mean_token_accuracy": 0.15976378172636033, "num_tokens": 11040802.0, "step": 5985 }, { "entropy": 5.731253290176392, "epoch": 0.5032556185675279, "grad_norm": 0.96484375, "learning_rate": 0.0004980187938362441, "loss": 5.5153, "mean_token_accuracy": 0.1588967353105545, "num_tokens": 11049701.0, "step": 5990 }, { "entropy": 5.786366033554077, "epoch": 0.5036756983826927, "grad_norm": 0.9296875, "learning_rate": 0.0004980148265502581, "loss": 5.694, "mean_token_accuracy": 0.15498168617486954, "num_tokens": 11059555.0, "step": 5995 }, { "entropy": 5.793335866928101, "epoch": 0.5040957781978576, "grad_norm": 0.9921875, "learning_rate": 0.0004980108553136795, "loss": 5.6141, "mean_token_accuracy": 0.16208730340003968, "num_tokens": 11068940.0, "step": 6000 }, { "epoch": 0.5040957781978576, "eval_entropy": 5.5702669805797465, "eval_loss": 5.591900825500488, "eval_mean_token_accuracy": 0.1687953193199262, "eval_num_tokens": 11068940.0, "eval_runtime": 21.0876, "eval_samples_per_second": 1771.942, "eval_steps_per_second": 221.505, "step": 6000 }, { "entropy": 5.811098432540893, "epoch": 0.5045158580130225, "grad_norm": 0.85546875, "learning_rate": 0.0004980068801265783, "loss": 5.5883, "mean_token_accuracy": 0.16337504461407662, "num_tokens": 11079014.0, "step": 6005 }, { "entropy": 5.81418080329895, "epoch": 0.5049359378281874, "grad_norm": 0.953125, "learning_rate": 0.0004980029009890251, "loss": 5.6866, "mean_token_accuracy": 0.15968995168805122, "num_tokens": 11089526.0, "step": 6010 }, { "entropy": 5.788970947265625, "epoch": 0.5053560176433523, "grad_norm": 0.9609375, "learning_rate": 0.0004979989179010904, "loss": 5.5593, "mean_token_accuracy": 0.16788360476493835, "num_tokens": 11099156.0, "step": 6015 }, { "entropy": 5.692927360534668, "epoch": 0.5057760974585171, "grad_norm": 1.03125, "learning_rate": 0.0004979949308628445, "loss": 5.5502, "mean_token_accuracy": 0.1613062158226967, "num_tokens": 11108242.0, "step": 6020 }, { "entropy": 5.660373067855835, "epoch": 0.506196177273682, "grad_norm": 0.9453125, "learning_rate": 0.0004979909398743584, "loss": 5.5452, "mean_token_accuracy": 0.16594227254390717, "num_tokens": 11118076.0, "step": 6025 }, { "entropy": 5.79341983795166, "epoch": 0.5066162570888468, "grad_norm": 0.87109375, "learning_rate": 0.0004979869449357026, "loss": 5.6164, "mean_token_accuracy": 0.16827214658260345, "num_tokens": 11127265.0, "step": 6030 }, { "entropy": 5.790122604370117, "epoch": 0.5070363369040117, "grad_norm": 0.90625, "learning_rate": 0.0004979829460469478, "loss": 5.537, "mean_token_accuracy": 0.16255101412534714, "num_tokens": 11136429.0, "step": 6035 }, { "entropy": 5.731788492202758, "epoch": 0.5074564167191766, "grad_norm": 0.921875, "learning_rate": 0.0004979789432081649, "loss": 5.5406, "mean_token_accuracy": 0.16618361473083496, "num_tokens": 11146201.0, "step": 6040 }, { "entropy": 5.780402135848999, "epoch": 0.5078764965343415, "grad_norm": 1.015625, "learning_rate": 0.000497974936419425, "loss": 5.5673, "mean_token_accuracy": 0.16517029255628585, "num_tokens": 11154867.0, "step": 6045 }, { "entropy": 5.6448524475097654, "epoch": 0.5082965763495064, "grad_norm": 0.96875, "learning_rate": 0.0004979709256807989, "loss": 5.6053, "mean_token_accuracy": 0.16172728240489959, "num_tokens": 11164092.0, "step": 6050 }, { "entropy": 5.781954717636109, "epoch": 0.5087166561646713, "grad_norm": 0.921875, "learning_rate": 0.0004979669109923575, "loss": 5.6388, "mean_token_accuracy": 0.15602062940597533, "num_tokens": 11173176.0, "step": 6055 }, { "entropy": 5.811630630493164, "epoch": 0.5091367359798362, "grad_norm": 0.9375, "learning_rate": 0.0004979628923541721, "loss": 5.5983, "mean_token_accuracy": 0.16064341068267823, "num_tokens": 11182397.0, "step": 6060 }, { "entropy": 5.808675527572632, "epoch": 0.509556815795001, "grad_norm": 0.92578125, "learning_rate": 0.000497958869766314, "loss": 5.6302, "mean_token_accuracy": 0.16145683825016022, "num_tokens": 11191790.0, "step": 6065 }, { "entropy": 5.783952713012695, "epoch": 0.5099768956101659, "grad_norm": 0.9140625, "learning_rate": 0.0004979548432288543, "loss": 5.541, "mean_token_accuracy": 0.1686972364783287, "num_tokens": 11201104.0, "step": 6070 }, { "entropy": 5.752194738388061, "epoch": 0.5103969754253308, "grad_norm": 1.0, "learning_rate": 0.0004979508127418643, "loss": 5.5324, "mean_token_accuracy": 0.16261855214834214, "num_tokens": 11209578.0, "step": 6075 }, { "entropy": 5.768413734436035, "epoch": 0.5108170552404957, "grad_norm": 1.0546875, "learning_rate": 0.0004979467783054155, "loss": 5.5069, "mean_token_accuracy": 0.1716530740261078, "num_tokens": 11218380.0, "step": 6080 }, { "entropy": 5.7077422618865965, "epoch": 0.5112371350556606, "grad_norm": 0.953125, "learning_rate": 0.0004979427399195793, "loss": 5.5338, "mean_token_accuracy": 0.16079539507627488, "num_tokens": 11227810.0, "step": 6085 }, { "entropy": 5.74758620262146, "epoch": 0.5116572148708255, "grad_norm": 1.1015625, "learning_rate": 0.0004979386975844274, "loss": 5.5518, "mean_token_accuracy": 0.1612395703792572, "num_tokens": 11236631.0, "step": 6090 }, { "entropy": 5.72519702911377, "epoch": 0.5120772946859904, "grad_norm": 0.8125, "learning_rate": 0.0004979346513000311, "loss": 5.5893, "mean_token_accuracy": 0.15641138106584548, "num_tokens": 11247418.0, "step": 6095 }, { "entropy": 5.719963645935058, "epoch": 0.5124973745011552, "grad_norm": 0.9296875, "learning_rate": 0.0004979306010664623, "loss": 5.5085, "mean_token_accuracy": 0.1705961272120476, "num_tokens": 11256246.0, "step": 6100 }, { "entropy": 5.627693128585816, "epoch": 0.5129174543163201, "grad_norm": 0.87890625, "learning_rate": 0.0004979265468837927, "loss": 5.4941, "mean_token_accuracy": 0.16766001135110856, "num_tokens": 11265980.0, "step": 6105 }, { "entropy": 5.759566164016723, "epoch": 0.513337534131485, "grad_norm": 0.9140625, "learning_rate": 0.000497922488752094, "loss": 5.529, "mean_token_accuracy": 0.1628105789422989, "num_tokens": 11276158.0, "step": 6110 }, { "entropy": 5.7324329853057865, "epoch": 0.5137576139466499, "grad_norm": 0.94140625, "learning_rate": 0.0004979184266714383, "loss": 5.4482, "mean_token_accuracy": 0.16801214665174485, "num_tokens": 11284957.0, "step": 6115 }, { "entropy": 5.649721574783325, "epoch": 0.5141776937618148, "grad_norm": 0.98046875, "learning_rate": 0.0004979143606418974, "loss": 5.482, "mean_token_accuracy": 0.16361449509859086, "num_tokens": 11294340.0, "step": 6120 }, { "entropy": 5.820867586135864, "epoch": 0.5145977735769797, "grad_norm": 0.9140625, "learning_rate": 0.0004979102906635435, "loss": 5.7268, "mean_token_accuracy": 0.15687822848558425, "num_tokens": 11303344.0, "step": 6125 }, { "entropy": 5.772322273254394, "epoch": 0.5150178533921445, "grad_norm": 1.09375, "learning_rate": 0.0004979062167364486, "loss": 5.5951, "mean_token_accuracy": 0.16613196283578874, "num_tokens": 11311338.0, "step": 6130 }, { "entropy": 5.675012588500977, "epoch": 0.5154379332073094, "grad_norm": 0.9765625, "learning_rate": 0.0004979021388606847, "loss": 5.4169, "mean_token_accuracy": 0.17838600128889084, "num_tokens": 11320194.0, "step": 6135 }, { "entropy": 5.737465143203735, "epoch": 0.5158580130224742, "grad_norm": 0.8671875, "learning_rate": 0.0004978980570363243, "loss": 5.6104, "mean_token_accuracy": 0.1656625747680664, "num_tokens": 11329952.0, "step": 6140 }, { "entropy": 5.681354331970215, "epoch": 0.5162780928376391, "grad_norm": 0.98046875, "learning_rate": 0.0004978939712634396, "loss": 5.5507, "mean_token_accuracy": 0.16612329334020615, "num_tokens": 11339384.0, "step": 6145 }, { "entropy": 5.825163555145264, "epoch": 0.516698172652804, "grad_norm": 0.9375, "learning_rate": 0.0004978898815421029, "loss": 5.7224, "mean_token_accuracy": 0.1597566932439804, "num_tokens": 11348409.0, "step": 6150 }, { "entropy": 5.876521301269531, "epoch": 0.5171182524679689, "grad_norm": 1.03125, "learning_rate": 0.0004978857878723867, "loss": 5.6278, "mean_token_accuracy": 0.16059536784887313, "num_tokens": 11357478.0, "step": 6155 }, { "entropy": 5.806015205383301, "epoch": 0.5175383322831338, "grad_norm": 1.0, "learning_rate": 0.0004978816902543636, "loss": 5.6454, "mean_token_accuracy": 0.15964788049459458, "num_tokens": 11366379.0, "step": 6160 }, { "entropy": 5.7852825164794925, "epoch": 0.5179584120982986, "grad_norm": 0.90625, "learning_rate": 0.0004978775886881062, "loss": 5.6466, "mean_token_accuracy": 0.15952356532216072, "num_tokens": 11376357.0, "step": 6165 }, { "entropy": 5.7297625064849855, "epoch": 0.5183784919134635, "grad_norm": 1.015625, "learning_rate": 0.000497873483173687, "loss": 5.5309, "mean_token_accuracy": 0.17101848274469375, "num_tokens": 11384995.0, "step": 6170 }, { "entropy": 5.71215410232544, "epoch": 0.5187985717286284, "grad_norm": 0.90234375, "learning_rate": 0.0004978693737111787, "loss": 5.5337, "mean_token_accuracy": 0.1644275352358818, "num_tokens": 11395363.0, "step": 6175 }, { "entropy": 5.756123781204224, "epoch": 0.5192186515437933, "grad_norm": 0.9296875, "learning_rate": 0.0004978652603006543, "loss": 5.5116, "mean_token_accuracy": 0.1630913570523262, "num_tokens": 11404511.0, "step": 6180 }, { "entropy": 5.801231575012207, "epoch": 0.5196387313589582, "grad_norm": 0.9375, "learning_rate": 0.0004978611429421866, "loss": 5.5624, "mean_token_accuracy": 0.16539832353591918, "num_tokens": 11413400.0, "step": 6185 }, { "entropy": 5.74934229850769, "epoch": 0.5200588111741231, "grad_norm": 0.95703125, "learning_rate": 0.0004978570216358485, "loss": 5.6156, "mean_token_accuracy": 0.15378101766109467, "num_tokens": 11423693.0, "step": 6190 }, { "entropy": 5.790632915496826, "epoch": 0.520478890989288, "grad_norm": 0.9296875, "learning_rate": 0.000497852896381713, "loss": 5.5801, "mean_token_accuracy": 0.15778316110372542, "num_tokens": 11433195.0, "step": 6195 }, { "entropy": 5.809474945068359, "epoch": 0.5208989708044528, "grad_norm": 0.99609375, "learning_rate": 0.0004978487671798531, "loss": 5.702, "mean_token_accuracy": 0.15822496265172958, "num_tokens": 11443416.0, "step": 6200 }, { "entropy": 5.802986145019531, "epoch": 0.5213190506196177, "grad_norm": 0.984375, "learning_rate": 0.0004978446340303422, "loss": 5.5712, "mean_token_accuracy": 0.16286559998989106, "num_tokens": 11452487.0, "step": 6205 }, { "entropy": 5.800027227401733, "epoch": 0.5217391304347826, "grad_norm": 0.98046875, "learning_rate": 0.0004978404969332533, "loss": 5.5917, "mean_token_accuracy": 0.16486820578575134, "num_tokens": 11461893.0, "step": 6210 }, { "entropy": 5.672508907318115, "epoch": 0.5221592102499475, "grad_norm": 0.91015625, "learning_rate": 0.0004978363558886597, "loss": 5.5188, "mean_token_accuracy": 0.1554739385843277, "num_tokens": 11471238.0, "step": 6215 }, { "entropy": 5.718248462677002, "epoch": 0.5225792900651124, "grad_norm": 0.9609375, "learning_rate": 0.0004978322108966348, "loss": 5.6277, "mean_token_accuracy": 0.15638385266065596, "num_tokens": 11480571.0, "step": 6220 }, { "entropy": 5.763249778747559, "epoch": 0.5229993698802773, "grad_norm": 0.90234375, "learning_rate": 0.0004978280619572521, "loss": 5.6024, "mean_token_accuracy": 0.16089674681425095, "num_tokens": 11489552.0, "step": 6225 }, { "entropy": 5.776705312728882, "epoch": 0.5234194496954422, "grad_norm": 0.95703125, "learning_rate": 0.000497823909070585, "loss": 5.6565, "mean_token_accuracy": 0.15730864256620408, "num_tokens": 11498715.0, "step": 6230 }, { "entropy": 5.766147661209106, "epoch": 0.523839529510607, "grad_norm": 0.96875, "learning_rate": 0.0004978197522367071, "loss": 5.5864, "mean_token_accuracy": 0.15774240344762802, "num_tokens": 11508472.0, "step": 6235 }, { "entropy": 5.825910902023315, "epoch": 0.5242596093257719, "grad_norm": 0.90234375, "learning_rate": 0.0004978155914556919, "loss": 5.5261, "mean_token_accuracy": 0.17228852659463884, "num_tokens": 11517620.0, "step": 6240 }, { "entropy": 5.7359106063842775, "epoch": 0.5246796891409368, "grad_norm": 0.93359375, "learning_rate": 0.0004978114267276134, "loss": 5.5822, "mean_token_accuracy": 0.1607842430472374, "num_tokens": 11526106.0, "step": 6245 }, { "entropy": 5.735781860351563, "epoch": 0.5250997689561017, "grad_norm": 0.97265625, "learning_rate": 0.0004978072580525451, "loss": 5.6084, "mean_token_accuracy": 0.16850581914186477, "num_tokens": 11535840.0, "step": 6250 }, { "entropy": 5.773589372634888, "epoch": 0.5255198487712666, "grad_norm": 0.953125, "learning_rate": 0.000497803085430561, "loss": 5.5746, "mean_token_accuracy": 0.16597797349095345, "num_tokens": 11545110.0, "step": 6255 }, { "entropy": 5.841729068756104, "epoch": 0.5259399285864315, "grad_norm": 1.0, "learning_rate": 0.0004977989088617349, "loss": 5.6189, "mean_token_accuracy": 0.158939990401268, "num_tokens": 11554382.0, "step": 6260 }, { "entropy": 5.731269979476929, "epoch": 0.5263600084015964, "grad_norm": 0.9609375, "learning_rate": 0.000497794728346141, "loss": 5.4784, "mean_token_accuracy": 0.1658696085214615, "num_tokens": 11562821.0, "step": 6265 }, { "entropy": 5.805121564865113, "epoch": 0.5267800882167611, "grad_norm": 0.99609375, "learning_rate": 0.0004977905438838531, "loss": 5.6848, "mean_token_accuracy": 0.15056246519088745, "num_tokens": 11571705.0, "step": 6270 }, { "entropy": 5.646053838729858, "epoch": 0.527200168031926, "grad_norm": 0.9375, "learning_rate": 0.0004977863554749453, "loss": 5.5048, "mean_token_accuracy": 0.1614176630973816, "num_tokens": 11580692.0, "step": 6275 }, { "entropy": 5.706112480163574, "epoch": 0.5276202478470909, "grad_norm": 0.8828125, "learning_rate": 0.0004977821631194922, "loss": 5.5261, "mean_token_accuracy": 0.15832365602254866, "num_tokens": 11589966.0, "step": 6280 }, { "entropy": 5.80370602607727, "epoch": 0.5280403276622558, "grad_norm": 0.85546875, "learning_rate": 0.0004977779668175677, "loss": 5.6014, "mean_token_accuracy": 0.15689835250377654, "num_tokens": 11599627.0, "step": 6285 }, { "entropy": 5.776365804672241, "epoch": 0.5284604074774207, "grad_norm": 0.93359375, "learning_rate": 0.0004977737665692461, "loss": 5.577, "mean_token_accuracy": 0.16786182373762132, "num_tokens": 11608431.0, "step": 6290 }, { "entropy": 5.7039391040802006, "epoch": 0.5288804872925856, "grad_norm": 0.92578125, "learning_rate": 0.0004977695623746021, "loss": 5.4668, "mean_token_accuracy": 0.1596635565161705, "num_tokens": 11617552.0, "step": 6295 }, { "entropy": 5.688570165634156, "epoch": 0.5293005671077504, "grad_norm": 1.015625, "learning_rate": 0.0004977653542337099, "loss": 5.505, "mean_token_accuracy": 0.168349389731884, "num_tokens": 11626828.0, "step": 6300 }, { "entropy": 5.760095262527466, "epoch": 0.5297206469229153, "grad_norm": 0.96484375, "learning_rate": 0.0004977611421466443, "loss": 5.5798, "mean_token_accuracy": 0.16194986999034883, "num_tokens": 11635867.0, "step": 6305 }, { "entropy": 5.784766721725464, "epoch": 0.5301407267380802, "grad_norm": 0.953125, "learning_rate": 0.0004977569261134797, "loss": 5.4934, "mean_token_accuracy": 0.1665690392255783, "num_tokens": 11644711.0, "step": 6310 }, { "entropy": 5.731612682342529, "epoch": 0.5305608065532451, "grad_norm": 1.0078125, "learning_rate": 0.0004977527061342908, "loss": 5.5935, "mean_token_accuracy": 0.16445396840572357, "num_tokens": 11653320.0, "step": 6315 }, { "entropy": 5.741483688354492, "epoch": 0.53098088636841, "grad_norm": 0.890625, "learning_rate": 0.0004977484822091524, "loss": 5.5402, "mean_token_accuracy": 0.16396106481552125, "num_tokens": 11662753.0, "step": 6320 }, { "entropy": 5.774769592285156, "epoch": 0.5314009661835749, "grad_norm": 1.140625, "learning_rate": 0.0004977442543381394, "loss": 5.5781, "mean_token_accuracy": 0.1614773690700531, "num_tokens": 11671622.0, "step": 6325 }, { "entropy": 5.791268253326416, "epoch": 0.5318210459987398, "grad_norm": 1.0078125, "learning_rate": 0.0004977400225213266, "loss": 5.5734, "mean_token_accuracy": 0.158200266957283, "num_tokens": 11679964.0, "step": 6330 }, { "entropy": 5.711006307601929, "epoch": 0.5322411258139046, "grad_norm": 0.9375, "learning_rate": 0.000497735786758789, "loss": 5.5241, "mean_token_accuracy": 0.16033022105693817, "num_tokens": 11688700.0, "step": 6335 }, { "entropy": 5.755384397506714, "epoch": 0.5326612056290695, "grad_norm": 0.96484375, "learning_rate": 0.0004977315470506016, "loss": 5.642, "mean_token_accuracy": 0.1639561802148819, "num_tokens": 11698425.0, "step": 6340 }, { "entropy": 5.840785360336303, "epoch": 0.5330812854442344, "grad_norm": 0.984375, "learning_rate": 0.0004977273033968397, "loss": 5.6163, "mean_token_accuracy": 0.15594931393861772, "num_tokens": 11707705.0, "step": 6345 }, { "entropy": 5.712259912490845, "epoch": 0.5335013652593993, "grad_norm": 0.97265625, "learning_rate": 0.0004977230557975782, "loss": 5.5145, "mean_token_accuracy": 0.16725486963987352, "num_tokens": 11717079.0, "step": 6350 }, { "entropy": 5.719585847854614, "epoch": 0.5339214450745642, "grad_norm": 0.9921875, "learning_rate": 0.0004977188042528923, "loss": 5.5149, "mean_token_accuracy": 0.16303325742483138, "num_tokens": 11725504.0, "step": 6355 }, { "entropy": 5.7473976612091064, "epoch": 0.5343415248897291, "grad_norm": 0.94921875, "learning_rate": 0.0004977145487628576, "loss": 5.5969, "mean_token_accuracy": 0.1630012646317482, "num_tokens": 11735282.0, "step": 6360 }, { "entropy": 5.7814888000488285, "epoch": 0.534761604704894, "grad_norm": 0.921875, "learning_rate": 0.0004977102893275494, "loss": 5.5763, "mean_token_accuracy": 0.15976961851119995, "num_tokens": 11744827.0, "step": 6365 }, { "entropy": 5.773612451553345, "epoch": 0.5351816845200588, "grad_norm": 1.03125, "learning_rate": 0.000497706025947043, "loss": 5.5367, "mean_token_accuracy": 0.16401349604129792, "num_tokens": 11753066.0, "step": 6370 }, { "entropy": 5.736017036437988, "epoch": 0.5356017643352237, "grad_norm": 1.0703125, "learning_rate": 0.0004977017586214142, "loss": 5.5737, "mean_token_accuracy": 0.16376062780618666, "num_tokens": 11761190.0, "step": 6375 }, { "entropy": 5.764604949951172, "epoch": 0.5360218441503886, "grad_norm": 0.94140625, "learning_rate": 0.0004976974873507382, "loss": 5.5103, "mean_token_accuracy": 0.1692323476076126, "num_tokens": 11770321.0, "step": 6380 }, { "entropy": 5.737596845626831, "epoch": 0.5364419239655535, "grad_norm": 0.9140625, "learning_rate": 0.000497693212135091, "loss": 5.5927, "mean_token_accuracy": 0.16351059675216675, "num_tokens": 11778388.0, "step": 6385 }, { "entropy": 5.7780561447143555, "epoch": 0.5368620037807184, "grad_norm": 1.015625, "learning_rate": 0.0004976889329745482, "loss": 5.4529, "mean_token_accuracy": 0.17066252157092093, "num_tokens": 11786250.0, "step": 6390 }, { "entropy": 5.686847257614136, "epoch": 0.5372820835958833, "grad_norm": 0.97265625, "learning_rate": 0.0004976846498691857, "loss": 5.4307, "mean_token_accuracy": 0.1696195513010025, "num_tokens": 11794831.0, "step": 6395 }, { "entropy": 5.72705192565918, "epoch": 0.5377021634110482, "grad_norm": 1.0703125, "learning_rate": 0.0004976803628190792, "loss": 5.4736, "mean_token_accuracy": 0.17794454842805862, "num_tokens": 11803550.0, "step": 6400 }, { "entropy": 5.74969711303711, "epoch": 0.5381222432262129, "grad_norm": 0.9375, "learning_rate": 0.0004976760718243047, "loss": 5.5546, "mean_token_accuracy": 0.16087348014116287, "num_tokens": 11812478.0, "step": 6405 }, { "entropy": 5.741421031951904, "epoch": 0.5385423230413778, "grad_norm": 0.9921875, "learning_rate": 0.0004976717768849383, "loss": 5.516, "mean_token_accuracy": 0.15771948993206025, "num_tokens": 11822463.0, "step": 6410 }, { "entropy": 5.702504682540893, "epoch": 0.5389624028565427, "grad_norm": 0.984375, "learning_rate": 0.0004976674780010561, "loss": 5.5713, "mean_token_accuracy": 0.15501011312007903, "num_tokens": 11831853.0, "step": 6415 }, { "entropy": 5.749214172363281, "epoch": 0.5393824826717076, "grad_norm": 0.94140625, "learning_rate": 0.000497663175172734, "loss": 5.5864, "mean_token_accuracy": 0.15965323597192765, "num_tokens": 11841574.0, "step": 6420 }, { "entropy": 5.805502128601074, "epoch": 0.5398025624868725, "grad_norm": 0.890625, "learning_rate": 0.0004976588684000486, "loss": 5.6666, "mean_token_accuracy": 0.1459944285452366, "num_tokens": 11852489.0, "step": 6425 }, { "entropy": 5.7659282207489015, "epoch": 0.5402226423020374, "grad_norm": 0.921875, "learning_rate": 0.0004976545576830759, "loss": 5.5435, "mean_token_accuracy": 0.15960338413715364, "num_tokens": 11861499.0, "step": 6430 }, { "entropy": 5.764361619949341, "epoch": 0.5406427221172023, "grad_norm": 0.9296875, "learning_rate": 0.0004976502430218924, "loss": 5.6215, "mean_token_accuracy": 0.15667859464883804, "num_tokens": 11871685.0, "step": 6435 }, { "entropy": 5.763283014297485, "epoch": 0.5410628019323671, "grad_norm": 0.83984375, "learning_rate": 0.0004976459244165744, "loss": 5.5216, "mean_token_accuracy": 0.16296222656965256, "num_tokens": 11881340.0, "step": 6440 }, { "entropy": 5.711437559127807, "epoch": 0.541482881747532, "grad_norm": 0.88671875, "learning_rate": 0.0004976416018671986, "loss": 5.5449, "mean_token_accuracy": 0.15986063182353974, "num_tokens": 11890700.0, "step": 6445 }, { "entropy": 5.737738609313965, "epoch": 0.5419029615626969, "grad_norm": 0.98046875, "learning_rate": 0.0004976372753738415, "loss": 5.5329, "mean_token_accuracy": 0.1589517265558243, "num_tokens": 11900329.0, "step": 6450 }, { "entropy": 5.888564586639404, "epoch": 0.5423230413778618, "grad_norm": 0.94921875, "learning_rate": 0.0004976329449365795, "loss": 5.5801, "mean_token_accuracy": 0.1566044047474861, "num_tokens": 11909915.0, "step": 6455 }, { "entropy": 5.737349987030029, "epoch": 0.5427431211930267, "grad_norm": 0.99609375, "learning_rate": 0.0004976286105554897, "loss": 5.5918, "mean_token_accuracy": 0.16518180966377258, "num_tokens": 11918302.0, "step": 6460 }, { "entropy": 5.755007314682007, "epoch": 0.5431632010081916, "grad_norm": 0.9296875, "learning_rate": 0.0004976242722306487, "loss": 5.5454, "mean_token_accuracy": 0.16296235620975494, "num_tokens": 11927794.0, "step": 6465 }, { "entropy": 5.803985500335694, "epoch": 0.5435832808233564, "grad_norm": 0.8515625, "learning_rate": 0.0004976199299621333, "loss": 5.5802, "mean_token_accuracy": 0.16151558607816696, "num_tokens": 11937701.0, "step": 6470 }, { "entropy": 5.689332914352417, "epoch": 0.5440033606385213, "grad_norm": 1.1328125, "learning_rate": 0.0004976155837500205, "loss": 5.4851, "mean_token_accuracy": 0.1696722015738487, "num_tokens": 11946106.0, "step": 6475 }, { "entropy": 5.72600040435791, "epoch": 0.5444234404536862, "grad_norm": 0.98828125, "learning_rate": 0.0004976112335943872, "loss": 5.4262, "mean_token_accuracy": 0.16228668838739396, "num_tokens": 11954604.0, "step": 6480 }, { "entropy": 5.630837154388428, "epoch": 0.5448435202688511, "grad_norm": 0.98046875, "learning_rate": 0.0004976068794953106, "loss": 5.4824, "mean_token_accuracy": 0.16968904286623002, "num_tokens": 11963664.0, "step": 6485 }, { "entropy": 5.780311393737793, "epoch": 0.545263600084016, "grad_norm": 0.8671875, "learning_rate": 0.0004976025214528677, "loss": 5.4771, "mean_token_accuracy": 0.16729624718427658, "num_tokens": 11973426.0, "step": 6490 }, { "entropy": 5.74679913520813, "epoch": 0.5456836798991809, "grad_norm": 0.9765625, "learning_rate": 0.0004975981594671359, "loss": 5.5305, "mean_token_accuracy": 0.16190839260816575, "num_tokens": 11982339.0, "step": 6495 }, { "entropy": 5.776019430160522, "epoch": 0.5461037597143458, "grad_norm": 1.0234375, "learning_rate": 0.0004975937935381921, "loss": 5.5592, "mean_token_accuracy": 0.16586280912160872, "num_tokens": 11992016.0, "step": 6500 }, { "entropy": 5.698467683792114, "epoch": 0.5465238395295106, "grad_norm": 1.1015625, "learning_rate": 0.000497589423666114, "loss": 5.565, "mean_token_accuracy": 0.16193219423294067, "num_tokens": 12000616.0, "step": 6505 }, { "entropy": 5.5959553718566895, "epoch": 0.5469439193446755, "grad_norm": 0.90234375, "learning_rate": 0.0004975850498509789, "loss": 5.4744, "mean_token_accuracy": 0.1637238934636116, "num_tokens": 12009717.0, "step": 6510 }, { "entropy": 5.679888772964477, "epoch": 0.5473639991598404, "grad_norm": 1.0234375, "learning_rate": 0.0004975806720928642, "loss": 5.5583, "mean_token_accuracy": 0.1625445678830147, "num_tokens": 12018020.0, "step": 6515 }, { "entropy": 5.791135978698731, "epoch": 0.5477840789750053, "grad_norm": 1.0, "learning_rate": 0.0004975762903918475, "loss": 5.5404, "mean_token_accuracy": 0.16019310504198075, "num_tokens": 12027119.0, "step": 6520 }, { "entropy": 5.763389539718628, "epoch": 0.5482041587901701, "grad_norm": 1.0234375, "learning_rate": 0.0004975719047480064, "loss": 5.5369, "mean_token_accuracy": 0.16848756968975068, "num_tokens": 12035566.0, "step": 6525 }, { "entropy": 5.692020082473755, "epoch": 0.548624238605335, "grad_norm": 0.9609375, "learning_rate": 0.0004975675151614187, "loss": 5.4426, "mean_token_accuracy": 0.170123191177845, "num_tokens": 12044505.0, "step": 6530 }, { "entropy": 5.619001770019532, "epoch": 0.5490443184204999, "grad_norm": 1.0390625, "learning_rate": 0.000497563121632162, "loss": 5.5066, "mean_token_accuracy": 0.16735866218805312, "num_tokens": 12053338.0, "step": 6535 }, { "entropy": 5.727146291732788, "epoch": 0.5494643982356647, "grad_norm": 0.984375, "learning_rate": 0.0004975587241603142, "loss": 5.5111, "mean_token_accuracy": 0.16334682554006577, "num_tokens": 12063235.0, "step": 6540 }, { "entropy": 5.80925874710083, "epoch": 0.5498844780508296, "grad_norm": 0.96484375, "learning_rate": 0.0004975543227459533, "loss": 5.5874, "mean_token_accuracy": 0.1605127662420273, "num_tokens": 12072490.0, "step": 6545 }, { "entropy": 5.744976043701172, "epoch": 0.5503045578659945, "grad_norm": 0.921875, "learning_rate": 0.0004975499173891571, "loss": 5.6339, "mean_token_accuracy": 0.15866934806108474, "num_tokens": 12081474.0, "step": 6550 }, { "entropy": 5.681692361831665, "epoch": 0.5507246376811594, "grad_norm": 0.921875, "learning_rate": 0.0004975455080900037, "loss": 5.5062, "mean_token_accuracy": 0.1674065738916397, "num_tokens": 12090963.0, "step": 6555 }, { "entropy": 5.727477884292602, "epoch": 0.5511447174963243, "grad_norm": 1.0078125, "learning_rate": 0.0004975410948485713, "loss": 5.5206, "mean_token_accuracy": 0.16142902970314027, "num_tokens": 12099786.0, "step": 6560 }, { "entropy": 5.699660587310791, "epoch": 0.5515647973114892, "grad_norm": 0.94140625, "learning_rate": 0.0004975366776649379, "loss": 5.5353, "mean_token_accuracy": 0.16478368937969207, "num_tokens": 12108469.0, "step": 6565 }, { "entropy": 5.764699554443359, "epoch": 0.5519848771266541, "grad_norm": 0.94140625, "learning_rate": 0.0004975322565391818, "loss": 5.4985, "mean_token_accuracy": 0.16510994732379913, "num_tokens": 12118287.0, "step": 6570 }, { "entropy": 5.802925443649292, "epoch": 0.5524049569418189, "grad_norm": 0.97265625, "learning_rate": 0.0004975278314713814, "loss": 5.6693, "mean_token_accuracy": 0.15847567915916444, "num_tokens": 12127122.0, "step": 6575 }, { "entropy": 5.770570850372314, "epoch": 0.5528250367569838, "grad_norm": 0.9921875, "learning_rate": 0.0004975234024616152, "loss": 5.5604, "mean_token_accuracy": 0.17042070776224136, "num_tokens": 12136395.0, "step": 6580 }, { "entropy": 5.65176568031311, "epoch": 0.5532451165721487, "grad_norm": 0.9921875, "learning_rate": 0.0004975189695099613, "loss": 5.5381, "mean_token_accuracy": 0.16711881011724472, "num_tokens": 12145025.0, "step": 6585 }, { "entropy": 5.764221954345703, "epoch": 0.5536651963873136, "grad_norm": 0.9375, "learning_rate": 0.0004975145326164985, "loss": 5.5774, "mean_token_accuracy": 0.15798249989748, "num_tokens": 12154352.0, "step": 6590 }, { "entropy": 5.7215704917907715, "epoch": 0.5540852762024785, "grad_norm": 0.95703125, "learning_rate": 0.0004975100917813055, "loss": 5.4733, "mean_token_accuracy": 0.16243733167648317, "num_tokens": 12163802.0, "step": 6595 }, { "entropy": 5.689832258224487, "epoch": 0.5545053560176434, "grad_norm": 0.9140625, "learning_rate": 0.0004975056470044606, "loss": 5.5086, "mean_token_accuracy": 0.16092772781848907, "num_tokens": 12173111.0, "step": 6600 }, { "entropy": 5.729002904891968, "epoch": 0.5549254358328082, "grad_norm": 0.984375, "learning_rate": 0.0004975011982860428, "loss": 5.5485, "mean_token_accuracy": 0.16289519965648652, "num_tokens": 12182048.0, "step": 6605 }, { "entropy": 5.7060850143432615, "epoch": 0.5553455156479731, "grad_norm": 0.93359375, "learning_rate": 0.0004974967456261309, "loss": 5.5435, "mean_token_accuracy": 0.16328554153442382, "num_tokens": 12191501.0, "step": 6610 }, { "entropy": 5.75695481300354, "epoch": 0.555765595463138, "grad_norm": 0.92578125, "learning_rate": 0.0004974922890248036, "loss": 5.5591, "mean_token_accuracy": 0.16566281169652938, "num_tokens": 12201132.0, "step": 6615 }, { "entropy": 5.838721704483032, "epoch": 0.5561856752783029, "grad_norm": 0.9765625, "learning_rate": 0.00049748782848214, "loss": 5.6971, "mean_token_accuracy": 0.15937435030937194, "num_tokens": 12211082.0, "step": 6620 }, { "entropy": 5.763456106185913, "epoch": 0.5566057550934678, "grad_norm": 0.88671875, "learning_rate": 0.0004974833639982192, "loss": 5.5107, "mean_token_accuracy": 0.16620800793170928, "num_tokens": 12219946.0, "step": 6625 }, { "entropy": 5.808733177185059, "epoch": 0.5570258349086327, "grad_norm": 0.98046875, "learning_rate": 0.00049747889557312, "loss": 5.6113, "mean_token_accuracy": 0.1599217653274536, "num_tokens": 12229668.0, "step": 6630 }, { "entropy": 5.788719987869262, "epoch": 0.5574459147237976, "grad_norm": 0.9296875, "learning_rate": 0.0004974744232069219, "loss": 5.6015, "mean_token_accuracy": 0.16660431921482086, "num_tokens": 12238750.0, "step": 6635 }, { "entropy": 5.708747816085816, "epoch": 0.5578659945389624, "grad_norm": 1.0390625, "learning_rate": 0.0004974699468997038, "loss": 5.5569, "mean_token_accuracy": 0.15997037440538406, "num_tokens": 12246825.0, "step": 6640 }, { "entropy": 5.660719966888427, "epoch": 0.5582860743541272, "grad_norm": 0.99609375, "learning_rate": 0.0004974654666515452, "loss": 5.4793, "mean_token_accuracy": 0.1639156773686409, "num_tokens": 12256413.0, "step": 6645 }, { "entropy": 5.72155442237854, "epoch": 0.5587061541692921, "grad_norm": 1.0703125, "learning_rate": 0.0004974609824625254, "loss": 5.5267, "mean_token_accuracy": 0.17064841985702514, "num_tokens": 12265458.0, "step": 6650 }, { "entropy": 5.6467994213104244, "epoch": 0.559126233984457, "grad_norm": 0.984375, "learning_rate": 0.0004974564943327239, "loss": 5.4547, "mean_token_accuracy": 0.164512038230896, "num_tokens": 12274124.0, "step": 6655 }, { "entropy": 5.596433830261231, "epoch": 0.5595463137996219, "grad_norm": 0.8984375, "learning_rate": 0.00049745200226222, "loss": 5.4174, "mean_token_accuracy": 0.18331650793552398, "num_tokens": 12283513.0, "step": 6660 }, { "entropy": 5.68831057548523, "epoch": 0.5599663936147868, "grad_norm": 1.03125, "learning_rate": 0.0004974475062510936, "loss": 5.5525, "mean_token_accuracy": 0.1645463690161705, "num_tokens": 12292396.0, "step": 6665 }, { "entropy": 5.747618198394775, "epoch": 0.5603864734299517, "grad_norm": 0.92578125, "learning_rate": 0.0004974430062994242, "loss": 5.5838, "mean_token_accuracy": 0.16515465825796127, "num_tokens": 12301604.0, "step": 6670 }, { "entropy": 5.796191072463989, "epoch": 0.5608065532451165, "grad_norm": 1.0, "learning_rate": 0.0004974385024072912, "loss": 5.6032, "mean_token_accuracy": 0.1587561160326004, "num_tokens": 12310458.0, "step": 6675 }, { "entropy": 5.805469274520874, "epoch": 0.5612266330602814, "grad_norm": 0.97265625, "learning_rate": 0.000497433994574775, "loss": 5.61, "mean_token_accuracy": 0.16035176664590836, "num_tokens": 12319620.0, "step": 6680 }, { "entropy": 5.804027795791626, "epoch": 0.5616467128754463, "grad_norm": 0.91796875, "learning_rate": 0.000497429482801955, "loss": 5.675, "mean_token_accuracy": 0.15980809777975083, "num_tokens": 12329518.0, "step": 6685 }, { "entropy": 5.729493474960327, "epoch": 0.5620667926906112, "grad_norm": 0.90234375, "learning_rate": 0.0004974249670889111, "loss": 5.4737, "mean_token_accuracy": 0.16783603131771088, "num_tokens": 12338244.0, "step": 6690 }, { "entropy": 5.815527057647705, "epoch": 0.5624868725057761, "grad_norm": 1.0078125, "learning_rate": 0.0004974204474357237, "loss": 5.6511, "mean_token_accuracy": 0.16130429953336717, "num_tokens": 12347962.0, "step": 6695 }, { "entropy": 5.79982476234436, "epoch": 0.562906952320941, "grad_norm": 1.0390625, "learning_rate": 0.0004974159238424723, "loss": 5.5647, "mean_token_accuracy": 0.160567244887352, "num_tokens": 12357020.0, "step": 6700 }, { "entropy": 5.701817035675049, "epoch": 0.5633270321361059, "grad_norm": 0.9453125, "learning_rate": 0.0004974113963092376, "loss": 5.5462, "mean_token_accuracy": 0.16599834561347962, "num_tokens": 12366108.0, "step": 6705 }, { "entropy": 5.80378737449646, "epoch": 0.5637471119512707, "grad_norm": 1.0, "learning_rate": 0.0004974068648360995, "loss": 5.4608, "mean_token_accuracy": 0.1770256206393242, "num_tokens": 12374508.0, "step": 6710 }, { "entropy": 5.7393152713775635, "epoch": 0.5641671917664356, "grad_norm": 0.9609375, "learning_rate": 0.0004974023294231383, "loss": 5.4842, "mean_token_accuracy": 0.17291969954967498, "num_tokens": 12383555.0, "step": 6715 }, { "entropy": 5.696082067489624, "epoch": 0.5645872715816005, "grad_norm": 0.921875, "learning_rate": 0.0004973977900704342, "loss": 5.5761, "mean_token_accuracy": 0.1606935977935791, "num_tokens": 12392680.0, "step": 6720 }, { "entropy": 5.782982110977173, "epoch": 0.5650073513967654, "grad_norm": 0.90234375, "learning_rate": 0.0004973932467780679, "loss": 5.6217, "mean_token_accuracy": 0.15996287018060684, "num_tokens": 12401881.0, "step": 6725 }, { "entropy": 5.783317613601684, "epoch": 0.5654274312119303, "grad_norm": 0.93359375, "learning_rate": 0.0004973886995461197, "loss": 5.604, "mean_token_accuracy": 0.15776107162237168, "num_tokens": 12411487.0, "step": 6730 }, { "entropy": 5.699482107162476, "epoch": 0.5658475110270952, "grad_norm": 0.91796875, "learning_rate": 0.0004973841483746703, "loss": 5.4248, "mean_token_accuracy": 0.17524536103010177, "num_tokens": 12420376.0, "step": 6735 }, { "entropy": 5.609754610061645, "epoch": 0.5662675908422601, "grad_norm": 0.8984375, "learning_rate": 0.0004973795932638001, "loss": 5.458, "mean_token_accuracy": 0.17494328320026398, "num_tokens": 12429518.0, "step": 6740 }, { "entropy": 5.715028858184814, "epoch": 0.5666876706574249, "grad_norm": 0.96484375, "learning_rate": 0.00049737503421359, "loss": 5.4491, "mean_token_accuracy": 0.17262679785490037, "num_tokens": 12438952.0, "step": 6745 }, { "entropy": 5.708644962310791, "epoch": 0.5671077504725898, "grad_norm": 1.0859375, "learning_rate": 0.0004973704712241206, "loss": 5.4558, "mean_token_accuracy": 0.16454821228981018, "num_tokens": 12448576.0, "step": 6750 }, { "entropy": 5.688517618179321, "epoch": 0.5675278302877547, "grad_norm": 0.94140625, "learning_rate": 0.0004973659042954729, "loss": 5.4982, "mean_token_accuracy": 0.1647478923201561, "num_tokens": 12458166.0, "step": 6755 }, { "entropy": 5.596537494659424, "epoch": 0.5679479101029196, "grad_norm": 0.9921875, "learning_rate": 0.0004973613334277277, "loss": 5.4163, "mean_token_accuracy": 0.17238699346780778, "num_tokens": 12467271.0, "step": 6760 }, { "entropy": 5.7394147396087645, "epoch": 0.5683679899180845, "grad_norm": 0.96484375, "learning_rate": 0.0004973567586209658, "loss": 5.5871, "mean_token_accuracy": 0.16045358031988144, "num_tokens": 12476255.0, "step": 6765 }, { "entropy": 5.756132364273071, "epoch": 0.5687880697332494, "grad_norm": 0.91796875, "learning_rate": 0.0004973521798752686, "loss": 5.5557, "mean_token_accuracy": 0.16549135744571686, "num_tokens": 12485096.0, "step": 6770 }, { "entropy": 5.816721343994141, "epoch": 0.5692081495484141, "grad_norm": 0.95703125, "learning_rate": 0.000497347597190717, "loss": 5.5779, "mean_token_accuracy": 0.16754906624555588, "num_tokens": 12494405.0, "step": 6775 }, { "entropy": 5.721153497695923, "epoch": 0.569628229363579, "grad_norm": 0.98828125, "learning_rate": 0.0004973430105673921, "loss": 5.5031, "mean_token_accuracy": 0.1665035143494606, "num_tokens": 12503349.0, "step": 6780 }, { "entropy": 5.731798124313355, "epoch": 0.5700483091787439, "grad_norm": 0.94140625, "learning_rate": 0.0004973384200053754, "loss": 5.5885, "mean_token_accuracy": 0.166769115626812, "num_tokens": 12513122.0, "step": 6785 }, { "entropy": 5.723275518417358, "epoch": 0.5704683889939088, "grad_norm": 0.98046875, "learning_rate": 0.000497333825504748, "loss": 5.5249, "mean_token_accuracy": 0.16382726579904555, "num_tokens": 12523614.0, "step": 6790 }, { "entropy": 5.735034370422364, "epoch": 0.5708884688090737, "grad_norm": 0.95703125, "learning_rate": 0.0004973292270655914, "loss": 5.5736, "mean_token_accuracy": 0.15909015834331514, "num_tokens": 12532031.0, "step": 6795 }, { "entropy": 5.809251117706299, "epoch": 0.5713085486242386, "grad_norm": 0.94921875, "learning_rate": 0.000497324624687987, "loss": 5.666, "mean_token_accuracy": 0.15627140551805496, "num_tokens": 12542239.0, "step": 6800 }, { "entropy": 5.856173467636109, "epoch": 0.5717286284394035, "grad_norm": 0.984375, "learning_rate": 0.0004973200183720164, "loss": 5.5806, "mean_token_accuracy": 0.15812304764986038, "num_tokens": 12552608.0, "step": 6805 }, { "entropy": 5.668775606155395, "epoch": 0.5721487082545683, "grad_norm": 0.97265625, "learning_rate": 0.0004973154081177611, "loss": 5.4123, "mean_token_accuracy": 0.16480085700750352, "num_tokens": 12562020.0, "step": 6810 }, { "entropy": 5.667333602905273, "epoch": 0.5725687880697332, "grad_norm": 1.03125, "learning_rate": 0.0004973107939253027, "loss": 5.4832, "mean_token_accuracy": 0.17599694728851317, "num_tokens": 12570519.0, "step": 6815 }, { "entropy": 5.634785413742065, "epoch": 0.5729888678848981, "grad_norm": 0.96875, "learning_rate": 0.0004973061757947233, "loss": 5.4905, "mean_token_accuracy": 0.1663898229598999, "num_tokens": 12579324.0, "step": 6820 }, { "entropy": 5.702479887008667, "epoch": 0.573408947700063, "grad_norm": 0.96875, "learning_rate": 0.0004973015537261043, "loss": 5.5443, "mean_token_accuracy": 0.16654133200645446, "num_tokens": 12588014.0, "step": 6825 }, { "entropy": 5.789632272720337, "epoch": 0.5738290275152279, "grad_norm": 0.921875, "learning_rate": 0.0004972969277195279, "loss": 5.5606, "mean_token_accuracy": 0.16706208139657974, "num_tokens": 12596882.0, "step": 6830 }, { "entropy": 5.706000423431396, "epoch": 0.5742491073303928, "grad_norm": 0.97265625, "learning_rate": 0.0004972922977750757, "loss": 5.4794, "mean_token_accuracy": 0.16413741260766984, "num_tokens": 12606069.0, "step": 6835 }, { "entropy": 5.719677686691284, "epoch": 0.5746691871455577, "grad_norm": 1.4765625, "learning_rate": 0.00049728766389283, "loss": 5.4886, "mean_token_accuracy": 0.16409489065408706, "num_tokens": 12615167.0, "step": 6840 }, { "entropy": 5.6706328868865965, "epoch": 0.5750892669607225, "grad_norm": 1.0546875, "learning_rate": 0.0004972830260728729, "loss": 5.5367, "mean_token_accuracy": 0.1674353748559952, "num_tokens": 12624230.0, "step": 6845 }, { "entropy": 5.757194995880127, "epoch": 0.5755093467758874, "grad_norm": 0.9921875, "learning_rate": 0.0004972783843152863, "loss": 5.5197, "mean_token_accuracy": 0.16837731450796128, "num_tokens": 12633158.0, "step": 6850 }, { "entropy": 5.757494592666626, "epoch": 0.5759294265910523, "grad_norm": 1.046875, "learning_rate": 0.0004972737386201527, "loss": 5.4829, "mean_token_accuracy": 0.16184937953948975, "num_tokens": 12641465.0, "step": 6855 }, { "entropy": 5.663051462173462, "epoch": 0.5763495064062172, "grad_norm": 1.0078125, "learning_rate": 0.0004972690889875541, "loss": 5.4432, "mean_token_accuracy": 0.16741454750299453, "num_tokens": 12650437.0, "step": 6860 }, { "entropy": 5.84966139793396, "epoch": 0.5767695862213821, "grad_norm": 1.0078125, "learning_rate": 0.0004972644354175732, "loss": 5.6532, "mean_token_accuracy": 0.16014729291200638, "num_tokens": 12660072.0, "step": 6865 }, { "entropy": 5.82061538696289, "epoch": 0.577189666036547, "grad_norm": 0.91796875, "learning_rate": 0.0004972597779102922, "loss": 5.6685, "mean_token_accuracy": 0.1602156087756157, "num_tokens": 12670405.0, "step": 6870 }, { "entropy": 5.689360618591309, "epoch": 0.5776097458517119, "grad_norm": 0.94140625, "learning_rate": 0.0004972551164657937, "loss": 5.5423, "mean_token_accuracy": 0.1655457064509392, "num_tokens": 12679992.0, "step": 6875 }, { "entropy": 5.799532318115235, "epoch": 0.5780298256668767, "grad_norm": 0.9453125, "learning_rate": 0.0004972504510841602, "loss": 5.595, "mean_token_accuracy": 0.15801346749067308, "num_tokens": 12690289.0, "step": 6880 }, { "entropy": 5.8381139755249025, "epoch": 0.5784499054820416, "grad_norm": 0.91796875, "learning_rate": 0.0004972457817654745, "loss": 5.5865, "mean_token_accuracy": 0.16085358709096909, "num_tokens": 12700518.0, "step": 6885 }, { "entropy": 5.80426549911499, "epoch": 0.5788699852972065, "grad_norm": 0.99609375, "learning_rate": 0.0004972411085098191, "loss": 5.6329, "mean_token_accuracy": 0.15670239478349685, "num_tokens": 12710603.0, "step": 6890 }, { "entropy": 5.79871392250061, "epoch": 0.5792900651123714, "grad_norm": 0.90625, "learning_rate": 0.000497236431317277, "loss": 5.5266, "mean_token_accuracy": 0.16727050095796586, "num_tokens": 12719298.0, "step": 6895 }, { "entropy": 5.751122093200683, "epoch": 0.5797101449275363, "grad_norm": 1.046875, "learning_rate": 0.000497231750187931, "loss": 5.5178, "mean_token_accuracy": 0.16697220504283905, "num_tokens": 12728368.0, "step": 6900 }, { "entropy": 5.777011489868164, "epoch": 0.5801302247427012, "grad_norm": 0.98046875, "learning_rate": 0.0004972270651218638, "loss": 5.5793, "mean_token_accuracy": 0.16862293779850007, "num_tokens": 12737898.0, "step": 6905 }, { "entropy": 5.75121955871582, "epoch": 0.580550304557866, "grad_norm": 1.046875, "learning_rate": 0.0004972223761191587, "loss": 5.5282, "mean_token_accuracy": 0.16244126260280609, "num_tokens": 12746761.0, "step": 6910 }, { "entropy": 5.668338012695313, "epoch": 0.5809703843730308, "grad_norm": 1.0390625, "learning_rate": 0.0004972176831798986, "loss": 5.4701, "mean_token_accuracy": 0.17220567017793656, "num_tokens": 12755128.0, "step": 6915 }, { "entropy": 5.745266914367676, "epoch": 0.5813904641881957, "grad_norm": 0.89453125, "learning_rate": 0.0004972129863041667, "loss": 5.6462, "mean_token_accuracy": 0.15389580130577088, "num_tokens": 12764727.0, "step": 6920 }, { "entropy": 5.753053855895996, "epoch": 0.5818105440033606, "grad_norm": 0.91796875, "learning_rate": 0.0004972082854920462, "loss": 5.4956, "mean_token_accuracy": 0.16948433965444565, "num_tokens": 12773557.0, "step": 6925 }, { "entropy": 5.731849002838135, "epoch": 0.5822306238185255, "grad_norm": 0.97265625, "learning_rate": 0.0004972035807436203, "loss": 5.5, "mean_token_accuracy": 0.1690128982067108, "num_tokens": 12782525.0, "step": 6930 }, { "entropy": 5.796383476257324, "epoch": 0.5826507036336904, "grad_norm": 0.984375, "learning_rate": 0.0004971988720589723, "loss": 5.5955, "mean_token_accuracy": 0.16173771321773528, "num_tokens": 12791534.0, "step": 6935 }, { "entropy": 5.767693090438843, "epoch": 0.5830707834488553, "grad_norm": 0.91796875, "learning_rate": 0.0004971941594381858, "loss": 5.4897, "mean_token_accuracy": 0.16691604107618332, "num_tokens": 12800662.0, "step": 6940 }, { "entropy": 5.758454275131226, "epoch": 0.5834908632640201, "grad_norm": 0.890625, "learning_rate": 0.0004971894428813441, "loss": 5.5308, "mean_token_accuracy": 0.16786673665046692, "num_tokens": 12809440.0, "step": 6945 }, { "entropy": 5.777513122558593, "epoch": 0.583910943079185, "grad_norm": 0.99609375, "learning_rate": 0.000497184722388531, "loss": 5.6051, "mean_token_accuracy": 0.1612379416823387, "num_tokens": 12818560.0, "step": 6950 }, { "entropy": 5.839304399490357, "epoch": 0.5843310228943499, "grad_norm": 0.96484375, "learning_rate": 0.0004971799979598297, "loss": 5.5324, "mean_token_accuracy": 0.16134003400802613, "num_tokens": 12827898.0, "step": 6955 }, { "entropy": 5.717963171005249, "epoch": 0.5847511027095148, "grad_norm": 0.97265625, "learning_rate": 0.0004971752695953243, "loss": 5.4782, "mean_token_accuracy": 0.16631890833377838, "num_tokens": 12837199.0, "step": 6960 }, { "entropy": 5.713119792938232, "epoch": 0.5851711825246797, "grad_norm": 0.9453125, "learning_rate": 0.0004971705372950984, "loss": 5.5118, "mean_token_accuracy": 0.163696525990963, "num_tokens": 12846493.0, "step": 6965 }, { "entropy": 5.761196613311768, "epoch": 0.5855912623398446, "grad_norm": 0.9140625, "learning_rate": 0.0004971658010592358, "loss": 5.5286, "mean_token_accuracy": 0.16277743577957154, "num_tokens": 12855026.0, "step": 6970 }, { "entropy": 5.776597166061402, "epoch": 0.5860113421550095, "grad_norm": 0.8984375, "learning_rate": 0.0004971610608878205, "loss": 5.5984, "mean_token_accuracy": 0.16150868386030198, "num_tokens": 12864563.0, "step": 6975 }, { "entropy": 5.825516939163208, "epoch": 0.5864314219701743, "grad_norm": 1.0390625, "learning_rate": 0.0004971563167809363, "loss": 5.5258, "mean_token_accuracy": 0.16631446927785873, "num_tokens": 12874358.0, "step": 6980 }, { "entropy": 5.731455850601196, "epoch": 0.5868515017853392, "grad_norm": 0.921875, "learning_rate": 0.0004971515687386674, "loss": 5.5443, "mean_token_accuracy": 0.16318022608757018, "num_tokens": 12883110.0, "step": 6985 }, { "entropy": 5.775928068161011, "epoch": 0.5872715816005041, "grad_norm": 0.8828125, "learning_rate": 0.0004971468167610978, "loss": 5.6099, "mean_token_accuracy": 0.16461408585309983, "num_tokens": 12892977.0, "step": 6990 }, { "entropy": 5.700873947143554, "epoch": 0.587691661415669, "grad_norm": 0.890625, "learning_rate": 0.0004971420608483117, "loss": 5.4117, "mean_token_accuracy": 0.1737138643860817, "num_tokens": 12902327.0, "step": 6995 }, { "entropy": 5.639215755462646, "epoch": 0.5881117412308339, "grad_norm": 0.90625, "learning_rate": 0.0004971373010003936, "loss": 5.4297, "mean_token_accuracy": 0.17889968156814576, "num_tokens": 12911957.0, "step": 7000 }, { "entropy": 5.739131689071655, "epoch": 0.5885318210459988, "grad_norm": 1.0390625, "learning_rate": 0.0004971325372174274, "loss": 5.5105, "mean_token_accuracy": 0.16423840969800949, "num_tokens": 12920380.0, "step": 7005 }, { "entropy": 5.683195638656616, "epoch": 0.5889519008611637, "grad_norm": 0.953125, "learning_rate": 0.0004971277694994976, "loss": 5.5872, "mean_token_accuracy": 0.16479117721319197, "num_tokens": 12929670.0, "step": 7010 }, { "entropy": 5.747731018066406, "epoch": 0.5893719806763285, "grad_norm": 1.0546875, "learning_rate": 0.000497122997846689, "loss": 5.5008, "mean_token_accuracy": 0.1721497356891632, "num_tokens": 12938185.0, "step": 7015 }, { "entropy": 5.772720766067505, "epoch": 0.5897920604914934, "grad_norm": 0.95703125, "learning_rate": 0.0004971182222590857, "loss": 5.5124, "mean_token_accuracy": 0.17144393175840378, "num_tokens": 12947706.0, "step": 7020 }, { "entropy": 5.684038400650024, "epoch": 0.5902121403066583, "grad_norm": 0.91796875, "learning_rate": 0.0004971134427367725, "loss": 5.5055, "mean_token_accuracy": 0.16635899543762206, "num_tokens": 12957393.0, "step": 7025 }, { "entropy": 5.712462902069092, "epoch": 0.5906322201218231, "grad_norm": 0.9296875, "learning_rate": 0.000497108659279834, "loss": 5.4101, "mean_token_accuracy": 0.1759818136692047, "num_tokens": 12967165.0, "step": 7030 }, { "entropy": 5.792285919189453, "epoch": 0.591052299936988, "grad_norm": 1.015625, "learning_rate": 0.0004971038718883551, "loss": 5.5544, "mean_token_accuracy": 0.16032245606184006, "num_tokens": 12976490.0, "step": 7035 }, { "entropy": 5.789936065673828, "epoch": 0.5914723797521529, "grad_norm": 0.9453125, "learning_rate": 0.0004970990805624203, "loss": 5.5441, "mean_token_accuracy": 0.1614286109805107, "num_tokens": 12985423.0, "step": 7040 }, { "entropy": 5.701582956314087, "epoch": 0.5918924595673178, "grad_norm": 1.0546875, "learning_rate": 0.0004970942853021147, "loss": 5.4223, "mean_token_accuracy": 0.17460384517908095, "num_tokens": 12994510.0, "step": 7045 }, { "entropy": 5.767253828048706, "epoch": 0.5923125393824826, "grad_norm": 0.890625, "learning_rate": 0.0004970894861075232, "loss": 5.5559, "mean_token_accuracy": 0.16429632902145386, "num_tokens": 13003383.0, "step": 7050 }, { "entropy": 5.748055267333984, "epoch": 0.5927326191976475, "grad_norm": 0.95703125, "learning_rate": 0.0004970846829787309, "loss": 5.495, "mean_token_accuracy": 0.16619571596384047, "num_tokens": 13012550.0, "step": 7055 }, { "entropy": 5.745292520523071, "epoch": 0.5931526990128124, "grad_norm": 1.015625, "learning_rate": 0.0004970798759158227, "loss": 5.5579, "mean_token_accuracy": 0.16078388690948486, "num_tokens": 13022066.0, "step": 7060 }, { "entropy": 5.743168926239013, "epoch": 0.5935727788279773, "grad_norm": 0.98046875, "learning_rate": 0.0004970750649188839, "loss": 5.536, "mean_token_accuracy": 0.17519628554582595, "num_tokens": 13031008.0, "step": 7065 }, { "entropy": 5.685877513885498, "epoch": 0.5939928586431422, "grad_norm": 0.9140625, "learning_rate": 0.0004970702499879998, "loss": 5.5128, "mean_token_accuracy": 0.16871291399002075, "num_tokens": 13040366.0, "step": 7070 }, { "entropy": 5.668898677825927, "epoch": 0.5944129384583071, "grad_norm": 0.88671875, "learning_rate": 0.0004970654311232554, "loss": 5.5243, "mean_token_accuracy": 0.16426745504140855, "num_tokens": 13051140.0, "step": 7075 }, { "entropy": 5.718292331695556, "epoch": 0.594833018273472, "grad_norm": 1.0234375, "learning_rate": 0.0004970606083247362, "loss": 5.4459, "mean_token_accuracy": 0.16791134625673293, "num_tokens": 13059835.0, "step": 7080 }, { "entropy": 5.667575120925903, "epoch": 0.5952530980886368, "grad_norm": 0.921875, "learning_rate": 0.0004970557815925278, "loss": 5.4135, "mean_token_accuracy": 0.16934545636177062, "num_tokens": 13068909.0, "step": 7085 }, { "entropy": 5.693837451934814, "epoch": 0.5956731779038017, "grad_norm": 0.9609375, "learning_rate": 0.0004970509509267155, "loss": 5.5084, "mean_token_accuracy": 0.16520608812570572, "num_tokens": 13078380.0, "step": 7090 }, { "entropy": 5.738383483886719, "epoch": 0.5960932577189666, "grad_norm": 0.92578125, "learning_rate": 0.0004970461163273849, "loss": 5.5358, "mean_token_accuracy": 0.1652810513973236, "num_tokens": 13087774.0, "step": 7095 }, { "entropy": 5.6614855289459225, "epoch": 0.5965133375341315, "grad_norm": 1.0078125, "learning_rate": 0.0004970412777946219, "loss": 5.3538, "mean_token_accuracy": 0.1728409618139267, "num_tokens": 13095938.0, "step": 7100 }, { "entropy": 5.661937522888183, "epoch": 0.5969334173492964, "grad_norm": 0.95703125, "learning_rate": 0.0004970364353285117, "loss": 5.5099, "mean_token_accuracy": 0.1667941018939018, "num_tokens": 13104661.0, "step": 7105 }, { "entropy": 5.769843101501465, "epoch": 0.5973534971644613, "grad_norm": 1.0078125, "learning_rate": 0.0004970315889291405, "loss": 5.5054, "mean_token_accuracy": 0.16266342252492905, "num_tokens": 13114505.0, "step": 7110 }, { "entropy": 5.647045612335205, "epoch": 0.5977735769796261, "grad_norm": 0.953125, "learning_rate": 0.0004970267385965941, "loss": 5.4399, "mean_token_accuracy": 0.1659441262483597, "num_tokens": 13124590.0, "step": 7115 }, { "entropy": 5.659963178634643, "epoch": 0.598193656794791, "grad_norm": 1.09375, "learning_rate": 0.0004970218843309583, "loss": 5.4255, "mean_token_accuracy": 0.17648224532604218, "num_tokens": 13134026.0, "step": 7120 }, { "entropy": 5.784970045089722, "epoch": 0.5986137366099559, "grad_norm": 0.98828125, "learning_rate": 0.0004970170261323192, "loss": 5.588, "mean_token_accuracy": 0.16741684675216675, "num_tokens": 13142654.0, "step": 7125 }, { "entropy": 5.68078384399414, "epoch": 0.5990338164251208, "grad_norm": 1.0078125, "learning_rate": 0.0004970121640007627, "loss": 5.4971, "mean_token_accuracy": 0.16654934138059616, "num_tokens": 13151177.0, "step": 7130 }, { "entropy": 5.7274463176727295, "epoch": 0.5994538962402857, "grad_norm": 0.99609375, "learning_rate": 0.0004970072979363751, "loss": 5.4843, "mean_token_accuracy": 0.1642145425081253, "num_tokens": 13159689.0, "step": 7135 }, { "entropy": 5.673399639129639, "epoch": 0.5998739760554506, "grad_norm": 0.98046875, "learning_rate": 0.0004970024279392425, "loss": 5.5339, "mean_token_accuracy": 0.16159643679857255, "num_tokens": 13168601.0, "step": 7140 }, { "entropy": 5.740363311767578, "epoch": 0.6002940558706155, "grad_norm": 0.99609375, "learning_rate": 0.0004969975540094513, "loss": 5.5042, "mean_token_accuracy": 0.16813595741987228, "num_tokens": 13177035.0, "step": 7145 }, { "entropy": 5.774371862411499, "epoch": 0.6007141356857802, "grad_norm": 0.96484375, "learning_rate": 0.0004969926761470876, "loss": 5.4729, "mean_token_accuracy": 0.1695594534277916, "num_tokens": 13185444.0, "step": 7150 }, { "entropy": 5.700524473190308, "epoch": 0.6011342155009451, "grad_norm": 0.96875, "learning_rate": 0.000496987794352238, "loss": 5.4721, "mean_token_accuracy": 0.1695847913622856, "num_tokens": 13194987.0, "step": 7155 }, { "entropy": 5.63314642906189, "epoch": 0.60155429531611, "grad_norm": 0.9765625, "learning_rate": 0.0004969829086249889, "loss": 5.5057, "mean_token_accuracy": 0.1687454789876938, "num_tokens": 13203807.0, "step": 7160 }, { "entropy": 5.757598972320556, "epoch": 0.6019743751312749, "grad_norm": 1.0234375, "learning_rate": 0.000496978018965427, "loss": 5.6103, "mean_token_accuracy": 0.16279578655958177, "num_tokens": 13214362.0, "step": 7165 }, { "entropy": 5.789872074127198, "epoch": 0.6023944549464398, "grad_norm": 0.9375, "learning_rate": 0.0004969731253736387, "loss": 5.6048, "mean_token_accuracy": 0.16099970787763596, "num_tokens": 13224192.0, "step": 7170 }, { "entropy": 5.700117921829223, "epoch": 0.6028145347616047, "grad_norm": 0.92578125, "learning_rate": 0.0004969682278497109, "loss": 5.5621, "mean_token_accuracy": 0.1684303879737854, "num_tokens": 13234430.0, "step": 7175 }, { "entropy": 5.711915159225464, "epoch": 0.6032346145767696, "grad_norm": 1.015625, "learning_rate": 0.0004969633263937301, "loss": 5.458, "mean_token_accuracy": 0.1688069686293602, "num_tokens": 13243681.0, "step": 7180 }, { "entropy": 5.86687707901001, "epoch": 0.6036546943919344, "grad_norm": 0.91796875, "learning_rate": 0.0004969584210057832, "loss": 5.7426, "mean_token_accuracy": 0.15597060322761536, "num_tokens": 13254334.0, "step": 7185 }, { "entropy": 5.815514802932739, "epoch": 0.6040747742070993, "grad_norm": 0.88671875, "learning_rate": 0.0004969535116859573, "loss": 5.5268, "mean_token_accuracy": 0.16894952207803726, "num_tokens": 13263781.0, "step": 7190 }, { "entropy": 5.621768617630005, "epoch": 0.6044948540222642, "grad_norm": 1.0078125, "learning_rate": 0.0004969485984343392, "loss": 5.4558, "mean_token_accuracy": 0.16743801385164261, "num_tokens": 13272831.0, "step": 7195 }, { "entropy": 5.7688243865966795, "epoch": 0.6049149338374291, "grad_norm": 1.1015625, "learning_rate": 0.000496943681251016, "loss": 5.5035, "mean_token_accuracy": 0.16227193921804428, "num_tokens": 13281621.0, "step": 7200 }, { "entropy": 5.678845548629761, "epoch": 0.605335013652594, "grad_norm": 1.0, "learning_rate": 0.0004969387601360747, "loss": 5.5005, "mean_token_accuracy": 0.161435130238533, "num_tokens": 13291021.0, "step": 7205 }, { "entropy": 5.728114938735962, "epoch": 0.6057550934677589, "grad_norm": 1.0390625, "learning_rate": 0.0004969338350896026, "loss": 5.5067, "mean_token_accuracy": 0.16854603439569474, "num_tokens": 13299752.0, "step": 7210 }, { "entropy": 5.761625099182129, "epoch": 0.6061751732829238, "grad_norm": 0.9609375, "learning_rate": 0.0004969289061116869, "loss": 5.5252, "mean_token_accuracy": 0.15588051974773406, "num_tokens": 13309112.0, "step": 7215 }, { "entropy": 5.784453392028809, "epoch": 0.6065952530980886, "grad_norm": 1.0, "learning_rate": 0.0004969239732024148, "loss": 5.5312, "mean_token_accuracy": 0.17264840453863145, "num_tokens": 13318328.0, "step": 7220 }, { "entropy": 5.609762954711914, "epoch": 0.6070153329132535, "grad_norm": 0.87109375, "learning_rate": 0.0004969190363618739, "loss": 5.4207, "mean_token_accuracy": 0.16983553618192673, "num_tokens": 13328940.0, "step": 7225 }, { "entropy": 5.653651523590088, "epoch": 0.6074354127284184, "grad_norm": 1.1015625, "learning_rate": 0.0004969140955901516, "loss": 5.4583, "mean_token_accuracy": 0.17219654768705367, "num_tokens": 13337829.0, "step": 7230 }, { "entropy": 5.808972644805908, "epoch": 0.6078554925435833, "grad_norm": 0.875, "learning_rate": 0.0004969091508873352, "loss": 5.6215, "mean_token_accuracy": 0.16035659611225128, "num_tokens": 13348289.0, "step": 7235 }, { "entropy": 5.754089260101319, "epoch": 0.6082755723587482, "grad_norm": 0.99609375, "learning_rate": 0.0004969042022535126, "loss": 5.5477, "mean_token_accuracy": 0.16541809737682342, "num_tokens": 13357292.0, "step": 7240 }, { "entropy": 5.7452880382537845, "epoch": 0.6086956521739131, "grad_norm": 0.99609375, "learning_rate": 0.0004968992496887713, "loss": 5.5828, "mean_token_accuracy": 0.16221534311771393, "num_tokens": 13366640.0, "step": 7245 }, { "entropy": 5.746783971786499, "epoch": 0.609115731989078, "grad_norm": 0.7890625, "learning_rate": 0.0004968942931931989, "loss": 5.4881, "mean_token_accuracy": 0.17632103711366653, "num_tokens": 13377509.0, "step": 7250 }, { "entropy": 5.718003177642823, "epoch": 0.6095358118042428, "grad_norm": 1.0625, "learning_rate": 0.0004968893327668835, "loss": 5.5859, "mean_token_accuracy": 0.1615411803126335, "num_tokens": 13386573.0, "step": 7255 }, { "entropy": 5.676197052001953, "epoch": 0.6099558916194077, "grad_norm": 0.9375, "learning_rate": 0.0004968843684099128, "loss": 5.4274, "mean_token_accuracy": 0.1722585678100586, "num_tokens": 13395790.0, "step": 7260 }, { "entropy": 5.6800004005432125, "epoch": 0.6103759714345726, "grad_norm": 1.0546875, "learning_rate": 0.0004968794001223747, "loss": 5.4747, "mean_token_accuracy": 0.16489816904067994, "num_tokens": 13405265.0, "step": 7265 }, { "entropy": 5.692203521728516, "epoch": 0.6107960512497375, "grad_norm": 1.015625, "learning_rate": 0.0004968744279043574, "loss": 5.4777, "mean_token_accuracy": 0.17131679356098176, "num_tokens": 13413796.0, "step": 7270 }, { "entropy": 5.744186496734619, "epoch": 0.6112161310649024, "grad_norm": 0.97265625, "learning_rate": 0.0004968694517559488, "loss": 5.5307, "mean_token_accuracy": 0.16541121006011963, "num_tokens": 13423299.0, "step": 7275 }, { "entropy": 5.668656826019287, "epoch": 0.6116362108800673, "grad_norm": 0.890625, "learning_rate": 0.0004968644716772371, "loss": 5.4529, "mean_token_accuracy": 0.17369708567857742, "num_tokens": 13432267.0, "step": 7280 }, { "entropy": 5.680675506591797, "epoch": 0.612056290695232, "grad_norm": 0.9296875, "learning_rate": 0.0004968594876683105, "loss": 5.5412, "mean_token_accuracy": 0.16298353672027588, "num_tokens": 13442332.0, "step": 7285 }, { "entropy": 5.697410249710083, "epoch": 0.6124763705103969, "grad_norm": 0.94921875, "learning_rate": 0.0004968544997292572, "loss": 5.4937, "mean_token_accuracy": 0.17212583422660827, "num_tokens": 13451700.0, "step": 7290 }, { "entropy": 5.737648773193359, "epoch": 0.6128964503255618, "grad_norm": 1.0, "learning_rate": 0.0004968495078601659, "loss": 5.5918, "mean_token_accuracy": 0.16140649616718292, "num_tokens": 13461009.0, "step": 7295 }, { "entropy": 5.7446732997894285, "epoch": 0.6133165301407267, "grad_norm": 0.953125, "learning_rate": 0.0004968445120611247, "loss": 5.5815, "mean_token_accuracy": 0.16554148495197296, "num_tokens": 13470341.0, "step": 7300 }, { "entropy": 5.743761396408081, "epoch": 0.6137366099558916, "grad_norm": 0.9140625, "learning_rate": 0.0004968395123322223, "loss": 5.5025, "mean_token_accuracy": 0.1652843788266182, "num_tokens": 13479898.0, "step": 7305 }, { "entropy": 5.698557806015015, "epoch": 0.6141566897710565, "grad_norm": 0.96875, "learning_rate": 0.000496834508673547, "loss": 5.4265, "mean_token_accuracy": 0.16626278609037398, "num_tokens": 13488116.0, "step": 7310 }, { "entropy": 5.7173277854919435, "epoch": 0.6145767695862214, "grad_norm": 0.96484375, "learning_rate": 0.0004968295010851877, "loss": 5.4667, "mean_token_accuracy": 0.1695254847407341, "num_tokens": 13497814.0, "step": 7315 }, { "entropy": 5.703423404693604, "epoch": 0.6149968494013862, "grad_norm": 1.0234375, "learning_rate": 0.0004968244895672331, "loss": 5.4664, "mean_token_accuracy": 0.16524181365966797, "num_tokens": 13506617.0, "step": 7320 }, { "entropy": 5.680415296554566, "epoch": 0.6154169292165511, "grad_norm": 0.93359375, "learning_rate": 0.0004968194741197718, "loss": 5.6305, "mean_token_accuracy": 0.16197476536035538, "num_tokens": 13516632.0, "step": 7325 }, { "entropy": 5.829664039611816, "epoch": 0.615837009031716, "grad_norm": 1.0, "learning_rate": 0.0004968144547428927, "loss": 5.5466, "mean_token_accuracy": 0.16924293488264083, "num_tokens": 13526452.0, "step": 7330 }, { "entropy": 5.744434928894043, "epoch": 0.6162570888468809, "grad_norm": 1.046875, "learning_rate": 0.0004968094314366848, "loss": 5.4566, "mean_token_accuracy": 0.16510533839464187, "num_tokens": 13535663.0, "step": 7335 }, { "entropy": 5.645727968215942, "epoch": 0.6166771686620458, "grad_norm": 0.95703125, "learning_rate": 0.000496804404201237, "loss": 5.3726, "mean_token_accuracy": 0.1793311506509781, "num_tokens": 13544574.0, "step": 7340 }, { "entropy": 5.812160348892212, "epoch": 0.6170972484772107, "grad_norm": 1.078125, "learning_rate": 0.0004967993730366385, "loss": 5.5617, "mean_token_accuracy": 0.16627434641122818, "num_tokens": 13553041.0, "step": 7345 }, { "entropy": 5.655124235153198, "epoch": 0.6175173282923756, "grad_norm": 0.9453125, "learning_rate": 0.0004967943379429781, "loss": 5.5015, "mean_token_accuracy": 0.16323864310979844, "num_tokens": 13562108.0, "step": 7350 }, { "entropy": 5.850181436538696, "epoch": 0.6179374081075404, "grad_norm": 0.93359375, "learning_rate": 0.0004967892989203454, "loss": 5.6673, "mean_token_accuracy": 0.1569588676095009, "num_tokens": 13571500.0, "step": 7355 }, { "entropy": 5.801353788375854, "epoch": 0.6183574879227053, "grad_norm": 0.94921875, "learning_rate": 0.0004967842559688295, "loss": 5.5814, "mean_token_accuracy": 0.16009139716625215, "num_tokens": 13581304.0, "step": 7360 }, { "entropy": 5.694891834259034, "epoch": 0.6187775677378702, "grad_norm": 0.93359375, "learning_rate": 0.0004967792090885195, "loss": 5.4246, "mean_token_accuracy": 0.16926718205213548, "num_tokens": 13590734.0, "step": 7365 }, { "entropy": 5.6554632663726805, "epoch": 0.6191976475530351, "grad_norm": 0.90625, "learning_rate": 0.0004967741582795052, "loss": 5.5091, "mean_token_accuracy": 0.16807454824447632, "num_tokens": 13600486.0, "step": 7370 }, { "entropy": 5.801201295852661, "epoch": 0.6196177273682, "grad_norm": 0.88671875, "learning_rate": 0.0004967691035418758, "loss": 5.5316, "mean_token_accuracy": 0.15901947170495986, "num_tokens": 13610542.0, "step": 7375 }, { "entropy": 5.691852474212647, "epoch": 0.6200378071833649, "grad_norm": 0.9921875, "learning_rate": 0.000496764044875721, "loss": 5.4888, "mean_token_accuracy": 0.16762082427740096, "num_tokens": 13619431.0, "step": 7380 }, { "entropy": 5.662903547286987, "epoch": 0.6204578869985298, "grad_norm": 0.89453125, "learning_rate": 0.0004967589822811303, "loss": 5.5149, "mean_token_accuracy": 0.1655088871717453, "num_tokens": 13629930.0, "step": 7385 }, { "entropy": 5.862593698501587, "epoch": 0.6208779668136946, "grad_norm": 0.90234375, "learning_rate": 0.0004967539157581934, "loss": 5.6389, "mean_token_accuracy": 0.1590859979391098, "num_tokens": 13639439.0, "step": 7390 }, { "entropy": 5.801116275787353, "epoch": 0.6212980466288595, "grad_norm": 0.94140625, "learning_rate": 0.000496748845307, "loss": 5.5583, "mean_token_accuracy": 0.16589334830641747, "num_tokens": 13648548.0, "step": 7395 }, { "entropy": 5.766993808746338, "epoch": 0.6217181264440244, "grad_norm": 0.9453125, "learning_rate": 0.0004967437709276401, "loss": 5.6084, "mean_token_accuracy": 0.16399488151073455, "num_tokens": 13657658.0, "step": 7400 }, { "entropy": 5.650314474105835, "epoch": 0.6221382062591893, "grad_norm": 0.94140625, "learning_rate": 0.0004967386926202034, "loss": 5.3795, "mean_token_accuracy": 0.17246145755052567, "num_tokens": 13666763.0, "step": 7405 }, { "entropy": 5.782988977432251, "epoch": 0.6225582860743542, "grad_norm": 0.96875, "learning_rate": 0.00049673361038478, "loss": 5.6075, "mean_token_accuracy": 0.15505822673439978, "num_tokens": 13676527.0, "step": 7410 }, { "entropy": 5.719121265411377, "epoch": 0.622978365889519, "grad_norm": 0.890625, "learning_rate": 0.0004967285242214599, "loss": 5.578, "mean_token_accuracy": 0.17219377309083939, "num_tokens": 13685404.0, "step": 7415 }, { "entropy": 5.698868083953857, "epoch": 0.6233984457046838, "grad_norm": 1.0625, "learning_rate": 0.000496723434130333, "loss": 5.395, "mean_token_accuracy": 0.16921012550592424, "num_tokens": 13693118.0, "step": 7420 }, { "entropy": 5.7081413745880125, "epoch": 0.6238185255198487, "grad_norm": 0.953125, "learning_rate": 0.0004967183401114898, "loss": 5.4705, "mean_token_accuracy": 0.16425008475780487, "num_tokens": 13702015.0, "step": 7425 }, { "entropy": 5.7164053440094, "epoch": 0.6242386053350136, "grad_norm": 1.5703125, "learning_rate": 0.0004967132421650203, "loss": 5.4688, "mean_token_accuracy": 0.1687057375907898, "num_tokens": 13711658.0, "step": 7430 }, { "entropy": 5.650258636474609, "epoch": 0.6246586851501785, "grad_norm": 0.9921875, "learning_rate": 0.0004967081402910149, "loss": 5.5199, "mean_token_accuracy": 0.1659772053360939, "num_tokens": 13720718.0, "step": 7435 }, { "entropy": 5.714274263381958, "epoch": 0.6250787649653434, "grad_norm": 1.015625, "learning_rate": 0.000496703034489564, "loss": 5.3741, "mean_token_accuracy": 0.17356953918933868, "num_tokens": 13729364.0, "step": 7440 }, { "entropy": 5.797601222991943, "epoch": 0.6254988447805083, "grad_norm": 0.93359375, "learning_rate": 0.0004966979247607579, "loss": 5.684, "mean_token_accuracy": 0.16203884929418563, "num_tokens": 13739436.0, "step": 7445 }, { "entropy": 5.794081306457519, "epoch": 0.6259189245956732, "grad_norm": 0.88671875, "learning_rate": 0.0004966928111046873, "loss": 5.581, "mean_token_accuracy": 0.17139979004859923, "num_tokens": 13749196.0, "step": 7450 }, { "entropy": 5.7291075706481935, "epoch": 0.626339004410838, "grad_norm": 0.90625, "learning_rate": 0.0004966876935214426, "loss": 5.4214, "mean_token_accuracy": 0.17418570071458817, "num_tokens": 13758414.0, "step": 7455 }, { "entropy": 5.686338424682617, "epoch": 0.6267590842260029, "grad_norm": 0.96875, "learning_rate": 0.0004966825720111147, "loss": 5.4894, "mean_token_accuracy": 0.1629626229405403, "num_tokens": 13767496.0, "step": 7460 }, { "entropy": 5.751576089859009, "epoch": 0.6271791640411678, "grad_norm": 1.078125, "learning_rate": 0.0004966774465737942, "loss": 5.6138, "mean_token_accuracy": 0.165596853941679, "num_tokens": 13777033.0, "step": 7465 }, { "entropy": 5.7884539604187015, "epoch": 0.6275992438563327, "grad_norm": 0.95703125, "learning_rate": 0.0004966723172095717, "loss": 5.5726, "mean_token_accuracy": 0.1648782819509506, "num_tokens": 13786313.0, "step": 7470 }, { "entropy": 5.696361017227173, "epoch": 0.6280193236714976, "grad_norm": 0.98046875, "learning_rate": 0.0004966671839185384, "loss": 5.4952, "mean_token_accuracy": 0.16794500648975372, "num_tokens": 13795257.0, "step": 7475 }, { "entropy": 5.62646861076355, "epoch": 0.6284394034866625, "grad_norm": 0.92578125, "learning_rate": 0.0004966620467007851, "loss": 5.4277, "mean_token_accuracy": 0.1720203161239624, "num_tokens": 13804582.0, "step": 7480 }, { "entropy": 5.688060522079468, "epoch": 0.6288594833018274, "grad_norm": 0.8984375, "learning_rate": 0.0004966569055564027, "loss": 5.4029, "mean_token_accuracy": 0.1695487268269062, "num_tokens": 13813248.0, "step": 7485 }, { "entropy": 5.774942111968994, "epoch": 0.6292795631169922, "grad_norm": 0.95703125, "learning_rate": 0.0004966517604854823, "loss": 5.6697, "mean_token_accuracy": 0.1593285620212555, "num_tokens": 13823301.0, "step": 7490 }, { "entropy": 5.691988086700439, "epoch": 0.6296996429321571, "grad_norm": 0.98046875, "learning_rate": 0.0004966466114881152, "loss": 5.4052, "mean_token_accuracy": 0.1739303633570671, "num_tokens": 13832040.0, "step": 7495 }, { "entropy": 5.735798263549805, "epoch": 0.630119722747322, "grad_norm": 0.90625, "learning_rate": 0.0004966414585643925, "loss": 5.6088, "mean_token_accuracy": 0.16045339405536652, "num_tokens": 13841874.0, "step": 7500 }, { "entropy": 5.665113925933838, "epoch": 0.6305398025624869, "grad_norm": 0.97265625, "learning_rate": 0.0004966363017144055, "loss": 5.4215, "mean_token_accuracy": 0.17605502754449845, "num_tokens": 13850755.0, "step": 7505 }, { "entropy": 5.679509687423706, "epoch": 0.6309598823776518, "grad_norm": 0.93359375, "learning_rate": 0.0004966311409382455, "loss": 5.4801, "mean_token_accuracy": 0.16642314195632935, "num_tokens": 13860009.0, "step": 7510 }, { "entropy": 5.653228378295898, "epoch": 0.6313799621928167, "grad_norm": 1.0546875, "learning_rate": 0.0004966259762360039, "loss": 5.4039, "mean_token_accuracy": 0.17477345317602158, "num_tokens": 13868476.0, "step": 7515 }, { "entropy": 5.598419427871704, "epoch": 0.6318000420079816, "grad_norm": 0.92578125, "learning_rate": 0.0004966208076077723, "loss": 5.4152, "mean_token_accuracy": 0.17099131792783737, "num_tokens": 13877367.0, "step": 7520 }, { "entropy": 5.699641418457031, "epoch": 0.6322201218231464, "grad_norm": 0.98828125, "learning_rate": 0.0004966156350536422, "loss": 5.5002, "mean_token_accuracy": 0.1632213681936264, "num_tokens": 13885985.0, "step": 7525 }, { "entropy": 5.636166906356811, "epoch": 0.6326402016383113, "grad_norm": 0.9296875, "learning_rate": 0.0004966104585737054, "loss": 5.4183, "mean_token_accuracy": 0.17092742025852203, "num_tokens": 13895059.0, "step": 7530 }, { "entropy": 5.690598201751709, "epoch": 0.6330602814534761, "grad_norm": 0.9453125, "learning_rate": 0.0004966052781680534, "loss": 5.4839, "mean_token_accuracy": 0.16899570524692537, "num_tokens": 13903789.0, "step": 7535 }, { "entropy": 5.756844615936279, "epoch": 0.633480361268641, "grad_norm": 0.97265625, "learning_rate": 0.0004966000938367778, "loss": 5.4591, "mean_token_accuracy": 0.16894406527280809, "num_tokens": 13913377.0, "step": 7540 }, { "entropy": 5.608310413360596, "epoch": 0.6339004410838059, "grad_norm": 0.9453125, "learning_rate": 0.0004965949055799708, "loss": 5.4127, "mean_token_accuracy": 0.18185660988092422, "num_tokens": 13922141.0, "step": 7545 }, { "entropy": 5.723646020889282, "epoch": 0.6343205208989708, "grad_norm": 1.03125, "learning_rate": 0.0004965897133977241, "loss": 5.4807, "mean_token_accuracy": 0.16371672451496125, "num_tokens": 13930717.0, "step": 7550 }, { "entropy": 5.771508407592774, "epoch": 0.6347406007141357, "grad_norm": 0.94140625, "learning_rate": 0.0004965845172901298, "loss": 5.5515, "mean_token_accuracy": 0.16789867728948593, "num_tokens": 13940344.0, "step": 7555 }, { "entropy": 5.700416374206543, "epoch": 0.6351606805293005, "grad_norm": 1.0625, "learning_rate": 0.0004965793172572798, "loss": 5.4076, "mean_token_accuracy": 0.1729632467031479, "num_tokens": 13948400.0, "step": 7560 }, { "entropy": 5.65832405090332, "epoch": 0.6355807603444654, "grad_norm": 0.921875, "learning_rate": 0.0004965741132992663, "loss": 5.5048, "mean_token_accuracy": 0.16236102432012559, "num_tokens": 13957939.0, "step": 7565 }, { "entropy": 5.741348743438721, "epoch": 0.6360008401596303, "grad_norm": 0.90625, "learning_rate": 0.0004965689054161814, "loss": 5.4767, "mean_token_accuracy": 0.17106067687273024, "num_tokens": 13966943.0, "step": 7570 }, { "entropy": 5.667167472839355, "epoch": 0.6364209199747952, "grad_norm": 0.96484375, "learning_rate": 0.0004965636936081176, "loss": 5.4057, "mean_token_accuracy": 0.16865545958280564, "num_tokens": 13975850.0, "step": 7575 }, { "entropy": 5.769311952590942, "epoch": 0.6368409997899601, "grad_norm": 1.03125, "learning_rate": 0.000496558477875167, "loss": 5.4734, "mean_token_accuracy": 0.17367706149816514, "num_tokens": 13985059.0, "step": 7580 }, { "entropy": 5.775439119338989, "epoch": 0.637261079605125, "grad_norm": 0.94921875, "learning_rate": 0.000496553258217422, "loss": 5.5413, "mean_token_accuracy": 0.1580244779586792, "num_tokens": 13993571.0, "step": 7585 }, { "entropy": 5.728666591644287, "epoch": 0.6376811594202898, "grad_norm": 0.96875, "learning_rate": 0.0004965480346349751, "loss": 5.5175, "mean_token_accuracy": 0.16710771322250367, "num_tokens": 14002326.0, "step": 7590 }, { "entropy": 5.852581930160523, "epoch": 0.6381012392354547, "grad_norm": 0.91796875, "learning_rate": 0.000496542807127919, "loss": 5.661, "mean_token_accuracy": 0.16399567797780037, "num_tokens": 14012002.0, "step": 7595 }, { "entropy": 5.758628559112549, "epoch": 0.6385213190506196, "grad_norm": 0.95703125, "learning_rate": 0.000496537575696346, "loss": 5.5335, "mean_token_accuracy": 0.16143829822540284, "num_tokens": 14022085.0, "step": 7600 }, { "entropy": 5.683157110214234, "epoch": 0.6389413988657845, "grad_norm": 0.9765625, "learning_rate": 0.0004965323403403488, "loss": 5.4223, "mean_token_accuracy": 0.1673789069056511, "num_tokens": 14030706.0, "step": 7605 }, { "entropy": 5.625469160079956, "epoch": 0.6393614786809494, "grad_norm": 0.91015625, "learning_rate": 0.0004965271010600205, "loss": 5.4607, "mean_token_accuracy": 0.1712944433093071, "num_tokens": 14039520.0, "step": 7610 }, { "entropy": 5.717896509170532, "epoch": 0.6397815584961143, "grad_norm": 0.96875, "learning_rate": 0.0004965218578554535, "loss": 5.5437, "mean_token_accuracy": 0.16942658126354218, "num_tokens": 14048407.0, "step": 7615 }, { "entropy": 5.655859279632568, "epoch": 0.6402016383112792, "grad_norm": 1.03125, "learning_rate": 0.000496516610726741, "loss": 5.467, "mean_token_accuracy": 0.17381453812122344, "num_tokens": 14057534.0, "step": 7620 }, { "entropy": 5.645468664169312, "epoch": 0.640621718126444, "grad_norm": 0.97265625, "learning_rate": 0.0004965113596739759, "loss": 5.4169, "mean_token_accuracy": 0.17614233940839769, "num_tokens": 14065992.0, "step": 7625 }, { "entropy": 5.627894401550293, "epoch": 0.6410417979416089, "grad_norm": 1.0078125, "learning_rate": 0.0004965061046972508, "loss": 5.4111, "mean_token_accuracy": 0.16821854412555695, "num_tokens": 14074806.0, "step": 7630 }, { "entropy": 5.659251022338867, "epoch": 0.6414618777567738, "grad_norm": 0.90625, "learning_rate": 0.0004965008457966594, "loss": 5.4789, "mean_token_accuracy": 0.16432067304849624, "num_tokens": 14083813.0, "step": 7635 }, { "entropy": 5.681667470932007, "epoch": 0.6418819575719387, "grad_norm": 0.984375, "learning_rate": 0.0004964955829722945, "loss": 5.4099, "mean_token_accuracy": 0.17027026712894439, "num_tokens": 14092193.0, "step": 7640 }, { "entropy": 5.813520383834839, "epoch": 0.6423020373871036, "grad_norm": 1.0078125, "learning_rate": 0.0004964903162242493, "loss": 5.6342, "mean_token_accuracy": 0.15789156556129455, "num_tokens": 14102797.0, "step": 7645 }, { "entropy": 5.684490537643432, "epoch": 0.6427221172022685, "grad_norm": 0.921875, "learning_rate": 0.0004964850455526173, "loss": 5.4773, "mean_token_accuracy": 0.17116947323083878, "num_tokens": 14112226.0, "step": 7650 }, { "entropy": 5.610788440704345, "epoch": 0.6431421970174334, "grad_norm": 1.0, "learning_rate": 0.0004964797709574917, "loss": 5.4149, "mean_token_accuracy": 0.16700370907783507, "num_tokens": 14121775.0, "step": 7655 }, { "entropy": 5.635930681228638, "epoch": 0.6435622768325981, "grad_norm": 0.91796875, "learning_rate": 0.000496474492438966, "loss": 5.4038, "mean_token_accuracy": 0.17071498185396194, "num_tokens": 14130415.0, "step": 7660 }, { "entropy": 5.693044376373291, "epoch": 0.643982356647763, "grad_norm": 0.890625, "learning_rate": 0.0004964692099971338, "loss": 5.4429, "mean_token_accuracy": 0.17001585066318511, "num_tokens": 14140204.0, "step": 7665 }, { "entropy": 5.683203983306885, "epoch": 0.6444024364629279, "grad_norm": 0.95703125, "learning_rate": 0.0004964639236320885, "loss": 5.378, "mean_token_accuracy": 0.16871996819972992, "num_tokens": 14149595.0, "step": 7670 }, { "entropy": 5.608165884017945, "epoch": 0.6448225162780928, "grad_norm": 0.94140625, "learning_rate": 0.0004964586333439239, "loss": 5.4553, "mean_token_accuracy": 0.1665859803557396, "num_tokens": 14158865.0, "step": 7675 }, { "entropy": 5.660884809494019, "epoch": 0.6452425960932577, "grad_norm": 1.03125, "learning_rate": 0.0004964533391327335, "loss": 5.4102, "mean_token_accuracy": 0.1763300195336342, "num_tokens": 14167962.0, "step": 7680 }, { "entropy": 5.667941331863403, "epoch": 0.6456626759084226, "grad_norm": 1.0390625, "learning_rate": 0.0004964480409986113, "loss": 5.4635, "mean_token_accuracy": 0.1680685743689537, "num_tokens": 14176479.0, "step": 7685 }, { "entropy": 5.753057384490967, "epoch": 0.6460827557235875, "grad_norm": 1.0234375, "learning_rate": 0.0004964427389416512, "loss": 5.4711, "mean_token_accuracy": 0.1684321254491806, "num_tokens": 14185408.0, "step": 7690 }, { "entropy": 5.6439416885375975, "epoch": 0.6465028355387523, "grad_norm": 1.046875, "learning_rate": 0.000496437432961947, "loss": 5.4766, "mean_token_accuracy": 0.17091822624206543, "num_tokens": 14194155.0, "step": 7695 }, { "entropy": 5.633873081207275, "epoch": 0.6469229153539172, "grad_norm": 0.96875, "learning_rate": 0.0004964321230595925, "loss": 5.5054, "mean_token_accuracy": 0.1647911474108696, "num_tokens": 14202779.0, "step": 7700 }, { "entropy": 5.800021934509277, "epoch": 0.6473429951690821, "grad_norm": 0.88671875, "learning_rate": 0.0004964268092346821, "loss": 5.69, "mean_token_accuracy": 0.15756986886262894, "num_tokens": 14212552.0, "step": 7705 }, { "entropy": 5.822913599014282, "epoch": 0.647763074984247, "grad_norm": 0.8984375, "learning_rate": 0.0004964214914873098, "loss": 5.4764, "mean_token_accuracy": 0.16087636500597, "num_tokens": 14222783.0, "step": 7710 }, { "entropy": 5.634449625015259, "epoch": 0.6481831547994119, "grad_norm": 1.0, "learning_rate": 0.0004964161698175697, "loss": 5.358, "mean_token_accuracy": 0.16693367213010787, "num_tokens": 14232085.0, "step": 7715 }, { "entropy": 5.668655920028686, "epoch": 0.6486032346145768, "grad_norm": 0.96484375, "learning_rate": 0.0004964108442255562, "loss": 5.5381, "mean_token_accuracy": 0.16392049193382263, "num_tokens": 14241969.0, "step": 7720 }, { "entropy": 5.647701263427734, "epoch": 0.6490233144297417, "grad_norm": 1.1015625, "learning_rate": 0.0004964055147113637, "loss": 5.4328, "mean_token_accuracy": 0.17618952840566635, "num_tokens": 14251012.0, "step": 7725 }, { "entropy": 5.772505426406861, "epoch": 0.6494433942449065, "grad_norm": 1.09375, "learning_rate": 0.0004964001812750864, "loss": 5.5328, "mean_token_accuracy": 0.1665820762515068, "num_tokens": 14261110.0, "step": 7730 }, { "entropy": 5.718127250671387, "epoch": 0.6498634740600714, "grad_norm": 0.98828125, "learning_rate": 0.000496394843916819, "loss": 5.5204, "mean_token_accuracy": 0.16651073694229127, "num_tokens": 14270869.0, "step": 7735 }, { "entropy": 5.704468631744385, "epoch": 0.6502835538752363, "grad_norm": 0.953125, "learning_rate": 0.0004963895026366558, "loss": 5.4869, "mean_token_accuracy": 0.1666564702987671, "num_tokens": 14279607.0, "step": 7740 }, { "entropy": 5.662124490737915, "epoch": 0.6507036336904012, "grad_norm": 0.9140625, "learning_rate": 0.0004963841574346917, "loss": 5.4635, "mean_token_accuracy": 0.16543798744678498, "num_tokens": 14289282.0, "step": 7745 }, { "entropy": 5.6414776802062985, "epoch": 0.6511237135055661, "grad_norm": 0.9609375, "learning_rate": 0.0004963788083110212, "loss": 5.3949, "mean_token_accuracy": 0.17210839688777924, "num_tokens": 14298658.0, "step": 7750 }, { "entropy": 5.771036195755005, "epoch": 0.651543793320731, "grad_norm": 0.91796875, "learning_rate": 0.000496373455265739, "loss": 5.486, "mean_token_accuracy": 0.16423814594745637, "num_tokens": 14307832.0, "step": 7755 }, { "entropy": 5.679357814788818, "epoch": 0.6519638731358958, "grad_norm": 0.921875, "learning_rate": 0.0004963680982989402, "loss": 5.3936, "mean_token_accuracy": 0.17518658488988875, "num_tokens": 14317122.0, "step": 7760 }, { "entropy": 5.649253225326538, "epoch": 0.6523839529510607, "grad_norm": 1.0, "learning_rate": 0.0004963627374107195, "loss": 5.4302, "mean_token_accuracy": 0.17149852067232133, "num_tokens": 14326069.0, "step": 7765 }, { "entropy": 5.62903995513916, "epoch": 0.6528040327662256, "grad_norm": 0.94140625, "learning_rate": 0.0004963573726011717, "loss": 5.4428, "mean_token_accuracy": 0.17162297070026397, "num_tokens": 14335260.0, "step": 7770 }, { "entropy": 5.761579751968384, "epoch": 0.6532241125813905, "grad_norm": 0.97265625, "learning_rate": 0.0004963520038703922, "loss": 5.5357, "mean_token_accuracy": 0.1569953978061676, "num_tokens": 14345823.0, "step": 7775 }, { "entropy": 5.697815990447998, "epoch": 0.6536441923965554, "grad_norm": 1.0390625, "learning_rate": 0.000496346631218476, "loss": 5.4087, "mean_token_accuracy": 0.1733380824327469, "num_tokens": 14354316.0, "step": 7780 }, { "entropy": 5.62894639968872, "epoch": 0.6540642722117203, "grad_norm": 0.92578125, "learning_rate": 0.000496341254645518, "loss": 5.4553, "mean_token_accuracy": 0.17333490997552872, "num_tokens": 14364539.0, "step": 7785 }, { "entropy": 5.681260681152343, "epoch": 0.6544843520268852, "grad_norm": 0.88671875, "learning_rate": 0.0004963358741516138, "loss": 5.5558, "mean_token_accuracy": 0.16200231909751892, "num_tokens": 14374081.0, "step": 7790 }, { "entropy": 5.70987491607666, "epoch": 0.6549044318420499, "grad_norm": 0.90625, "learning_rate": 0.0004963304897368585, "loss": 5.4557, "mean_token_accuracy": 0.16147168278694152, "num_tokens": 14383255.0, "step": 7795 }, { "entropy": 5.78820161819458, "epoch": 0.6553245116572148, "grad_norm": 1.1015625, "learning_rate": 0.0004963251014013475, "loss": 5.5896, "mean_token_accuracy": 0.1648208513855934, "num_tokens": 14392417.0, "step": 7800 }, { "entropy": 5.850169658660889, "epoch": 0.6557445914723797, "grad_norm": 1.140625, "learning_rate": 0.0004963197091451763, "loss": 5.6177, "mean_token_accuracy": 0.15744412541389466, "num_tokens": 14401899.0, "step": 7805 }, { "entropy": 5.796338748931885, "epoch": 0.6561646712875446, "grad_norm": 0.984375, "learning_rate": 0.0004963143129684405, "loss": 5.5729, "mean_token_accuracy": 0.1602293811738491, "num_tokens": 14411245.0, "step": 7810 }, { "entropy": 5.653019428253174, "epoch": 0.6565847511027095, "grad_norm": 1.0625, "learning_rate": 0.0004963089128712355, "loss": 5.4488, "mean_token_accuracy": 0.17309630364179612, "num_tokens": 14419710.0, "step": 7815 }, { "entropy": 5.627862644195557, "epoch": 0.6570048309178744, "grad_norm": 0.88671875, "learning_rate": 0.0004963035088536571, "loss": 5.4218, "mean_token_accuracy": 0.17990072220563888, "num_tokens": 14430266.0, "step": 7820 }, { "entropy": 5.702354001998901, "epoch": 0.6574249107330393, "grad_norm": 0.9140625, "learning_rate": 0.0004962981009158012, "loss": 5.3956, "mean_token_accuracy": 0.16377443671226502, "num_tokens": 14439515.0, "step": 7825 }, { "entropy": 5.699529790878296, "epoch": 0.6578449905482041, "grad_norm": 1.15625, "learning_rate": 0.0004962926890577632, "loss": 5.4635, "mean_token_accuracy": 0.17154118418693542, "num_tokens": 14448091.0, "step": 7830 }, { "entropy": 5.681157159805298, "epoch": 0.658265070363369, "grad_norm": 0.94140625, "learning_rate": 0.000496287273279639, "loss": 5.4892, "mean_token_accuracy": 0.1665187358856201, "num_tokens": 14457744.0, "step": 7835 }, { "entropy": 5.740079164505005, "epoch": 0.6586851501785339, "grad_norm": 0.953125, "learning_rate": 0.000496281853581525, "loss": 5.4732, "mean_token_accuracy": 0.17109596878290176, "num_tokens": 14467597.0, "step": 7840 }, { "entropy": 5.694699621200561, "epoch": 0.6591052299936988, "grad_norm": 1.015625, "learning_rate": 0.0004962764299635168, "loss": 5.4526, "mean_token_accuracy": 0.17279575616121293, "num_tokens": 14476662.0, "step": 7845 }, { "entropy": 5.751010799407959, "epoch": 0.6595253098088637, "grad_norm": 0.93359375, "learning_rate": 0.0004962710024257105, "loss": 5.5324, "mean_token_accuracy": 0.1658242180943489, "num_tokens": 14486583.0, "step": 7850 }, { "entropy": 5.749286460876465, "epoch": 0.6599453896240286, "grad_norm": 0.94140625, "learning_rate": 0.0004962655709682025, "loss": 5.5343, "mean_token_accuracy": 0.16569894403219224, "num_tokens": 14496528.0, "step": 7855 }, { "entropy": 5.736885070800781, "epoch": 0.6603654694391935, "grad_norm": 0.8671875, "learning_rate": 0.0004962601355910887, "loss": 5.5294, "mean_token_accuracy": 0.16236354857683183, "num_tokens": 14507026.0, "step": 7860 }, { "entropy": 5.597323274612426, "epoch": 0.6607855492543583, "grad_norm": 0.91015625, "learning_rate": 0.0004962546962944656, "loss": 5.3851, "mean_token_accuracy": 0.17176171392202377, "num_tokens": 14516480.0, "step": 7865 }, { "entropy": 5.6523455619812015, "epoch": 0.6612056290695232, "grad_norm": 0.97265625, "learning_rate": 0.0004962492530784295, "loss": 5.3455, "mean_token_accuracy": 0.18076795786619188, "num_tokens": 14525068.0, "step": 7870 }, { "entropy": 5.65107307434082, "epoch": 0.6616257088846881, "grad_norm": 0.921875, "learning_rate": 0.0004962438059430768, "loss": 5.4659, "mean_token_accuracy": 0.17110495269298553, "num_tokens": 14534441.0, "step": 7875 }, { "entropy": 5.705241060256958, "epoch": 0.662045788699853, "grad_norm": 0.9765625, "learning_rate": 0.0004962383548885039, "loss": 5.5629, "mean_token_accuracy": 0.16295325756072998, "num_tokens": 14543026.0, "step": 7880 }, { "entropy": 5.681460857391357, "epoch": 0.6624658685150179, "grad_norm": 0.98046875, "learning_rate": 0.0004962328999148075, "loss": 5.4079, "mean_token_accuracy": 0.17630907893180847, "num_tokens": 14552068.0, "step": 7885 }, { "entropy": 5.707177734375, "epoch": 0.6628859483301828, "grad_norm": 0.96875, "learning_rate": 0.0004962274410220842, "loss": 5.5553, "mean_token_accuracy": 0.16349743753671647, "num_tokens": 14561587.0, "step": 7890 }, { "entropy": 5.7315949440002445, "epoch": 0.6633060281453477, "grad_norm": 0.9453125, "learning_rate": 0.0004962219782104308, "loss": 5.5674, "mean_token_accuracy": 0.17078642100095748, "num_tokens": 14571020.0, "step": 7895 }, { "entropy": 5.748089981079102, "epoch": 0.6637261079605125, "grad_norm": 0.9765625, "learning_rate": 0.0004962165114799439, "loss": 5.5219, "mean_token_accuracy": 0.1581498920917511, "num_tokens": 14580638.0, "step": 7900 }, { "entropy": 5.684006977081299, "epoch": 0.6641461877756774, "grad_norm": 0.87890625, "learning_rate": 0.0004962110408307204, "loss": 5.4244, "mean_token_accuracy": 0.16590397357940673, "num_tokens": 14590173.0, "step": 7905 }, { "entropy": 5.631041860580444, "epoch": 0.6645662675908423, "grad_norm": 0.9609375, "learning_rate": 0.0004962055662628571, "loss": 5.4432, "mean_token_accuracy": 0.17124811559915543, "num_tokens": 14598635.0, "step": 7910 }, { "entropy": 5.715653800964356, "epoch": 0.6649863474060071, "grad_norm": 1.0, "learning_rate": 0.0004962000877764513, "loss": 5.4518, "mean_token_accuracy": 0.1737048864364624, "num_tokens": 14607233.0, "step": 7915 }, { "entropy": 5.8200788497924805, "epoch": 0.665406427221172, "grad_norm": 1.078125, "learning_rate": 0.0004961946053715998, "loss": 5.6133, "mean_token_accuracy": 0.1551756888628006, "num_tokens": 14617483.0, "step": 7920 }, { "entropy": 5.674804592132569, "epoch": 0.665826507036337, "grad_norm": 1.078125, "learning_rate": 0.0004961891190483997, "loss": 5.4394, "mean_token_accuracy": 0.1664857968688011, "num_tokens": 14625805.0, "step": 7925 }, { "entropy": 5.6124043464660645, "epoch": 0.6662465868515017, "grad_norm": 0.98046875, "learning_rate": 0.0004961836288069483, "loss": 5.3569, "mean_token_accuracy": 0.17260607928037644, "num_tokens": 14634605.0, "step": 7930 }, { "entropy": 5.724435472488404, "epoch": 0.6666666666666666, "grad_norm": 0.93359375, "learning_rate": 0.0004961781346473428, "loss": 5.5688, "mean_token_accuracy": 0.15710362046957016, "num_tokens": 14644970.0, "step": 7935 }, { "entropy": 5.721081972122192, "epoch": 0.6670867464818315, "grad_norm": 0.8515625, "learning_rate": 0.0004961726365696805, "loss": 5.4484, "mean_token_accuracy": 0.1679193213582039, "num_tokens": 14655043.0, "step": 7940 }, { "entropy": 5.728677034378052, "epoch": 0.6675068262969964, "grad_norm": 0.94921875, "learning_rate": 0.0004961671345740589, "loss": 5.4375, "mean_token_accuracy": 0.16690576076507568, "num_tokens": 14663994.0, "step": 7945 }, { "entropy": 5.65096526145935, "epoch": 0.6679269061121613, "grad_norm": 0.890625, "learning_rate": 0.0004961616286605753, "loss": 5.4334, "mean_token_accuracy": 0.15957258641719818, "num_tokens": 14674101.0, "step": 7950 }, { "entropy": 5.669146871566772, "epoch": 0.6683469859273262, "grad_norm": 0.921875, "learning_rate": 0.0004961561188293273, "loss": 5.5061, "mean_token_accuracy": 0.16300352662801743, "num_tokens": 14684156.0, "step": 7955 }, { "entropy": 5.617530393600464, "epoch": 0.6687670657424911, "grad_norm": 0.953125, "learning_rate": 0.0004961506050804126, "loss": 5.4206, "mean_token_accuracy": 0.17437569051980972, "num_tokens": 14693223.0, "step": 7960 }, { "entropy": 5.734630155563354, "epoch": 0.6691871455576559, "grad_norm": 0.9296875, "learning_rate": 0.000496145087413929, "loss": 5.4317, "mean_token_accuracy": 0.16657862663269044, "num_tokens": 14702959.0, "step": 7965 }, { "entropy": 5.7842125415802, "epoch": 0.6696072253728208, "grad_norm": 1.0390625, "learning_rate": 0.0004961395658299737, "loss": 5.5569, "mean_token_accuracy": 0.16241346150636674, "num_tokens": 14712146.0, "step": 7970 }, { "entropy": 5.694394731521607, "epoch": 0.6700273051879857, "grad_norm": 0.97265625, "learning_rate": 0.0004961340403286451, "loss": 5.467, "mean_token_accuracy": 0.16332777589559555, "num_tokens": 14721932.0, "step": 7975 }, { "entropy": 5.655285358428955, "epoch": 0.6704473850031506, "grad_norm": 0.96875, "learning_rate": 0.0004961285109100408, "loss": 5.3965, "mean_token_accuracy": 0.173796084523201, "num_tokens": 14731080.0, "step": 7980 }, { "entropy": 5.561992931365967, "epoch": 0.6708674648183155, "grad_norm": 1.015625, "learning_rate": 0.0004961229775742587, "loss": 5.3988, "mean_token_accuracy": 0.1744252011179924, "num_tokens": 14740057.0, "step": 7985 }, { "entropy": 5.722016334533691, "epoch": 0.6712875446334804, "grad_norm": 1.0546875, "learning_rate": 0.000496117440321397, "loss": 5.471, "mean_token_accuracy": 0.1750819519162178, "num_tokens": 14748399.0, "step": 7990 }, { "entropy": 5.704089832305908, "epoch": 0.6717076244486453, "grad_norm": 1.03125, "learning_rate": 0.0004961118991515537, "loss": 5.4962, "mean_token_accuracy": 0.16623101234436036, "num_tokens": 14757215.0, "step": 7995 }, { "entropy": 5.635396480560303, "epoch": 0.6721277042638101, "grad_norm": 0.9921875, "learning_rate": 0.000496106354064827, "loss": 5.4871, "mean_token_accuracy": 0.17475164234638213, "num_tokens": 14766191.0, "step": 8000 }, { "entropy": 5.778530263900757, "epoch": 0.672547784078975, "grad_norm": 0.90234375, "learning_rate": 0.0004961008050613149, "loss": 5.5478, "mean_token_accuracy": 0.16085838228464128, "num_tokens": 14775220.0, "step": 8005 }, { "entropy": 5.735023260116577, "epoch": 0.6729678638941399, "grad_norm": 1.0, "learning_rate": 0.0004960952521411161, "loss": 5.5139, "mean_token_accuracy": 0.16576410979032516, "num_tokens": 14784287.0, "step": 8010 }, { "entropy": 5.812222099304199, "epoch": 0.6733879437093048, "grad_norm": 0.9296875, "learning_rate": 0.0004960896953043287, "loss": 5.5705, "mean_token_accuracy": 0.16489047110080718, "num_tokens": 14794219.0, "step": 8015 }, { "entropy": 5.734062957763672, "epoch": 0.6738080235244697, "grad_norm": 1.0390625, "learning_rate": 0.0004960841345510511, "loss": 5.481, "mean_token_accuracy": 0.1697475478053093, "num_tokens": 14803324.0, "step": 8020 }, { "entropy": 5.720947408676148, "epoch": 0.6742281033396346, "grad_norm": 1.046875, "learning_rate": 0.000496078569881382, "loss": 5.4933, "mean_token_accuracy": 0.16686712205410004, "num_tokens": 14811963.0, "step": 8025 }, { "entropy": 5.6574663639068605, "epoch": 0.6746481831547995, "grad_norm": 1.046875, "learning_rate": 0.0004960730012954198, "loss": 5.442, "mean_token_accuracy": 0.16583069860935212, "num_tokens": 14821903.0, "step": 8030 }, { "entropy": 5.6191630363464355, "epoch": 0.6750682629699643, "grad_norm": 0.984375, "learning_rate": 0.0004960674287932634, "loss": 5.4474, "mean_token_accuracy": 0.16195174753665925, "num_tokens": 14831215.0, "step": 8035 }, { "entropy": 5.726200246810913, "epoch": 0.6754883427851291, "grad_norm": 0.97265625, "learning_rate": 0.0004960618523750111, "loss": 5.3643, "mean_token_accuracy": 0.17416994720697404, "num_tokens": 14840354.0, "step": 8040 }, { "entropy": 5.735062551498413, "epoch": 0.675908422600294, "grad_norm": 0.99609375, "learning_rate": 0.000496056272040762, "loss": 5.5518, "mean_token_accuracy": 0.1663343757390976, "num_tokens": 14849660.0, "step": 8045 }, { "entropy": 5.725352334976196, "epoch": 0.6763285024154589, "grad_norm": 0.94921875, "learning_rate": 0.0004960506877906149, "loss": 5.4372, "mean_token_accuracy": 0.1609252318739891, "num_tokens": 14859819.0, "step": 8050 }, { "entropy": 5.705269145965576, "epoch": 0.6767485822306238, "grad_norm": 0.95703125, "learning_rate": 0.0004960450996246686, "loss": 5.4611, "mean_token_accuracy": 0.16848595291376114, "num_tokens": 14869260.0, "step": 8055 }, { "entropy": 5.671531009674072, "epoch": 0.6771686620457887, "grad_norm": 0.99609375, "learning_rate": 0.0004960395075430222, "loss": 5.4232, "mean_token_accuracy": 0.16953370571136475, "num_tokens": 14878685.0, "step": 8060 }, { "entropy": 5.6622340202331545, "epoch": 0.6775887418609536, "grad_norm": 0.86328125, "learning_rate": 0.0004960339115457748, "loss": 5.4268, "mean_token_accuracy": 0.1659297153353691, "num_tokens": 14888456.0, "step": 8065 }, { "entropy": 5.7056151866912845, "epoch": 0.6780088216761184, "grad_norm": 1.0078125, "learning_rate": 0.0004960283116330255, "loss": 5.5355, "mean_token_accuracy": 0.16633066833019255, "num_tokens": 14897401.0, "step": 8070 }, { "entropy": 5.736236810684204, "epoch": 0.6784289014912833, "grad_norm": 0.93359375, "learning_rate": 0.0004960227078048735, "loss": 5.4573, "mean_token_accuracy": 0.16657501608133315, "num_tokens": 14906741.0, "step": 8075 }, { "entropy": 5.738412714004516, "epoch": 0.6788489813064482, "grad_norm": 0.9296875, "learning_rate": 0.0004960171000614179, "loss": 5.352, "mean_token_accuracy": 0.1779150739312172, "num_tokens": 14916002.0, "step": 8080 }, { "entropy": 5.5718223571777346, "epoch": 0.6792690611216131, "grad_norm": 1.046875, "learning_rate": 0.0004960114884027583, "loss": 5.293, "mean_token_accuracy": 0.18619335889816285, "num_tokens": 14925247.0, "step": 8085 }, { "entropy": 5.642387247085571, "epoch": 0.679689140936778, "grad_norm": 0.97265625, "learning_rate": 0.0004960058728289939, "loss": 5.4049, "mean_token_accuracy": 0.16639461666345595, "num_tokens": 14933925.0, "step": 8090 }, { "entropy": 5.771145582199097, "epoch": 0.6801092207519429, "grad_norm": 1.1015625, "learning_rate": 0.0004960002533402243, "loss": 5.4809, "mean_token_accuracy": 0.16957206577062606, "num_tokens": 14943368.0, "step": 8095 }, { "entropy": 5.723950052261353, "epoch": 0.6805293005671077, "grad_norm": 0.9375, "learning_rate": 0.0004959946299365491, "loss": 5.492, "mean_token_accuracy": 0.16733952164649962, "num_tokens": 14953710.0, "step": 8100 }, { "entropy": 5.733155155181885, "epoch": 0.6809493803822726, "grad_norm": 0.94140625, "learning_rate": 0.0004959890026180677, "loss": 5.5124, "mean_token_accuracy": 0.16363269835710526, "num_tokens": 14962814.0, "step": 8105 }, { "entropy": 5.602628993988037, "epoch": 0.6813694601974375, "grad_norm": 0.9296875, "learning_rate": 0.00049598337138488, "loss": 5.3951, "mean_token_accuracy": 0.1761382609605789, "num_tokens": 14971631.0, "step": 8110 }, { "entropy": 5.703770446777344, "epoch": 0.6817895400126024, "grad_norm": 1.0234375, "learning_rate": 0.0004959777362370855, "loss": 5.3987, "mean_token_accuracy": 0.17302963733673096, "num_tokens": 14980528.0, "step": 8115 }, { "entropy": 5.681969165802002, "epoch": 0.6822096198277673, "grad_norm": 0.953125, "learning_rate": 0.0004959720971747843, "loss": 5.4208, "mean_token_accuracy": 0.17004417777061462, "num_tokens": 14989331.0, "step": 8120 }, { "entropy": 5.671417903900147, "epoch": 0.6826296996429322, "grad_norm": 0.9609375, "learning_rate": 0.0004959664541980762, "loss": 5.4188, "mean_token_accuracy": 0.17401942908763884, "num_tokens": 14999403.0, "step": 8125 }, { "entropy": 5.710601329803467, "epoch": 0.6830497794580971, "grad_norm": 0.98046875, "learning_rate": 0.0004959608073070612, "loss": 5.5114, "mean_token_accuracy": 0.16626310646533965, "num_tokens": 15009388.0, "step": 8130 }, { "entropy": 5.728013801574707, "epoch": 0.6834698592732619, "grad_norm": 0.9609375, "learning_rate": 0.0004959551565018392, "loss": 5.4168, "mean_token_accuracy": 0.17476363331079484, "num_tokens": 15018586.0, "step": 8135 }, { "entropy": 5.666727352142334, "epoch": 0.6838899390884268, "grad_norm": 0.9375, "learning_rate": 0.0004959495017825104, "loss": 5.4379, "mean_token_accuracy": 0.1766646295785904, "num_tokens": 15027982.0, "step": 8140 }, { "entropy": 5.610442161560059, "epoch": 0.6843100189035917, "grad_norm": 0.9765625, "learning_rate": 0.0004959438431491749, "loss": 5.4186, "mean_token_accuracy": 0.1739590048789978, "num_tokens": 15037103.0, "step": 8145 }, { "entropy": 5.644532632827759, "epoch": 0.6847300987187566, "grad_norm": 0.91015625, "learning_rate": 0.000495938180601933, "loss": 5.5204, "mean_token_accuracy": 0.17002057135105134, "num_tokens": 15046739.0, "step": 8150 }, { "entropy": 5.745840978622437, "epoch": 0.6851501785339215, "grad_norm": 0.97265625, "learning_rate": 0.0004959325141408851, "loss": 5.4676, "mean_token_accuracy": 0.1691015213727951, "num_tokens": 15056586.0, "step": 8155 }, { "entropy": 5.6763612747192385, "epoch": 0.6855702583490864, "grad_norm": 1.0078125, "learning_rate": 0.0004959268437661313, "loss": 5.449, "mean_token_accuracy": 0.1687142327427864, "num_tokens": 15066622.0, "step": 8160 }, { "entropy": 5.678127193450928, "epoch": 0.6859903381642513, "grad_norm": 1.1640625, "learning_rate": 0.0004959211694777724, "loss": 5.4139, "mean_token_accuracy": 0.17046130895614625, "num_tokens": 15075415.0, "step": 8165 }, { "entropy": 5.642941427230835, "epoch": 0.686410417979416, "grad_norm": 0.90234375, "learning_rate": 0.0004959154912759086, "loss": 5.41, "mean_token_accuracy": 0.16956793367862702, "num_tokens": 15085087.0, "step": 8170 }, { "entropy": 5.674217653274536, "epoch": 0.6868304977945809, "grad_norm": 1.015625, "learning_rate": 0.0004959098091606406, "loss": 5.4276, "mean_token_accuracy": 0.17282803803682328, "num_tokens": 15093580.0, "step": 8175 }, { "entropy": 5.6009931564331055, "epoch": 0.6872505776097458, "grad_norm": 1.1328125, "learning_rate": 0.0004959041231320692, "loss": 5.4085, "mean_token_accuracy": 0.1755705252289772, "num_tokens": 15104033.0, "step": 8180 }, { "entropy": 5.684960508346558, "epoch": 0.6876706574249107, "grad_norm": 1.078125, "learning_rate": 0.0004958984331902951, "loss": 5.4874, "mean_token_accuracy": 0.16569405645132065, "num_tokens": 15113164.0, "step": 8185 }, { "entropy": 5.652985095977783, "epoch": 0.6880907372400756, "grad_norm": 0.9765625, "learning_rate": 0.0004958927393354188, "loss": 5.4253, "mean_token_accuracy": 0.1720282956957817, "num_tokens": 15122215.0, "step": 8190 }, { "entropy": 5.695438480377197, "epoch": 0.6885108170552405, "grad_norm": 0.99609375, "learning_rate": 0.0004958870415675415, "loss": 5.4113, "mean_token_accuracy": 0.1668440580368042, "num_tokens": 15130877.0, "step": 8195 }, { "entropy": 5.642064094543457, "epoch": 0.6889308968704054, "grad_norm": 0.98046875, "learning_rate": 0.0004958813398867639, "loss": 5.395, "mean_token_accuracy": 0.1738823726773262, "num_tokens": 15140227.0, "step": 8200 }, { "entropy": 5.766198778152466, "epoch": 0.6893509766855702, "grad_norm": 0.9375, "learning_rate": 0.0004958756342931872, "loss": 5.5718, "mean_token_accuracy": 0.16096531748771667, "num_tokens": 15150006.0, "step": 8205 }, { "entropy": 5.710210561752319, "epoch": 0.6897710565007351, "grad_norm": 0.94140625, "learning_rate": 0.0004958699247869122, "loss": 5.4481, "mean_token_accuracy": 0.17095823884010314, "num_tokens": 15160032.0, "step": 8210 }, { "entropy": 5.657556676864624, "epoch": 0.6901911363159, "grad_norm": 0.9140625, "learning_rate": 0.0004958642113680404, "loss": 5.4142, "mean_token_accuracy": 0.169447460770607, "num_tokens": 15168966.0, "step": 8215 }, { "entropy": 5.795995044708252, "epoch": 0.6906112161310649, "grad_norm": 1.1484375, "learning_rate": 0.0004958584940366727, "loss": 5.5844, "mean_token_accuracy": 0.16470508724451066, "num_tokens": 15179337.0, "step": 8220 }, { "entropy": 5.751201152801514, "epoch": 0.6910312959462298, "grad_norm": 0.921875, "learning_rate": 0.0004958527727929106, "loss": 5.4862, "mean_token_accuracy": 0.16709066778421403, "num_tokens": 15188395.0, "step": 8225 }, { "entropy": 5.686253881454467, "epoch": 0.6914513757613947, "grad_norm": 0.91796875, "learning_rate": 0.0004958470476368552, "loss": 5.405, "mean_token_accuracy": 0.17547234743833542, "num_tokens": 15198669.0, "step": 8230 }, { "entropy": 5.664592313766479, "epoch": 0.6918714555765595, "grad_norm": 1.03125, "learning_rate": 0.0004958413185686082, "loss": 5.4306, "mean_token_accuracy": 0.17083754986524582, "num_tokens": 15207371.0, "step": 8235 }, { "entropy": 5.721945762634277, "epoch": 0.6922915353917244, "grad_norm": 1.0078125, "learning_rate": 0.0004958355855882709, "loss": 5.4682, "mean_token_accuracy": 0.16971587836742402, "num_tokens": 15215694.0, "step": 8240 }, { "entropy": 5.71007285118103, "epoch": 0.6927116152068893, "grad_norm": 0.95703125, "learning_rate": 0.000495829848695945, "loss": 5.4325, "mean_token_accuracy": 0.16733045727014542, "num_tokens": 15224963.0, "step": 8245 }, { "entropy": 5.581372213363648, "epoch": 0.6931316950220542, "grad_norm": 0.97265625, "learning_rate": 0.000495824107891732, "loss": 5.2622, "mean_token_accuracy": 0.17676324248313904, "num_tokens": 15233569.0, "step": 8250 }, { "entropy": 5.66594181060791, "epoch": 0.6935517748372191, "grad_norm": 0.98046875, "learning_rate": 0.0004958183631757336, "loss": 5.4461, "mean_token_accuracy": 0.16882045865058898, "num_tokens": 15242671.0, "step": 8255 }, { "entropy": 5.650574827194214, "epoch": 0.693971854652384, "grad_norm": 0.9453125, "learning_rate": 0.0004958126145480517, "loss": 5.402, "mean_token_accuracy": 0.17607998102903366, "num_tokens": 15251698.0, "step": 8260 }, { "entropy": 5.7385705471038815, "epoch": 0.6943919344675489, "grad_norm": 1.0390625, "learning_rate": 0.0004958068620087879, "loss": 5.5143, "mean_token_accuracy": 0.16893487125635148, "num_tokens": 15260608.0, "step": 8265 }, { "entropy": 5.675213384628296, "epoch": 0.6948120142827137, "grad_norm": 0.97265625, "learning_rate": 0.0004958011055580443, "loss": 5.3836, "mean_token_accuracy": 0.1763719156384468, "num_tokens": 15268866.0, "step": 8270 }, { "entropy": 5.606337022781372, "epoch": 0.6952320940978786, "grad_norm": 1.0, "learning_rate": 0.0004957953451959229, "loss": 5.3531, "mean_token_accuracy": 0.18196955025196077, "num_tokens": 15277600.0, "step": 8275 }, { "entropy": 5.608048725128174, "epoch": 0.6956521739130435, "grad_norm": 0.90625, "learning_rate": 0.0004957895809225254, "loss": 5.3712, "mean_token_accuracy": 0.1749396950006485, "num_tokens": 15286016.0, "step": 8280 }, { "entropy": 5.654298305511475, "epoch": 0.6960722537282084, "grad_norm": 0.98046875, "learning_rate": 0.0004957838127379544, "loss": 5.4302, "mean_token_accuracy": 0.1747421383857727, "num_tokens": 15294676.0, "step": 8285 }, { "entropy": 5.679285192489624, "epoch": 0.6964923335433733, "grad_norm": 0.94921875, "learning_rate": 0.0004957780406423118, "loss": 5.4205, "mean_token_accuracy": 0.17060777693986892, "num_tokens": 15304084.0, "step": 8290 }, { "entropy": 5.68450517654419, "epoch": 0.6969124133585382, "grad_norm": 1.0390625, "learning_rate": 0.0004957722646356999, "loss": 5.4217, "mean_token_accuracy": 0.16955252438783647, "num_tokens": 15314182.0, "step": 8295 }, { "entropy": 5.683474969863892, "epoch": 0.697332493173703, "grad_norm": 0.94921875, "learning_rate": 0.0004957664847182209, "loss": 5.5497, "mean_token_accuracy": 0.16295086219906807, "num_tokens": 15324213.0, "step": 8300 }, { "entropy": 5.713848733901978, "epoch": 0.6977525729888678, "grad_norm": 1.078125, "learning_rate": 0.0004957607008899774, "loss": 5.4767, "mean_token_accuracy": 0.16117004156112671, "num_tokens": 15333122.0, "step": 8305 }, { "entropy": 5.745818662643432, "epoch": 0.6981726528040327, "grad_norm": 1.015625, "learning_rate": 0.0004957549131510717, "loss": 5.5546, "mean_token_accuracy": 0.1615897461771965, "num_tokens": 15342199.0, "step": 8310 }, { "entropy": 5.769120693206787, "epoch": 0.6985927326191976, "grad_norm": 0.9453125, "learning_rate": 0.0004957491215016065, "loss": 5.5272, "mean_token_accuracy": 0.1642034813761711, "num_tokens": 15352463.0, "step": 8315 }, { "entropy": 5.628503799438477, "epoch": 0.6990128124343625, "grad_norm": 1.0078125, "learning_rate": 0.0004957433259416841, "loss": 5.3547, "mean_token_accuracy": 0.16713788211345673, "num_tokens": 15361815.0, "step": 8320 }, { "entropy": 5.698371410369873, "epoch": 0.6994328922495274, "grad_norm": 0.9609375, "learning_rate": 0.0004957375264714075, "loss": 5.4572, "mean_token_accuracy": 0.16114626228809356, "num_tokens": 15371773.0, "step": 8325 }, { "entropy": 5.616191101074219, "epoch": 0.6998529720646923, "grad_norm": 0.96484375, "learning_rate": 0.0004957317230908792, "loss": 5.4195, "mean_token_accuracy": 0.16928454339504242, "num_tokens": 15380881.0, "step": 8330 }, { "entropy": 5.6188719272613525, "epoch": 0.7002730518798572, "grad_norm": 0.99609375, "learning_rate": 0.0004957259158002022, "loss": 5.2754, "mean_token_accuracy": 0.17819535881280898, "num_tokens": 15389310.0, "step": 8335 }, { "entropy": 5.60901665687561, "epoch": 0.700693131695022, "grad_norm": 0.94921875, "learning_rate": 0.0004957201045994791, "loss": 5.3881, "mean_token_accuracy": 0.17114198654890062, "num_tokens": 15398584.0, "step": 8340 }, { "entropy": 5.68906078338623, "epoch": 0.7011132115101869, "grad_norm": 1.0078125, "learning_rate": 0.0004957142894888131, "loss": 5.4298, "mean_token_accuracy": 0.17326382100582122, "num_tokens": 15407208.0, "step": 8345 }, { "entropy": 5.684459161758423, "epoch": 0.7015332913253518, "grad_norm": 0.92578125, "learning_rate": 0.0004957084704683071, "loss": 5.466, "mean_token_accuracy": 0.16816584765911102, "num_tokens": 15416474.0, "step": 8350 }, { "entropy": 5.681583738327026, "epoch": 0.7019533711405167, "grad_norm": 0.98828125, "learning_rate": 0.0004957026475380642, "loss": 5.4514, "mean_token_accuracy": 0.17442281246185304, "num_tokens": 15426101.0, "step": 8355 }, { "entropy": 5.714492893218994, "epoch": 0.7023734509556816, "grad_norm": 1.0078125, "learning_rate": 0.0004956968206981875, "loss": 5.4927, "mean_token_accuracy": 0.16649986654520035, "num_tokens": 15435910.0, "step": 8360 }, { "entropy": 5.751401758193969, "epoch": 0.7027935307708465, "grad_norm": 0.96484375, "learning_rate": 0.0004956909899487803, "loss": 5.5138, "mean_token_accuracy": 0.1684841424226761, "num_tokens": 15445494.0, "step": 8365 }, { "entropy": 5.6478063583374025, "epoch": 0.7032136105860114, "grad_norm": 0.84375, "learning_rate": 0.0004956851552899459, "loss": 5.4225, "mean_token_accuracy": 0.17504344284534454, "num_tokens": 15455332.0, "step": 8370 }, { "entropy": 5.68330488204956, "epoch": 0.7036336904011762, "grad_norm": 0.91796875, "learning_rate": 0.0004956793167217874, "loss": 5.484, "mean_token_accuracy": 0.16238084584474563, "num_tokens": 15464241.0, "step": 8375 }, { "entropy": 5.763062286376953, "epoch": 0.7040537702163411, "grad_norm": 1.109375, "learning_rate": 0.0004956734742444087, "loss": 5.4807, "mean_token_accuracy": 0.17389054596424103, "num_tokens": 15473473.0, "step": 8380 }, { "entropy": 5.65634708404541, "epoch": 0.704473850031506, "grad_norm": 1.0390625, "learning_rate": 0.0004956676278579129, "loss": 5.3614, "mean_token_accuracy": 0.1734451323747635, "num_tokens": 15482494.0, "step": 8385 }, { "entropy": 5.589113998413086, "epoch": 0.7048939298466709, "grad_norm": 0.94140625, "learning_rate": 0.0004956617775624037, "loss": 5.3843, "mean_token_accuracy": 0.17073923498392105, "num_tokens": 15491180.0, "step": 8390 }, { "entropy": 5.657165622711181, "epoch": 0.7053140096618358, "grad_norm": 1.0078125, "learning_rate": 0.0004956559233579848, "loss": 5.4323, "mean_token_accuracy": 0.16821539252996445, "num_tokens": 15501035.0, "step": 8395 }, { "entropy": 5.6735498905181885, "epoch": 0.7057340894770007, "grad_norm": 0.96875, "learning_rate": 0.0004956500652447598, "loss": 5.426, "mean_token_accuracy": 0.17123993486166, "num_tokens": 15510191.0, "step": 8400 }, { "entropy": 5.642197751998902, "epoch": 0.7061541692921655, "grad_norm": 0.9453125, "learning_rate": 0.0004956442032228324, "loss": 5.486, "mean_token_accuracy": 0.17094990164041518, "num_tokens": 15519253.0, "step": 8405 }, { "entropy": 5.679303741455078, "epoch": 0.7065742491073304, "grad_norm": 0.9921875, "learning_rate": 0.0004956383372923067, "loss": 5.4521, "mean_token_accuracy": 0.16864797472953796, "num_tokens": 15528348.0, "step": 8410 }, { "entropy": 5.756013870239258, "epoch": 0.7069943289224953, "grad_norm": 0.89453125, "learning_rate": 0.0004956324674532864, "loss": 5.5294, "mean_token_accuracy": 0.16600346565246582, "num_tokens": 15537557.0, "step": 8415 }, { "entropy": 5.761501026153565, "epoch": 0.7074144087376601, "grad_norm": 0.90234375, "learning_rate": 0.0004956265937058757, "loss": 5.4449, "mean_token_accuracy": 0.17098963260650635, "num_tokens": 15546745.0, "step": 8420 }, { "entropy": 5.6882706642150875, "epoch": 0.707834488552825, "grad_norm": 0.96875, "learning_rate": 0.0004956207160501784, "loss": 5.3722, "mean_token_accuracy": 0.17445978671312332, "num_tokens": 15555532.0, "step": 8425 }, { "entropy": 5.64253797531128, "epoch": 0.70825456836799, "grad_norm": 0.9765625, "learning_rate": 0.0004956148344862987, "loss": 5.4332, "mean_token_accuracy": 0.17582773566246032, "num_tokens": 15564189.0, "step": 8430 }, { "entropy": 5.58995246887207, "epoch": 0.7086746481831548, "grad_norm": 0.9453125, "learning_rate": 0.0004956089490143408, "loss": 5.4465, "mean_token_accuracy": 0.16621713638305663, "num_tokens": 15574116.0, "step": 8435 }, { "entropy": 5.764248561859131, "epoch": 0.7090947279983196, "grad_norm": 0.98828125, "learning_rate": 0.0004956030596344089, "loss": 5.4297, "mean_token_accuracy": 0.1704532414674759, "num_tokens": 15583031.0, "step": 8440 }, { "entropy": 5.756300067901611, "epoch": 0.7095148078134845, "grad_norm": 0.8671875, "learning_rate": 0.0004955971663466075, "loss": 5.5617, "mean_token_accuracy": 0.1687937393784523, "num_tokens": 15592576.0, "step": 8445 }, { "entropy": 5.753180599212646, "epoch": 0.7099348876286494, "grad_norm": 0.96484375, "learning_rate": 0.0004955912691510407, "loss": 5.479, "mean_token_accuracy": 0.17366782128810881, "num_tokens": 15601065.0, "step": 8450 }, { "entropy": 5.669482135772705, "epoch": 0.7103549674438143, "grad_norm": 0.98828125, "learning_rate": 0.0004955853680478134, "loss": 5.4236, "mean_token_accuracy": 0.16465443670749663, "num_tokens": 15610112.0, "step": 8455 }, { "entropy": 5.672362327575684, "epoch": 0.7107750472589792, "grad_norm": 0.984375, "learning_rate": 0.0004955794630370297, "loss": 5.4069, "mean_token_accuracy": 0.16875406056642533, "num_tokens": 15618890.0, "step": 8460 }, { "entropy": 5.661868476867676, "epoch": 0.7111951270741441, "grad_norm": 0.95703125, "learning_rate": 0.0004955735541187945, "loss": 5.4497, "mean_token_accuracy": 0.17067780196666718, "num_tokens": 15627678.0, "step": 8465 }, { "entropy": 5.752597522735596, "epoch": 0.711615206889309, "grad_norm": 1.0546875, "learning_rate": 0.0004955676412932124, "loss": 5.4364, "mean_token_accuracy": 0.17605146616697312, "num_tokens": 15636833.0, "step": 8470 }, { "entropy": 5.6645129203796385, "epoch": 0.7120352867044738, "grad_norm": 1.0390625, "learning_rate": 0.0004955617245603881, "loss": 5.4432, "mean_token_accuracy": 0.1653780534863472, "num_tokens": 15646571.0, "step": 8475 }, { "entropy": 5.627372455596924, "epoch": 0.7124553665196387, "grad_norm": 1.0546875, "learning_rate": 0.0004955558039204263, "loss": 5.4762, "mean_token_accuracy": 0.17126149088144302, "num_tokens": 15654907.0, "step": 8480 }, { "entropy": 5.70549750328064, "epoch": 0.7128754463348036, "grad_norm": 0.98828125, "learning_rate": 0.0004955498793734321, "loss": 5.4261, "mean_token_accuracy": 0.17386842668056487, "num_tokens": 15664336.0, "step": 8485 }, { "entropy": 5.71790714263916, "epoch": 0.7132955261499685, "grad_norm": 0.98828125, "learning_rate": 0.0004955439509195103, "loss": 5.4874, "mean_token_accuracy": 0.17014443427324294, "num_tokens": 15674000.0, "step": 8490 }, { "entropy": 5.714636373519897, "epoch": 0.7137156059651334, "grad_norm": 0.9921875, "learning_rate": 0.0004955380185587661, "loss": 5.4778, "mean_token_accuracy": 0.1748084157705307, "num_tokens": 15684214.0, "step": 8495 }, { "entropy": 5.718602418899536, "epoch": 0.7141356857802983, "grad_norm": 1.0390625, "learning_rate": 0.0004955320822913043, "loss": 5.4875, "mean_token_accuracy": 0.1683593288064003, "num_tokens": 15693546.0, "step": 8500 }, { "entropy": 5.689378499984741, "epoch": 0.7145557655954632, "grad_norm": 0.95703125, "learning_rate": 0.0004955261421172302, "loss": 5.3905, "mean_token_accuracy": 0.17118469923734664, "num_tokens": 15702310.0, "step": 8505 }, { "entropy": 5.666873931884766, "epoch": 0.714975845410628, "grad_norm": 1.0234375, "learning_rate": 0.0004955201980366493, "loss": 5.4483, "mean_token_accuracy": 0.17365573197603226, "num_tokens": 15711544.0, "step": 8510 }, { "entropy": 5.563192462921142, "epoch": 0.7153959252257929, "grad_norm": 1.015625, "learning_rate": 0.0004955142500496665, "loss": 5.3457, "mean_token_accuracy": 0.17584063559770585, "num_tokens": 15720914.0, "step": 8515 }, { "entropy": 5.685083055496216, "epoch": 0.7158160050409578, "grad_norm": 0.96875, "learning_rate": 0.0004955082981563872, "loss": 5.4339, "mean_token_accuracy": 0.1660164326429367, "num_tokens": 15729825.0, "step": 8520 }, { "entropy": 5.712343072891235, "epoch": 0.7162360848561227, "grad_norm": 0.97265625, "learning_rate": 0.000495502342356917, "loss": 5.4367, "mean_token_accuracy": 0.17206156849861146, "num_tokens": 15739649.0, "step": 8525 }, { "entropy": 5.704998302459717, "epoch": 0.7166561646712876, "grad_norm": 1.078125, "learning_rate": 0.0004954963826513614, "loss": 5.3455, "mean_token_accuracy": 0.1762731447815895, "num_tokens": 15747805.0, "step": 8530 }, { "entropy": 5.720434188842773, "epoch": 0.7170762444864525, "grad_norm": 0.9140625, "learning_rate": 0.000495490419039826, "loss": 5.4727, "mean_token_accuracy": 0.16803782433271408, "num_tokens": 15757267.0, "step": 8535 }, { "entropy": 5.670850324630737, "epoch": 0.7174963243016174, "grad_norm": 0.9375, "learning_rate": 0.0004954844515224162, "loss": 5.4318, "mean_token_accuracy": 0.1713361293077469, "num_tokens": 15767412.0, "step": 8540 }, { "entropy": 5.62501802444458, "epoch": 0.7179164041167821, "grad_norm": 1.0078125, "learning_rate": 0.0004954784800992379, "loss": 5.4551, "mean_token_accuracy": 0.16513469964265823, "num_tokens": 15776813.0, "step": 8545 }, { "entropy": 5.7299168586730955, "epoch": 0.718336483931947, "grad_norm": 0.953125, "learning_rate": 0.0004954725047703969, "loss": 5.4713, "mean_token_accuracy": 0.16940504908561707, "num_tokens": 15786258.0, "step": 8550 }, { "entropy": 5.700893259048462, "epoch": 0.7187565637471119, "grad_norm": 0.95703125, "learning_rate": 0.000495466525535999, "loss": 5.4489, "mean_token_accuracy": 0.1717313602566719, "num_tokens": 15795673.0, "step": 8555 }, { "entropy": 5.704733180999756, "epoch": 0.7191766435622768, "grad_norm": 0.9609375, "learning_rate": 0.0004954605423961501, "loss": 5.4469, "mean_token_accuracy": 0.16943047791719437, "num_tokens": 15805050.0, "step": 8560 }, { "entropy": 5.609813070297241, "epoch": 0.7195967233774417, "grad_norm": 0.9921875, "learning_rate": 0.0004954545553509562, "loss": 5.3888, "mean_token_accuracy": 0.181489497423172, "num_tokens": 15813347.0, "step": 8565 }, { "entropy": 5.7514173030853275, "epoch": 0.7200168031926066, "grad_norm": 0.92578125, "learning_rate": 0.0004954485644005235, "loss": 5.517, "mean_token_accuracy": 0.16980722844600676, "num_tokens": 15823528.0, "step": 8570 }, { "entropy": 5.688001346588135, "epoch": 0.7204368830077714, "grad_norm": 1.0078125, "learning_rate": 0.0004954425695449578, "loss": 5.4175, "mean_token_accuracy": 0.16882595270872117, "num_tokens": 15832727.0, "step": 8575 }, { "entropy": 5.731327629089355, "epoch": 0.7208569628229363, "grad_norm": 0.8828125, "learning_rate": 0.0004954365707843657, "loss": 5.4921, "mean_token_accuracy": 0.16411554515361787, "num_tokens": 15842402.0, "step": 8580 }, { "entropy": 5.649729061126709, "epoch": 0.7212770426381012, "grad_norm": 0.98046875, "learning_rate": 0.0004954305681188531, "loss": 5.3546, "mean_token_accuracy": 0.17328893691301345, "num_tokens": 15850886.0, "step": 8585 }, { "entropy": 5.83349871635437, "epoch": 0.7216971224532661, "grad_norm": 1.125, "learning_rate": 0.0004954245615485265, "loss": 5.6498, "mean_token_accuracy": 0.16681575551629066, "num_tokens": 15860093.0, "step": 8590 }, { "entropy": 5.686976051330566, "epoch": 0.722117202268431, "grad_norm": 1.0, "learning_rate": 0.0004954185510734924, "loss": 5.3677, "mean_token_accuracy": 0.17604906260967254, "num_tokens": 15868681.0, "step": 8595 }, { "entropy": 5.683244371414185, "epoch": 0.7225372820835959, "grad_norm": 1.0, "learning_rate": 0.0004954125366938571, "loss": 5.4332, "mean_token_accuracy": 0.1739838093519211, "num_tokens": 15878041.0, "step": 8600 }, { "entropy": 5.649758672714233, "epoch": 0.7229573618987608, "grad_norm": 0.984375, "learning_rate": 0.0004954065184097271, "loss": 5.4359, "mean_token_accuracy": 0.16881918907165527, "num_tokens": 15887562.0, "step": 8605 }, { "entropy": 5.665257740020752, "epoch": 0.7233774417139256, "grad_norm": 1.03125, "learning_rate": 0.0004954004962212092, "loss": 5.3612, "mean_token_accuracy": 0.18196807503700257, "num_tokens": 15896480.0, "step": 8610 }, { "entropy": 5.814667558670044, "epoch": 0.7237975215290905, "grad_norm": 0.9296875, "learning_rate": 0.0004953944701284101, "loss": 5.5867, "mean_token_accuracy": 0.1643269270658493, "num_tokens": 15906743.0, "step": 8615 }, { "entropy": 5.7049860000610355, "epoch": 0.7242176013442554, "grad_norm": 0.94140625, "learning_rate": 0.0004953884401314363, "loss": 5.5353, "mean_token_accuracy": 0.15585060715675353, "num_tokens": 15915981.0, "step": 8620 }, { "entropy": 5.70078763961792, "epoch": 0.7246376811594203, "grad_norm": 0.98828125, "learning_rate": 0.0004953824062303949, "loss": 5.3731, "mean_token_accuracy": 0.17181483656167984, "num_tokens": 15924117.0, "step": 8625 }, { "entropy": 5.644768333435058, "epoch": 0.7250577609745852, "grad_norm": 0.98828125, "learning_rate": 0.0004953763684253926, "loss": 5.3972, "mean_token_accuracy": 0.17694538086652756, "num_tokens": 15933124.0, "step": 8630 }, { "entropy": 5.634309864044189, "epoch": 0.7254778407897501, "grad_norm": 0.96875, "learning_rate": 0.0004953703267165364, "loss": 5.2899, "mean_token_accuracy": 0.16893458366394043, "num_tokens": 15942422.0, "step": 8635 }, { "entropy": 5.707332992553711, "epoch": 0.725897920604915, "grad_norm": 1.0078125, "learning_rate": 0.0004953642811039332, "loss": 5.5175, "mean_token_accuracy": 0.16249137073755265, "num_tokens": 15950989.0, "step": 8640 }, { "entropy": 5.731482267379761, "epoch": 0.7263180004200798, "grad_norm": 0.94921875, "learning_rate": 0.0004953582315876904, "loss": 5.5094, "mean_token_accuracy": 0.17023360580205918, "num_tokens": 15959659.0, "step": 8645 }, { "entropy": 5.667933464050293, "epoch": 0.7267380802352447, "grad_norm": 0.984375, "learning_rate": 0.000495352178167915, "loss": 5.3771, "mean_token_accuracy": 0.184345543384552, "num_tokens": 15968102.0, "step": 8650 }, { "entropy": 5.740264129638672, "epoch": 0.7271581600504096, "grad_norm": 0.99609375, "learning_rate": 0.0004953461208447143, "loss": 5.5071, "mean_token_accuracy": 0.16474466323852538, "num_tokens": 15977705.0, "step": 8655 }, { "entropy": 5.689959383010864, "epoch": 0.7275782398655745, "grad_norm": 1.0546875, "learning_rate": 0.0004953400596181953, "loss": 5.5055, "mean_token_accuracy": 0.1639854222536087, "num_tokens": 15986703.0, "step": 8660 }, { "entropy": 5.696602773666382, "epoch": 0.7279983196807394, "grad_norm": 0.9765625, "learning_rate": 0.0004953339944884657, "loss": 5.413, "mean_token_accuracy": 0.17565635591745377, "num_tokens": 15995672.0, "step": 8665 }, { "entropy": 5.609744215011597, "epoch": 0.7284183994959043, "grad_norm": 0.95703125, "learning_rate": 0.0004953279254556329, "loss": 5.3653, "mean_token_accuracy": 0.17787732928991318, "num_tokens": 16004437.0, "step": 8670 }, { "entropy": 5.684988927841187, "epoch": 0.7288384793110692, "grad_norm": 1.0625, "learning_rate": 0.0004953218525198043, "loss": 5.3973, "mean_token_accuracy": 0.16983381360769273, "num_tokens": 16012847.0, "step": 8675 }, { "entropy": 5.713296937942505, "epoch": 0.7292585591262339, "grad_norm": 0.91015625, "learning_rate": 0.0004953157756810876, "loss": 5.4282, "mean_token_accuracy": 0.1725606307387352, "num_tokens": 16022213.0, "step": 8680 }, { "entropy": 5.682115602493286, "epoch": 0.7296786389413988, "grad_norm": 0.93359375, "learning_rate": 0.0004953096949395902, "loss": 5.4937, "mean_token_accuracy": 0.17437110096216202, "num_tokens": 16031411.0, "step": 8685 }, { "entropy": 5.728541135787964, "epoch": 0.7300987187565637, "grad_norm": 0.921875, "learning_rate": 0.0004953036102954202, "loss": 5.5302, "mean_token_accuracy": 0.1656832292675972, "num_tokens": 16041227.0, "step": 8690 }, { "entropy": 5.642122364044189, "epoch": 0.7305187985717286, "grad_norm": 0.921875, "learning_rate": 0.0004952975217486852, "loss": 5.351, "mean_token_accuracy": 0.17716092318296434, "num_tokens": 16049777.0, "step": 8695 }, { "entropy": 5.650162410736084, "epoch": 0.7309388783868935, "grad_norm": 0.92578125, "learning_rate": 0.0004952914292994928, "loss": 5.4486, "mean_token_accuracy": 0.17793299108743668, "num_tokens": 16059093.0, "step": 8700 }, { "entropy": 5.73065881729126, "epoch": 0.7313589582020584, "grad_norm": 1.046875, "learning_rate": 0.0004952853329479514, "loss": 5.4722, "mean_token_accuracy": 0.17585868388414383, "num_tokens": 16068550.0, "step": 8705 }, { "entropy": 5.72199182510376, "epoch": 0.7317790380172233, "grad_norm": 1.0078125, "learning_rate": 0.0004952792326941686, "loss": 5.5006, "mean_token_accuracy": 0.1687643826007843, "num_tokens": 16078286.0, "step": 8710 }, { "entropy": 5.703016853332519, "epoch": 0.7321991178323881, "grad_norm": 0.9453125, "learning_rate": 0.0004952731285382527, "loss": 5.4409, "mean_token_accuracy": 0.17153105437755584, "num_tokens": 16087560.0, "step": 8715 }, { "entropy": 5.625837564468384, "epoch": 0.732619197647553, "grad_norm": 0.99609375, "learning_rate": 0.0004952670204803118, "loss": 5.4053, "mean_token_accuracy": 0.17746168673038482, "num_tokens": 16097478.0, "step": 8720 }, { "entropy": 5.729750633239746, "epoch": 0.7330392774627179, "grad_norm": 0.9296875, "learning_rate": 0.0004952609085204539, "loss": 5.5063, "mean_token_accuracy": 0.1720663473010063, "num_tokens": 16106884.0, "step": 8725 }, { "entropy": 5.6860052108764645, "epoch": 0.7334593572778828, "grad_norm": 0.9921875, "learning_rate": 0.0004952547926587876, "loss": 5.441, "mean_token_accuracy": 0.16728868782520295, "num_tokens": 16115689.0, "step": 8730 }, { "entropy": 5.649883890151978, "epoch": 0.7338794370930477, "grad_norm": 0.95703125, "learning_rate": 0.0004952486728954209, "loss": 5.3619, "mean_token_accuracy": 0.1765601083636284, "num_tokens": 16125237.0, "step": 8735 }, { "entropy": 5.612076044082642, "epoch": 0.7342995169082126, "grad_norm": 0.953125, "learning_rate": 0.0004952425492304624, "loss": 5.3915, "mean_token_accuracy": 0.17717382460832595, "num_tokens": 16133940.0, "step": 8740 }, { "entropy": 5.666138172149658, "epoch": 0.7347195967233774, "grad_norm": 1.0234375, "learning_rate": 0.0004952364216640207, "loss": 5.4628, "mean_token_accuracy": 0.17273564040660858, "num_tokens": 16143256.0, "step": 8745 }, { "entropy": 5.705369853973389, "epoch": 0.7351396765385423, "grad_norm": 0.8671875, "learning_rate": 0.000495230290196204, "loss": 5.3631, "mean_token_accuracy": 0.17574335932731627, "num_tokens": 16153259.0, "step": 8750 }, { "entropy": 5.7202253341674805, "epoch": 0.7355597563537072, "grad_norm": 1.1796875, "learning_rate": 0.0004952241548271212, "loss": 5.5937, "mean_token_accuracy": 0.1582058347761631, "num_tokens": 16162125.0, "step": 8755 }, { "entropy": 5.736495399475098, "epoch": 0.7359798361688721, "grad_norm": 0.96484375, "learning_rate": 0.0004952180155568809, "loss": 5.5053, "mean_token_accuracy": 0.16641440689563752, "num_tokens": 16171680.0, "step": 8760 }, { "entropy": 5.735837650299072, "epoch": 0.736399915984037, "grad_norm": 0.88671875, "learning_rate": 0.0004952118723855919, "loss": 5.5002, "mean_token_accuracy": 0.1728304609656334, "num_tokens": 16181559.0, "step": 8765 }, { "entropy": 5.693457317352295, "epoch": 0.7368199957992019, "grad_norm": 0.9609375, "learning_rate": 0.0004952057253133628, "loss": 5.4567, "mean_token_accuracy": 0.1683572053909302, "num_tokens": 16190611.0, "step": 8770 }, { "entropy": 5.697279071807861, "epoch": 0.7372400756143668, "grad_norm": 1.0, "learning_rate": 0.0004951995743403028, "loss": 5.4704, "mean_token_accuracy": 0.1670347899198532, "num_tokens": 16200156.0, "step": 8775 }, { "entropy": 5.6646442890167235, "epoch": 0.7376601554295316, "grad_norm": 0.91796875, "learning_rate": 0.0004951934194665208, "loss": 5.4466, "mean_token_accuracy": 0.16776914596557618, "num_tokens": 16209808.0, "step": 8780 }, { "entropy": 5.636610317230224, "epoch": 0.7380802352446965, "grad_norm": 0.95703125, "learning_rate": 0.0004951872606921257, "loss": 5.3929, "mean_token_accuracy": 0.17188532203435897, "num_tokens": 16219243.0, "step": 8785 }, { "entropy": 5.651103210449219, "epoch": 0.7385003150598614, "grad_norm": 0.89453125, "learning_rate": 0.0004951810980172265, "loss": 5.4078, "mean_token_accuracy": 0.1813236728310585, "num_tokens": 16228180.0, "step": 8790 }, { "entropy": 5.680718803405762, "epoch": 0.7389203948750263, "grad_norm": 1.0078125, "learning_rate": 0.0004951749314419327, "loss": 5.4423, "mean_token_accuracy": 0.1698632076382637, "num_tokens": 16237045.0, "step": 8795 }, { "entropy": 5.674114608764649, "epoch": 0.7393404746901912, "grad_norm": 1.0390625, "learning_rate": 0.0004951687609663533, "loss": 5.3539, "mean_token_accuracy": 0.17704771608114242, "num_tokens": 16245307.0, "step": 8800 }, { "entropy": 5.695535135269165, "epoch": 0.739760554505356, "grad_norm": 0.91796875, "learning_rate": 0.0004951625865905977, "loss": 5.394, "mean_token_accuracy": 0.16814257353544235, "num_tokens": 16255047.0, "step": 8805 }, { "entropy": 5.661872816085816, "epoch": 0.740180634320521, "grad_norm": 0.9765625, "learning_rate": 0.0004951564083147753, "loss": 5.4255, "mean_token_accuracy": 0.17673753798007966, "num_tokens": 16264969.0, "step": 8810 }, { "entropy": 5.688477516174316, "epoch": 0.7406007141356857, "grad_norm": 0.9140625, "learning_rate": 0.0004951502261389953, "loss": 5.5307, "mean_token_accuracy": 0.1647869899868965, "num_tokens": 16274757.0, "step": 8815 }, { "entropy": 5.648443365097046, "epoch": 0.7410207939508506, "grad_norm": 0.9921875, "learning_rate": 0.0004951440400633677, "loss": 5.4089, "mean_token_accuracy": 0.18536317497491836, "num_tokens": 16283409.0, "step": 8820 }, { "entropy": 5.6198039054870605, "epoch": 0.7414408737660155, "grad_norm": 1.0234375, "learning_rate": 0.0004951378500880015, "loss": 5.3963, "mean_token_accuracy": 0.1721627041697502, "num_tokens": 16293206.0, "step": 8825 }, { "entropy": 5.728368425369263, "epoch": 0.7418609535811804, "grad_norm": 1.0234375, "learning_rate": 0.0004951316562130067, "loss": 5.4044, "mean_token_accuracy": 0.1735895723104477, "num_tokens": 16303121.0, "step": 8830 }, { "entropy": 5.662971448898316, "epoch": 0.7422810333963453, "grad_norm": 0.87109375, "learning_rate": 0.000495125458438493, "loss": 5.3791, "mean_token_accuracy": 0.1796284094452858, "num_tokens": 16312710.0, "step": 8835 }, { "entropy": 5.774871492385865, "epoch": 0.7427011132115102, "grad_norm": 1.0078125, "learning_rate": 0.0004951192567645702, "loss": 5.59, "mean_token_accuracy": 0.16738404482603073, "num_tokens": 16322280.0, "step": 8840 }, { "entropy": 5.625352287292481, "epoch": 0.7431211930266751, "grad_norm": 0.99609375, "learning_rate": 0.0004951130511913481, "loss": 5.4247, "mean_token_accuracy": 0.17108162343502045, "num_tokens": 16331656.0, "step": 8845 }, { "entropy": 5.648809814453125, "epoch": 0.7435412728418399, "grad_norm": 0.97265625, "learning_rate": 0.0004951068417189366, "loss": 5.4397, "mean_token_accuracy": 0.17098239809274673, "num_tokens": 16341074.0, "step": 8850 }, { "entropy": 5.7191088676452635, "epoch": 0.7439613526570048, "grad_norm": 0.9609375, "learning_rate": 0.0004951006283474457, "loss": 5.4336, "mean_token_accuracy": 0.16694654077291488, "num_tokens": 16350097.0, "step": 8855 }, { "entropy": 5.541397190093994, "epoch": 0.7443814324721697, "grad_norm": 0.94140625, "learning_rate": 0.0004950944110769856, "loss": 5.3389, "mean_token_accuracy": 0.17541486024856567, "num_tokens": 16359274.0, "step": 8860 }, { "entropy": 5.590544271469116, "epoch": 0.7448015122873346, "grad_norm": 0.9921875, "learning_rate": 0.0004950881899076663, "loss": 5.3201, "mean_token_accuracy": 0.18839261084794998, "num_tokens": 16368445.0, "step": 8865 }, { "entropy": 5.711699295043945, "epoch": 0.7452215921024995, "grad_norm": 0.91796875, "learning_rate": 0.0004950819648395979, "loss": 5.4246, "mean_token_accuracy": 0.17058226317167283, "num_tokens": 16377689.0, "step": 8870 }, { "entropy": 5.673012161254883, "epoch": 0.7456416719176644, "grad_norm": 0.95703125, "learning_rate": 0.000495075735872891, "loss": 5.3794, "mean_token_accuracy": 0.16905369162559508, "num_tokens": 16386713.0, "step": 8875 }, { "entropy": 5.6853879451751705, "epoch": 0.7460617517328293, "grad_norm": 1.0390625, "learning_rate": 0.0004950695030076557, "loss": 5.4044, "mean_token_accuracy": 0.17320440262556075, "num_tokens": 16395390.0, "step": 8880 }, { "entropy": 5.667076969146729, "epoch": 0.7464818315479941, "grad_norm": 1.125, "learning_rate": 0.0004950632662440027, "loss": 5.4718, "mean_token_accuracy": 0.17193606197834016, "num_tokens": 16404531.0, "step": 8885 }, { "entropy": 5.617186498641968, "epoch": 0.746901911363159, "grad_norm": 1.015625, "learning_rate": 0.0004950570255820419, "loss": 5.366, "mean_token_accuracy": 0.1783306986093521, "num_tokens": 16413649.0, "step": 8890 }, { "entropy": 5.63929853439331, "epoch": 0.7473219911783239, "grad_norm": 0.92578125, "learning_rate": 0.0004950507810218843, "loss": 5.4949, "mean_token_accuracy": 0.16654341220855712, "num_tokens": 16423247.0, "step": 8895 }, { "entropy": 5.712381362915039, "epoch": 0.7477420709934888, "grad_norm": 1.0859375, "learning_rate": 0.0004950445325636405, "loss": 5.4008, "mean_token_accuracy": 0.1695175752043724, "num_tokens": 16432190.0, "step": 8900 }, { "entropy": 5.728821516036987, "epoch": 0.7481621508086537, "grad_norm": 0.87890625, "learning_rate": 0.0004950382802074211, "loss": 5.3723, "mean_token_accuracy": 0.18152749091386794, "num_tokens": 16443091.0, "step": 8905 }, { "entropy": 5.64563479423523, "epoch": 0.7485822306238186, "grad_norm": 0.9453125, "learning_rate": 0.0004950320239533369, "loss": 5.4228, "mean_token_accuracy": 0.17709367275238036, "num_tokens": 16452077.0, "step": 8910 }, { "entropy": 5.783951139450073, "epoch": 0.7490023104389834, "grad_norm": 1.1328125, "learning_rate": 0.0004950257638014986, "loss": 5.5452, "mean_token_accuracy": 0.16610444486141204, "num_tokens": 16461893.0, "step": 8915 }, { "entropy": 5.720726251602173, "epoch": 0.7494223902541483, "grad_norm": 0.9453125, "learning_rate": 0.0004950194997520172, "loss": 5.3846, "mean_token_accuracy": 0.1724250763654709, "num_tokens": 16470904.0, "step": 8920 }, { "entropy": 5.644548130035401, "epoch": 0.7498424700693131, "grad_norm": 1.1484375, "learning_rate": 0.0004950132318050037, "loss": 5.4305, "mean_token_accuracy": 0.1721814066171646, "num_tokens": 16480130.0, "step": 8925 }, { "entropy": 5.673683929443359, "epoch": 0.750262549884478, "grad_norm": 1.046875, "learning_rate": 0.0004950069599605691, "loss": 5.4916, "mean_token_accuracy": 0.17197586894035338, "num_tokens": 16489485.0, "step": 8930 }, { "entropy": 5.701393413543701, "epoch": 0.750682629699643, "grad_norm": 1.078125, "learning_rate": 0.0004950006842188245, "loss": 5.4405, "mean_token_accuracy": 0.17944686561822892, "num_tokens": 16498529.0, "step": 8935 }, { "entropy": 5.677621603012085, "epoch": 0.7511027095148078, "grad_norm": 0.98046875, "learning_rate": 0.000494994404579881, "loss": 5.3675, "mean_token_accuracy": 0.1745915085077286, "num_tokens": 16508094.0, "step": 8940 }, { "entropy": 5.678492450714112, "epoch": 0.7515227893299727, "grad_norm": 1.015625, "learning_rate": 0.00049498812104385, "loss": 5.4613, "mean_token_accuracy": 0.16839174926280975, "num_tokens": 16517620.0, "step": 8945 }, { "entropy": 5.660134744644165, "epoch": 0.7519428691451375, "grad_norm": 1.0078125, "learning_rate": 0.0004949818336108425, "loss": 5.4496, "mean_token_accuracy": 0.17120658308267594, "num_tokens": 16526720.0, "step": 8950 }, { "entropy": 5.636377191543579, "epoch": 0.7523629489603024, "grad_norm": 0.99609375, "learning_rate": 0.0004949755422809703, "loss": 5.4296, "mean_token_accuracy": 0.1683360904455185, "num_tokens": 16535979.0, "step": 8955 }, { "entropy": 5.661738634109497, "epoch": 0.7527830287754673, "grad_norm": 1.0390625, "learning_rate": 0.0004949692470543446, "loss": 5.3185, "mean_token_accuracy": 0.18360351473093034, "num_tokens": 16544538.0, "step": 8960 }, { "entropy": 5.621352195739746, "epoch": 0.7532031085906322, "grad_norm": 0.96875, "learning_rate": 0.0004949629479310769, "loss": 5.3992, "mean_token_accuracy": 0.174703386425972, "num_tokens": 16553962.0, "step": 8965 }, { "entropy": 5.662003660202027, "epoch": 0.7536231884057971, "grad_norm": 1.0, "learning_rate": 0.0004949566449112788, "loss": 5.3139, "mean_token_accuracy": 0.1806609570980072, "num_tokens": 16562652.0, "step": 8970 }, { "entropy": 5.713546705245972, "epoch": 0.754043268220962, "grad_norm": 1.03125, "learning_rate": 0.0004949503379950621, "loss": 5.4185, "mean_token_accuracy": 0.17080322057008743, "num_tokens": 16570887.0, "step": 8975 }, { "entropy": 5.730198669433594, "epoch": 0.7544633480361269, "grad_norm": 0.91015625, "learning_rate": 0.0004949440271825385, "loss": 5.5407, "mean_token_accuracy": 0.17042307704687118, "num_tokens": 16581469.0, "step": 8980 }, { "entropy": 5.709934711456299, "epoch": 0.7548834278512917, "grad_norm": 1.0078125, "learning_rate": 0.0004949377124738196, "loss": 5.4467, "mean_token_accuracy": 0.1689629077911377, "num_tokens": 16590213.0, "step": 8985 }, { "entropy": 5.6945000171661375, "epoch": 0.7553035076664566, "grad_norm": 1.015625, "learning_rate": 0.0004949313938690174, "loss": 5.4155, "mean_token_accuracy": 0.1702682465314865, "num_tokens": 16598384.0, "step": 8990 }, { "entropy": 5.618514919281006, "epoch": 0.7557235874816215, "grad_norm": 0.9765625, "learning_rate": 0.0004949250713682438, "loss": 5.397, "mean_token_accuracy": 0.1767728477716446, "num_tokens": 16607670.0, "step": 8995 }, { "entropy": 5.694167900085449, "epoch": 0.7561436672967864, "grad_norm": 0.98046875, "learning_rate": 0.0004949187449716107, "loss": 5.4889, "mean_token_accuracy": 0.171578086912632, "num_tokens": 16617560.0, "step": 9000 }, { "epoch": 0.7561436672967864, "eval_entropy": 5.450514390263918, "eval_loss": 5.435747146606445, "eval_mean_token_accuracy": 0.17926591218436613, "eval_num_tokens": 16617560.0, "eval_runtime": 21.0554, "eval_samples_per_second": 1774.652, "eval_steps_per_second": 221.843, "step": 9000 } ], "logging_steps": 5, "max_steps": 119020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3605047976263680.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }