{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2520478890989288, "eval_steps": 3000, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.742606925964356, "epoch": 0.0004200798151648813, "grad_norm": 5.21875, "learning_rate": 2e-06, "loss": 10.7358, "mean_token_accuracy": 0.0, "num_tokens": 8348.0, "step": 5 }, { "entropy": 10.74260492324829, "epoch": 0.0008401596303297626, "grad_norm": 5.15625, "learning_rate": 4.5e-06, "loss": 10.7547, "mean_token_accuracy": 0.0, "num_tokens": 17465.0, "step": 10 }, { "entropy": 10.742631721496583, "epoch": 0.001260239445494644, "grad_norm": 5.25, "learning_rate": 7e-06, "loss": 10.7247, "mean_token_accuracy": 0.00010341261513531208, "num_tokens": 26627.0, "step": 15 }, { "entropy": 10.742714214324952, "epoch": 0.0016803192606595252, "grad_norm": 4.96875, "learning_rate": 9.5e-06, "loss": 10.6807, "mean_token_accuracy": 0.0, "num_tokens": 36069.0, "step": 20 }, { "entropy": 10.742774486541748, "epoch": 0.002100399075824407, "grad_norm": 4.96875, "learning_rate": 1.2e-05, "loss": 10.564, "mean_token_accuracy": 0.0009151221020147204, "num_tokens": 44967.0, "step": 25 }, { "entropy": 10.742547607421875, "epoch": 0.002520478890989288, "grad_norm": 3.8125, "learning_rate": 1.4500000000000002e-05, "loss": 10.4843, "mean_token_accuracy": 0.0172414593398571, "num_tokens": 55132.0, "step": 30 }, { "entropy": 10.741770172119141, "epoch": 0.0029405587061541692, "grad_norm": 3.1875, "learning_rate": 1.7000000000000003e-05, "loss": 10.3322, "mean_token_accuracy": 0.044619453698396684, "num_tokens": 65141.0, "step": 35 }, { "entropy": 10.739381885528564, "epoch": 0.0033606385213190504, "grad_norm": 2.484375, "learning_rate": 1.95e-05, "loss": 10.2048, "mean_token_accuracy": 0.04063304513692856, "num_tokens": 74007.0, "step": 40 }, { "entropy": 10.735391807556152, "epoch": 0.003780718336483932, "grad_norm": 2.203125, "learning_rate": 2.2e-05, "loss": 10.1027, "mean_token_accuracy": 0.04380051270127296, "num_tokens": 83736.0, "step": 45 }, { "entropy": 10.731560325622558, "epoch": 0.004200798151648814, "grad_norm": 2.03125, "learning_rate": 2.4500000000000003e-05, "loss": 10.0024, "mean_token_accuracy": 0.04462047629058361, "num_tokens": 92525.0, "step": 50 }, { "entropy": 10.729215049743653, "epoch": 0.004620877966813695, "grad_norm": 2.046875, "learning_rate": 2.7e-05, "loss": 9.9462, "mean_token_accuracy": 0.042681990377604964, "num_tokens": 102015.0, "step": 55 }, { "entropy": 10.728453350067138, "epoch": 0.005040957781978576, "grad_norm": 1.7890625, "learning_rate": 2.95e-05, "loss": 9.9154, "mean_token_accuracy": 0.03954915180802345, "num_tokens": 110887.0, "step": 60 }, { "entropy": 10.727616500854491, "epoch": 0.005461037597143457, "grad_norm": 1.8828125, "learning_rate": 3.2e-05, "loss": 9.8453, "mean_token_accuracy": 0.04232911877334118, "num_tokens": 120442.0, "step": 65 }, { "entropy": 10.726141738891602, "epoch": 0.0058811174123083385, "grad_norm": 1.9609375, "learning_rate": 3.4500000000000005e-05, "loss": 9.7509, "mean_token_accuracy": 0.041194649040699007, "num_tokens": 129297.0, "step": 70 }, { "entropy": 10.723711013793945, "epoch": 0.00630119722747322, "grad_norm": 1.8828125, "learning_rate": 3.7e-05, "loss": 9.7015, "mean_token_accuracy": 0.04228766188025475, "num_tokens": 138305.0, "step": 75 }, { "entropy": 10.719814491271972, "epoch": 0.006721277042638101, "grad_norm": 1.96875, "learning_rate": 3.95e-05, "loss": 9.6499, "mean_token_accuracy": 0.04200226049870252, "num_tokens": 147640.0, "step": 80 }, { "entropy": 10.714290428161622, "epoch": 0.007141356857802983, "grad_norm": 1.8515625, "learning_rate": 4.2000000000000004e-05, "loss": 9.576, "mean_token_accuracy": 0.04255363866686821, "num_tokens": 157633.0, "step": 85 }, { "entropy": 10.707357215881348, "epoch": 0.007561436672967864, "grad_norm": 1.671875, "learning_rate": 4.45e-05, "loss": 9.5382, "mean_token_accuracy": 0.03800953794270754, "num_tokens": 167984.0, "step": 90 }, { "entropy": 10.699947547912597, "epoch": 0.007981516488132745, "grad_norm": 1.7421875, "learning_rate": 4.7000000000000004e-05, "loss": 9.4351, "mean_token_accuracy": 0.04883353523910046, "num_tokens": 176984.0, "step": 95 }, { "entropy": 10.683709812164306, "epoch": 0.008401596303297627, "grad_norm": 1.890625, "learning_rate": 4.9500000000000004e-05, "loss": 9.3133, "mean_token_accuracy": 0.051684480533003806, "num_tokens": 185931.0, "step": 100 }, { "entropy": 10.665494632720947, "epoch": 0.008821676118462508, "grad_norm": 1.859375, "learning_rate": 5.2e-05, "loss": 9.2723, "mean_token_accuracy": 0.05058838985860348, "num_tokens": 195065.0, "step": 105 }, { "entropy": 10.650426483154297, "epoch": 0.00924175593362739, "grad_norm": 1.703125, "learning_rate": 5.45e-05, "loss": 9.1345, "mean_token_accuracy": 0.05380081832408905, "num_tokens": 203687.0, "step": 110 }, { "entropy": 10.613165855407715, "epoch": 0.00966183574879227, "grad_norm": 1.6484375, "learning_rate": 5.7e-05, "loss": 9.0467, "mean_token_accuracy": 0.057396522164344786, "num_tokens": 212847.0, "step": 115 }, { "entropy": 10.554168796539306, "epoch": 0.010081915563957152, "grad_norm": 1.6875, "learning_rate": 5.9499999999999996e-05, "loss": 8.93, "mean_token_accuracy": 0.05599412247538567, "num_tokens": 222593.0, "step": 120 }, { "entropy": 10.50309362411499, "epoch": 0.010501995379122032, "grad_norm": 1.6875, "learning_rate": 6.2e-05, "loss": 8.7842, "mean_token_accuracy": 0.054633737355470655, "num_tokens": 231174.0, "step": 125 }, { "entropy": 10.446444129943847, "epoch": 0.010922075194286915, "grad_norm": 1.5546875, "learning_rate": 6.450000000000001e-05, "loss": 8.6507, "mean_token_accuracy": 0.05882068388164043, "num_tokens": 239833.0, "step": 130 }, { "entropy": 10.371571159362793, "epoch": 0.011342155009451797, "grad_norm": 1.53125, "learning_rate": 6.7e-05, "loss": 8.62, "mean_token_accuracy": 0.05638743191957474, "num_tokens": 248794.0, "step": 135 }, { "entropy": 10.297250938415527, "epoch": 0.011762234824616677, "grad_norm": 1.4375, "learning_rate": 6.950000000000001e-05, "loss": 8.5299, "mean_token_accuracy": 0.056220804899930955, "num_tokens": 257123.0, "step": 140 }, { "entropy": 10.228730010986329, "epoch": 0.012182314639781559, "grad_norm": 1.453125, "learning_rate": 7.2e-05, "loss": 8.2842, "mean_token_accuracy": 0.05619280487298965, "num_tokens": 266088.0, "step": 145 }, { "entropy": 10.08653745651245, "epoch": 0.01260239445494644, "grad_norm": 1.21875, "learning_rate": 7.45e-05, "loss": 8.3619, "mean_token_accuracy": 0.0516346599906683, "num_tokens": 276074.0, "step": 150 }, { "entropy": 9.963776969909668, "epoch": 0.013022474270111321, "grad_norm": 1.171875, "learning_rate": 7.7e-05, "loss": 8.1944, "mean_token_accuracy": 0.054025283083319664, "num_tokens": 285280.0, "step": 155 }, { "entropy": 9.805997848510742, "epoch": 0.013442554085276202, "grad_norm": 1.171875, "learning_rate": 7.950000000000001e-05, "loss": 8.151, "mean_token_accuracy": 0.052671706303954124, "num_tokens": 296115.0, "step": 160 }, { "entropy": 9.606755542755128, "epoch": 0.013862633900441084, "grad_norm": 0.99609375, "learning_rate": 8.2e-05, "loss": 7.9584, "mean_token_accuracy": 0.05575060956180096, "num_tokens": 305483.0, "step": 165 }, { "entropy": 9.449717140197754, "epoch": 0.014282713715605966, "grad_norm": 0.93359375, "learning_rate": 8.450000000000001e-05, "loss": 7.9165, "mean_token_accuracy": 0.058218777552247046, "num_tokens": 314000.0, "step": 170 }, { "entropy": 9.167982482910157, "epoch": 0.014702793530770846, "grad_norm": 1.1953125, "learning_rate": 8.7e-05, "loss": 7.8517, "mean_token_accuracy": 0.062257979065179825, "num_tokens": 323667.0, "step": 175 }, { "entropy": 8.951386070251464, "epoch": 0.015122873345935728, "grad_norm": 0.9296875, "learning_rate": 8.95e-05, "loss": 7.8029, "mean_token_accuracy": 0.06150264739990234, "num_tokens": 332695.0, "step": 180 }, { "entropy": 8.776250171661378, "epoch": 0.015542953161100609, "grad_norm": 0.9609375, "learning_rate": 9.2e-05, "loss": 7.643, "mean_token_accuracy": 0.05887415409088135, "num_tokens": 342428.0, "step": 185 }, { "entropy": 8.602806949615479, "epoch": 0.01596303297626549, "grad_norm": 0.79296875, "learning_rate": 9.45e-05, "loss": 7.7106, "mean_token_accuracy": 0.06374814324080944, "num_tokens": 353587.0, "step": 190 }, { "entropy": 8.474033164978028, "epoch": 0.01638311279143037, "grad_norm": 0.93359375, "learning_rate": 9.7e-05, "loss": 7.6401, "mean_token_accuracy": 0.06406850814819336, "num_tokens": 362997.0, "step": 195 }, { "entropy": 8.364265060424804, "epoch": 0.016803192606595255, "grad_norm": 0.95703125, "learning_rate": 9.95e-05, "loss": 7.6617, "mean_token_accuracy": 0.06993534453213215, "num_tokens": 372346.0, "step": 200 }, { "entropy": 8.375140285491943, "epoch": 0.017223272421760135, "grad_norm": 1.0, "learning_rate": 0.000102, "loss": 7.5334, "mean_token_accuracy": 0.06646758764982223, "num_tokens": 381575.0, "step": 205 }, { "entropy": 8.26815767288208, "epoch": 0.017643352236925015, "grad_norm": 0.90625, "learning_rate": 0.00010449999999999999, "loss": 7.5902, "mean_token_accuracy": 0.07085754275321961, "num_tokens": 390706.0, "step": 210 }, { "entropy": 8.218460845947266, "epoch": 0.018063432052089896, "grad_norm": 0.828125, "learning_rate": 0.000107, "loss": 7.5876, "mean_token_accuracy": 0.07221915200352669, "num_tokens": 400000.0, "step": 215 }, { "entropy": 8.139337062835693, "epoch": 0.01848351186725478, "grad_norm": 0.85546875, "learning_rate": 0.0001095, "loss": 7.5295, "mean_token_accuracy": 0.07644539698958397, "num_tokens": 409447.0, "step": 220 }, { "entropy": 8.122040271759033, "epoch": 0.01890359168241966, "grad_norm": 1.1328125, "learning_rate": 0.000112, "loss": 7.5068, "mean_token_accuracy": 0.07519292533397674, "num_tokens": 418417.0, "step": 225 }, { "entropy": 8.067694330215454, "epoch": 0.01932367149758454, "grad_norm": 0.9609375, "learning_rate": 0.0001145, "loss": 7.4664, "mean_token_accuracy": 0.07503528967499733, "num_tokens": 427619.0, "step": 230 }, { "entropy": 8.071773529052734, "epoch": 0.019743751312749424, "grad_norm": 0.96484375, "learning_rate": 0.00011700000000000001, "loss": 7.5131, "mean_token_accuracy": 0.07185145244002342, "num_tokens": 437931.0, "step": 235 }, { "entropy": 8.109980726242066, "epoch": 0.020163831127914304, "grad_norm": 0.9609375, "learning_rate": 0.00011949999999999999, "loss": 7.552, "mean_token_accuracy": 0.07611973807215691, "num_tokens": 447595.0, "step": 240 }, { "entropy": 8.026875400543213, "epoch": 0.020583910943079185, "grad_norm": 0.94921875, "learning_rate": 0.000122, "loss": 7.4164, "mean_token_accuracy": 0.07035953775048256, "num_tokens": 457062.0, "step": 245 }, { "entropy": 8.063331604003906, "epoch": 0.021003990758244065, "grad_norm": 1.015625, "learning_rate": 0.0001245, "loss": 7.5166, "mean_token_accuracy": 0.07237975299358368, "num_tokens": 466191.0, "step": 250 }, { "entropy": 8.050399017333984, "epoch": 0.02142407057340895, "grad_norm": 1.2734375, "learning_rate": 0.000127, "loss": 7.4443, "mean_token_accuracy": 0.07492763809859752, "num_tokens": 475693.0, "step": 255 }, { "entropy": 8.024266242980957, "epoch": 0.02184415038857383, "grad_norm": 1.0234375, "learning_rate": 0.0001295, "loss": 7.4691, "mean_token_accuracy": 0.07379123903810977, "num_tokens": 485173.0, "step": 260 }, { "entropy": 7.993921422958374, "epoch": 0.02226423020373871, "grad_norm": 0.99609375, "learning_rate": 0.000132, "loss": 7.3863, "mean_token_accuracy": 0.08008474782109261, "num_tokens": 493985.0, "step": 265 }, { "entropy": 7.907951974868775, "epoch": 0.022684310018903593, "grad_norm": 1.125, "learning_rate": 0.00013450000000000002, "loss": 7.4036, "mean_token_accuracy": 0.07586845718324184, "num_tokens": 502837.0, "step": 270 }, { "entropy": 7.981403732299805, "epoch": 0.023104389834068473, "grad_norm": 0.91015625, "learning_rate": 0.00013700000000000002, "loss": 7.3605, "mean_token_accuracy": 0.07924394458532333, "num_tokens": 511503.0, "step": 275 }, { "entropy": 7.977783203125, "epoch": 0.023524469649233354, "grad_norm": 0.92578125, "learning_rate": 0.0001395, "loss": 7.5335, "mean_token_accuracy": 0.0751778606325388, "num_tokens": 521499.0, "step": 280 }, { "entropy": 7.871473217010498, "epoch": 0.023944549464398234, "grad_norm": 1.0703125, "learning_rate": 0.00014199999999999998, "loss": 7.2955, "mean_token_accuracy": 0.0799000546336174, "num_tokens": 530067.0, "step": 285 }, { "entropy": 7.885423564910889, "epoch": 0.024364629279563118, "grad_norm": 0.921875, "learning_rate": 0.0001445, "loss": 7.2851, "mean_token_accuracy": 0.08089336939156055, "num_tokens": 538559.0, "step": 290 }, { "entropy": 7.956486988067627, "epoch": 0.024784709094728, "grad_norm": 1.0078125, "learning_rate": 0.000147, "loss": 7.4858, "mean_token_accuracy": 0.07482350952923297, "num_tokens": 547288.0, "step": 295 }, { "entropy": 7.870783424377441, "epoch": 0.02520478890989288, "grad_norm": 0.8828125, "learning_rate": 0.0001495, "loss": 7.3589, "mean_token_accuracy": 0.07514288201928139, "num_tokens": 557269.0, "step": 300 }, { "entropy": 7.939627742767334, "epoch": 0.025624868725057762, "grad_norm": 0.96484375, "learning_rate": 0.000152, "loss": 7.3914, "mean_token_accuracy": 0.07472754344344139, "num_tokens": 567280.0, "step": 305 }, { "entropy": 7.828274822235107, "epoch": 0.026044948540222643, "grad_norm": 0.91796875, "learning_rate": 0.00015450000000000001, "loss": 7.2341, "mean_token_accuracy": 0.07823858335614205, "num_tokens": 576609.0, "step": 310 }, { "entropy": 7.761577320098877, "epoch": 0.026465028355387523, "grad_norm": 1.046875, "learning_rate": 0.000157, "loss": 7.1336, "mean_token_accuracy": 0.08791142702102661, "num_tokens": 586053.0, "step": 315 }, { "entropy": 7.695616436004639, "epoch": 0.026885108170552403, "grad_norm": 0.94921875, "learning_rate": 0.0001595, "loss": 7.3339, "mean_token_accuracy": 0.08298731297254562, "num_tokens": 594649.0, "step": 320 }, { "entropy": 7.869348049163818, "epoch": 0.027305187985717287, "grad_norm": 1.109375, "learning_rate": 0.000162, "loss": 7.2862, "mean_token_accuracy": 0.07372522614896297, "num_tokens": 603445.0, "step": 325 }, { "entropy": 7.86638765335083, "epoch": 0.027725267800882167, "grad_norm": 1.0625, "learning_rate": 0.00016450000000000001, "loss": 7.3613, "mean_token_accuracy": 0.07848134562373162, "num_tokens": 613611.0, "step": 330 }, { "entropy": 7.971248960494995, "epoch": 0.028145347616047048, "grad_norm": 1.0703125, "learning_rate": 0.00016700000000000002, "loss": 7.5217, "mean_token_accuracy": 0.07931054159998893, "num_tokens": 623024.0, "step": 335 }, { "entropy": 7.725814580917358, "epoch": 0.02856542743121193, "grad_norm": 1.2734375, "learning_rate": 0.00016950000000000003, "loss": 7.225, "mean_token_accuracy": 0.08345521688461303, "num_tokens": 631624.0, "step": 340 }, { "entropy": 7.762637519836426, "epoch": 0.028985507246376812, "grad_norm": 1.0078125, "learning_rate": 0.00017199999999999998, "loss": 7.1844, "mean_token_accuracy": 0.08410112038254738, "num_tokens": 640473.0, "step": 345 }, { "entropy": 7.841788578033447, "epoch": 0.029405587061541692, "grad_norm": 1.0625, "learning_rate": 0.00017449999999999999, "loss": 7.3409, "mean_token_accuracy": 0.08037517666816711, "num_tokens": 649692.0, "step": 350 }, { "entropy": 7.800195980072021, "epoch": 0.029825666876706573, "grad_norm": 1.0390625, "learning_rate": 0.000177, "loss": 7.2995, "mean_token_accuracy": 0.08097823038697242, "num_tokens": 658236.0, "step": 355 }, { "entropy": 7.668969297409058, "epoch": 0.030245746691871456, "grad_norm": 1.0859375, "learning_rate": 0.0001795, "loss": 7.0948, "mean_token_accuracy": 0.08619136661291123, "num_tokens": 667175.0, "step": 360 }, { "entropy": 7.798488330841065, "epoch": 0.030665826507036337, "grad_norm": 1.125, "learning_rate": 0.000182, "loss": 7.3842, "mean_token_accuracy": 0.07823293879628182, "num_tokens": 676456.0, "step": 365 }, { "entropy": 7.812319660186768, "epoch": 0.031085906322201217, "grad_norm": 0.9765625, "learning_rate": 0.0001845, "loss": 7.3503, "mean_token_accuracy": 0.07726633399724961, "num_tokens": 686881.0, "step": 370 }, { "entropy": 7.688674831390381, "epoch": 0.0315059861373661, "grad_norm": 1.0234375, "learning_rate": 0.000187, "loss": 7.1373, "mean_token_accuracy": 0.0819906547665596, "num_tokens": 696045.0, "step": 375 }, { "entropy": 7.655067443847656, "epoch": 0.03192606595253098, "grad_norm": 1.1484375, "learning_rate": 0.0001895, "loss": 7.1112, "mean_token_accuracy": 0.08879919424653053, "num_tokens": 704729.0, "step": 380 }, { "entropy": 7.4980494499206545, "epoch": 0.032346145767695865, "grad_norm": 0.953125, "learning_rate": 0.000192, "loss": 7.1679, "mean_token_accuracy": 0.07921729236841202, "num_tokens": 714331.0, "step": 385 }, { "entropy": 7.735121536254883, "epoch": 0.03276622558286074, "grad_norm": 1.0625, "learning_rate": 0.0001945, "loss": 7.1229, "mean_token_accuracy": 0.08520057946443557, "num_tokens": 722788.0, "step": 390 }, { "entropy": 7.683975791931152, "epoch": 0.033186305398025626, "grad_norm": 1.2421875, "learning_rate": 0.00019700000000000002, "loss": 7.1944, "mean_token_accuracy": 0.08690556064248085, "num_tokens": 731417.0, "step": 395 }, { "entropy": 7.576824569702149, "epoch": 0.03360638521319051, "grad_norm": 0.9140625, "learning_rate": 0.00019950000000000002, "loss": 7.1549, "mean_token_accuracy": 0.08151165619492531, "num_tokens": 741034.0, "step": 400 }, { "entropy": 7.698281908035279, "epoch": 0.034026465028355386, "grad_norm": 0.9453125, "learning_rate": 0.000202, "loss": 7.156, "mean_token_accuracy": 0.08484743162989616, "num_tokens": 749596.0, "step": 405 }, { "entropy": 7.556124067306518, "epoch": 0.03444654484352027, "grad_norm": 0.921875, "learning_rate": 0.00020449999999999998, "loss": 7.1145, "mean_token_accuracy": 0.08153974264860153, "num_tokens": 758931.0, "step": 410 }, { "entropy": 7.533982944488526, "epoch": 0.03486662465868515, "grad_norm": 1.0390625, "learning_rate": 0.000207, "loss": 7.0206, "mean_token_accuracy": 0.09019657000899314, "num_tokens": 767534.0, "step": 415 }, { "entropy": 7.6061821460723875, "epoch": 0.03528670447385003, "grad_norm": 1.078125, "learning_rate": 0.0002095, "loss": 7.0789, "mean_token_accuracy": 0.08290171101689339, "num_tokens": 776456.0, "step": 420 }, { "entropy": 7.5107566833496096, "epoch": 0.035706784289014915, "grad_norm": 1.0078125, "learning_rate": 0.000212, "loss": 7.1362, "mean_token_accuracy": 0.08152465149760246, "num_tokens": 786172.0, "step": 425 }, { "entropy": 7.553678846359253, "epoch": 0.03612686410417979, "grad_norm": 0.97265625, "learning_rate": 0.0002145, "loss": 7.0139, "mean_token_accuracy": 0.09106989204883575, "num_tokens": 795081.0, "step": 430 }, { "entropy": 7.604944372177124, "epoch": 0.036546943919344675, "grad_norm": 1.03125, "learning_rate": 0.00021700000000000002, "loss": 7.0628, "mean_token_accuracy": 0.08461785838007926, "num_tokens": 804259.0, "step": 435 }, { "entropy": 7.534902191162109, "epoch": 0.03696702373450956, "grad_norm": 1.109375, "learning_rate": 0.0002195, "loss": 7.0873, "mean_token_accuracy": 0.08283074498176575, "num_tokens": 813463.0, "step": 440 }, { "entropy": 7.502531671524048, "epoch": 0.037387103549674436, "grad_norm": 1.046875, "learning_rate": 0.000222, "loss": 7.0035, "mean_token_accuracy": 0.09452007561922074, "num_tokens": 823029.0, "step": 445 }, { "entropy": 7.486780834197998, "epoch": 0.03780718336483932, "grad_norm": 1.015625, "learning_rate": 0.0002245, "loss": 7.0727, "mean_token_accuracy": 0.08529324010014534, "num_tokens": 832902.0, "step": 450 }, { "entropy": 7.476432847976684, "epoch": 0.0382272631800042, "grad_norm": 1.0, "learning_rate": 0.00022700000000000002, "loss": 7.0158, "mean_token_accuracy": 0.08854726403951645, "num_tokens": 842162.0, "step": 455 }, { "entropy": 7.52789797782898, "epoch": 0.03864734299516908, "grad_norm": 1.0625, "learning_rate": 0.00022950000000000002, "loss": 7.0493, "mean_token_accuracy": 0.08622511699795724, "num_tokens": 852328.0, "step": 460 }, { "entropy": 7.449561357498169, "epoch": 0.039067422810333964, "grad_norm": 1.046875, "learning_rate": 0.00023200000000000003, "loss": 7.0104, "mean_token_accuracy": 0.09133929386734962, "num_tokens": 860929.0, "step": 465 }, { "entropy": 7.458409357070923, "epoch": 0.03948750262549885, "grad_norm": 1.1015625, "learning_rate": 0.00023449999999999998, "loss": 7.0901, "mean_token_accuracy": 0.08522843271493911, "num_tokens": 869144.0, "step": 470 }, { "entropy": 7.584603118896484, "epoch": 0.039907582440663725, "grad_norm": 1.1484375, "learning_rate": 0.000237, "loss": 7.03, "mean_token_accuracy": 0.09454337358474732, "num_tokens": 877447.0, "step": 475 }, { "entropy": 7.431310081481934, "epoch": 0.04032766225582861, "grad_norm": 0.99609375, "learning_rate": 0.0002395, "loss": 6.9871, "mean_token_accuracy": 0.08733554184436798, "num_tokens": 887020.0, "step": 480 }, { "entropy": 7.453667879104614, "epoch": 0.040747742070993485, "grad_norm": 1.171875, "learning_rate": 0.000242, "loss": 7.0323, "mean_token_accuracy": 0.08681000843644142, "num_tokens": 895937.0, "step": 485 }, { "entropy": 7.41835618019104, "epoch": 0.04116782188615837, "grad_norm": 1.0234375, "learning_rate": 0.0002445, "loss": 7.0366, "mean_token_accuracy": 0.08261745497584343, "num_tokens": 905446.0, "step": 490 }, { "entropy": 7.464281463623047, "epoch": 0.04158790170132325, "grad_norm": 1.078125, "learning_rate": 0.000247, "loss": 6.9289, "mean_token_accuracy": 0.09576694294810295, "num_tokens": 914547.0, "step": 495 }, { "entropy": 7.421106290817261, "epoch": 0.04200798151648813, "grad_norm": 1.0703125, "learning_rate": 0.0002495, "loss": 6.9377, "mean_token_accuracy": 0.0962467186152935, "num_tokens": 922900.0, "step": 500 }, { "entropy": 7.401471900939941, "epoch": 0.042428061331653014, "grad_norm": 1.1484375, "learning_rate": 0.000252, "loss": 6.9572, "mean_token_accuracy": 0.09509932994842529, "num_tokens": 930876.0, "step": 505 }, { "entropy": 7.342588901519775, "epoch": 0.0428481411468179, "grad_norm": 0.98828125, "learning_rate": 0.0002545, "loss": 7.0021, "mean_token_accuracy": 0.09231638312339782, "num_tokens": 939871.0, "step": 510 }, { "entropy": 7.44086856842041, "epoch": 0.043268220961982774, "grad_norm": 1.1875, "learning_rate": 0.000257, "loss": 6.988, "mean_token_accuracy": 0.09245615154504776, "num_tokens": 948673.0, "step": 515 }, { "entropy": 7.274595832824707, "epoch": 0.04368830077714766, "grad_norm": 1.015625, "learning_rate": 0.0002595, "loss": 6.9409, "mean_token_accuracy": 0.08984568417072296, "num_tokens": 957603.0, "step": 520 }, { "entropy": 7.436605787277221, "epoch": 0.04410838059231254, "grad_norm": 1.1015625, "learning_rate": 0.000262, "loss": 7.0062, "mean_token_accuracy": 0.08319340422749519, "num_tokens": 967731.0, "step": 525 }, { "entropy": 7.435907888412475, "epoch": 0.04452846040747742, "grad_norm": 1.140625, "learning_rate": 0.00026450000000000003, "loss": 7.0032, "mean_token_accuracy": 0.09049810692667962, "num_tokens": 977427.0, "step": 530 }, { "entropy": 7.3634380340576175, "epoch": 0.0449485402226423, "grad_norm": 1.125, "learning_rate": 0.00026700000000000004, "loss": 6.9827, "mean_token_accuracy": 0.0860845424234867, "num_tokens": 986758.0, "step": 535 }, { "entropy": 7.425018453598023, "epoch": 0.045368620037807186, "grad_norm": 1.2578125, "learning_rate": 0.00026950000000000005, "loss": 6.9738, "mean_token_accuracy": 0.09986243322491646, "num_tokens": 996377.0, "step": 540 }, { "entropy": 7.333861589431763, "epoch": 0.04578869985297206, "grad_norm": 1.0859375, "learning_rate": 0.00027200000000000005, "loss": 7.0222, "mean_token_accuracy": 0.08520096391439438, "num_tokens": 1006483.0, "step": 545 }, { "entropy": 7.269639205932617, "epoch": 0.04620877966813695, "grad_norm": 0.984375, "learning_rate": 0.0002745, "loss": 6.9248, "mean_token_accuracy": 0.091129120439291, "num_tokens": 1016132.0, "step": 550 }, { "entropy": 7.3355879306793215, "epoch": 0.04662885948330183, "grad_norm": 1.171875, "learning_rate": 0.000277, "loss": 6.8796, "mean_token_accuracy": 0.09489664137363434, "num_tokens": 1024970.0, "step": 555 }, { "entropy": 7.3572368144989015, "epoch": 0.04704893929846671, "grad_norm": 0.96484375, "learning_rate": 0.0002795, "loss": 6.9525, "mean_token_accuracy": 0.09272714778780937, "num_tokens": 1034335.0, "step": 560 }, { "entropy": 7.423572063446045, "epoch": 0.04746901911363159, "grad_norm": 1.015625, "learning_rate": 0.00028199999999999997, "loss": 7.0075, "mean_token_accuracy": 0.09945140630006791, "num_tokens": 1043954.0, "step": 565 }, { "entropy": 7.319319725036621, "epoch": 0.04788909892879647, "grad_norm": 1.0234375, "learning_rate": 0.0002845, "loss": 6.9431, "mean_token_accuracy": 0.09524357318878174, "num_tokens": 1053554.0, "step": 570 }, { "entropy": 7.376662826538086, "epoch": 0.04830917874396135, "grad_norm": 1.0078125, "learning_rate": 0.000287, "loss": 6.8893, "mean_token_accuracy": 0.0956316351890564, "num_tokens": 1062008.0, "step": 575 }, { "entropy": 7.246560859680176, "epoch": 0.048729258559126236, "grad_norm": 1.1171875, "learning_rate": 0.0002895, "loss": 6.9602, "mean_token_accuracy": 0.09502239599823951, "num_tokens": 1070740.0, "step": 580 }, { "entropy": 7.361734390258789, "epoch": 0.04914933837429111, "grad_norm": 1.203125, "learning_rate": 0.000292, "loss": 6.9451, "mean_token_accuracy": 0.09238593950867653, "num_tokens": 1079681.0, "step": 585 }, { "entropy": 7.294089078903198, "epoch": 0.049569418189456, "grad_norm": 1.015625, "learning_rate": 0.0002945, "loss": 6.8326, "mean_token_accuracy": 0.09609337821602822, "num_tokens": 1088979.0, "step": 590 }, { "entropy": 7.192009592056275, "epoch": 0.04998949800462088, "grad_norm": 1.1171875, "learning_rate": 0.000297, "loss": 6.8381, "mean_token_accuracy": 0.09695586860179901, "num_tokens": 1097870.0, "step": 595 }, { "entropy": 7.285109043121338, "epoch": 0.05040957781978576, "grad_norm": 1.109375, "learning_rate": 0.0002995, "loss": 6.9361, "mean_token_accuracy": 0.09410082027316094, "num_tokens": 1107948.0, "step": 600 }, { "entropy": 7.2816235542297365, "epoch": 0.05082965763495064, "grad_norm": 1.109375, "learning_rate": 0.000302, "loss": 6.856, "mean_token_accuracy": 0.09758619442582131, "num_tokens": 1117032.0, "step": 605 }, { "entropy": 7.1946680545806885, "epoch": 0.051249737450115525, "grad_norm": 1.0078125, "learning_rate": 0.0003045, "loss": 6.8323, "mean_token_accuracy": 0.09758584424853325, "num_tokens": 1127834.0, "step": 610 }, { "entropy": 7.325930643081665, "epoch": 0.0516698172652804, "grad_norm": 1.234375, "learning_rate": 0.000307, "loss": 6.9314, "mean_token_accuracy": 0.10701763778924941, "num_tokens": 1137382.0, "step": 615 }, { "entropy": 7.191529178619385, "epoch": 0.052089897080445285, "grad_norm": 1.0546875, "learning_rate": 0.0003095, "loss": 6.7726, "mean_token_accuracy": 0.1016211412847042, "num_tokens": 1146095.0, "step": 620 }, { "entropy": 7.197086191177368, "epoch": 0.05250997689561017, "grad_norm": 1.0234375, "learning_rate": 0.000312, "loss": 6.8164, "mean_token_accuracy": 0.09977484568953514, "num_tokens": 1154981.0, "step": 625 }, { "entropy": 7.111207914352417, "epoch": 0.052930056710775046, "grad_norm": 1.203125, "learning_rate": 0.0003145, "loss": 6.822, "mean_token_accuracy": 0.09889646545052529, "num_tokens": 1164939.0, "step": 630 }, { "entropy": 7.286598014831543, "epoch": 0.05335013652593993, "grad_norm": 1.046875, "learning_rate": 0.000317, "loss": 6.9423, "mean_token_accuracy": 0.0905054323375225, "num_tokens": 1174991.0, "step": 635 }, { "entropy": 7.268424129486084, "epoch": 0.05377021634110481, "grad_norm": 0.98046875, "learning_rate": 0.0003195, "loss": 6.9893, "mean_token_accuracy": 0.09030458927154542, "num_tokens": 1184885.0, "step": 640 }, { "entropy": 7.25072751045227, "epoch": 0.05419029615626969, "grad_norm": 1.1640625, "learning_rate": 0.000322, "loss": 6.8843, "mean_token_accuracy": 0.09418094158172607, "num_tokens": 1193637.0, "step": 645 }, { "entropy": 7.144441413879394, "epoch": 0.054610375971434574, "grad_norm": 1.1328125, "learning_rate": 0.00032450000000000003, "loss": 6.6712, "mean_token_accuracy": 0.10373484939336777, "num_tokens": 1202188.0, "step": 650 }, { "entropy": 7.2327552318573, "epoch": 0.05503045578659945, "grad_norm": 1.1484375, "learning_rate": 0.00032700000000000003, "loss": 6.8046, "mean_token_accuracy": 0.09572408124804496, "num_tokens": 1210768.0, "step": 655 }, { "entropy": 7.196833848953247, "epoch": 0.055450535601764335, "grad_norm": 1.1171875, "learning_rate": 0.00032950000000000004, "loss": 6.8024, "mean_token_accuracy": 0.09782998114824296, "num_tokens": 1219819.0, "step": 660 }, { "entropy": 7.211909484863281, "epoch": 0.05587061541692922, "grad_norm": 0.91796875, "learning_rate": 0.00033200000000000005, "loss": 6.8553, "mean_token_accuracy": 0.09061138033866882, "num_tokens": 1229703.0, "step": 665 }, { "entropy": 7.242569494247436, "epoch": 0.056290695232094096, "grad_norm": 1.1796875, "learning_rate": 0.00033450000000000005, "loss": 6.8929, "mean_token_accuracy": 0.09304608702659607, "num_tokens": 1238942.0, "step": 670 }, { "entropy": 7.276552438735962, "epoch": 0.05671077504725898, "grad_norm": 1.015625, "learning_rate": 0.000337, "loss": 6.9316, "mean_token_accuracy": 0.09855509251356125, "num_tokens": 1248943.0, "step": 675 }, { "entropy": 7.130473899841308, "epoch": 0.05713085486242386, "grad_norm": 1.015625, "learning_rate": 0.0003395, "loss": 6.8196, "mean_token_accuracy": 0.09641827270388603, "num_tokens": 1257761.0, "step": 680 }, { "entropy": 7.069635629653931, "epoch": 0.05755093467758874, "grad_norm": 1.1328125, "learning_rate": 0.000342, "loss": 6.7531, "mean_token_accuracy": 0.09635655134916306, "num_tokens": 1267216.0, "step": 685 }, { "entropy": 7.244167423248291, "epoch": 0.057971014492753624, "grad_norm": 1.0703125, "learning_rate": 0.00034449999999999997, "loss": 6.8517, "mean_token_accuracy": 0.09775793552398682, "num_tokens": 1277210.0, "step": 690 }, { "entropy": 7.151098155975342, "epoch": 0.05839109430791851, "grad_norm": 1.078125, "learning_rate": 0.000347, "loss": 6.7848, "mean_token_accuracy": 0.09209914952516556, "num_tokens": 1285310.0, "step": 695 }, { "entropy": 7.133235788345337, "epoch": 0.058811174123083385, "grad_norm": 1.1015625, "learning_rate": 0.0003495, "loss": 6.7884, "mean_token_accuracy": 0.0997276745736599, "num_tokens": 1294421.0, "step": 700 }, { "entropy": 7.089715480804443, "epoch": 0.05923125393824827, "grad_norm": 1.0078125, "learning_rate": 0.000352, "loss": 6.6149, "mean_token_accuracy": 0.10670206919312478, "num_tokens": 1303281.0, "step": 705 }, { "entropy": 7.096017217636108, "epoch": 0.059651333753413145, "grad_norm": 1.3046875, "learning_rate": 0.0003545, "loss": 6.7841, "mean_token_accuracy": 0.1047137551009655, "num_tokens": 1312280.0, "step": 710 }, { "entropy": 7.01336669921875, "epoch": 0.06007141356857803, "grad_norm": 1.0390625, "learning_rate": 0.000357, "loss": 6.7519, "mean_token_accuracy": 0.09830996096134186, "num_tokens": 1321243.0, "step": 715 }, { "entropy": 7.150788021087647, "epoch": 0.06049149338374291, "grad_norm": 1.0234375, "learning_rate": 0.0003595, "loss": 6.8411, "mean_token_accuracy": 0.0983475923538208, "num_tokens": 1330324.0, "step": 720 }, { "entropy": 7.074830770492554, "epoch": 0.06091157319890779, "grad_norm": 1.140625, "learning_rate": 0.000362, "loss": 6.6865, "mean_token_accuracy": 0.1045832097530365, "num_tokens": 1339485.0, "step": 725 }, { "entropy": 7.180077934265137, "epoch": 0.06133165301407267, "grad_norm": 1.2578125, "learning_rate": 0.0003645, "loss": 6.8327, "mean_token_accuracy": 0.09178336262702942, "num_tokens": 1348640.0, "step": 730 }, { "entropy": 7.070912313461304, "epoch": 0.06175173282923756, "grad_norm": 1.203125, "learning_rate": 0.000367, "loss": 6.7313, "mean_token_accuracy": 0.10252036228775978, "num_tokens": 1357581.0, "step": 735 }, { "entropy": 7.097622108459473, "epoch": 0.062171812644402434, "grad_norm": 1.171875, "learning_rate": 0.0003695, "loss": 6.7976, "mean_token_accuracy": 0.09888288527727127, "num_tokens": 1367883.0, "step": 740 }, { "entropy": 7.072182083129883, "epoch": 0.06259189245956731, "grad_norm": 1.078125, "learning_rate": 0.000372, "loss": 6.7536, "mean_token_accuracy": 0.09760352596640587, "num_tokens": 1376936.0, "step": 745 }, { "entropy": 6.975026559829712, "epoch": 0.0630119722747322, "grad_norm": 1.15625, "learning_rate": 0.0003745, "loss": 6.6653, "mean_token_accuracy": 0.10172178596258163, "num_tokens": 1386359.0, "step": 750 }, { "entropy": 7.0470263957977295, "epoch": 0.06343205208989708, "grad_norm": 1.0234375, "learning_rate": 0.000377, "loss": 6.7205, "mean_token_accuracy": 0.10334330797195435, "num_tokens": 1395223.0, "step": 755 }, { "entropy": 7.237481212615966, "epoch": 0.06385213190506196, "grad_norm": 0.9375, "learning_rate": 0.0003795, "loss": 6.8854, "mean_token_accuracy": 0.09526007026433944, "num_tokens": 1404917.0, "step": 760 }, { "entropy": 7.060393810272217, "epoch": 0.06427221172022685, "grad_norm": 1.109375, "learning_rate": 0.000382, "loss": 6.7712, "mean_token_accuracy": 0.10844952017068862, "num_tokens": 1413348.0, "step": 765 }, { "entropy": 7.010181617736817, "epoch": 0.06469229153539173, "grad_norm": 1.109375, "learning_rate": 0.0003845, "loss": 6.751, "mean_token_accuracy": 0.0988110676407814, "num_tokens": 1421726.0, "step": 770 }, { "entropy": 7.068030214309692, "epoch": 0.0651123713505566, "grad_norm": 1.015625, "learning_rate": 0.00038700000000000003, "loss": 6.7626, "mean_token_accuracy": 0.10152493417263031, "num_tokens": 1430686.0, "step": 775 }, { "entropy": 7.124918842315674, "epoch": 0.06553245116572148, "grad_norm": 1.1015625, "learning_rate": 0.00038950000000000003, "loss": 6.7567, "mean_token_accuracy": 0.10261558443307876, "num_tokens": 1439499.0, "step": 780 }, { "entropy": 7.08576397895813, "epoch": 0.06595253098088637, "grad_norm": 1.1953125, "learning_rate": 0.00039200000000000004, "loss": 6.7308, "mean_token_accuracy": 0.10436978489160538, "num_tokens": 1448220.0, "step": 785 }, { "entropy": 6.918930721282959, "epoch": 0.06637261079605125, "grad_norm": 1.0, "learning_rate": 0.00039450000000000005, "loss": 6.7623, "mean_token_accuracy": 0.09306630715727807, "num_tokens": 1458217.0, "step": 790 }, { "entropy": 7.050667333602905, "epoch": 0.06679269061121614, "grad_norm": 1.0703125, "learning_rate": 0.00039700000000000005, "loss": 6.6615, "mean_token_accuracy": 0.10148273557424545, "num_tokens": 1467422.0, "step": 795 }, { "entropy": 7.04574761390686, "epoch": 0.06721277042638102, "grad_norm": 1.03125, "learning_rate": 0.0003995, "loss": 6.6428, "mean_token_accuracy": 0.10174536257982254, "num_tokens": 1476152.0, "step": 800 }, { "entropy": 6.920849370956421, "epoch": 0.06763285024154589, "grad_norm": 1.140625, "learning_rate": 0.000402, "loss": 6.7303, "mean_token_accuracy": 0.09813930094242096, "num_tokens": 1485248.0, "step": 805 }, { "entropy": 7.021937704086303, "epoch": 0.06805293005671077, "grad_norm": 1.09375, "learning_rate": 0.0004045, "loss": 6.6965, "mean_token_accuracy": 0.10005066767334939, "num_tokens": 1494248.0, "step": 810 }, { "entropy": 7.009239387512207, "epoch": 0.06847300987187566, "grad_norm": 1.078125, "learning_rate": 0.00040699999999999997, "loss": 6.7988, "mean_token_accuracy": 0.10206111744046212, "num_tokens": 1503565.0, "step": 815 }, { "entropy": 7.153907108306885, "epoch": 0.06889308968704054, "grad_norm": 1.046875, "learning_rate": 0.0004095, "loss": 6.8967, "mean_token_accuracy": 0.09253153279423713, "num_tokens": 1513227.0, "step": 820 }, { "entropy": 7.081949377059937, "epoch": 0.06931316950220542, "grad_norm": 1.0625, "learning_rate": 0.000412, "loss": 6.6785, "mean_token_accuracy": 0.10418465957045556, "num_tokens": 1522312.0, "step": 825 }, { "entropy": 6.934855031967163, "epoch": 0.0697332493173703, "grad_norm": 1.09375, "learning_rate": 0.0004145, "loss": 6.6359, "mean_token_accuracy": 0.1031254269182682, "num_tokens": 1531720.0, "step": 830 }, { "entropy": 6.970464134216309, "epoch": 0.07015332913253518, "grad_norm": 1.09375, "learning_rate": 0.000417, "loss": 6.7192, "mean_token_accuracy": 0.09493932947516441, "num_tokens": 1541238.0, "step": 835 }, { "entropy": 7.103578281402588, "epoch": 0.07057340894770006, "grad_norm": 1.1015625, "learning_rate": 0.0004195, "loss": 6.8114, "mean_token_accuracy": 0.0987453043460846, "num_tokens": 1550875.0, "step": 840 }, { "entropy": 6.948361873626709, "epoch": 0.07099348876286495, "grad_norm": 1.0234375, "learning_rate": 0.000422, "loss": 6.7522, "mean_token_accuracy": 0.10080962181091309, "num_tokens": 1560287.0, "step": 845 }, { "entropy": 6.981166744232178, "epoch": 0.07141356857802983, "grad_norm": 1.0546875, "learning_rate": 0.0004245, "loss": 6.6378, "mean_token_accuracy": 0.10372715294361115, "num_tokens": 1569043.0, "step": 850 }, { "entropy": 6.902826881408691, "epoch": 0.07183364839319471, "grad_norm": 1.0546875, "learning_rate": 0.000427, "loss": 6.6697, "mean_token_accuracy": 0.10197147876024246, "num_tokens": 1578112.0, "step": 855 }, { "entropy": 6.874331331253051, "epoch": 0.07225372820835958, "grad_norm": 1.1015625, "learning_rate": 0.0004295, "loss": 6.5725, "mean_token_accuracy": 0.1078405149281025, "num_tokens": 1586587.0, "step": 860 }, { "entropy": 7.059461355209351, "epoch": 0.07267380802352447, "grad_norm": 1.078125, "learning_rate": 0.000432, "loss": 6.7397, "mean_token_accuracy": 0.09989926218986511, "num_tokens": 1595585.0, "step": 865 }, { "entropy": 6.951946210861206, "epoch": 0.07309388783868935, "grad_norm": 1.09375, "learning_rate": 0.0004345, "loss": 6.6946, "mean_token_accuracy": 0.10353797450661659, "num_tokens": 1605355.0, "step": 870 }, { "entropy": 6.944614362716675, "epoch": 0.07351396765385423, "grad_norm": 1.1328125, "learning_rate": 0.000437, "loss": 6.7108, "mean_token_accuracy": 0.09883329644799232, "num_tokens": 1613637.0, "step": 875 }, { "entropy": 6.975859832763672, "epoch": 0.07393404746901912, "grad_norm": 1.109375, "learning_rate": 0.0004395, "loss": 6.6703, "mean_token_accuracy": 0.10343916267156601, "num_tokens": 1622731.0, "step": 880 }, { "entropy": 7.003747940063477, "epoch": 0.074354127284184, "grad_norm": 1.0390625, "learning_rate": 0.000442, "loss": 6.6373, "mean_token_accuracy": 0.10040950924158096, "num_tokens": 1632098.0, "step": 885 }, { "entropy": 6.826285457611084, "epoch": 0.07477420709934887, "grad_norm": 0.96484375, "learning_rate": 0.0004445, "loss": 6.6454, "mean_token_accuracy": 0.09755287617444992, "num_tokens": 1641259.0, "step": 890 }, { "entropy": 7.0150947093963625, "epoch": 0.07519428691451376, "grad_norm": 1.1875, "learning_rate": 0.000447, "loss": 6.7262, "mean_token_accuracy": 0.09560549557209015, "num_tokens": 1651362.0, "step": 895 }, { "entropy": 6.897852563858033, "epoch": 0.07561436672967864, "grad_norm": 1.171875, "learning_rate": 0.00044950000000000003, "loss": 6.6487, "mean_token_accuracy": 0.10112505033612251, "num_tokens": 1660190.0, "step": 900 }, { "entropy": 6.90705189704895, "epoch": 0.07603444654484352, "grad_norm": 1.1953125, "learning_rate": 0.00045200000000000004, "loss": 6.663, "mean_token_accuracy": 0.10142350941896439, "num_tokens": 1669020.0, "step": 905 }, { "entropy": 6.973592853546142, "epoch": 0.0764545263600084, "grad_norm": 1.140625, "learning_rate": 0.00045450000000000004, "loss": 6.6861, "mean_token_accuracy": 0.1048488400876522, "num_tokens": 1678158.0, "step": 910 }, { "entropy": 6.985338020324707, "epoch": 0.07687460617517328, "grad_norm": 1.1328125, "learning_rate": 0.00045700000000000005, "loss": 6.7084, "mean_token_accuracy": 0.10136276260018348, "num_tokens": 1687481.0, "step": 915 }, { "entropy": 6.876794004440308, "epoch": 0.07729468599033816, "grad_norm": 1.140625, "learning_rate": 0.00045950000000000006, "loss": 6.6666, "mean_token_accuracy": 0.10845559537410736, "num_tokens": 1696782.0, "step": 920 }, { "entropy": 6.932897567749023, "epoch": 0.07771476580550304, "grad_norm": 1.0390625, "learning_rate": 0.000462, "loss": 6.6725, "mean_token_accuracy": 0.10497085899114608, "num_tokens": 1706153.0, "step": 925 }, { "entropy": 6.9077776908874515, "epoch": 0.07813484562066793, "grad_norm": 1.0078125, "learning_rate": 0.0004645, "loss": 6.6889, "mean_token_accuracy": 0.10281107649207115, "num_tokens": 1715585.0, "step": 930 }, { "entropy": 7.106683778762817, "epoch": 0.07855492543583281, "grad_norm": 1.3359375, "learning_rate": 0.000467, "loss": 6.8042, "mean_token_accuracy": 0.10099845305085182, "num_tokens": 1724857.0, "step": 935 }, { "entropy": 6.858903789520264, "epoch": 0.0789750052509977, "grad_norm": 1.15625, "learning_rate": 0.0004695, "loss": 6.6175, "mean_token_accuracy": 0.10900806412100791, "num_tokens": 1733528.0, "step": 940 }, { "entropy": 7.006282758712769, "epoch": 0.07939508506616257, "grad_norm": 0.9140625, "learning_rate": 0.000472, "loss": 6.7383, "mean_token_accuracy": 0.10379872918128967, "num_tokens": 1742953.0, "step": 945 }, { "entropy": 6.92790584564209, "epoch": 0.07981516488132745, "grad_norm": 1.1015625, "learning_rate": 0.0004745, "loss": 6.6988, "mean_token_accuracy": 0.10636084228754043, "num_tokens": 1752155.0, "step": 950 }, { "entropy": 6.911950254440308, "epoch": 0.08023524469649233, "grad_norm": 1.171875, "learning_rate": 0.000477, "loss": 6.5687, "mean_token_accuracy": 0.10838210806250573, "num_tokens": 1760562.0, "step": 955 }, { "entropy": 6.83457088470459, "epoch": 0.08065532451165722, "grad_norm": 1.1796875, "learning_rate": 0.0004795, "loss": 6.5891, "mean_token_accuracy": 0.10088410004973411, "num_tokens": 1769631.0, "step": 960 }, { "entropy": 6.914610385894775, "epoch": 0.0810754043268221, "grad_norm": 1.21875, "learning_rate": 0.000482, "loss": 6.6346, "mean_token_accuracy": 0.10217849463224411, "num_tokens": 1779080.0, "step": 965 }, { "entropy": 6.8898755550384525, "epoch": 0.08149548414198697, "grad_norm": 1.296875, "learning_rate": 0.0004845, "loss": 6.6271, "mean_token_accuracy": 0.10570115596055984, "num_tokens": 1787830.0, "step": 970 }, { "entropy": 6.751455068588257, "epoch": 0.08191556395715185, "grad_norm": 1.125, "learning_rate": 0.000487, "loss": 6.5346, "mean_token_accuracy": 0.10223312452435493, "num_tokens": 1796998.0, "step": 975 }, { "entropy": 6.8943780899047855, "epoch": 0.08233564377231674, "grad_norm": 1.0625, "learning_rate": 0.0004895, "loss": 6.6202, "mean_token_accuracy": 0.10597362667322159, "num_tokens": 1806194.0, "step": 980 }, { "entropy": 6.700069093704224, "epoch": 0.08275572358748162, "grad_norm": 0.9609375, "learning_rate": 0.000492, "loss": 6.5072, "mean_token_accuracy": 0.10932167768478393, "num_tokens": 1815751.0, "step": 985 }, { "entropy": 6.749313592910767, "epoch": 0.0831758034026465, "grad_norm": 0.953125, "learning_rate": 0.0004945, "loss": 6.5857, "mean_token_accuracy": 0.10682184919714928, "num_tokens": 1825379.0, "step": 990 }, { "entropy": 6.845586490631104, "epoch": 0.08359588321781139, "grad_norm": 1.1328125, "learning_rate": 0.000497, "loss": 6.5541, "mean_token_accuracy": 0.10507402196526527, "num_tokens": 1834158.0, "step": 995 }, { "entropy": 6.844553852081299, "epoch": 0.08401596303297626, "grad_norm": 1.0625, "learning_rate": 0.0004995, "loss": 6.5161, "mean_token_accuracy": 0.10857650190591812, "num_tokens": 1842724.0, "step": 1000 }, { "entropy": 6.795124101638794, "epoch": 0.08443604284814114, "grad_norm": 1.046875, "learning_rate": 0.000499999998724557, "loss": 6.5362, "mean_token_accuracy": 0.10392995700240135, "num_tokens": 1852485.0, "step": 1005 }, { "entropy": 6.765092468261718, "epoch": 0.08485612266330603, "grad_norm": 1.109375, "learning_rate": 0.0004999999935430703, "loss": 6.575, "mean_token_accuracy": 0.10723726153373718, "num_tokens": 1861303.0, "step": 1010 }, { "entropy": 6.745694637298584, "epoch": 0.08527620247847091, "grad_norm": 1.125, "learning_rate": 0.0004999999843758243, "loss": 6.5409, "mean_token_accuracy": 0.1151320680975914, "num_tokens": 1870859.0, "step": 1015 }, { "entropy": 6.8996889114379885, "epoch": 0.0856962822936358, "grad_norm": 1.0078125, "learning_rate": 0.0004999999712228196, "loss": 6.7032, "mean_token_accuracy": 0.10041022300720215, "num_tokens": 1880295.0, "step": 1020 }, { "entropy": 6.899116802215576, "epoch": 0.08611636210880068, "grad_norm": 1.09375, "learning_rate": 0.0004999999540840562, "loss": 6.6176, "mean_token_accuracy": 0.10147540494799615, "num_tokens": 1889193.0, "step": 1025 }, { "entropy": 6.797919845581054, "epoch": 0.08653644192396555, "grad_norm": 1.0625, "learning_rate": 0.0004999999329595345, "loss": 6.709, "mean_token_accuracy": 0.09875654354691506, "num_tokens": 1899437.0, "step": 1030 }, { "entropy": 6.910034608840943, "epoch": 0.08695652173913043, "grad_norm": 1.03125, "learning_rate": 0.0004999999078492548, "loss": 6.6032, "mean_token_accuracy": 0.10777303576469421, "num_tokens": 1907882.0, "step": 1035 }, { "entropy": 6.728742361068726, "epoch": 0.08737660155429532, "grad_norm": 0.9375, "learning_rate": 0.0004999998787532176, "loss": 6.5131, "mean_token_accuracy": 0.1080910786986351, "num_tokens": 1916872.0, "step": 1040 }, { "entropy": 6.86653618812561, "epoch": 0.0877966813694602, "grad_norm": 1.0625, "learning_rate": 0.0004999998456714234, "loss": 6.6681, "mean_token_accuracy": 0.1074354499578476, "num_tokens": 1926636.0, "step": 1045 }, { "entropy": 6.773524904251099, "epoch": 0.08821676118462508, "grad_norm": 1.1640625, "learning_rate": 0.0004999998086038729, "loss": 6.5697, "mean_token_accuracy": 0.108617003262043, "num_tokens": 1935962.0, "step": 1050 }, { "entropy": 6.809631824493408, "epoch": 0.08863684099978995, "grad_norm": 1.078125, "learning_rate": 0.0004999997675505665, "loss": 6.5493, "mean_token_accuracy": 0.10353536382317544, "num_tokens": 1944600.0, "step": 1055 }, { "entropy": 6.8208941459655765, "epoch": 0.08905692081495484, "grad_norm": 1.015625, "learning_rate": 0.0004999997225115052, "loss": 6.7156, "mean_token_accuracy": 0.10389059409499168, "num_tokens": 1954234.0, "step": 1060 }, { "entropy": 6.95792784690857, "epoch": 0.08947700063011972, "grad_norm": 1.0625, "learning_rate": 0.0004999996734866896, "loss": 6.677, "mean_token_accuracy": 0.10057736709713935, "num_tokens": 1964499.0, "step": 1065 }, { "entropy": 6.662513589859008, "epoch": 0.0898970804452846, "grad_norm": 1.1640625, "learning_rate": 0.0004999996204761206, "loss": 6.3883, "mean_token_accuracy": 0.11360553354024887, "num_tokens": 1973635.0, "step": 1070 }, { "entropy": 6.745052719116211, "epoch": 0.09031716026044949, "grad_norm": 0.95703125, "learning_rate": 0.0004999995634797993, "loss": 6.5278, "mean_token_accuracy": 0.1087425634264946, "num_tokens": 1983509.0, "step": 1075 }, { "entropy": 6.769761800765991, "epoch": 0.09073724007561437, "grad_norm": 1.1484375, "learning_rate": 0.0004999995024977265, "loss": 6.5385, "mean_token_accuracy": 0.11216638460755349, "num_tokens": 1992336.0, "step": 1080 }, { "entropy": 6.855973386764527, "epoch": 0.09115731989077924, "grad_norm": 0.99609375, "learning_rate": 0.0004999994375299034, "loss": 6.5509, "mean_token_accuracy": 0.1137130968272686, "num_tokens": 2001931.0, "step": 1085 }, { "entropy": 6.615939617156982, "epoch": 0.09157739970594413, "grad_norm": 0.98828125, "learning_rate": 0.000499999368576331, "loss": 6.4174, "mean_token_accuracy": 0.11283476129174233, "num_tokens": 2010935.0, "step": 1090 }, { "entropy": 6.7152961730957035, "epoch": 0.09199747952110901, "grad_norm": 1.109375, "learning_rate": 0.0004999992956370109, "loss": 6.4684, "mean_token_accuracy": 0.11342488676309585, "num_tokens": 2020587.0, "step": 1095 }, { "entropy": 6.688837385177612, "epoch": 0.0924175593362739, "grad_norm": 1.046875, "learning_rate": 0.000499999218711944, "loss": 6.5046, "mean_token_accuracy": 0.10743609666824341, "num_tokens": 2029743.0, "step": 1100 }, { "entropy": 6.771305274963379, "epoch": 0.09283763915143878, "grad_norm": 1.1484375, "learning_rate": 0.0004999991378011317, "loss": 6.5286, "mean_token_accuracy": 0.11453117504715919, "num_tokens": 2038468.0, "step": 1105 }, { "entropy": 6.67022180557251, "epoch": 0.09325771896660366, "grad_norm": 1.046875, "learning_rate": 0.0004999990529045757, "loss": 6.4451, "mean_token_accuracy": 0.11554965823888778, "num_tokens": 2047456.0, "step": 1110 }, { "entropy": 6.870058679580689, "epoch": 0.09367779878176853, "grad_norm": 0.9765625, "learning_rate": 0.0004999989640222771, "loss": 6.7458, "mean_token_accuracy": 0.09942527562379837, "num_tokens": 2056691.0, "step": 1115 }, { "entropy": 6.829685544967651, "epoch": 0.09409787859693342, "grad_norm": 1.03125, "learning_rate": 0.000499998871154238, "loss": 6.5487, "mean_token_accuracy": 0.10888865366578102, "num_tokens": 2066068.0, "step": 1120 }, { "entropy": 6.725253868103027, "epoch": 0.0945179584120983, "grad_norm": 1.015625, "learning_rate": 0.0004999987743004597, "loss": 6.4837, "mean_token_accuracy": 0.11379996240139008, "num_tokens": 2075113.0, "step": 1125 }, { "entropy": 6.7777934074401855, "epoch": 0.09493803822726318, "grad_norm": 0.9609375, "learning_rate": 0.0004999986734609438, "loss": 6.6044, "mean_token_accuracy": 0.11070828661322593, "num_tokens": 2084557.0, "step": 1130 }, { "entropy": 6.817347526550293, "epoch": 0.09535811804242807, "grad_norm": 1.0625, "learning_rate": 0.0004999985686356923, "loss": 6.497, "mean_token_accuracy": 0.10584703534841537, "num_tokens": 2093424.0, "step": 1135 }, { "entropy": 6.7462608337402346, "epoch": 0.09577819785759294, "grad_norm": 1.03125, "learning_rate": 0.000499998459824707, "loss": 6.6329, "mean_token_accuracy": 0.10303654298186302, "num_tokens": 2103066.0, "step": 1140 }, { "entropy": 6.799277830123901, "epoch": 0.09619827767275782, "grad_norm": 1.046875, "learning_rate": 0.00049999834702799, "loss": 6.5085, "mean_token_accuracy": 0.11131441742181777, "num_tokens": 2112447.0, "step": 1145 }, { "entropy": 6.711055421829224, "epoch": 0.0966183574879227, "grad_norm": 0.9375, "learning_rate": 0.0004999982302455431, "loss": 6.52, "mean_token_accuracy": 0.11281892731785774, "num_tokens": 2121949.0, "step": 1150 }, { "entropy": 6.780323314666748, "epoch": 0.09703843730308759, "grad_norm": 1.015625, "learning_rate": 0.0004999981094773683, "loss": 6.4157, "mean_token_accuracy": 0.1144998162984848, "num_tokens": 2130464.0, "step": 1155 }, { "entropy": 6.697625207901001, "epoch": 0.09745851711825247, "grad_norm": 1.140625, "learning_rate": 0.000499997984723468, "loss": 6.5921, "mean_token_accuracy": 0.1068018026649952, "num_tokens": 2139577.0, "step": 1160 }, { "entropy": 6.569090557098389, "epoch": 0.09787859693341736, "grad_norm": 0.96484375, "learning_rate": 0.0004999978559838441, "loss": 6.3121, "mean_token_accuracy": 0.11300956755876541, "num_tokens": 2147919.0, "step": 1165 }, { "entropy": 6.716167974472046, "epoch": 0.09829867674858223, "grad_norm": 1.0390625, "learning_rate": 0.0004999977232584991, "loss": 6.4791, "mean_token_accuracy": 0.11262017637491226, "num_tokens": 2156936.0, "step": 1170 }, { "entropy": 6.6336616516113285, "epoch": 0.09871875656374711, "grad_norm": 1.0859375, "learning_rate": 0.0004999975865474354, "loss": 6.5492, "mean_token_accuracy": 0.10994603037834168, "num_tokens": 2165362.0, "step": 1175 }, { "entropy": 6.719806575775147, "epoch": 0.099138836378912, "grad_norm": 1.1796875, "learning_rate": 0.0004999974458506551, "loss": 6.4705, "mean_token_accuracy": 0.11214353889226913, "num_tokens": 2173665.0, "step": 1180 }, { "entropy": 6.786266422271728, "epoch": 0.09955891619407688, "grad_norm": 1.2578125, "learning_rate": 0.000499997301168161, "loss": 6.4531, "mean_token_accuracy": 0.11377902403473854, "num_tokens": 2182222.0, "step": 1185 }, { "entropy": 6.670177459716797, "epoch": 0.09997899600924176, "grad_norm": 0.9609375, "learning_rate": 0.0004999971524999556, "loss": 6.528, "mean_token_accuracy": 0.11228533461689949, "num_tokens": 2192358.0, "step": 1190 }, { "entropy": 6.779563045501709, "epoch": 0.10039907582440663, "grad_norm": 1.03125, "learning_rate": 0.0004999969998460414, "loss": 6.5039, "mean_token_accuracy": 0.10956505164504052, "num_tokens": 2201889.0, "step": 1195 }, { "entropy": 6.6560157299041744, "epoch": 0.10081915563957151, "grad_norm": 1.3359375, "learning_rate": 0.0004999968432064213, "loss": 6.5232, "mean_token_accuracy": 0.11500915959477424, "num_tokens": 2211810.0, "step": 1200 }, { "entropy": 6.652071762084961, "epoch": 0.1012392354547364, "grad_norm": 0.921875, "learning_rate": 0.0004999966825810979, "loss": 6.4474, "mean_token_accuracy": 0.11259665861725807, "num_tokens": 2221123.0, "step": 1205 }, { "entropy": 6.634405040740967, "epoch": 0.10165931526990128, "grad_norm": 1.0703125, "learning_rate": 0.0004999965179700742, "loss": 6.402, "mean_token_accuracy": 0.1181789293885231, "num_tokens": 2230129.0, "step": 1210 }, { "entropy": 6.625933122634888, "epoch": 0.10207939508506617, "grad_norm": 1.03125, "learning_rate": 0.000499996349373353, "loss": 6.4624, "mean_token_accuracy": 0.11246607527136802, "num_tokens": 2239929.0, "step": 1215 }, { "entropy": 6.709180927276611, "epoch": 0.10249947490023105, "grad_norm": 1.0390625, "learning_rate": 0.0004999961767909374, "loss": 6.4292, "mean_token_accuracy": 0.11479318514466286, "num_tokens": 2248078.0, "step": 1220 }, { "entropy": 6.59263162612915, "epoch": 0.10291955471539592, "grad_norm": 1.0625, "learning_rate": 0.0004999960002228303, "loss": 6.5262, "mean_token_accuracy": 0.11000767946243287, "num_tokens": 2256975.0, "step": 1225 }, { "entropy": 6.708470964431763, "epoch": 0.1033396345305608, "grad_norm": 1.15625, "learning_rate": 0.0004999958196690349, "loss": 6.3792, "mean_token_accuracy": 0.11624118462204933, "num_tokens": 2265797.0, "step": 1230 }, { "entropy": 6.645881128311157, "epoch": 0.10375971434572569, "grad_norm": 1.0234375, "learning_rate": 0.0004999956351295545, "loss": 6.4736, "mean_token_accuracy": 0.1176276110112667, "num_tokens": 2274099.0, "step": 1235 }, { "entropy": 6.599815797805786, "epoch": 0.10417979416089057, "grad_norm": 1.03125, "learning_rate": 0.0004999954466043922, "loss": 6.3853, "mean_token_accuracy": 0.11810432821512222, "num_tokens": 2282360.0, "step": 1240 }, { "entropy": 6.57668776512146, "epoch": 0.10459987397605545, "grad_norm": 0.96875, "learning_rate": 0.0004999952540935514, "loss": 6.4891, "mean_token_accuracy": 0.11048517748713493, "num_tokens": 2292714.0, "step": 1245 }, { "entropy": 6.675060033798218, "epoch": 0.10501995379122034, "grad_norm": 1.0859375, "learning_rate": 0.0004999950575970356, "loss": 6.4361, "mean_token_accuracy": 0.11576245203614235, "num_tokens": 2301633.0, "step": 1250 }, { "entropy": 6.642887592315674, "epoch": 0.10544003360638521, "grad_norm": 1.0234375, "learning_rate": 0.0004999948571148482, "loss": 6.3931, "mean_token_accuracy": 0.12049147412180901, "num_tokens": 2310067.0, "step": 1255 }, { "entropy": 6.610925579071045, "epoch": 0.10586011342155009, "grad_norm": 1.046875, "learning_rate": 0.0004999946526469927, "loss": 6.4927, "mean_token_accuracy": 0.11412879601120948, "num_tokens": 2320090.0, "step": 1260 }, { "entropy": 6.649963521957398, "epoch": 0.10628019323671498, "grad_norm": 1.03125, "learning_rate": 0.0004999944441934728, "loss": 6.4451, "mean_token_accuracy": 0.11852803751826287, "num_tokens": 2329255.0, "step": 1265 }, { "entropy": 6.678138732910156, "epoch": 0.10670027305187986, "grad_norm": 1.109375, "learning_rate": 0.0004999942317542922, "loss": 6.5261, "mean_token_accuracy": 0.11407028958201408, "num_tokens": 2339535.0, "step": 1270 }, { "entropy": 6.635104560852051, "epoch": 0.10712035286704474, "grad_norm": 1.0546875, "learning_rate": 0.0004999940153294546, "loss": 6.425, "mean_token_accuracy": 0.11798783987760544, "num_tokens": 2348948.0, "step": 1275 }, { "entropy": 6.629437446594238, "epoch": 0.10754043268220961, "grad_norm": 0.99609375, "learning_rate": 0.000499993794918964, "loss": 6.4518, "mean_token_accuracy": 0.10851866900920867, "num_tokens": 2359141.0, "step": 1280 }, { "entropy": 6.612447357177734, "epoch": 0.1079605124973745, "grad_norm": 1.1875, "learning_rate": 0.0004999935705228241, "loss": 6.5007, "mean_token_accuracy": 0.10988411605358124, "num_tokens": 2368906.0, "step": 1285 }, { "entropy": 6.720192527770996, "epoch": 0.10838059231253938, "grad_norm": 1.15625, "learning_rate": 0.0004999933421410389, "loss": 6.4756, "mean_token_accuracy": 0.11632761880755424, "num_tokens": 2377029.0, "step": 1290 }, { "entropy": 6.682251882553101, "epoch": 0.10880067212770426, "grad_norm": 0.84765625, "learning_rate": 0.0004999931097736125, "loss": 6.5226, "mean_token_accuracy": 0.10841714516282082, "num_tokens": 2387088.0, "step": 1295 }, { "entropy": 6.616416501998901, "epoch": 0.10922075194286915, "grad_norm": 1.015625, "learning_rate": 0.0004999928734205492, "loss": 6.4358, "mean_token_accuracy": 0.11085559725761414, "num_tokens": 2395596.0, "step": 1300 }, { "entropy": 6.630216932296753, "epoch": 0.10964083175803403, "grad_norm": 1.09375, "learning_rate": 0.0004999926330818528, "loss": 6.4278, "mean_token_accuracy": 0.11868382543325424, "num_tokens": 2404506.0, "step": 1305 }, { "entropy": 6.615355587005615, "epoch": 0.1100609115731989, "grad_norm": 1.109375, "learning_rate": 0.0004999923887575278, "loss": 6.4742, "mean_token_accuracy": 0.11464583277702331, "num_tokens": 2414342.0, "step": 1310 }, { "entropy": 6.68165545463562, "epoch": 0.11048099138836379, "grad_norm": 1.0859375, "learning_rate": 0.0004999921404475785, "loss": 6.4271, "mean_token_accuracy": 0.11960532069206238, "num_tokens": 2423076.0, "step": 1315 }, { "entropy": 6.567938899993896, "epoch": 0.11090107120352867, "grad_norm": 0.91796875, "learning_rate": 0.0004999918881520093, "loss": 6.3809, "mean_token_accuracy": 0.1204459622502327, "num_tokens": 2432492.0, "step": 1320 }, { "entropy": 6.610611057281494, "epoch": 0.11132115101869355, "grad_norm": 0.96875, "learning_rate": 0.0004999916318708246, "loss": 6.3447, "mean_token_accuracy": 0.1213211365044117, "num_tokens": 2441916.0, "step": 1325 }, { "entropy": 6.550094270706177, "epoch": 0.11174123083385844, "grad_norm": 1.1015625, "learning_rate": 0.0004999913716040291, "loss": 6.4, "mean_token_accuracy": 0.11803905665874481, "num_tokens": 2450932.0, "step": 1330 }, { "entropy": 6.5825268745422365, "epoch": 0.11216131064902331, "grad_norm": 1.0859375, "learning_rate": 0.0004999911073516272, "loss": 6.4156, "mean_token_accuracy": 0.11501810997724533, "num_tokens": 2460058.0, "step": 1335 }, { "entropy": 6.541036558151245, "epoch": 0.11258139046418819, "grad_norm": 0.98046875, "learning_rate": 0.0004999908391136237, "loss": 6.3486, "mean_token_accuracy": 0.11862518936395645, "num_tokens": 2469607.0, "step": 1340 }, { "entropy": 6.54659481048584, "epoch": 0.11300147027935308, "grad_norm": 1.09375, "learning_rate": 0.0004999905668900234, "loss": 6.4037, "mean_token_accuracy": 0.11429757624864578, "num_tokens": 2478345.0, "step": 1345 }, { "entropy": 6.665723133087158, "epoch": 0.11342155009451796, "grad_norm": 1.15625, "learning_rate": 0.000499990290680831, "loss": 6.3362, "mean_token_accuracy": 0.11939993128180504, "num_tokens": 2486662.0, "step": 1350 }, { "entropy": 6.539735174179077, "epoch": 0.11384162990968284, "grad_norm": 1.0859375, "learning_rate": 0.0004999900104860516, "loss": 6.4496, "mean_token_accuracy": 0.11450904607772827, "num_tokens": 2495392.0, "step": 1355 }, { "entropy": 6.640576314926148, "epoch": 0.11426170972484773, "grad_norm": 1.0546875, "learning_rate": 0.0004999897263056898, "loss": 6.4824, "mean_token_accuracy": 0.11427311152219773, "num_tokens": 2505254.0, "step": 1360 }, { "entropy": 6.6059410572052, "epoch": 0.1146817895400126, "grad_norm": 1.0, "learning_rate": 0.000499989438139751, "loss": 6.2902, "mean_token_accuracy": 0.12163057401776314, "num_tokens": 2514096.0, "step": 1365 }, { "entropy": 6.572102785110474, "epoch": 0.11510186935517748, "grad_norm": 0.9453125, "learning_rate": 0.0004999891459882401, "loss": 6.3036, "mean_token_accuracy": 0.12106614261865616, "num_tokens": 2523635.0, "step": 1370 }, { "entropy": 6.518535518646241, "epoch": 0.11552194917034236, "grad_norm": 0.99609375, "learning_rate": 0.0004999888498511624, "loss": 6.3872, "mean_token_accuracy": 0.117999816685915, "num_tokens": 2532528.0, "step": 1375 }, { "entropy": 6.522701168060303, "epoch": 0.11594202898550725, "grad_norm": 1.0625, "learning_rate": 0.0004999885497285229, "loss": 6.3026, "mean_token_accuracy": 0.11809839084744453, "num_tokens": 2541893.0, "step": 1380 }, { "entropy": 6.516852188110351, "epoch": 0.11636210880067213, "grad_norm": 0.99609375, "learning_rate": 0.0004999882456203273, "loss": 6.3627, "mean_token_accuracy": 0.11867272853851318, "num_tokens": 2551551.0, "step": 1385 }, { "entropy": 6.592957019805908, "epoch": 0.11678218861583702, "grad_norm": 1.1171875, "learning_rate": 0.0004999879375265806, "loss": 6.314, "mean_token_accuracy": 0.1192450650036335, "num_tokens": 2560183.0, "step": 1390 }, { "entropy": 6.526823472976685, "epoch": 0.11720226843100189, "grad_norm": 1.1484375, "learning_rate": 0.0004999876254472886, "loss": 6.2065, "mean_token_accuracy": 0.127345572412014, "num_tokens": 2568697.0, "step": 1395 }, { "entropy": 6.488171815872192, "epoch": 0.11762234824616677, "grad_norm": 0.97265625, "learning_rate": 0.0004999873093824565, "loss": 6.4136, "mean_token_accuracy": 0.1172497920691967, "num_tokens": 2578151.0, "step": 1400 }, { "entropy": 6.697162342071533, "epoch": 0.11804242806133165, "grad_norm": 1.1171875, "learning_rate": 0.0004999869893320902, "loss": 6.5415, "mean_token_accuracy": 0.11695929765701293, "num_tokens": 2585901.0, "step": 1405 }, { "entropy": 6.558137512207031, "epoch": 0.11846250787649654, "grad_norm": 1.0234375, "learning_rate": 0.0004999866652961952, "loss": 6.3565, "mean_token_accuracy": 0.11195311546325684, "num_tokens": 2595655.0, "step": 1410 }, { "entropy": 6.547592639923096, "epoch": 0.11888258769166142, "grad_norm": 0.984375, "learning_rate": 0.0004999863372747773, "loss": 6.3241, "mean_token_accuracy": 0.1137452982366085, "num_tokens": 2604949.0, "step": 1415 }, { "entropy": 6.549184036254883, "epoch": 0.11930266750682629, "grad_norm": 1.125, "learning_rate": 0.0004999860052678423, "loss": 6.3987, "mean_token_accuracy": 0.12182095795869827, "num_tokens": 2614260.0, "step": 1420 }, { "entropy": 6.533220100402832, "epoch": 0.11972274732199117, "grad_norm": 1.046875, "learning_rate": 0.0004999856692753959, "loss": 6.3846, "mean_token_accuracy": 0.11606933474540711, "num_tokens": 2623740.0, "step": 1425 }, { "entropy": 6.56026554107666, "epoch": 0.12014282713715606, "grad_norm": 1.0390625, "learning_rate": 0.0004999853292974444, "loss": 6.2829, "mean_token_accuracy": 0.1191012591123581, "num_tokens": 2631998.0, "step": 1430 }, { "entropy": 6.436700010299683, "epoch": 0.12056290695232094, "grad_norm": 0.96875, "learning_rate": 0.0004999849853339936, "loss": 6.4441, "mean_token_accuracy": 0.12089451104402542, "num_tokens": 2641169.0, "step": 1435 }, { "entropy": 6.6503981590271, "epoch": 0.12098298676748583, "grad_norm": 0.9140625, "learning_rate": 0.0004999846373850497, "loss": 6.2726, "mean_token_accuracy": 0.12328374907374381, "num_tokens": 2650576.0, "step": 1440 }, { "entropy": 6.504758834838867, "epoch": 0.12140306658265071, "grad_norm": 1.0234375, "learning_rate": 0.0004999842854506186, "loss": 6.3597, "mean_token_accuracy": 0.11508475914597512, "num_tokens": 2660817.0, "step": 1445 }, { "entropy": 6.454709720611572, "epoch": 0.12182314639781558, "grad_norm": 1.0859375, "learning_rate": 0.0004999839295307069, "loss": 6.317, "mean_token_accuracy": 0.11818674132227898, "num_tokens": 2669338.0, "step": 1450 }, { "entropy": 6.5724732875823975, "epoch": 0.12224322621298046, "grad_norm": 1.078125, "learning_rate": 0.0004999835696253206, "loss": 6.3698, "mean_token_accuracy": 0.11763316094875335, "num_tokens": 2679108.0, "step": 1455 }, { "entropy": 6.542471504211425, "epoch": 0.12266330602814535, "grad_norm": 0.9453125, "learning_rate": 0.0004999832057344664, "loss": 6.3312, "mean_token_accuracy": 0.11857884675264359, "num_tokens": 2688126.0, "step": 1460 }, { "entropy": 6.3690132141113285, "epoch": 0.12308338584331023, "grad_norm": 1.0390625, "learning_rate": 0.0004999828378581504, "loss": 6.2827, "mean_token_accuracy": 0.12631092369556426, "num_tokens": 2697245.0, "step": 1465 }, { "entropy": 6.5668089389801025, "epoch": 0.12350346565847511, "grad_norm": 1.046875, "learning_rate": 0.0004999824659963793, "loss": 6.3543, "mean_token_accuracy": 0.12048940360546112, "num_tokens": 2705934.0, "step": 1470 }, { "entropy": 6.516648006439209, "epoch": 0.12392354547364, "grad_norm": 1.125, "learning_rate": 0.0004999820901491598, "loss": 6.2753, "mean_token_accuracy": 0.12523386031389236, "num_tokens": 2714367.0, "step": 1475 }, { "entropy": 6.416815328598022, "epoch": 0.12434362528880487, "grad_norm": 1.0390625, "learning_rate": 0.0004999817103164983, "loss": 6.3117, "mean_token_accuracy": 0.12113343179225922, "num_tokens": 2724366.0, "step": 1480 }, { "entropy": 6.518594264984131, "epoch": 0.12476370510396975, "grad_norm": 0.953125, "learning_rate": 0.0004999813264984017, "loss": 6.3262, "mean_token_accuracy": 0.11913523152470588, "num_tokens": 2733980.0, "step": 1485 }, { "entropy": 6.520108652114868, "epoch": 0.12518378491913462, "grad_norm": 1.0234375, "learning_rate": 0.0004999809386948767, "loss": 6.3232, "mean_token_accuracy": 0.11875561475753785, "num_tokens": 2744013.0, "step": 1490 }, { "entropy": 6.4508843421936035, "epoch": 0.12560386473429952, "grad_norm": 1.1640625, "learning_rate": 0.0004999805469059302, "loss": 6.3917, "mean_token_accuracy": 0.1202739343047142, "num_tokens": 2753385.0, "step": 1495 }, { "entropy": 6.467165565490722, "epoch": 0.1260239445494644, "grad_norm": 1.03125, "learning_rate": 0.0004999801511315693, "loss": 6.2443, "mean_token_accuracy": 0.11950960382819176, "num_tokens": 2762875.0, "step": 1500 }, { "entropy": 6.561000490188599, "epoch": 0.1264440243646293, "grad_norm": 1.0234375, "learning_rate": 0.0004999797513718007, "loss": 6.3133, "mean_token_accuracy": 0.12554540634155273, "num_tokens": 2772182.0, "step": 1505 }, { "entropy": 6.398244476318359, "epoch": 0.12686410417979416, "grad_norm": 1.0234375, "learning_rate": 0.0004999793476266317, "loss": 6.2652, "mean_token_accuracy": 0.12494927272200584, "num_tokens": 2780814.0, "step": 1510 }, { "entropy": 6.759689664840698, "epoch": 0.12728418399495905, "grad_norm": 1.0234375, "learning_rate": 0.0004999789398960695, "loss": 6.5371, "mean_token_accuracy": 0.120218076556921, "num_tokens": 2791104.0, "step": 1515 }, { "entropy": 6.380699729919433, "epoch": 0.12770426381012392, "grad_norm": 0.9921875, "learning_rate": 0.0004999785281801212, "loss": 6.2392, "mean_token_accuracy": 0.12141881808638573, "num_tokens": 2800081.0, "step": 1520 }, { "entropy": 6.502162122726441, "epoch": 0.1281243436252888, "grad_norm": 1.0703125, "learning_rate": 0.000499978112478794, "loss": 6.3645, "mean_token_accuracy": 0.11820052862167359, "num_tokens": 2809096.0, "step": 1525 }, { "entropy": 6.559705686569214, "epoch": 0.1285444234404537, "grad_norm": 1.0, "learning_rate": 0.0004999776927920955, "loss": 6.3324, "mean_token_accuracy": 0.12376131415367127, "num_tokens": 2818857.0, "step": 1530 }, { "entropy": 6.478033876419067, "epoch": 0.12896450325561856, "grad_norm": 1.0703125, "learning_rate": 0.000499977269120033, "loss": 6.3924, "mean_token_accuracy": 0.11640017554163933, "num_tokens": 2829332.0, "step": 1535 }, { "entropy": 6.471277475357056, "epoch": 0.12938458307078346, "grad_norm": 0.9453125, "learning_rate": 0.000499976841462614, "loss": 6.3118, "mean_token_accuracy": 0.11578154116868973, "num_tokens": 2839193.0, "step": 1540 }, { "entropy": 6.515983152389526, "epoch": 0.12980466288594833, "grad_norm": 0.94921875, "learning_rate": 0.000499976409819846, "loss": 6.3126, "mean_token_accuracy": 0.1165178470313549, "num_tokens": 2848535.0, "step": 1545 }, { "entropy": 6.329218864440918, "epoch": 0.1302247427011132, "grad_norm": 0.9609375, "learning_rate": 0.0004999759741917369, "loss": 6.2119, "mean_token_accuracy": 0.12768493369221687, "num_tokens": 2858090.0, "step": 1550 }, { "entropy": 6.4847986698150635, "epoch": 0.1306448225162781, "grad_norm": 1.1640625, "learning_rate": 0.0004999755345782941, "loss": 6.3672, "mean_token_accuracy": 0.1186487466096878, "num_tokens": 2866984.0, "step": 1555 }, { "entropy": 6.419411611557007, "epoch": 0.13106490233144297, "grad_norm": 0.89453125, "learning_rate": 0.0004999750909795256, "loss": 6.1757, "mean_token_accuracy": 0.1280258044600487, "num_tokens": 2876550.0, "step": 1560 }, { "entropy": 6.461032104492188, "epoch": 0.13148498214660786, "grad_norm": 0.98046875, "learning_rate": 0.0004999746433954394, "loss": 6.2774, "mean_token_accuracy": 0.1213872842490673, "num_tokens": 2885782.0, "step": 1565 }, { "entropy": 6.447916793823242, "epoch": 0.13190506196177273, "grad_norm": 1.0, "learning_rate": 0.000499974191826043, "loss": 6.2448, "mean_token_accuracy": 0.13687582612037658, "num_tokens": 2894807.0, "step": 1570 }, { "entropy": 6.439778518676758, "epoch": 0.1323251417769376, "grad_norm": 1.171875, "learning_rate": 0.0004999737362713448, "loss": 6.2925, "mean_token_accuracy": 0.1238982230424881, "num_tokens": 2904076.0, "step": 1575 }, { "entropy": 6.471430492401123, "epoch": 0.1327452215921025, "grad_norm": 1.0390625, "learning_rate": 0.0004999732767313527, "loss": 6.2033, "mean_token_accuracy": 0.1205870471894741, "num_tokens": 2913761.0, "step": 1580 }, { "entropy": 6.509069633483887, "epoch": 0.13316530140726737, "grad_norm": 1.0546875, "learning_rate": 0.0004999728132060746, "loss": 6.4228, "mean_token_accuracy": 0.12286271527409554, "num_tokens": 2922848.0, "step": 1585 }, { "entropy": 6.5165454864501955, "epoch": 0.13358538122243227, "grad_norm": 0.953125, "learning_rate": 0.0004999723456955192, "loss": 6.3079, "mean_token_accuracy": 0.11906806230545045, "num_tokens": 2932718.0, "step": 1590 }, { "entropy": 6.353040504455566, "epoch": 0.13400546103759714, "grad_norm": 0.9765625, "learning_rate": 0.0004999718741996945, "loss": 6.2648, "mean_token_accuracy": 0.12362491562962533, "num_tokens": 2942686.0, "step": 1595 }, { "entropy": 6.480581188201905, "epoch": 0.13442554085276204, "grad_norm": 0.98046875, "learning_rate": 0.000499971398718609, "loss": 6.2304, "mean_token_accuracy": 0.12233746945858001, "num_tokens": 2952096.0, "step": 1600 }, { "entropy": 6.41249566078186, "epoch": 0.1348456206679269, "grad_norm": 1.0234375, "learning_rate": 0.0004999709192522708, "loss": 6.3139, "mean_token_accuracy": 0.12512291446328164, "num_tokens": 2960660.0, "step": 1605 }, { "entropy": 6.536613845825196, "epoch": 0.13526570048309178, "grad_norm": 0.91796875, "learning_rate": 0.0004999704358006887, "loss": 6.3118, "mean_token_accuracy": 0.12129077091813087, "num_tokens": 2969834.0, "step": 1610 }, { "entropy": 6.4085368633270265, "epoch": 0.13568578029825668, "grad_norm": 1.09375, "learning_rate": 0.0004999699483638712, "loss": 6.2906, "mean_token_accuracy": 0.12232841104269028, "num_tokens": 2979023.0, "step": 1615 }, { "entropy": 6.476312971115112, "epoch": 0.13610586011342155, "grad_norm": 1.015625, "learning_rate": 0.0004999694569418269, "loss": 6.2964, "mean_token_accuracy": 0.12233099341392517, "num_tokens": 2988083.0, "step": 1620 }, { "entropy": 6.359239149093628, "epoch": 0.13652593992858644, "grad_norm": 0.9921875, "learning_rate": 0.0004999689615345645, "loss": 6.2196, "mean_token_accuracy": 0.12490532472729683, "num_tokens": 2997240.0, "step": 1625 }, { "entropy": 6.505274820327759, "epoch": 0.1369460197437513, "grad_norm": 1.0859375, "learning_rate": 0.0004999684621420928, "loss": 6.2805, "mean_token_accuracy": 0.12174654453992843, "num_tokens": 3007077.0, "step": 1630 }, { "entropy": 6.501539659500122, "epoch": 0.13736609955891618, "grad_norm": 1.0078125, "learning_rate": 0.0004999679587644205, "loss": 6.3282, "mean_token_accuracy": 0.11869422942399979, "num_tokens": 3015821.0, "step": 1635 }, { "entropy": 6.434766483306885, "epoch": 0.13778617937408108, "grad_norm": 1.046875, "learning_rate": 0.0004999674514015568, "loss": 6.2508, "mean_token_accuracy": 0.1246812529861927, "num_tokens": 3025858.0, "step": 1640 }, { "entropy": 6.406217813491821, "epoch": 0.13820625918924595, "grad_norm": 0.98046875, "learning_rate": 0.0004999669400535105, "loss": 6.2132, "mean_token_accuracy": 0.12023203670978547, "num_tokens": 3035537.0, "step": 1645 }, { "entropy": 6.359542560577393, "epoch": 0.13862633900441085, "grad_norm": 1.140625, "learning_rate": 0.0004999664247202907, "loss": 6.152, "mean_token_accuracy": 0.12406394928693772, "num_tokens": 3044204.0, "step": 1650 }, { "entropy": 6.404636430740356, "epoch": 0.13904641881957572, "grad_norm": 1.03125, "learning_rate": 0.0004999659054019066, "loss": 6.2994, "mean_token_accuracy": 0.12448503151535988, "num_tokens": 3053111.0, "step": 1655 }, { "entropy": 6.443476963043213, "epoch": 0.1394664986347406, "grad_norm": 1.0390625, "learning_rate": 0.0004999653820983673, "loss": 6.2201, "mean_token_accuracy": 0.12843194082379342, "num_tokens": 3062456.0, "step": 1660 }, { "entropy": 6.356498098373413, "epoch": 0.13988657844990549, "grad_norm": 0.98828125, "learning_rate": 0.000499964854809682, "loss": 6.2579, "mean_token_accuracy": 0.12453076243400574, "num_tokens": 3071132.0, "step": 1665 }, { "entropy": 6.388091611862182, "epoch": 0.14030665826507036, "grad_norm": 0.98046875, "learning_rate": 0.0004999643235358602, "loss": 6.2078, "mean_token_accuracy": 0.12833356559276582, "num_tokens": 3080892.0, "step": 1670 }, { "entropy": 6.392906522750854, "epoch": 0.14072673808023525, "grad_norm": 1.015625, "learning_rate": 0.0004999637882769112, "loss": 6.1429, "mean_token_accuracy": 0.12803655937314035, "num_tokens": 3089874.0, "step": 1675 }, { "entropy": 6.369514799118042, "epoch": 0.14114681789540012, "grad_norm": 0.91796875, "learning_rate": 0.0004999632490328447, "loss": 6.2814, "mean_token_accuracy": 0.12487674206495285, "num_tokens": 3099535.0, "step": 1680 }, { "entropy": 6.432224130630493, "epoch": 0.14156689771056502, "grad_norm": 0.984375, "learning_rate": 0.0004999627058036699, "loss": 6.24, "mean_token_accuracy": 0.12075779214501381, "num_tokens": 3108772.0, "step": 1685 }, { "entropy": 6.430401134490967, "epoch": 0.1419869775257299, "grad_norm": 1.0234375, "learning_rate": 0.0004999621585893966, "loss": 6.2696, "mean_token_accuracy": 0.11704754754900933, "num_tokens": 3118333.0, "step": 1690 }, { "entropy": 6.450057506561279, "epoch": 0.14240705734089476, "grad_norm": 1.0625, "learning_rate": 0.0004999616073900346, "loss": 6.3013, "mean_token_accuracy": 0.12180939391255378, "num_tokens": 3127356.0, "step": 1695 }, { "entropy": 6.412153673171997, "epoch": 0.14282713715605966, "grad_norm": 1.0859375, "learning_rate": 0.0004999610522055935, "loss": 6.2662, "mean_token_accuracy": 0.1200573742389679, "num_tokens": 3136859.0, "step": 1700 }, { "entropy": 6.451931762695312, "epoch": 0.14324721697122453, "grad_norm": 0.9921875, "learning_rate": 0.0004999604930360832, "loss": 6.2945, "mean_token_accuracy": 0.12161469012498856, "num_tokens": 3146607.0, "step": 1705 }, { "entropy": 6.3816108226776125, "epoch": 0.14366729678638943, "grad_norm": 0.95703125, "learning_rate": 0.0004999599298815136, "loss": 6.2364, "mean_token_accuracy": 0.12764545828104018, "num_tokens": 3156327.0, "step": 1710 }, { "entropy": 6.309280204772949, "epoch": 0.1440873766015543, "grad_norm": 1.5390625, "learning_rate": 0.0004999593627418947, "loss": 6.177, "mean_token_accuracy": 0.13247063681483268, "num_tokens": 3165559.0, "step": 1715 }, { "entropy": 6.405248212814331, "epoch": 0.14450745641671917, "grad_norm": 1.0625, "learning_rate": 0.0004999587916172365, "loss": 6.2704, "mean_token_accuracy": 0.1183898076415062, "num_tokens": 3173850.0, "step": 1720 }, { "entropy": 6.435620069503784, "epoch": 0.14492753623188406, "grad_norm": 1.0078125, "learning_rate": 0.0004999582165075492, "loss": 6.22, "mean_token_accuracy": 0.11956866905093193, "num_tokens": 3182838.0, "step": 1725 }, { "entropy": 6.2884269714355465, "epoch": 0.14534761604704893, "grad_norm": 1.0234375, "learning_rate": 0.0004999576374128429, "loss": 6.202, "mean_token_accuracy": 0.1219302274286747, "num_tokens": 3191692.0, "step": 1730 }, { "entropy": 6.500776195526123, "epoch": 0.14576769586221383, "grad_norm": 1.0703125, "learning_rate": 0.0004999570543331279, "loss": 6.226, "mean_token_accuracy": 0.1263854332268238, "num_tokens": 3200069.0, "step": 1735 }, { "entropy": 6.411444854736328, "epoch": 0.1461877756773787, "grad_norm": 1.140625, "learning_rate": 0.0004999564672684145, "loss": 6.3228, "mean_token_accuracy": 0.12090336456894875, "num_tokens": 3209653.0, "step": 1740 }, { "entropy": 6.448664712905884, "epoch": 0.14660785549254357, "grad_norm": 1.03125, "learning_rate": 0.0004999558762187131, "loss": 6.1938, "mean_token_accuracy": 0.12701231315732003, "num_tokens": 3218313.0, "step": 1745 }, { "entropy": 6.32896614074707, "epoch": 0.14702793530770847, "grad_norm": 1.015625, "learning_rate": 0.0004999552811840342, "loss": 6.1297, "mean_token_accuracy": 0.12769370079040526, "num_tokens": 3227525.0, "step": 1750 }, { "entropy": 6.335414171218872, "epoch": 0.14744801512287334, "grad_norm": 0.94921875, "learning_rate": 0.0004999546821643884, "loss": 6.2408, "mean_token_accuracy": 0.12636618986725806, "num_tokens": 3237022.0, "step": 1755 }, { "entropy": 6.317769384384155, "epoch": 0.14786809493803824, "grad_norm": 0.9921875, "learning_rate": 0.0004999540791597861, "loss": 6.1464, "mean_token_accuracy": 0.12537204548716546, "num_tokens": 3246605.0, "step": 1760 }, { "entropy": 6.258312511444092, "epoch": 0.1482881747532031, "grad_norm": 1.03125, "learning_rate": 0.0004999534721702383, "loss": 6.0956, "mean_token_accuracy": 0.13141294568777084, "num_tokens": 3255587.0, "step": 1765 }, { "entropy": 6.364277791976929, "epoch": 0.148708254568368, "grad_norm": 1.0234375, "learning_rate": 0.0004999528611957553, "loss": 6.1968, "mean_token_accuracy": 0.1267327442765236, "num_tokens": 3265669.0, "step": 1770 }, { "entropy": 6.433037424087525, "epoch": 0.14912833438353287, "grad_norm": 1.078125, "learning_rate": 0.0004999522462363485, "loss": 6.1795, "mean_token_accuracy": 0.12822128161787988, "num_tokens": 3275013.0, "step": 1775 }, { "entropy": 6.372742748260498, "epoch": 0.14954841419869774, "grad_norm": 0.91796875, "learning_rate": 0.0004999516272920283, "loss": 6.2775, "mean_token_accuracy": 0.12774404734373093, "num_tokens": 3284723.0, "step": 1780 }, { "entropy": 6.256136322021485, "epoch": 0.14996849401386264, "grad_norm": 0.96484375, "learning_rate": 0.000499951004362806, "loss": 6.1087, "mean_token_accuracy": 0.13196263536810876, "num_tokens": 3293860.0, "step": 1785 }, { "entropy": 6.278848552703858, "epoch": 0.1503885738290275, "grad_norm": 0.9765625, "learning_rate": 0.0004999503774486924, "loss": 6.1623, "mean_token_accuracy": 0.13007338494062423, "num_tokens": 3303158.0, "step": 1790 }, { "entropy": 6.253765487670899, "epoch": 0.1508086536441924, "grad_norm": 0.96484375, "learning_rate": 0.0004999497465496987, "loss": 6.1083, "mean_token_accuracy": 0.1231241799890995, "num_tokens": 3313068.0, "step": 1795 }, { "entropy": 6.319281959533692, "epoch": 0.15122873345935728, "grad_norm": 1.0390625, "learning_rate": 0.000499949111665836, "loss": 6.1761, "mean_token_accuracy": 0.12510209009051323, "num_tokens": 3321885.0, "step": 1800 }, { "entropy": 6.368197298049926, "epoch": 0.15164881327452215, "grad_norm": 1.015625, "learning_rate": 0.0004999484727971158, "loss": 6.1707, "mean_token_accuracy": 0.12798358947038652, "num_tokens": 3330924.0, "step": 1805 }, { "entropy": 6.339307403564453, "epoch": 0.15206889308968705, "grad_norm": 1.0625, "learning_rate": 0.000499947829943549, "loss": 6.1964, "mean_token_accuracy": 0.12618306949734687, "num_tokens": 3340070.0, "step": 1810 }, { "entropy": 6.394219160079956, "epoch": 0.15248897290485192, "grad_norm": 0.984375, "learning_rate": 0.0004999471831051474, "loss": 6.1922, "mean_token_accuracy": 0.13684661015868188, "num_tokens": 3349870.0, "step": 1815 }, { "entropy": 6.330759143829345, "epoch": 0.1529090527200168, "grad_norm": 0.94921875, "learning_rate": 0.0004999465322819222, "loss": 6.2371, "mean_token_accuracy": 0.12111249193549156, "num_tokens": 3359573.0, "step": 1820 }, { "entropy": 6.372816276550293, "epoch": 0.15332913253518168, "grad_norm": 1.046875, "learning_rate": 0.0004999458774738851, "loss": 6.1732, "mean_token_accuracy": 0.13470285311341285, "num_tokens": 3368577.0, "step": 1825 }, { "entropy": 6.352361059188842, "epoch": 0.15374921235034655, "grad_norm": 1.078125, "learning_rate": 0.0004999452186810476, "loss": 6.1469, "mean_token_accuracy": 0.13113251850008964, "num_tokens": 3377801.0, "step": 1830 }, { "entropy": 6.3680521011352536, "epoch": 0.15416929216551145, "grad_norm": 1.046875, "learning_rate": 0.0004999445559034214, "loss": 6.1995, "mean_token_accuracy": 0.12895982414484025, "num_tokens": 3386666.0, "step": 1835 }, { "entropy": 6.443807363510132, "epoch": 0.15458937198067632, "grad_norm": 0.97265625, "learning_rate": 0.0004999438891410181, "loss": 6.3344, "mean_token_accuracy": 0.12429568618535995, "num_tokens": 3396086.0, "step": 1840 }, { "entropy": 6.371559190750122, "epoch": 0.15500945179584122, "grad_norm": 1.0234375, "learning_rate": 0.0004999432183938496, "loss": 6.2503, "mean_token_accuracy": 0.1258139818906784, "num_tokens": 3404894.0, "step": 1845 }, { "entropy": 6.40411787033081, "epoch": 0.1554295316110061, "grad_norm": 1.015625, "learning_rate": 0.0004999425436619279, "loss": 6.2301, "mean_token_accuracy": 0.1250107169151306, "num_tokens": 3414172.0, "step": 1850 }, { "entropy": 6.4263053894042965, "epoch": 0.15584961142617096, "grad_norm": 0.9375, "learning_rate": 0.000499941864945265, "loss": 6.2069, "mean_token_accuracy": 0.12341500893235206, "num_tokens": 3423409.0, "step": 1855 }, { "entropy": 6.2579625129699705, "epoch": 0.15626969124133586, "grad_norm": 0.99609375, "learning_rate": 0.0004999411822438726, "loss": 6.1554, "mean_token_accuracy": 0.12717969343066216, "num_tokens": 3433047.0, "step": 1860 }, { "entropy": 6.4037513256073, "epoch": 0.15668977105650073, "grad_norm": 1.078125, "learning_rate": 0.000499940495557763, "loss": 6.1468, "mean_token_accuracy": 0.12783457711338997, "num_tokens": 3442490.0, "step": 1865 }, { "entropy": 6.303406810760498, "epoch": 0.15710985087166562, "grad_norm": 0.9921875, "learning_rate": 0.0004999398048869485, "loss": 6.2099, "mean_token_accuracy": 0.129954195022583, "num_tokens": 3451804.0, "step": 1870 }, { "entropy": 6.385490417480469, "epoch": 0.1575299306868305, "grad_norm": 0.984375, "learning_rate": 0.000499939110231441, "loss": 6.199, "mean_token_accuracy": 0.1304432988166809, "num_tokens": 3461481.0, "step": 1875 }, { "entropy": 6.364220190048218, "epoch": 0.1579500105019954, "grad_norm": 1.0234375, "learning_rate": 0.0004999384115912531, "loss": 6.2449, "mean_token_accuracy": 0.13135363310575485, "num_tokens": 3471798.0, "step": 1880 }, { "entropy": 6.247316694259643, "epoch": 0.15837009031716026, "grad_norm": 0.96875, "learning_rate": 0.000499937708966397, "loss": 6.1296, "mean_token_accuracy": 0.12637364491820335, "num_tokens": 3481386.0, "step": 1885 }, { "entropy": 6.332306051254273, "epoch": 0.15879017013232513, "grad_norm": 0.97265625, "learning_rate": 0.0004999370023568853, "loss": 6.127, "mean_token_accuracy": 0.1316571466624737, "num_tokens": 3489981.0, "step": 1890 }, { "entropy": 6.299954462051391, "epoch": 0.15921024994749003, "grad_norm": 1.03125, "learning_rate": 0.0004999362917627304, "loss": 6.1227, "mean_token_accuracy": 0.1305247150361538, "num_tokens": 3498551.0, "step": 1895 }, { "entropy": 6.316105461120605, "epoch": 0.1596303297626549, "grad_norm": 1.046875, "learning_rate": 0.0004999355771839448, "loss": 6.0979, "mean_token_accuracy": 0.12954429015517235, "num_tokens": 3507921.0, "step": 1900 }, { "entropy": 6.470440483093261, "epoch": 0.1600504095778198, "grad_norm": 1.078125, "learning_rate": 0.0004999348586205414, "loss": 6.2729, "mean_token_accuracy": 0.13220328316092492, "num_tokens": 3517570.0, "step": 1905 }, { "entropy": 6.38808388710022, "epoch": 0.16047048939298467, "grad_norm": 1.0703125, "learning_rate": 0.0004999341360725327, "loss": 6.2438, "mean_token_accuracy": 0.123927091807127, "num_tokens": 3526774.0, "step": 1910 }, { "entropy": 6.285849714279175, "epoch": 0.16089056920814954, "grad_norm": 1.03125, "learning_rate": 0.0004999334095399317, "loss": 6.1859, "mean_token_accuracy": 0.1361298866569996, "num_tokens": 3535319.0, "step": 1915 }, { "entropy": 6.249746656417846, "epoch": 0.16131064902331443, "grad_norm": 0.98828125, "learning_rate": 0.0004999326790227512, "loss": 6.1605, "mean_token_accuracy": 0.1271871216595173, "num_tokens": 3544468.0, "step": 1920 }, { "entropy": 6.217294788360595, "epoch": 0.1617307288384793, "grad_norm": 0.9140625, "learning_rate": 0.0004999319445210041, "loss": 6.0261, "mean_token_accuracy": 0.1361843690276146, "num_tokens": 3553529.0, "step": 1925 }, { "entropy": 6.290815734863282, "epoch": 0.1621508086536442, "grad_norm": 0.96875, "learning_rate": 0.0004999312060347034, "loss": 6.1011, "mean_token_accuracy": 0.13233864828944206, "num_tokens": 3563053.0, "step": 1930 }, { "entropy": 6.224975728988648, "epoch": 0.16257088846880907, "grad_norm": 0.953125, "learning_rate": 0.0004999304635638621, "loss": 6.0288, "mean_token_accuracy": 0.1342104844748974, "num_tokens": 3571877.0, "step": 1935 }, { "entropy": 6.233099460601807, "epoch": 0.16299096828397394, "grad_norm": 0.92578125, "learning_rate": 0.0004999297171084935, "loss": 6.091, "mean_token_accuracy": 0.13373700231313707, "num_tokens": 3581496.0, "step": 1940 }, { "entropy": 6.324843549728394, "epoch": 0.16341104809913884, "grad_norm": 0.98828125, "learning_rate": 0.0004999289666686109, "loss": 6.1071, "mean_token_accuracy": 0.1308230109512806, "num_tokens": 3590752.0, "step": 1945 }, { "entropy": 6.129473495483398, "epoch": 0.1638311279143037, "grad_norm": 0.98046875, "learning_rate": 0.0004999282122442274, "loss": 6.1072, "mean_token_accuracy": 0.1328013814985752, "num_tokens": 3599885.0, "step": 1950 }, { "entropy": 6.387533235549927, "epoch": 0.1642512077294686, "grad_norm": 0.9296875, "learning_rate": 0.0004999274538353564, "loss": 6.1968, "mean_token_accuracy": 0.12293331325054169, "num_tokens": 3610039.0, "step": 1955 }, { "entropy": 6.2677867889404295, "epoch": 0.16467128754463348, "grad_norm": 1.015625, "learning_rate": 0.0004999266914420114, "loss": 6.1123, "mean_token_accuracy": 0.12491545528173446, "num_tokens": 3619954.0, "step": 1960 }, { "entropy": 6.291842746734619, "epoch": 0.16509136735979837, "grad_norm": 1.0078125, "learning_rate": 0.000499925925064206, "loss": 6.0646, "mean_token_accuracy": 0.13617814630270003, "num_tokens": 3628164.0, "step": 1965 }, { "entropy": 6.377547359466552, "epoch": 0.16551144717496324, "grad_norm": 1.046875, "learning_rate": 0.0004999251547019535, "loss": 6.2126, "mean_token_accuracy": 0.13370679765939714, "num_tokens": 3636778.0, "step": 1970 }, { "entropy": 6.318364191055298, "epoch": 0.16593152699012811, "grad_norm": 0.9609375, "learning_rate": 0.0004999243803552678, "loss": 6.1666, "mean_token_accuracy": 0.13474627435207367, "num_tokens": 3647046.0, "step": 1975 }, { "entropy": 6.2661604404449465, "epoch": 0.166351606805293, "grad_norm": 1.03125, "learning_rate": 0.0004999236020241625, "loss": 6.0969, "mean_token_accuracy": 0.1302388660609722, "num_tokens": 3656130.0, "step": 1980 }, { "entropy": 6.294794940948487, "epoch": 0.16677168662045788, "grad_norm": 0.9921875, "learning_rate": 0.0004999228197086514, "loss": 6.1791, "mean_token_accuracy": 0.12147556319832802, "num_tokens": 3666145.0, "step": 1985 }, { "entropy": 6.308886766433716, "epoch": 0.16719176643562278, "grad_norm": 0.88671875, "learning_rate": 0.0004999220334087484, "loss": 6.2221, "mean_token_accuracy": 0.12820759564638137, "num_tokens": 3676722.0, "step": 1990 }, { "entropy": 6.34148588180542, "epoch": 0.16761184625078765, "grad_norm": 1.0, "learning_rate": 0.0004999212431244673, "loss": 6.1977, "mean_token_accuracy": 0.1265730917453766, "num_tokens": 3685880.0, "step": 1995 }, { "entropy": 6.220745372772217, "epoch": 0.16803192606595252, "grad_norm": 0.98828125, "learning_rate": 0.0004999204488558222, "loss": 6.0332, "mean_token_accuracy": 0.13368572890758515, "num_tokens": 3695167.0, "step": 2000 }, { "entropy": 6.279938268661499, "epoch": 0.16845200588111742, "grad_norm": 0.96875, "learning_rate": 0.0004999196506028273, "loss": 6.1455, "mean_token_accuracy": 0.12803823873400688, "num_tokens": 3703700.0, "step": 2005 }, { "entropy": 6.340878582000732, "epoch": 0.1688720856962823, "grad_norm": 1.0390625, "learning_rate": 0.0004999188483654965, "loss": 6.0938, "mean_token_accuracy": 0.12776080071926116, "num_tokens": 3712825.0, "step": 2010 }, { "entropy": 6.229676914215088, "epoch": 0.16929216551144718, "grad_norm": 0.9453125, "learning_rate": 0.0004999180421438442, "loss": 6.0447, "mean_token_accuracy": 0.13442618474364282, "num_tokens": 3721807.0, "step": 2015 }, { "entropy": 6.3377564430236815, "epoch": 0.16971224532661205, "grad_norm": 1.0625, "learning_rate": 0.0004999172319378846, "loss": 6.2308, "mean_token_accuracy": 0.12342165559530258, "num_tokens": 3730502.0, "step": 2020 }, { "entropy": 6.334515047073364, "epoch": 0.17013232514177692, "grad_norm": 0.98828125, "learning_rate": 0.0004999164177476319, "loss": 6.1138, "mean_token_accuracy": 0.13388336971402168, "num_tokens": 3739696.0, "step": 2025 }, { "entropy": 6.170955038070678, "epoch": 0.17055240495694182, "grad_norm": 1.0625, "learning_rate": 0.0004999155995731009, "loss": 6.1168, "mean_token_accuracy": 0.1329979881644249, "num_tokens": 3748675.0, "step": 2030 }, { "entropy": 6.440923643112183, "epoch": 0.1709724847721067, "grad_norm": 1.0234375, "learning_rate": 0.0004999147774143057, "loss": 6.1895, "mean_token_accuracy": 0.12849014177918433, "num_tokens": 3757714.0, "step": 2035 }, { "entropy": 6.217456531524658, "epoch": 0.1713925645872716, "grad_norm": 1.0, "learning_rate": 0.000499913951271261, "loss": 6.0181, "mean_token_accuracy": 0.13668849244713782, "num_tokens": 3767589.0, "step": 2040 }, { "entropy": 6.216994047164917, "epoch": 0.17181264440243646, "grad_norm": 1.125, "learning_rate": 0.0004999131211439816, "loss": 6.1246, "mean_token_accuracy": 0.13397686704993247, "num_tokens": 3777261.0, "step": 2045 }, { "entropy": 6.3198566913604735, "epoch": 0.17223272421760136, "grad_norm": 1.015625, "learning_rate": 0.000499912287032482, "loss": 6.0738, "mean_token_accuracy": 0.13602124899625778, "num_tokens": 3786658.0, "step": 2050 }, { "entropy": 6.19984622001648, "epoch": 0.17265280403276623, "grad_norm": 1.0703125, "learning_rate": 0.000499911448936777, "loss": 6.0669, "mean_token_accuracy": 0.14067015573382377, "num_tokens": 3794977.0, "step": 2055 }, { "entropy": 6.179085731506348, "epoch": 0.1730728838479311, "grad_norm": 0.93359375, "learning_rate": 0.0004999106068568816, "loss": 6.1457, "mean_token_accuracy": 0.12947675883769988, "num_tokens": 3805138.0, "step": 2060 }, { "entropy": 6.279845762252807, "epoch": 0.173492963663096, "grad_norm": 1.015625, "learning_rate": 0.0004999097607928106, "loss": 6.0911, "mean_token_accuracy": 0.13879665359854698, "num_tokens": 3814444.0, "step": 2065 }, { "entropy": 6.212150764465332, "epoch": 0.17391304347826086, "grad_norm": 0.984375, "learning_rate": 0.0004999089107445788, "loss": 6.0398, "mean_token_accuracy": 0.13306153938174248, "num_tokens": 3822859.0, "step": 2070 }, { "entropy": 6.133330869674682, "epoch": 0.17433312329342576, "grad_norm": 0.9140625, "learning_rate": 0.0004999080567122016, "loss": 6.0707, "mean_token_accuracy": 0.13198764845728875, "num_tokens": 3833159.0, "step": 2075 }, { "entropy": 6.295455646514893, "epoch": 0.17475320310859063, "grad_norm": 1.015625, "learning_rate": 0.0004999071986956941, "loss": 6.0856, "mean_token_accuracy": 0.13797224685549736, "num_tokens": 3842136.0, "step": 2080 }, { "entropy": 6.208657741546631, "epoch": 0.1751732829237555, "grad_norm": 1.0234375, "learning_rate": 0.0004999063366950713, "loss": 6.1499, "mean_token_accuracy": 0.12877421900629998, "num_tokens": 3851406.0, "step": 2085 }, { "entropy": 6.217505025863647, "epoch": 0.1755933627389204, "grad_norm": 1.0078125, "learning_rate": 0.0004999054707103486, "loss": 6.0713, "mean_token_accuracy": 0.1279774695634842, "num_tokens": 3861061.0, "step": 2090 }, { "entropy": 6.265169095993042, "epoch": 0.17601344255408527, "grad_norm": 1.0234375, "learning_rate": 0.0004999046007415412, "loss": 6.0378, "mean_token_accuracy": 0.12900712937116623, "num_tokens": 3870357.0, "step": 2095 }, { "entropy": 6.2917054176330565, "epoch": 0.17643352236925017, "grad_norm": 1.0, "learning_rate": 0.0004999037267886646, "loss": 6.0715, "mean_token_accuracy": 0.13141706436872483, "num_tokens": 3879393.0, "step": 2100 }, { "entropy": 6.180794954299927, "epoch": 0.17685360218441504, "grad_norm": 1.046875, "learning_rate": 0.0004999028488517343, "loss": 6.0832, "mean_token_accuracy": 0.13525146320462228, "num_tokens": 3888030.0, "step": 2105 }, { "entropy": 6.266747093200683, "epoch": 0.1772736819995799, "grad_norm": 1.0234375, "learning_rate": 0.0004999019669307659, "loss": 6.0788, "mean_token_accuracy": 0.1376435212790966, "num_tokens": 3897430.0, "step": 2110 }, { "entropy": 6.238908100128174, "epoch": 0.1776937618147448, "grad_norm": 0.9296875, "learning_rate": 0.0004999010810257749, "loss": 6.0977, "mean_token_accuracy": 0.12719068825244903, "num_tokens": 3907711.0, "step": 2115 }, { "entropy": 6.189173746109009, "epoch": 0.17811384162990967, "grad_norm": 0.9765625, "learning_rate": 0.0004999001911367771, "loss": 6.0411, "mean_token_accuracy": 0.13638337776064874, "num_tokens": 3915816.0, "step": 2120 }, { "entropy": 6.22648811340332, "epoch": 0.17853392144507457, "grad_norm": 0.96484375, "learning_rate": 0.0004998992972637883, "loss": 6.1538, "mean_token_accuracy": 0.12582943066954613, "num_tokens": 3925162.0, "step": 2125 }, { "entropy": 6.284874153137207, "epoch": 0.17895400126023944, "grad_norm": 0.94921875, "learning_rate": 0.0004998983994068242, "loss": 6.0395, "mean_token_accuracy": 0.13122835606336594, "num_tokens": 3934476.0, "step": 2130 }, { "entropy": 6.186276054382324, "epoch": 0.17937408107540434, "grad_norm": 0.93359375, "learning_rate": 0.0004998974975659006, "loss": 6.0907, "mean_token_accuracy": 0.1297646477818489, "num_tokens": 3943501.0, "step": 2135 }, { "entropy": 6.205726194381714, "epoch": 0.1797941608905692, "grad_norm": 0.96484375, "learning_rate": 0.0004998965917410338, "loss": 6.0816, "mean_token_accuracy": 0.12778471410274506, "num_tokens": 3953663.0, "step": 2140 }, { "entropy": 6.211074018478394, "epoch": 0.18021424070573408, "grad_norm": 1.0078125, "learning_rate": 0.0004998956819322397, "loss": 6.0495, "mean_token_accuracy": 0.13608243688941002, "num_tokens": 3962634.0, "step": 2145 }, { "entropy": 6.177238512039184, "epoch": 0.18063432052089898, "grad_norm": 0.94921875, "learning_rate": 0.0004998947681395343, "loss": 6.052, "mean_token_accuracy": 0.13605224341154099, "num_tokens": 3972496.0, "step": 2150 }, { "entropy": 6.390697908401489, "epoch": 0.18105440033606385, "grad_norm": 1.03125, "learning_rate": 0.000499893850362934, "loss": 6.2977, "mean_token_accuracy": 0.12441082820296287, "num_tokens": 3980724.0, "step": 2155 }, { "entropy": 6.262918901443482, "epoch": 0.18147448015122875, "grad_norm": 0.96875, "learning_rate": 0.0004998929286024548, "loss": 6.1304, "mean_token_accuracy": 0.1300631955265999, "num_tokens": 3989842.0, "step": 2160 }, { "entropy": 6.230935716629029, "epoch": 0.18189455996639362, "grad_norm": 1.109375, "learning_rate": 0.0004998920028581133, "loss": 6.0378, "mean_token_accuracy": 0.14167480319738388, "num_tokens": 3998534.0, "step": 2165 }, { "entropy": 6.241239356994629, "epoch": 0.18231463978155849, "grad_norm": 0.9765625, "learning_rate": 0.0004998910731299258, "loss": 6.0631, "mean_token_accuracy": 0.13066420927643776, "num_tokens": 4007677.0, "step": 2170 }, { "entropy": 6.19789605140686, "epoch": 0.18273471959672338, "grad_norm": 1.0234375, "learning_rate": 0.0004998901394179085, "loss": 6.1007, "mean_token_accuracy": 0.12627347633242608, "num_tokens": 4016347.0, "step": 2175 }, { "entropy": 6.198655843734741, "epoch": 0.18315479941188825, "grad_norm": 1.046875, "learning_rate": 0.0004998892017220784, "loss": 5.9767, "mean_token_accuracy": 0.14088783264160157, "num_tokens": 4025199.0, "step": 2180 }, { "entropy": 6.262273931503296, "epoch": 0.18357487922705315, "grad_norm": 1.0859375, "learning_rate": 0.0004998882600424519, "loss": 6.0603, "mean_token_accuracy": 0.1286892294883728, "num_tokens": 4033933.0, "step": 2185 }, { "entropy": 6.162368822097778, "epoch": 0.18399495904221802, "grad_norm": 0.9609375, "learning_rate": 0.0004998873143790455, "loss": 5.9753, "mean_token_accuracy": 0.1438771367073059, "num_tokens": 4042891.0, "step": 2190 }, { "entropy": 6.274066638946533, "epoch": 0.1844150388573829, "grad_norm": 0.9609375, "learning_rate": 0.0004998863647318763, "loss": 6.1041, "mean_token_accuracy": 0.13264708146452903, "num_tokens": 4051123.0, "step": 2195 }, { "entropy": 6.144877004623413, "epoch": 0.1848351186725478, "grad_norm": 1.046875, "learning_rate": 0.0004998854111009608, "loss": 6.0715, "mean_token_accuracy": 0.12865814492106437, "num_tokens": 4060025.0, "step": 2200 }, { "entropy": 6.182585954666138, "epoch": 0.18525519848771266, "grad_norm": 0.90625, "learning_rate": 0.0004998844534863161, "loss": 5.991, "mean_token_accuracy": 0.1295328378677368, "num_tokens": 4069363.0, "step": 2205 }, { "entropy": 6.241155099868775, "epoch": 0.18567527830287756, "grad_norm": 0.99609375, "learning_rate": 0.0004998834918879592, "loss": 6.1376, "mean_token_accuracy": 0.133307021856308, "num_tokens": 4078855.0, "step": 2210 }, { "entropy": 6.206245565414429, "epoch": 0.18609535811804243, "grad_norm": 0.9453125, "learning_rate": 0.000499882526305907, "loss": 6.0804, "mean_token_accuracy": 0.12953457087278367, "num_tokens": 4087801.0, "step": 2215 }, { "entropy": 6.248236179351807, "epoch": 0.18651543793320732, "grad_norm": 0.91796875, "learning_rate": 0.0004998815567401765, "loss": 6.0926, "mean_token_accuracy": 0.1376325160264969, "num_tokens": 4096949.0, "step": 2220 }, { "entropy": 6.279425954818725, "epoch": 0.1869355177483722, "grad_norm": 1.03125, "learning_rate": 0.0004998805831907851, "loss": 6.0617, "mean_token_accuracy": 0.13082574903964997, "num_tokens": 4105399.0, "step": 2225 }, { "entropy": 6.169968605041504, "epoch": 0.18735559756353706, "grad_norm": 1.0078125, "learning_rate": 0.0004998796056577501, "loss": 6.0071, "mean_token_accuracy": 0.12926321402192115, "num_tokens": 4113873.0, "step": 2230 }, { "entropy": 6.154512643814087, "epoch": 0.18777567737870196, "grad_norm": 0.90625, "learning_rate": 0.0004998786241410886, "loss": 6.0586, "mean_token_accuracy": 0.13699585050344468, "num_tokens": 4123528.0, "step": 2235 }, { "entropy": 6.2988721370697025, "epoch": 0.18819575719386683, "grad_norm": 0.9140625, "learning_rate": 0.000499877638640818, "loss": 6.0699, "mean_token_accuracy": 0.13017342165112494, "num_tokens": 4133370.0, "step": 2240 }, { "entropy": 6.184452104568481, "epoch": 0.18861583700903173, "grad_norm": 0.94140625, "learning_rate": 0.000499876649156956, "loss": 5.9844, "mean_token_accuracy": 0.13666255846619607, "num_tokens": 4142370.0, "step": 2245 }, { "entropy": 6.133312082290649, "epoch": 0.1890359168241966, "grad_norm": 0.96875, "learning_rate": 0.0004998756556895196, "loss": 6.0725, "mean_token_accuracy": 0.1354515865445137, "num_tokens": 4152367.0, "step": 2250 }, { "entropy": 6.21663122177124, "epoch": 0.18945599663936147, "grad_norm": 1.0078125, "learning_rate": 0.000499874658238527, "loss": 6.0625, "mean_token_accuracy": 0.13495326191186904, "num_tokens": 4161126.0, "step": 2255 }, { "entropy": 6.186970901489258, "epoch": 0.18987607645452637, "grad_norm": 1.0078125, "learning_rate": 0.0004998736568039957, "loss": 5.9748, "mean_token_accuracy": 0.13723411411046982, "num_tokens": 4169910.0, "step": 2260 }, { "entropy": 6.1857301712036135, "epoch": 0.19029615626969124, "grad_norm": 0.9921875, "learning_rate": 0.0004998726513859432, "loss": 6.1067, "mean_token_accuracy": 0.12761787325143814, "num_tokens": 4179893.0, "step": 2265 }, { "entropy": 6.308238935470581, "epoch": 0.19071623608485613, "grad_norm": 0.9140625, "learning_rate": 0.0004998716419843875, "loss": 6.12, "mean_token_accuracy": 0.13745217099785806, "num_tokens": 4190065.0, "step": 2270 }, { "entropy": 6.090948486328125, "epoch": 0.191136315900021, "grad_norm": 1.015625, "learning_rate": 0.0004998706285993465, "loss": 6.0313, "mean_token_accuracy": 0.1420229621231556, "num_tokens": 4198395.0, "step": 2275 }, { "entropy": 6.282499647140503, "epoch": 0.19155639571518587, "grad_norm": 0.9453125, "learning_rate": 0.0004998696112308381, "loss": 6.0533, "mean_token_accuracy": 0.1310360386967659, "num_tokens": 4207555.0, "step": 2280 }, { "entropy": 6.088230180740356, "epoch": 0.19197647553035077, "grad_norm": 0.9296875, "learning_rate": 0.0004998685898788803, "loss": 5.9946, "mean_token_accuracy": 0.13536595478653907, "num_tokens": 4216533.0, "step": 2285 }, { "entropy": 6.274929618835449, "epoch": 0.19239655534551564, "grad_norm": 1.0390625, "learning_rate": 0.0004998675645434914, "loss": 6.1095, "mean_token_accuracy": 0.13767784610390663, "num_tokens": 4225575.0, "step": 2290 }, { "entropy": 6.153714513778686, "epoch": 0.19281663516068054, "grad_norm": 1.0234375, "learning_rate": 0.0004998665352246891, "loss": 5.8958, "mean_token_accuracy": 0.14245088025927544, "num_tokens": 4234306.0, "step": 2295 }, { "entropy": 6.08680305480957, "epoch": 0.1932367149758454, "grad_norm": 0.9609375, "learning_rate": 0.0004998655019224921, "loss": 6.0823, "mean_token_accuracy": 0.1359329827129841, "num_tokens": 4243998.0, "step": 2300 }, { "entropy": 6.237053394317627, "epoch": 0.19365679479101028, "grad_norm": 0.98828125, "learning_rate": 0.0004998644646369185, "loss": 5.9776, "mean_token_accuracy": 0.13352483361959458, "num_tokens": 4253653.0, "step": 2305 }, { "entropy": 6.139167737960816, "epoch": 0.19407687460617518, "grad_norm": 0.98828125, "learning_rate": 0.0004998634233679865, "loss": 6.0652, "mean_token_accuracy": 0.1278400629758835, "num_tokens": 4263305.0, "step": 2310 }, { "entropy": 6.127392339706421, "epoch": 0.19449695442134005, "grad_norm": 1.0078125, "learning_rate": 0.000499862378115715, "loss": 5.9342, "mean_token_accuracy": 0.14543856382369996, "num_tokens": 4272212.0, "step": 2315 }, { "entropy": 6.305202007293701, "epoch": 0.19491703423650494, "grad_norm": 1.0625, "learning_rate": 0.0004998613288801221, "loss": 6.1375, "mean_token_accuracy": 0.13151465207338334, "num_tokens": 4281445.0, "step": 2320 }, { "entropy": 6.2177956104278564, "epoch": 0.1953371140516698, "grad_norm": 0.9609375, "learning_rate": 0.0004998602756612267, "loss": 6.055, "mean_token_accuracy": 0.1372949168086052, "num_tokens": 4290938.0, "step": 2325 }, { "entropy": 6.175972557067871, "epoch": 0.1957571938668347, "grad_norm": 0.9765625, "learning_rate": 0.0004998592184590471, "loss": 6.0786, "mean_token_accuracy": 0.13233636021614076, "num_tokens": 4300022.0, "step": 2330 }, { "entropy": 6.134920358657837, "epoch": 0.19617727368199958, "grad_norm": 1.0, "learning_rate": 0.0004998581572736024, "loss": 5.9674, "mean_token_accuracy": 0.1363460712134838, "num_tokens": 4308910.0, "step": 2335 }, { "entropy": 6.092206907272339, "epoch": 0.19659735349716445, "grad_norm": 0.93359375, "learning_rate": 0.0004998570921049112, "loss": 5.9454, "mean_token_accuracy": 0.13969452679157257, "num_tokens": 4317136.0, "step": 2340 }, { "entropy": 6.112558746337891, "epoch": 0.19701743331232935, "grad_norm": 1.046875, "learning_rate": 0.0004998560229529924, "loss": 5.9993, "mean_token_accuracy": 0.1428337089717388, "num_tokens": 4326163.0, "step": 2345 }, { "entropy": 6.308993816375732, "epoch": 0.19743751312749422, "grad_norm": 0.97265625, "learning_rate": 0.0004998549498178649, "loss": 6.1402, "mean_token_accuracy": 0.13658420667052268, "num_tokens": 4335837.0, "step": 2350 }, { "entropy": 6.216946363449097, "epoch": 0.19785759294265912, "grad_norm": 1.09375, "learning_rate": 0.0004998538726995477, "loss": 6.0561, "mean_token_accuracy": 0.1374947391450405, "num_tokens": 4345108.0, "step": 2355 }, { "entropy": 6.217574787139893, "epoch": 0.198277672757824, "grad_norm": 0.953125, "learning_rate": 0.00049985279159806, "loss": 6.0722, "mean_token_accuracy": 0.1334306165575981, "num_tokens": 4353761.0, "step": 2360 }, { "entropy": 6.1630774974823, "epoch": 0.19869775257298886, "grad_norm": 0.99609375, "learning_rate": 0.0004998517065134208, "loss": 6.0354, "mean_token_accuracy": 0.13587109968066216, "num_tokens": 4363244.0, "step": 2365 }, { "entropy": 6.205533790588379, "epoch": 0.19911783238815375, "grad_norm": 0.92578125, "learning_rate": 0.0004998506174456494, "loss": 6.0386, "mean_token_accuracy": 0.13257589265704156, "num_tokens": 4373034.0, "step": 2370 }, { "entropy": 6.200410652160644, "epoch": 0.19953791220331862, "grad_norm": 0.90625, "learning_rate": 0.0004998495243947653, "loss": 5.9816, "mean_token_accuracy": 0.13029902279376984, "num_tokens": 4382554.0, "step": 2375 }, { "entropy": 6.191087865829468, "epoch": 0.19995799201848352, "grad_norm": 1.03125, "learning_rate": 0.0004998484273607875, "loss": 5.9843, "mean_token_accuracy": 0.14299238696694375, "num_tokens": 4391001.0, "step": 2380 }, { "entropy": 6.023518228530884, "epoch": 0.2003780718336484, "grad_norm": 0.9140625, "learning_rate": 0.0004998473263437356, "loss": 5.9141, "mean_token_accuracy": 0.13673870489001275, "num_tokens": 4400632.0, "step": 2385 }, { "entropy": 6.105119514465332, "epoch": 0.20079815164881326, "grad_norm": 0.97265625, "learning_rate": 0.000499846221343629, "loss": 6.0095, "mean_token_accuracy": 0.12952324375510216, "num_tokens": 4409565.0, "step": 2390 }, { "entropy": 6.128167533874512, "epoch": 0.20121823146397816, "grad_norm": 1.0234375, "learning_rate": 0.0004998451123604875, "loss": 5.944, "mean_token_accuracy": 0.14282809123396872, "num_tokens": 4418384.0, "step": 2395 }, { "entropy": 6.1983355522155765, "epoch": 0.20163831127914303, "grad_norm": 1.0546875, "learning_rate": 0.0004998439993943306, "loss": 6.0692, "mean_token_accuracy": 0.1389256276190281, "num_tokens": 4427581.0, "step": 2400 }, { "entropy": 6.267655086517334, "epoch": 0.20205839109430793, "grad_norm": 1.0078125, "learning_rate": 0.0004998428824451779, "loss": 6.0521, "mean_token_accuracy": 0.1341543450951576, "num_tokens": 4436572.0, "step": 2405 }, { "entropy": 6.1763083934783936, "epoch": 0.2024784709094728, "grad_norm": 1.0078125, "learning_rate": 0.0004998417615130495, "loss": 6.055, "mean_token_accuracy": 0.13537125810980796, "num_tokens": 4445230.0, "step": 2410 }, { "entropy": 6.247248315811158, "epoch": 0.2028985507246377, "grad_norm": 0.98046875, "learning_rate": 0.0004998406365979649, "loss": 6.1134, "mean_token_accuracy": 0.13383878991007805, "num_tokens": 4454251.0, "step": 2415 }, { "entropy": 6.136447811126709, "epoch": 0.20331863053980256, "grad_norm": 0.9375, "learning_rate": 0.0004998395076999443, "loss": 5.9699, "mean_token_accuracy": 0.13695907220244408, "num_tokens": 4463949.0, "step": 2420 }, { "entropy": 6.227413558959961, "epoch": 0.20373871035496743, "grad_norm": 1.03125, "learning_rate": 0.0004998383748190076, "loss": 6.1649, "mean_token_accuracy": 0.12917085587978364, "num_tokens": 4473373.0, "step": 2425 }, { "entropy": 6.249214363098145, "epoch": 0.20415879017013233, "grad_norm": 1.0234375, "learning_rate": 0.0004998372379551748, "loss": 5.9842, "mean_token_accuracy": 0.1414948470890522, "num_tokens": 4482303.0, "step": 2430 }, { "entropy": 6.117572832107544, "epoch": 0.2045788699852972, "grad_norm": 0.9765625, "learning_rate": 0.0004998360971084663, "loss": 5.9567, "mean_token_accuracy": 0.1317524030804634, "num_tokens": 4491214.0, "step": 2435 }, { "entropy": 6.057681226730347, "epoch": 0.2049989498004621, "grad_norm": 0.97265625, "learning_rate": 0.0004998349522789019, "loss": 5.8856, "mean_token_accuracy": 0.14377139806747435, "num_tokens": 4500099.0, "step": 2440 }, { "entropy": 6.115459060668945, "epoch": 0.20541902961562697, "grad_norm": 0.96875, "learning_rate": 0.0004998338034665021, "loss": 5.9692, "mean_token_accuracy": 0.1437109664082527, "num_tokens": 4509893.0, "step": 2445 }, { "entropy": 6.08744249343872, "epoch": 0.20583910943079184, "grad_norm": 0.98828125, "learning_rate": 0.0004998326506712872, "loss": 5.9375, "mean_token_accuracy": 0.13774847760796546, "num_tokens": 4518606.0, "step": 2450 }, { "entropy": 6.11673412322998, "epoch": 0.20625918924595674, "grad_norm": 0.99609375, "learning_rate": 0.0004998314938932778, "loss": 6.0218, "mean_token_accuracy": 0.14001012295484544, "num_tokens": 4528392.0, "step": 2455 }, { "entropy": 6.221143388748169, "epoch": 0.2066792690611216, "grad_norm": 0.96875, "learning_rate": 0.0004998303331324943, "loss": 5.9923, "mean_token_accuracy": 0.13821439668536187, "num_tokens": 4536983.0, "step": 2460 }, { "entropy": 6.041988134384155, "epoch": 0.2070993488762865, "grad_norm": 0.96875, "learning_rate": 0.0004998291683889571, "loss": 5.9145, "mean_token_accuracy": 0.1391140677034855, "num_tokens": 4544967.0, "step": 2465 }, { "entropy": 6.134957313537598, "epoch": 0.20751942869145137, "grad_norm": 1.0234375, "learning_rate": 0.000499827999662687, "loss": 5.9727, "mean_token_accuracy": 0.13200750946998596, "num_tokens": 4554646.0, "step": 2470 }, { "entropy": 6.192252588272095, "epoch": 0.20793950850661624, "grad_norm": 0.9453125, "learning_rate": 0.0004998268269537046, "loss": 5.9954, "mean_token_accuracy": 0.1370847873389721, "num_tokens": 4564040.0, "step": 2475 }, { "entropy": 6.091167068481445, "epoch": 0.20835958832178114, "grad_norm": 0.96875, "learning_rate": 0.0004998256502620308, "loss": 6.0187, "mean_token_accuracy": 0.14094985872507096, "num_tokens": 4573758.0, "step": 2480 }, { "entropy": 6.206011056900024, "epoch": 0.208779668136946, "grad_norm": 0.92578125, "learning_rate": 0.0004998244695876864, "loss": 6.0452, "mean_token_accuracy": 0.13380730673670768, "num_tokens": 4582097.0, "step": 2485 }, { "entropy": 6.0949585914611815, "epoch": 0.2091997479521109, "grad_norm": 1.015625, "learning_rate": 0.0004998232849306921, "loss": 6.0055, "mean_token_accuracy": 0.13993047401309014, "num_tokens": 4590687.0, "step": 2490 }, { "entropy": 6.1933338165283205, "epoch": 0.20961982776727578, "grad_norm": 0.9765625, "learning_rate": 0.0004998220962910693, "loss": 5.9965, "mean_token_accuracy": 0.13453714549541473, "num_tokens": 4599497.0, "step": 2495 }, { "entropy": 6.101396179199218, "epoch": 0.21003990758244068, "grad_norm": 1.0390625, "learning_rate": 0.0004998209036688386, "loss": 5.9532, "mean_token_accuracy": 0.13716981932520866, "num_tokens": 4607958.0, "step": 2500 }, { "entropy": 6.216299772262573, "epoch": 0.21045998739760555, "grad_norm": 0.96484375, "learning_rate": 0.0004998197070640216, "loss": 6.0812, "mean_token_accuracy": 0.1314453199505806, "num_tokens": 4617515.0, "step": 2505 }, { "entropy": 6.2111225605010985, "epoch": 0.21088006721277042, "grad_norm": 0.9765625, "learning_rate": 0.0004998185064766391, "loss": 5.9892, "mean_token_accuracy": 0.135587390512228, "num_tokens": 4627037.0, "step": 2510 }, { "entropy": 6.083059787750244, "epoch": 0.21130014702793531, "grad_norm": 0.91015625, "learning_rate": 0.0004998173019067127, "loss": 5.9864, "mean_token_accuracy": 0.13536423593759536, "num_tokens": 4637393.0, "step": 2515 }, { "entropy": 6.111885261535645, "epoch": 0.21172022684310018, "grad_norm": 0.98828125, "learning_rate": 0.0004998160933542633, "loss": 6.0252, "mean_token_accuracy": 0.12426691725850106, "num_tokens": 4646832.0, "step": 2520 }, { "entropy": 6.200415229797363, "epoch": 0.21214030665826508, "grad_norm": 1.0703125, "learning_rate": 0.0004998148808193128, "loss": 6.0364, "mean_token_accuracy": 0.1378290109336376, "num_tokens": 4655719.0, "step": 2525 }, { "entropy": 6.140298128128052, "epoch": 0.21256038647342995, "grad_norm": 0.953125, "learning_rate": 0.0004998136643018823, "loss": 5.9978, "mean_token_accuracy": 0.1409161224961281, "num_tokens": 4665364.0, "step": 2530 }, { "entropy": 6.113859462738037, "epoch": 0.21298046628859482, "grad_norm": 1.0234375, "learning_rate": 0.0004998124438019935, "loss": 5.9707, "mean_token_accuracy": 0.13255369514226914, "num_tokens": 4674760.0, "step": 2535 }, { "entropy": 6.032169342041016, "epoch": 0.21340054610375972, "grad_norm": 0.9375, "learning_rate": 0.0004998112193196681, "loss": 5.8954, "mean_token_accuracy": 0.1398087151348591, "num_tokens": 4683900.0, "step": 2540 }, { "entropy": 6.009505701065064, "epoch": 0.2138206259189246, "grad_norm": 0.98046875, "learning_rate": 0.0004998099908549277, "loss": 5.9487, "mean_token_accuracy": 0.1326383799314499, "num_tokens": 4693915.0, "step": 2545 }, { "entropy": 6.048102998733521, "epoch": 0.2142407057340895, "grad_norm": 0.98046875, "learning_rate": 0.000499808758407794, "loss": 5.7948, "mean_token_accuracy": 0.1494914174079895, "num_tokens": 4703102.0, "step": 2550 }, { "entropy": 6.130202150344848, "epoch": 0.21466078554925436, "grad_norm": 0.96875, "learning_rate": 0.0004998075219782889, "loss": 6.0201, "mean_token_accuracy": 0.13604088351130486, "num_tokens": 4712925.0, "step": 2555 }, { "entropy": 6.086578845977783, "epoch": 0.21508086536441923, "grad_norm": 1.0078125, "learning_rate": 0.0004998062815664344, "loss": 5.9508, "mean_token_accuracy": 0.13391971811652184, "num_tokens": 4722641.0, "step": 2560 }, { "entropy": 6.060202693939209, "epoch": 0.21550094517958412, "grad_norm": 0.9375, "learning_rate": 0.0004998050371722524, "loss": 6.028, "mean_token_accuracy": 0.13827937468886375, "num_tokens": 4732603.0, "step": 2565 }, { "entropy": 6.060051965713501, "epoch": 0.215921024994749, "grad_norm": 0.90625, "learning_rate": 0.0004998037887957649, "loss": 5.8655, "mean_token_accuracy": 0.1426350235939026, "num_tokens": 4742644.0, "step": 2570 }, { "entropy": 6.2458967685699465, "epoch": 0.2163411048099139, "grad_norm": 0.9765625, "learning_rate": 0.0004998025364369939, "loss": 6.1759, "mean_token_accuracy": 0.1332129217684269, "num_tokens": 4751482.0, "step": 2575 }, { "entropy": 6.246464967727661, "epoch": 0.21676118462507876, "grad_norm": 1.03125, "learning_rate": 0.0004998012800959619, "loss": 6.0435, "mean_token_accuracy": 0.13494925051927567, "num_tokens": 4760593.0, "step": 2580 }, { "entropy": 6.139482402801514, "epoch": 0.21718126444024366, "grad_norm": 1.046875, "learning_rate": 0.0004998000197726909, "loss": 6.041, "mean_token_accuracy": 0.14071242287755012, "num_tokens": 4769294.0, "step": 2585 }, { "entropy": 6.151182079315186, "epoch": 0.21760134425540853, "grad_norm": 0.87890625, "learning_rate": 0.0004997987554672033, "loss": 5.9433, "mean_token_accuracy": 0.13458855599164962, "num_tokens": 4779239.0, "step": 2590 }, { "entropy": 6.153560495376587, "epoch": 0.2180214240705734, "grad_norm": 0.921875, "learning_rate": 0.0004997974871795215, "loss": 6.0165, "mean_token_accuracy": 0.13904761373996735, "num_tokens": 4788211.0, "step": 2595 }, { "entropy": 6.1266923427581785, "epoch": 0.2184415038857383, "grad_norm": 0.87109375, "learning_rate": 0.000499796214909668, "loss": 5.9707, "mean_token_accuracy": 0.14307306259870528, "num_tokens": 4797921.0, "step": 2600 }, { "entropy": 6.151721715927124, "epoch": 0.21886158370090317, "grad_norm": 0.97265625, "learning_rate": 0.0004997949386576653, "loss": 5.9792, "mean_token_accuracy": 0.1372672997415066, "num_tokens": 4807772.0, "step": 2605 }, { "entropy": 5.999966764450074, "epoch": 0.21928166351606806, "grad_norm": 0.9375, "learning_rate": 0.000499793658423536, "loss": 6.0037, "mean_token_accuracy": 0.13394766226410865, "num_tokens": 4817999.0, "step": 2610 }, { "entropy": 6.197027158737183, "epoch": 0.21970174333123293, "grad_norm": 1.0625, "learning_rate": 0.0004997923742073028, "loss": 5.9552, "mean_token_accuracy": 0.14477612674236298, "num_tokens": 4826679.0, "step": 2615 }, { "entropy": 6.0403674125671385, "epoch": 0.2201218231463978, "grad_norm": 1.015625, "learning_rate": 0.0004997910860089884, "loss": 5.9647, "mean_token_accuracy": 0.13903913348913194, "num_tokens": 4834998.0, "step": 2620 }, { "entropy": 6.119702100753784, "epoch": 0.2205419029615627, "grad_norm": 1.0234375, "learning_rate": 0.0004997897938286156, "loss": 5.9173, "mean_token_accuracy": 0.13934070989489555, "num_tokens": 4843635.0, "step": 2625 }, { "entropy": 6.135205316543579, "epoch": 0.22096198277672757, "grad_norm": 1.0859375, "learning_rate": 0.0004997884976662075, "loss": 6.0334, "mean_token_accuracy": 0.13847846239805223, "num_tokens": 4852027.0, "step": 2630 }, { "entropy": 6.115947484970093, "epoch": 0.22138206259189247, "grad_norm": 1.0390625, "learning_rate": 0.0004997871975217868, "loss": 5.9555, "mean_token_accuracy": 0.1428781971335411, "num_tokens": 4861244.0, "step": 2635 }, { "entropy": 6.043252468109131, "epoch": 0.22180214240705734, "grad_norm": 0.95703125, "learning_rate": 0.0004997858933953768, "loss": 5.8579, "mean_token_accuracy": 0.14281281381845473, "num_tokens": 4869902.0, "step": 2640 }, { "entropy": 6.012739181518555, "epoch": 0.2222222222222222, "grad_norm": 0.95703125, "learning_rate": 0.0004997845852870004, "loss": 5.8421, "mean_token_accuracy": 0.1463296964764595, "num_tokens": 4878502.0, "step": 2645 }, { "entropy": 6.089871215820312, "epoch": 0.2226423020373871, "grad_norm": 0.9765625, "learning_rate": 0.0004997832731966806, "loss": 5.9032, "mean_token_accuracy": 0.14714645445346833, "num_tokens": 4888348.0, "step": 2650 }, { "entropy": 6.06225700378418, "epoch": 0.22306238185255198, "grad_norm": 1.015625, "learning_rate": 0.0004997819571244411, "loss": 5.972, "mean_token_accuracy": 0.1450254276394844, "num_tokens": 4897302.0, "step": 2655 }, { "entropy": 6.0446860790252686, "epoch": 0.22348246166771688, "grad_norm": 1.0, "learning_rate": 0.0004997806370703049, "loss": 5.9876, "mean_token_accuracy": 0.14430617392063141, "num_tokens": 4907078.0, "step": 2660 }, { "entropy": 6.057806348800659, "epoch": 0.22390254148288175, "grad_norm": 0.8671875, "learning_rate": 0.0004997793130342954, "loss": 5.8272, "mean_token_accuracy": 0.1456086441874504, "num_tokens": 4917489.0, "step": 2665 }, { "entropy": 5.973814630508423, "epoch": 0.22432262129804661, "grad_norm": 0.9765625, "learning_rate": 0.0004997779850164363, "loss": 5.9156, "mean_token_accuracy": 0.140571466088295, "num_tokens": 4927073.0, "step": 2670 }, { "entropy": 6.177860355377197, "epoch": 0.2247427011132115, "grad_norm": 0.98828125, "learning_rate": 0.0004997766530167508, "loss": 6.019, "mean_token_accuracy": 0.1344543881714344, "num_tokens": 4935464.0, "step": 2675 }, { "entropy": 6.22092981338501, "epoch": 0.22516278092837638, "grad_norm": 1.0078125, "learning_rate": 0.0004997753170352627, "loss": 6.0914, "mean_token_accuracy": 0.13605839386582375, "num_tokens": 4944718.0, "step": 2680 }, { "entropy": 6.105925226211548, "epoch": 0.22558286074354128, "grad_norm": 1.03125, "learning_rate": 0.0004997739770719955, "loss": 5.9844, "mean_token_accuracy": 0.13587288782000542, "num_tokens": 4954223.0, "step": 2685 }, { "entropy": 6.107930469512939, "epoch": 0.22600294055870615, "grad_norm": 0.921875, "learning_rate": 0.000499772633126973, "loss": 6.0132, "mean_token_accuracy": 0.13594387769699096, "num_tokens": 4963371.0, "step": 2690 }, { "entropy": 6.04271125793457, "epoch": 0.22642302037387105, "grad_norm": 0.98046875, "learning_rate": 0.0004997712852002192, "loss": 5.8679, "mean_token_accuracy": 0.1471228800714016, "num_tokens": 4972973.0, "step": 2695 }, { "entropy": 6.086397647857666, "epoch": 0.22684310018903592, "grad_norm": 1.0234375, "learning_rate": 0.0004997699332917578, "loss": 6.1119, "mean_token_accuracy": 0.12916670590639115, "num_tokens": 4982808.0, "step": 2700 }, { "entropy": 6.201492786407471, "epoch": 0.2272631800042008, "grad_norm": 0.94140625, "learning_rate": 0.0004997685774016127, "loss": 5.9896, "mean_token_accuracy": 0.13685485795140268, "num_tokens": 4992427.0, "step": 2705 }, { "entropy": 6.162964010238648, "epoch": 0.22768325981936569, "grad_norm": 0.84375, "learning_rate": 0.000499767217529808, "loss": 6.1604, "mean_token_accuracy": 0.12921097874641418, "num_tokens": 5003562.0, "step": 2710 }, { "entropy": 6.098525857925415, "epoch": 0.22810333963453056, "grad_norm": 0.890625, "learning_rate": 0.0004997658536763678, "loss": 5.8638, "mean_token_accuracy": 0.1451013281941414, "num_tokens": 5013429.0, "step": 2715 }, { "entropy": 6.117339611053467, "epoch": 0.22852341944969545, "grad_norm": 0.953125, "learning_rate": 0.0004997644858413163, "loss": 6.0022, "mean_token_accuracy": 0.14247513711452484, "num_tokens": 5022045.0, "step": 2720 }, { "entropy": 6.008642053604126, "epoch": 0.22894349926486032, "grad_norm": 0.88671875, "learning_rate": 0.0004997631140246775, "loss": 5.8287, "mean_token_accuracy": 0.14408515840768815, "num_tokens": 5032260.0, "step": 2725 }, { "entropy": 6.021863174438477, "epoch": 0.2293635790800252, "grad_norm": 0.9453125, "learning_rate": 0.000499761738226476, "loss": 5.8626, "mean_token_accuracy": 0.14258013665676117, "num_tokens": 5041688.0, "step": 2730 }, { "entropy": 6.056025457382202, "epoch": 0.2297836588951901, "grad_norm": 0.9765625, "learning_rate": 0.000499760358446736, "loss": 5.9702, "mean_token_accuracy": 0.13718490228056907, "num_tokens": 5051005.0, "step": 2735 }, { "entropy": 6.152891635894775, "epoch": 0.23020373871035496, "grad_norm": 0.96484375, "learning_rate": 0.000499758974685482, "loss": 5.9147, "mean_token_accuracy": 0.13967233374714852, "num_tokens": 5060084.0, "step": 2740 }, { "entropy": 6.059838390350341, "epoch": 0.23062381852551986, "grad_norm": 1.0859375, "learning_rate": 0.0004997575869427385, "loss": 5.9122, "mean_token_accuracy": 0.14734914749860764, "num_tokens": 5069081.0, "step": 2745 }, { "entropy": 6.0928624153137205, "epoch": 0.23104389834068473, "grad_norm": 0.9609375, "learning_rate": 0.00049975619521853, "loss": 5.9121, "mean_token_accuracy": 0.13845374211668968, "num_tokens": 5078597.0, "step": 2750 }, { "entropy": 6.052087306976318, "epoch": 0.2314639781558496, "grad_norm": 0.953125, "learning_rate": 0.0004997547995128814, "loss": 5.9554, "mean_token_accuracy": 0.14530446976423264, "num_tokens": 5087607.0, "step": 2755 }, { "entropy": 6.094136476516724, "epoch": 0.2318840579710145, "grad_norm": 1.078125, "learning_rate": 0.0004997533998258171, "loss": 5.9424, "mean_token_accuracy": 0.14329736083745956, "num_tokens": 5097412.0, "step": 2760 }, { "entropy": 6.16567211151123, "epoch": 0.23230413778617937, "grad_norm": 0.984375, "learning_rate": 0.0004997519961573622, "loss": 6.0152, "mean_token_accuracy": 0.13348544016480446, "num_tokens": 5105817.0, "step": 2765 }, { "entropy": 6.226717376708985, "epoch": 0.23272421760134426, "grad_norm": 1.0625, "learning_rate": 0.0004997505885075414, "loss": 6.0522, "mean_token_accuracy": 0.13480133637785913, "num_tokens": 5114958.0, "step": 2770 }, { "entropy": 6.084324312210083, "epoch": 0.23314429741650913, "grad_norm": 0.9609375, "learning_rate": 0.0004997491768763795, "loss": 5.9898, "mean_token_accuracy": 0.13868246227502823, "num_tokens": 5123728.0, "step": 2775 }, { "entropy": 6.100927209854126, "epoch": 0.23356437723167403, "grad_norm": 0.9921875, "learning_rate": 0.0004997477612639018, "loss": 6.0218, "mean_token_accuracy": 0.13395264372229576, "num_tokens": 5134099.0, "step": 2780 }, { "entropy": 6.162116241455078, "epoch": 0.2339844570468389, "grad_norm": 1.0, "learning_rate": 0.0004997463416701332, "loss": 6.0325, "mean_token_accuracy": 0.13172747194766998, "num_tokens": 5142934.0, "step": 2785 }, { "entropy": 6.000607919692993, "epoch": 0.23440453686200377, "grad_norm": 0.99609375, "learning_rate": 0.0004997449180950989, "loss": 5.8681, "mean_token_accuracy": 0.15649961084127426, "num_tokens": 5151835.0, "step": 2790 }, { "entropy": 6.038245487213135, "epoch": 0.23482461667716867, "grad_norm": 0.9140625, "learning_rate": 0.0004997434905388241, "loss": 5.921, "mean_token_accuracy": 0.1477814018726349, "num_tokens": 5161136.0, "step": 2795 }, { "entropy": 6.029763174057007, "epoch": 0.23524469649233354, "grad_norm": 0.921875, "learning_rate": 0.000499742059001334, "loss": 5.8684, "mean_token_accuracy": 0.14450337663292884, "num_tokens": 5170741.0, "step": 2800 }, { "entropy": 6.046102046966553, "epoch": 0.23566477630749844, "grad_norm": 0.9921875, "learning_rate": 0.0004997406234826541, "loss": 5.9001, "mean_token_accuracy": 0.14729267880320548, "num_tokens": 5180549.0, "step": 2805 }, { "entropy": 5.980107164382934, "epoch": 0.2360848561226633, "grad_norm": 0.88671875, "learning_rate": 0.0004997391839828098, "loss": 5.8667, "mean_token_accuracy": 0.14962306916713713, "num_tokens": 5189486.0, "step": 2810 }, { "entropy": 6.044159746170044, "epoch": 0.23650493593782818, "grad_norm": 0.96484375, "learning_rate": 0.0004997377405018266, "loss": 5.9303, "mean_token_accuracy": 0.13750530928373336, "num_tokens": 5198525.0, "step": 2815 }, { "entropy": 6.075648498535156, "epoch": 0.23692501575299307, "grad_norm": 0.99609375, "learning_rate": 0.00049973629303973, "loss": 5.9734, "mean_token_accuracy": 0.14086321070790292, "num_tokens": 5207124.0, "step": 2820 }, { "entropy": 5.964286422729492, "epoch": 0.23734509556815794, "grad_norm": 0.8984375, "learning_rate": 0.0004997348415965457, "loss": 5.8079, "mean_token_accuracy": 0.14603810012340546, "num_tokens": 5216529.0, "step": 2825 }, { "entropy": 6.12622709274292, "epoch": 0.23776517538332284, "grad_norm": 1.03125, "learning_rate": 0.0004997333861722995, "loss": 5.9402, "mean_token_accuracy": 0.14331007972359658, "num_tokens": 5225796.0, "step": 2830 }, { "entropy": 6.085462188720703, "epoch": 0.2381852551984877, "grad_norm": 1.0703125, "learning_rate": 0.000499731926767017, "loss": 5.9732, "mean_token_accuracy": 0.14003979936242103, "num_tokens": 5233876.0, "step": 2835 }, { "entropy": 6.016348743438721, "epoch": 0.23860533501365258, "grad_norm": 0.9375, "learning_rate": 0.0004997304633807242, "loss": 5.9695, "mean_token_accuracy": 0.13823127001523972, "num_tokens": 5244782.0, "step": 2840 }, { "entropy": 6.077929925918579, "epoch": 0.23902541482881748, "grad_norm": 0.99609375, "learning_rate": 0.0004997289960134468, "loss": 5.8993, "mean_token_accuracy": 0.14192162305116654, "num_tokens": 5253453.0, "step": 2845 }, { "entropy": 6.049857330322266, "epoch": 0.23944549464398235, "grad_norm": 1.0546875, "learning_rate": 0.0004997275246652111, "loss": 5.9414, "mean_token_accuracy": 0.14183279648423194, "num_tokens": 5262355.0, "step": 2850 }, { "entropy": 6.019342088699341, "epoch": 0.23986557445914725, "grad_norm": 1.0, "learning_rate": 0.000499726049336043, "loss": 5.8652, "mean_token_accuracy": 0.14227822795510292, "num_tokens": 5271959.0, "step": 2855 }, { "entropy": 6.045290803909301, "epoch": 0.24028565427431212, "grad_norm": 1.0546875, "learning_rate": 0.0004997245700259686, "loss": 5.8938, "mean_token_accuracy": 0.14394148513674737, "num_tokens": 5281393.0, "step": 2860 }, { "entropy": 6.126777935028076, "epoch": 0.240705734089477, "grad_norm": 0.921875, "learning_rate": 0.0004997230867350141, "loss": 6.0153, "mean_token_accuracy": 0.13795892894268036, "num_tokens": 5290979.0, "step": 2865 }, { "entropy": 6.170654964447022, "epoch": 0.24112581390464188, "grad_norm": 0.9921875, "learning_rate": 0.0004997215994632059, "loss": 5.9662, "mean_token_accuracy": 0.1420626498758793, "num_tokens": 5300263.0, "step": 2870 }, { "entropy": 6.098070096969605, "epoch": 0.24154589371980675, "grad_norm": 0.94921875, "learning_rate": 0.0004997201082105704, "loss": 5.9973, "mean_token_accuracy": 0.1376795694231987, "num_tokens": 5309522.0, "step": 2875 }, { "entropy": 6.09854941368103, "epoch": 0.24196597353497165, "grad_norm": 1.03125, "learning_rate": 0.0004997186129771338, "loss": 5.9906, "mean_token_accuracy": 0.1443823680281639, "num_tokens": 5319770.0, "step": 2880 }, { "entropy": 6.159392309188843, "epoch": 0.24238605335013652, "grad_norm": 1.015625, "learning_rate": 0.0004997171137629226, "loss": 5.9994, "mean_token_accuracy": 0.14119460731744765, "num_tokens": 5328400.0, "step": 2885 }, { "entropy": 6.00137939453125, "epoch": 0.24280613316530142, "grad_norm": 1.03125, "learning_rate": 0.0004997156105679636, "loss": 5.8054, "mean_token_accuracy": 0.15445883423089982, "num_tokens": 5336338.0, "step": 2890 }, { "entropy": 5.9904273509979244, "epoch": 0.2432262129804663, "grad_norm": 0.97265625, "learning_rate": 0.0004997141033922832, "loss": 5.8983, "mean_token_accuracy": 0.1381608746945858, "num_tokens": 5345391.0, "step": 2895 }, { "entropy": 6.080091238021851, "epoch": 0.24364629279563116, "grad_norm": 0.9921875, "learning_rate": 0.0004997125922359081, "loss": 5.9345, "mean_token_accuracy": 0.13472433462738992, "num_tokens": 5354709.0, "step": 2900 }, { "entropy": 6.0483152866363525, "epoch": 0.24406637261079606, "grad_norm": 1.0, "learning_rate": 0.0004997110770988652, "loss": 5.8441, "mean_token_accuracy": 0.14647466093301773, "num_tokens": 5363738.0, "step": 2905 }, { "entropy": 6.065390634536743, "epoch": 0.24448645242596093, "grad_norm": 1.078125, "learning_rate": 0.0004997095579811813, "loss": 5.9742, "mean_token_accuracy": 0.14132302552461623, "num_tokens": 5373583.0, "step": 2910 }, { "entropy": 6.1408384323120115, "epoch": 0.24490653224112582, "grad_norm": 0.875, "learning_rate": 0.0004997080348828833, "loss": 6.0104, "mean_token_accuracy": 0.14406906738877295, "num_tokens": 5383486.0, "step": 2915 }, { "entropy": 6.012083101272583, "epoch": 0.2453266120562907, "grad_norm": 1.0390625, "learning_rate": 0.0004997065078039981, "loss": 5.9283, "mean_token_accuracy": 0.13883504942059516, "num_tokens": 5391974.0, "step": 2920 }, { "entropy": 6.098450088500977, "epoch": 0.24574669187145556, "grad_norm": 1.03125, "learning_rate": 0.0004997049767445529, "loss": 5.9688, "mean_token_accuracy": 0.13587900176644324, "num_tokens": 5400882.0, "step": 2925 }, { "entropy": 6.1687455654144285, "epoch": 0.24616677168662046, "grad_norm": 0.96484375, "learning_rate": 0.0004997034417045746, "loss": 5.9199, "mean_token_accuracy": 0.13755179792642594, "num_tokens": 5410538.0, "step": 2930 }, { "entropy": 6.019326400756836, "epoch": 0.24658685150178533, "grad_norm": 0.99609375, "learning_rate": 0.0004997019026840907, "loss": 5.8134, "mean_token_accuracy": 0.14420632421970367, "num_tokens": 5419406.0, "step": 2935 }, { "entropy": 5.9686970710754395, "epoch": 0.24700693131695023, "grad_norm": 0.98046875, "learning_rate": 0.0004997003596831282, "loss": 5.941, "mean_token_accuracy": 0.13971618413925171, "num_tokens": 5428817.0, "step": 2940 }, { "entropy": 6.097631120681763, "epoch": 0.2474270111321151, "grad_norm": 0.98828125, "learning_rate": 0.0004996988127017145, "loss": 5.9448, "mean_token_accuracy": 0.13872243240475654, "num_tokens": 5438277.0, "step": 2945 }, { "entropy": 6.047083616256714, "epoch": 0.24784709094728, "grad_norm": 1.0234375, "learning_rate": 0.0004996972617398772, "loss": 5.974, "mean_token_accuracy": 0.13909853398799896, "num_tokens": 5447440.0, "step": 2950 }, { "entropy": 6.065885257720947, "epoch": 0.24826717076244487, "grad_norm": 0.98828125, "learning_rate": 0.0004996957067976435, "loss": 5.9005, "mean_token_accuracy": 0.13819090723991395, "num_tokens": 5455988.0, "step": 2955 }, { "entropy": 6.079396390914917, "epoch": 0.24868725057760974, "grad_norm": 0.96875, "learning_rate": 0.0004996941478750411, "loss": 5.895, "mean_token_accuracy": 0.14170320481061935, "num_tokens": 5464996.0, "step": 2960 }, { "entropy": 6.131442737579346, "epoch": 0.24910733039277463, "grad_norm": 0.9140625, "learning_rate": 0.0004996925849720975, "loss": 6.0433, "mean_token_accuracy": 0.13297844752669336, "num_tokens": 5474174.0, "step": 2965 }, { "entropy": 6.144496154785156, "epoch": 0.2495274102079395, "grad_norm": 1.0390625, "learning_rate": 0.0004996910180888405, "loss": 5.928, "mean_token_accuracy": 0.14379495605826378, "num_tokens": 5482838.0, "step": 2970 }, { "entropy": 6.089239263534546, "epoch": 0.2499474900231044, "grad_norm": 0.9609375, "learning_rate": 0.0004996894472252977, "loss": 5.9339, "mean_token_accuracy": 0.1420593172311783, "num_tokens": 5491616.0, "step": 2975 }, { "entropy": 5.992457008361816, "epoch": 0.25036756983826924, "grad_norm": 0.94921875, "learning_rate": 0.0004996878723814973, "loss": 5.9265, "mean_token_accuracy": 0.13892921283841134, "num_tokens": 5500942.0, "step": 2980 }, { "entropy": 6.117427587509155, "epoch": 0.25078764965343414, "grad_norm": 0.94921875, "learning_rate": 0.0004996862935574667, "loss": 5.8788, "mean_token_accuracy": 0.13912170454859735, "num_tokens": 5510078.0, "step": 2985 }, { "entropy": 5.943054437637329, "epoch": 0.25120772946859904, "grad_norm": 0.94140625, "learning_rate": 0.0004996847107532342, "loss": 5.9134, "mean_token_accuracy": 0.14340257570147513, "num_tokens": 5518924.0, "step": 2990 }, { "entropy": 6.108536148071289, "epoch": 0.25162780928376394, "grad_norm": 0.93359375, "learning_rate": 0.0004996831239688277, "loss": 5.9216, "mean_token_accuracy": 0.13749035373330115, "num_tokens": 5527385.0, "step": 2995 }, { "entropy": 5.977105903625488, "epoch": 0.2520478890989288, "grad_norm": 0.95703125, "learning_rate": 0.0004996815332042754, "loss": 5.766, "mean_token_accuracy": 0.15047305673360825, "num_tokens": 5536781.0, "step": 3000 }, { "epoch": 0.2520478890989288, "eval_entropy": 5.7445289912557636, "eval_loss": 5.931798458099365, "eval_mean_token_accuracy": 0.1480788363722414, "eval_num_tokens": 5536781.0, "eval_runtime": 21.0325, "eval_samples_per_second": 1776.586, "eval_steps_per_second": 222.085, "step": 3000 } ], "logging_steps": 5, "max_steps": 119020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1203541354414080.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }