{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5040957781978576, "eval_steps": 3000, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 7.253151750564575, "epoch": 0.0004200798151648813, "grad_norm": 1.2578125, "learning_rate": 2e-06, "loss": 6.8984, "mean_token_accuracy": 0.09029425084590911, "num_tokens": 8348.0, "step": 5 }, { "entropy": 7.219348526000976, "epoch": 0.0008401596303297626, "grad_norm": 1.3359375, "learning_rate": 4.5e-06, "loss": 6.9563, "mean_token_accuracy": 0.0939315177500248, "num_tokens": 17465.0, "step": 10 }, { "entropy": 7.203219985961914, "epoch": 0.001260239445494644, "grad_norm": 1.1171875, "learning_rate": 7e-06, "loss": 6.9004, "mean_token_accuracy": 0.08940818756818772, "num_tokens": 26627.0, "step": 15 }, { "entropy": 7.224360418319702, "epoch": 0.0016803192606595252, "grad_norm": 1.1640625, "learning_rate": 9.5e-06, "loss": 6.9259, "mean_token_accuracy": 0.09512931853532791, "num_tokens": 36069.0, "step": 20 }, { "entropy": 7.186052703857422, "epoch": 0.002100399075824407, "grad_norm": 1.2734375, "learning_rate": 1.2e-05, "loss": 6.8641, "mean_token_accuracy": 0.09390396177768708, "num_tokens": 44967.0, "step": 25 }, { "entropy": 7.151274633407593, "epoch": 0.002520478890989288, "grad_norm": 1.1875, "learning_rate": 1.4500000000000002e-05, "loss": 6.964, "mean_token_accuracy": 0.08688623458147049, "num_tokens": 55132.0, "step": 30 }, { "entropy": 7.053877639770508, "epoch": 0.0029405587061541692, "grad_norm": 1.0546875, "learning_rate": 1.7000000000000003e-05, "loss": 6.7637, "mean_token_accuracy": 0.09662552699446678, "num_tokens": 65141.0, "step": 35 }, { "entropy": 7.099168729782105, "epoch": 0.0033606385213190504, "grad_norm": 1.0625, "learning_rate": 1.95e-05, "loss": 6.8446, "mean_token_accuracy": 0.09567792639136315, "num_tokens": 74007.0, "step": 40 }, { "entropy": 7.140014743804931, "epoch": 0.003780718336483932, "grad_norm": 1.078125, "learning_rate": 2.2e-05, "loss": 6.8569, "mean_token_accuracy": 0.09555562734603881, "num_tokens": 83736.0, "step": 45 }, { "entropy": 7.1646524429321286, "epoch": 0.004200798151648814, "grad_norm": 1.0390625, "learning_rate": 2.4500000000000003e-05, "loss": 6.8505, "mean_token_accuracy": 0.09592381715774537, "num_tokens": 92525.0, "step": 50 }, { "entropy": 7.107996654510498, "epoch": 0.004620877966813695, "grad_norm": 1.0390625, "learning_rate": 2.7e-05, "loss": 6.7755, "mean_token_accuracy": 0.09664912968873977, "num_tokens": 102015.0, "step": 55 }, { "entropy": 7.137420606613159, "epoch": 0.005040957781978576, "grad_norm": 1.0859375, "learning_rate": 2.95e-05, "loss": 6.9768, "mean_token_accuracy": 0.0867392435669899, "num_tokens": 110887.0, "step": 60 }, { "entropy": 7.134914398193359, "epoch": 0.005461037597143457, "grad_norm": 1.1171875, "learning_rate": 3.2e-05, "loss": 6.9024, "mean_token_accuracy": 0.08935272470116615, "num_tokens": 120442.0, "step": 65 }, { "entropy": 7.1443713188171385, "epoch": 0.0058811174123083385, "grad_norm": 1.1953125, "learning_rate": 3.4500000000000005e-05, "loss": 6.8989, "mean_token_accuracy": 0.09712273105978966, "num_tokens": 129297.0, "step": 70 }, { "entropy": 7.160055351257324, "epoch": 0.00630119722747322, "grad_norm": 1.0390625, "learning_rate": 3.7e-05, "loss": 6.8507, "mean_token_accuracy": 0.09877990111708641, "num_tokens": 138305.0, "step": 75 }, { "entropy": 7.141705083847046, "epoch": 0.006721277042638101, "grad_norm": 1.0859375, "learning_rate": 3.95e-05, "loss": 6.8372, "mean_token_accuracy": 0.09292416870594025, "num_tokens": 147640.0, "step": 80 }, { "entropy": 7.002462768554688, "epoch": 0.007141356857802983, "grad_norm": 1.078125, "learning_rate": 4.2000000000000004e-05, "loss": 6.75, "mean_token_accuracy": 0.09611514061689377, "num_tokens": 157633.0, "step": 85 }, { "entropy": 7.100855731964112, "epoch": 0.007561436672967864, "grad_norm": 0.96875, "learning_rate": 4.45e-05, "loss": 6.7673, "mean_token_accuracy": 0.08827547580003739, "num_tokens": 167984.0, "step": 90 }, { "entropy": 7.129158973693848, "epoch": 0.007981516488132745, "grad_norm": 1.0234375, "learning_rate": 4.7000000000000004e-05, "loss": 6.8948, "mean_token_accuracy": 0.099730733782053, "num_tokens": 176984.0, "step": 95 }, { "entropy": 7.069277429580689, "epoch": 0.008401596303297627, "grad_norm": 1.1796875, "learning_rate": 4.9500000000000004e-05, "loss": 6.7542, "mean_token_accuracy": 0.0975344181060791, "num_tokens": 185931.0, "step": 100 }, { "entropy": 7.183401107788086, "epoch": 0.008821676118462508, "grad_norm": 1.125, "learning_rate": 5.2e-05, "loss": 6.8312, "mean_token_accuracy": 0.09061248749494552, "num_tokens": 195065.0, "step": 105 }, { "entropy": 7.100264024734497, "epoch": 0.00924175593362739, "grad_norm": 1.1640625, "learning_rate": 5.45e-05, "loss": 6.8219, "mean_token_accuracy": 0.09686729088425636, "num_tokens": 203687.0, "step": 110 }, { "entropy": 7.042130661010742, "epoch": 0.00966183574879227, "grad_norm": 1.109375, "learning_rate": 5.7e-05, "loss": 6.8491, "mean_token_accuracy": 0.09541062936186791, "num_tokens": 212847.0, "step": 115 }, { "entropy": 7.127160167694091, "epoch": 0.010081915563957152, "grad_norm": 1.203125, "learning_rate": 5.9499999999999996e-05, "loss": 6.8242, "mean_token_accuracy": 0.0955698125064373, "num_tokens": 222593.0, "step": 120 }, { "entropy": 7.0331028461456295, "epoch": 0.010501995379122032, "grad_norm": 1.1875, "learning_rate": 6.2e-05, "loss": 6.711, "mean_token_accuracy": 0.0981454961001873, "num_tokens": 231174.0, "step": 125 }, { "entropy": 6.9479889392852785, "epoch": 0.010922075194286915, "grad_norm": 1.140625, "learning_rate": 6.450000000000001e-05, "loss": 6.663, "mean_token_accuracy": 0.10388810336589813, "num_tokens": 239833.0, "step": 130 }, { "entropy": 7.190658855438232, "epoch": 0.011342155009451797, "grad_norm": 1.125, "learning_rate": 6.7e-05, "loss": 6.951, "mean_token_accuracy": 0.0949922852218151, "num_tokens": 248794.0, "step": 135 }, { "entropy": 7.251979541778565, "epoch": 0.011762234824616677, "grad_norm": 1.2734375, "learning_rate": 6.950000000000001e-05, "loss": 6.9449, "mean_token_accuracy": 0.09376412332057953, "num_tokens": 257123.0, "step": 140 }, { "entropy": 7.008625459671021, "epoch": 0.012182314639781559, "grad_norm": 1.1171875, "learning_rate": 7.2e-05, "loss": 6.6678, "mean_token_accuracy": 0.10136394873261452, "num_tokens": 266088.0, "step": 145 }, { "entropy": 7.118732166290283, "epoch": 0.01260239445494644, "grad_norm": 1.0625, "learning_rate": 7.45e-05, "loss": 6.9922, "mean_token_accuracy": 0.08856561928987502, "num_tokens": 276074.0, "step": 150 }, { "entropy": 7.265724229812622, "epoch": 0.013022474270111321, "grad_norm": 1.078125, "learning_rate": 7.7e-05, "loss": 6.8415, "mean_token_accuracy": 0.09575222656130791, "num_tokens": 285280.0, "step": 155 }, { "entropy": 7.09201340675354, "epoch": 0.013442554085276202, "grad_norm": 1.09375, "learning_rate": 7.950000000000001e-05, "loss": 6.9508, "mean_token_accuracy": 0.08855971023440361, "num_tokens": 296115.0, "step": 160 }, { "entropy": 7.172540521621704, "epoch": 0.013862633900441084, "grad_norm": 1.03125, "learning_rate": 8.2e-05, "loss": 6.7833, "mean_token_accuracy": 0.09475113973021507, "num_tokens": 305483.0, "step": 165 }, { "entropy": 7.142146444320678, "epoch": 0.014282713715605966, "grad_norm": 1.0859375, "learning_rate": 8.450000000000001e-05, "loss": 6.798, "mean_token_accuracy": 0.09611742347478866, "num_tokens": 314000.0, "step": 170 }, { "entropy": 7.191121196746826, "epoch": 0.014702793530770846, "grad_norm": 1.1171875, "learning_rate": 8.7e-05, "loss": 6.8719, "mean_token_accuracy": 0.09523176699876786, "num_tokens": 323667.0, "step": 175 }, { "entropy": 7.111018800735474, "epoch": 0.015122873345935728, "grad_norm": 1.1640625, "learning_rate": 8.95e-05, "loss": 6.9126, "mean_token_accuracy": 0.09633364677429199, "num_tokens": 332695.0, "step": 180 }, { "entropy": 7.152137660980225, "epoch": 0.015542953161100609, "grad_norm": 1.265625, "learning_rate": 9.2e-05, "loss": 6.7081, "mean_token_accuracy": 0.09874472171068191, "num_tokens": 342428.0, "step": 185 }, { "entropy": 7.018444013595581, "epoch": 0.01596303297626549, "grad_norm": 1.0390625, "learning_rate": 9.45e-05, "loss": 6.9196, "mean_token_accuracy": 0.09128761291503906, "num_tokens": 353587.0, "step": 190 }, { "entropy": 7.154490900039673, "epoch": 0.01638311279143037, "grad_norm": 1.234375, "learning_rate": 9.7e-05, "loss": 6.8259, "mean_token_accuracy": 0.09795344024896621, "num_tokens": 362997.0, "step": 195 }, { "entropy": 7.170652723312378, "epoch": 0.016803192606595255, "grad_norm": 1.0703125, "learning_rate": 9.95e-05, "loss": 6.8563, "mean_token_accuracy": 0.09546189531683921, "num_tokens": 372346.0, "step": 200 }, { "entropy": 7.110174036026001, "epoch": 0.017223272421760135, "grad_norm": 1.328125, "learning_rate": 0.000102, "loss": 6.741, "mean_token_accuracy": 0.09936807751655578, "num_tokens": 381575.0, "step": 205 }, { "entropy": 7.106418037414551, "epoch": 0.017643352236925015, "grad_norm": 1.1484375, "learning_rate": 0.00010449999999999999, "loss": 6.8292, "mean_token_accuracy": 0.09433561563491821, "num_tokens": 390706.0, "step": 210 }, { "entropy": 7.192156839370727, "epoch": 0.018063432052089896, "grad_norm": 1.140625, "learning_rate": 0.000107, "loss": 6.8516, "mean_token_accuracy": 0.0938378892838955, "num_tokens": 400000.0, "step": 215 }, { "entropy": 6.9766045093536375, "epoch": 0.01848351186725478, "grad_norm": 1.140625, "learning_rate": 0.0001095, "loss": 6.8445, "mean_token_accuracy": 0.10025399252772331, "num_tokens": 409447.0, "step": 220 }, { "entropy": 7.228094434738159, "epoch": 0.01890359168241966, "grad_norm": 1.34375, "learning_rate": 0.000112, "loss": 6.7628, "mean_token_accuracy": 0.09841207265853882, "num_tokens": 418417.0, "step": 225 }, { "entropy": 6.980171537399292, "epoch": 0.01932367149758454, "grad_norm": 1.2890625, "learning_rate": 0.0001145, "loss": 6.7865, "mean_token_accuracy": 0.10002906545996666, "num_tokens": 427619.0, "step": 230 }, { "entropy": 7.174957370758056, "epoch": 0.019743751312749424, "grad_norm": 1.09375, "learning_rate": 0.00011700000000000001, "loss": 6.8595, "mean_token_accuracy": 0.09506258964538575, "num_tokens": 437931.0, "step": 235 }, { "entropy": 7.124919366836548, "epoch": 0.020163831127914304, "grad_norm": 1.203125, "learning_rate": 0.00011949999999999999, "loss": 6.8624, "mean_token_accuracy": 0.10153809040784836, "num_tokens": 447595.0, "step": 240 }, { "entropy": 7.076248693466186, "epoch": 0.020583910943079185, "grad_norm": 1.2734375, "learning_rate": 0.000122, "loss": 6.7534, "mean_token_accuracy": 0.09595257192850112, "num_tokens": 457062.0, "step": 245 }, { "entropy": 7.11194372177124, "epoch": 0.021003990758244065, "grad_norm": 1.3359375, "learning_rate": 0.0001245, "loss": 6.8486, "mean_token_accuracy": 0.09663526639342308, "num_tokens": 466191.0, "step": 250 }, { "entropy": 7.119431734085083, "epoch": 0.02142407057340895, "grad_norm": 1.40625, "learning_rate": 0.000127, "loss": 6.8411, "mean_token_accuracy": 0.09689914286136628, "num_tokens": 475693.0, "step": 255 }, { "entropy": 7.101942634582519, "epoch": 0.02184415038857383, "grad_norm": 1.2578125, "learning_rate": 0.0001295, "loss": 6.8733, "mean_token_accuracy": 0.0926995851099491, "num_tokens": 485173.0, "step": 260 }, { "entropy": 7.02587628364563, "epoch": 0.02226423020373871, "grad_norm": 1.1796875, "learning_rate": 0.000132, "loss": 6.7261, "mean_token_accuracy": 0.1030467577278614, "num_tokens": 493985.0, "step": 265 }, { "entropy": 7.204363059997559, "epoch": 0.022684310018903593, "grad_norm": 1.3515625, "learning_rate": 0.00013450000000000002, "loss": 6.8283, "mean_token_accuracy": 0.09744107499718666, "num_tokens": 502837.0, "step": 270 }, { "entropy": 7.018724775314331, "epoch": 0.023104389834068473, "grad_norm": 1.328125, "learning_rate": 0.00013700000000000002, "loss": 6.7299, "mean_token_accuracy": 0.10230938643217087, "num_tokens": 511503.0, "step": 275 }, { "entropy": 7.203450679779053, "epoch": 0.023524469649233354, "grad_norm": 1.1640625, "learning_rate": 0.0001395, "loss": 6.9591, "mean_token_accuracy": 0.09394779950380325, "num_tokens": 521499.0, "step": 280 }, { "entropy": 7.085290002822876, "epoch": 0.023944549464398234, "grad_norm": 1.359375, "learning_rate": 0.00014199999999999998, "loss": 6.7304, "mean_token_accuracy": 0.09568566456437111, "num_tokens": 530067.0, "step": 285 }, { "entropy": 7.13321099281311, "epoch": 0.024364629279563118, "grad_norm": 1.1640625, "learning_rate": 0.0001445, "loss": 6.7422, "mean_token_accuracy": 0.10064690634608268, "num_tokens": 538559.0, "step": 290 }, { "entropy": 7.135575151443481, "epoch": 0.024784709094728, "grad_norm": 1.25, "learning_rate": 0.000147, "loss": 6.9403, "mean_token_accuracy": 0.09277286529541015, "num_tokens": 547288.0, "step": 295 }, { "entropy": 7.132419538497925, "epoch": 0.02520478890989288, "grad_norm": 1.125, "learning_rate": 0.0001495, "loss": 6.8165, "mean_token_accuracy": 0.09548124819993972, "num_tokens": 557269.0, "step": 300 }, { "entropy": 7.107891798019409, "epoch": 0.025624868725057762, "grad_norm": 1.1796875, "learning_rate": 0.000152, "loss": 6.8392, "mean_token_accuracy": 0.09125733524560928, "num_tokens": 567280.0, "step": 305 }, { "entropy": 7.009502935409546, "epoch": 0.026044948540222643, "grad_norm": 1.171875, "learning_rate": 0.00015450000000000001, "loss": 6.6443, "mean_token_accuracy": 0.0992837019264698, "num_tokens": 576609.0, "step": 310 }, { "entropy": 6.9137735843658445, "epoch": 0.026465028355387523, "grad_norm": 1.234375, "learning_rate": 0.000157, "loss": 6.627, "mean_token_accuracy": 0.1031131848692894, "num_tokens": 586053.0, "step": 315 }, { "entropy": 7.00633134841919, "epoch": 0.026885108170552403, "grad_norm": 1.3203125, "learning_rate": 0.0001595, "loss": 6.8295, "mean_token_accuracy": 0.09749070778489113, "num_tokens": 594649.0, "step": 320 }, { "entropy": 7.084966850280762, "epoch": 0.027305187985717287, "grad_norm": 1.1484375, "learning_rate": 0.000162, "loss": 6.7142, "mean_token_accuracy": 0.09451463893055916, "num_tokens": 603445.0, "step": 325 }, { "entropy": 7.088579750061035, "epoch": 0.027725267800882167, "grad_norm": 1.265625, "learning_rate": 0.00016450000000000001, "loss": 6.8658, "mean_token_accuracy": 0.09315285831689835, "num_tokens": 613611.0, "step": 330 }, { "entropy": 7.3623809814453125, "epoch": 0.028145347616047048, "grad_norm": 1.28125, "learning_rate": 0.00016700000000000002, "loss": 7.0713, "mean_token_accuracy": 0.0919966921210289, "num_tokens": 623024.0, "step": 335 }, { "entropy": 6.999190092086792, "epoch": 0.02856542743121193, "grad_norm": 1.3671875, "learning_rate": 0.00016950000000000003, "loss": 6.7385, "mean_token_accuracy": 0.1017606370151043, "num_tokens": 631624.0, "step": 340 }, { "entropy": 6.994140338897705, "epoch": 0.028985507246376812, "grad_norm": 1.390625, "learning_rate": 0.00017199999999999998, "loss": 6.6633, "mean_token_accuracy": 0.10479742139577866, "num_tokens": 640473.0, "step": 345 }, { "entropy": 7.0762580871582035, "epoch": 0.029405587061541692, "grad_norm": 1.3671875, "learning_rate": 0.00017449999999999999, "loss": 6.9006, "mean_token_accuracy": 0.09429975003004074, "num_tokens": 649692.0, "step": 350 }, { "entropy": 7.142873811721802, "epoch": 0.029825666876706573, "grad_norm": 1.2734375, "learning_rate": 0.000177, "loss": 6.8377, "mean_token_accuracy": 0.09720863476395607, "num_tokens": 658236.0, "step": 355 }, { "entropy": 6.952843904495239, "epoch": 0.030245746691871456, "grad_norm": 1.171875, "learning_rate": 0.0001795, "loss": 6.6251, "mean_token_accuracy": 0.09688405320048332, "num_tokens": 667175.0, "step": 360 }, { "entropy": 7.14975872039795, "epoch": 0.030665826507036337, "grad_norm": 1.2578125, "learning_rate": 0.000182, "loss": 6.9438, "mean_token_accuracy": 0.09247877895832061, "num_tokens": 676456.0, "step": 365 }, { "entropy": 7.224644327163697, "epoch": 0.031085906322201217, "grad_norm": 1.2421875, "learning_rate": 0.0001845, "loss": 6.926, "mean_token_accuracy": 0.08845363929867744, "num_tokens": 686881.0, "step": 370 }, { "entropy": 6.932204675674439, "epoch": 0.0315059861373661, "grad_norm": 1.203125, "learning_rate": 0.000187, "loss": 6.6985, "mean_token_accuracy": 0.09394535645842553, "num_tokens": 696045.0, "step": 375 }, { "entropy": 6.965549278259277, "epoch": 0.03192606595253098, "grad_norm": 1.2578125, "learning_rate": 0.0001895, "loss": 6.6653, "mean_token_accuracy": 0.10160319805145264, "num_tokens": 704729.0, "step": 380 }, { "entropy": 6.985969495773316, "epoch": 0.032346145767695865, "grad_norm": 1.1171875, "learning_rate": 0.000192, "loss": 6.7426, "mean_token_accuracy": 0.0950203962624073, "num_tokens": 714331.0, "step": 385 }, { "entropy": 7.101364660263061, "epoch": 0.03276622558286074, "grad_norm": 1.3515625, "learning_rate": 0.0001945, "loss": 6.6912, "mean_token_accuracy": 0.10189466029405594, "num_tokens": 722788.0, "step": 390 }, { "entropy": 7.007110738754273, "epoch": 0.033186305398025626, "grad_norm": 1.359375, "learning_rate": 0.00019700000000000002, "loss": 6.7849, "mean_token_accuracy": 0.10113223120570183, "num_tokens": 731417.0, "step": 395 }, { "entropy": 7.0138044357299805, "epoch": 0.03360638521319051, "grad_norm": 1.1484375, "learning_rate": 0.00019950000000000002, "loss": 6.7534, "mean_token_accuracy": 0.09411026313900947, "num_tokens": 741034.0, "step": 400 }, { "entropy": 7.00489068031311, "epoch": 0.034026465028355386, "grad_norm": 1.1953125, "learning_rate": 0.000202, "loss": 6.7516, "mean_token_accuracy": 0.09877323731780052, "num_tokens": 749596.0, "step": 405 }, { "entropy": 7.017527103424072, "epoch": 0.03444654484352027, "grad_norm": 1.0546875, "learning_rate": 0.00020449999999999998, "loss": 6.7218, "mean_token_accuracy": 0.09416642934083938, "num_tokens": 758931.0, "step": 410 }, { "entropy": 6.871344518661499, "epoch": 0.03486662465868515, "grad_norm": 1.2109375, "learning_rate": 0.000207, "loss": 6.6216, "mean_token_accuracy": 0.10105381533503532, "num_tokens": 767534.0, "step": 415 }, { "entropy": 6.960817480087281, "epoch": 0.03528670447385003, "grad_norm": 1.1640625, "learning_rate": 0.0002095, "loss": 6.6761, "mean_token_accuracy": 0.10064006224274635, "num_tokens": 776456.0, "step": 420 }, { "entropy": 7.008622884750366, "epoch": 0.035706784289014915, "grad_norm": 1.21875, "learning_rate": 0.000212, "loss": 6.7646, "mean_token_accuracy": 0.09697613269090652, "num_tokens": 786172.0, "step": 425 }, { "entropy": 6.960406351089477, "epoch": 0.03612686410417979, "grad_norm": 1.234375, "learning_rate": 0.0002145, "loss": 6.6658, "mean_token_accuracy": 0.10456070601940155, "num_tokens": 795081.0, "step": 430 }, { "entropy": 6.916972398757935, "epoch": 0.036546943919344675, "grad_norm": 1.3359375, "learning_rate": 0.00021700000000000002, "loss": 6.6979, "mean_token_accuracy": 0.09141508191823959, "num_tokens": 804259.0, "step": 435 }, { "entropy": 6.980181550979614, "epoch": 0.03696702373450956, "grad_norm": 1.3515625, "learning_rate": 0.0002195, "loss": 6.7251, "mean_token_accuracy": 0.0985159382224083, "num_tokens": 813463.0, "step": 440 }, { "entropy": 6.952114248275757, "epoch": 0.037387103549674436, "grad_norm": 1.328125, "learning_rate": 0.000222, "loss": 6.6346, "mean_token_accuracy": 0.10426531285047531, "num_tokens": 823029.0, "step": 445 }, { "entropy": 6.91859712600708, "epoch": 0.03780718336483932, "grad_norm": 1.1875, "learning_rate": 0.0002245, "loss": 6.7327, "mean_token_accuracy": 0.0944428451359272, "num_tokens": 832902.0, "step": 450 }, { "entropy": 6.92227520942688, "epoch": 0.0382272631800042, "grad_norm": 1.1796875, "learning_rate": 0.00022700000000000002, "loss": 6.6724, "mean_token_accuracy": 0.10073406398296356, "num_tokens": 842162.0, "step": 455 }, { "entropy": 6.979531192779541, "epoch": 0.03864734299516908, "grad_norm": 1.265625, "learning_rate": 0.00022950000000000002, "loss": 6.7091, "mean_token_accuracy": 0.0995998091995716, "num_tokens": 852328.0, "step": 460 }, { "entropy": 6.8678590774536135, "epoch": 0.039067422810333964, "grad_norm": 1.359375, "learning_rate": 0.00023200000000000003, "loss": 6.6831, "mean_token_accuracy": 0.1023336872458458, "num_tokens": 860929.0, "step": 465 }, { "entropy": 7.0033402919769285, "epoch": 0.03948750262549885, "grad_norm": 1.3046875, "learning_rate": 0.00023449999999999998, "loss": 6.7492, "mean_token_accuracy": 0.09593449011445046, "num_tokens": 869144.0, "step": 470 }, { "entropy": 7.003532409667969, "epoch": 0.039907582440663725, "grad_norm": 1.2421875, "learning_rate": 0.000237, "loss": 6.705, "mean_token_accuracy": 0.10385636389255523, "num_tokens": 877447.0, "step": 475 }, { "entropy": 6.86921706199646, "epoch": 0.04032766225582861, "grad_norm": 1.1875, "learning_rate": 0.0002395, "loss": 6.6601, "mean_token_accuracy": 0.09642177075147629, "num_tokens": 887020.0, "step": 480 }, { "entropy": 6.996332120895386, "epoch": 0.040747742070993485, "grad_norm": 1.359375, "learning_rate": 0.000242, "loss": 6.7054, "mean_token_accuracy": 0.0975713811814785, "num_tokens": 895937.0, "step": 485 }, { "entropy": 6.852901887893677, "epoch": 0.04116782188615837, "grad_norm": 1.2734375, "learning_rate": 0.0002445, "loss": 6.7267, "mean_token_accuracy": 0.09650165066123009, "num_tokens": 905446.0, "step": 490 }, { "entropy": 6.893301010131836, "epoch": 0.04158790170132325, "grad_norm": 1.3046875, "learning_rate": 0.000247, "loss": 6.6036, "mean_token_accuracy": 0.10643761828541756, "num_tokens": 914547.0, "step": 495 }, { "entropy": 6.915640449523925, "epoch": 0.04200798151648813, "grad_norm": 1.2890625, "learning_rate": 0.0002495, "loss": 6.6263, "mean_token_accuracy": 0.10527556240558625, "num_tokens": 922900.0, "step": 500 }, { "entropy": 6.947235059738159, "epoch": 0.042428061331653014, "grad_norm": 1.2578125, "learning_rate": 0.000252, "loss": 6.6686, "mean_token_accuracy": 0.10355583727359771, "num_tokens": 930876.0, "step": 505 }, { "entropy": 6.88210015296936, "epoch": 0.0428481411468179, "grad_norm": 1.109375, "learning_rate": 0.0002545, "loss": 6.7087, "mean_token_accuracy": 0.10312066823244095, "num_tokens": 939871.0, "step": 510 }, { "entropy": 6.947447443008423, "epoch": 0.043268220961982774, "grad_norm": 1.3203125, "learning_rate": 0.000257, "loss": 6.695, "mean_token_accuracy": 0.10180827602744102, "num_tokens": 948673.0, "step": 515 }, { "entropy": 6.816449880599976, "epoch": 0.04368830077714766, "grad_norm": 1.296875, "learning_rate": 0.0002595, "loss": 6.6488, "mean_token_accuracy": 0.09842450320720672, "num_tokens": 957603.0, "step": 520 }, { "entropy": 6.928069686889648, "epoch": 0.04410838059231254, "grad_norm": 1.1171875, "learning_rate": 0.000262, "loss": 6.7274, "mean_token_accuracy": 0.09575201719999313, "num_tokens": 967731.0, "step": 525 }, { "entropy": 6.940513849258423, "epoch": 0.04452846040747742, "grad_norm": 1.328125, "learning_rate": 0.00026450000000000003, "loss": 6.7098, "mean_token_accuracy": 0.10156730636954307, "num_tokens": 977427.0, "step": 530 }, { "entropy": 6.883533573150634, "epoch": 0.0449485402226423, "grad_norm": 1.203125, "learning_rate": 0.00026700000000000004, "loss": 6.7126, "mean_token_accuracy": 0.09694371595978737, "num_tokens": 986758.0, "step": 535 }, { "entropy": 7.054216384887695, "epoch": 0.045368620037807186, "grad_norm": 1.359375, "learning_rate": 0.00026950000000000005, "loss": 6.7073, "mean_token_accuracy": 0.10619494765996933, "num_tokens": 996377.0, "step": 540 }, { "entropy": 6.823930788040161, "epoch": 0.04578869985297206, "grad_norm": 1.34375, "learning_rate": 0.00027200000000000005, "loss": 6.7762, "mean_token_accuracy": 0.09864854142069816, "num_tokens": 1006483.0, "step": 545 }, { "entropy": 6.839679384231568, "epoch": 0.04620877966813695, "grad_norm": 1.2265625, "learning_rate": 0.0002745, "loss": 6.6608, "mean_token_accuracy": 0.09898171871900559, "num_tokens": 1016132.0, "step": 550 }, { "entropy": 6.886328125, "epoch": 0.04662885948330183, "grad_norm": 1.265625, "learning_rate": 0.000277, "loss": 6.5878, "mean_token_accuracy": 0.10534627884626388, "num_tokens": 1024970.0, "step": 555 }, { "entropy": 6.892021656036377, "epoch": 0.04704893929846671, "grad_norm": 1.1328125, "learning_rate": 0.0002795, "loss": 6.7043, "mean_token_accuracy": 0.0967423141002655, "num_tokens": 1034335.0, "step": 560 }, { "entropy": 6.914703607559204, "epoch": 0.04746901911363159, "grad_norm": 1.0546875, "learning_rate": 0.00028199999999999997, "loss": 6.756, "mean_token_accuracy": 0.10775318518280982, "num_tokens": 1043954.0, "step": 565 }, { "entropy": 6.942829847335815, "epoch": 0.04788909892879647, "grad_norm": 1.09375, "learning_rate": 0.0002845, "loss": 6.6882, "mean_token_accuracy": 0.10058957412838936, "num_tokens": 1053554.0, "step": 570 }, { "entropy": 6.854119396209716, "epoch": 0.04830917874396135, "grad_norm": 1.2265625, "learning_rate": 0.000287, "loss": 6.6366, "mean_token_accuracy": 0.10385002046823502, "num_tokens": 1062008.0, "step": 575 }, { "entropy": 6.868479824066162, "epoch": 0.048729258559126236, "grad_norm": 1.2578125, "learning_rate": 0.0002895, "loss": 6.7048, "mean_token_accuracy": 0.10346106439828873, "num_tokens": 1070740.0, "step": 580 }, { "entropy": 6.8440343856811525, "epoch": 0.04914933837429111, "grad_norm": 1.4140625, "learning_rate": 0.000292, "loss": 6.7057, "mean_token_accuracy": 0.10240900367498398, "num_tokens": 1079681.0, "step": 585 }, { "entropy": 6.858892154693604, "epoch": 0.049569418189456, "grad_norm": 1.1796875, "learning_rate": 0.0002945, "loss": 6.5847, "mean_token_accuracy": 0.10450911447405815, "num_tokens": 1088979.0, "step": 590 }, { "entropy": 6.772767686843872, "epoch": 0.04998949800462088, "grad_norm": 1.3671875, "learning_rate": 0.000297, "loss": 6.5832, "mean_token_accuracy": 0.10501813441514969, "num_tokens": 1097870.0, "step": 595 }, { "entropy": 6.856569433212281, "epoch": 0.05040957781978576, "grad_norm": 1.171875, "learning_rate": 0.0002995, "loss": 6.714, "mean_token_accuracy": 0.09948427230119705, "num_tokens": 1107948.0, "step": 600 }, { "entropy": 6.876928329467773, "epoch": 0.05082965763495064, "grad_norm": 1.21875, "learning_rate": 0.000302, "loss": 6.6226, "mean_token_accuracy": 0.1076712541282177, "num_tokens": 1117032.0, "step": 605 }, { "entropy": 6.769250106811524, "epoch": 0.051249737450115525, "grad_norm": 1.2734375, "learning_rate": 0.0003045, "loss": 6.5928, "mean_token_accuracy": 0.10671919211745262, "num_tokens": 1127834.0, "step": 610 }, { "entropy": 6.948690032958984, "epoch": 0.0516698172652804, "grad_norm": 1.40625, "learning_rate": 0.000307, "loss": 6.695, "mean_token_accuracy": 0.11499854996800422, "num_tokens": 1137382.0, "step": 615 }, { "entropy": 6.77532000541687, "epoch": 0.052089897080445285, "grad_norm": 1.2578125, "learning_rate": 0.0003095, "loss": 6.5422, "mean_token_accuracy": 0.11223937124013901, "num_tokens": 1146095.0, "step": 620 }, { "entropy": 6.723394155502319, "epoch": 0.05250997689561017, "grad_norm": 1.1640625, "learning_rate": 0.000312, "loss": 6.5822, "mean_token_accuracy": 0.10564726367592811, "num_tokens": 1154981.0, "step": 625 }, { "entropy": 6.776411151885986, "epoch": 0.052930056710775046, "grad_norm": 1.6328125, "learning_rate": 0.0003145, "loss": 6.6153, "mean_token_accuracy": 0.10753953084349632, "num_tokens": 1164939.0, "step": 630 }, { "entropy": 6.936794233322144, "epoch": 0.05335013652593993, "grad_norm": 1.203125, "learning_rate": 0.000317, "loss": 6.7291, "mean_token_accuracy": 0.09790047407150268, "num_tokens": 1174991.0, "step": 635 }, { "entropy": 6.818718576431275, "epoch": 0.05377021634110481, "grad_norm": 1.109375, "learning_rate": 0.0003195, "loss": 6.78, "mean_token_accuracy": 0.09581352695822716, "num_tokens": 1184885.0, "step": 640 }, { "entropy": 6.859689378738404, "epoch": 0.05419029615626969, "grad_norm": 1.3046875, "learning_rate": 0.000322, "loss": 6.6652, "mean_token_accuracy": 0.10177846625447273, "num_tokens": 1193637.0, "step": 645 }, { "entropy": 6.70958137512207, "epoch": 0.054610375971434574, "grad_norm": 1.2578125, "learning_rate": 0.00032450000000000003, "loss": 6.4505, "mean_token_accuracy": 0.11398516818881035, "num_tokens": 1202188.0, "step": 650 }, { "entropy": 6.731061363220215, "epoch": 0.05503045578659945, "grad_norm": 1.28125, "learning_rate": 0.00032700000000000003, "loss": 6.5923, "mean_token_accuracy": 0.10111142173409463, "num_tokens": 1210768.0, "step": 655 }, { "entropy": 6.75755124092102, "epoch": 0.055450535601764335, "grad_norm": 1.2578125, "learning_rate": 0.00032950000000000004, "loss": 6.5885, "mean_token_accuracy": 0.10299575850367546, "num_tokens": 1219819.0, "step": 660 }, { "entropy": 6.8775472164154055, "epoch": 0.05587061541692922, "grad_norm": 1.0234375, "learning_rate": 0.00033200000000000005, "loss": 6.6507, "mean_token_accuracy": 0.09766614213585853, "num_tokens": 1229703.0, "step": 665 }, { "entropy": 6.829215049743652, "epoch": 0.056290695232094096, "grad_norm": 1.328125, "learning_rate": 0.00033450000000000005, "loss": 6.6863, "mean_token_accuracy": 0.09930930510163308, "num_tokens": 1238942.0, "step": 670 }, { "entropy": 6.886805677413941, "epoch": 0.05671077504725898, "grad_norm": 1.1875, "learning_rate": 0.000337, "loss": 6.7475, "mean_token_accuracy": 0.09512239620089531, "num_tokens": 1248943.0, "step": 675 }, { "entropy": 6.774325275421143, "epoch": 0.05713085486242386, "grad_norm": 1.125, "learning_rate": 0.0003395, "loss": 6.6164, "mean_token_accuracy": 0.10321223735809326, "num_tokens": 1257761.0, "step": 680 }, { "entropy": 6.6621216297149655, "epoch": 0.05755093467758874, "grad_norm": 1.3203125, "learning_rate": 0.000342, "loss": 6.5622, "mean_token_accuracy": 0.10228212624788284, "num_tokens": 1267216.0, "step": 685 }, { "entropy": 6.826507520675659, "epoch": 0.057971014492753624, "grad_norm": 1.2109375, "learning_rate": 0.00034449999999999997, "loss": 6.6587, "mean_token_accuracy": 0.1079720102250576, "num_tokens": 1277210.0, "step": 690 }, { "entropy": 6.737741279602051, "epoch": 0.05839109430791851, "grad_norm": 1.21875, "learning_rate": 0.000347, "loss": 6.5588, "mean_token_accuracy": 0.10001136437058449, "num_tokens": 1285310.0, "step": 695 }, { "entropy": 6.800521755218506, "epoch": 0.058811174123083385, "grad_norm": 1.265625, "learning_rate": 0.0003495, "loss": 6.5887, "mean_token_accuracy": 0.10580191239714623, "num_tokens": 1294421.0, "step": 700 }, { "entropy": 6.603023052215576, "epoch": 0.05923125393824827, "grad_norm": 1.1953125, "learning_rate": 0.000352, "loss": 6.4007, "mean_token_accuracy": 0.11451570391654968, "num_tokens": 1303281.0, "step": 705 }, { "entropy": 6.694077110290527, "epoch": 0.059651333753413145, "grad_norm": 1.2265625, "learning_rate": 0.0003545, "loss": 6.5884, "mean_token_accuracy": 0.1103569135069847, "num_tokens": 1312280.0, "step": 710 }, { "entropy": 6.703026485443115, "epoch": 0.06007141356857803, "grad_norm": 1.1484375, "learning_rate": 0.000357, "loss": 6.5455, "mean_token_accuracy": 0.10655389800667762, "num_tokens": 1321243.0, "step": 715 }, { "entropy": 6.783720779418945, "epoch": 0.06049149338374291, "grad_norm": 1.1640625, "learning_rate": 0.0003595, "loss": 6.6752, "mean_token_accuracy": 0.10890973284840584, "num_tokens": 1330324.0, "step": 720 }, { "entropy": 6.716011047363281, "epoch": 0.06091157319890779, "grad_norm": 1.25, "learning_rate": 0.000362, "loss": 6.4895, "mean_token_accuracy": 0.11130202338099479, "num_tokens": 1339485.0, "step": 725 }, { "entropy": 6.763300609588623, "epoch": 0.06133165301407267, "grad_norm": 1.2890625, "learning_rate": 0.0003645, "loss": 6.6603, "mean_token_accuracy": 0.09835303947329521, "num_tokens": 1348640.0, "step": 730 }, { "entropy": 6.686880588531494, "epoch": 0.06175173282923756, "grad_norm": 1.25, "learning_rate": 0.000367, "loss": 6.5352, "mean_token_accuracy": 0.10910931676626205, "num_tokens": 1357581.0, "step": 735 }, { "entropy": 6.789766788482666, "epoch": 0.062171812644402434, "grad_norm": 1.2265625, "learning_rate": 0.0003695, "loss": 6.6092, "mean_token_accuracy": 0.10686837136745453, "num_tokens": 1367883.0, "step": 740 }, { "entropy": 6.689715576171875, "epoch": 0.06259189245956731, "grad_norm": 1.1953125, "learning_rate": 0.000372, "loss": 6.5732, "mean_token_accuracy": 0.10044000372290611, "num_tokens": 1376936.0, "step": 745 }, { "entropy": 6.619902896881103, "epoch": 0.0630119722747322, "grad_norm": 1.2421875, "learning_rate": 0.0003745, "loss": 6.5013, "mean_token_accuracy": 0.10585071742534638, "num_tokens": 1386359.0, "step": 750 }, { "entropy": 6.691353893280029, "epoch": 0.06343205208989708, "grad_norm": 1.1640625, "learning_rate": 0.000377, "loss": 6.5614, "mean_token_accuracy": 0.10925468727946282, "num_tokens": 1395223.0, "step": 755 }, { "entropy": 6.756776332855225, "epoch": 0.06385213190506196, "grad_norm": 1.15625, "learning_rate": 0.0003795, "loss": 6.7048, "mean_token_accuracy": 0.10100763738155365, "num_tokens": 1404917.0, "step": 760 }, { "entropy": 6.7892101287841795, "epoch": 0.06427221172022685, "grad_norm": 1.2421875, "learning_rate": 0.000382, "loss": 6.6077, "mean_token_accuracy": 0.11203819289803504, "num_tokens": 1413348.0, "step": 765 }, { "entropy": 6.617217540740967, "epoch": 0.06469229153539173, "grad_norm": 1.265625, "learning_rate": 0.0003845, "loss": 6.5804, "mean_token_accuracy": 0.10595368966460228, "num_tokens": 1421726.0, "step": 770 }, { "entropy": 6.699965381622315, "epoch": 0.0651123713505566, "grad_norm": 1.140625, "learning_rate": 0.00038700000000000003, "loss": 6.5984, "mean_token_accuracy": 0.10766990706324578, "num_tokens": 1430686.0, "step": 775 }, { "entropy": 6.773920488357544, "epoch": 0.06553245116572148, "grad_norm": 1.171875, "learning_rate": 0.00038950000000000003, "loss": 6.5765, "mean_token_accuracy": 0.10770290642976761, "num_tokens": 1439499.0, "step": 780 }, { "entropy": 6.685867691040039, "epoch": 0.06595253098088637, "grad_norm": 1.328125, "learning_rate": 0.00039200000000000004, "loss": 6.5731, "mean_token_accuracy": 0.10584950372576714, "num_tokens": 1448220.0, "step": 785 }, { "entropy": 6.635032224655151, "epoch": 0.06637261079605125, "grad_norm": 1.0, "learning_rate": 0.00039450000000000005, "loss": 6.5914, "mean_token_accuracy": 0.09675629287958146, "num_tokens": 1458217.0, "step": 790 }, { "entropy": 6.699159860610962, "epoch": 0.06679269061121614, "grad_norm": 1.125, "learning_rate": 0.00039700000000000005, "loss": 6.4848, "mean_token_accuracy": 0.10567129477858543, "num_tokens": 1467422.0, "step": 795 }, { "entropy": 6.620410299301147, "epoch": 0.06721277042638102, "grad_norm": 1.203125, "learning_rate": 0.0003995, "loss": 6.4767, "mean_token_accuracy": 0.11094664260745049, "num_tokens": 1476152.0, "step": 800 }, { "entropy": 6.650699758529663, "epoch": 0.06763285024154589, "grad_norm": 1.265625, "learning_rate": 0.000402, "loss": 6.5738, "mean_token_accuracy": 0.1043787069618702, "num_tokens": 1485248.0, "step": 805 }, { "entropy": 6.619400262832642, "epoch": 0.06805293005671077, "grad_norm": 1.2265625, "learning_rate": 0.0004045, "loss": 6.5511, "mean_token_accuracy": 0.10442669913172722, "num_tokens": 1494248.0, "step": 810 }, { "entropy": 6.722617673873901, "epoch": 0.06847300987187566, "grad_norm": 1.28125, "learning_rate": 0.00040699999999999997, "loss": 6.6573, "mean_token_accuracy": 0.10585515722632408, "num_tokens": 1503565.0, "step": 815 }, { "entropy": 6.83908371925354, "epoch": 0.06889308968704054, "grad_norm": 1.15625, "learning_rate": 0.0004095, "loss": 6.745, "mean_token_accuracy": 0.10003346055746079, "num_tokens": 1513227.0, "step": 820 }, { "entropy": 6.658945035934448, "epoch": 0.06931316950220542, "grad_norm": 1.2421875, "learning_rate": 0.000412, "loss": 6.5346, "mean_token_accuracy": 0.10508675500750542, "num_tokens": 1522312.0, "step": 825 }, { "entropy": 6.637969160079956, "epoch": 0.0697332493173703, "grad_norm": 1.1796875, "learning_rate": 0.0004145, "loss": 6.4802, "mean_token_accuracy": 0.10670675709843636, "num_tokens": 1531720.0, "step": 830 }, { "entropy": 6.6340169429779055, "epoch": 0.07015332913253518, "grad_norm": 1.140625, "learning_rate": 0.000417, "loss": 6.5721, "mean_token_accuracy": 0.10074454993009567, "num_tokens": 1541238.0, "step": 835 }, { "entropy": 6.695564794540405, "epoch": 0.07057340894770006, "grad_norm": 1.21875, "learning_rate": 0.0004195, "loss": 6.6648, "mean_token_accuracy": 0.10375690832734108, "num_tokens": 1550875.0, "step": 840 }, { "entropy": 6.645870971679687, "epoch": 0.07099348876286495, "grad_norm": 1.0703125, "learning_rate": 0.000422, "loss": 6.6076, "mean_token_accuracy": 0.10648187175393105, "num_tokens": 1560287.0, "step": 845 }, { "entropy": 6.6967510223388675, "epoch": 0.07141356857802983, "grad_norm": 1.2265625, "learning_rate": 0.0004245, "loss": 6.4978, "mean_token_accuracy": 0.11105224043130875, "num_tokens": 1569043.0, "step": 850 }, { "entropy": 6.554346418380737, "epoch": 0.07183364839319471, "grad_norm": 1.140625, "learning_rate": 0.000427, "loss": 6.5154, "mean_token_accuracy": 0.11203170269727707, "num_tokens": 1578112.0, "step": 855 }, { "entropy": 6.515066003799438, "epoch": 0.07225372820835958, "grad_norm": 1.171875, "learning_rate": 0.0004295, "loss": 6.4364, "mean_token_accuracy": 0.11132391095161438, "num_tokens": 1586587.0, "step": 860 }, { "entropy": 6.742719125747681, "epoch": 0.07267380802352447, "grad_norm": 1.1796875, "learning_rate": 0.000432, "loss": 6.5978, "mean_token_accuracy": 0.10682642236351966, "num_tokens": 1595585.0, "step": 865 }, { "entropy": 6.641049814224243, "epoch": 0.07309388783868935, "grad_norm": 1.171875, "learning_rate": 0.0004345, "loss": 6.551, "mean_token_accuracy": 0.10661023184657097, "num_tokens": 1605355.0, "step": 870 }, { "entropy": 6.638308906555176, "epoch": 0.07351396765385423, "grad_norm": 1.234375, "learning_rate": 0.000437, "loss": 6.5889, "mean_token_accuracy": 0.10184741988778115, "num_tokens": 1613637.0, "step": 875 }, { "entropy": 6.658770608901977, "epoch": 0.07393404746901912, "grad_norm": 1.1953125, "learning_rate": 0.0004395, "loss": 6.5351, "mean_token_accuracy": 0.10806669145822526, "num_tokens": 1622731.0, "step": 880 }, { "entropy": 6.610404300689697, "epoch": 0.074354127284184, "grad_norm": 1.140625, "learning_rate": 0.000442, "loss": 6.5083, "mean_token_accuracy": 0.10660439133644103, "num_tokens": 1632098.0, "step": 885 }, { "entropy": 6.581112480163574, "epoch": 0.07477420709934887, "grad_norm": 1.0703125, "learning_rate": 0.0004445, "loss": 6.5117, "mean_token_accuracy": 0.10190015733242035, "num_tokens": 1641259.0, "step": 890 }, { "entropy": 6.669602966308593, "epoch": 0.07519428691451376, "grad_norm": 1.1875, "learning_rate": 0.000447, "loss": 6.5769, "mean_token_accuracy": 0.103199552744627, "num_tokens": 1651362.0, "step": 895 }, { "entropy": 6.581767272949219, "epoch": 0.07561436672967864, "grad_norm": 1.234375, "learning_rate": 0.00044950000000000003, "loss": 6.4893, "mean_token_accuracy": 0.1063641555607319, "num_tokens": 1660190.0, "step": 900 }, { "entropy": 6.573458099365235, "epoch": 0.07603444654484352, "grad_norm": 1.2421875, "learning_rate": 0.00045200000000000004, "loss": 6.5473, "mean_token_accuracy": 0.10238413438200951, "num_tokens": 1669020.0, "step": 905 }, { "entropy": 6.650429916381836, "epoch": 0.0764545263600084, "grad_norm": 1.2265625, "learning_rate": 0.00045450000000000004, "loss": 6.5657, "mean_token_accuracy": 0.1067358523607254, "num_tokens": 1678158.0, "step": 910 }, { "entropy": 6.640725135803223, "epoch": 0.07687460617517328, "grad_norm": 1.234375, "learning_rate": 0.00045700000000000005, "loss": 6.5723, "mean_token_accuracy": 0.10445328801870346, "num_tokens": 1687481.0, "step": 915 }, { "entropy": 6.5850495338439945, "epoch": 0.07729468599033816, "grad_norm": 1.1171875, "learning_rate": 0.00045950000000000006, "loss": 6.5327, "mean_token_accuracy": 0.11253217458724976, "num_tokens": 1696782.0, "step": 920 }, { "entropy": 6.555831384658814, "epoch": 0.07771476580550304, "grad_norm": 1.1171875, "learning_rate": 0.000462, "loss": 6.528, "mean_token_accuracy": 0.10824255496263505, "num_tokens": 1706153.0, "step": 925 }, { "entropy": 6.625135850906372, "epoch": 0.07813484562066793, "grad_norm": 1.0859375, "learning_rate": 0.0004645, "loss": 6.559, "mean_token_accuracy": 0.1069357268512249, "num_tokens": 1715585.0, "step": 930 }, { "entropy": 6.768569469451904, "epoch": 0.07855492543583281, "grad_norm": 1.578125, "learning_rate": 0.000467, "loss": 6.6818, "mean_token_accuracy": 0.10121209248900413, "num_tokens": 1724857.0, "step": 935 }, { "entropy": 6.553330516815185, "epoch": 0.0789750052509977, "grad_norm": 1.21875, "learning_rate": 0.0004695, "loss": 6.4953, "mean_token_accuracy": 0.11398640796542167, "num_tokens": 1733528.0, "step": 940 }, { "entropy": 6.64405460357666, "epoch": 0.07939508506616257, "grad_norm": 1.0234375, "learning_rate": 0.000472, "loss": 6.616, "mean_token_accuracy": 0.10737027525901795, "num_tokens": 1742953.0, "step": 945 }, { "entropy": 6.612694597244262, "epoch": 0.07981516488132745, "grad_norm": 1.1953125, "learning_rate": 0.0004745, "loss": 6.5605, "mean_token_accuracy": 0.11337294653058053, "num_tokens": 1752155.0, "step": 950 }, { "entropy": 6.579654312133789, "epoch": 0.08023524469649233, "grad_norm": 1.2421875, "learning_rate": 0.000477, "loss": 6.4539, "mean_token_accuracy": 0.10857293009757996, "num_tokens": 1760562.0, "step": 955 }, { "entropy": 6.568000841140747, "epoch": 0.08065532451165722, "grad_norm": 1.171875, "learning_rate": 0.0004795, "loss": 6.4953, "mean_token_accuracy": 0.10662117302417755, "num_tokens": 1769631.0, "step": 960 }, { "entropy": 6.530600309371948, "epoch": 0.0810754043268221, "grad_norm": 1.2578125, "learning_rate": 0.000482, "loss": 6.513, "mean_token_accuracy": 0.10268357619643212, "num_tokens": 1779080.0, "step": 965 }, { "entropy": 6.608699893951416, "epoch": 0.08149548414198697, "grad_norm": 1.4375, "learning_rate": 0.0004845, "loss": 6.5062, "mean_token_accuracy": 0.1082501009106636, "num_tokens": 1787830.0, "step": 970 }, { "entropy": 6.3936583518981935, "epoch": 0.08191556395715185, "grad_norm": 1.1484375, "learning_rate": 0.000487, "loss": 6.4399, "mean_token_accuracy": 0.10422300174832344, "num_tokens": 1796998.0, "step": 975 }, { "entropy": 6.6043681621551515, "epoch": 0.08233564377231674, "grad_norm": 1.1953125, "learning_rate": 0.0004895, "loss": 6.5165, "mean_token_accuracy": 0.1103883646428585, "num_tokens": 1806194.0, "step": 980 }, { "entropy": 6.372901821136475, "epoch": 0.08275572358748162, "grad_norm": 1.0234375, "learning_rate": 0.000492, "loss": 6.4139, "mean_token_accuracy": 0.11206447035074234, "num_tokens": 1815751.0, "step": 985 }, { "entropy": 6.442894506454468, "epoch": 0.0831758034026465, "grad_norm": 1.0546875, "learning_rate": 0.0004945, "loss": 6.4907, "mean_token_accuracy": 0.11046408414840699, "num_tokens": 1825379.0, "step": 990 }, { "entropy": 6.55560712814331, "epoch": 0.08359588321781139, "grad_norm": 1.203125, "learning_rate": 0.000497, "loss": 6.454, "mean_token_accuracy": 0.10851948186755181, "num_tokens": 1834158.0, "step": 995 }, { "entropy": 6.516812181472778, "epoch": 0.08401596303297626, "grad_norm": 1.2109375, "learning_rate": 0.0004995, "loss": 6.4132, "mean_token_accuracy": 0.10896480083465576, "num_tokens": 1842724.0, "step": 1000 }, { "entropy": 6.518254280090332, "epoch": 0.08443604284814114, "grad_norm": 1.125, "learning_rate": 0.000499999998724557, "loss": 6.4359, "mean_token_accuracy": 0.11062911972403526, "num_tokens": 1852485.0, "step": 1005 }, { "entropy": 6.472753667831421, "epoch": 0.08485612266330603, "grad_norm": 1.2109375, "learning_rate": 0.0004999999935430703, "loss": 6.4668, "mean_token_accuracy": 0.11211320757865906, "num_tokens": 1861303.0, "step": 1010 }, { "entropy": 6.340228652954101, "epoch": 0.08527620247847091, "grad_norm": 1.1953125, "learning_rate": 0.0004999999843758243, "loss": 6.4328, "mean_token_accuracy": 0.11544003784656524, "num_tokens": 1870859.0, "step": 1015 }, { "entropy": 6.671287488937378, "epoch": 0.0856962822936358, "grad_norm": 1.1328125, "learning_rate": 0.0004999999712228196, "loss": 6.6205, "mean_token_accuracy": 0.0996169812977314, "num_tokens": 1880295.0, "step": 1020 }, { "entropy": 6.63971586227417, "epoch": 0.08611636210880068, "grad_norm": 1.0625, "learning_rate": 0.0004999999540840562, "loss": 6.5111, "mean_token_accuracy": 0.11260380744934081, "num_tokens": 1889193.0, "step": 1025 }, { "entropy": 6.493311834335327, "epoch": 0.08653644192396555, "grad_norm": 1.140625, "learning_rate": 0.0004999999329595345, "loss": 6.6021, "mean_token_accuracy": 0.09916243478655815, "num_tokens": 1899437.0, "step": 1030 }, { "entropy": 6.5729930877685545, "epoch": 0.08695652173913043, "grad_norm": 1.078125, "learning_rate": 0.0004999999078492548, "loss": 6.5058, "mean_token_accuracy": 0.10446131974458694, "num_tokens": 1907882.0, "step": 1035 }, { "entropy": 6.492805910110474, "epoch": 0.08737660155429532, "grad_norm": 1.0078125, "learning_rate": 0.0004999998787532176, "loss": 6.4138, "mean_token_accuracy": 0.1126400038599968, "num_tokens": 1916872.0, "step": 1040 }, { "entropy": 6.506380701065064, "epoch": 0.0877966813694602, "grad_norm": 1.1484375, "learning_rate": 0.0004999998456714234, "loss": 6.5924, "mean_token_accuracy": 0.11272363439202308, "num_tokens": 1926636.0, "step": 1045 }, { "entropy": 6.5269097805023195, "epoch": 0.08821676118462508, "grad_norm": 1.2109375, "learning_rate": 0.0004999998086038729, "loss": 6.4905, "mean_token_accuracy": 0.10973675772547722, "num_tokens": 1935962.0, "step": 1050 }, { "entropy": 6.491125774383545, "epoch": 0.08863684099978995, "grad_norm": 1.1328125, "learning_rate": 0.0004999997675505665, "loss": 6.4506, "mean_token_accuracy": 0.11091897338628769, "num_tokens": 1944600.0, "step": 1055 }, { "entropy": 6.560743236541748, "epoch": 0.08905692081495484, "grad_norm": 1.109375, "learning_rate": 0.0004999997225115052, "loss": 6.6403, "mean_token_accuracy": 0.10566280484199524, "num_tokens": 1954234.0, "step": 1060 }, { "entropy": 6.71457724571228, "epoch": 0.08947700063011972, "grad_norm": 1.1328125, "learning_rate": 0.0004999996734866896, "loss": 6.5989, "mean_token_accuracy": 0.10413395762443542, "num_tokens": 1964499.0, "step": 1065 }, { "entropy": 6.307662582397461, "epoch": 0.0898970804452846, "grad_norm": 1.171875, "learning_rate": 0.0004999996204761206, "loss": 6.3004, "mean_token_accuracy": 0.11687865033745766, "num_tokens": 1973635.0, "step": 1070 }, { "entropy": 6.497924661636352, "epoch": 0.09031716026044949, "grad_norm": 1.015625, "learning_rate": 0.0004999995634797993, "loss": 6.4546, "mean_token_accuracy": 0.11474235132336616, "num_tokens": 1983509.0, "step": 1075 }, { "entropy": 6.480887317657471, "epoch": 0.09073724007561437, "grad_norm": 1.140625, "learning_rate": 0.0004999995024977265, "loss": 6.4317, "mean_token_accuracy": 0.11495376154780387, "num_tokens": 1992336.0, "step": 1080 }, { "entropy": 6.518788290023804, "epoch": 0.09115731989077924, "grad_norm": 1.0546875, "learning_rate": 0.0004999994375299034, "loss": 6.467, "mean_token_accuracy": 0.11141329482197762, "num_tokens": 2001931.0, "step": 1085 }, { "entropy": 6.396731853485107, "epoch": 0.09157739970594413, "grad_norm": 1.0625, "learning_rate": 0.000499999368576331, "loss": 6.3268, "mean_token_accuracy": 0.11944503113627433, "num_tokens": 2010935.0, "step": 1090 }, { "entropy": 6.410158395767212, "epoch": 0.09199747952110901, "grad_norm": 1.03125, "learning_rate": 0.0004999992956370109, "loss": 6.3911, "mean_token_accuracy": 0.1134261205792427, "num_tokens": 2020587.0, "step": 1095 }, { "entropy": 6.34518141746521, "epoch": 0.0924175593362739, "grad_norm": 1.0703125, "learning_rate": 0.000499999218711944, "loss": 6.4155, "mean_token_accuracy": 0.11149835661053657, "num_tokens": 2029743.0, "step": 1100 }, { "entropy": 6.548259019851685, "epoch": 0.09283763915143878, "grad_norm": 1.1953125, "learning_rate": 0.0004999991378011317, "loss": 6.4423, "mean_token_accuracy": 0.11465151533484459, "num_tokens": 2038468.0, "step": 1105 }, { "entropy": 6.381483364105224, "epoch": 0.09325771896660366, "grad_norm": 1.1171875, "learning_rate": 0.0004999990529045757, "loss": 6.3588, "mean_token_accuracy": 0.11419494673609734, "num_tokens": 2047456.0, "step": 1110 }, { "entropy": 6.610185289382935, "epoch": 0.09367779878176853, "grad_norm": 1.0078125, "learning_rate": 0.0004999989640222771, "loss": 6.6665, "mean_token_accuracy": 0.09994395300745965, "num_tokens": 2056691.0, "step": 1115 }, { "entropy": 6.5219189643859865, "epoch": 0.09409787859693342, "grad_norm": 1.0546875, "learning_rate": 0.000499998871154238, "loss": 6.4644, "mean_token_accuracy": 0.10945621505379677, "num_tokens": 2066068.0, "step": 1120 }, { "entropy": 6.522899913787842, "epoch": 0.0945179584120983, "grad_norm": 1.0546875, "learning_rate": 0.0004999987743004597, "loss": 6.3814, "mean_token_accuracy": 0.11605900377035142, "num_tokens": 2075113.0, "step": 1125 }, { "entropy": 6.463716268539429, "epoch": 0.09493803822726318, "grad_norm": 1.0625, "learning_rate": 0.0004999986734609438, "loss": 6.5424, "mean_token_accuracy": 0.11121912077069282, "num_tokens": 2084557.0, "step": 1130 }, { "entropy": 6.5195088386535645, "epoch": 0.09535811804242807, "grad_norm": 1.1484375, "learning_rate": 0.0004999985686356923, "loss": 6.4293, "mean_token_accuracy": 0.10896992832422256, "num_tokens": 2093424.0, "step": 1135 }, { "entropy": 6.501124429702759, "epoch": 0.09577819785759294, "grad_norm": 1.09375, "learning_rate": 0.000499998459824707, "loss": 6.541, "mean_token_accuracy": 0.10720071643590927, "num_tokens": 2103066.0, "step": 1140 }, { "entropy": 6.485859060287476, "epoch": 0.09619827767275782, "grad_norm": 1.0859375, "learning_rate": 0.00049999834702799, "loss": 6.4218, "mean_token_accuracy": 0.11379043385386467, "num_tokens": 2112447.0, "step": 1145 }, { "entropy": 6.421380949020386, "epoch": 0.0966183574879227, "grad_norm": 1.0703125, "learning_rate": 0.0004999982302455431, "loss": 6.4471, "mean_token_accuracy": 0.11498942598700523, "num_tokens": 2121949.0, "step": 1150 }, { "entropy": 6.473872327804566, "epoch": 0.09703843730308759, "grad_norm": 1.0703125, "learning_rate": 0.0004999981094773683, "loss": 6.3538, "mean_token_accuracy": 0.11318295449018478, "num_tokens": 2130464.0, "step": 1155 }, { "entropy": 6.440775918960571, "epoch": 0.09745851711825247, "grad_norm": 1.140625, "learning_rate": 0.000499997984723468, "loss": 6.516, "mean_token_accuracy": 0.10744207352399826, "num_tokens": 2139577.0, "step": 1160 }, { "entropy": 6.234747648239136, "epoch": 0.09787859693341736, "grad_norm": 1.0078125, "learning_rate": 0.0004999978559838441, "loss": 6.2367, "mean_token_accuracy": 0.11980548799037934, "num_tokens": 2147919.0, "step": 1165 }, { "entropy": 6.425514888763428, "epoch": 0.09829867674858223, "grad_norm": 1.1015625, "learning_rate": 0.0004999977232584991, "loss": 6.411, "mean_token_accuracy": 0.11489588171243667, "num_tokens": 2156936.0, "step": 1170 }, { "entropy": 6.470327949523925, "epoch": 0.09871875656374711, "grad_norm": 1.109375, "learning_rate": 0.0004999975865474354, "loss": 6.474, "mean_token_accuracy": 0.11181759610772132, "num_tokens": 2165362.0, "step": 1175 }, { "entropy": 6.40549669265747, "epoch": 0.099138836378912, "grad_norm": 1.1640625, "learning_rate": 0.0004999974458506551, "loss": 6.3913, "mean_token_accuracy": 0.11321083605289459, "num_tokens": 2173665.0, "step": 1180 }, { "entropy": 6.484990215301513, "epoch": 0.09955891619407688, "grad_norm": 1.1875, "learning_rate": 0.000499997301168161, "loss": 6.3547, "mean_token_accuracy": 0.1165615513920784, "num_tokens": 2182222.0, "step": 1185 }, { "entropy": 6.47981333732605, "epoch": 0.09997899600924176, "grad_norm": 1.0234375, "learning_rate": 0.0004999971524999556, "loss": 6.4586, "mean_token_accuracy": 0.11381722316145897, "num_tokens": 2192358.0, "step": 1190 }, { "entropy": 6.44229564666748, "epoch": 0.10039907582440663, "grad_norm": 1.0703125, "learning_rate": 0.0004999969998460414, "loss": 6.4519, "mean_token_accuracy": 0.11379328668117523, "num_tokens": 2201889.0, "step": 1195 }, { "entropy": 6.39982123374939, "epoch": 0.10081915563957151, "grad_norm": 1.34375, "learning_rate": 0.0004999968432064213, "loss": 6.4571, "mean_token_accuracy": 0.11692695915699006, "num_tokens": 2211810.0, "step": 1200 }, { "entropy": 6.344041538238526, "epoch": 0.1012392354547364, "grad_norm": 1.0078125, "learning_rate": 0.0004999966825810979, "loss": 6.401, "mean_token_accuracy": 0.11303748413920403, "num_tokens": 2221123.0, "step": 1205 }, { "entropy": 6.35064206123352, "epoch": 0.10165931526990128, "grad_norm": 1.0859375, "learning_rate": 0.0004999965179700742, "loss": 6.3287, "mean_token_accuracy": 0.11802673861384391, "num_tokens": 2230129.0, "step": 1210 }, { "entropy": 6.3616985321044925, "epoch": 0.10207939508506617, "grad_norm": 1.0234375, "learning_rate": 0.000499996349373353, "loss": 6.3828, "mean_token_accuracy": 0.11236807405948639, "num_tokens": 2239929.0, "step": 1215 }, { "entropy": 6.4787780284881595, "epoch": 0.10249947490023105, "grad_norm": 1.1171875, "learning_rate": 0.0004999961767909374, "loss": 6.3565, "mean_token_accuracy": 0.11246357783675194, "num_tokens": 2248078.0, "step": 1220 }, { "entropy": 6.329130172729492, "epoch": 0.10291955471539592, "grad_norm": 1.078125, "learning_rate": 0.0004999960002228303, "loss": 6.4464, "mean_token_accuracy": 0.11457708552479744, "num_tokens": 2256975.0, "step": 1225 }, { "entropy": 6.41359543800354, "epoch": 0.1033396345305608, "grad_norm": 1.1953125, "learning_rate": 0.0004999958196690349, "loss": 6.3075, "mean_token_accuracy": 0.11755945533514023, "num_tokens": 2265797.0, "step": 1230 }, { "entropy": 6.318053436279297, "epoch": 0.10375971434572569, "grad_norm": 1.109375, "learning_rate": 0.0004999956351295545, "loss": 6.407, "mean_token_accuracy": 0.12029065862298012, "num_tokens": 2274099.0, "step": 1235 }, { "entropy": 6.369567108154297, "epoch": 0.10417979416089057, "grad_norm": 1.0390625, "learning_rate": 0.0004999954466043922, "loss": 6.3363, "mean_token_accuracy": 0.11501603052020073, "num_tokens": 2282360.0, "step": 1240 }, { "entropy": 6.357530832290649, "epoch": 0.10459987397605545, "grad_norm": 1.0234375, "learning_rate": 0.0004999952540935514, "loss": 6.427, "mean_token_accuracy": 0.10687654167413711, "num_tokens": 2292714.0, "step": 1245 }, { "entropy": 6.4097044467926025, "epoch": 0.10501995379122034, "grad_norm": 1.0859375, "learning_rate": 0.0004999950575970356, "loss": 6.3634, "mean_token_accuracy": 0.11621319502592087, "num_tokens": 2301633.0, "step": 1250 }, { "entropy": 6.369806289672852, "epoch": 0.10544003360638521, "grad_norm": 1.0234375, "learning_rate": 0.0004999948571148482, "loss": 6.3449, "mean_token_accuracy": 0.12026969790458679, "num_tokens": 2310067.0, "step": 1255 }, { "entropy": 6.404969978332519, "epoch": 0.10586011342155009, "grad_norm": 1.1015625, "learning_rate": 0.0004999946526469927, "loss": 6.4302, "mean_token_accuracy": 0.11556015834212303, "num_tokens": 2320090.0, "step": 1260 }, { "entropy": 6.372836637496948, "epoch": 0.10628019323671498, "grad_norm": 1.140625, "learning_rate": 0.0004999944441934728, "loss": 6.3703, "mean_token_accuracy": 0.1213706873357296, "num_tokens": 2329255.0, "step": 1265 }, { "entropy": 6.461856746673584, "epoch": 0.10670027305187986, "grad_norm": 1.1640625, "learning_rate": 0.0004999942317542922, "loss": 6.4648, "mean_token_accuracy": 0.11396320164203644, "num_tokens": 2339535.0, "step": 1270 }, { "entropy": 6.327042865753174, "epoch": 0.10712035286704474, "grad_norm": 1.0625, "learning_rate": 0.0004999940153294546, "loss": 6.3664, "mean_token_accuracy": 0.11329737976193428, "num_tokens": 2348948.0, "step": 1275 }, { "entropy": 6.429010534286499, "epoch": 0.10754043268220961, "grad_norm": 1.0703125, "learning_rate": 0.000499993794918964, "loss": 6.3999, "mean_token_accuracy": 0.11255929544568062, "num_tokens": 2359141.0, "step": 1280 }, { "entropy": 6.310593366622925, "epoch": 0.1079605124973745, "grad_norm": 1.2109375, "learning_rate": 0.0004999935705228241, "loss": 6.4541, "mean_token_accuracy": 0.11158784702420235, "num_tokens": 2368906.0, "step": 1285 }, { "entropy": 6.527987766265869, "epoch": 0.10838059231253938, "grad_norm": 1.171875, "learning_rate": 0.0004999933421410389, "loss": 6.4033, "mean_token_accuracy": 0.11808300763368607, "num_tokens": 2377029.0, "step": 1290 }, { "entropy": 6.42291259765625, "epoch": 0.10880067212770426, "grad_norm": 0.8984375, "learning_rate": 0.0004999931097736125, "loss": 6.4774, "mean_token_accuracy": 0.11096713319420815, "num_tokens": 2387088.0, "step": 1295 }, { "entropy": 6.416878700256348, "epoch": 0.10922075194286915, "grad_norm": 1.0703125, "learning_rate": 0.0004999928734205492, "loss": 6.3793, "mean_token_accuracy": 0.11391115635633468, "num_tokens": 2395596.0, "step": 1300 }, { "entropy": 6.3132000923156735, "epoch": 0.10964083175803403, "grad_norm": 1.1171875, "learning_rate": 0.0004999926330818528, "loss": 6.3632, "mean_token_accuracy": 0.12021223455667496, "num_tokens": 2404506.0, "step": 1305 }, { "entropy": 6.388805866241455, "epoch": 0.1100609115731989, "grad_norm": 1.1015625, "learning_rate": 0.0004999923887575278, "loss": 6.4068, "mean_token_accuracy": 0.11463478580117226, "num_tokens": 2414342.0, "step": 1310 }, { "entropy": 6.404540014266968, "epoch": 0.11048099138836379, "grad_norm": 1.125, "learning_rate": 0.0004999921404475785, "loss": 6.3616, "mean_token_accuracy": 0.11288373470306397, "num_tokens": 2423076.0, "step": 1315 }, { "entropy": 6.345643043518066, "epoch": 0.11090107120352867, "grad_norm": 0.96875, "learning_rate": 0.0004999918881520093, "loss": 6.3217, "mean_token_accuracy": 0.12194343283772469, "num_tokens": 2432492.0, "step": 1320 }, { "entropy": 6.29752926826477, "epoch": 0.11132115101869355, "grad_norm": 1.03125, "learning_rate": 0.0004999916318708246, "loss": 6.2719, "mean_token_accuracy": 0.12224209308624268, "num_tokens": 2441916.0, "step": 1325 }, { "entropy": 6.3799408912658695, "epoch": 0.11174123083385844, "grad_norm": 1.109375, "learning_rate": 0.0004999913716040291, "loss": 6.3145, "mean_token_accuracy": 0.12352623343467713, "num_tokens": 2450932.0, "step": 1330 }, { "entropy": 6.248048543930054, "epoch": 0.11216131064902331, "grad_norm": 1.1875, "learning_rate": 0.0004999911073516272, "loss": 6.354, "mean_token_accuracy": 0.11721153929829597, "num_tokens": 2460058.0, "step": 1335 }, { "entropy": 6.305795478820801, "epoch": 0.11258139046418819, "grad_norm": 1.03125, "learning_rate": 0.0004999908391136237, "loss": 6.3027, "mean_token_accuracy": 0.11657693609595299, "num_tokens": 2469607.0, "step": 1340 }, { "entropy": 6.403741073608399, "epoch": 0.11300147027935308, "grad_norm": 1.109375, "learning_rate": 0.0004999905668900234, "loss": 6.3338, "mean_token_accuracy": 0.11997214257717133, "num_tokens": 2478345.0, "step": 1345 }, { "entropy": 6.354722595214843, "epoch": 0.11342155009451796, "grad_norm": 1.234375, "learning_rate": 0.000499990290680831, "loss": 6.2712, "mean_token_accuracy": 0.12181988656520844, "num_tokens": 2486662.0, "step": 1350 }, { "entropy": 6.314315986633301, "epoch": 0.11384162990968284, "grad_norm": 1.109375, "learning_rate": 0.0004999900104860516, "loss": 6.4018, "mean_token_accuracy": 0.1190257914364338, "num_tokens": 2495392.0, "step": 1355 }, { "entropy": 6.396038579940796, "epoch": 0.11426170972484773, "grad_norm": 1.0859375, "learning_rate": 0.0004999897263056898, "loss": 6.4374, "mean_token_accuracy": 0.11400101035833358, "num_tokens": 2505254.0, "step": 1360 }, { "entropy": 6.415481328964233, "epoch": 0.1146817895400126, "grad_norm": 1.1015625, "learning_rate": 0.000499989438139751, "loss": 6.242, "mean_token_accuracy": 0.12410487607121468, "num_tokens": 2514096.0, "step": 1365 }, { "entropy": 6.27006311416626, "epoch": 0.11510186935517748, "grad_norm": 0.9140625, "learning_rate": 0.0004999891459882401, "loss": 6.2694, "mean_token_accuracy": 0.11747925281524658, "num_tokens": 2523635.0, "step": 1370 }, { "entropy": 6.264933919906616, "epoch": 0.11552194917034236, "grad_norm": 1.0546875, "learning_rate": 0.0004999888498511624, "loss": 6.3385, "mean_token_accuracy": 0.11789564937353134, "num_tokens": 2532528.0, "step": 1375 }, { "entropy": 6.311061573028565, "epoch": 0.11594202898550725, "grad_norm": 1.109375, "learning_rate": 0.0004999885497285229, "loss": 6.2494, "mean_token_accuracy": 0.1151455245912075, "num_tokens": 2541893.0, "step": 1380 }, { "entropy": 6.299208688735962, "epoch": 0.11636210880067213, "grad_norm": 1.03125, "learning_rate": 0.0004999882456203273, "loss": 6.3013, "mean_token_accuracy": 0.12206007465720177, "num_tokens": 2551551.0, "step": 1385 }, { "entropy": 6.300058650970459, "epoch": 0.11678218861583702, "grad_norm": 1.1875, "learning_rate": 0.0004999879375265806, "loss": 6.2409, "mean_token_accuracy": 0.11834143102169037, "num_tokens": 2560183.0, "step": 1390 }, { "entropy": 6.282702207565308, "epoch": 0.11720226843100189, "grad_norm": 1.1484375, "learning_rate": 0.0004999876254472886, "loss": 6.1445, "mean_token_accuracy": 0.12687626332044602, "num_tokens": 2568697.0, "step": 1395 }, { "entropy": 6.272518634796143, "epoch": 0.11762234824616677, "grad_norm": 0.99609375, "learning_rate": 0.0004999873093824565, "loss": 6.3599, "mean_token_accuracy": 0.11727140471339226, "num_tokens": 2578151.0, "step": 1400 }, { "entropy": 6.458877515792847, "epoch": 0.11804242806133165, "grad_norm": 1.1640625, "learning_rate": 0.0004999869893320902, "loss": 6.466, "mean_token_accuracy": 0.11793015375733376, "num_tokens": 2585901.0, "step": 1405 }, { "entropy": 6.321643972396851, "epoch": 0.11846250787649654, "grad_norm": 1.078125, "learning_rate": 0.0004999866652961952, "loss": 6.3084, "mean_token_accuracy": 0.11437714174389839, "num_tokens": 2595655.0, "step": 1410 }, { "entropy": 6.353274202346801, "epoch": 0.11888258769166142, "grad_norm": 1.015625, "learning_rate": 0.0004999863372747773, "loss": 6.2695, "mean_token_accuracy": 0.11410242393612861, "num_tokens": 2604949.0, "step": 1415 }, { "entropy": 6.335246944427491, "epoch": 0.11930266750682629, "grad_norm": 1.25, "learning_rate": 0.0004999860052678423, "loss": 6.3408, "mean_token_accuracy": 0.11986073330044747, "num_tokens": 2614260.0, "step": 1420 }, { "entropy": 6.268476724624634, "epoch": 0.11972274732199117, "grad_norm": 1.2421875, "learning_rate": 0.0004999856692753959, "loss": 6.3378, "mean_token_accuracy": 0.11452461034059525, "num_tokens": 2623740.0, "step": 1425 }, { "entropy": 6.361990165710449, "epoch": 0.12014282713715606, "grad_norm": 1.1015625, "learning_rate": 0.0004999853292974444, "loss": 6.2447, "mean_token_accuracy": 0.11876029148697853, "num_tokens": 2631998.0, "step": 1430 }, { "entropy": 6.266280698776245, "epoch": 0.12056290695232094, "grad_norm": 0.99609375, "learning_rate": 0.0004999849853339936, "loss": 6.3713, "mean_token_accuracy": 0.12363618090748787, "num_tokens": 2641169.0, "step": 1435 }, { "entropy": 6.410887956619263, "epoch": 0.12098298676748583, "grad_norm": 0.94140625, "learning_rate": 0.0004999846373850497, "loss": 6.2122, "mean_token_accuracy": 0.12586963474750518, "num_tokens": 2650576.0, "step": 1440 }, { "entropy": 6.177606773376465, "epoch": 0.12140306658265071, "grad_norm": 1.015625, "learning_rate": 0.0004999842854506186, "loss": 6.3172, "mean_token_accuracy": 0.11797089278697967, "num_tokens": 2660817.0, "step": 1445 }, { "entropy": 6.282220935821533, "epoch": 0.12182314639781558, "grad_norm": 1.1171875, "learning_rate": 0.0004999839295307069, "loss": 6.2561, "mean_token_accuracy": 0.1204748846590519, "num_tokens": 2669338.0, "step": 1450 }, { "entropy": 6.330542469024659, "epoch": 0.12224322621298046, "grad_norm": 1.140625, "learning_rate": 0.0004999835696253206, "loss": 6.3117, "mean_token_accuracy": 0.11790118813514709, "num_tokens": 2679108.0, "step": 1455 }, { "entropy": 6.36133828163147, "epoch": 0.12266330602814535, "grad_norm": 0.97265625, "learning_rate": 0.0004999832057344664, "loss": 6.2739, "mean_token_accuracy": 0.11782214790582657, "num_tokens": 2688126.0, "step": 1460 }, { "entropy": 6.110802221298218, "epoch": 0.12308338584331023, "grad_norm": 1.1171875, "learning_rate": 0.0004999828378581504, "loss": 6.2493, "mean_token_accuracy": 0.1267126329243183, "num_tokens": 2697245.0, "step": 1465 }, { "entropy": 6.332847547531128, "epoch": 0.12350346565847511, "grad_norm": 1.0703125, "learning_rate": 0.0004999824659963793, "loss": 6.3012, "mean_token_accuracy": 0.12391207665205002, "num_tokens": 2705934.0, "step": 1470 }, { "entropy": 6.2204491138458256, "epoch": 0.12392354547364, "grad_norm": 1.203125, "learning_rate": 0.0004999820901491598, "loss": 6.2299, "mean_token_accuracy": 0.12465188652276993, "num_tokens": 2714367.0, "step": 1475 }, { "entropy": 6.265383291244507, "epoch": 0.12434362528880487, "grad_norm": 1.1171875, "learning_rate": 0.0004999817103164983, "loss": 6.2882, "mean_token_accuracy": 0.12172888666391372, "num_tokens": 2724366.0, "step": 1480 }, { "entropy": 6.282348680496216, "epoch": 0.12476370510396975, "grad_norm": 1.0, "learning_rate": 0.0004999813264984017, "loss": 6.284, "mean_token_accuracy": 0.11969415470957756, "num_tokens": 2733980.0, "step": 1485 }, { "entropy": 6.340108251571655, "epoch": 0.12518378491913462, "grad_norm": 0.98046875, "learning_rate": 0.0004999809386948767, "loss": 6.2714, "mean_token_accuracy": 0.12543393298983574, "num_tokens": 2744013.0, "step": 1490 }, { "entropy": 6.2037091732025145, "epoch": 0.12560386473429952, "grad_norm": 1.1640625, "learning_rate": 0.0004999805469059302, "loss": 6.3445, "mean_token_accuracy": 0.11714137867093086, "num_tokens": 2753385.0, "step": 1495 }, { "entropy": 6.296666240692138, "epoch": 0.1260239445494644, "grad_norm": 1.09375, "learning_rate": 0.0004999801511315693, "loss": 6.1931, "mean_token_accuracy": 0.12192152738571167, "num_tokens": 2762875.0, "step": 1500 }, { "entropy": 6.284714651107788, "epoch": 0.1264440243646293, "grad_norm": 1.0859375, "learning_rate": 0.0004999797513718007, "loss": 6.2602, "mean_token_accuracy": 0.12598440870642663, "num_tokens": 2772182.0, "step": 1505 }, { "entropy": 6.161528825759888, "epoch": 0.12686410417979416, "grad_norm": 1.0625, "learning_rate": 0.0004999793476266317, "loss": 6.2127, "mean_token_accuracy": 0.12566941007971763, "num_tokens": 2780814.0, "step": 1510 }, { "entropy": 6.568186330795288, "epoch": 0.12728418399495905, "grad_norm": 1.046875, "learning_rate": 0.0004999789398960695, "loss": 6.483, "mean_token_accuracy": 0.11876541525125503, "num_tokens": 2791104.0, "step": 1515 }, { "entropy": 6.092951726913452, "epoch": 0.12770426381012392, "grad_norm": 1.0703125, "learning_rate": 0.0004999785281801212, "loss": 6.1965, "mean_token_accuracy": 0.12290481179952621, "num_tokens": 2800081.0, "step": 1520 }, { "entropy": 6.291093444824218, "epoch": 0.1281243436252888, "grad_norm": 1.1015625, "learning_rate": 0.000499978112478794, "loss": 6.3106, "mean_token_accuracy": 0.12357923462986946, "num_tokens": 2809096.0, "step": 1525 }, { "entropy": 6.3231532096862795, "epoch": 0.1285444234404537, "grad_norm": 1.0703125, "learning_rate": 0.0004999776927920955, "loss": 6.2832, "mean_token_accuracy": 0.11848211586475373, "num_tokens": 2818857.0, "step": 1530 }, { "entropy": 6.26645565032959, "epoch": 0.12896450325561856, "grad_norm": 1.0859375, "learning_rate": 0.000499977269120033, "loss": 6.3691, "mean_token_accuracy": 0.11801103353500367, "num_tokens": 2829332.0, "step": 1535 }, { "entropy": 6.273476028442383, "epoch": 0.12938458307078346, "grad_norm": 1.015625, "learning_rate": 0.000499976841462614, "loss": 6.2806, "mean_token_accuracy": 0.1197669893503189, "num_tokens": 2839193.0, "step": 1540 }, { "entropy": 6.308599948883057, "epoch": 0.12980466288594833, "grad_norm": 1.0, "learning_rate": 0.000499976409819846, "loss": 6.2735, "mean_token_accuracy": 0.11774821653962135, "num_tokens": 2848535.0, "step": 1545 }, { "entropy": 6.131243658065796, "epoch": 0.1302247427011132, "grad_norm": 1.0, "learning_rate": 0.0004999759741917369, "loss": 6.1661, "mean_token_accuracy": 0.12612850666046144, "num_tokens": 2858090.0, "step": 1550 }, { "entropy": 6.301682853698731, "epoch": 0.1306448225162781, "grad_norm": 1.15625, "learning_rate": 0.0004999755345782941, "loss": 6.3181, "mean_token_accuracy": 0.1226440578699112, "num_tokens": 2866984.0, "step": 1555 }, { "entropy": 6.184937286376953, "epoch": 0.13106490233144297, "grad_norm": 0.94921875, "learning_rate": 0.0004999750909795256, "loss": 6.1325, "mean_token_accuracy": 0.12440444529056549, "num_tokens": 2876550.0, "step": 1560 }, { "entropy": 6.258828830718994, "epoch": 0.13148498214660786, "grad_norm": 1.0390625, "learning_rate": 0.0004999746433954394, "loss": 6.241, "mean_token_accuracy": 0.12188669666647911, "num_tokens": 2885782.0, "step": 1565 }, { "entropy": 6.217999792098999, "epoch": 0.13190506196177273, "grad_norm": 1.0625, "learning_rate": 0.000499974191826043, "loss": 6.2134, "mean_token_accuracy": 0.1303790420293808, "num_tokens": 2894807.0, "step": 1570 }, { "entropy": 6.286883115768433, "epoch": 0.1323251417769376, "grad_norm": 1.2109375, "learning_rate": 0.0004999737362713448, "loss": 6.2503, "mean_token_accuracy": 0.12286639586091042, "num_tokens": 2904076.0, "step": 1575 }, { "entropy": 6.20257887840271, "epoch": 0.1327452215921025, "grad_norm": 1.2578125, "learning_rate": 0.0004999732767313527, "loss": 6.1442, "mean_token_accuracy": 0.12661461755633355, "num_tokens": 2913761.0, "step": 1580 }, { "entropy": 6.346931409835816, "epoch": 0.13316530140726737, "grad_norm": 1.1171875, "learning_rate": 0.0004999728132060746, "loss": 6.3898, "mean_token_accuracy": 0.12459043636918068, "num_tokens": 2922848.0, "step": 1585 }, { "entropy": 6.276056003570557, "epoch": 0.13358538122243227, "grad_norm": 0.95703125, "learning_rate": 0.0004999723456955192, "loss": 6.271, "mean_token_accuracy": 0.1242086909711361, "num_tokens": 2932718.0, "step": 1590 }, { "entropy": 6.151839303970337, "epoch": 0.13400546103759714, "grad_norm": 1.015625, "learning_rate": 0.0004999718741996945, "loss": 6.2133, "mean_token_accuracy": 0.12332009747624398, "num_tokens": 2942686.0, "step": 1595 }, { "entropy": 6.2299316883087155, "epoch": 0.13442554085276204, "grad_norm": 1.0546875, "learning_rate": 0.000499971398718609, "loss": 6.194, "mean_token_accuracy": 0.12265397682785988, "num_tokens": 2952096.0, "step": 1600 }, { "entropy": 6.27053918838501, "epoch": 0.1348456206679269, "grad_norm": 1.03125, "learning_rate": 0.0004999709192522708, "loss": 6.2496, "mean_token_accuracy": 0.12414331436157226, "num_tokens": 2960660.0, "step": 1605 }, { "entropy": 6.299257707595825, "epoch": 0.13526570048309178, "grad_norm": 1.0, "learning_rate": 0.0004999704358006887, "loss": 6.2485, "mean_token_accuracy": 0.1208350658416748, "num_tokens": 2969834.0, "step": 1610 }, { "entropy": 6.205888414382935, "epoch": 0.13568578029825668, "grad_norm": 1.1328125, "learning_rate": 0.0004999699483638712, "loss": 6.2531, "mean_token_accuracy": 0.12345886677503586, "num_tokens": 2979023.0, "step": 1615 }, { "entropy": 6.2445619106292725, "epoch": 0.13610586011342155, "grad_norm": 1.0625, "learning_rate": 0.0004999694569418269, "loss": 6.2532, "mean_token_accuracy": 0.12339803278446197, "num_tokens": 2988083.0, "step": 1620 }, { "entropy": 6.20722599029541, "epoch": 0.13652593992858644, "grad_norm": 1.0390625, "learning_rate": 0.0004999689615345645, "loss": 6.1689, "mean_token_accuracy": 0.1274717427790165, "num_tokens": 2997240.0, "step": 1625 }, { "entropy": 6.2464118003845215, "epoch": 0.1369460197437513, "grad_norm": 1.125, "learning_rate": 0.0004999684621420928, "loss": 6.2565, "mean_token_accuracy": 0.12297938466072082, "num_tokens": 3007077.0, "step": 1630 }, { "entropy": 6.293700885772705, "epoch": 0.13736609955891618, "grad_norm": 1.0546875, "learning_rate": 0.0004999679587644205, "loss": 6.2787, "mean_token_accuracy": 0.1208807609975338, "num_tokens": 3015821.0, "step": 1635 }, { "entropy": 6.17975435256958, "epoch": 0.13778617937408108, "grad_norm": 1.1171875, "learning_rate": 0.0004999674514015568, "loss": 6.2054, "mean_token_accuracy": 0.1252801388502121, "num_tokens": 3025858.0, "step": 1640 }, { "entropy": 6.2544519901275635, "epoch": 0.13820625918924595, "grad_norm": 1.0703125, "learning_rate": 0.0004999669400535105, "loss": 6.1887, "mean_token_accuracy": 0.11709433272480965, "num_tokens": 3035537.0, "step": 1645 }, { "entropy": 6.045716142654419, "epoch": 0.13862633900441085, "grad_norm": 1.1953125, "learning_rate": 0.0004999664247202907, "loss": 6.1026, "mean_token_accuracy": 0.12171815410256386, "num_tokens": 3044204.0, "step": 1650 }, { "entropy": 6.264136171340942, "epoch": 0.13904641881957572, "grad_norm": 1.125, "learning_rate": 0.0004999659054019066, "loss": 6.2747, "mean_token_accuracy": 0.1242525890469551, "num_tokens": 3053111.0, "step": 1655 }, { "entropy": 6.191974449157715, "epoch": 0.1394664986347406, "grad_norm": 1.109375, "learning_rate": 0.0004999653820983673, "loss": 6.1818, "mean_token_accuracy": 0.12419796586036683, "num_tokens": 3062456.0, "step": 1660 }, { "entropy": 6.188469123840332, "epoch": 0.13988657844990549, "grad_norm": 1.078125, "learning_rate": 0.000499964854809682, "loss": 6.2114, "mean_token_accuracy": 0.12520652115345002, "num_tokens": 3071132.0, "step": 1665 }, { "entropy": 6.217535066604614, "epoch": 0.14030665826507036, "grad_norm": 1.0078125, "learning_rate": 0.0004999643235358602, "loss": 6.1733, "mean_token_accuracy": 0.12733130380511284, "num_tokens": 3080892.0, "step": 1670 }, { "entropy": 6.150455570220947, "epoch": 0.14072673808023525, "grad_norm": 1.09375, "learning_rate": 0.0004999637882769112, "loss": 6.1088, "mean_token_accuracy": 0.13008806556463243, "num_tokens": 3089874.0, "step": 1675 }, { "entropy": 6.232478332519531, "epoch": 0.14114681789540012, "grad_norm": 0.96484375, "learning_rate": 0.0004999632490328447, "loss": 6.2504, "mean_token_accuracy": 0.12480302304029464, "num_tokens": 3099535.0, "step": 1680 }, { "entropy": 6.220491170883179, "epoch": 0.14156689771056502, "grad_norm": 1.0078125, "learning_rate": 0.0004999627058036699, "loss": 6.1932, "mean_token_accuracy": 0.12512605339288713, "num_tokens": 3108772.0, "step": 1685 }, { "entropy": 6.254866027832032, "epoch": 0.1419869775257299, "grad_norm": 1.0625, "learning_rate": 0.0004999621585893966, "loss": 6.2305, "mean_token_accuracy": 0.11818314492702484, "num_tokens": 3118333.0, "step": 1690 }, { "entropy": 6.258799934387207, "epoch": 0.14240705734089476, "grad_norm": 1.0859375, "learning_rate": 0.0004999616073900346, "loss": 6.2544, "mean_token_accuracy": 0.1175099603831768, "num_tokens": 3127356.0, "step": 1695 }, { "entropy": 6.158872365951538, "epoch": 0.14282713715605966, "grad_norm": 1.125, "learning_rate": 0.0004999610522055935, "loss": 6.2288, "mean_token_accuracy": 0.12072905600070953, "num_tokens": 3136859.0, "step": 1700 }, { "entropy": 6.2665447235107425, "epoch": 0.14324721697122453, "grad_norm": 1.0859375, "learning_rate": 0.0004999604930360832, "loss": 6.2553, "mean_token_accuracy": 0.11907806620001793, "num_tokens": 3146607.0, "step": 1705 }, { "entropy": 6.134842443466186, "epoch": 0.14366729678638943, "grad_norm": 1.0078125, "learning_rate": 0.0004999599298815136, "loss": 6.1946, "mean_token_accuracy": 0.12945861145853996, "num_tokens": 3156327.0, "step": 1710 }, { "entropy": 6.1354063034057615, "epoch": 0.1440873766015543, "grad_norm": 1.6015625, "learning_rate": 0.0004999593627418947, "loss": 6.1466, "mean_token_accuracy": 0.13011169731616973, "num_tokens": 3165559.0, "step": 1715 }, { "entropy": 6.2760594367980955, "epoch": 0.14450745641671917, "grad_norm": 1.0859375, "learning_rate": 0.0004999587916172365, "loss": 6.247, "mean_token_accuracy": 0.11565925851464272, "num_tokens": 3173850.0, "step": 1720 }, { "entropy": 6.219358253479004, "epoch": 0.14492753623188406, "grad_norm": 1.0390625, "learning_rate": 0.0004999582165075492, "loss": 6.1819, "mean_token_accuracy": 0.12041235193610192, "num_tokens": 3182838.0, "step": 1725 }, { "entropy": 6.098177671432495, "epoch": 0.14534761604704893, "grad_norm": 1.0859375, "learning_rate": 0.0004999576374128429, "loss": 6.1848, "mean_token_accuracy": 0.12102061733603478, "num_tokens": 3191692.0, "step": 1730 }, { "entropy": 6.281035900115967, "epoch": 0.14576769586221383, "grad_norm": 1.109375, "learning_rate": 0.0004999570543331279, "loss": 6.2096, "mean_token_accuracy": 0.12320142686367035, "num_tokens": 3200069.0, "step": 1735 }, { "entropy": 6.174470567703247, "epoch": 0.1461877756773787, "grad_norm": 1.2109375, "learning_rate": 0.0004999564672684145, "loss": 6.2813, "mean_token_accuracy": 0.11924844831228257, "num_tokens": 3209653.0, "step": 1740 }, { "entropy": 6.267424774169922, "epoch": 0.14660785549254357, "grad_norm": 1.0703125, "learning_rate": 0.0004999558762187131, "loss": 6.1641, "mean_token_accuracy": 0.1311913624405861, "num_tokens": 3218313.0, "step": 1745 }, { "entropy": 6.102459383010864, "epoch": 0.14702793530770847, "grad_norm": 1.09375, "learning_rate": 0.0004999552811840342, "loss": 6.0922, "mean_token_accuracy": 0.12970734164118766, "num_tokens": 3227525.0, "step": 1750 }, { "entropy": 6.164451694488525, "epoch": 0.14744801512287334, "grad_norm": 1.015625, "learning_rate": 0.0004999546821643884, "loss": 6.2098, "mean_token_accuracy": 0.12783958092331887, "num_tokens": 3237022.0, "step": 1755 }, { "entropy": 6.150622081756592, "epoch": 0.14786809493803824, "grad_norm": 1.078125, "learning_rate": 0.0004999540791597861, "loss": 6.1154, "mean_token_accuracy": 0.1278826355934143, "num_tokens": 3246605.0, "step": 1760 }, { "entropy": 6.106969261169434, "epoch": 0.1482881747532031, "grad_norm": 1.0859375, "learning_rate": 0.0004999534721702383, "loss": 6.065, "mean_token_accuracy": 0.13145939782261848, "num_tokens": 3255587.0, "step": 1765 }, { "entropy": 6.141215419769287, "epoch": 0.148708254568368, "grad_norm": 1.1171875, "learning_rate": 0.0004999528611957553, "loss": 6.1666, "mean_token_accuracy": 0.1264194056391716, "num_tokens": 3265669.0, "step": 1770 }, { "entropy": 6.255412673950195, "epoch": 0.14912833438353287, "grad_norm": 1.1484375, "learning_rate": 0.0004999522462363485, "loss": 6.1518, "mean_token_accuracy": 0.13116262778639792, "num_tokens": 3275013.0, "step": 1775 }, { "entropy": 6.1518439769744875, "epoch": 0.14954841419869774, "grad_norm": 1.015625, "learning_rate": 0.0004999516272920283, "loss": 6.255, "mean_token_accuracy": 0.1255984991788864, "num_tokens": 3284723.0, "step": 1780 }, { "entropy": 6.11306095123291, "epoch": 0.14996849401386264, "grad_norm": 1.046875, "learning_rate": 0.000499951004362806, "loss": 6.0833, "mean_token_accuracy": 0.12718486189842224, "num_tokens": 3293860.0, "step": 1785 }, { "entropy": 6.046157026290894, "epoch": 0.1503885738290275, "grad_norm": 1.09375, "learning_rate": 0.0004999503774486924, "loss": 6.1405, "mean_token_accuracy": 0.1226385310292244, "num_tokens": 3303158.0, "step": 1790 }, { "entropy": 6.138220262527466, "epoch": 0.1508086536441924, "grad_norm": 1.015625, "learning_rate": 0.0004999497465496987, "loss": 6.0637, "mean_token_accuracy": 0.12298208549618721, "num_tokens": 3313068.0, "step": 1795 }, { "entropy": 6.1492797374725345, "epoch": 0.15122873345935728, "grad_norm": 1.1015625, "learning_rate": 0.000499949111665836, "loss": 6.1591, "mean_token_accuracy": 0.12638431563973426, "num_tokens": 3321885.0, "step": 1800 }, { "entropy": 6.2002543926239015, "epoch": 0.15164881327452215, "grad_norm": 1.0546875, "learning_rate": 0.0004999484727971158, "loss": 6.1371, "mean_token_accuracy": 0.12693015187978746, "num_tokens": 3330924.0, "step": 1805 }, { "entropy": 6.14166145324707, "epoch": 0.15206889308968705, "grad_norm": 1.03125, "learning_rate": 0.000499947829943549, "loss": 6.176, "mean_token_accuracy": 0.12311631590127944, "num_tokens": 3340070.0, "step": 1810 }, { "entropy": 6.207650995254516, "epoch": 0.15248897290485192, "grad_norm": 1.0703125, "learning_rate": 0.0004999471831051474, "loss": 6.1689, "mean_token_accuracy": 0.13365908414125444, "num_tokens": 3349870.0, "step": 1815 }, { "entropy": 6.160492658615112, "epoch": 0.1529090527200168, "grad_norm": 1.03125, "learning_rate": 0.0004999465322819222, "loss": 6.2169, "mean_token_accuracy": 0.12209457084536553, "num_tokens": 3359573.0, "step": 1820 }, { "entropy": 6.222156381607055, "epoch": 0.15332913253518168, "grad_norm": 1.125, "learning_rate": 0.0004999458774738851, "loss": 6.1684, "mean_token_accuracy": 0.13460491448640824, "num_tokens": 3368577.0, "step": 1825 }, { "entropy": 6.116939735412598, "epoch": 0.15374921235034655, "grad_norm": 1.09375, "learning_rate": 0.0004999452186810476, "loss": 6.1162, "mean_token_accuracy": 0.13111207485198975, "num_tokens": 3377801.0, "step": 1830 }, { "entropy": 6.230794095993042, "epoch": 0.15416929216551145, "grad_norm": 1.21875, "learning_rate": 0.0004999445559034214, "loss": 6.1624, "mean_token_accuracy": 0.12796897292137147, "num_tokens": 3386666.0, "step": 1835 }, { "entropy": 6.314156770706177, "epoch": 0.15458937198067632, "grad_norm": 1.0625, "learning_rate": 0.0004999438891410181, "loss": 6.3117, "mean_token_accuracy": 0.12008170932531356, "num_tokens": 3396086.0, "step": 1840 }, { "entropy": 6.129251384735108, "epoch": 0.15500945179584122, "grad_norm": 1.0625, "learning_rate": 0.0004999432183938496, "loss": 6.2244, "mean_token_accuracy": 0.13115857616066934, "num_tokens": 3404894.0, "step": 1845 }, { "entropy": 6.144184207916259, "epoch": 0.1554295316110061, "grad_norm": 1.0703125, "learning_rate": 0.0004999425436619279, "loss": 6.2119, "mean_token_accuracy": 0.12016721740365029, "num_tokens": 3414172.0, "step": 1850 }, { "entropy": 6.27680606842041, "epoch": 0.15584961142617096, "grad_norm": 0.98046875, "learning_rate": 0.000499941864945265, "loss": 6.1711, "mean_token_accuracy": 0.12068818733096123, "num_tokens": 3423409.0, "step": 1855 }, { "entropy": 6.115350151062012, "epoch": 0.15626969124133586, "grad_norm": 1.046875, "learning_rate": 0.0004999411822438726, "loss": 6.1448, "mean_token_accuracy": 0.12664692029356955, "num_tokens": 3433047.0, "step": 1860 }, { "entropy": 6.1765364646911625, "epoch": 0.15668977105650073, "grad_norm": 1.1953125, "learning_rate": 0.000499940495557763, "loss": 6.1229, "mean_token_accuracy": 0.12646402716636657, "num_tokens": 3442490.0, "step": 1865 }, { "entropy": 6.212595748901367, "epoch": 0.15710985087166562, "grad_norm": 1.09375, "learning_rate": 0.0004999398048869485, "loss": 6.1955, "mean_token_accuracy": 0.12634197920560836, "num_tokens": 3451804.0, "step": 1870 }, { "entropy": 6.19734468460083, "epoch": 0.1575299306868305, "grad_norm": 1.0625, "learning_rate": 0.000499939110231441, "loss": 6.1767, "mean_token_accuracy": 0.1279009036719799, "num_tokens": 3461481.0, "step": 1875 }, { "entropy": 6.233086156845093, "epoch": 0.1579500105019954, "grad_norm": 1.171875, "learning_rate": 0.0004999384115912531, "loss": 6.2344, "mean_token_accuracy": 0.12624539732933043, "num_tokens": 3471798.0, "step": 1880 }, { "entropy": 6.015159845352173, "epoch": 0.15837009031716026, "grad_norm": 1.046875, "learning_rate": 0.000499937708966397, "loss": 6.1259, "mean_token_accuracy": 0.12552440091967582, "num_tokens": 3481386.0, "step": 1885 }, { "entropy": 6.193439149856568, "epoch": 0.15879017013232513, "grad_norm": 1.015625, "learning_rate": 0.0004999370023568853, "loss": 6.1184, "mean_token_accuracy": 0.12572901472449302, "num_tokens": 3489981.0, "step": 1890 }, { "entropy": 6.112298202514649, "epoch": 0.15921024994749003, "grad_norm": 1.1171875, "learning_rate": 0.0004999362917627304, "loss": 6.0882, "mean_token_accuracy": 0.1290791854262352, "num_tokens": 3498551.0, "step": 1895 }, { "entropy": 6.154214668273926, "epoch": 0.1596303297626549, "grad_norm": 1.1953125, "learning_rate": 0.0004999355771839448, "loss": 6.0678, "mean_token_accuracy": 0.13199156373739243, "num_tokens": 3507921.0, "step": 1900 }, { "entropy": 6.25932993888855, "epoch": 0.1600504095778198, "grad_norm": 1.2109375, "learning_rate": 0.0004999348586205414, "loss": 6.2391, "mean_token_accuracy": 0.12772736102342605, "num_tokens": 3517570.0, "step": 1905 }, { "entropy": 6.199037790298462, "epoch": 0.16047048939298467, "grad_norm": 1.1640625, "learning_rate": 0.0004999341360725327, "loss": 6.2333, "mean_token_accuracy": 0.12291170060634612, "num_tokens": 3526774.0, "step": 1910 }, { "entropy": 6.160675954818726, "epoch": 0.16089056920814954, "grad_norm": 1.125, "learning_rate": 0.0004999334095399317, "loss": 6.1636, "mean_token_accuracy": 0.13550060987472534, "num_tokens": 3535319.0, "step": 1915 }, { "entropy": 6.045867156982422, "epoch": 0.16131064902331443, "grad_norm": 1.109375, "learning_rate": 0.0004999326790227512, "loss": 6.1319, "mean_token_accuracy": 0.13111512288451194, "num_tokens": 3544468.0, "step": 1920 }, { "entropy": 6.046793222427368, "epoch": 0.1617307288384793, "grad_norm": 1.0546875, "learning_rate": 0.0004999319445210041, "loss": 6.0122, "mean_token_accuracy": 0.13604443669319152, "num_tokens": 3553529.0, "step": 1925 }, { "entropy": 6.111843824386597, "epoch": 0.1621508086536442, "grad_norm": 1.1484375, "learning_rate": 0.0004999312060347034, "loss": 6.0683, "mean_token_accuracy": 0.12906411960721015, "num_tokens": 3563053.0, "step": 1930 }, { "entropy": 6.050889587402343, "epoch": 0.16257088846880907, "grad_norm": 1.0390625, "learning_rate": 0.0004999304635638621, "loss": 6.0231, "mean_token_accuracy": 0.13277052119374275, "num_tokens": 3571877.0, "step": 1935 }, { "entropy": 6.067962026596069, "epoch": 0.16299096828397394, "grad_norm": 1.0234375, "learning_rate": 0.0004999297171084935, "loss": 6.0714, "mean_token_accuracy": 0.1285587027668953, "num_tokens": 3581496.0, "step": 1940 }, { "entropy": 6.136378002166748, "epoch": 0.16341104809913884, "grad_norm": 1.0859375, "learning_rate": 0.0004999289666686109, "loss": 6.0886, "mean_token_accuracy": 0.12886847257614137, "num_tokens": 3590752.0, "step": 1945 }, { "entropy": 6.010327911376953, "epoch": 0.1638311279143037, "grad_norm": 1.015625, "learning_rate": 0.0004999282122442274, "loss": 6.0834, "mean_token_accuracy": 0.13124383464455605, "num_tokens": 3599885.0, "step": 1950 }, { "entropy": 6.264139842987061, "epoch": 0.1642512077294686, "grad_norm": 1.03125, "learning_rate": 0.0004999274538353564, "loss": 6.1701, "mean_token_accuracy": 0.12521142810583114, "num_tokens": 3610039.0, "step": 1955 }, { "entropy": 6.073106908798218, "epoch": 0.16467128754463348, "grad_norm": 1.1484375, "learning_rate": 0.0004999266914420114, "loss": 6.1051, "mean_token_accuracy": 0.12478168606758118, "num_tokens": 3619954.0, "step": 1960 }, { "entropy": 6.187624120712281, "epoch": 0.16509136735979837, "grad_norm": 1.125, "learning_rate": 0.000499925925064206, "loss": 6.0438, "mean_token_accuracy": 0.13393656834959983, "num_tokens": 3628164.0, "step": 1965 }, { "entropy": 6.205664491653442, "epoch": 0.16551144717496324, "grad_norm": 1.125, "learning_rate": 0.0004999251547019535, "loss": 6.1949, "mean_token_accuracy": 0.13081590309739113, "num_tokens": 3636778.0, "step": 1970 }, { "entropy": 6.189227771759033, "epoch": 0.16593152699012811, "grad_norm": 1.046875, "learning_rate": 0.0004999243803552678, "loss": 6.1562, "mean_token_accuracy": 0.1314916841685772, "num_tokens": 3647046.0, "step": 1975 }, { "entropy": 6.080912733078003, "epoch": 0.166351606805293, "grad_norm": 1.1328125, "learning_rate": 0.0004999236020241625, "loss": 6.0778, "mean_token_accuracy": 0.12676682993769645, "num_tokens": 3656130.0, "step": 1980 }, { "entropy": 6.122231149673462, "epoch": 0.16677168662045788, "grad_norm": 1.1015625, "learning_rate": 0.0004999228197086514, "loss": 6.1524, "mean_token_accuracy": 0.12139208093285561, "num_tokens": 3666145.0, "step": 1985 }, { "entropy": 6.180514907836914, "epoch": 0.16719176643562278, "grad_norm": 0.95703125, "learning_rate": 0.0004999220334087484, "loss": 6.1973, "mean_token_accuracy": 0.12297596782445908, "num_tokens": 3676722.0, "step": 1990 }, { "entropy": 6.160478258132935, "epoch": 0.16761184625078765, "grad_norm": 1.0703125, "learning_rate": 0.0004999212431244673, "loss": 6.1951, "mean_token_accuracy": 0.12390567734837532, "num_tokens": 3685880.0, "step": 1995 }, { "entropy": 6.056979942321777, "epoch": 0.16803192606595252, "grad_norm": 1.0703125, "learning_rate": 0.0004999204488558222, "loss": 6.0309, "mean_token_accuracy": 0.13338653817772866, "num_tokens": 3695167.0, "step": 2000 }, { "entropy": 6.132491636276245, "epoch": 0.16845200588111742, "grad_norm": 1.1171875, "learning_rate": 0.0004999196506028273, "loss": 6.1254, "mean_token_accuracy": 0.12816390842199327, "num_tokens": 3703700.0, "step": 2005 }, { "entropy": 6.12127537727356, "epoch": 0.1688720856962823, "grad_norm": 1.1328125, "learning_rate": 0.0004999188483654965, "loss": 6.0676, "mean_token_accuracy": 0.13127523884177209, "num_tokens": 3712825.0, "step": 2010 }, { "entropy": 6.0288161277771, "epoch": 0.16929216551144718, "grad_norm": 1.0390625, "learning_rate": 0.0004999180421438442, "loss": 6.0433, "mean_token_accuracy": 0.13027536049485205, "num_tokens": 3721807.0, "step": 2015 }, { "entropy": 6.227120208740234, "epoch": 0.16971224532661205, "grad_norm": 1.1640625, "learning_rate": 0.0004999172319378846, "loss": 6.215, "mean_token_accuracy": 0.12012537494301796, "num_tokens": 3730502.0, "step": 2020 }, { "entropy": 6.142453145980835, "epoch": 0.17013232514177692, "grad_norm": 1.078125, "learning_rate": 0.0004999164177476319, "loss": 6.1007, "mean_token_accuracy": 0.13319698646664618, "num_tokens": 3739696.0, "step": 2025 }, { "entropy": 5.983246898651123, "epoch": 0.17055240495694182, "grad_norm": 1.1328125, "learning_rate": 0.0004999155995731009, "loss": 6.0931, "mean_token_accuracy": 0.12937767952680587, "num_tokens": 3748675.0, "step": 2030 }, { "entropy": 6.279529285430908, "epoch": 0.1709724847721067, "grad_norm": 1.078125, "learning_rate": 0.0004999147774143057, "loss": 6.1818, "mean_token_accuracy": 0.12119651660323143, "num_tokens": 3757714.0, "step": 2035 }, { "entropy": 6.021195363998413, "epoch": 0.1713925645872716, "grad_norm": 1.0859375, "learning_rate": 0.000499913951271261, "loss": 5.9995, "mean_token_accuracy": 0.13538317754864693, "num_tokens": 3767589.0, "step": 2040 }, { "entropy": 6.069708347320557, "epoch": 0.17181264440243646, "grad_norm": 1.25, "learning_rate": 0.0004999131211439816, "loss": 6.1103, "mean_token_accuracy": 0.1307745970785618, "num_tokens": 3777261.0, "step": 2045 }, { "entropy": 6.199843311309815, "epoch": 0.17223272421760136, "grad_norm": 1.1015625, "learning_rate": 0.000499912287032482, "loss": 6.0616, "mean_token_accuracy": 0.138626891374588, "num_tokens": 3786658.0, "step": 2050 }, { "entropy": 5.944658851623535, "epoch": 0.17265280403276623, "grad_norm": 1.1484375, "learning_rate": 0.000499911448936777, "loss": 6.0687, "mean_token_accuracy": 0.13718581050634385, "num_tokens": 3794977.0, "step": 2055 }, { "entropy": 6.036355209350586, "epoch": 0.1730728838479311, "grad_norm": 1.0234375, "learning_rate": 0.0004999106068568816, "loss": 6.1519, "mean_token_accuracy": 0.12767373770475388, "num_tokens": 3805138.0, "step": 2060 }, { "entropy": 6.216224002838135, "epoch": 0.173492963663096, "grad_norm": 1.1484375, "learning_rate": 0.0004999097607928106, "loss": 6.0775, "mean_token_accuracy": 0.13682154789566994, "num_tokens": 3814444.0, "step": 2065 }, { "entropy": 6.071376657485962, "epoch": 0.17391304347826086, "grad_norm": 1.09375, "learning_rate": 0.0004999089107445788, "loss": 6.0367, "mean_token_accuracy": 0.12884577140212058, "num_tokens": 3822859.0, "step": 2070 }, { "entropy": 6.008500671386718, "epoch": 0.17433312329342576, "grad_norm": 1.0234375, "learning_rate": 0.0004999080567122016, "loss": 6.0618, "mean_token_accuracy": 0.13177087977528573, "num_tokens": 3833159.0, "step": 2075 }, { "entropy": 6.108531808853149, "epoch": 0.17475320310859063, "grad_norm": 1.109375, "learning_rate": 0.0004999071986956941, "loss": 6.0712, "mean_token_accuracy": 0.132802564650774, "num_tokens": 3842136.0, "step": 2080 }, { "entropy": 6.1049566745758055, "epoch": 0.1751732829237555, "grad_norm": 1.0859375, "learning_rate": 0.0004999063366950713, "loss": 6.1334, "mean_token_accuracy": 0.12718930542469026, "num_tokens": 3851406.0, "step": 2085 }, { "entropy": 6.059695482254028, "epoch": 0.1755933627389204, "grad_norm": 1.0859375, "learning_rate": 0.0004999054707103486, "loss": 6.0576, "mean_token_accuracy": 0.13101131469011307, "num_tokens": 3861061.0, "step": 2090 }, { "entropy": 6.151092004776001, "epoch": 0.17601344255408527, "grad_norm": 1.140625, "learning_rate": 0.0004999046007415412, "loss": 6.0408, "mean_token_accuracy": 0.12858548611402512, "num_tokens": 3870357.0, "step": 2095 }, { "entropy": 6.117391204833984, "epoch": 0.17643352236925017, "grad_norm": 1.078125, "learning_rate": 0.0004999037267886646, "loss": 6.0479, "mean_token_accuracy": 0.1312727876007557, "num_tokens": 3879393.0, "step": 2100 }, { "entropy": 6.020799350738526, "epoch": 0.17685360218441504, "grad_norm": 1.1328125, "learning_rate": 0.0004999028488517343, "loss": 6.0773, "mean_token_accuracy": 0.1310334399342537, "num_tokens": 3888030.0, "step": 2105 }, { "entropy": 6.100592184066772, "epoch": 0.1772736819995799, "grad_norm": 1.1796875, "learning_rate": 0.0004999019669307659, "loss": 6.0848, "mean_token_accuracy": 0.13095394000411034, "num_tokens": 3897430.0, "step": 2110 }, { "entropy": 6.128114986419678, "epoch": 0.1776937618147448, "grad_norm": 1.0, "learning_rate": 0.0004999010810257749, "loss": 6.1085, "mean_token_accuracy": 0.1252062700688839, "num_tokens": 3907711.0, "step": 2115 }, { "entropy": 6.000101137161255, "epoch": 0.17811384162990967, "grad_norm": 1.078125, "learning_rate": 0.0004999001911367771, "loss": 6.0329, "mean_token_accuracy": 0.13599146604537965, "num_tokens": 3915816.0, "step": 2120 }, { "entropy": 6.115037250518799, "epoch": 0.17853392144507457, "grad_norm": 1.0546875, "learning_rate": 0.0004998992972637883, "loss": 6.1562, "mean_token_accuracy": 0.12152478694915772, "num_tokens": 3925162.0, "step": 2125 }, { "entropy": 6.161215543746948, "epoch": 0.17895400126023944, "grad_norm": 1.03125, "learning_rate": 0.0004998983994068242, "loss": 6.0325, "mean_token_accuracy": 0.13381896317005157, "num_tokens": 3934476.0, "step": 2130 }, { "entropy": 6.025839996337891, "epoch": 0.17937408107540434, "grad_norm": 1.03125, "learning_rate": 0.0004998974975659006, "loss": 6.0854, "mean_token_accuracy": 0.1257789060473442, "num_tokens": 3943501.0, "step": 2135 }, { "entropy": 6.134726524353027, "epoch": 0.1797941608905692, "grad_norm": 1.0625, "learning_rate": 0.0004998965917410338, "loss": 6.0793, "mean_token_accuracy": 0.12519558221101762, "num_tokens": 3953663.0, "step": 2140 }, { "entropy": 6.026215839385986, "epoch": 0.18021424070573408, "grad_norm": 1.2421875, "learning_rate": 0.0004998956819322397, "loss": 6.0415, "mean_token_accuracy": 0.1311854176223278, "num_tokens": 3962634.0, "step": 2145 }, { "entropy": 6.095232534408569, "epoch": 0.18063432052089898, "grad_norm": 1.0546875, "learning_rate": 0.0004998947681395343, "loss": 6.0476, "mean_token_accuracy": 0.13477237224578859, "num_tokens": 3972496.0, "step": 2150 }, { "entropy": 6.216877603530884, "epoch": 0.18105440033606385, "grad_norm": 1.09375, "learning_rate": 0.000499893850362934, "loss": 6.294, "mean_token_accuracy": 0.12388142496347428, "num_tokens": 3980724.0, "step": 2155 }, { "entropy": 6.148508739471436, "epoch": 0.18147448015122875, "grad_norm": 1.125, "learning_rate": 0.0004998929286024548, "loss": 6.1193, "mean_token_accuracy": 0.12675963416695596, "num_tokens": 3989842.0, "step": 2160 }, { "entropy": 6.089916467666626, "epoch": 0.18189455996639362, "grad_norm": 1.171875, "learning_rate": 0.0004998920028581133, "loss": 6.0396, "mean_token_accuracy": 0.13960086852312087, "num_tokens": 3998534.0, "step": 2165 }, { "entropy": 6.071965646743775, "epoch": 0.18231463978155849, "grad_norm": 1.046875, "learning_rate": 0.0004998910731299258, "loss": 6.066, "mean_token_accuracy": 0.12839170619845391, "num_tokens": 4007677.0, "step": 2170 }, { "entropy": 6.075281476974487, "epoch": 0.18273471959672338, "grad_norm": 1.09375, "learning_rate": 0.0004998901394179085, "loss": 6.1252, "mean_token_accuracy": 0.1271742030978203, "num_tokens": 4016347.0, "step": 2175 }, { "entropy": 6.099024391174316, "epoch": 0.18315479941188825, "grad_norm": 1.125, "learning_rate": 0.0004998892017220784, "loss": 5.981, "mean_token_accuracy": 0.13410005643963813, "num_tokens": 4025199.0, "step": 2180 }, { "entropy": 6.039233541488647, "epoch": 0.18357487922705315, "grad_norm": 1.21875, "learning_rate": 0.0004998882600424519, "loss": 6.0482, "mean_token_accuracy": 0.12497256994247437, "num_tokens": 4033933.0, "step": 2185 }, { "entropy": 6.077206420898437, "epoch": 0.18399495904221802, "grad_norm": 1.2109375, "learning_rate": 0.0004998873143790455, "loss": 5.9828, "mean_token_accuracy": 0.14159614518284797, "num_tokens": 4042891.0, "step": 2190 }, { "entropy": 6.073574495315552, "epoch": 0.1844150388573829, "grad_norm": 1.1171875, "learning_rate": 0.0004998863647318763, "loss": 6.0991, "mean_token_accuracy": 0.12677306681871414, "num_tokens": 4051123.0, "step": 2195 }, { "entropy": 6.040490913391113, "epoch": 0.1848351186725478, "grad_norm": 1.1796875, "learning_rate": 0.0004998854111009608, "loss": 6.0708, "mean_token_accuracy": 0.12666793614625932, "num_tokens": 4060025.0, "step": 2200 }, { "entropy": 6.00008749961853, "epoch": 0.18525519848771266, "grad_norm": 0.98046875, "learning_rate": 0.0004998844534863161, "loss": 5.9771, "mean_token_accuracy": 0.12744748294353486, "num_tokens": 4069363.0, "step": 2205 }, { "entropy": 6.136739826202392, "epoch": 0.18567527830287756, "grad_norm": 1.0546875, "learning_rate": 0.0004998834918879592, "loss": 6.1326, "mean_token_accuracy": 0.1333842933177948, "num_tokens": 4078855.0, "step": 2210 }, { "entropy": 6.1249613761901855, "epoch": 0.18609535811804243, "grad_norm": 0.9921875, "learning_rate": 0.000499882526305907, "loss": 6.0922, "mean_token_accuracy": 0.1307423233985901, "num_tokens": 4087801.0, "step": 2215 }, { "entropy": 6.088332986831665, "epoch": 0.18651543793320732, "grad_norm": 1.015625, "learning_rate": 0.0004998815567401765, "loss": 6.1042, "mean_token_accuracy": 0.1286585159599781, "num_tokens": 4096949.0, "step": 2220 }, { "entropy": 6.1469251155853275, "epoch": 0.1869355177483722, "grad_norm": 1.09375, "learning_rate": 0.0004998805831907851, "loss": 6.0481, "mean_token_accuracy": 0.13261425495147705, "num_tokens": 4105399.0, "step": 2225 }, { "entropy": 6.030412244796753, "epoch": 0.18735559756353706, "grad_norm": 1.125, "learning_rate": 0.0004998796056577501, "loss": 6.0131, "mean_token_accuracy": 0.12757696062326432, "num_tokens": 4113873.0, "step": 2230 }, { "entropy": 6.031377696990967, "epoch": 0.18777567737870196, "grad_norm": 1.015625, "learning_rate": 0.0004998786241410886, "loss": 6.0712, "mean_token_accuracy": 0.13365804105997087, "num_tokens": 4123528.0, "step": 2235 }, { "entropy": 6.171572303771972, "epoch": 0.18819575719386683, "grad_norm": 1.03125, "learning_rate": 0.000499877638640818, "loss": 6.0658, "mean_token_accuracy": 0.1285923771560192, "num_tokens": 4133370.0, "step": 2240 }, { "entropy": 5.986340188980103, "epoch": 0.18861583700903173, "grad_norm": 1.0234375, "learning_rate": 0.000499876649156956, "loss": 5.9815, "mean_token_accuracy": 0.13429070338606836, "num_tokens": 4142370.0, "step": 2245 }, { "entropy": 6.028431034088134, "epoch": 0.1890359168241966, "grad_norm": 1.078125, "learning_rate": 0.0004998756556895196, "loss": 6.0667, "mean_token_accuracy": 0.13125480934977532, "num_tokens": 4152367.0, "step": 2250 }, { "entropy": 6.113050174713135, "epoch": 0.18945599663936147, "grad_norm": 1.1328125, "learning_rate": 0.000499874658238527, "loss": 6.0648, "mean_token_accuracy": 0.1346374697983265, "num_tokens": 4161126.0, "step": 2255 }, { "entropy": 6.028095388412476, "epoch": 0.18987607645452637, "grad_norm": 1.1328125, "learning_rate": 0.0004998736568039957, "loss": 5.96, "mean_token_accuracy": 0.13207067623734475, "num_tokens": 4169910.0, "step": 2260 }, { "entropy": 6.0694433689117435, "epoch": 0.19029615626969124, "grad_norm": 1.125, "learning_rate": 0.0004998726513859432, "loss": 6.1159, "mean_token_accuracy": 0.12696689143776893, "num_tokens": 4179893.0, "step": 2265 }, { "entropy": 6.183452177047729, "epoch": 0.19071623608485613, "grad_norm": 1.0078125, "learning_rate": 0.0004998716419843875, "loss": 6.1192, "mean_token_accuracy": 0.1379365175962448, "num_tokens": 4190065.0, "step": 2270 }, { "entropy": 5.935803985595703, "epoch": 0.191136315900021, "grad_norm": 1.1640625, "learning_rate": 0.0004998706285993465, "loss": 6.0341, "mean_token_accuracy": 0.13346357494592667, "num_tokens": 4198395.0, "step": 2275 }, { "entropy": 6.13513503074646, "epoch": 0.19155639571518587, "grad_norm": 1.140625, "learning_rate": 0.0004998696112308381, "loss": 6.066, "mean_token_accuracy": 0.1285228006541729, "num_tokens": 4207555.0, "step": 2280 }, { "entropy": 5.988098859786987, "epoch": 0.19197647553035077, "grad_norm": 1.0703125, "learning_rate": 0.0004998685898788803, "loss": 6.0031, "mean_token_accuracy": 0.13245714083313942, "num_tokens": 4216533.0, "step": 2285 }, { "entropy": 6.156089878082275, "epoch": 0.19239655534551564, "grad_norm": 1.2109375, "learning_rate": 0.0004998675645434914, "loss": 6.1082, "mean_token_accuracy": 0.13477368876338006, "num_tokens": 4225575.0, "step": 2290 }, { "entropy": 5.991475677490234, "epoch": 0.19281663516068054, "grad_norm": 1.125, "learning_rate": 0.0004998665352246891, "loss": 5.8834, "mean_token_accuracy": 0.1395990490913391, "num_tokens": 4234306.0, "step": 2295 }, { "entropy": 5.991662073135376, "epoch": 0.1932367149758454, "grad_norm": 1.0625, "learning_rate": 0.0004998655019224921, "loss": 6.0833, "mean_token_accuracy": 0.13282228037714958, "num_tokens": 4243998.0, "step": 2300 }, { "entropy": 6.109613370895386, "epoch": 0.19365679479101028, "grad_norm": 1.1171875, "learning_rate": 0.0004998644646369185, "loss": 5.9798, "mean_token_accuracy": 0.1297495998442173, "num_tokens": 4253653.0, "step": 2305 }, { "entropy": 5.980562829971314, "epoch": 0.19407687460617518, "grad_norm": 1.1171875, "learning_rate": 0.0004998634233679865, "loss": 6.0677, "mean_token_accuracy": 0.12498517185449601, "num_tokens": 4263305.0, "step": 2310 }, { "entropy": 6.036490631103516, "epoch": 0.19449695442134005, "grad_norm": 1.1640625, "learning_rate": 0.000499862378115715, "loss": 5.942, "mean_token_accuracy": 0.13776571899652482, "num_tokens": 4272212.0, "step": 2315 }, { "entropy": 6.152922439575195, "epoch": 0.19491703423650494, "grad_norm": 1.1796875, "learning_rate": 0.0004998613288801221, "loss": 6.1425, "mean_token_accuracy": 0.12971725761890412, "num_tokens": 4281445.0, "step": 2320 }, { "entropy": 6.123716592788696, "epoch": 0.1953371140516698, "grad_norm": 1.0625, "learning_rate": 0.0004998602756612267, "loss": 6.0573, "mean_token_accuracy": 0.13039504289627074, "num_tokens": 4290938.0, "step": 2325 }, { "entropy": 6.004360866546631, "epoch": 0.1957571938668347, "grad_norm": 1.1015625, "learning_rate": 0.0004998592184590471, "loss": 6.0764, "mean_token_accuracy": 0.13114980682730676, "num_tokens": 4300022.0, "step": 2330 }, { "entropy": 5.995278835296631, "epoch": 0.19617727368199958, "grad_norm": 1.203125, "learning_rate": 0.0004998581572736024, "loss": 5.9693, "mean_token_accuracy": 0.1386754259467125, "num_tokens": 4308910.0, "step": 2335 }, { "entropy": 5.990830326080323, "epoch": 0.19659735349716445, "grad_norm": 1.109375, "learning_rate": 0.0004998570921049112, "loss": 5.964, "mean_token_accuracy": 0.13531014919281006, "num_tokens": 4317136.0, "step": 2340 }, { "entropy": 6.019982814788818, "epoch": 0.19701743331232935, "grad_norm": 1.125, "learning_rate": 0.0004998560229529924, "loss": 6.0043, "mean_token_accuracy": 0.13840724304318427, "num_tokens": 4326163.0, "step": 2345 }, { "entropy": 6.181583261489868, "epoch": 0.19743751312749422, "grad_norm": 1.09375, "learning_rate": 0.0004998549498178649, "loss": 6.1515, "mean_token_accuracy": 0.13239141255617143, "num_tokens": 4335837.0, "step": 2350 }, { "entropy": 6.1028111457824705, "epoch": 0.19785759294265912, "grad_norm": 1.203125, "learning_rate": 0.0004998538726995477, "loss": 6.0502, "mean_token_accuracy": 0.13465801179409026, "num_tokens": 4345108.0, "step": 2355 }, { "entropy": 6.11204948425293, "epoch": 0.198277672757824, "grad_norm": 1.078125, "learning_rate": 0.00049985279159806, "loss": 6.0904, "mean_token_accuracy": 0.13041200041770934, "num_tokens": 4353761.0, "step": 2360 }, { "entropy": 6.025763607025146, "epoch": 0.19869775257298886, "grad_norm": 1.1796875, "learning_rate": 0.0004998517065134208, "loss": 6.0492, "mean_token_accuracy": 0.1321948856115341, "num_tokens": 4363244.0, "step": 2365 }, { "entropy": 6.045213079452514, "epoch": 0.19911783238815375, "grad_norm": 1.1015625, "learning_rate": 0.0004998506174456494, "loss": 6.0414, "mean_token_accuracy": 0.1313652828335762, "num_tokens": 4373034.0, "step": 2370 }, { "entropy": 6.018647909164429, "epoch": 0.19953791220331862, "grad_norm": 1.078125, "learning_rate": 0.0004998495243947653, "loss": 5.9888, "mean_token_accuracy": 0.12499892637133599, "num_tokens": 4382554.0, "step": 2375 }, { "entropy": 6.050378942489624, "epoch": 0.19995799201848352, "grad_norm": 1.15625, "learning_rate": 0.0004998484273607875, "loss": 6.0109, "mean_token_accuracy": 0.13378100991249084, "num_tokens": 4391001.0, "step": 2380 }, { "entropy": 5.911934089660645, "epoch": 0.2003780718336484, "grad_norm": 1.078125, "learning_rate": 0.0004998473263437356, "loss": 5.9179, "mean_token_accuracy": 0.13600271940231323, "num_tokens": 4400632.0, "step": 2385 }, { "entropy": 6.02356653213501, "epoch": 0.20079815164881326, "grad_norm": 1.15625, "learning_rate": 0.000499846221343629, "loss": 6.0212, "mean_token_accuracy": 0.12660589516162873, "num_tokens": 4409565.0, "step": 2390 }, { "entropy": 6.01599760055542, "epoch": 0.20121823146397816, "grad_norm": 1.1328125, "learning_rate": 0.0004998451123604875, "loss": 5.9683, "mean_token_accuracy": 0.13648212924599648, "num_tokens": 4418384.0, "step": 2395 }, { "entropy": 6.052209377288818, "epoch": 0.20163831127914303, "grad_norm": 1.265625, "learning_rate": 0.0004998439993943306, "loss": 6.0768, "mean_token_accuracy": 0.13455061092972756, "num_tokens": 4427581.0, "step": 2400 }, { "entropy": 6.126885080337525, "epoch": 0.20205839109430793, "grad_norm": 1.1796875, "learning_rate": 0.0004998428824451779, "loss": 6.0655, "mean_token_accuracy": 0.12827305421233176, "num_tokens": 4436572.0, "step": 2405 }, { "entropy": 6.031488513946533, "epoch": 0.2024784709094728, "grad_norm": 1.1953125, "learning_rate": 0.0004998417615130495, "loss": 6.0686, "mean_token_accuracy": 0.13068403899669648, "num_tokens": 4445230.0, "step": 2410 }, { "entropy": 6.176716995239258, "epoch": 0.2028985507246377, "grad_norm": 1.1875, "learning_rate": 0.0004998406365979649, "loss": 6.1411, "mean_token_accuracy": 0.13211808502674102, "num_tokens": 4454251.0, "step": 2415 }, { "entropy": 6.0446735382080075, "epoch": 0.20331863053980256, "grad_norm": 1.078125, "learning_rate": 0.0004998395076999443, "loss": 5.9835, "mean_token_accuracy": 0.13375458046793937, "num_tokens": 4463949.0, "step": 2420 }, { "entropy": 6.100032329559326, "epoch": 0.20373871035496743, "grad_norm": 1.09375, "learning_rate": 0.0004998383748190076, "loss": 6.1638, "mean_token_accuracy": 0.12677136287093163, "num_tokens": 4473373.0, "step": 2425 }, { "entropy": 6.1406354904174805, "epoch": 0.20415879017013233, "grad_norm": 2.515625, "learning_rate": 0.0004998372379551748, "loss": 6.0192, "mean_token_accuracy": 0.1297541990876198, "num_tokens": 4482303.0, "step": 2430 }, { "entropy": 5.960326719284057, "epoch": 0.2045788699852972, "grad_norm": 1.078125, "learning_rate": 0.0004998360971084663, "loss": 5.9691, "mean_token_accuracy": 0.13114270120859145, "num_tokens": 4491214.0, "step": 2435 }, { "entropy": 5.993530750274658, "epoch": 0.2049989498004621, "grad_norm": 1.15625, "learning_rate": 0.0004998349522789019, "loss": 5.8971, "mean_token_accuracy": 0.13634659722447395, "num_tokens": 4500099.0, "step": 2440 }, { "entropy": 5.949355411529541, "epoch": 0.20541902961562697, "grad_norm": 1.3359375, "learning_rate": 0.0004998338034665021, "loss": 5.9773, "mean_token_accuracy": 0.13802963197231294, "num_tokens": 4509893.0, "step": 2445 }, { "entropy": 6.010091781616211, "epoch": 0.20583910943079184, "grad_norm": 1.109375, "learning_rate": 0.0004998326506712872, "loss": 5.9481, "mean_token_accuracy": 0.1345847800374031, "num_tokens": 4518606.0, "step": 2450 }, { "entropy": 6.054938316345215, "epoch": 0.20625918924595674, "grad_norm": 1.1484375, "learning_rate": 0.0004998314938932778, "loss": 6.0368, "mean_token_accuracy": 0.13336761966347693, "num_tokens": 4528392.0, "step": 2455 }, { "entropy": 6.06166090965271, "epoch": 0.2066792690611216, "grad_norm": 1.0859375, "learning_rate": 0.0004998303331324943, "loss": 6.0068, "mean_token_accuracy": 0.13653545677661896, "num_tokens": 4536983.0, "step": 2460 }, { "entropy": 5.900071907043457, "epoch": 0.2070993488762865, "grad_norm": 1.125, "learning_rate": 0.0004998291683889571, "loss": 5.9105, "mean_token_accuracy": 0.14012753888964652, "num_tokens": 4544967.0, "step": 2465 }, { "entropy": 6.0055500030517575, "epoch": 0.20751942869145137, "grad_norm": 1.1875, "learning_rate": 0.000499827999662687, "loss": 5.9915, "mean_token_accuracy": 0.1313713811337948, "num_tokens": 4554646.0, "step": 2470 }, { "entropy": 6.090430212020874, "epoch": 0.20793950850661624, "grad_norm": 1.109375, "learning_rate": 0.0004998268269537046, "loss": 6.0166, "mean_token_accuracy": 0.13576155975461007, "num_tokens": 4564040.0, "step": 2475 }, { "entropy": 6.014799499511719, "epoch": 0.20835958832178114, "grad_norm": 1.1171875, "learning_rate": 0.0004998256502620308, "loss": 6.0293, "mean_token_accuracy": 0.13742104843258857, "num_tokens": 4573758.0, "step": 2480 }, { "entropy": 6.133381319046021, "epoch": 0.208779668136946, "grad_norm": 1.09375, "learning_rate": 0.0004998244695876864, "loss": 6.0616, "mean_token_accuracy": 0.13331351354718207, "num_tokens": 4582097.0, "step": 2485 }, { "entropy": 5.951827907562256, "epoch": 0.2091997479521109, "grad_norm": 1.2421875, "learning_rate": 0.0004998232849306921, "loss": 6.0184, "mean_token_accuracy": 0.13623687997460365, "num_tokens": 4590687.0, "step": 2490 }, { "entropy": 6.113849639892578, "epoch": 0.20961982776727578, "grad_norm": 1.1640625, "learning_rate": 0.0004998220962910693, "loss": 6.0063, "mean_token_accuracy": 0.1295908585190773, "num_tokens": 4599497.0, "step": 2495 }, { "entropy": 6.004902267456055, "epoch": 0.21003990758244068, "grad_norm": 1.2109375, "learning_rate": 0.0004998209036688386, "loss": 5.9761, "mean_token_accuracy": 0.13407944440841674, "num_tokens": 4607958.0, "step": 2500 }, { "entropy": 6.0813335418701175, "epoch": 0.21045998739760555, "grad_norm": 1.1015625, "learning_rate": 0.0004998197070640216, "loss": 6.1175, "mean_token_accuracy": 0.1263462521135807, "num_tokens": 4617515.0, "step": 2505 }, { "entropy": 6.137440729141235, "epoch": 0.21088006721277042, "grad_norm": 1.0625, "learning_rate": 0.0004998185064766391, "loss": 5.9907, "mean_token_accuracy": 0.1353794366121292, "num_tokens": 4627037.0, "step": 2510 }, { "entropy": 5.9435793399810795, "epoch": 0.21130014702793531, "grad_norm": 1.109375, "learning_rate": 0.0004998173019067127, "loss": 5.9898, "mean_token_accuracy": 0.13374351039528848, "num_tokens": 4637393.0, "step": 2515 }, { "entropy": 5.990574741363526, "epoch": 0.21172022684310018, "grad_norm": 1.1484375, "learning_rate": 0.0004998160933542633, "loss": 6.0354, "mean_token_accuracy": 0.1225339263677597, "num_tokens": 4646832.0, "step": 2520 }, { "entropy": 6.078363418579102, "epoch": 0.21214030665826508, "grad_norm": 1.203125, "learning_rate": 0.0004998148808193128, "loss": 6.0571, "mean_token_accuracy": 0.1361754283308983, "num_tokens": 4655719.0, "step": 2525 }, { "entropy": 6.027440595626831, "epoch": 0.21256038647342995, "grad_norm": 1.203125, "learning_rate": 0.0004998136643018823, "loss": 6.0247, "mean_token_accuracy": 0.13285491690039636, "num_tokens": 4665364.0, "step": 2530 }, { "entropy": 6.0071735858917235, "epoch": 0.21298046628859482, "grad_norm": 1.1484375, "learning_rate": 0.0004998124438019935, "loss": 5.9795, "mean_token_accuracy": 0.13230021819472312, "num_tokens": 4674760.0, "step": 2535 }, { "entropy": 5.971972846984864, "epoch": 0.21340054610375972, "grad_norm": 1.09375, "learning_rate": 0.0004998112193196681, "loss": 5.9064, "mean_token_accuracy": 0.1363543339073658, "num_tokens": 4683900.0, "step": 2540 }, { "entropy": 5.9554856300354, "epoch": 0.2138206259189246, "grad_norm": 1.203125, "learning_rate": 0.0004998099908549277, "loss": 5.9628, "mean_token_accuracy": 0.12749610617756843, "num_tokens": 4693915.0, "step": 2545 }, { "entropy": 5.917505264282227, "epoch": 0.2142407057340895, "grad_norm": 1.109375, "learning_rate": 0.000499808758407794, "loss": 5.8105, "mean_token_accuracy": 0.14476394206285476, "num_tokens": 4703102.0, "step": 2550 }, { "entropy": 6.020478820800781, "epoch": 0.21466078554925436, "grad_norm": 1.1328125, "learning_rate": 0.0004998075219782889, "loss": 6.0403, "mean_token_accuracy": 0.13109349682927132, "num_tokens": 4712925.0, "step": 2555 }, { "entropy": 6.050074434280395, "epoch": 0.21508086536441923, "grad_norm": 1.1796875, "learning_rate": 0.0004998062815664344, "loss": 5.9753, "mean_token_accuracy": 0.13155975714325904, "num_tokens": 4722641.0, "step": 2560 }, { "entropy": 5.953602123260498, "epoch": 0.21550094517958412, "grad_norm": 1.140625, "learning_rate": 0.0004998050371722524, "loss": 6.0504, "mean_token_accuracy": 0.12833617106080056, "num_tokens": 4732603.0, "step": 2565 }, { "entropy": 5.983808517456055, "epoch": 0.215921024994749, "grad_norm": 1.0078125, "learning_rate": 0.0004998037887957649, "loss": 5.8814, "mean_token_accuracy": 0.1358911283314228, "num_tokens": 4742644.0, "step": 2570 }, { "entropy": 6.108869409561157, "epoch": 0.2163411048099139, "grad_norm": 1.078125, "learning_rate": 0.0004998025364369939, "loss": 6.2019, "mean_token_accuracy": 0.1304102584719658, "num_tokens": 4751482.0, "step": 2575 }, { "entropy": 6.196333599090576, "epoch": 0.21676118462507876, "grad_norm": 1.203125, "learning_rate": 0.0004998012800959619, "loss": 6.0606, "mean_token_accuracy": 0.13010098412632942, "num_tokens": 4760593.0, "step": 2580 }, { "entropy": 6.040443658828735, "epoch": 0.21718126444024366, "grad_norm": 1.25, "learning_rate": 0.0004998000197726909, "loss": 6.0456, "mean_token_accuracy": 0.13714693188667298, "num_tokens": 4769294.0, "step": 2585 }, { "entropy": 6.037139892578125, "epoch": 0.21760134425540853, "grad_norm": 1.0625, "learning_rate": 0.0004997987554672033, "loss": 5.9468, "mean_token_accuracy": 0.1388247735798359, "num_tokens": 4779239.0, "step": 2590 }, { "entropy": 6.028454113006592, "epoch": 0.2180214240705734, "grad_norm": 1.09375, "learning_rate": 0.0004997974871795215, "loss": 6.0394, "mean_token_accuracy": 0.1312641680240631, "num_tokens": 4788211.0, "step": 2595 }, { "entropy": 6.064166069030762, "epoch": 0.2184415038857383, "grad_norm": 1.078125, "learning_rate": 0.000499796214909668, "loss": 6.0092, "mean_token_accuracy": 0.13773878663778305, "num_tokens": 4797921.0, "step": 2600 }, { "entropy": 6.0709045886993405, "epoch": 0.21886158370090317, "grad_norm": 1.1484375, "learning_rate": 0.0004997949386576653, "loss": 6.0042, "mean_token_accuracy": 0.1314603127539158, "num_tokens": 4807772.0, "step": 2605 }, { "entropy": 5.99371829032898, "epoch": 0.21928166351606806, "grad_norm": 1.0859375, "learning_rate": 0.000499793658423536, "loss": 6.0192, "mean_token_accuracy": 0.13455276489257811, "num_tokens": 4817999.0, "step": 2610 }, { "entropy": 6.059015130996704, "epoch": 0.21970174333123293, "grad_norm": 1.2109375, "learning_rate": 0.0004997923742073028, "loss": 5.9804, "mean_token_accuracy": 0.14437463730573655, "num_tokens": 4826679.0, "step": 2615 }, { "entropy": 5.943931245803833, "epoch": 0.2201218231463978, "grad_norm": 1.15625, "learning_rate": 0.0004997910860089884, "loss": 5.9832, "mean_token_accuracy": 0.13589627295732498, "num_tokens": 4834998.0, "step": 2620 }, { "entropy": 6.057482719421387, "epoch": 0.2205419029615627, "grad_norm": 1.1484375, "learning_rate": 0.0004997897938286156, "loss": 5.9317, "mean_token_accuracy": 0.13643529042601585, "num_tokens": 4843635.0, "step": 2625 }, { "entropy": 6.018534517288208, "epoch": 0.22096198277672757, "grad_norm": 1.21875, "learning_rate": 0.0004997884976662075, "loss": 6.055, "mean_token_accuracy": 0.1327926956117153, "num_tokens": 4852027.0, "step": 2630 }, { "entropy": 6.1170580863952635, "epoch": 0.22138206259189247, "grad_norm": 1.15625, "learning_rate": 0.0004997871975217868, "loss": 5.9753, "mean_token_accuracy": 0.14027014896273612, "num_tokens": 4861244.0, "step": 2635 }, { "entropy": 5.881848526000977, "epoch": 0.22180214240705734, "grad_norm": 1.15625, "learning_rate": 0.0004997858933953768, "loss": 5.8911, "mean_token_accuracy": 0.13821944668889047, "num_tokens": 4869902.0, "step": 2640 }, { "entropy": 5.9110313892364506, "epoch": 0.2222222222222222, "grad_norm": 1.1015625, "learning_rate": 0.0004997845852870004, "loss": 5.8706, "mean_token_accuracy": 0.1410742297768593, "num_tokens": 4878502.0, "step": 2645 }, { "entropy": 5.945323467254639, "epoch": 0.2226423020373871, "grad_norm": 1.1171875, "learning_rate": 0.0004997832731966806, "loss": 5.9249, "mean_token_accuracy": 0.1411375291645527, "num_tokens": 4888348.0, "step": 2650 }, { "entropy": 5.97375717163086, "epoch": 0.22306238185255198, "grad_norm": 1.15625, "learning_rate": 0.0004997819571244411, "loss": 5.9955, "mean_token_accuracy": 0.13679953366518022, "num_tokens": 4897302.0, "step": 2655 }, { "entropy": 6.008918142318725, "epoch": 0.22348246166771688, "grad_norm": 1.125, "learning_rate": 0.0004997806370703049, "loss": 6.0213, "mean_token_accuracy": 0.13776542693376542, "num_tokens": 4907078.0, "step": 2660 }, { "entropy": 5.947820472717285, "epoch": 0.22390254148288175, "grad_norm": 1.0078125, "learning_rate": 0.0004997793130342954, "loss": 5.8494, "mean_token_accuracy": 0.13728704303503036, "num_tokens": 4917489.0, "step": 2665 }, { "entropy": 5.903385925292969, "epoch": 0.22432262129804661, "grad_norm": 1.15625, "learning_rate": 0.0004997779850164363, "loss": 5.9513, "mean_token_accuracy": 0.1372543305158615, "num_tokens": 4927073.0, "step": 2670 }, { "entropy": 6.074141645431519, "epoch": 0.2247427011132115, "grad_norm": 1.21875, "learning_rate": 0.0004997766530167508, "loss": 6.0449, "mean_token_accuracy": 0.13193764314055442, "num_tokens": 4935464.0, "step": 2675 }, { "entropy": 6.150123214721679, "epoch": 0.22516278092837638, "grad_norm": 1.25, "learning_rate": 0.0004997753170352627, "loss": 6.1293, "mean_token_accuracy": 0.13081697300076484, "num_tokens": 4944718.0, "step": 2680 }, { "entropy": 6.070659351348877, "epoch": 0.22558286074354128, "grad_norm": 1.25, "learning_rate": 0.0004997739770719955, "loss": 6.0077, "mean_token_accuracy": 0.13340400233864785, "num_tokens": 4954223.0, "step": 2685 }, { "entropy": 5.978194713592529, "epoch": 0.22600294055870615, "grad_norm": 1.125, "learning_rate": 0.000499772633126973, "loss": 6.0415, "mean_token_accuracy": 0.12924405336380004, "num_tokens": 4963371.0, "step": 2690 }, { "entropy": 5.975288677215576, "epoch": 0.22642302037387105, "grad_norm": 1.1640625, "learning_rate": 0.0004997712852002192, "loss": 5.9086, "mean_token_accuracy": 0.1422348402440548, "num_tokens": 4972973.0, "step": 2695 }, { "entropy": 6.000183725357056, "epoch": 0.22684310018903592, "grad_norm": 1.21875, "learning_rate": 0.0004997699332917578, "loss": 6.141, "mean_token_accuracy": 0.12485837489366532, "num_tokens": 4982808.0, "step": 2700 }, { "entropy": 6.124313545227051, "epoch": 0.2272631800042008, "grad_norm": 1.09375, "learning_rate": 0.0004997685774016127, "loss": 6.0087, "mean_token_accuracy": 0.13304658830165864, "num_tokens": 4992427.0, "step": 2705 }, { "entropy": 6.108034229278564, "epoch": 0.22768325981936569, "grad_norm": 1.0546875, "learning_rate": 0.000499767217529808, "loss": 6.1924, "mean_token_accuracy": 0.12314376682043075, "num_tokens": 5003562.0, "step": 2710 }, { "entropy": 6.024952030181884, "epoch": 0.22810333963453056, "grad_norm": 1.046875, "learning_rate": 0.0004997658536763678, "loss": 5.8848, "mean_token_accuracy": 0.13965026810765266, "num_tokens": 5013429.0, "step": 2715 }, { "entropy": 6.031959342956543, "epoch": 0.22852341944969545, "grad_norm": 1.046875, "learning_rate": 0.0004997644858413163, "loss": 6.0173, "mean_token_accuracy": 0.1369493454694748, "num_tokens": 5022045.0, "step": 2720 }, { "entropy": 5.9200338363647464, "epoch": 0.22894349926486032, "grad_norm": 1.0390625, "learning_rate": 0.0004997631140246775, "loss": 5.8591, "mean_token_accuracy": 0.14441144168376924, "num_tokens": 5032260.0, "step": 2725 }, { "entropy": 5.958108282089233, "epoch": 0.2293635790800252, "grad_norm": 1.1953125, "learning_rate": 0.000499761738226476, "loss": 5.9041, "mean_token_accuracy": 0.1375686287879944, "num_tokens": 5041688.0, "step": 2730 }, { "entropy": 5.965337133407592, "epoch": 0.2297836588951901, "grad_norm": 1.359375, "learning_rate": 0.000499760358446736, "loss": 6.0072, "mean_token_accuracy": 0.13241190686821938, "num_tokens": 5051005.0, "step": 2735 }, { "entropy": 6.077162218093872, "epoch": 0.23020373871035496, "grad_norm": 1.1171875, "learning_rate": 0.000499758974685482, "loss": 5.9379, "mean_token_accuracy": 0.13544429317116738, "num_tokens": 5060084.0, "step": 2740 }, { "entropy": 5.981188869476318, "epoch": 0.23062381852551986, "grad_norm": 1.203125, "learning_rate": 0.0004997575869427385, "loss": 5.953, "mean_token_accuracy": 0.13910676240921022, "num_tokens": 5069081.0, "step": 2745 }, { "entropy": 5.978755378723145, "epoch": 0.23104389834068473, "grad_norm": 1.1171875, "learning_rate": 0.00049975619521853, "loss": 5.9429, "mean_token_accuracy": 0.13454415425658225, "num_tokens": 5078597.0, "step": 2750 }, { "entropy": 5.911752319335937, "epoch": 0.2314639781558496, "grad_norm": 1.0859375, "learning_rate": 0.0004997547995128814, "loss": 5.9829, "mean_token_accuracy": 0.13646793067455293, "num_tokens": 5087607.0, "step": 2755 }, { "entropy": 6.035622882843017, "epoch": 0.2318840579710145, "grad_norm": 1.3359375, "learning_rate": 0.0004997533998258171, "loss": 5.9832, "mean_token_accuracy": 0.13701630160212516, "num_tokens": 5097412.0, "step": 2760 }, { "entropy": 6.060332536697388, "epoch": 0.23230413778617937, "grad_norm": 1.1484375, "learning_rate": 0.0004997519961573622, "loss": 6.0518, "mean_token_accuracy": 0.1287323147058487, "num_tokens": 5105817.0, "step": 2765 }, { "entropy": 6.132694864273072, "epoch": 0.23272421760134426, "grad_norm": 1.3046875, "learning_rate": 0.0004997505885075414, "loss": 6.0843, "mean_token_accuracy": 0.13087237104773522, "num_tokens": 5114958.0, "step": 2770 }, { "entropy": 6.002990245819092, "epoch": 0.23314429741650913, "grad_norm": 1.15625, "learning_rate": 0.0004997491768763795, "loss": 6.0022, "mean_token_accuracy": 0.13458002656698226, "num_tokens": 5123728.0, "step": 2775 }, { "entropy": 6.000336790084839, "epoch": 0.23356437723167403, "grad_norm": 1.1953125, "learning_rate": 0.0004997477612639018, "loss": 6.0532, "mean_token_accuracy": 0.12724062129855157, "num_tokens": 5134099.0, "step": 2780 }, { "entropy": 6.150645542144775, "epoch": 0.2339844570468389, "grad_norm": 1.140625, "learning_rate": 0.0004997463416701332, "loss": 6.0567, "mean_token_accuracy": 0.12823428884148597, "num_tokens": 5142934.0, "step": 2785 }, { "entropy": 5.973050594329834, "epoch": 0.23440453686200377, "grad_norm": 1.1484375, "learning_rate": 0.0004997449180950989, "loss": 5.9005, "mean_token_accuracy": 0.15188876092433928, "num_tokens": 5151835.0, "step": 2790 }, { "entropy": 5.945583391189575, "epoch": 0.23482461667716867, "grad_norm": 1.1015625, "learning_rate": 0.0004997434905388241, "loss": 5.9533, "mean_token_accuracy": 0.14123927503824235, "num_tokens": 5161136.0, "step": 2795 }, { "entropy": 5.966424179077149, "epoch": 0.23524469649233354, "grad_norm": 1.1171875, "learning_rate": 0.000499742059001334, "loss": 5.887, "mean_token_accuracy": 0.141261176019907, "num_tokens": 5170741.0, "step": 2800 }, { "entropy": 5.931414556503296, "epoch": 0.23566477630749844, "grad_norm": 1.109375, "learning_rate": 0.0004997406234826541, "loss": 5.9226, "mean_token_accuracy": 0.14311096221208572, "num_tokens": 5180549.0, "step": 2805 }, { "entropy": 5.932320833206177, "epoch": 0.2360848561226633, "grad_norm": 1.0859375, "learning_rate": 0.0004997391839828098, "loss": 5.8876, "mean_token_accuracy": 0.14145613387227057, "num_tokens": 5189486.0, "step": 2810 }, { "entropy": 5.9496715545654295, "epoch": 0.23650493593782818, "grad_norm": 1.1171875, "learning_rate": 0.0004997377405018266, "loss": 5.9643, "mean_token_accuracy": 0.1311965249478817, "num_tokens": 5198525.0, "step": 2815 }, { "entropy": 6.014245939254761, "epoch": 0.23692501575299307, "grad_norm": 1.109375, "learning_rate": 0.00049973629303973, "loss": 6.0158, "mean_token_accuracy": 0.13539923653006553, "num_tokens": 5207124.0, "step": 2820 }, { "entropy": 5.944891500473022, "epoch": 0.23734509556815794, "grad_norm": 1.0546875, "learning_rate": 0.0004997348415965457, "loss": 5.85, "mean_token_accuracy": 0.1407323271036148, "num_tokens": 5216529.0, "step": 2825 }, { "entropy": 5.993428897857666, "epoch": 0.23776517538332284, "grad_norm": 1.203125, "learning_rate": 0.0004997333861722995, "loss": 5.9831, "mean_token_accuracy": 0.138421493768692, "num_tokens": 5225796.0, "step": 2830 }, { "entropy": 6.012037515640259, "epoch": 0.2381852551984877, "grad_norm": 1.234375, "learning_rate": 0.000499731926767017, "loss": 6.0269, "mean_token_accuracy": 0.13556732088327408, "num_tokens": 5233876.0, "step": 2835 }, { "entropy": 5.9344642639160154, "epoch": 0.23860533501365258, "grad_norm": 1.203125, "learning_rate": 0.0004997304633807242, "loss": 6.0019, "mean_token_accuracy": 0.12836523801088334, "num_tokens": 5244782.0, "step": 2840 }, { "entropy": 5.987623119354248, "epoch": 0.23902541482881748, "grad_norm": 1.1171875, "learning_rate": 0.0004997289960134468, "loss": 5.9579, "mean_token_accuracy": 0.1335374064743519, "num_tokens": 5253453.0, "step": 2845 }, { "entropy": 5.978561115264893, "epoch": 0.23944549464398235, "grad_norm": 1.1875, "learning_rate": 0.0004997275246652111, "loss": 5.9635, "mean_token_accuracy": 0.14111838340759278, "num_tokens": 5262355.0, "step": 2850 }, { "entropy": 5.952275371551513, "epoch": 0.23986557445914725, "grad_norm": 1.1953125, "learning_rate": 0.000499726049336043, "loss": 5.8897, "mean_token_accuracy": 0.1419723652303219, "num_tokens": 5271959.0, "step": 2855 }, { "entropy": 6.006475210189819, "epoch": 0.24028565427431212, "grad_norm": 1.15625, "learning_rate": 0.0004997245700259686, "loss": 5.9216, "mean_token_accuracy": 0.1432231843471527, "num_tokens": 5281393.0, "step": 2860 }, { "entropy": 5.981584358215332, "epoch": 0.240705734089477, "grad_norm": 1.1171875, "learning_rate": 0.0004997230867350141, "loss": 6.0599, "mean_token_accuracy": 0.13176842033863068, "num_tokens": 5290979.0, "step": 2865 }, { "entropy": 6.072908639907837, "epoch": 0.24112581390464188, "grad_norm": 1.1171875, "learning_rate": 0.0004997215994632059, "loss": 5.9983, "mean_token_accuracy": 0.13976021558046342, "num_tokens": 5300263.0, "step": 2870 }, { "entropy": 5.99395980834961, "epoch": 0.24154589371980675, "grad_norm": 1.15625, "learning_rate": 0.0004997201082105704, "loss": 6.0192, "mean_token_accuracy": 0.13117292299866676, "num_tokens": 5309522.0, "step": 2875 }, { "entropy": 6.010568284988404, "epoch": 0.24196597353497165, "grad_norm": 1.171875, "learning_rate": 0.0004997186129771338, "loss": 6.0248, "mean_token_accuracy": 0.13696857616305352, "num_tokens": 5319770.0, "step": 2880 }, { "entropy": 6.136290264129639, "epoch": 0.24238605335013652, "grad_norm": 1.1953125, "learning_rate": 0.0004997171137629226, "loss": 6.0295, "mean_token_accuracy": 0.1379177153110504, "num_tokens": 5328400.0, "step": 2885 }, { "entropy": 5.855829429626465, "epoch": 0.24280613316530142, "grad_norm": 1.1953125, "learning_rate": 0.0004997156105679636, "loss": 5.8334, "mean_token_accuracy": 0.14593008533120155, "num_tokens": 5336338.0, "step": 2890 }, { "entropy": 5.898982286453247, "epoch": 0.2432262129804663, "grad_norm": 1.1484375, "learning_rate": 0.0004997141033922832, "loss": 5.9375, "mean_token_accuracy": 0.13418934270739555, "num_tokens": 5345391.0, "step": 2895 }, { "entropy": 6.035576057434082, "epoch": 0.24364629279563116, "grad_norm": 1.203125, "learning_rate": 0.0004997125922359081, "loss": 5.9508, "mean_token_accuracy": 0.13234915360808372, "num_tokens": 5354709.0, "step": 2900 }, { "entropy": 6.013994407653809, "epoch": 0.24406637261079606, "grad_norm": 1.1640625, "learning_rate": 0.0004997110770988652, "loss": 5.8796, "mean_token_accuracy": 0.1399741917848587, "num_tokens": 5363738.0, "step": 2905 }, { "entropy": 5.9528099536895756, "epoch": 0.24448645242596093, "grad_norm": 1.2109375, "learning_rate": 0.0004997095579811813, "loss": 6.0023, "mean_token_accuracy": 0.13593828454613685, "num_tokens": 5373583.0, "step": 2910 }, { "entropy": 6.03057632446289, "epoch": 0.24490653224112582, "grad_norm": 1.1328125, "learning_rate": 0.0004997080348828833, "loss": 6.0477, "mean_token_accuracy": 0.1340787522494793, "num_tokens": 5383486.0, "step": 2915 }, { "entropy": 5.969451522827148, "epoch": 0.2453266120562907, "grad_norm": 1.2109375, "learning_rate": 0.0004997065078039981, "loss": 5.9591, "mean_token_accuracy": 0.1328844092786312, "num_tokens": 5391974.0, "step": 2920 }, { "entropy": 6.031870555877686, "epoch": 0.24574669187145556, "grad_norm": 1.171875, "learning_rate": 0.0004997049767445529, "loss": 5.9995, "mean_token_accuracy": 0.13087670058012008, "num_tokens": 5400882.0, "step": 2925 }, { "entropy": 6.1388874530792235, "epoch": 0.24616677168662046, "grad_norm": 1.1953125, "learning_rate": 0.0004997034417045746, "loss": 5.958, "mean_token_accuracy": 0.13255189657211303, "num_tokens": 5410538.0, "step": 2930 }, { "entropy": 5.891916513442993, "epoch": 0.24658685150178533, "grad_norm": 1.1484375, "learning_rate": 0.0004997019026840907, "loss": 5.8523, "mean_token_accuracy": 0.1406748116016388, "num_tokens": 5419406.0, "step": 2935 }, { "entropy": 5.81290192604065, "epoch": 0.24700693131695023, "grad_norm": 1.1640625, "learning_rate": 0.0004997003596831282, "loss": 5.9661, "mean_token_accuracy": 0.13368260413408278, "num_tokens": 5428817.0, "step": 2940 }, { "entropy": 6.030734586715698, "epoch": 0.2474270111321151, "grad_norm": 1.1171875, "learning_rate": 0.0004996988127017145, "loss": 5.9967, "mean_token_accuracy": 0.1356920287013054, "num_tokens": 5438277.0, "step": 2945 }, { "entropy": 5.991678762435913, "epoch": 0.24784709094728, "grad_norm": 1.15625, "learning_rate": 0.0004996972617398772, "loss": 6.0095, "mean_token_accuracy": 0.13253712952136992, "num_tokens": 5447440.0, "step": 2950 }, { "entropy": 5.990732574462891, "epoch": 0.24826717076244487, "grad_norm": 1.140625, "learning_rate": 0.0004996957067976435, "loss": 5.9368, "mean_token_accuracy": 0.13873122334480287, "num_tokens": 5455988.0, "step": 2955 }, { "entropy": 6.013759565353394, "epoch": 0.24868725057760974, "grad_norm": 1.1484375, "learning_rate": 0.0004996941478750411, "loss": 5.9498, "mean_token_accuracy": 0.13479771465063095, "num_tokens": 5464996.0, "step": 2960 }, { "entropy": 6.04653902053833, "epoch": 0.24910733039277463, "grad_norm": 1.046875, "learning_rate": 0.0004996925849720975, "loss": 6.0789, "mean_token_accuracy": 0.12909941822290422, "num_tokens": 5474174.0, "step": 2965 }, { "entropy": 6.1094592094421385, "epoch": 0.2495274102079395, "grad_norm": 1.234375, "learning_rate": 0.0004996910180888405, "loss": 5.9515, "mean_token_accuracy": 0.14010420814156532, "num_tokens": 5482838.0, "step": 2970 }, { "entropy": 5.933987855911255, "epoch": 0.2499474900231044, "grad_norm": 1.078125, "learning_rate": 0.0004996894472252977, "loss": 5.9796, "mean_token_accuracy": 0.13611237108707427, "num_tokens": 5491616.0, "step": 2975 }, { "entropy": 5.946248006820679, "epoch": 0.25036756983826924, "grad_norm": 1.1328125, "learning_rate": 0.0004996878723814973, "loss": 5.9758, "mean_token_accuracy": 0.13201134279370308, "num_tokens": 5500942.0, "step": 2980 }, { "entropy": 6.011964797973633, "epoch": 0.25078764965343414, "grad_norm": 1.1015625, "learning_rate": 0.0004996862935574667, "loss": 5.9171, "mean_token_accuracy": 0.13369757011532785, "num_tokens": 5510078.0, "step": 2985 }, { "entropy": 5.91240873336792, "epoch": 0.25120772946859904, "grad_norm": 1.0625, "learning_rate": 0.0004996847107532342, "loss": 5.9402, "mean_token_accuracy": 0.13632848113775253, "num_tokens": 5518924.0, "step": 2990 }, { "entropy": 6.007929849624634, "epoch": 0.25162780928376394, "grad_norm": 1.0859375, "learning_rate": 0.0004996831239688277, "loss": 5.9543, "mean_token_accuracy": 0.13016238808631897, "num_tokens": 5527385.0, "step": 2995 }, { "entropy": 5.906063604354858, "epoch": 0.2520478890989288, "grad_norm": 1.1484375, "learning_rate": 0.0004996815332042754, "loss": 5.8144, "mean_token_accuracy": 0.14219059348106383, "num_tokens": 5536781.0, "step": 3000 }, { "epoch": 0.2520478890989288, "eval_entropy": 5.731865753634434, "eval_loss": 5.98077917098999, "eval_mean_token_accuracy": 0.14166069063261308, "eval_num_tokens": 5536781.0, "eval_runtime": 27.289, "eval_samples_per_second": 1369.272, "eval_steps_per_second": 171.168, "step": 3000 }, { "entropy": 5.928550434112549, "epoch": 0.2524679689140937, "grad_norm": 1.09375, "learning_rate": 0.0004996799384596054, "loss": 6.0018, "mean_token_accuracy": 0.13845922499895097, "num_tokens": 5545893.0, "step": 3005 }, { "entropy": 6.047553873062133, "epoch": 0.2528880487292586, "grad_norm": 1.1484375, "learning_rate": 0.0004996783397348461, "loss": 5.9557, "mean_token_accuracy": 0.13133809193968773, "num_tokens": 5555818.0, "step": 3010 }, { "entropy": 5.98364634513855, "epoch": 0.2533081285444234, "grad_norm": 1.046875, "learning_rate": 0.0004996767370300256, "loss": 5.9338, "mean_token_accuracy": 0.13684593588113786, "num_tokens": 5565331.0, "step": 3015 }, { "entropy": 6.016663599014282, "epoch": 0.2537282083595883, "grad_norm": 1.1796875, "learning_rate": 0.0004996751303451724, "loss": 5.9132, "mean_token_accuracy": 0.1414400041103363, "num_tokens": 5574003.0, "step": 3020 }, { "entropy": 5.934581279754639, "epoch": 0.2541482881747532, "grad_norm": 1.1328125, "learning_rate": 0.0004996735196803149, "loss": 5.8367, "mean_token_accuracy": 0.14427052065730095, "num_tokens": 5582517.0, "step": 3025 }, { "entropy": 5.937156009674072, "epoch": 0.2545683679899181, "grad_norm": 1.078125, "learning_rate": 0.0004996719050354818, "loss": 6.0272, "mean_token_accuracy": 0.1350693427026272, "num_tokens": 5591952.0, "step": 3030 }, { "entropy": 5.999459314346313, "epoch": 0.25498844780508295, "grad_norm": 1.109375, "learning_rate": 0.0004996702864107015, "loss": 5.9271, "mean_token_accuracy": 0.1392418310046196, "num_tokens": 5601460.0, "step": 3035 }, { "entropy": 6.104486131668091, "epoch": 0.25540852762024785, "grad_norm": 1.1875, "learning_rate": 0.0004996686638060028, "loss": 6.05, "mean_token_accuracy": 0.13465244546532631, "num_tokens": 5610776.0, "step": 3040 }, { "entropy": 5.98651351928711, "epoch": 0.25582860743541275, "grad_norm": 1.2265625, "learning_rate": 0.0004996670372214144, "loss": 5.9593, "mean_token_accuracy": 0.1381534829735756, "num_tokens": 5619627.0, "step": 3045 }, { "entropy": 5.846577882766724, "epoch": 0.2562486872505776, "grad_norm": 1.140625, "learning_rate": 0.0004996654066569651, "loss": 5.8254, "mean_token_accuracy": 0.1441572315990925, "num_tokens": 5628969.0, "step": 3050 }, { "entropy": 5.9285993576049805, "epoch": 0.2566687670657425, "grad_norm": 1.140625, "learning_rate": 0.0004996637721126839, "loss": 5.8895, "mean_token_accuracy": 0.13785991445183754, "num_tokens": 5638629.0, "step": 3055 }, { "entropy": 6.003174924850464, "epoch": 0.2570888468809074, "grad_norm": 1.1796875, "learning_rate": 0.0004996621335885996, "loss": 5.9755, "mean_token_accuracy": 0.13731449097394943, "num_tokens": 5647571.0, "step": 3060 }, { "entropy": 5.987164306640625, "epoch": 0.2575089266960722, "grad_norm": 1.40625, "learning_rate": 0.0004996604910847413, "loss": 5.8754, "mean_token_accuracy": 0.14926859214901925, "num_tokens": 5656709.0, "step": 3065 }, { "entropy": 5.953637361526489, "epoch": 0.2579290065112371, "grad_norm": 1.0859375, "learning_rate": 0.000499658844601138, "loss": 6.0688, "mean_token_accuracy": 0.1354634039103985, "num_tokens": 5665714.0, "step": 3070 }, { "entropy": 6.097519016265869, "epoch": 0.258349086326402, "grad_norm": 1.1171875, "learning_rate": 0.000499657194137819, "loss": 6.0234, "mean_token_accuracy": 0.13649424612522126, "num_tokens": 5675854.0, "step": 3075 }, { "entropy": 5.9704999923706055, "epoch": 0.2587691661415669, "grad_norm": 1.1328125, "learning_rate": 0.0004996555396948136, "loss": 5.8448, "mean_token_accuracy": 0.13540665656328202, "num_tokens": 5685690.0, "step": 3080 }, { "entropy": 5.92389030456543, "epoch": 0.25918924595673176, "grad_norm": 1.0625, "learning_rate": 0.0004996538812721509, "loss": 5.8958, "mean_token_accuracy": 0.1428774431347847, "num_tokens": 5695766.0, "step": 3085 }, { "entropy": 5.918001127243042, "epoch": 0.25960932577189666, "grad_norm": 1.4375, "learning_rate": 0.0004996522188698603, "loss": 5.9586, "mean_token_accuracy": 0.13920465260744094, "num_tokens": 5704365.0, "step": 3090 }, { "entropy": 6.103138256072998, "epoch": 0.26002940558706156, "grad_norm": 1.234375, "learning_rate": 0.0004996505524879714, "loss": 6.0694, "mean_token_accuracy": 0.13200636729598045, "num_tokens": 5713345.0, "step": 3095 }, { "entropy": 6.0067219734191895, "epoch": 0.2604494854022264, "grad_norm": 1.078125, "learning_rate": 0.0004996488821265137, "loss": 5.8544, "mean_token_accuracy": 0.1430236168205738, "num_tokens": 5722907.0, "step": 3100 }, { "entropy": 5.831529951095581, "epoch": 0.2608695652173913, "grad_norm": 1.1640625, "learning_rate": 0.0004996472077855166, "loss": 5.898, "mean_token_accuracy": 0.14116744548082352, "num_tokens": 5731589.0, "step": 3105 }, { "entropy": 5.962000036239624, "epoch": 0.2612896450325562, "grad_norm": 1.125, "learning_rate": 0.00049964552946501, "loss": 5.9005, "mean_token_accuracy": 0.13415754735469818, "num_tokens": 5739922.0, "step": 3110 }, { "entropy": 5.892115211486816, "epoch": 0.2617097248477211, "grad_norm": 1.171875, "learning_rate": 0.0004996438471650235, "loss": 5.8122, "mean_token_accuracy": 0.14354836270213128, "num_tokens": 5749206.0, "step": 3115 }, { "entropy": 5.935215997695923, "epoch": 0.26212980466288593, "grad_norm": 1.140625, "learning_rate": 0.0004996421608855869, "loss": 5.8703, "mean_token_accuracy": 0.1430413119494915, "num_tokens": 5758803.0, "step": 3120 }, { "entropy": 5.921274280548095, "epoch": 0.26254988447805083, "grad_norm": 1.1015625, "learning_rate": 0.0004996404706267301, "loss": 5.9525, "mean_token_accuracy": 0.1340932957828045, "num_tokens": 5768368.0, "step": 3125 }, { "entropy": 5.895799207687378, "epoch": 0.26296996429321573, "grad_norm": 1.2109375, "learning_rate": 0.000499638776388483, "loss": 5.8028, "mean_token_accuracy": 0.14530150592327118, "num_tokens": 5776707.0, "step": 3130 }, { "entropy": 5.956259107589721, "epoch": 0.26339004410838057, "grad_norm": 1.234375, "learning_rate": 0.0004996370781708757, "loss": 5.9872, "mean_token_accuracy": 0.13445577397942543, "num_tokens": 5787037.0, "step": 3135 }, { "entropy": 5.948802375793457, "epoch": 0.26381012392354547, "grad_norm": 1.203125, "learning_rate": 0.0004996353759739382, "loss": 5.9353, "mean_token_accuracy": 0.14012779742479325, "num_tokens": 5796630.0, "step": 3140 }, { "entropy": 5.918614721298217, "epoch": 0.26423020373871037, "grad_norm": 1.2109375, "learning_rate": 0.0004996336697977007, "loss": 5.9569, "mean_token_accuracy": 0.13389407992362976, "num_tokens": 5806402.0, "step": 3145 }, { "entropy": 5.899946784973144, "epoch": 0.2646502835538752, "grad_norm": 1.21875, "learning_rate": 0.0004996319596421933, "loss": 5.8948, "mean_token_accuracy": 0.14252272099256516, "num_tokens": 5815742.0, "step": 3150 }, { "entropy": 5.881059455871582, "epoch": 0.2650703633690401, "grad_norm": 1.1171875, "learning_rate": 0.0004996302455074466, "loss": 5.9116, "mean_token_accuracy": 0.13981909155845643, "num_tokens": 5824915.0, "step": 3155 }, { "entropy": 6.034554386138916, "epoch": 0.265490443184205, "grad_norm": 1.109375, "learning_rate": 0.0004996285273934906, "loss": 5.9346, "mean_token_accuracy": 0.13784030005335807, "num_tokens": 5834978.0, "step": 3160 }, { "entropy": 6.000072288513183, "epoch": 0.2659105229993699, "grad_norm": 1.0546875, "learning_rate": 0.000499626805300356, "loss": 6.0896, "mean_token_accuracy": 0.13608634248375892, "num_tokens": 5845684.0, "step": 3165 }, { "entropy": 6.076979541778565, "epoch": 0.26633060281453474, "grad_norm": 1.171875, "learning_rate": 0.0004996250792280732, "loss": 5.9723, "mean_token_accuracy": 0.131855096668005, "num_tokens": 5854905.0, "step": 3170 }, { "entropy": 6.017704057693481, "epoch": 0.26675068262969964, "grad_norm": 1.25, "learning_rate": 0.0004996233491766727, "loss": 5.995, "mean_token_accuracy": 0.1348758891224861, "num_tokens": 5863654.0, "step": 3175 }, { "entropy": 5.958924627304077, "epoch": 0.26717076244486454, "grad_norm": 1.1484375, "learning_rate": 0.0004996216151461854, "loss": 5.9782, "mean_token_accuracy": 0.14263538494706154, "num_tokens": 5872442.0, "step": 3180 }, { "entropy": 6.007285785675049, "epoch": 0.2675908422600294, "grad_norm": 1.21875, "learning_rate": 0.0004996198771366417, "loss": 5.9003, "mean_token_accuracy": 0.14180114939808847, "num_tokens": 5882372.0, "step": 3185 }, { "entropy": 5.762058162689209, "epoch": 0.2680109220751943, "grad_norm": 1.1484375, "learning_rate": 0.0004996181351480726, "loss": 5.7096, "mean_token_accuracy": 0.1481250509619713, "num_tokens": 5891113.0, "step": 3190 }, { "entropy": 5.856069707870484, "epoch": 0.2684310018903592, "grad_norm": 1.1015625, "learning_rate": 0.0004996163891805089, "loss": 5.9546, "mean_token_accuracy": 0.14241180717945098, "num_tokens": 5899582.0, "step": 3195 }, { "entropy": 6.006877613067627, "epoch": 0.2688510817055241, "grad_norm": 1.1328125, "learning_rate": 0.0004996146392339815, "loss": 5.903, "mean_token_accuracy": 0.13792204037308692, "num_tokens": 5908938.0, "step": 3200 }, { "entropy": 5.92903504371643, "epoch": 0.2692711615206889, "grad_norm": 1.234375, "learning_rate": 0.0004996128853085215, "loss": 5.873, "mean_token_accuracy": 0.14175159782171248, "num_tokens": 5918055.0, "step": 3205 }, { "entropy": 5.962112808227539, "epoch": 0.2696912413358538, "grad_norm": 1.1484375, "learning_rate": 0.0004996111274041598, "loss": 5.8745, "mean_token_accuracy": 0.1343413420021534, "num_tokens": 5926744.0, "step": 3210 }, { "entropy": 5.949939441680908, "epoch": 0.2701113211510187, "grad_norm": 1.109375, "learning_rate": 0.0004996093655209277, "loss": 5.9643, "mean_token_accuracy": 0.13674649894237517, "num_tokens": 5936521.0, "step": 3215 }, { "entropy": 6.075492525100708, "epoch": 0.27053140096618356, "grad_norm": 1.1953125, "learning_rate": 0.0004996075996588563, "loss": 6.0296, "mean_token_accuracy": 0.1298151694238186, "num_tokens": 5945010.0, "step": 3220 }, { "entropy": 5.941525030136108, "epoch": 0.27095148078134845, "grad_norm": 1.21875, "learning_rate": 0.000499605829817977, "loss": 5.9212, "mean_token_accuracy": 0.13907130137085916, "num_tokens": 5953766.0, "step": 3225 }, { "entropy": 5.937248849868775, "epoch": 0.27137156059651335, "grad_norm": 1.0859375, "learning_rate": 0.000499604055998321, "loss": 5.8382, "mean_token_accuracy": 0.14228671863675119, "num_tokens": 5962168.0, "step": 3230 }, { "entropy": 5.887500286102295, "epoch": 0.2717916404116782, "grad_norm": 1.03125, "learning_rate": 0.0004996022781999198, "loss": 5.8689, "mean_token_accuracy": 0.14494396820664407, "num_tokens": 5971627.0, "step": 3235 }, { "entropy": 5.927862691879272, "epoch": 0.2722117202268431, "grad_norm": 1.15625, "learning_rate": 0.000499600496422805, "loss": 5.949, "mean_token_accuracy": 0.13576936945319176, "num_tokens": 5981775.0, "step": 3240 }, { "entropy": 5.942725610733032, "epoch": 0.272631800042008, "grad_norm": 1.1484375, "learning_rate": 0.000499598710667008, "loss": 5.8857, "mean_token_accuracy": 0.1394686594605446, "num_tokens": 5991097.0, "step": 3245 }, { "entropy": 5.9221861362457275, "epoch": 0.2730518798571729, "grad_norm": 1.3125, "learning_rate": 0.0004995969209325604, "loss": 5.9369, "mean_token_accuracy": 0.1354317285120487, "num_tokens": 5999517.0, "step": 3250 }, { "entropy": 5.942779064178467, "epoch": 0.2734719596723377, "grad_norm": 1.09375, "learning_rate": 0.0004995951272194941, "loss": 5.9079, "mean_token_accuracy": 0.13176682814955712, "num_tokens": 6008545.0, "step": 3255 }, { "entropy": 6.00344705581665, "epoch": 0.2738920394875026, "grad_norm": 1.2421875, "learning_rate": 0.0004995933295278407, "loss": 5.8989, "mean_token_accuracy": 0.13847036063671112, "num_tokens": 6017366.0, "step": 3260 }, { "entropy": 5.900600910186768, "epoch": 0.2743121193026675, "grad_norm": 1.375, "learning_rate": 0.0004995915278576321, "loss": 5.843, "mean_token_accuracy": 0.14386921525001525, "num_tokens": 6025597.0, "step": 3265 }, { "entropy": 6.0085962295532225, "epoch": 0.27473219911783237, "grad_norm": 1.21875, "learning_rate": 0.0004995897222089004, "loss": 5.9437, "mean_token_accuracy": 0.1424303874373436, "num_tokens": 6034239.0, "step": 3270 }, { "entropy": 6.0732769012451175, "epoch": 0.27515227893299726, "grad_norm": 1.109375, "learning_rate": 0.0004995879125816772, "loss": 5.9769, "mean_token_accuracy": 0.13496886044740677, "num_tokens": 6043837.0, "step": 3275 }, { "entropy": 5.846703004837036, "epoch": 0.27557235874816216, "grad_norm": 1.0703125, "learning_rate": 0.0004995860989759949, "loss": 5.9195, "mean_token_accuracy": 0.14464289993047713, "num_tokens": 6053217.0, "step": 3280 }, { "entropy": 6.013448190689087, "epoch": 0.27599243856332706, "grad_norm": 1.203125, "learning_rate": 0.0004995842813918855, "loss": 5.9292, "mean_token_accuracy": 0.141995108127594, "num_tokens": 6061553.0, "step": 3285 }, { "entropy": 5.900888109207154, "epoch": 0.2764125183784919, "grad_norm": 1.328125, "learning_rate": 0.0004995824598293812, "loss": 5.8195, "mean_token_accuracy": 0.14140584841370582, "num_tokens": 6070080.0, "step": 3290 }, { "entropy": 5.960742044448852, "epoch": 0.2768325981936568, "grad_norm": 1.25, "learning_rate": 0.0004995806342885142, "loss": 5.9653, "mean_token_accuracy": 0.14112535640597343, "num_tokens": 6078438.0, "step": 3295 }, { "entropy": 5.993828868865966, "epoch": 0.2772526780088217, "grad_norm": 1.203125, "learning_rate": 0.000499578804769317, "loss": 5.9591, "mean_token_accuracy": 0.13578373640775682, "num_tokens": 6087794.0, "step": 3300 }, { "entropy": 6.002421045303345, "epoch": 0.27767275782398654, "grad_norm": 1.078125, "learning_rate": 0.0004995769712718218, "loss": 5.9684, "mean_token_accuracy": 0.13969296216964722, "num_tokens": 6096709.0, "step": 3305 }, { "entropy": 5.897220087051392, "epoch": 0.27809283763915144, "grad_norm": 1.2109375, "learning_rate": 0.0004995751337960613, "loss": 5.9029, "mean_token_accuracy": 0.14022547677159308, "num_tokens": 6105866.0, "step": 3310 }, { "entropy": 6.0129883766174315, "epoch": 0.27851291745431633, "grad_norm": 1.1640625, "learning_rate": 0.0004995732923420679, "loss": 5.8481, "mean_token_accuracy": 0.1446751207113266, "num_tokens": 6114882.0, "step": 3315 }, { "entropy": 5.85595121383667, "epoch": 0.2789329972694812, "grad_norm": 1.1953125, "learning_rate": 0.0004995714469098743, "loss": 5.8116, "mean_token_accuracy": 0.1394299313426018, "num_tokens": 6123978.0, "step": 3320 }, { "entropy": 5.841559362411499, "epoch": 0.2793530770846461, "grad_norm": 1.2109375, "learning_rate": 0.000499569597499513, "loss": 5.9636, "mean_token_accuracy": 0.1420348674058914, "num_tokens": 6133246.0, "step": 3325 }, { "entropy": 5.949489116668701, "epoch": 0.27977315689981097, "grad_norm": 1.0546875, "learning_rate": 0.0004995677441110172, "loss": 5.8295, "mean_token_accuracy": 0.1390916422009468, "num_tokens": 6142865.0, "step": 3330 }, { "entropy": 5.958270597457886, "epoch": 0.28019323671497587, "grad_norm": 1.1484375, "learning_rate": 0.0004995658867444192, "loss": 5.9185, "mean_token_accuracy": 0.13403844311833382, "num_tokens": 6152492.0, "step": 3335 }, { "entropy": 5.9247795104980465, "epoch": 0.2806133165301407, "grad_norm": 1.203125, "learning_rate": 0.0004995640253997523, "loss": 5.9182, "mean_token_accuracy": 0.13453099131584167, "num_tokens": 6161953.0, "step": 3340 }, { "entropy": 5.869791507720947, "epoch": 0.2810333963453056, "grad_norm": 1.09375, "learning_rate": 0.0004995621600770492, "loss": 5.7688, "mean_token_accuracy": 0.14543831422924997, "num_tokens": 6171467.0, "step": 3345 }, { "entropy": 5.874020433425903, "epoch": 0.2814534761604705, "grad_norm": 1.09375, "learning_rate": 0.0004995602907763431, "loss": 5.8552, "mean_token_accuracy": 0.1376187428832054, "num_tokens": 6180646.0, "step": 3350 }, { "entropy": 5.929383373260498, "epoch": 0.28187355597563535, "grad_norm": 1.203125, "learning_rate": 0.0004995584174976672, "loss": 5.8713, "mean_token_accuracy": 0.1346499465405941, "num_tokens": 6189832.0, "step": 3355 }, { "entropy": 5.927767086029053, "epoch": 0.28229363579080025, "grad_norm": 1.234375, "learning_rate": 0.0004995565402410544, "loss": 5.7654, "mean_token_accuracy": 0.14761848300695418, "num_tokens": 6198339.0, "step": 3360 }, { "entropy": 5.892761945724487, "epoch": 0.28271371560596514, "grad_norm": 1.359375, "learning_rate": 0.0004995546590065383, "loss": 5.8571, "mean_token_accuracy": 0.1422846756875515, "num_tokens": 6207564.0, "step": 3365 }, { "entropy": 5.9255454540252686, "epoch": 0.28313379542113004, "grad_norm": 1.2578125, "learning_rate": 0.0004995527737941518, "loss": 5.9347, "mean_token_accuracy": 0.13853738307952881, "num_tokens": 6216056.0, "step": 3370 }, { "entropy": 5.896576976776123, "epoch": 0.2835538752362949, "grad_norm": 1.203125, "learning_rate": 0.0004995508846039287, "loss": 5.8965, "mean_token_accuracy": 0.13626314997673034, "num_tokens": 6225573.0, "step": 3375 }, { "entropy": 6.006501722335815, "epoch": 0.2839739550514598, "grad_norm": 1.21875, "learning_rate": 0.0004995489914359023, "loss": 6.0096, "mean_token_accuracy": 0.13380790427327155, "num_tokens": 6235057.0, "step": 3380 }, { "entropy": 6.008948469161988, "epoch": 0.2843940348666247, "grad_norm": 1.6015625, "learning_rate": 0.0004995470942901061, "loss": 5.9217, "mean_token_accuracy": 0.13976462185382843, "num_tokens": 6244164.0, "step": 3385 }, { "entropy": 6.019155550003052, "epoch": 0.2848141146817895, "grad_norm": 1.8984375, "learning_rate": 0.0004995451931665738, "loss": 5.9344, "mean_token_accuracy": 0.13991687223315238, "num_tokens": 6253095.0, "step": 3390 }, { "entropy": 5.957593202590942, "epoch": 0.2852341944969544, "grad_norm": 1.15625, "learning_rate": 0.000499543288065339, "loss": 5.8639, "mean_token_accuracy": 0.13952580690383912, "num_tokens": 6261134.0, "step": 3395 }, { "entropy": 5.833978176116943, "epoch": 0.2856542743121193, "grad_norm": 1.265625, "learning_rate": 0.0004995413789864354, "loss": 5.879, "mean_token_accuracy": 0.14211501479148864, "num_tokens": 6270384.0, "step": 3400 }, { "entropy": 5.952889156341553, "epoch": 0.28607435412728416, "grad_norm": 1.1171875, "learning_rate": 0.0004995394659298971, "loss": 5.8153, "mean_token_accuracy": 0.14977512061595916, "num_tokens": 6279702.0, "step": 3405 }, { "entropy": 5.889900207519531, "epoch": 0.28649443394244906, "grad_norm": 1.1796875, "learning_rate": 0.0004995375488957576, "loss": 5.8612, "mean_token_accuracy": 0.13872572034597397, "num_tokens": 6288297.0, "step": 3410 }, { "entropy": 5.9331536293029785, "epoch": 0.28691451375761395, "grad_norm": 1.1328125, "learning_rate": 0.000499535627884051, "loss": 5.9516, "mean_token_accuracy": 0.13464640453457832, "num_tokens": 6297288.0, "step": 3415 }, { "entropy": 6.120673799514771, "epoch": 0.28733459357277885, "grad_norm": 1.1328125, "learning_rate": 0.0004995337028948115, "loss": 5.974, "mean_token_accuracy": 0.13250542506575586, "num_tokens": 6306719.0, "step": 3420 }, { "entropy": 5.837868595123291, "epoch": 0.2877546733879437, "grad_norm": 1.2578125, "learning_rate": 0.0004995317739280731, "loss": 5.8018, "mean_token_accuracy": 0.14693671017885207, "num_tokens": 6316639.0, "step": 3425 }, { "entropy": 5.928999614715576, "epoch": 0.2881747532031086, "grad_norm": 1.1875, "learning_rate": 0.0004995298409838699, "loss": 5.9251, "mean_token_accuracy": 0.1393180750310421, "num_tokens": 6326879.0, "step": 3430 }, { "entropy": 5.877130842208862, "epoch": 0.2885948330182735, "grad_norm": 1.0859375, "learning_rate": 0.000499527904062236, "loss": 5.8293, "mean_token_accuracy": 0.14481945633888244, "num_tokens": 6335729.0, "step": 3435 }, { "entropy": 5.915560340881347, "epoch": 0.28901491283343833, "grad_norm": 1.2578125, "learning_rate": 0.0004995259631632061, "loss": 5.8973, "mean_token_accuracy": 0.13230996280908586, "num_tokens": 6345154.0, "step": 3440 }, { "entropy": 5.962369394302368, "epoch": 0.28943499264860323, "grad_norm": 1.15625, "learning_rate": 0.0004995240182868143, "loss": 5.8479, "mean_token_accuracy": 0.14117665588855743, "num_tokens": 6354309.0, "step": 3445 }, { "entropy": 5.844361209869385, "epoch": 0.2898550724637681, "grad_norm": 1.0625, "learning_rate": 0.0004995220694330951, "loss": 5.8255, "mean_token_accuracy": 0.14228973686695098, "num_tokens": 6363389.0, "step": 3450 }, { "entropy": 5.88813967704773, "epoch": 0.290275152278933, "grad_norm": 1.1015625, "learning_rate": 0.0004995201166020832, "loss": 5.8844, "mean_token_accuracy": 0.13614363521337508, "num_tokens": 6372475.0, "step": 3455 }, { "entropy": 5.972552013397217, "epoch": 0.29069523209409787, "grad_norm": 1.2109375, "learning_rate": 0.000499518159793813, "loss": 5.8329, "mean_token_accuracy": 0.14460824504494668, "num_tokens": 6380906.0, "step": 3460 }, { "entropy": 5.866478300094604, "epoch": 0.29111531190926276, "grad_norm": 1.1875, "learning_rate": 0.000499516199008319, "loss": 5.8563, "mean_token_accuracy": 0.14013876989483834, "num_tokens": 6390085.0, "step": 3465 }, { "entropy": 5.941966962814331, "epoch": 0.29153539172442766, "grad_norm": 1.21875, "learning_rate": 0.0004995142342456364, "loss": 5.9125, "mean_token_accuracy": 0.13554606810212136, "num_tokens": 6399441.0, "step": 3470 }, { "entropy": 6.016114854812622, "epoch": 0.2919554715395925, "grad_norm": 1.1484375, "learning_rate": 0.0004995122655057997, "loss": 5.9876, "mean_token_accuracy": 0.13846278935670853, "num_tokens": 6408995.0, "step": 3475 }, { "entropy": 5.820221567153931, "epoch": 0.2923755513547574, "grad_norm": 1.1796875, "learning_rate": 0.0004995102927888437, "loss": 5.7443, "mean_token_accuracy": 0.1459552101790905, "num_tokens": 6418080.0, "step": 3480 }, { "entropy": 5.911068105697632, "epoch": 0.2927956311699223, "grad_norm": 1.3359375, "learning_rate": 0.0004995083160948036, "loss": 5.9075, "mean_token_accuracy": 0.13615152686834336, "num_tokens": 6426732.0, "step": 3485 }, { "entropy": 5.973341417312622, "epoch": 0.29321571098508714, "grad_norm": 1.125, "learning_rate": 0.0004995063354237141, "loss": 5.9199, "mean_token_accuracy": 0.14315774142742158, "num_tokens": 6435957.0, "step": 3490 }, { "entropy": 5.914236402511596, "epoch": 0.29363579080025204, "grad_norm": 1.390625, "learning_rate": 0.0004995043507756107, "loss": 5.864, "mean_token_accuracy": 0.13712269440293312, "num_tokens": 6445642.0, "step": 3495 }, { "entropy": 5.931887197494507, "epoch": 0.29405587061541694, "grad_norm": 1.328125, "learning_rate": 0.0004995023621505282, "loss": 5.906, "mean_token_accuracy": 0.1387566529214382, "num_tokens": 6454664.0, "step": 3500 }, { "entropy": 5.8483740329742435, "epoch": 0.29447595043058183, "grad_norm": 1.234375, "learning_rate": 0.000499500369548502, "loss": 5.8204, "mean_token_accuracy": 0.13878127932548523, "num_tokens": 6463224.0, "step": 3505 }, { "entropy": 6.101959562301635, "epoch": 0.2948960302457467, "grad_norm": 1.1015625, "learning_rate": 0.0004994983729695674, "loss": 6.0552, "mean_token_accuracy": 0.13270595893263817, "num_tokens": 6473112.0, "step": 3510 }, { "entropy": 5.9638348579406735, "epoch": 0.2953161100609116, "grad_norm": 1.375, "learning_rate": 0.0004994963724137595, "loss": 5.8923, "mean_token_accuracy": 0.14195917025208474, "num_tokens": 6482062.0, "step": 3515 }, { "entropy": 5.8792516708374025, "epoch": 0.29573618987607647, "grad_norm": 1.359375, "learning_rate": 0.0004994943678811142, "loss": 5.8699, "mean_token_accuracy": 0.13623269721865655, "num_tokens": 6490568.0, "step": 3520 }, { "entropy": 5.920031452178955, "epoch": 0.2961562696912413, "grad_norm": 1.09375, "learning_rate": 0.0004994923593716667, "loss": 5.9294, "mean_token_accuracy": 0.1400933049619198, "num_tokens": 6500815.0, "step": 3525 }, { "entropy": 5.929981470108032, "epoch": 0.2965763495064062, "grad_norm": 1.1328125, "learning_rate": 0.0004994903468854527, "loss": 5.8119, "mean_token_accuracy": 0.1481688842177391, "num_tokens": 6509529.0, "step": 3530 }, { "entropy": 5.864474868774414, "epoch": 0.2969964293215711, "grad_norm": 1.34375, "learning_rate": 0.0004994883304225077, "loss": 5.8729, "mean_token_accuracy": 0.13643766716122627, "num_tokens": 6517934.0, "step": 3535 }, { "entropy": 5.941485595703125, "epoch": 0.297416509136736, "grad_norm": 1.15625, "learning_rate": 0.0004994863099828675, "loss": 5.8357, "mean_token_accuracy": 0.13737112134695054, "num_tokens": 6526098.0, "step": 3540 }, { "entropy": 5.857544040679931, "epoch": 0.29783658895190085, "grad_norm": 1.1953125, "learning_rate": 0.000499484285566568, "loss": 5.8676, "mean_token_accuracy": 0.13825918808579446, "num_tokens": 6535831.0, "step": 3545 }, { "entropy": 5.862282800674438, "epoch": 0.29825666876706575, "grad_norm": 1.125, "learning_rate": 0.0004994822571736449, "loss": 5.7848, "mean_token_accuracy": 0.138202403485775, "num_tokens": 6545704.0, "step": 3550 }, { "entropy": 5.923550367355347, "epoch": 0.29867674858223064, "grad_norm": 1.2265625, "learning_rate": 0.0004994802248041342, "loss": 5.8178, "mean_token_accuracy": 0.14043487086892129, "num_tokens": 6554423.0, "step": 3555 }, { "entropy": 5.915227842330933, "epoch": 0.2990968283973955, "grad_norm": 1.2421875, "learning_rate": 0.000499478188458072, "loss": 5.8699, "mean_token_accuracy": 0.14151394963264466, "num_tokens": 6563989.0, "step": 3560 }, { "entropy": 5.978702878952026, "epoch": 0.2995169082125604, "grad_norm": 1.4296875, "learning_rate": 0.0004994761481354943, "loss": 6.0168, "mean_token_accuracy": 0.1372433789074421, "num_tokens": 6572745.0, "step": 3565 }, { "entropy": 6.085881900787354, "epoch": 0.2999369880277253, "grad_norm": 1.171875, "learning_rate": 0.0004994741038364371, "loss": 6.0023, "mean_token_accuracy": 0.1347330242395401, "num_tokens": 6581723.0, "step": 3570 }, { "entropy": 5.837055253982544, "epoch": 0.3003570678428901, "grad_norm": 1.3046875, "learning_rate": 0.0004994720555609369, "loss": 5.7255, "mean_token_accuracy": 0.14691844433546067, "num_tokens": 6590342.0, "step": 3575 }, { "entropy": 5.793679332733154, "epoch": 0.300777147658055, "grad_norm": 1.34375, "learning_rate": 0.0004994700033090297, "loss": 5.7828, "mean_token_accuracy": 0.14932364374399185, "num_tokens": 6599206.0, "step": 3580 }, { "entropy": 6.014719247817993, "epoch": 0.3011972274732199, "grad_norm": 1.2578125, "learning_rate": 0.000499467947080752, "loss": 6.1117, "mean_token_accuracy": 0.1273707590997219, "num_tokens": 6608947.0, "step": 3585 }, { "entropy": 5.987322616577148, "epoch": 0.3016173072883848, "grad_norm": 1.265625, "learning_rate": 0.0004994658868761402, "loss": 5.8883, "mean_token_accuracy": 0.14657592847943307, "num_tokens": 6618378.0, "step": 3590 }, { "entropy": 5.946252870559692, "epoch": 0.30203738710354966, "grad_norm": 1.203125, "learning_rate": 0.0004994638226952307, "loss": 5.9383, "mean_token_accuracy": 0.1343943029642105, "num_tokens": 6627527.0, "step": 3595 }, { "entropy": 5.973345470428467, "epoch": 0.30245746691871456, "grad_norm": 1.2265625, "learning_rate": 0.0004994617545380604, "loss": 5.8799, "mean_token_accuracy": 0.1394343391060829, "num_tokens": 6636964.0, "step": 3600 }, { "entropy": 5.846103811264038, "epoch": 0.30287754673387945, "grad_norm": 1.2265625, "learning_rate": 0.0004994596824046656, "loss": 5.8373, "mean_token_accuracy": 0.14053009524941446, "num_tokens": 6646074.0, "step": 3605 }, { "entropy": 5.920796251296997, "epoch": 0.3032976265490443, "grad_norm": 1.15625, "learning_rate": 0.000499457606295083, "loss": 5.897, "mean_token_accuracy": 0.14094351455569268, "num_tokens": 6655027.0, "step": 3610 }, { "entropy": 5.804931497573852, "epoch": 0.3037177063642092, "grad_norm": 1.390625, "learning_rate": 0.0004994555262093495, "loss": 5.6923, "mean_token_accuracy": 0.1544790118932724, "num_tokens": 6663747.0, "step": 3615 }, { "entropy": 6.0041478157043455, "epoch": 0.3041377861793741, "grad_norm": 1.28125, "learning_rate": 0.000499453442147502, "loss": 6.0113, "mean_token_accuracy": 0.13213684484362603, "num_tokens": 6672922.0, "step": 3620 }, { "entropy": 5.917838859558105, "epoch": 0.304557865994539, "grad_norm": 1.234375, "learning_rate": 0.0004994513541095773, "loss": 5.8314, "mean_token_accuracy": 0.14857635647058487, "num_tokens": 6682233.0, "step": 3625 }, { "entropy": 5.912606573104858, "epoch": 0.30497794580970383, "grad_norm": 1.3515625, "learning_rate": 0.0004994492620956126, "loss": 5.8901, "mean_token_accuracy": 0.1396655946969986, "num_tokens": 6691593.0, "step": 3630 }, { "entropy": 5.919918155670166, "epoch": 0.30539802562486873, "grad_norm": 1.1328125, "learning_rate": 0.0004994471661056445, "loss": 5.8861, "mean_token_accuracy": 0.14072583466768265, "num_tokens": 6701318.0, "step": 3635 }, { "entropy": 6.009689807891846, "epoch": 0.3058181054400336, "grad_norm": 1.125, "learning_rate": 0.0004994450661397106, "loss": 5.892, "mean_token_accuracy": 0.1426208183169365, "num_tokens": 6710059.0, "step": 3640 }, { "entropy": 6.047525215148926, "epoch": 0.30623818525519847, "grad_norm": 1.171875, "learning_rate": 0.000499442962197848, "loss": 5.975, "mean_token_accuracy": 0.13458139076828957, "num_tokens": 6719811.0, "step": 3645 }, { "entropy": 5.868422555923462, "epoch": 0.30665826507036337, "grad_norm": 1.2421875, "learning_rate": 0.0004994408542800937, "loss": 5.8541, "mean_token_accuracy": 0.14217756688594818, "num_tokens": 6728789.0, "step": 3650 }, { "entropy": 5.868635082244873, "epoch": 0.30707834488552826, "grad_norm": 1.171875, "learning_rate": 0.0004994387423864855, "loss": 5.8459, "mean_token_accuracy": 0.14152047485113145, "num_tokens": 6737706.0, "step": 3655 }, { "entropy": 5.855863761901856, "epoch": 0.3074984247006931, "grad_norm": 1.1875, "learning_rate": 0.0004994366265170603, "loss": 5.7885, "mean_token_accuracy": 0.15381008386611938, "num_tokens": 6746861.0, "step": 3660 }, { "entropy": 5.958421134948731, "epoch": 0.307918504515858, "grad_norm": 1.3828125, "learning_rate": 0.0004994345066718558, "loss": 5.9914, "mean_token_accuracy": 0.13055020123720168, "num_tokens": 6755242.0, "step": 3665 }, { "entropy": 6.003214979171753, "epoch": 0.3083385843310229, "grad_norm": 1.1640625, "learning_rate": 0.0004994323828509098, "loss": 5.9324, "mean_token_accuracy": 0.13040165677666665, "num_tokens": 6764549.0, "step": 3670 }, { "entropy": 5.916761636734009, "epoch": 0.3087586641461878, "grad_norm": 1.3125, "learning_rate": 0.0004994302550542596, "loss": 5.9094, "mean_token_accuracy": 0.145879103243351, "num_tokens": 6774123.0, "step": 3675 }, { "entropy": 5.818877267837524, "epoch": 0.30917874396135264, "grad_norm": 1.21875, "learning_rate": 0.000499428123281943, "loss": 5.6883, "mean_token_accuracy": 0.14885310381650924, "num_tokens": 6782922.0, "step": 3680 }, { "entropy": 5.8812672138214115, "epoch": 0.30959882377651754, "grad_norm": 1.125, "learning_rate": 0.0004994259875339978, "loss": 5.9311, "mean_token_accuracy": 0.13911449760198594, "num_tokens": 6792042.0, "step": 3685 }, { "entropy": 6.0224377632141115, "epoch": 0.31001890359168244, "grad_norm": 1.234375, "learning_rate": 0.0004994238478104617, "loss": 5.9298, "mean_token_accuracy": 0.13841390311717988, "num_tokens": 6800994.0, "step": 3690 }, { "entropy": 5.922906446456909, "epoch": 0.3104389834068473, "grad_norm": 1.1875, "learning_rate": 0.0004994217041113727, "loss": 5.8716, "mean_token_accuracy": 0.14270309880375862, "num_tokens": 6809938.0, "step": 3695 }, { "entropy": 5.951845121383667, "epoch": 0.3108590632220122, "grad_norm": 1.046875, "learning_rate": 0.0004994195564367688, "loss": 5.9849, "mean_token_accuracy": 0.1360231176018715, "num_tokens": 6820289.0, "step": 3700 }, { "entropy": 5.994351577758789, "epoch": 0.3112791430371771, "grad_norm": 1.2734375, "learning_rate": 0.0004994174047866882, "loss": 5.8235, "mean_token_accuracy": 0.14149386510252954, "num_tokens": 6830068.0, "step": 3705 }, { "entropy": 5.771749830245971, "epoch": 0.3116992228523419, "grad_norm": 1.34375, "learning_rate": 0.0004994152491611686, "loss": 5.8521, "mean_token_accuracy": 0.1437979094684124, "num_tokens": 6838591.0, "step": 3710 }, { "entropy": 5.865754890441894, "epoch": 0.3121193026675068, "grad_norm": 1.15625, "learning_rate": 0.0004994130895602485, "loss": 5.8204, "mean_token_accuracy": 0.13915161341428756, "num_tokens": 6847796.0, "step": 3715 }, { "entropy": 6.016102695465088, "epoch": 0.3125393824826717, "grad_norm": 1.2734375, "learning_rate": 0.000499410925983966, "loss": 5.9097, "mean_token_accuracy": 0.14341016113758087, "num_tokens": 6856585.0, "step": 3720 }, { "entropy": 5.82035460472107, "epoch": 0.3129594622978366, "grad_norm": 1.3515625, "learning_rate": 0.0004994087584323596, "loss": 5.8224, "mean_token_accuracy": 0.1492151916027069, "num_tokens": 6865757.0, "step": 3725 }, { "entropy": 5.874684762954712, "epoch": 0.31337954211300145, "grad_norm": 1.125, "learning_rate": 0.0004994065869054676, "loss": 5.8703, "mean_token_accuracy": 0.13963879272341728, "num_tokens": 6875371.0, "step": 3730 }, { "entropy": 5.973430156707764, "epoch": 0.31379962192816635, "grad_norm": 1.265625, "learning_rate": 0.0004994044114033283, "loss": 5.9223, "mean_token_accuracy": 0.1317138932645321, "num_tokens": 6884050.0, "step": 3735 }, { "entropy": 5.9677238941192625, "epoch": 0.31421970174333125, "grad_norm": 1.28125, "learning_rate": 0.0004994022319259806, "loss": 5.8772, "mean_token_accuracy": 0.14338692352175714, "num_tokens": 6893079.0, "step": 3740 }, { "entropy": 5.878354215621949, "epoch": 0.3146397815584961, "grad_norm": 1.1875, "learning_rate": 0.0004994000484734629, "loss": 5.9909, "mean_token_accuracy": 0.14075467139482498, "num_tokens": 6903100.0, "step": 3745 }, { "entropy": 5.928855514526367, "epoch": 0.315059861373661, "grad_norm": 1.078125, "learning_rate": 0.0004993978610458137, "loss": 5.837, "mean_token_accuracy": 0.14158818423748015, "num_tokens": 6912164.0, "step": 3750 }, { "entropy": 5.849039506912232, "epoch": 0.3154799411888259, "grad_norm": 1.25, "learning_rate": 0.0004993956696430721, "loss": 5.8489, "mean_token_accuracy": 0.13852613866329194, "num_tokens": 6921183.0, "step": 3755 }, { "entropy": 5.947899580001831, "epoch": 0.3159000210039908, "grad_norm": 1.15625, "learning_rate": 0.0004993934742652768, "loss": 5.918, "mean_token_accuracy": 0.13974586948752404, "num_tokens": 6931325.0, "step": 3760 }, { "entropy": 5.930049276351928, "epoch": 0.3163201008191556, "grad_norm": 1.2578125, "learning_rate": 0.0004993912749124665, "loss": 5.8111, "mean_token_accuracy": 0.14738105684518815, "num_tokens": 6940234.0, "step": 3765 }, { "entropy": 5.850810146331787, "epoch": 0.3167401806343205, "grad_norm": 1.3125, "learning_rate": 0.0004993890715846804, "loss": 5.9136, "mean_token_accuracy": 0.1437673717737198, "num_tokens": 6949067.0, "step": 3770 }, { "entropy": 5.959416913986206, "epoch": 0.3171602604494854, "grad_norm": 1.1328125, "learning_rate": 0.0004993868642819574, "loss": 5.8851, "mean_token_accuracy": 0.13802511468529702, "num_tokens": 6959085.0, "step": 3775 }, { "entropy": 5.905188941955567, "epoch": 0.31758034026465026, "grad_norm": 1.2421875, "learning_rate": 0.0004993846530043367, "loss": 5.9143, "mean_token_accuracy": 0.1347965881228447, "num_tokens": 6967392.0, "step": 3780 }, { "entropy": 5.903332805633545, "epoch": 0.31800042007981516, "grad_norm": 1.1953125, "learning_rate": 0.0004993824377518574, "loss": 5.8461, "mean_token_accuracy": 0.1488291099667549, "num_tokens": 6976369.0, "step": 3785 }, { "entropy": 5.983600234985351, "epoch": 0.31842049989498006, "grad_norm": 1.1640625, "learning_rate": 0.0004993802185245587, "loss": 5.8623, "mean_token_accuracy": 0.14806569889187812, "num_tokens": 6985889.0, "step": 3790 }, { "entropy": 5.85070834159851, "epoch": 0.3188405797101449, "grad_norm": 1.140625, "learning_rate": 0.00049937799532248, "loss": 5.8915, "mean_token_accuracy": 0.1322341412305832, "num_tokens": 6995396.0, "step": 3795 }, { "entropy": 6.083252477645874, "epoch": 0.3192606595253098, "grad_norm": 1.1875, "learning_rate": 0.0004993757681456607, "loss": 5.94, "mean_token_accuracy": 0.13651170060038567, "num_tokens": 7004666.0, "step": 3800 }, { "entropy": 5.98352370262146, "epoch": 0.3196807393404747, "grad_norm": 1.15625, "learning_rate": 0.0004993735369941401, "loss": 5.9741, "mean_token_accuracy": 0.13378495275974273, "num_tokens": 7014608.0, "step": 3805 }, { "entropy": 5.872314596176148, "epoch": 0.3201008191556396, "grad_norm": 1.1484375, "learning_rate": 0.0004993713018679579, "loss": 5.8367, "mean_token_accuracy": 0.13734248280525208, "num_tokens": 7023671.0, "step": 3810 }, { "entropy": 5.887682008743286, "epoch": 0.32052089897080444, "grad_norm": 1.125, "learning_rate": 0.0004993690627671536, "loss": 5.8912, "mean_token_accuracy": 0.13631067648530007, "num_tokens": 7033786.0, "step": 3815 }, { "entropy": 5.925025987625122, "epoch": 0.32094097878596933, "grad_norm": 1.2734375, "learning_rate": 0.0004993668196917669, "loss": 5.8026, "mean_token_accuracy": 0.14422772228717803, "num_tokens": 7042162.0, "step": 3820 }, { "entropy": 5.993115663528442, "epoch": 0.32136105860113423, "grad_norm": 1.1953125, "learning_rate": 0.0004993645726418375, "loss": 5.946, "mean_token_accuracy": 0.13835487216711045, "num_tokens": 7051903.0, "step": 3825 }, { "entropy": 5.828875875473022, "epoch": 0.3217811384162991, "grad_norm": 1.1953125, "learning_rate": 0.0004993623216174053, "loss": 5.7755, "mean_token_accuracy": 0.15191829651594163, "num_tokens": 7060229.0, "step": 3830 }, { "entropy": 5.8507789134979244, "epoch": 0.32220121823146397, "grad_norm": 1.21875, "learning_rate": 0.00049936006661851, "loss": 5.8708, "mean_token_accuracy": 0.13942239433526993, "num_tokens": 7069040.0, "step": 3835 }, { "entropy": 5.9059672355651855, "epoch": 0.32262129804662887, "grad_norm": 1.3359375, "learning_rate": 0.0004993578076451917, "loss": 5.755, "mean_token_accuracy": 0.1418701082468033, "num_tokens": 7078409.0, "step": 3840 }, { "entropy": 5.73867621421814, "epoch": 0.32304137786179377, "grad_norm": 1.1953125, "learning_rate": 0.0004993555446974903, "loss": 5.841, "mean_token_accuracy": 0.13865280598402024, "num_tokens": 7087983.0, "step": 3845 }, { "entropy": 5.833849382400513, "epoch": 0.3234614576769586, "grad_norm": 1.515625, "learning_rate": 0.000499353277775446, "loss": 5.7975, "mean_token_accuracy": 0.14864109456539154, "num_tokens": 7097277.0, "step": 3850 }, { "entropy": 5.906425857543946, "epoch": 0.3238815374921235, "grad_norm": 1.2890625, "learning_rate": 0.0004993510068790989, "loss": 5.6902, "mean_token_accuracy": 0.15845565646886825, "num_tokens": 7105918.0, "step": 3855 }, { "entropy": 5.729353284835815, "epoch": 0.3243016173072884, "grad_norm": 1.2109375, "learning_rate": 0.0004993487320084892, "loss": 5.7663, "mean_token_accuracy": 0.15123388469219207, "num_tokens": 7115049.0, "step": 3860 }, { "entropy": 5.881364727020264, "epoch": 0.32472169712245325, "grad_norm": 1.1328125, "learning_rate": 0.0004993464531636573, "loss": 5.848, "mean_token_accuracy": 0.1397578552365303, "num_tokens": 7124862.0, "step": 3865 }, { "entropy": 5.853383731842041, "epoch": 0.32514177693761814, "grad_norm": 1.1875, "learning_rate": 0.0004993441703446435, "loss": 5.7449, "mean_token_accuracy": 0.14876592308282852, "num_tokens": 7133280.0, "step": 3870 }, { "entropy": 5.926945161819458, "epoch": 0.32556185675278304, "grad_norm": 1.2109375, "learning_rate": 0.0004993418835514882, "loss": 5.9468, "mean_token_accuracy": 0.13614385947585106, "num_tokens": 7142446.0, "step": 3875 }, { "entropy": 5.948226881027222, "epoch": 0.3259819365679479, "grad_norm": 1.046875, "learning_rate": 0.0004993395927842321, "loss": 5.8437, "mean_token_accuracy": 0.1408906787633896, "num_tokens": 7152143.0, "step": 3880 }, { "entropy": 5.944639015197754, "epoch": 0.3264020163831128, "grad_norm": 1.265625, "learning_rate": 0.0004993372980429155, "loss": 5.9314, "mean_token_accuracy": 0.13912539780139924, "num_tokens": 7162046.0, "step": 3885 }, { "entropy": 5.925892972946167, "epoch": 0.3268220961982777, "grad_norm": 1.171875, "learning_rate": 0.0004993349993275792, "loss": 5.8046, "mean_token_accuracy": 0.141475647687912, "num_tokens": 7171557.0, "step": 3890 }, { "entropy": 5.751782655715942, "epoch": 0.3272421760134426, "grad_norm": 1.2265625, "learning_rate": 0.0004993326966382639, "loss": 5.7152, "mean_token_accuracy": 0.14672790616750717, "num_tokens": 7180927.0, "step": 3895 }, { "entropy": 5.833934020996094, "epoch": 0.3276622558286074, "grad_norm": 1.3984375, "learning_rate": 0.0004993303899750104, "loss": 5.784, "mean_token_accuracy": 0.14524296969175338, "num_tokens": 7189552.0, "step": 3900 }, { "entropy": 5.929954242706299, "epoch": 0.3280823356437723, "grad_norm": 1.65625, "learning_rate": 0.0004993280793378595, "loss": 5.8197, "mean_token_accuracy": 0.14072833955287933, "num_tokens": 7197857.0, "step": 3905 }, { "entropy": 5.886449480056763, "epoch": 0.3285024154589372, "grad_norm": 1.2578125, "learning_rate": 0.0004993257647268522, "loss": 5.7906, "mean_token_accuracy": 0.15043871477246284, "num_tokens": 7206785.0, "step": 3910 }, { "entropy": 5.909309577941895, "epoch": 0.32892249527410206, "grad_norm": 1.1484375, "learning_rate": 0.0004993234461420295, "loss": 5.8845, "mean_token_accuracy": 0.13852151483297348, "num_tokens": 7216360.0, "step": 3915 }, { "entropy": 5.842478799819946, "epoch": 0.32934257508926695, "grad_norm": 1.4453125, "learning_rate": 0.0004993211235834326, "loss": 5.6839, "mean_token_accuracy": 0.16384934931993483, "num_tokens": 7224890.0, "step": 3920 }, { "entropy": 5.758263874053955, "epoch": 0.32976265490443185, "grad_norm": 1.2421875, "learning_rate": 0.0004993187970511023, "loss": 5.735, "mean_token_accuracy": 0.16684153228998183, "num_tokens": 7234442.0, "step": 3925 }, { "entropy": 5.873496627807617, "epoch": 0.33018273471959675, "grad_norm": 1.1953125, "learning_rate": 0.0004993164665450801, "loss": 5.8937, "mean_token_accuracy": 0.14423306286334991, "num_tokens": 7244023.0, "step": 3930 }, { "entropy": 5.877293682098388, "epoch": 0.3306028145347616, "grad_norm": 1.2578125, "learning_rate": 0.0004993141320654072, "loss": 5.7546, "mean_token_accuracy": 0.14760554879903792, "num_tokens": 7253548.0, "step": 3935 }, { "entropy": 5.862598562240601, "epoch": 0.3310228943499265, "grad_norm": 1.1875, "learning_rate": 0.000499311793612125, "loss": 5.8187, "mean_token_accuracy": 0.14365085512399672, "num_tokens": 7262962.0, "step": 3940 }, { "entropy": 5.9090534210205075, "epoch": 0.3314429741650914, "grad_norm": 1.1875, "learning_rate": 0.0004993094511852748, "loss": 5.8412, "mean_token_accuracy": 0.1450173959136009, "num_tokens": 7272234.0, "step": 3945 }, { "entropy": 5.899152231216431, "epoch": 0.33186305398025623, "grad_norm": 1.15625, "learning_rate": 0.0004993071047848983, "loss": 5.8162, "mean_token_accuracy": 0.1440458543598652, "num_tokens": 7281524.0, "step": 3950 }, { "entropy": 5.8174926280975345, "epoch": 0.3322831337954211, "grad_norm": 1.1875, "learning_rate": 0.0004993047544110368, "loss": 5.7104, "mean_token_accuracy": 0.1493962250649929, "num_tokens": 7289601.0, "step": 3955 }, { "entropy": 5.727307987213135, "epoch": 0.332703213610586, "grad_norm": 1.3671875, "learning_rate": 0.0004993024000637321, "loss": 5.6805, "mean_token_accuracy": 0.15279524326324462, "num_tokens": 7298508.0, "step": 3960 }, { "entropy": 5.863152980804443, "epoch": 0.33312329342575087, "grad_norm": 1.1484375, "learning_rate": 0.0004993000417430259, "loss": 5.899, "mean_token_accuracy": 0.14116615280508996, "num_tokens": 7309065.0, "step": 3965 }, { "entropy": 5.987276601791382, "epoch": 0.33354337324091576, "grad_norm": 1.140625, "learning_rate": 0.00049929767944896, "loss": 5.9279, "mean_token_accuracy": 0.14180676117539406, "num_tokens": 7319669.0, "step": 3970 }, { "entropy": 5.944264316558838, "epoch": 0.33396345305608066, "grad_norm": 1.34375, "learning_rate": 0.0004992953131815761, "loss": 5.8675, "mean_token_accuracy": 0.14308214634656907, "num_tokens": 7328425.0, "step": 3975 }, { "entropy": 5.837700128555298, "epoch": 0.33438353287124556, "grad_norm": 1.3359375, "learning_rate": 0.0004992929429409164, "loss": 5.7527, "mean_token_accuracy": 0.1491951271891594, "num_tokens": 7337369.0, "step": 3980 }, { "entropy": 5.8214014053344725, "epoch": 0.3348036126864104, "grad_norm": 1.1796875, "learning_rate": 0.0004992905687270225, "loss": 5.8183, "mean_token_accuracy": 0.1459653303027153, "num_tokens": 7346829.0, "step": 3985 }, { "entropy": 5.947172021865844, "epoch": 0.3352236925015753, "grad_norm": 1.359375, "learning_rate": 0.0004992881905399368, "loss": 5.8631, "mean_token_accuracy": 0.14260231107473373, "num_tokens": 7355976.0, "step": 3990 }, { "entropy": 5.880902576446533, "epoch": 0.3356437723167402, "grad_norm": 1.3125, "learning_rate": 0.0004992858083797013, "loss": 5.8357, "mean_token_accuracy": 0.13988892063498498, "num_tokens": 7365210.0, "step": 3995 }, { "entropy": 5.887827444076538, "epoch": 0.33606385213190504, "grad_norm": 1.2734375, "learning_rate": 0.0004992834222463581, "loss": 5.8916, "mean_token_accuracy": 0.1366453118622303, "num_tokens": 7374175.0, "step": 4000 }, { "entropy": 5.9429937362670895, "epoch": 0.33648393194706994, "grad_norm": 1.328125, "learning_rate": 0.0004992810321399496, "loss": 5.9143, "mean_token_accuracy": 0.13915260806679725, "num_tokens": 7383302.0, "step": 4005 }, { "entropy": 5.919287061691284, "epoch": 0.33690401176223483, "grad_norm": 1.234375, "learning_rate": 0.0004992786380605182, "loss": 5.8924, "mean_token_accuracy": 0.1387575164437294, "num_tokens": 7392746.0, "step": 4010 }, { "entropy": 5.856390810012817, "epoch": 0.33732409157739973, "grad_norm": 1.328125, "learning_rate": 0.0004992762400081062, "loss": 5.7403, "mean_token_accuracy": 0.1431375488638878, "num_tokens": 7401604.0, "step": 4015 }, { "entropy": 5.85037088394165, "epoch": 0.3377441713925646, "grad_norm": 1.265625, "learning_rate": 0.0004992738379827559, "loss": 5.8333, "mean_token_accuracy": 0.14066973105072975, "num_tokens": 7410594.0, "step": 4020 }, { "entropy": 5.8780660152435305, "epoch": 0.33816425120772947, "grad_norm": 1.2578125, "learning_rate": 0.0004992714319845101, "loss": 5.7395, "mean_token_accuracy": 0.15344746708869933, "num_tokens": 7418831.0, "step": 4025 }, { "entropy": 5.772813749313355, "epoch": 0.33858433102289437, "grad_norm": 1.2265625, "learning_rate": 0.0004992690220134116, "loss": 5.7892, "mean_token_accuracy": 0.1462782494723797, "num_tokens": 7427731.0, "step": 4030 }, { "entropy": 5.971381282806396, "epoch": 0.3390044108380592, "grad_norm": 1.3125, "learning_rate": 0.0004992666080695027, "loss": 5.9117, "mean_token_accuracy": 0.13779002502560617, "num_tokens": 7436447.0, "step": 4035 }, { "entropy": 5.921543741226197, "epoch": 0.3394244906532241, "grad_norm": 1.2734375, "learning_rate": 0.0004992641901528262, "loss": 5.811, "mean_token_accuracy": 0.14552520364522933, "num_tokens": 7445352.0, "step": 4040 }, { "entropy": 5.886599731445313, "epoch": 0.339844570468389, "grad_norm": 1.1640625, "learning_rate": 0.0004992617682634252, "loss": 5.8513, "mean_token_accuracy": 0.14520843252539634, "num_tokens": 7454298.0, "step": 4045 }, { "entropy": 5.877497959136963, "epoch": 0.34026465028355385, "grad_norm": 1.2265625, "learning_rate": 0.0004992593424013424, "loss": 5.8719, "mean_token_accuracy": 0.14371060207486153, "num_tokens": 7463543.0, "step": 4050 }, { "entropy": 5.891790342330933, "epoch": 0.34068473009871875, "grad_norm": 1.390625, "learning_rate": 0.0004992569125666209, "loss": 5.8933, "mean_token_accuracy": 0.1396215297281742, "num_tokens": 7472701.0, "step": 4055 }, { "entropy": 6.004440689086914, "epoch": 0.34110480991388364, "grad_norm": 1.0859375, "learning_rate": 0.0004992544787593037, "loss": 5.8639, "mean_token_accuracy": 0.13958193585276604, "num_tokens": 7481123.0, "step": 4060 }, { "entropy": 5.980106163024902, "epoch": 0.34152488972904854, "grad_norm": 1.1796875, "learning_rate": 0.0004992520409794338, "loss": 5.9379, "mean_token_accuracy": 0.1414543256163597, "num_tokens": 7490439.0, "step": 4065 }, { "entropy": 5.900824117660522, "epoch": 0.3419449695442134, "grad_norm": 1.09375, "learning_rate": 0.0004992495992270544, "loss": 5.7958, "mean_token_accuracy": 0.14631813019514084, "num_tokens": 7499326.0, "step": 4070 }, { "entropy": 5.891779184341431, "epoch": 0.3423650493593783, "grad_norm": 1.2109375, "learning_rate": 0.0004992471535022089, "loss": 5.8633, "mean_token_accuracy": 0.14064768627285956, "num_tokens": 7509407.0, "step": 4075 }, { "entropy": 5.933660221099854, "epoch": 0.3427851291745432, "grad_norm": 1.0859375, "learning_rate": 0.0004992447038049405, "loss": 5.9161, "mean_token_accuracy": 0.14266983717679976, "num_tokens": 7518443.0, "step": 4080 }, { "entropy": 5.83581509590149, "epoch": 0.343205208989708, "grad_norm": 1.3359375, "learning_rate": 0.0004992422501352927, "loss": 5.7669, "mean_token_accuracy": 0.14985841438174247, "num_tokens": 7527609.0, "step": 4085 }, { "entropy": 5.924132442474365, "epoch": 0.3436252888048729, "grad_norm": 1.21875, "learning_rate": 0.0004992397924933089, "loss": 5.8592, "mean_token_accuracy": 0.13954362720251084, "num_tokens": 7536890.0, "step": 4090 }, { "entropy": 5.917565202713012, "epoch": 0.3440453686200378, "grad_norm": 1.2265625, "learning_rate": 0.0004992373308790325, "loss": 5.811, "mean_token_accuracy": 0.14937977269291877, "num_tokens": 7546509.0, "step": 4095 }, { "entropy": 5.8026703834533695, "epoch": 0.3444654484352027, "grad_norm": 1.265625, "learning_rate": 0.0004992348652925074, "loss": 5.8363, "mean_token_accuracy": 0.14076031297445296, "num_tokens": 7555336.0, "step": 4100 }, { "entropy": 5.921528148651123, "epoch": 0.34488552825036756, "grad_norm": 1.2734375, "learning_rate": 0.0004992323957337771, "loss": 5.7996, "mean_token_accuracy": 0.14651144444942474, "num_tokens": 7565210.0, "step": 4105 }, { "entropy": 5.882741546630859, "epoch": 0.34530560806553245, "grad_norm": 1.140625, "learning_rate": 0.0004992299222028855, "loss": 5.892, "mean_token_accuracy": 0.1487639456987381, "num_tokens": 7574516.0, "step": 4110 }, { "entropy": 5.829581546783447, "epoch": 0.34572568788069735, "grad_norm": 1.1171875, "learning_rate": 0.0004992274446998761, "loss": 5.7261, "mean_token_accuracy": 0.15058319717645646, "num_tokens": 7583219.0, "step": 4115 }, { "entropy": 5.973264932632446, "epoch": 0.3461457676958622, "grad_norm": 1.171875, "learning_rate": 0.0004992249632247929, "loss": 5.9592, "mean_token_accuracy": 0.13624709472060204, "num_tokens": 7592050.0, "step": 4120 }, { "entropy": 5.926892328262329, "epoch": 0.3465658475110271, "grad_norm": 1.2109375, "learning_rate": 0.0004992224777776802, "loss": 5.7982, "mean_token_accuracy": 0.14514639526605605, "num_tokens": 7600718.0, "step": 4125 }, { "entropy": 5.865804290771484, "epoch": 0.346985927326192, "grad_norm": 1.2734375, "learning_rate": 0.0004992199883585816, "loss": 5.8233, "mean_token_accuracy": 0.14894086718559266, "num_tokens": 7609191.0, "step": 4130 }, { "entropy": 5.919741106033325, "epoch": 0.34740600714135683, "grad_norm": 1.3671875, "learning_rate": 0.0004992174949675413, "loss": 5.8602, "mean_token_accuracy": 0.14337126538157463, "num_tokens": 7618509.0, "step": 4135 }, { "entropy": 5.859026050567627, "epoch": 0.34782608695652173, "grad_norm": 1.2890625, "learning_rate": 0.0004992149976046037, "loss": 5.7922, "mean_token_accuracy": 0.1422334760427475, "num_tokens": 7627851.0, "step": 4140 }, { "entropy": 5.82733941078186, "epoch": 0.3482461667716866, "grad_norm": 1.2734375, "learning_rate": 0.0004992124962698128, "loss": 5.8424, "mean_token_accuracy": 0.14138613119721413, "num_tokens": 7636748.0, "step": 4145 }, { "entropy": 5.9045960903167725, "epoch": 0.3486662465868515, "grad_norm": 1.296875, "learning_rate": 0.000499209990963213, "loss": 5.7782, "mean_token_accuracy": 0.14405173510313035, "num_tokens": 7645436.0, "step": 4150 }, { "entropy": 5.942763233184815, "epoch": 0.34908632640201637, "grad_norm": 2.21875, "learning_rate": 0.0004992074816848487, "loss": 5.912, "mean_token_accuracy": 0.1420220673084259, "num_tokens": 7655414.0, "step": 4155 }, { "entropy": 5.806369590759277, "epoch": 0.34950640621718126, "grad_norm": 1.28125, "learning_rate": 0.0004992049684347642, "loss": 5.6828, "mean_token_accuracy": 0.1479445531964302, "num_tokens": 7664295.0, "step": 4160 }, { "entropy": 5.8392486572265625, "epoch": 0.34992648603234616, "grad_norm": 1.359375, "learning_rate": 0.0004992024512130042, "loss": 5.8256, "mean_token_accuracy": 0.14557857811450958, "num_tokens": 7673295.0, "step": 4165 }, { "entropy": 5.829631280899048, "epoch": 0.350346565847511, "grad_norm": 1.1171875, "learning_rate": 0.0004991999300196132, "loss": 5.8282, "mean_token_accuracy": 0.14183629676699638, "num_tokens": 7682932.0, "step": 4170 }, { "entropy": 5.967007064819336, "epoch": 0.3507666456626759, "grad_norm": 1.3125, "learning_rate": 0.0004991974048546359, "loss": 5.8389, "mean_token_accuracy": 0.1410795919597149, "num_tokens": 7692105.0, "step": 4175 }, { "entropy": 5.839569425582885, "epoch": 0.3511867254778408, "grad_norm": 1.625, "learning_rate": 0.000499194875718117, "loss": 5.8278, "mean_token_accuracy": 0.14473577067255974, "num_tokens": 7701294.0, "step": 4180 }, { "entropy": 5.905254983901978, "epoch": 0.3516068052930057, "grad_norm": 1.609375, "learning_rate": 0.0004991923426101013, "loss": 5.8161, "mean_token_accuracy": 0.13896411135792733, "num_tokens": 7710964.0, "step": 4185 }, { "entropy": 5.987090635299682, "epoch": 0.35202688510817054, "grad_norm": 1.296875, "learning_rate": 0.0004991898055306337, "loss": 5.9588, "mean_token_accuracy": 0.13461354821920396, "num_tokens": 7719938.0, "step": 4190 }, { "entropy": 5.921365547180176, "epoch": 0.35244696492333544, "grad_norm": 1.21875, "learning_rate": 0.0004991872644797591, "loss": 5.8516, "mean_token_accuracy": 0.13835739120841026, "num_tokens": 7729129.0, "step": 4195 }, { "entropy": 5.872485494613647, "epoch": 0.35286704473850034, "grad_norm": 1.328125, "learning_rate": 0.0004991847194575226, "loss": 5.8744, "mean_token_accuracy": 0.135695618391037, "num_tokens": 7738506.0, "step": 4200 }, { "entropy": 5.9815671920776365, "epoch": 0.3532871245536652, "grad_norm": 1.375, "learning_rate": 0.0004991821704639693, "loss": 5.9756, "mean_token_accuracy": 0.1382329933345318, "num_tokens": 7749320.0, "step": 4205 }, { "entropy": 6.020629215240478, "epoch": 0.3537072043688301, "grad_norm": 1.3046875, "learning_rate": 0.0004991796174991443, "loss": 5.8318, "mean_token_accuracy": 0.14441338777542115, "num_tokens": 7758735.0, "step": 4210 }, { "entropy": 5.784712409973144, "epoch": 0.354127284183995, "grad_norm": 1.21875, "learning_rate": 0.0004991770605630927, "loss": 5.7838, "mean_token_accuracy": 0.14566663056612014, "num_tokens": 7767556.0, "step": 4215 }, { "entropy": 5.859303379058838, "epoch": 0.3545473639991598, "grad_norm": 1.3515625, "learning_rate": 0.0004991744996558599, "loss": 5.8127, "mean_token_accuracy": 0.14865762144327163, "num_tokens": 7776615.0, "step": 4220 }, { "entropy": 5.8661195755004885, "epoch": 0.3549674438143247, "grad_norm": 1.4140625, "learning_rate": 0.0004991719347774913, "loss": 5.8425, "mean_token_accuracy": 0.1501207634806633, "num_tokens": 7785288.0, "step": 4225 }, { "entropy": 5.876074028015137, "epoch": 0.3553875236294896, "grad_norm": 1.2734375, "learning_rate": 0.0004991693659280324, "loss": 5.768, "mean_token_accuracy": 0.14568567126989365, "num_tokens": 7794381.0, "step": 4230 }, { "entropy": 5.80495433807373, "epoch": 0.3558076034446545, "grad_norm": 1.296875, "learning_rate": 0.0004991667931075284, "loss": 5.7286, "mean_token_accuracy": 0.14424108862876892, "num_tokens": 7803265.0, "step": 4235 }, { "entropy": 5.866269779205322, "epoch": 0.35622768325981935, "grad_norm": 1.25, "learning_rate": 0.0004991642163160252, "loss": 5.8494, "mean_token_accuracy": 0.14268068224191666, "num_tokens": 7812445.0, "step": 4240 }, { "entropy": 5.940937805175781, "epoch": 0.35664776307498425, "grad_norm": 1.1953125, "learning_rate": 0.0004991616355535684, "loss": 5.8433, "mean_token_accuracy": 0.14912385791540145, "num_tokens": 7822073.0, "step": 4245 }, { "entropy": 5.912532901763916, "epoch": 0.35706784289014915, "grad_norm": 1.28125, "learning_rate": 0.0004991590508202036, "loss": 5.8223, "mean_token_accuracy": 0.14509293287992478, "num_tokens": 7831193.0, "step": 4250 }, { "entropy": 5.9145135402679445, "epoch": 0.357487922705314, "grad_norm": 1.2578125, "learning_rate": 0.0004991564621159766, "loss": 5.8496, "mean_token_accuracy": 0.1429229497909546, "num_tokens": 7840311.0, "step": 4255 }, { "entropy": 5.863569116592407, "epoch": 0.3579080025204789, "grad_norm": 1.375, "learning_rate": 0.0004991538694409334, "loss": 5.8807, "mean_token_accuracy": 0.13731356635689734, "num_tokens": 7849622.0, "step": 4260 }, { "entropy": 5.866101455688477, "epoch": 0.3583280823356438, "grad_norm": 1.390625, "learning_rate": 0.0004991512727951198, "loss": 5.8355, "mean_token_accuracy": 0.14157909527420998, "num_tokens": 7859494.0, "step": 4265 }, { "entropy": 6.04155158996582, "epoch": 0.3587481621508087, "grad_norm": 1.1640625, "learning_rate": 0.0004991486721785818, "loss": 5.9463, "mean_token_accuracy": 0.13783199936151505, "num_tokens": 7868526.0, "step": 4270 }, { "entropy": 5.926672124862671, "epoch": 0.3591682419659735, "grad_norm": 1.2265625, "learning_rate": 0.0004991460675913655, "loss": 5.7679, "mean_token_accuracy": 0.145780511200428, "num_tokens": 7877631.0, "step": 4275 }, { "entropy": 5.883526086807251, "epoch": 0.3595883217811384, "grad_norm": 1.2578125, "learning_rate": 0.000499143459033517, "loss": 5.8215, "mean_token_accuracy": 0.14927242919802666, "num_tokens": 7886814.0, "step": 4280 }, { "entropy": 5.745771408081055, "epoch": 0.3600084015963033, "grad_norm": 1.296875, "learning_rate": 0.0004991408465050825, "loss": 5.6432, "mean_token_accuracy": 0.1510879337787628, "num_tokens": 7896337.0, "step": 4285 }, { "entropy": 5.832517528533936, "epoch": 0.36042848141146816, "grad_norm": 1.1328125, "learning_rate": 0.0004991382300061084, "loss": 5.9099, "mean_token_accuracy": 0.13439226746559144, "num_tokens": 7906071.0, "step": 4290 }, { "entropy": 5.962070560455322, "epoch": 0.36084856122663306, "grad_norm": 1.1796875, "learning_rate": 0.0004991356095366409, "loss": 5.8959, "mean_token_accuracy": 0.14356234297156334, "num_tokens": 7915003.0, "step": 4295 }, { "entropy": 5.901134014129639, "epoch": 0.36126864104179796, "grad_norm": 1.2734375, "learning_rate": 0.0004991329850967266, "loss": 5.7553, "mean_token_accuracy": 0.14801599234342575, "num_tokens": 7924408.0, "step": 4300 }, { "entropy": 5.79054274559021, "epoch": 0.3616887208569628, "grad_norm": 1.2265625, "learning_rate": 0.0004991303566864118, "loss": 5.7258, "mean_token_accuracy": 0.14961198195815087, "num_tokens": 7934717.0, "step": 4305 }, { "entropy": 5.805612707138062, "epoch": 0.3621088006721277, "grad_norm": 1.65625, "learning_rate": 0.0004991277243057431, "loss": 5.7802, "mean_token_accuracy": 0.14127594009041786, "num_tokens": 7944278.0, "step": 4310 }, { "entropy": 5.852519369125366, "epoch": 0.3625288804872926, "grad_norm": 1.40625, "learning_rate": 0.0004991250879547673, "loss": 5.8079, "mean_token_accuracy": 0.147132358700037, "num_tokens": 7953344.0, "step": 4315 }, { "entropy": 5.814604616165161, "epoch": 0.3629489603024575, "grad_norm": 1.1640625, "learning_rate": 0.0004991224476335309, "loss": 5.8228, "mean_token_accuracy": 0.14245626628398894, "num_tokens": 7962869.0, "step": 4320 }, { "entropy": 5.943966865539551, "epoch": 0.36336904011762233, "grad_norm": 1.328125, "learning_rate": 0.0004991198033420807, "loss": 5.8144, "mean_token_accuracy": 0.14347591027617454, "num_tokens": 7971981.0, "step": 4325 }, { "entropy": 5.808311653137207, "epoch": 0.36378911993278723, "grad_norm": 1.234375, "learning_rate": 0.0004991171550804636, "loss": 5.7932, "mean_token_accuracy": 0.14206481873989105, "num_tokens": 7980979.0, "step": 4330 }, { "entropy": 5.905320501327514, "epoch": 0.36420919974795213, "grad_norm": 1.2265625, "learning_rate": 0.0004991145028487266, "loss": 5.8545, "mean_token_accuracy": 0.14277459532022477, "num_tokens": 7989607.0, "step": 4335 }, { "entropy": 5.837810134887695, "epoch": 0.36462927956311697, "grad_norm": 1.203125, "learning_rate": 0.0004991118466469165, "loss": 5.6845, "mean_token_accuracy": 0.1493909515440464, "num_tokens": 7998356.0, "step": 4340 }, { "entropy": 5.851336050033569, "epoch": 0.36504935937828187, "grad_norm": 1.234375, "learning_rate": 0.0004991091864750805, "loss": 5.7847, "mean_token_accuracy": 0.15024245530366898, "num_tokens": 8007596.0, "step": 4345 }, { "entropy": 5.901401472091675, "epoch": 0.36546943919344677, "grad_norm": 1.3828125, "learning_rate": 0.0004991065223332655, "loss": 5.8505, "mean_token_accuracy": 0.14087127447128295, "num_tokens": 8016493.0, "step": 4350 }, { "entropy": 5.870211553573609, "epoch": 0.36588951900861166, "grad_norm": 1.1875, "learning_rate": 0.0004991038542215191, "loss": 5.8227, "mean_token_accuracy": 0.13674163743853568, "num_tokens": 8025867.0, "step": 4355 }, { "entropy": 5.875178384780884, "epoch": 0.3663095988237765, "grad_norm": 1.2265625, "learning_rate": 0.0004991011821398882, "loss": 5.8524, "mean_token_accuracy": 0.14893437176942825, "num_tokens": 8036251.0, "step": 4360 }, { "entropy": 5.93429970741272, "epoch": 0.3667296786389414, "grad_norm": 1.1875, "learning_rate": 0.0004990985060884202, "loss": 5.7934, "mean_token_accuracy": 0.14601994007825853, "num_tokens": 8045647.0, "step": 4365 }, { "entropy": 5.862146234512329, "epoch": 0.3671497584541063, "grad_norm": 1.46875, "learning_rate": 0.0004990958260671627, "loss": 5.8699, "mean_token_accuracy": 0.13680800497531892, "num_tokens": 8056025.0, "step": 4370 }, { "entropy": 5.831518173217773, "epoch": 0.36756983826927114, "grad_norm": 1.421875, "learning_rate": 0.0004990931420761629, "loss": 5.801, "mean_token_accuracy": 0.14898931458592415, "num_tokens": 8065029.0, "step": 4375 }, { "entropy": 5.986507749557495, "epoch": 0.36798991808443604, "grad_norm": 1.296875, "learning_rate": 0.0004990904541154685, "loss": 5.7557, "mean_token_accuracy": 0.1577958881855011, "num_tokens": 8073249.0, "step": 4380 }, { "entropy": 5.926363086700439, "epoch": 0.36840999789960094, "grad_norm": 1.3046875, "learning_rate": 0.0004990877621851271, "loss": 5.8958, "mean_token_accuracy": 0.14099107980728148, "num_tokens": 8082039.0, "step": 4385 }, { "entropy": 5.77527756690979, "epoch": 0.3688300777147658, "grad_norm": 1.3515625, "learning_rate": 0.0004990850662851863, "loss": 5.7314, "mean_token_accuracy": 0.14939506947994233, "num_tokens": 8090011.0, "step": 4390 }, { "entropy": 5.903356552124023, "epoch": 0.3692501575299307, "grad_norm": 1.1875, "learning_rate": 0.0004990823664156941, "loss": 5.8578, "mean_token_accuracy": 0.1503001019358635, "num_tokens": 8099934.0, "step": 4395 }, { "entropy": 5.969454669952393, "epoch": 0.3696702373450956, "grad_norm": 1.2421875, "learning_rate": 0.0004990796625766981, "loss": 5.8544, "mean_token_accuracy": 0.14106032997369766, "num_tokens": 8108969.0, "step": 4400 }, { "entropy": 5.791495323181152, "epoch": 0.3700903171602605, "grad_norm": 1.171875, "learning_rate": 0.0004990769547682462, "loss": 5.7606, "mean_token_accuracy": 0.14634729623794557, "num_tokens": 8117372.0, "step": 4405 }, { "entropy": 6.0099778175354, "epoch": 0.3705103969754253, "grad_norm": 1.1640625, "learning_rate": 0.0004990742429903866, "loss": 5.9586, "mean_token_accuracy": 0.13980280980467796, "num_tokens": 8127108.0, "step": 4410 }, { "entropy": 5.983245754241944, "epoch": 0.3709304767905902, "grad_norm": 1.234375, "learning_rate": 0.000499071527243167, "loss": 5.9341, "mean_token_accuracy": 0.13901495784521103, "num_tokens": 8137392.0, "step": 4415 }, { "entropy": 5.866679716110229, "epoch": 0.3713505566057551, "grad_norm": 1.90625, "learning_rate": 0.0004990688075266357, "loss": 5.7869, "mean_token_accuracy": 0.15010641515254974, "num_tokens": 8146257.0, "step": 4420 }, { "entropy": 5.818060064315796, "epoch": 0.37177063642091995, "grad_norm": 1.203125, "learning_rate": 0.0004990660838408409, "loss": 5.7406, "mean_token_accuracy": 0.14406471997499465, "num_tokens": 8154952.0, "step": 4425 }, { "entropy": 5.896796941757202, "epoch": 0.37219071623608485, "grad_norm": 1.2421875, "learning_rate": 0.0004990633561858308, "loss": 5.8106, "mean_token_accuracy": 0.14199803844094278, "num_tokens": 8164365.0, "step": 4430 }, { "entropy": 5.897768831253051, "epoch": 0.37261079605124975, "grad_norm": 1.265625, "learning_rate": 0.0004990606245616537, "loss": 5.806, "mean_token_accuracy": 0.14310891702771186, "num_tokens": 8172614.0, "step": 4435 }, { "entropy": 5.942156267166138, "epoch": 0.37303087586641465, "grad_norm": 1.2265625, "learning_rate": 0.0004990578889683579, "loss": 5.8804, "mean_token_accuracy": 0.1399701401591301, "num_tokens": 8182445.0, "step": 4440 }, { "entropy": 5.876403188705444, "epoch": 0.3734509556815795, "grad_norm": 1.1875, "learning_rate": 0.0004990551494059921, "loss": 5.759, "mean_token_accuracy": 0.14562231674790382, "num_tokens": 8191871.0, "step": 4445 }, { "entropy": 5.910711050033569, "epoch": 0.3738710354967444, "grad_norm": 1.2421875, "learning_rate": 0.0004990524058746047, "loss": 5.9163, "mean_token_accuracy": 0.14722812101244925, "num_tokens": 8200658.0, "step": 4450 }, { "entropy": 5.870158910751343, "epoch": 0.3742911153119093, "grad_norm": 1.2421875, "learning_rate": 0.0004990496583742443, "loss": 5.828, "mean_token_accuracy": 0.13948734179139138, "num_tokens": 8209776.0, "step": 4455 }, { "entropy": 5.8510631084442135, "epoch": 0.3747111951270741, "grad_norm": 1.5390625, "learning_rate": 0.0004990469069049596, "loss": 5.7655, "mean_token_accuracy": 0.14834618717432022, "num_tokens": 8219401.0, "step": 4460 }, { "entropy": 5.841431474685669, "epoch": 0.375131274942239, "grad_norm": 1.2734375, "learning_rate": 0.0004990441514667993, "loss": 5.8164, "mean_token_accuracy": 0.14826851561665536, "num_tokens": 8228762.0, "step": 4465 }, { "entropy": 5.943975448608398, "epoch": 0.3755513547574039, "grad_norm": 1.40625, "learning_rate": 0.0004990413920598121, "loss": 5.8117, "mean_token_accuracy": 0.1469542607665062, "num_tokens": 8236612.0, "step": 4470 }, { "entropy": 5.879862689971924, "epoch": 0.37597143457256876, "grad_norm": 1.296875, "learning_rate": 0.0004990386286840471, "loss": 5.831, "mean_token_accuracy": 0.14322375506162643, "num_tokens": 8245043.0, "step": 4475 }, { "entropy": 6.000154876708985, "epoch": 0.37639151438773366, "grad_norm": 1.21875, "learning_rate": 0.0004990358613395532, "loss": 5.9178, "mean_token_accuracy": 0.14013071805238725, "num_tokens": 8255270.0, "step": 4480 }, { "entropy": 5.975326061248779, "epoch": 0.37681159420289856, "grad_norm": 1.546875, "learning_rate": 0.0004990330900263792, "loss": 5.8795, "mean_token_accuracy": 0.13827874809503554, "num_tokens": 8264761.0, "step": 4485 }, { "entropy": 5.914210987091065, "epoch": 0.37723167401806346, "grad_norm": 1.265625, "learning_rate": 0.0004990303147445745, "loss": 5.8254, "mean_token_accuracy": 0.14292607977986335, "num_tokens": 8274308.0, "step": 4490 }, { "entropy": 5.824473428726196, "epoch": 0.3776517538332283, "grad_norm": 1.4765625, "learning_rate": 0.0004990275354941881, "loss": 5.7135, "mean_token_accuracy": 0.15530410706996917, "num_tokens": 8283323.0, "step": 4495 }, { "entropy": 5.8834668636322025, "epoch": 0.3780718336483932, "grad_norm": 1.1328125, "learning_rate": 0.0004990247522752694, "loss": 6.0456, "mean_token_accuracy": 0.12988803088665007, "num_tokens": 8293452.0, "step": 4500 }, { "entropy": 5.955636644363404, "epoch": 0.3784919134635581, "grad_norm": 1.2109375, "learning_rate": 0.0004990219650878674, "loss": 5.7218, "mean_token_accuracy": 0.15242000669240952, "num_tokens": 8302941.0, "step": 4505 }, { "entropy": 5.847525358200073, "epoch": 0.37891199327872294, "grad_norm": 1.6796875, "learning_rate": 0.0004990191739320318, "loss": 5.7466, "mean_token_accuracy": 0.1486208386719227, "num_tokens": 8311811.0, "step": 4510 }, { "entropy": 5.740224170684814, "epoch": 0.37933207309388783, "grad_norm": 1.2109375, "learning_rate": 0.0004990163788078117, "loss": 5.6461, "mean_token_accuracy": 0.15186312943696975, "num_tokens": 8321130.0, "step": 4515 }, { "entropy": 5.8464127540588375, "epoch": 0.37975215290905273, "grad_norm": 1.2265625, "learning_rate": 0.0004990135797152569, "loss": 5.7849, "mean_token_accuracy": 0.1421021416783333, "num_tokens": 8330233.0, "step": 4520 }, { "entropy": 5.841711711883545, "epoch": 0.3801722327242176, "grad_norm": 1.5234375, "learning_rate": 0.0004990107766544169, "loss": 5.7542, "mean_token_accuracy": 0.14685214608907698, "num_tokens": 8338585.0, "step": 4525 }, { "entropy": 5.820804738998413, "epoch": 0.38059231253938247, "grad_norm": 1.2734375, "learning_rate": 0.0004990079696253413, "loss": 5.7772, "mean_token_accuracy": 0.15359208285808562, "num_tokens": 8346618.0, "step": 4530 }, { "entropy": 5.856412267684936, "epoch": 0.38101239235454737, "grad_norm": 1.625, "learning_rate": 0.0004990051586280799, "loss": 5.7538, "mean_token_accuracy": 0.14895081669092178, "num_tokens": 8356273.0, "step": 4535 }, { "entropy": 5.84465913772583, "epoch": 0.38143247216971227, "grad_norm": 1.3046875, "learning_rate": 0.0004990023436626824, "loss": 5.7524, "mean_token_accuracy": 0.15021177157759666, "num_tokens": 8366668.0, "step": 4540 }, { "entropy": 5.9355755805969235, "epoch": 0.3818525519848771, "grad_norm": 1.453125, "learning_rate": 0.0004989995247291988, "loss": 5.8869, "mean_token_accuracy": 0.14657219648361205, "num_tokens": 8375610.0, "step": 4545 }, { "entropy": 5.892274522781372, "epoch": 0.382272631800042, "grad_norm": 1.3046875, "learning_rate": 0.0004989967018276789, "loss": 5.7508, "mean_token_accuracy": 0.14968783855438234, "num_tokens": 8384455.0, "step": 4550 }, { "entropy": 5.7833092212677, "epoch": 0.3826927116152069, "grad_norm": 1.2578125, "learning_rate": 0.0004989938749581727, "loss": 5.7971, "mean_token_accuracy": 0.14261699542403222, "num_tokens": 8393868.0, "step": 4555 }, { "entropy": 5.87280683517456, "epoch": 0.38311279143037175, "grad_norm": 1.3359375, "learning_rate": 0.0004989910441207305, "loss": 5.8159, "mean_token_accuracy": 0.142460335791111, "num_tokens": 8402916.0, "step": 4560 }, { "entropy": 5.866139030456543, "epoch": 0.38353287124553664, "grad_norm": 1.390625, "learning_rate": 0.0004989882093154023, "loss": 5.7386, "mean_token_accuracy": 0.14703422486782075, "num_tokens": 8411649.0, "step": 4565 }, { "entropy": 5.867181587219238, "epoch": 0.38395295106070154, "grad_norm": 1.203125, "learning_rate": 0.0004989853705422381, "loss": 5.8685, "mean_token_accuracy": 0.14120551571249962, "num_tokens": 8420393.0, "step": 4570 }, { "entropy": 5.828695583343506, "epoch": 0.38437303087586644, "grad_norm": 1.3125, "learning_rate": 0.0004989825278012886, "loss": 5.7515, "mean_token_accuracy": 0.14682712703943251, "num_tokens": 8429404.0, "step": 4575 }, { "entropy": 5.838935279846192, "epoch": 0.3847931106910313, "grad_norm": 1.5234375, "learning_rate": 0.000498979681092604, "loss": 5.7956, "mean_token_accuracy": 0.1419872298836708, "num_tokens": 8438299.0, "step": 4580 }, { "entropy": 5.8123194694519045, "epoch": 0.3852131905061962, "grad_norm": 1.3359375, "learning_rate": 0.0004989768304162345, "loss": 5.7252, "mean_token_accuracy": 0.14794419556856156, "num_tokens": 8447392.0, "step": 4585 }, { "entropy": 5.904961681365966, "epoch": 0.3856332703213611, "grad_norm": 1.3515625, "learning_rate": 0.0004989739757722308, "loss": 5.8303, "mean_token_accuracy": 0.14383739084005356, "num_tokens": 8456361.0, "step": 4590 }, { "entropy": 5.8575108528137205, "epoch": 0.3860533501365259, "grad_norm": 1.234375, "learning_rate": 0.0004989711171606436, "loss": 5.7685, "mean_token_accuracy": 0.14821998178958892, "num_tokens": 8465548.0, "step": 4595 }, { "entropy": 5.868388605117798, "epoch": 0.3864734299516908, "grad_norm": 1.1953125, "learning_rate": 0.0004989682545815232, "loss": 5.7918, "mean_token_accuracy": 0.1475853756070137, "num_tokens": 8474454.0, "step": 4600 }, { "entropy": 5.851971101760864, "epoch": 0.3868935097668557, "grad_norm": 1.3515625, "learning_rate": 0.0004989653880349207, "loss": 5.702, "mean_token_accuracy": 0.14926233142614365, "num_tokens": 8482694.0, "step": 4605 }, { "entropy": 5.900271463394165, "epoch": 0.38731358958202056, "grad_norm": 1.3515625, "learning_rate": 0.0004989625175208864, "loss": 5.8168, "mean_token_accuracy": 0.1423931635916233, "num_tokens": 8491162.0, "step": 4610 }, { "entropy": 5.7257490158081055, "epoch": 0.38773366939718545, "grad_norm": 1.40625, "learning_rate": 0.0004989596430394717, "loss": 5.6821, "mean_token_accuracy": 0.15937959849834443, "num_tokens": 8500716.0, "step": 4615 }, { "entropy": 5.808234357833863, "epoch": 0.38815374921235035, "grad_norm": 1.34375, "learning_rate": 0.000498956764590727, "loss": 5.708, "mean_token_accuracy": 0.14701972305774688, "num_tokens": 8508871.0, "step": 4620 }, { "entropy": 5.95609483718872, "epoch": 0.38857382902751525, "grad_norm": 1.640625, "learning_rate": 0.0004989538821747037, "loss": 5.9183, "mean_token_accuracy": 0.14093246757984162, "num_tokens": 8518450.0, "step": 4625 }, { "entropy": 5.905833387374878, "epoch": 0.3889939088426801, "grad_norm": 1.203125, "learning_rate": 0.0004989509957914527, "loss": 5.8226, "mean_token_accuracy": 0.13851853534579278, "num_tokens": 8528238.0, "step": 4630 }, { "entropy": 5.809105253219604, "epoch": 0.389413988657845, "grad_norm": 1.2265625, "learning_rate": 0.0004989481054410251, "loss": 5.7101, "mean_token_accuracy": 0.14374718815088272, "num_tokens": 8537587.0, "step": 4635 }, { "entropy": 5.856853580474853, "epoch": 0.3898340684730099, "grad_norm": 1.3125, "learning_rate": 0.0004989452111234721, "loss": 5.832, "mean_token_accuracy": 0.14598365053534507, "num_tokens": 8547703.0, "step": 4640 }, { "entropy": 5.896316909790039, "epoch": 0.39025414828817473, "grad_norm": 1.421875, "learning_rate": 0.000498942312838845, "loss": 5.7758, "mean_token_accuracy": 0.14673489332199097, "num_tokens": 8557001.0, "step": 4645 }, { "entropy": 5.753481054306031, "epoch": 0.3906742281033396, "grad_norm": 1.3828125, "learning_rate": 0.0004989394105871952, "loss": 5.6574, "mean_token_accuracy": 0.15356985479593277, "num_tokens": 8565638.0, "step": 4650 }, { "entropy": 5.9358145236969, "epoch": 0.3910943079185045, "grad_norm": 1.3515625, "learning_rate": 0.000498936504368574, "loss": 5.824, "mean_token_accuracy": 0.14542939141392708, "num_tokens": 8574428.0, "step": 4655 }, { "entropy": 5.850586032867431, "epoch": 0.3915143877336694, "grad_norm": 1.421875, "learning_rate": 0.0004989335941830329, "loss": 5.7948, "mean_token_accuracy": 0.14559997841715813, "num_tokens": 8583157.0, "step": 4660 }, { "entropy": 5.788503217697143, "epoch": 0.39193446754883426, "grad_norm": 1.34375, "learning_rate": 0.0004989306800306236, "loss": 5.7506, "mean_token_accuracy": 0.14149378091096879, "num_tokens": 8592382.0, "step": 4665 }, { "entropy": 5.7819007396697994, "epoch": 0.39235454736399916, "grad_norm": 1.2578125, "learning_rate": 0.0004989277619113975, "loss": 5.7277, "mean_token_accuracy": 0.15354567617177964, "num_tokens": 8601058.0, "step": 4670 }, { "entropy": 5.888355016708374, "epoch": 0.39277462717916406, "grad_norm": 1.25, "learning_rate": 0.0004989248398254065, "loss": 5.809, "mean_token_accuracy": 0.1449069932103157, "num_tokens": 8609479.0, "step": 4675 }, { "entropy": 5.876518249511719, "epoch": 0.3931947069943289, "grad_norm": 1.25, "learning_rate": 0.0004989219137727021, "loss": 5.7915, "mean_token_accuracy": 0.14826752841472626, "num_tokens": 8618860.0, "step": 4680 }, { "entropy": 5.84470911026001, "epoch": 0.3936147868094938, "grad_norm": 1.28125, "learning_rate": 0.0004989189837533365, "loss": 5.735, "mean_token_accuracy": 0.14821926057338713, "num_tokens": 8627462.0, "step": 4685 }, { "entropy": 5.9267487049102785, "epoch": 0.3940348666246587, "grad_norm": 1.1875, "learning_rate": 0.0004989160497673613, "loss": 5.9044, "mean_token_accuracy": 0.14269359409809113, "num_tokens": 8637569.0, "step": 4690 }, { "entropy": 5.871499490737915, "epoch": 0.39445494643982354, "grad_norm": 1.421875, "learning_rate": 0.0004989131118148286, "loss": 5.717, "mean_token_accuracy": 0.14633206725120546, "num_tokens": 8645440.0, "step": 4695 }, { "entropy": 5.8017088890075685, "epoch": 0.39487502625498844, "grad_norm": 1.265625, "learning_rate": 0.0004989101698957904, "loss": 5.8682, "mean_token_accuracy": 0.14381342753767967, "num_tokens": 8655077.0, "step": 4700 }, { "entropy": 5.897370004653931, "epoch": 0.39529510607015333, "grad_norm": 1.34375, "learning_rate": 0.0004989072240102988, "loss": 5.802, "mean_token_accuracy": 0.14655678868293762, "num_tokens": 8663126.0, "step": 4705 }, { "entropy": 5.9488218307495115, "epoch": 0.39571518588531823, "grad_norm": 1.7578125, "learning_rate": 0.0004989042741584061, "loss": 5.7704, "mean_token_accuracy": 0.14519642665982246, "num_tokens": 8672386.0, "step": 4710 }, { "entropy": 5.720374536514282, "epoch": 0.3961352657004831, "grad_norm": 1.3671875, "learning_rate": 0.0004989013203401645, "loss": 5.7158, "mean_token_accuracy": 0.14469049870967865, "num_tokens": 8681930.0, "step": 4715 }, { "entropy": 5.841011619567871, "epoch": 0.396555345515648, "grad_norm": 1.4375, "learning_rate": 0.0004988983625556264, "loss": 5.7771, "mean_token_accuracy": 0.14254847168922424, "num_tokens": 8690993.0, "step": 4720 }, { "entropy": 5.822189235687256, "epoch": 0.39697542533081287, "grad_norm": 1.2734375, "learning_rate": 0.0004988954008048438, "loss": 5.7542, "mean_token_accuracy": 0.1459605447947979, "num_tokens": 8699497.0, "step": 4725 }, { "entropy": 5.995753383636474, "epoch": 0.3973955051459777, "grad_norm": 1.265625, "learning_rate": 0.0004988924350878697, "loss": 5.9601, "mean_token_accuracy": 0.13361823558807373, "num_tokens": 8709274.0, "step": 4730 }, { "entropy": 5.916924142837525, "epoch": 0.3978155849611426, "grad_norm": 1.46875, "learning_rate": 0.0004988894654047563, "loss": 5.8045, "mean_token_accuracy": 0.14044342935085297, "num_tokens": 8718158.0, "step": 4735 }, { "entropy": 5.780064105987549, "epoch": 0.3982356647763075, "grad_norm": 1.734375, "learning_rate": 0.0004988864917555562, "loss": 5.6905, "mean_token_accuracy": 0.147587950527668, "num_tokens": 8727459.0, "step": 4740 }, { "entropy": 5.872677993774414, "epoch": 0.3986557445914724, "grad_norm": 1.6171875, "learning_rate": 0.0004988835141403224, "loss": 5.838, "mean_token_accuracy": 0.15111552625894548, "num_tokens": 8737614.0, "step": 4745 }, { "entropy": 5.793733787536621, "epoch": 0.39907582440663725, "grad_norm": 1.4609375, "learning_rate": 0.0004988805325591073, "loss": 5.6586, "mean_token_accuracy": 0.14995396584272386, "num_tokens": 8746799.0, "step": 4750 }, { "entropy": 5.854796743392944, "epoch": 0.39949590422180214, "grad_norm": 1.4140625, "learning_rate": 0.0004988775470119639, "loss": 5.8437, "mean_token_accuracy": 0.14196141958236694, "num_tokens": 8756555.0, "step": 4755 }, { "entropy": 5.811031866073608, "epoch": 0.39991598403696704, "grad_norm": 1.3203125, "learning_rate": 0.0004988745574989451, "loss": 5.8678, "mean_token_accuracy": 0.14688000455498695, "num_tokens": 8765849.0, "step": 4760 }, { "entropy": 5.97343111038208, "epoch": 0.4003360638521319, "grad_norm": 1.7734375, "learning_rate": 0.0004988715640201036, "loss": 5.9254, "mean_token_accuracy": 0.14007072225213052, "num_tokens": 8775713.0, "step": 4765 }, { "entropy": 5.833617496490478, "epoch": 0.4007561436672968, "grad_norm": 1.3671875, "learning_rate": 0.0004988685665754928, "loss": 5.757, "mean_token_accuracy": 0.14587045535445214, "num_tokens": 8784717.0, "step": 4770 }, { "entropy": 5.850550556182862, "epoch": 0.4011762234824617, "grad_norm": 1.265625, "learning_rate": 0.0004988655651651656, "loss": 5.7758, "mean_token_accuracy": 0.14603159129619597, "num_tokens": 8794388.0, "step": 4775 }, { "entropy": 5.799873685836792, "epoch": 0.4015963032976265, "grad_norm": 1.3046875, "learning_rate": 0.0004988625597891751, "loss": 5.7832, "mean_token_accuracy": 0.1439102217555046, "num_tokens": 8802436.0, "step": 4780 }, { "entropy": 5.901528596878052, "epoch": 0.4020163831127914, "grad_norm": 1.2265625, "learning_rate": 0.0004988595504475746, "loss": 5.7387, "mean_token_accuracy": 0.14495839625597, "num_tokens": 8811184.0, "step": 4785 }, { "entropy": 5.891771650314331, "epoch": 0.4024364629279563, "grad_norm": 1.28125, "learning_rate": 0.0004988565371404175, "loss": 5.8182, "mean_token_accuracy": 0.14259228706359864, "num_tokens": 8820525.0, "step": 4790 }, { "entropy": 5.840786600112915, "epoch": 0.4028565427431212, "grad_norm": 1.3046875, "learning_rate": 0.0004988535198677571, "loss": 5.6717, "mean_token_accuracy": 0.15582364946603774, "num_tokens": 8828928.0, "step": 4795 }, { "entropy": 5.857646703720093, "epoch": 0.40327662255828606, "grad_norm": 1.34375, "learning_rate": 0.0004988504986296469, "loss": 5.9008, "mean_token_accuracy": 0.1377402052283287, "num_tokens": 8838615.0, "step": 4800 }, { "entropy": 5.874919462203979, "epoch": 0.40369670237345096, "grad_norm": 1.2421875, "learning_rate": 0.0004988474734261404, "loss": 5.8723, "mean_token_accuracy": 0.1379916787147522, "num_tokens": 8848709.0, "step": 4805 }, { "entropy": 5.955801677703858, "epoch": 0.40411678218861585, "grad_norm": 1.3046875, "learning_rate": 0.0004988444442572911, "loss": 5.8116, "mean_token_accuracy": 0.1377601645886898, "num_tokens": 8858277.0, "step": 4810 }, { "entropy": 5.8524405002594, "epoch": 0.4045368620037807, "grad_norm": 1.3828125, "learning_rate": 0.0004988414111231528, "loss": 5.7865, "mean_token_accuracy": 0.146932952105999, "num_tokens": 8868436.0, "step": 4815 }, { "entropy": 5.81365385055542, "epoch": 0.4049569418189456, "grad_norm": 1.2421875, "learning_rate": 0.000498838374023779, "loss": 5.7632, "mean_token_accuracy": 0.1458624616265297, "num_tokens": 8877740.0, "step": 4820 }, { "entropy": 5.897781801223755, "epoch": 0.4053770216341105, "grad_norm": 1.296875, "learning_rate": 0.0004988353329592239, "loss": 5.7534, "mean_token_accuracy": 0.1458469048142433, "num_tokens": 8887408.0, "step": 4825 }, { "entropy": 5.865758180618286, "epoch": 0.4057971014492754, "grad_norm": 1.3125, "learning_rate": 0.0004988322879295409, "loss": 5.9214, "mean_token_accuracy": 0.13947931975126265, "num_tokens": 8897141.0, "step": 4830 }, { "entropy": 5.81132230758667, "epoch": 0.40621718126444023, "grad_norm": 1.3359375, "learning_rate": 0.0004988292389347844, "loss": 5.6894, "mean_token_accuracy": 0.153226038813591, "num_tokens": 8905747.0, "step": 4835 }, { "entropy": 5.9674177169799805, "epoch": 0.40663726107960513, "grad_norm": 1.28125, "learning_rate": 0.000498826185975008, "loss": 5.8416, "mean_token_accuracy": 0.14231978505849838, "num_tokens": 8914926.0, "step": 4840 }, { "entropy": 5.819942045211792, "epoch": 0.40705734089477, "grad_norm": 1.28125, "learning_rate": 0.0004988231290502662, "loss": 5.8576, "mean_token_accuracy": 0.14279644340276718, "num_tokens": 8923956.0, "step": 4845 }, { "entropy": 5.920290994644165, "epoch": 0.40747742070993487, "grad_norm": 1.4140625, "learning_rate": 0.0004988200681606127, "loss": 5.724, "mean_token_accuracy": 0.1409930519759655, "num_tokens": 8932654.0, "step": 4850 }, { "entropy": 5.8871392726898195, "epoch": 0.40789750052509977, "grad_norm": 1.2890625, "learning_rate": 0.000498817003306102, "loss": 5.7162, "mean_token_accuracy": 0.15066221207380295, "num_tokens": 8941716.0, "step": 4855 }, { "entropy": 5.792912864685059, "epoch": 0.40831758034026466, "grad_norm": 1.546875, "learning_rate": 0.0004988139344867884, "loss": 5.7907, "mean_token_accuracy": 0.14440437257289887, "num_tokens": 8950377.0, "step": 4860 }, { "entropy": 5.823495244979858, "epoch": 0.4087376601554295, "grad_norm": 1.2890625, "learning_rate": 0.0004988108617027261, "loss": 5.7438, "mean_token_accuracy": 0.1429793991148472, "num_tokens": 8959857.0, "step": 4865 }, { "entropy": 5.811827516555786, "epoch": 0.4091577399705944, "grad_norm": 1.453125, "learning_rate": 0.0004988077849539698, "loss": 5.6945, "mean_token_accuracy": 0.14948356971144677, "num_tokens": 8968272.0, "step": 4870 }, { "entropy": 5.826247787475586, "epoch": 0.4095778197857593, "grad_norm": 1.5625, "learning_rate": 0.0004988047042405736, "loss": 5.765, "mean_token_accuracy": 0.1494896613061428, "num_tokens": 8977445.0, "step": 4875 }, { "entropy": 5.932396030426025, "epoch": 0.4099978996009242, "grad_norm": 1.3671875, "learning_rate": 0.0004988016195625924, "loss": 5.8399, "mean_token_accuracy": 0.13975587710738183, "num_tokens": 8987315.0, "step": 4880 }, { "entropy": 5.874420547485352, "epoch": 0.41041797941608904, "grad_norm": 1.265625, "learning_rate": 0.0004987985309200807, "loss": 5.8545, "mean_token_accuracy": 0.14247876554727554, "num_tokens": 8998119.0, "step": 4885 }, { "entropy": 5.734928989410401, "epoch": 0.41083805923125394, "grad_norm": 1.328125, "learning_rate": 0.0004987954383130934, "loss": 5.7211, "mean_token_accuracy": 0.15389348119497298, "num_tokens": 9007167.0, "step": 4890 }, { "entropy": 5.831616592407227, "epoch": 0.41125813904641884, "grad_norm": 1.4296875, "learning_rate": 0.000498792341741685, "loss": 5.7751, "mean_token_accuracy": 0.14183046370744706, "num_tokens": 9016690.0, "step": 4895 }, { "entropy": 5.9142228126525875, "epoch": 0.4116782188615837, "grad_norm": 1.4453125, "learning_rate": 0.0004987892412059106, "loss": 5.8608, "mean_token_accuracy": 0.14799081161618233, "num_tokens": 9026117.0, "step": 4900 }, { "entropy": 5.786413145065308, "epoch": 0.4120982986767486, "grad_norm": 1.2578125, "learning_rate": 0.0004987861367058251, "loss": 5.7333, "mean_token_accuracy": 0.14700580313801764, "num_tokens": 9035754.0, "step": 4905 }, { "entropy": 5.831767606735229, "epoch": 0.4125183784919135, "grad_norm": 1.5703125, "learning_rate": 0.0004987830282414833, "loss": 5.7383, "mean_token_accuracy": 0.1511758364737034, "num_tokens": 9045453.0, "step": 4910 }, { "entropy": 5.920518159866333, "epoch": 0.41293845830707837, "grad_norm": 1.2734375, "learning_rate": 0.0004987799158129404, "loss": 5.866, "mean_token_accuracy": 0.14217546358704566, "num_tokens": 9056045.0, "step": 4915 }, { "entropy": 5.811709785461426, "epoch": 0.4133585381222432, "grad_norm": 1.703125, "learning_rate": 0.0004987767994202516, "loss": 5.7367, "mean_token_accuracy": 0.14320328831672668, "num_tokens": 9065728.0, "step": 4920 }, { "entropy": 5.832678318023682, "epoch": 0.4137786179374081, "grad_norm": 1.2890625, "learning_rate": 0.0004987736790634719, "loss": 5.7682, "mean_token_accuracy": 0.14474476575851442, "num_tokens": 9075522.0, "step": 4925 }, { "entropy": 5.789442873001098, "epoch": 0.414198697752573, "grad_norm": 1.3671875, "learning_rate": 0.0004987705547426568, "loss": 5.7403, "mean_token_accuracy": 0.141367207467556, "num_tokens": 9084412.0, "step": 4930 }, { "entropy": 5.866390943527222, "epoch": 0.41461877756773785, "grad_norm": 1.375, "learning_rate": 0.0004987674264578615, "loss": 5.8202, "mean_token_accuracy": 0.13946182429790496, "num_tokens": 9094289.0, "step": 4935 }, { "entropy": 5.853621578216552, "epoch": 0.41503885738290275, "grad_norm": 1.328125, "learning_rate": 0.0004987642942091414, "loss": 5.7305, "mean_token_accuracy": 0.1456735163927078, "num_tokens": 9103124.0, "step": 4940 }, { "entropy": 5.831802606582642, "epoch": 0.41545893719806765, "grad_norm": 1.3984375, "learning_rate": 0.0004987611579965523, "loss": 5.6742, "mean_token_accuracy": 0.14086953178048134, "num_tokens": 9112794.0, "step": 4945 }, { "entropy": 5.873828983306884, "epoch": 0.4158790170132325, "grad_norm": 1.421875, "learning_rate": 0.0004987580178201492, "loss": 5.8342, "mean_token_accuracy": 0.1499299481511116, "num_tokens": 9122718.0, "step": 4950 }, { "entropy": 5.850664281845093, "epoch": 0.4162990968283974, "grad_norm": 1.53125, "learning_rate": 0.0004987548736799882, "loss": 5.8516, "mean_token_accuracy": 0.14340481981635095, "num_tokens": 9131855.0, "step": 4955 }, { "entropy": 5.820421504974365, "epoch": 0.4167191766435623, "grad_norm": 1.5859375, "learning_rate": 0.0004987517255761248, "loss": 5.6959, "mean_token_accuracy": 0.15145303905010224, "num_tokens": 9141102.0, "step": 4960 }, { "entropy": 5.806706857681275, "epoch": 0.4171392564587272, "grad_norm": 1.6484375, "learning_rate": 0.0004987485735086148, "loss": 5.7767, "mean_token_accuracy": 0.14581410586833954, "num_tokens": 9150552.0, "step": 4965 }, { "entropy": 5.901687812805176, "epoch": 0.417559336273892, "grad_norm": 1.2734375, "learning_rate": 0.000498745417477514, "loss": 5.7678, "mean_token_accuracy": 0.1465201199054718, "num_tokens": 9160105.0, "step": 4970 }, { "entropy": 5.833481121063232, "epoch": 0.4179794160890569, "grad_norm": 1.3046875, "learning_rate": 0.0004987422574828784, "loss": 5.7442, "mean_token_accuracy": 0.14697531685233117, "num_tokens": 9169367.0, "step": 4975 }, { "entropy": 5.801551008224488, "epoch": 0.4183994959042218, "grad_norm": 1.40625, "learning_rate": 0.0004987390935247639, "loss": 5.6473, "mean_token_accuracy": 0.1524960733950138, "num_tokens": 9177872.0, "step": 4980 }, { "entropy": 5.88420295715332, "epoch": 0.41881957571938666, "grad_norm": 1.3125, "learning_rate": 0.0004987359256032265, "loss": 5.842, "mean_token_accuracy": 0.13641551584005357, "num_tokens": 9187879.0, "step": 4985 }, { "entropy": 5.839552402496338, "epoch": 0.41923965553455156, "grad_norm": 1.3125, "learning_rate": 0.0004987327537183225, "loss": 5.7627, "mean_token_accuracy": 0.14583497866988182, "num_tokens": 9198281.0, "step": 4990 }, { "entropy": 5.830025053024292, "epoch": 0.41965973534971646, "grad_norm": 1.2734375, "learning_rate": 0.0004987295778701078, "loss": 5.7616, "mean_token_accuracy": 0.1472972884774208, "num_tokens": 9207670.0, "step": 4995 }, { "entropy": 5.890192222595215, "epoch": 0.42007981516488135, "grad_norm": 1.3125, "learning_rate": 0.000498726398058639, "loss": 5.7624, "mean_token_accuracy": 0.1478295773267746, "num_tokens": 9216995.0, "step": 5000 }, { "entropy": 5.86348524093628, "epoch": 0.4204998949800462, "grad_norm": 1.203125, "learning_rate": 0.0004987232142839723, "loss": 5.84, "mean_token_accuracy": 0.14336878657341004, "num_tokens": 9227330.0, "step": 5005 }, { "entropy": 5.881588125228882, "epoch": 0.4209199747952111, "grad_norm": 1.46875, "learning_rate": 0.0004987200265461638, "loss": 5.765, "mean_token_accuracy": 0.15374772921204566, "num_tokens": 9236666.0, "step": 5010 }, { "entropy": 5.874930953979492, "epoch": 0.421340054610376, "grad_norm": 1.34375, "learning_rate": 0.0004987168348452705, "loss": 5.7753, "mean_token_accuracy": 0.14532062411308289, "num_tokens": 9246388.0, "step": 5015 }, { "entropy": 5.783464574813843, "epoch": 0.42176013442554083, "grad_norm": 1.296875, "learning_rate": 0.0004987136391813485, "loss": 5.7285, "mean_token_accuracy": 0.15355198979377746, "num_tokens": 9255239.0, "step": 5020 }, { "entropy": 5.740076160430908, "epoch": 0.42218021424070573, "grad_norm": 1.484375, "learning_rate": 0.0004987104395544547, "loss": 5.7036, "mean_token_accuracy": 0.14613911658525466, "num_tokens": 9264468.0, "step": 5025 }, { "entropy": 5.791272926330566, "epoch": 0.42260029405587063, "grad_norm": 1.421875, "learning_rate": 0.0004987072359646455, "loss": 5.767, "mean_token_accuracy": 0.15074100941419602, "num_tokens": 9274140.0, "step": 5030 }, { "entropy": 5.844473838806152, "epoch": 0.42302037387103547, "grad_norm": 1.4921875, "learning_rate": 0.0004987040284119778, "loss": 5.7527, "mean_token_accuracy": 0.14496962279081343, "num_tokens": 9283539.0, "step": 5035 }, { "entropy": 5.793737411499023, "epoch": 0.42344045368620037, "grad_norm": 1.6796875, "learning_rate": 0.0004987008168965087, "loss": 5.7492, "mean_token_accuracy": 0.1446337193250656, "num_tokens": 9292664.0, "step": 5040 }, { "entropy": 5.882410097122192, "epoch": 0.42386053350136527, "grad_norm": 1.4375, "learning_rate": 0.0004986976014182946, "loss": 5.8632, "mean_token_accuracy": 0.14475177749991416, "num_tokens": 9302814.0, "step": 5045 }, { "entropy": 5.91447172164917, "epoch": 0.42428061331653016, "grad_norm": 1.5234375, "learning_rate": 0.0004986943819773927, "loss": 5.8446, "mean_token_accuracy": 0.13993989303708076, "num_tokens": 9312654.0, "step": 5050 }, { "entropy": 5.944200277328491, "epoch": 0.424700693131695, "grad_norm": 1.71875, "learning_rate": 0.00049869115857386, "loss": 5.8629, "mean_token_accuracy": 0.13871566727757453, "num_tokens": 9322271.0, "step": 5055 }, { "entropy": 5.914797401428222, "epoch": 0.4251207729468599, "grad_norm": 1.578125, "learning_rate": 0.0004986879312077536, "loss": 5.7991, "mean_token_accuracy": 0.14315683469176294, "num_tokens": 9331341.0, "step": 5060 }, { "entropy": 5.854491138458252, "epoch": 0.4255408527620248, "grad_norm": 1.4921875, "learning_rate": 0.0004986846998791308, "loss": 5.7292, "mean_token_accuracy": 0.1450161539018154, "num_tokens": 9339863.0, "step": 5065 }, { "entropy": 5.760708379745483, "epoch": 0.42596093257718964, "grad_norm": 1.484375, "learning_rate": 0.0004986814645880485, "loss": 5.7122, "mean_token_accuracy": 0.14831122979521752, "num_tokens": 9349488.0, "step": 5070 }, { "entropy": 5.763440895080566, "epoch": 0.42638101239235454, "grad_norm": 1.234375, "learning_rate": 0.0004986782253345645, "loss": 5.7155, "mean_token_accuracy": 0.1446495160460472, "num_tokens": 9357977.0, "step": 5075 }, { "entropy": 5.873974847793579, "epoch": 0.42680109220751944, "grad_norm": 1.4765625, "learning_rate": 0.0004986749821187358, "loss": 5.8291, "mean_token_accuracy": 0.14651158899068834, "num_tokens": 9367449.0, "step": 5080 }, { "entropy": 5.939826440811157, "epoch": 0.42722117202268434, "grad_norm": 1.5078125, "learning_rate": 0.00049867173494062, "loss": 5.8443, "mean_token_accuracy": 0.14635560363531114, "num_tokens": 9377070.0, "step": 5085 }, { "entropy": 5.814030504226684, "epoch": 0.4276412518378492, "grad_norm": 1.78125, "learning_rate": 0.0004986684838002744, "loss": 5.6402, "mean_token_accuracy": 0.14356765672564506, "num_tokens": 9385881.0, "step": 5090 }, { "entropy": 5.812148237228394, "epoch": 0.4280613316530141, "grad_norm": 1.3125, "learning_rate": 0.0004986652286977569, "loss": 5.7935, "mean_token_accuracy": 0.14297776967287062, "num_tokens": 9395159.0, "step": 5095 }, { "entropy": 5.8577290058135985, "epoch": 0.428481411468179, "grad_norm": 2.21875, "learning_rate": 0.0004986619696331252, "loss": 5.7352, "mean_token_accuracy": 0.14707281142473222, "num_tokens": 9404590.0, "step": 5100 }, { "entropy": 5.849812984466553, "epoch": 0.4289014912833438, "grad_norm": 1.265625, "learning_rate": 0.0004986587066064367, "loss": 5.7485, "mean_token_accuracy": 0.15095358043909074, "num_tokens": 9414452.0, "step": 5105 }, { "entropy": 5.895452976226807, "epoch": 0.4293215710985087, "grad_norm": 1.703125, "learning_rate": 0.0004986554396177494, "loss": 5.8792, "mean_token_accuracy": 0.136289519071579, "num_tokens": 9424004.0, "step": 5110 }, { "entropy": 5.892696142196655, "epoch": 0.4297416509136736, "grad_norm": 1.984375, "learning_rate": 0.0004986521686671212, "loss": 5.7555, "mean_token_accuracy": 0.1541634440422058, "num_tokens": 9433487.0, "step": 5115 }, { "entropy": 5.878170013427734, "epoch": 0.43016173072883845, "grad_norm": 1.5078125, "learning_rate": 0.00049864889375461, "loss": 5.8153, "mean_token_accuracy": 0.14046706035733222, "num_tokens": 9442742.0, "step": 5120 }, { "entropy": 5.880551862716675, "epoch": 0.43058181054400335, "grad_norm": 1.203125, "learning_rate": 0.0004986456148802738, "loss": 5.8803, "mean_token_accuracy": 0.1430477738380432, "num_tokens": 9452550.0, "step": 5125 }, { "entropy": 5.947691774368286, "epoch": 0.43100189035916825, "grad_norm": 1.34375, "learning_rate": 0.0004986423320441707, "loss": 5.8325, "mean_token_accuracy": 0.13825444877147675, "num_tokens": 9461920.0, "step": 5130 }, { "entropy": 5.91607780456543, "epoch": 0.43142197017433315, "grad_norm": 1.4921875, "learning_rate": 0.0004986390452463588, "loss": 5.7649, "mean_token_accuracy": 0.1430374413728714, "num_tokens": 9470817.0, "step": 5135 }, { "entropy": 5.730179119110107, "epoch": 0.431842049989498, "grad_norm": 1.40625, "learning_rate": 0.0004986357544868964, "loss": 5.7124, "mean_token_accuracy": 0.15054248571395873, "num_tokens": 9479936.0, "step": 5140 }, { "entropy": 5.873006772994995, "epoch": 0.4322621298046629, "grad_norm": 1.3984375, "learning_rate": 0.0004986324597658418, "loss": 5.7327, "mean_token_accuracy": 0.15253930985927583, "num_tokens": 9489818.0, "step": 5145 }, { "entropy": 5.753958749771118, "epoch": 0.4326822096198278, "grad_norm": 1.5078125, "learning_rate": 0.0004986291610832533, "loss": 5.7373, "mean_token_accuracy": 0.1437373712658882, "num_tokens": 9499688.0, "step": 5150 }, { "entropy": 5.927468347549438, "epoch": 0.4331022894349926, "grad_norm": 1.4765625, "learning_rate": 0.0004986258584391892, "loss": 5.8034, "mean_token_accuracy": 0.14375736117362975, "num_tokens": 9509581.0, "step": 5155 }, { "entropy": 5.965673732757568, "epoch": 0.4335223692501575, "grad_norm": 1.4375, "learning_rate": 0.0004986225518337084, "loss": 5.8699, "mean_token_accuracy": 0.14116424545645714, "num_tokens": 9518556.0, "step": 5160 }, { "entropy": 5.785204839706421, "epoch": 0.4339424490653224, "grad_norm": 1.3671875, "learning_rate": 0.0004986192412668692, "loss": 5.7774, "mean_token_accuracy": 0.14554062783718108, "num_tokens": 9527612.0, "step": 5165 }, { "entropy": 5.780537843704224, "epoch": 0.4343625288804873, "grad_norm": 1.5546875, "learning_rate": 0.0004986159267387302, "loss": 5.6635, "mean_token_accuracy": 0.15324972867965697, "num_tokens": 9535882.0, "step": 5170 }, { "entropy": 5.812596940994263, "epoch": 0.43478260869565216, "grad_norm": 2.09375, "learning_rate": 0.0004986126082493502, "loss": 5.7687, "mean_token_accuracy": 0.15061265081167222, "num_tokens": 9544799.0, "step": 5175 }, { "entropy": 5.831722640991211, "epoch": 0.43520268851081706, "grad_norm": 1.4375, "learning_rate": 0.0004986092857987881, "loss": 5.6862, "mean_token_accuracy": 0.14822531789541243, "num_tokens": 9553805.0, "step": 5180 }, { "entropy": 5.813159799575805, "epoch": 0.43562276832598196, "grad_norm": 1.3828125, "learning_rate": 0.0004986059593871026, "loss": 5.717, "mean_token_accuracy": 0.14472756162285805, "num_tokens": 9563493.0, "step": 5185 }, { "entropy": 5.843486309051514, "epoch": 0.4360428481411468, "grad_norm": 1.5234375, "learning_rate": 0.0004986026290143527, "loss": 5.8036, "mean_token_accuracy": 0.1416473552584648, "num_tokens": 9572297.0, "step": 5190 }, { "entropy": 5.981090354919433, "epoch": 0.4364629279563117, "grad_norm": 2.265625, "learning_rate": 0.0004985992946805973, "loss": 5.944, "mean_token_accuracy": 0.13955255076289177, "num_tokens": 9581967.0, "step": 5195 }, { "entropy": 5.916019868850708, "epoch": 0.4368830077714766, "grad_norm": 1.2734375, "learning_rate": 0.0004985959563858955, "loss": 5.8377, "mean_token_accuracy": 0.14462848305702208, "num_tokens": 9590885.0, "step": 5200 }, { "entropy": 5.895185899734497, "epoch": 0.43730308758664144, "grad_norm": 1.3046875, "learning_rate": 0.0004985926141303066, "loss": 5.7664, "mean_token_accuracy": 0.14494283646345138, "num_tokens": 9599247.0, "step": 5205 }, { "entropy": 5.799692010879516, "epoch": 0.43772316740180633, "grad_norm": 1.6484375, "learning_rate": 0.0004985892679138896, "loss": 5.6813, "mean_token_accuracy": 0.15228856280446051, "num_tokens": 9608296.0, "step": 5210 }, { "entropy": 5.7856512546539305, "epoch": 0.43814324721697123, "grad_norm": 1.625, "learning_rate": 0.0004985859177367038, "loss": 5.743, "mean_token_accuracy": 0.14503268152475357, "num_tokens": 9616734.0, "step": 5215 }, { "entropy": 5.923814201354981, "epoch": 0.43856332703213613, "grad_norm": 1.90625, "learning_rate": 0.0004985825635988087, "loss": 5.8284, "mean_token_accuracy": 0.14173559248447418, "num_tokens": 9626246.0, "step": 5220 }, { "entropy": 5.93772325515747, "epoch": 0.43898340684730097, "grad_norm": 2.796875, "learning_rate": 0.0004985792055002635, "loss": 5.6928, "mean_token_accuracy": 0.14719630777835846, "num_tokens": 9634963.0, "step": 5225 }, { "entropy": 5.874487066268921, "epoch": 0.43940348666246587, "grad_norm": 1.4765625, "learning_rate": 0.0004985758434411278, "loss": 5.7792, "mean_token_accuracy": 0.1490402415394783, "num_tokens": 9643615.0, "step": 5230 }, { "entropy": 5.830899858474732, "epoch": 0.43982356647763077, "grad_norm": 1.5234375, "learning_rate": 0.0004985724774214613, "loss": 5.7618, "mean_token_accuracy": 0.14308954402804375, "num_tokens": 9653306.0, "step": 5235 }, { "entropy": 5.849113464355469, "epoch": 0.4402436462927956, "grad_norm": 1.390625, "learning_rate": 0.0004985691074413233, "loss": 5.7907, "mean_token_accuracy": 0.14236897826194764, "num_tokens": 9662389.0, "step": 5240 }, { "entropy": 5.755652236938476, "epoch": 0.4406637261079605, "grad_norm": 1.3515625, "learning_rate": 0.0004985657335007739, "loss": 5.7572, "mean_token_accuracy": 0.1460557647049427, "num_tokens": 9671183.0, "step": 5245 }, { "entropy": 5.91265082359314, "epoch": 0.4410838059231254, "grad_norm": 1.28125, "learning_rate": 0.0004985623555998725, "loss": 5.7558, "mean_token_accuracy": 0.15076574087142944, "num_tokens": 9680544.0, "step": 5250 }, { "entropy": 5.915414428710937, "epoch": 0.4415038857382903, "grad_norm": 1.453125, "learning_rate": 0.0004985589737386791, "loss": 5.7849, "mean_token_accuracy": 0.14628923386335374, "num_tokens": 9690137.0, "step": 5255 }, { "entropy": 5.76580057144165, "epoch": 0.44192396555345514, "grad_norm": 1.375, "learning_rate": 0.0004985555879172535, "loss": 5.718, "mean_token_accuracy": 0.14928821474313736, "num_tokens": 9699149.0, "step": 5260 }, { "entropy": 5.85457215309143, "epoch": 0.44234404536862004, "grad_norm": 1.3359375, "learning_rate": 0.000498552198135656, "loss": 5.8061, "mean_token_accuracy": 0.1479725457727909, "num_tokens": 9709308.0, "step": 5265 }, { "entropy": 5.86218228340149, "epoch": 0.44276412518378494, "grad_norm": 1.5, "learning_rate": 0.0004985488043939462, "loss": 5.7433, "mean_token_accuracy": 0.14763879179954528, "num_tokens": 9718462.0, "step": 5270 }, { "entropy": 5.848517847061157, "epoch": 0.4431842049989498, "grad_norm": 1.34375, "learning_rate": 0.0004985454066921846, "loss": 5.6657, "mean_token_accuracy": 0.15294953137636186, "num_tokens": 9727626.0, "step": 5275 }, { "entropy": 5.697809362411499, "epoch": 0.4436042848141147, "grad_norm": 1.53125, "learning_rate": 0.0004985420050304312, "loss": 5.6983, "mean_token_accuracy": 0.14653837010264398, "num_tokens": 9737091.0, "step": 5280 }, { "entropy": 5.776859283447266, "epoch": 0.4440243646292796, "grad_norm": 1.6171875, "learning_rate": 0.0004985385994087462, "loss": 5.7667, "mean_token_accuracy": 0.14408197328448297, "num_tokens": 9746135.0, "step": 5285 }, { "entropy": 5.90691499710083, "epoch": 0.4444444444444444, "grad_norm": 1.5, "learning_rate": 0.0004985351898271901, "loss": 5.6909, "mean_token_accuracy": 0.14973507225513458, "num_tokens": 9754549.0, "step": 5290 }, { "entropy": 5.90071930885315, "epoch": 0.4448645242596093, "grad_norm": 1.453125, "learning_rate": 0.0004985317762858231, "loss": 5.8522, "mean_token_accuracy": 0.1393520101904869, "num_tokens": 9764219.0, "step": 5295 }, { "entropy": 5.850748586654663, "epoch": 0.4452846040747742, "grad_norm": 1.390625, "learning_rate": 0.000498528358784706, "loss": 5.6867, "mean_token_accuracy": 0.15024393796920776, "num_tokens": 9772234.0, "step": 5300 }, { "entropy": 5.805743646621704, "epoch": 0.4457046838899391, "grad_norm": 1.3359375, "learning_rate": 0.000498524937323899, "loss": 5.7412, "mean_token_accuracy": 0.1513320118188858, "num_tokens": 9781417.0, "step": 5305 }, { "entropy": 5.939965295791626, "epoch": 0.44612476370510395, "grad_norm": 1.3515625, "learning_rate": 0.0004985215119034628, "loss": 5.8569, "mean_token_accuracy": 0.14016103446483613, "num_tokens": 9791286.0, "step": 5310 }, { "entropy": 5.8461981296539305, "epoch": 0.44654484352026885, "grad_norm": 1.3984375, "learning_rate": 0.0004985180825234582, "loss": 5.8526, "mean_token_accuracy": 0.14801667556166648, "num_tokens": 9802157.0, "step": 5315 }, { "entropy": 5.973970174789429, "epoch": 0.44696492333543375, "grad_norm": 1.359375, "learning_rate": 0.0004985146491839459, "loss": 5.8417, "mean_token_accuracy": 0.1345731109380722, "num_tokens": 9812646.0, "step": 5320 }, { "entropy": 5.948847103118896, "epoch": 0.4473850031505986, "grad_norm": 1.5, "learning_rate": 0.0004985112118849865, "loss": 5.8516, "mean_token_accuracy": 0.13398924767971038, "num_tokens": 9822274.0, "step": 5325 }, { "entropy": 5.782685327529907, "epoch": 0.4478050829657635, "grad_norm": 1.375, "learning_rate": 0.0004985077706266412, "loss": 5.6553, "mean_token_accuracy": 0.14450234919786453, "num_tokens": 9831337.0, "step": 5330 }, { "entropy": 5.838633728027344, "epoch": 0.4482251627809284, "grad_norm": 1.4296875, "learning_rate": 0.0004985043254089708, "loss": 5.7708, "mean_token_accuracy": 0.1397250510752201, "num_tokens": 9840798.0, "step": 5335 }, { "entropy": 5.778713369369507, "epoch": 0.44864524259609323, "grad_norm": 1.359375, "learning_rate": 0.0004985008762320364, "loss": 5.7606, "mean_token_accuracy": 0.14193158969283104, "num_tokens": 9850117.0, "step": 5340 }, { "entropy": 5.850035953521728, "epoch": 0.4490653224112581, "grad_norm": 1.328125, "learning_rate": 0.000498497423095899, "loss": 5.7103, "mean_token_accuracy": 0.1520538941025734, "num_tokens": 9858227.0, "step": 5345 }, { "entropy": 5.814627599716187, "epoch": 0.449485402226423, "grad_norm": 1.3125, "learning_rate": 0.0004984939660006199, "loss": 5.7869, "mean_token_accuracy": 0.14539863988757135, "num_tokens": 9867157.0, "step": 5350 }, { "entropy": 5.7682719230651855, "epoch": 0.4499054820415879, "grad_norm": 1.25, "learning_rate": 0.0004984905049462602, "loss": 5.7017, "mean_token_accuracy": 0.14406744837760926, "num_tokens": 9877045.0, "step": 5355 }, { "entropy": 5.952353000640869, "epoch": 0.45032556185675277, "grad_norm": 1.2265625, "learning_rate": 0.0004984870399328814, "loss": 5.8435, "mean_token_accuracy": 0.14134326651692392, "num_tokens": 9886637.0, "step": 5360 }, { "entropy": 5.7878330707550045, "epoch": 0.45074564167191766, "grad_norm": 1.5234375, "learning_rate": 0.0004984835709605446, "loss": 5.7137, "mean_token_accuracy": 0.15287383496761323, "num_tokens": 9895601.0, "step": 5365 }, { "entropy": 5.821693420410156, "epoch": 0.45116572148708256, "grad_norm": 1.2734375, "learning_rate": 0.0004984800980293116, "loss": 5.8616, "mean_token_accuracy": 0.14498891532421113, "num_tokens": 9904775.0, "step": 5370 }, { "entropy": 5.841092920303344, "epoch": 0.4515858013022474, "grad_norm": 1.234375, "learning_rate": 0.0004984766211392435, "loss": 5.7982, "mean_token_accuracy": 0.14115200862288474, "num_tokens": 9913795.0, "step": 5375 }, { "entropy": 5.857733345031738, "epoch": 0.4520058811174123, "grad_norm": 1.3671875, "learning_rate": 0.0004984731402904024, "loss": 5.6297, "mean_token_accuracy": 0.15439519211649894, "num_tokens": 9922576.0, "step": 5380 }, { "entropy": 5.776189756393433, "epoch": 0.4524259609325772, "grad_norm": 1.2421875, "learning_rate": 0.0004984696554828496, "loss": 5.6223, "mean_token_accuracy": 0.1497255176305771, "num_tokens": 9930971.0, "step": 5385 }, { "entropy": 5.789079189300537, "epoch": 0.4528460407477421, "grad_norm": 1.421875, "learning_rate": 0.0004984661667166468, "loss": 5.7472, "mean_token_accuracy": 0.15051774978637694, "num_tokens": 9939628.0, "step": 5390 }, { "entropy": 5.814688301086425, "epoch": 0.45326612056290694, "grad_norm": 1.3125, "learning_rate": 0.0004984626739918561, "loss": 5.7234, "mean_token_accuracy": 0.15411069244146347, "num_tokens": 9948397.0, "step": 5395 }, { "entropy": 5.903667163848877, "epoch": 0.45368620037807184, "grad_norm": 1.1875, "learning_rate": 0.0004984591773085391, "loss": 5.7883, "mean_token_accuracy": 0.14472548216581343, "num_tokens": 9957683.0, "step": 5400 }, { "entropy": 5.878373336791992, "epoch": 0.45410628019323673, "grad_norm": 1.3671875, "learning_rate": 0.0004984556766667578, "loss": 5.775, "mean_token_accuracy": 0.14814501702785493, "num_tokens": 9966756.0, "step": 5405 }, { "entropy": 5.782114458084107, "epoch": 0.4545263600084016, "grad_norm": 1.78125, "learning_rate": 0.0004984521720665743, "loss": 5.7889, "mean_token_accuracy": 0.14963518232107162, "num_tokens": 9976000.0, "step": 5410 }, { "entropy": 5.890175533294678, "epoch": 0.4549464398235665, "grad_norm": 1.3515625, "learning_rate": 0.0004984486635080507, "loss": 5.7788, "mean_token_accuracy": 0.14989694356918334, "num_tokens": 9985509.0, "step": 5415 }, { "entropy": 5.741406440734863, "epoch": 0.45536651963873137, "grad_norm": 1.5625, "learning_rate": 0.0004984451509912489, "loss": 5.7261, "mean_token_accuracy": 0.14937452524900435, "num_tokens": 9994342.0, "step": 5420 }, { "entropy": 5.76695761680603, "epoch": 0.4557865994538962, "grad_norm": 1.59375, "learning_rate": 0.0004984416345162315, "loss": 5.7717, "mean_token_accuracy": 0.14712392613291742, "num_tokens": 10004249.0, "step": 5425 }, { "entropy": 5.824491500854492, "epoch": 0.4562066792690611, "grad_norm": 1.3125, "learning_rate": 0.0004984381140830605, "loss": 5.7345, "mean_token_accuracy": 0.14771454110741616, "num_tokens": 10012430.0, "step": 5430 }, { "entropy": 5.907290077209472, "epoch": 0.456626759084226, "grad_norm": 1.265625, "learning_rate": 0.0004984345896917984, "loss": 5.745, "mean_token_accuracy": 0.14652188569307328, "num_tokens": 10021434.0, "step": 5435 }, { "entropy": 5.826172637939453, "epoch": 0.4570468388993909, "grad_norm": 1.34375, "learning_rate": 0.0004984310613425076, "loss": 5.7465, "mean_token_accuracy": 0.1499892771244049, "num_tokens": 10030473.0, "step": 5440 }, { "entropy": 5.826717472076416, "epoch": 0.45746691871455575, "grad_norm": 1.3515625, "learning_rate": 0.0004984275290352506, "loss": 5.7186, "mean_token_accuracy": 0.15438036620616913, "num_tokens": 10039057.0, "step": 5445 }, { "entropy": 5.879296159744262, "epoch": 0.45788699852972065, "grad_norm": 1.2421875, "learning_rate": 0.0004984239927700899, "loss": 5.8158, "mean_token_accuracy": 0.14881263822317123, "num_tokens": 10047998.0, "step": 5450 }, { "entropy": 5.906938457489014, "epoch": 0.45830707834488554, "grad_norm": 1.5, "learning_rate": 0.0004984204525470883, "loss": 5.7609, "mean_token_accuracy": 0.14349722415208815, "num_tokens": 10057479.0, "step": 5455 }, { "entropy": 5.797386360168457, "epoch": 0.4587271581600504, "grad_norm": 1.2734375, "learning_rate": 0.0004984169083663084, "loss": 5.7284, "mean_token_accuracy": 0.14215826913714408, "num_tokens": 10067754.0, "step": 5460 }, { "entropy": 5.769140672683716, "epoch": 0.4591472379752153, "grad_norm": 1.2578125, "learning_rate": 0.0004984133602278129, "loss": 5.8074, "mean_token_accuracy": 0.14632865488529206, "num_tokens": 10076815.0, "step": 5465 }, { "entropy": 5.929787874221802, "epoch": 0.4595673177903802, "grad_norm": 1.4140625, "learning_rate": 0.000498409808131665, "loss": 5.8172, "mean_token_accuracy": 0.14295997619628906, "num_tokens": 10086300.0, "step": 5470 }, { "entropy": 5.820594406127929, "epoch": 0.4599873976055451, "grad_norm": 1.34375, "learning_rate": 0.0004984062520779272, "loss": 5.714, "mean_token_accuracy": 0.15191168189048768, "num_tokens": 10095383.0, "step": 5475 }, { "entropy": 5.724618530273437, "epoch": 0.4604074774207099, "grad_norm": 1.234375, "learning_rate": 0.0004984026920666628, "loss": 5.6894, "mean_token_accuracy": 0.14800985455513, "num_tokens": 10103971.0, "step": 5480 }, { "entropy": 5.801212739944458, "epoch": 0.4608275572358748, "grad_norm": 1.203125, "learning_rate": 0.0004983991280979347, "loss": 5.6852, "mean_token_accuracy": 0.15083771497011184, "num_tokens": 10113028.0, "step": 5485 }, { "entropy": 5.8347760200500485, "epoch": 0.4612476370510397, "grad_norm": 1.15625, "learning_rate": 0.0004983955601718061, "loss": 5.6653, "mean_token_accuracy": 0.14897917956113815, "num_tokens": 10121890.0, "step": 5490 }, { "entropy": 5.866452312469482, "epoch": 0.46166771686620456, "grad_norm": 1.3984375, "learning_rate": 0.0004983919882883401, "loss": 5.792, "mean_token_accuracy": 0.14483870565891266, "num_tokens": 10131655.0, "step": 5495 }, { "entropy": 5.878374290466309, "epoch": 0.46208779668136946, "grad_norm": 1.2578125, "learning_rate": 0.0004983884124476, "loss": 5.7823, "mean_token_accuracy": 0.1475811406970024, "num_tokens": 10140778.0, "step": 5500 }, { "entropy": 5.852365493774414, "epoch": 0.46250787649653435, "grad_norm": 1.53125, "learning_rate": 0.0004983848326496494, "loss": 5.841, "mean_token_accuracy": 0.14311327114701272, "num_tokens": 10150229.0, "step": 5505 }, { "entropy": 5.915929079055786, "epoch": 0.4629279563116992, "grad_norm": 1.3046875, "learning_rate": 0.0004983812488945513, "loss": 5.7457, "mean_token_accuracy": 0.1459451988339424, "num_tokens": 10158939.0, "step": 5510 }, { "entropy": 5.816072797775268, "epoch": 0.4633480361268641, "grad_norm": 1.3046875, "learning_rate": 0.0004983776611823696, "loss": 5.7201, "mean_token_accuracy": 0.1463111788034439, "num_tokens": 10168383.0, "step": 5515 }, { "entropy": 5.759098815917969, "epoch": 0.463768115942029, "grad_norm": 1.3984375, "learning_rate": 0.0004983740695131676, "loss": 5.7315, "mean_token_accuracy": 0.14871828705072404, "num_tokens": 10178678.0, "step": 5520 }, { "entropy": 5.8468421459197994, "epoch": 0.4641881957571939, "grad_norm": 1.2421875, "learning_rate": 0.000498370473887009, "loss": 5.7383, "mean_token_accuracy": 0.14837254881858825, "num_tokens": 10188964.0, "step": 5525 }, { "entropy": 5.893353366851807, "epoch": 0.46460827557235873, "grad_norm": 1.296875, "learning_rate": 0.0004983668743039573, "loss": 5.7476, "mean_token_accuracy": 0.15488529577851295, "num_tokens": 10198333.0, "step": 5530 }, { "entropy": 5.760245847702026, "epoch": 0.46502835538752363, "grad_norm": 1.6015625, "learning_rate": 0.0004983632707640766, "loss": 5.7603, "mean_token_accuracy": 0.14777156189084054, "num_tokens": 10207876.0, "step": 5535 }, { "entropy": 5.78720121383667, "epoch": 0.4654484352026885, "grad_norm": 1.1796875, "learning_rate": 0.0004983596632674306, "loss": 5.699, "mean_token_accuracy": 0.15150602906942368, "num_tokens": 10216822.0, "step": 5540 }, { "entropy": 5.902374649047852, "epoch": 0.46586851501785337, "grad_norm": 1.1640625, "learning_rate": 0.0004983560518140831, "loss": 5.8282, "mean_token_accuracy": 0.13706823736429213, "num_tokens": 10226887.0, "step": 5545 }, { "entropy": 5.864826297760009, "epoch": 0.46628859483301827, "grad_norm": 1.1875, "learning_rate": 0.0004983524364040982, "loss": 5.6734, "mean_token_accuracy": 0.1513754189014435, "num_tokens": 10235935.0, "step": 5550 }, { "entropy": 5.801887845993042, "epoch": 0.46670867464818316, "grad_norm": 1.3125, "learning_rate": 0.0004983488170375399, "loss": 5.6238, "mean_token_accuracy": 0.1499588668346405, "num_tokens": 10245590.0, "step": 5555 }, { "entropy": 5.719043874740601, "epoch": 0.46712875446334806, "grad_norm": 1.296875, "learning_rate": 0.0004983451937144723, "loss": 5.7005, "mean_token_accuracy": 0.14591586142778395, "num_tokens": 10255104.0, "step": 5560 }, { "entropy": 5.659116125106811, "epoch": 0.4675488342785129, "grad_norm": 1.25, "learning_rate": 0.0004983415664349595, "loss": 5.571, "mean_token_accuracy": 0.15797292441129684, "num_tokens": 10264236.0, "step": 5565 }, { "entropy": 5.817941188812256, "epoch": 0.4679689140936778, "grad_norm": 1.25, "learning_rate": 0.0004983379351990659, "loss": 5.6847, "mean_token_accuracy": 0.1529952183365822, "num_tokens": 10273335.0, "step": 5570 }, { "entropy": 5.723038148880005, "epoch": 0.4683889939088427, "grad_norm": 1.234375, "learning_rate": 0.0004983343000068559, "loss": 5.6501, "mean_token_accuracy": 0.1551787719130516, "num_tokens": 10282206.0, "step": 5575 }, { "entropy": 5.658549833297729, "epoch": 0.46880907372400754, "grad_norm": 1.234375, "learning_rate": 0.0004983306608583937, "loss": 5.6064, "mean_token_accuracy": 0.1606610283255577, "num_tokens": 10290056.0, "step": 5580 }, { "entropy": 5.765953302383423, "epoch": 0.46922915353917244, "grad_norm": 1.21875, "learning_rate": 0.0004983270177537438, "loss": 5.688, "mean_token_accuracy": 0.14721163958311081, "num_tokens": 10299726.0, "step": 5585 }, { "entropy": 5.791565895080566, "epoch": 0.46964923335433734, "grad_norm": 1.3046875, "learning_rate": 0.0004983233706929708, "loss": 5.7499, "mean_token_accuracy": 0.14841315150260925, "num_tokens": 10308696.0, "step": 5590 }, { "entropy": 5.832704639434814, "epoch": 0.4700693131695022, "grad_norm": 1.3125, "learning_rate": 0.0004983197196761392, "loss": 5.8361, "mean_token_accuracy": 0.14296030402183532, "num_tokens": 10317845.0, "step": 5595 }, { "entropy": 5.783017778396607, "epoch": 0.4704893929846671, "grad_norm": 1.34375, "learning_rate": 0.0004983160647033139, "loss": 5.7164, "mean_token_accuracy": 0.1538068726658821, "num_tokens": 10326563.0, "step": 5600 }, { "entropy": 5.82874116897583, "epoch": 0.470909472799832, "grad_norm": 1.2734375, "learning_rate": 0.0004983124057745595, "loss": 5.7062, "mean_token_accuracy": 0.14498351216316224, "num_tokens": 10335931.0, "step": 5605 }, { "entropy": 5.721787977218628, "epoch": 0.47132955261499687, "grad_norm": 1.265625, "learning_rate": 0.0004983087428899408, "loss": 5.7048, "mean_token_accuracy": 0.1419524259865284, "num_tokens": 10344984.0, "step": 5610 }, { "entropy": 5.79919285774231, "epoch": 0.4717496324301617, "grad_norm": 1.421875, "learning_rate": 0.0004983050760495227, "loss": 5.7392, "mean_token_accuracy": 0.1474568262696266, "num_tokens": 10353522.0, "step": 5615 }, { "entropy": 5.907325601577758, "epoch": 0.4721697122453266, "grad_norm": 1.375, "learning_rate": 0.0004983014052533702, "loss": 5.7558, "mean_token_accuracy": 0.15104926228523255, "num_tokens": 10363527.0, "step": 5620 }, { "entropy": 5.760684251785278, "epoch": 0.4725897920604915, "grad_norm": 1.1328125, "learning_rate": 0.0004982977305015481, "loss": 5.6642, "mean_token_accuracy": 0.14723679050803185, "num_tokens": 10372040.0, "step": 5625 }, { "entropy": 5.758435773849487, "epoch": 0.47300987187565635, "grad_norm": 1.296875, "learning_rate": 0.0004982940517941219, "loss": 5.6546, "mean_token_accuracy": 0.14704783335328103, "num_tokens": 10381279.0, "step": 5630 }, { "entropy": 5.856901502609253, "epoch": 0.47342995169082125, "grad_norm": 1.3359375, "learning_rate": 0.0004982903691311564, "loss": 5.812, "mean_token_accuracy": 0.14307899996638299, "num_tokens": 10390608.0, "step": 5635 }, { "entropy": 5.73579969406128, "epoch": 0.47385003150598615, "grad_norm": 1.3671875, "learning_rate": 0.0004982866825127172, "loss": 5.6083, "mean_token_accuracy": 0.1558460235595703, "num_tokens": 10399851.0, "step": 5640 }, { "entropy": 5.866163825988769, "epoch": 0.47427011132115104, "grad_norm": 1.21875, "learning_rate": 0.0004982829919388692, "loss": 5.9015, "mean_token_accuracy": 0.1417351670563221, "num_tokens": 10410425.0, "step": 5645 }, { "entropy": 5.8129795551300045, "epoch": 0.4746901911363159, "grad_norm": 1.2265625, "learning_rate": 0.0004982792974096781, "loss": 5.6731, "mean_token_accuracy": 0.15132275298237802, "num_tokens": 10418783.0, "step": 5650 }, { "entropy": 5.84749026298523, "epoch": 0.4751102709514808, "grad_norm": 1.4296875, "learning_rate": 0.000498275598925209, "loss": 5.8182, "mean_token_accuracy": 0.14057652279734612, "num_tokens": 10427360.0, "step": 5655 }, { "entropy": 5.890136814117431, "epoch": 0.4755303507666457, "grad_norm": 1.2734375, "learning_rate": 0.0004982718964855277, "loss": 5.7924, "mean_token_accuracy": 0.1445530578494072, "num_tokens": 10436613.0, "step": 5660 }, { "entropy": 5.866897964477539, "epoch": 0.4759504305818105, "grad_norm": 1.3828125, "learning_rate": 0.0004982681900907, "loss": 5.824, "mean_token_accuracy": 0.14708172976970674, "num_tokens": 10445055.0, "step": 5665 }, { "entropy": 5.747807741165161, "epoch": 0.4763705103969754, "grad_norm": 1.21875, "learning_rate": 0.000498264479740791, "loss": 5.6512, "mean_token_accuracy": 0.15550965517759324, "num_tokens": 10454516.0, "step": 5670 }, { "entropy": 5.870613145828247, "epoch": 0.4767905902121403, "grad_norm": 1.203125, "learning_rate": 0.0004982607654358668, "loss": 5.7869, "mean_token_accuracy": 0.15260702520608901, "num_tokens": 10463771.0, "step": 5675 }, { "entropy": 5.8192380428314205, "epoch": 0.47721067002730516, "grad_norm": 1.484375, "learning_rate": 0.000498257047175993, "loss": 5.7218, "mean_token_accuracy": 0.1428370013833046, "num_tokens": 10473783.0, "step": 5680 }, { "entropy": 5.814655399322509, "epoch": 0.47763074984247006, "grad_norm": 1.5, "learning_rate": 0.0004982533249612357, "loss": 5.7006, "mean_token_accuracy": 0.14914624020457268, "num_tokens": 10483424.0, "step": 5685 }, { "entropy": 5.733596324920654, "epoch": 0.47805082965763496, "grad_norm": 1.4921875, "learning_rate": 0.0004982495987916607, "loss": 5.6225, "mean_token_accuracy": 0.15583412498235702, "num_tokens": 10492536.0, "step": 5690 }, { "entropy": 5.762752342224121, "epoch": 0.47847090947279985, "grad_norm": 1.390625, "learning_rate": 0.0004982458686673339, "loss": 5.7437, "mean_token_accuracy": 0.14716439545154572, "num_tokens": 10501616.0, "step": 5695 }, { "entropy": 5.899823999404907, "epoch": 0.4788909892879647, "grad_norm": 1.3359375, "learning_rate": 0.0004982421345883217, "loss": 5.7689, "mean_token_accuracy": 0.14301676750183107, "num_tokens": 10511190.0, "step": 5700 }, { "entropy": 5.779811668395996, "epoch": 0.4793110691031296, "grad_norm": 1.3359375, "learning_rate": 0.0004982383965546898, "loss": 5.7065, "mean_token_accuracy": 0.14420118555426598, "num_tokens": 10520310.0, "step": 5705 }, { "entropy": 5.831453895568847, "epoch": 0.4797311489182945, "grad_norm": 1.421875, "learning_rate": 0.0004982346545665048, "loss": 5.6901, "mean_token_accuracy": 0.15481160432100297, "num_tokens": 10528711.0, "step": 5710 }, { "entropy": 5.8021493434906, "epoch": 0.48015122873345933, "grad_norm": 1.265625, "learning_rate": 0.0004982309086238328, "loss": 5.784, "mean_token_accuracy": 0.1420671336352825, "num_tokens": 10538484.0, "step": 5715 }, { "entropy": 5.836937665939331, "epoch": 0.48057130854862423, "grad_norm": 1.421875, "learning_rate": 0.0004982271587267403, "loss": 5.7229, "mean_token_accuracy": 0.1457110583782196, "num_tokens": 10547623.0, "step": 5720 }, { "entropy": 5.850609970092774, "epoch": 0.48099138836378913, "grad_norm": 1.3671875, "learning_rate": 0.0004982234048752935, "loss": 5.6716, "mean_token_accuracy": 0.14635915756225587, "num_tokens": 10556234.0, "step": 5725 }, { "entropy": 5.8946315288543705, "epoch": 0.481411468178954, "grad_norm": 1.4296875, "learning_rate": 0.000498219647069559, "loss": 5.8951, "mean_token_accuracy": 0.13762107565999032, "num_tokens": 10566308.0, "step": 5730 }, { "entropy": 5.863095760345459, "epoch": 0.48183154799411887, "grad_norm": 1.515625, "learning_rate": 0.0004982158853096035, "loss": 5.8416, "mean_token_accuracy": 0.13780406042933463, "num_tokens": 10575212.0, "step": 5735 }, { "entropy": 5.846006250381469, "epoch": 0.48225162780928377, "grad_norm": 1.2734375, "learning_rate": 0.0004982121195954935, "loss": 5.6097, "mean_token_accuracy": 0.1565147191286087, "num_tokens": 10584590.0, "step": 5740 }, { "entropy": 5.7412327289581295, "epoch": 0.48267170762444866, "grad_norm": 1.2421875, "learning_rate": 0.0004982083499272957, "loss": 5.67, "mean_token_accuracy": 0.14907400235533713, "num_tokens": 10593997.0, "step": 5745 }, { "entropy": 5.807267189025879, "epoch": 0.4830917874396135, "grad_norm": 1.265625, "learning_rate": 0.0004982045763050768, "loss": 5.8041, "mean_token_accuracy": 0.1457974396646023, "num_tokens": 10603299.0, "step": 5750 }, { "entropy": 5.819139719009399, "epoch": 0.4835118672547784, "grad_norm": 1.25, "learning_rate": 0.0004982007987289041, "loss": 5.7361, "mean_token_accuracy": 0.14838966429233552, "num_tokens": 10613546.0, "step": 5755 }, { "entropy": 5.766327381134033, "epoch": 0.4839319470699433, "grad_norm": 1.6484375, "learning_rate": 0.0004981970171988439, "loss": 5.7119, "mean_token_accuracy": 0.15573283806443214, "num_tokens": 10622966.0, "step": 5760 }, { "entropy": 5.8138025283813475, "epoch": 0.48435202688510814, "grad_norm": 1.5703125, "learning_rate": 0.0004981932317149636, "loss": 5.7886, "mean_token_accuracy": 0.14435389563441275, "num_tokens": 10633441.0, "step": 5765 }, { "entropy": 5.843524694442749, "epoch": 0.48477210670027304, "grad_norm": 1.46875, "learning_rate": 0.00049818944227733, "loss": 5.7587, "mean_token_accuracy": 0.14196690768003464, "num_tokens": 10643124.0, "step": 5770 }, { "entropy": 5.8232954978942875, "epoch": 0.48519218651543794, "grad_norm": 1.3671875, "learning_rate": 0.0004981856488860105, "loss": 5.7342, "mean_token_accuracy": 0.14407299906015397, "num_tokens": 10652517.0, "step": 5775 }, { "entropy": 5.807135486602784, "epoch": 0.48561226633060284, "grad_norm": 1.3203125, "learning_rate": 0.0004981818515410721, "loss": 5.7752, "mean_token_accuracy": 0.1420580416917801, "num_tokens": 10663352.0, "step": 5780 }, { "entropy": 5.849091625213623, "epoch": 0.4860323461457677, "grad_norm": 1.4453125, "learning_rate": 0.0004981780502425821, "loss": 5.7942, "mean_token_accuracy": 0.14775322377681732, "num_tokens": 10672430.0, "step": 5785 }, { "entropy": 5.835392618179322, "epoch": 0.4864524259609326, "grad_norm": 1.4921875, "learning_rate": 0.0004981742449906079, "loss": 5.7391, "mean_token_accuracy": 0.1527121603488922, "num_tokens": 10681908.0, "step": 5790 }, { "entropy": 5.863359975814819, "epoch": 0.4868725057760975, "grad_norm": 1.4609375, "learning_rate": 0.0004981704357852168, "loss": 5.7322, "mean_token_accuracy": 0.14731571227312087, "num_tokens": 10691259.0, "step": 5795 }, { "entropy": 5.765371608734131, "epoch": 0.4872925855912623, "grad_norm": 1.3515625, "learning_rate": 0.0004981666226264764, "loss": 5.6277, "mean_token_accuracy": 0.14454422146081924, "num_tokens": 10699668.0, "step": 5800 }, { "entropy": 5.800967359542847, "epoch": 0.4877126654064272, "grad_norm": 1.125, "learning_rate": 0.0004981628055144542, "loss": 5.6801, "mean_token_accuracy": 0.15341386646032334, "num_tokens": 10709146.0, "step": 5805 }, { "entropy": 5.810514068603515, "epoch": 0.4881327452215921, "grad_norm": 1.828125, "learning_rate": 0.0004981589844492177, "loss": 5.7695, "mean_token_accuracy": 0.14522838592529297, "num_tokens": 10718724.0, "step": 5810 }, { "entropy": 5.761872911453247, "epoch": 0.488552825036757, "grad_norm": 1.3515625, "learning_rate": 0.0004981551594308349, "loss": 5.7216, "mean_token_accuracy": 0.147171813249588, "num_tokens": 10728101.0, "step": 5815 }, { "entropy": 5.862204074859619, "epoch": 0.48897290485192185, "grad_norm": 1.4609375, "learning_rate": 0.0004981513304593733, "loss": 5.7442, "mean_token_accuracy": 0.15811584144830704, "num_tokens": 10736750.0, "step": 5820 }, { "entropy": 5.867376232147217, "epoch": 0.48939298466708675, "grad_norm": 1.3671875, "learning_rate": 0.0004981474975349006, "loss": 5.9234, "mean_token_accuracy": 0.1416974514722824, "num_tokens": 10746914.0, "step": 5825 }, { "entropy": 5.859874296188354, "epoch": 0.48981306448225165, "grad_norm": 1.5234375, "learning_rate": 0.000498143660657485, "loss": 5.7529, "mean_token_accuracy": 0.1483662724494934, "num_tokens": 10755786.0, "step": 5830 }, { "entropy": 5.708901691436767, "epoch": 0.4902331442974165, "grad_norm": 1.34375, "learning_rate": 0.0004981398198271944, "loss": 5.62, "mean_token_accuracy": 0.15774477571249007, "num_tokens": 10764821.0, "step": 5835 }, { "entropy": 5.774704790115356, "epoch": 0.4906532241125814, "grad_norm": 1.2734375, "learning_rate": 0.0004981359750440968, "loss": 5.7095, "mean_token_accuracy": 0.1461211383342743, "num_tokens": 10773569.0, "step": 5840 }, { "entropy": 5.725243186950683, "epoch": 0.4910733039277463, "grad_norm": 1.5078125, "learning_rate": 0.0004981321263082603, "loss": 5.6856, "mean_token_accuracy": 0.1431654214859009, "num_tokens": 10782298.0, "step": 5845 }, { "entropy": 5.738628387451172, "epoch": 0.4914933837429111, "grad_norm": 1.28125, "learning_rate": 0.000498128273619753, "loss": 5.6828, "mean_token_accuracy": 0.14975926280021667, "num_tokens": 10792087.0, "step": 5850 }, { "entropy": 5.819939708709716, "epoch": 0.491913463558076, "grad_norm": 1.3203125, "learning_rate": 0.0004981244169786433, "loss": 5.7701, "mean_token_accuracy": 0.14303557947278023, "num_tokens": 10801641.0, "step": 5855 }, { "entropy": 5.9354065418243405, "epoch": 0.4923335433732409, "grad_norm": 1.328125, "learning_rate": 0.0004981205563849994, "loss": 5.8349, "mean_token_accuracy": 0.14316534698009492, "num_tokens": 10811612.0, "step": 5860 }, { "entropy": 5.813403034210205, "epoch": 0.4927536231884058, "grad_norm": 1.4609375, "learning_rate": 0.0004981166918388897, "loss": 5.6588, "mean_token_accuracy": 0.15225751399993898, "num_tokens": 10821608.0, "step": 5865 }, { "entropy": 5.730116319656372, "epoch": 0.49317370300357066, "grad_norm": 1.390625, "learning_rate": 0.0004981128233403828, "loss": 5.6081, "mean_token_accuracy": 0.15562243461608888, "num_tokens": 10830679.0, "step": 5870 }, { "entropy": 5.774923992156983, "epoch": 0.49359378281873556, "grad_norm": 1.4921875, "learning_rate": 0.000498108950889547, "loss": 5.6988, "mean_token_accuracy": 0.14943243488669394, "num_tokens": 10839669.0, "step": 5875 }, { "entropy": 5.790045118331909, "epoch": 0.49401386263390046, "grad_norm": 1.3046875, "learning_rate": 0.0004981050744864512, "loss": 5.6749, "mean_token_accuracy": 0.14829235821962355, "num_tokens": 10849666.0, "step": 5880 }, { "entropy": 5.747752380371094, "epoch": 0.4944339424490653, "grad_norm": 1.59375, "learning_rate": 0.0004981011941311638, "loss": 5.5867, "mean_token_accuracy": 0.15378802865743638, "num_tokens": 10858225.0, "step": 5885 }, { "entropy": 5.754449367523193, "epoch": 0.4948540222642302, "grad_norm": 1.3515625, "learning_rate": 0.0004980973098237535, "loss": 5.6944, "mean_token_accuracy": 0.14568711966276168, "num_tokens": 10867466.0, "step": 5890 }, { "entropy": 5.807771825790406, "epoch": 0.4952741020793951, "grad_norm": 1.34375, "learning_rate": 0.0004980934215642894, "loss": 5.734, "mean_token_accuracy": 0.15061958581209184, "num_tokens": 10875850.0, "step": 5895 }, { "entropy": 5.799838733673096, "epoch": 0.49569418189456, "grad_norm": 1.2578125, "learning_rate": 0.00049808952935284, "loss": 5.6623, "mean_token_accuracy": 0.15344742313027382, "num_tokens": 10885154.0, "step": 5900 }, { "entropy": 5.781932210922241, "epoch": 0.49611426170972484, "grad_norm": 1.65625, "learning_rate": 0.0004980856331894747, "loss": 5.7613, "mean_token_accuracy": 0.1433933824300766, "num_tokens": 10894080.0, "step": 5905 }, { "entropy": 5.767203950881958, "epoch": 0.49653434152488973, "grad_norm": 1.3828125, "learning_rate": 0.0004980817330742621, "loss": 5.7532, "mean_token_accuracy": 0.14163193702697754, "num_tokens": 10903248.0, "step": 5910 }, { "entropy": 5.848739862442017, "epoch": 0.49695442134005463, "grad_norm": 1.3046875, "learning_rate": 0.0004980778290072716, "loss": 5.7241, "mean_token_accuracy": 0.14693543910980225, "num_tokens": 10912939.0, "step": 5915 }, { "entropy": 5.852625370025635, "epoch": 0.4973745011552195, "grad_norm": 1.8515625, "learning_rate": 0.0004980739209885722, "loss": 5.7214, "mean_token_accuracy": 0.15410863906145095, "num_tokens": 10921505.0, "step": 5920 }, { "entropy": 5.887701892852784, "epoch": 0.49779458097038437, "grad_norm": 1.484375, "learning_rate": 0.0004980700090182331, "loss": 5.8102, "mean_token_accuracy": 0.15374419540166856, "num_tokens": 10931861.0, "step": 5925 }, { "entropy": 5.864461851119995, "epoch": 0.49821466078554927, "grad_norm": 1.5, "learning_rate": 0.0004980660930963238, "loss": 5.7275, "mean_token_accuracy": 0.14706841260194778, "num_tokens": 10940810.0, "step": 5930 }, { "entropy": 5.740756177902222, "epoch": 0.4986347406007141, "grad_norm": 1.359375, "learning_rate": 0.0004980621732229133, "loss": 5.6064, "mean_token_accuracy": 0.15016069263219833, "num_tokens": 10949514.0, "step": 5935 }, { "entropy": 5.8102155208587645, "epoch": 0.499054820415879, "grad_norm": 1.8984375, "learning_rate": 0.0004980582493980714, "loss": 5.8295, "mean_token_accuracy": 0.14151460081338882, "num_tokens": 10959161.0, "step": 5940 }, { "entropy": 5.781671619415283, "epoch": 0.4994749002310439, "grad_norm": 1.4609375, "learning_rate": 0.0004980543216218674, "loss": 5.6979, "mean_token_accuracy": 0.15644685178995132, "num_tokens": 10968983.0, "step": 5945 }, { "entropy": 5.819313478469849, "epoch": 0.4998949800462088, "grad_norm": 1.3203125, "learning_rate": 0.0004980503898943711, "loss": 5.8145, "mean_token_accuracy": 0.14870925694704057, "num_tokens": 10978044.0, "step": 5950 }, { "entropy": 5.8994536876678465, "epoch": 0.5003150598613737, "grad_norm": 1.328125, "learning_rate": 0.0004980464542156519, "loss": 5.7276, "mean_token_accuracy": 0.15195106863975524, "num_tokens": 10986980.0, "step": 5955 }, { "entropy": 5.804496479034424, "epoch": 0.5007351396765385, "grad_norm": 1.4140625, "learning_rate": 0.0004980425145857796, "loss": 5.6674, "mean_token_accuracy": 0.16026414483785628, "num_tokens": 10995163.0, "step": 5960 }, { "entropy": 5.705014753341675, "epoch": 0.5011552194917034, "grad_norm": 1.28125, "learning_rate": 0.000498038571004824, "loss": 5.6084, "mean_token_accuracy": 0.15652424693107606, "num_tokens": 11003722.0, "step": 5965 }, { "entropy": 5.725177145004272, "epoch": 0.5015752993068683, "grad_norm": 1.1640625, "learning_rate": 0.0004980346234728549, "loss": 5.6641, "mean_token_accuracy": 0.1604843556880951, "num_tokens": 11013176.0, "step": 5970 }, { "entropy": 5.815953350067138, "epoch": 0.5019953791220332, "grad_norm": 1.2578125, "learning_rate": 0.0004980306719899424, "loss": 5.7337, "mean_token_accuracy": 0.14801866561174393, "num_tokens": 11022636.0, "step": 5975 }, { "entropy": 5.758071231842041, "epoch": 0.5024154589371981, "grad_norm": 1.390625, "learning_rate": 0.0004980267165561564, "loss": 5.6897, "mean_token_accuracy": 0.1525172308087349, "num_tokens": 11031896.0, "step": 5980 }, { "entropy": 5.801525115966797, "epoch": 0.502835538752363, "grad_norm": 1.3984375, "learning_rate": 0.0004980227571715669, "loss": 5.7359, "mean_token_accuracy": 0.14860138446092605, "num_tokens": 11040802.0, "step": 5985 }, { "entropy": 5.774550342559815, "epoch": 0.5032556185675279, "grad_norm": 1.2109375, "learning_rate": 0.0004980187938362441, "loss": 5.6483, "mean_token_accuracy": 0.14492484778165818, "num_tokens": 11049701.0, "step": 5990 }, { "entropy": 5.841704797744751, "epoch": 0.5036756983826927, "grad_norm": 1.2734375, "learning_rate": 0.0004980148265502581, "loss": 5.831, "mean_token_accuracy": 0.14276057258248329, "num_tokens": 11059555.0, "step": 5995 }, { "entropy": 5.823680114746094, "epoch": 0.5040957781978576, "grad_norm": 1.3671875, "learning_rate": 0.0004980108553136795, "loss": 5.7444, "mean_token_accuracy": 0.14943215250968933, "num_tokens": 11068940.0, "step": 6000 }, { "epoch": 0.5040957781978576, "eval_entropy": 5.697147493315468, "eval_loss": 5.740706920623779, "eval_mean_token_accuracy": 0.15491213997795275, "eval_num_tokens": 11068940.0, "eval_runtime": 27.2377, "eval_samples_per_second": 1371.85, "eval_steps_per_second": 171.49, "step": 6000 } ], "logging_steps": 5, "max_steps": 119020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6177520553984e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }