{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0162990968283974, "eval_steps": 3000, "global_step": 24000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 10.69201192855835, "epoch": 0.0004200798151648813, "grad_norm": 13.375, "learning_rate": 2e-06, "loss": 10.8001, "mean_token_accuracy": 0.0, "num_tokens": 8348.0, "step": 5 }, { "entropy": 10.691978454589844, "epoch": 0.0008401596303297626, "grad_norm": 12.5, "learning_rate": 4.5e-06, "loss": 10.7548, "mean_token_accuracy": 0.00010881392518058419, "num_tokens": 17465.0, "step": 10 }, { "entropy": 10.691164684295654, "epoch": 0.001260239445494644, "grad_norm": 9.9375, "learning_rate": 7e-06, "loss": 10.5365, "mean_token_accuracy": 0.021085147676058114, "num_tokens": 26627.0, "step": 15 }, { "entropy": 10.678658771514893, "epoch": 0.0016803192606595252, "grad_norm": 6.46875, "learning_rate": 9.5e-06, "loss": 10.2026, "mean_token_accuracy": 0.046403773874044416, "num_tokens": 36069.0, "step": 20 }, { "entropy": 10.598964595794678, "epoch": 0.002100399075824407, "grad_norm": 4.46875, "learning_rate": 1.2e-05, "loss": 9.8984, "mean_token_accuracy": 0.04546841159462929, "num_tokens": 44967.0, "step": 25 }, { "entropy": 10.592682838439941, "epoch": 0.002520478890989288, "grad_norm": 3.25, "learning_rate": 1.4500000000000002e-05, "loss": 9.8253, "mean_token_accuracy": 0.04163686409592628, "num_tokens": 55132.0, "step": 30 }, { "entropy": 10.616032028198243, "epoch": 0.0029405587061541692, "grad_norm": 2.734375, "learning_rate": 1.7000000000000003e-05, "loss": 9.6909, "mean_token_accuracy": 0.04541983306407928, "num_tokens": 65141.0, "step": 35 }, { "entropy": 10.587666893005371, "epoch": 0.0033606385213190504, "grad_norm": 2.453125, "learning_rate": 1.95e-05, "loss": 9.6967, "mean_token_accuracy": 0.040509892627596855, "num_tokens": 74007.0, "step": 40 }, { "entropy": 10.587863063812256, "epoch": 0.003780718336483932, "grad_norm": 2.453125, "learning_rate": 2.2e-05, "loss": 9.6278, "mean_token_accuracy": 0.04380051270127296, "num_tokens": 83736.0, "step": 45 }, { "entropy": 10.581284713745116, "epoch": 0.004200798151648814, "grad_norm": 2.359375, "learning_rate": 2.4500000000000003e-05, "loss": 9.5554, "mean_token_accuracy": 0.04462047629058361, "num_tokens": 92525.0, "step": 50 }, { "entropy": 10.579821586608887, "epoch": 0.004620877966813695, "grad_norm": 2.515625, "learning_rate": 2.7e-05, "loss": 9.5042, "mean_token_accuracy": 0.0499776991084218, "num_tokens": 102015.0, "step": 55 }, { "entropy": 10.527470588684082, "epoch": 0.005040957781978576, "grad_norm": 2.203125, "learning_rate": 2.95e-05, "loss": 9.4648, "mean_token_accuracy": 0.05102687180042267, "num_tokens": 110887.0, "step": 60 }, { "entropy": 10.398450374603271, "epoch": 0.005461037597143457, "grad_norm": 2.265625, "learning_rate": 3.2e-05, "loss": 9.3768, "mean_token_accuracy": 0.05401572398841381, "num_tokens": 120442.0, "step": 65 }, { "entropy": 10.466637897491456, "epoch": 0.0058811174123083385, "grad_norm": 2.34375, "learning_rate": 3.4500000000000005e-05, "loss": 9.2516, "mean_token_accuracy": 0.05276094898581505, "num_tokens": 129297.0, "step": 70 }, { "entropy": 10.477723217010498, "epoch": 0.00630119722747322, "grad_norm": 2.1875, "learning_rate": 3.7e-05, "loss": 9.1585, "mean_token_accuracy": 0.05686353407800197, "num_tokens": 138305.0, "step": 75 }, { "entropy": 10.401033782958985, "epoch": 0.006721277042638101, "grad_norm": 2.3125, "learning_rate": 3.95e-05, "loss": 9.0976, "mean_token_accuracy": 0.055690228939056396, "num_tokens": 147640.0, "step": 80 }, { "entropy": 10.44783878326416, "epoch": 0.007141356857802983, "grad_norm": 2.1875, "learning_rate": 4.2000000000000004e-05, "loss": 8.9803, "mean_token_accuracy": 0.05669833719730377, "num_tokens": 157633.0, "step": 85 }, { "entropy": 10.396310806274414, "epoch": 0.007561436672967864, "grad_norm": 1.921875, "learning_rate": 4.45e-05, "loss": 8.9499, "mean_token_accuracy": 0.05056734494864941, "num_tokens": 167984.0, "step": 90 }, { "entropy": 10.333494663238525, "epoch": 0.007981516488132745, "grad_norm": 1.90625, "learning_rate": 4.7000000000000004e-05, "loss": 8.8301, "mean_token_accuracy": 0.06639725379645825, "num_tokens": 176984.0, "step": 95 }, { "entropy": 10.28737268447876, "epoch": 0.008401596303297627, "grad_norm": 2.171875, "learning_rate": 4.9500000000000004e-05, "loss": 8.654, "mean_token_accuracy": 0.06538619883358479, "num_tokens": 185931.0, "step": 100 }, { "entropy": 10.208460235595703, "epoch": 0.008821676118462508, "grad_norm": 2.921875, "learning_rate": 5.2e-05, "loss": 8.6478, "mean_token_accuracy": 0.050938266515731814, "num_tokens": 195065.0, "step": 105 }, { "entropy": 10.092334175109864, "epoch": 0.00924175593362739, "grad_norm": 1.9453125, "learning_rate": 5.45e-05, "loss": 8.5099, "mean_token_accuracy": 0.06477361544966698, "num_tokens": 203687.0, "step": 110 }, { "entropy": 10.105284690856934, "epoch": 0.00966183574879227, "grad_norm": 1.9296875, "learning_rate": 5.7e-05, "loss": 8.4081, "mean_token_accuracy": 0.0666894868016243, "num_tokens": 212847.0, "step": 115 }, { "entropy": 9.957781219482422, "epoch": 0.010081915563957152, "grad_norm": 1.71875, "learning_rate": 5.9499999999999996e-05, "loss": 8.3004, "mean_token_accuracy": 0.0674133587628603, "num_tokens": 222593.0, "step": 120 }, { "entropy": 9.889359092712402, "epoch": 0.010501995379122032, "grad_norm": 1.6953125, "learning_rate": 6.2e-05, "loss": 8.129, "mean_token_accuracy": 0.07197456955909728, "num_tokens": 231174.0, "step": 125 }, { "entropy": 9.669556808471679, "epoch": 0.010922075194286915, "grad_norm": 1.703125, "learning_rate": 6.450000000000001e-05, "loss": 7.9843, "mean_token_accuracy": 0.07425511926412583, "num_tokens": 239833.0, "step": 130 }, { "entropy": 9.519672775268555, "epoch": 0.011342155009451797, "grad_norm": 1.4296875, "learning_rate": 6.7e-05, "loss": 8.0143, "mean_token_accuracy": 0.07254141308367253, "num_tokens": 248794.0, "step": 135 }, { "entropy": 9.303325176239014, "epoch": 0.011762234824616677, "grad_norm": 1.6953125, "learning_rate": 6.950000000000001e-05, "loss": 7.9537, "mean_token_accuracy": 0.07010119631886483, "num_tokens": 257123.0, "step": 140 }, { "entropy": 9.143257808685302, "epoch": 0.012182314639781559, "grad_norm": 1.3359375, "learning_rate": 7.2e-05, "loss": 7.6458, "mean_token_accuracy": 0.07959595024585724, "num_tokens": 266088.0, "step": 145 }, { "entropy": 8.888239574432372, "epoch": 0.01260239445494644, "grad_norm": 1.15625, "learning_rate": 7.45e-05, "loss": 7.8236, "mean_token_accuracy": 0.07102414257824421, "num_tokens": 276074.0, "step": 150 }, { "entropy": 8.727731895446777, "epoch": 0.013022474270111321, "grad_norm": 1.265625, "learning_rate": 7.7e-05, "loss": 7.7082, "mean_token_accuracy": 0.07570267021656037, "num_tokens": 285280.0, "step": 155 }, { "entropy": 8.563877964019776, "epoch": 0.013442554085276202, "grad_norm": 1.1328125, "learning_rate": 7.950000000000001e-05, "loss": 7.6962, "mean_token_accuracy": 0.06895132511854171, "num_tokens": 296115.0, "step": 160 }, { "entropy": 8.412875747680664, "epoch": 0.013862633900441084, "grad_norm": 1.2734375, "learning_rate": 8.2e-05, "loss": 7.5497, "mean_token_accuracy": 0.07601302340626717, "num_tokens": 305483.0, "step": 165 }, { "entropy": 8.340911769866944, "epoch": 0.014282713715605966, "grad_norm": 1.2109375, "learning_rate": 8.450000000000001e-05, "loss": 7.5593, "mean_token_accuracy": 0.07040085420012474, "num_tokens": 314000.0, "step": 170 }, { "entropy": 8.245043659210205, "epoch": 0.014702793530770846, "grad_norm": 1.5234375, "learning_rate": 8.7e-05, "loss": 7.5541, "mean_token_accuracy": 0.07777635231614113, "num_tokens": 323667.0, "step": 175 }, { "entropy": 8.15629415512085, "epoch": 0.015122873345935728, "grad_norm": 1.4296875, "learning_rate": 8.95e-05, "loss": 7.5554, "mean_token_accuracy": 0.07515333034098148, "num_tokens": 332695.0, "step": 180 }, { "entropy": 8.065321111679078, "epoch": 0.015542953161100609, "grad_norm": 1.1875, "learning_rate": 9.2e-05, "loss": 7.3947, "mean_token_accuracy": 0.07709791958332061, "num_tokens": 342428.0, "step": 185 }, { "entropy": 8.054158020019532, "epoch": 0.01596303297626549, "grad_norm": 1.140625, "learning_rate": 9.45e-05, "loss": 7.5079, "mean_token_accuracy": 0.0735605925321579, "num_tokens": 353587.0, "step": 190 }, { "entropy": 7.988022661209106, "epoch": 0.01638311279143037, "grad_norm": 1.34375, "learning_rate": 9.7e-05, "loss": 7.443, "mean_token_accuracy": 0.07551693692803382, "num_tokens": 362997.0, "step": 195 }, { "entropy": 8.02585473060608, "epoch": 0.016803192606595255, "grad_norm": 1.1796875, "learning_rate": 9.95e-05, "loss": 7.4821, "mean_token_accuracy": 0.07873391062021255, "num_tokens": 372346.0, "step": 200 }, { "entropy": 7.984146022796631, "epoch": 0.017223272421760135, "grad_norm": 1.65625, "learning_rate": 0.000102, "loss": 7.3473, "mean_token_accuracy": 0.07624267861247062, "num_tokens": 381575.0, "step": 205 }, { "entropy": 7.912975454330445, "epoch": 0.017643352236925015, "grad_norm": 1.171875, "learning_rate": 0.00010449999999999999, "loss": 7.4236, "mean_token_accuracy": 0.0766436841338873, "num_tokens": 390706.0, "step": 210 }, { "entropy": 7.888600492477417, "epoch": 0.018063432052089896, "grad_norm": 1.34375, "learning_rate": 0.000107, "loss": 7.4209, "mean_token_accuracy": 0.0734835498034954, "num_tokens": 400000.0, "step": 215 }, { "entropy": 7.803367996215821, "epoch": 0.01848351186725478, "grad_norm": 1.28125, "learning_rate": 0.0001095, "loss": 7.3774, "mean_token_accuracy": 0.08182684779167175, "num_tokens": 409447.0, "step": 220 }, { "entropy": 7.875886058807373, "epoch": 0.01890359168241966, "grad_norm": 1.4921875, "learning_rate": 0.000112, "loss": 7.3393, "mean_token_accuracy": 0.08449244052171707, "num_tokens": 418417.0, "step": 225 }, { "entropy": 7.78724856376648, "epoch": 0.01932367149758454, "grad_norm": 1.359375, "learning_rate": 0.0001145, "loss": 7.3048, "mean_token_accuracy": 0.08006256446242332, "num_tokens": 427619.0, "step": 230 }, { "entropy": 7.736767053604126, "epoch": 0.019743751312749424, "grad_norm": 1.421875, "learning_rate": 0.00011700000000000001, "loss": 7.372, "mean_token_accuracy": 0.07579129710793495, "num_tokens": 437931.0, "step": 235 }, { "entropy": 7.841858673095703, "epoch": 0.020163831127914304, "grad_norm": 1.3359375, "learning_rate": 0.00011949999999999999, "loss": 7.4001, "mean_token_accuracy": 0.08351109325885772, "num_tokens": 447595.0, "step": 240 }, { "entropy": 7.7983135223388675, "epoch": 0.020583910943079185, "grad_norm": 1.2890625, "learning_rate": 0.000122, "loss": 7.2633, "mean_token_accuracy": 0.07488272562623025, "num_tokens": 457062.0, "step": 245 }, { "entropy": 7.813820743560791, "epoch": 0.021003990758244065, "grad_norm": 1.46875, "learning_rate": 0.0001245, "loss": 7.3567, "mean_token_accuracy": 0.07759504988789559, "num_tokens": 466191.0, "step": 250 }, { "entropy": 7.757200431823731, "epoch": 0.02142407057340895, "grad_norm": 1.484375, "learning_rate": 0.000127, "loss": 7.3146, "mean_token_accuracy": 0.08031945005059242, "num_tokens": 475693.0, "step": 255 }, { "entropy": 7.7279805660247805, "epoch": 0.02184415038857383, "grad_norm": 1.25, "learning_rate": 0.0001295, "loss": 7.3269, "mean_token_accuracy": 0.08141026981174945, "num_tokens": 485173.0, "step": 260 }, { "entropy": 7.724671411514282, "epoch": 0.02226423020373871, "grad_norm": 1.2265625, "learning_rate": 0.000132, "loss": 7.2369, "mean_token_accuracy": 0.083962532132864, "num_tokens": 493985.0, "step": 265 }, { "entropy": 7.6601485252380375, "epoch": 0.022684310018903593, "grad_norm": 1.3125, "learning_rate": 0.00013450000000000002, "loss": 7.2687, "mean_token_accuracy": 0.08190520852804184, "num_tokens": 502837.0, "step": 270 }, { "entropy": 7.751116943359375, "epoch": 0.023104389834068473, "grad_norm": 1.328125, "learning_rate": 0.00013700000000000002, "loss": 7.2065, "mean_token_accuracy": 0.0843705341219902, "num_tokens": 511503.0, "step": 275 }, { "entropy": 7.717013120651245, "epoch": 0.023524469649233354, "grad_norm": 1.28125, "learning_rate": 0.0001395, "loss": 7.4058, "mean_token_accuracy": 0.08034609854221345, "num_tokens": 521499.0, "step": 280 }, { "entropy": 7.592406368255615, "epoch": 0.023944549464398234, "grad_norm": 1.3984375, "learning_rate": 0.00014199999999999998, "loss": 7.166, "mean_token_accuracy": 0.08277052193880081, "num_tokens": 530067.0, "step": 285 }, { "entropy": 7.6297852993011475, "epoch": 0.024364629279563118, "grad_norm": 1.2734375, "learning_rate": 0.0001445, "loss": 7.1721, "mean_token_accuracy": 0.08475914299488067, "num_tokens": 538559.0, "step": 290 }, { "entropy": 7.705462646484375, "epoch": 0.024784709094728, "grad_norm": 1.1953125, "learning_rate": 0.000147, "loss": 7.3653, "mean_token_accuracy": 0.07328721843659877, "num_tokens": 547288.0, "step": 295 }, { "entropy": 7.596541261672973, "epoch": 0.02520478890989288, "grad_norm": 1.203125, "learning_rate": 0.0001495, "loss": 7.2357, "mean_token_accuracy": 0.07816045507788658, "num_tokens": 557269.0, "step": 300 }, { "entropy": 7.701767444610596, "epoch": 0.025624868725057762, "grad_norm": 1.6953125, "learning_rate": 0.000152, "loss": 7.2628, "mean_token_accuracy": 0.07311495915055274, "num_tokens": 567280.0, "step": 305 }, { "entropy": 7.602482271194458, "epoch": 0.026044948540222643, "grad_norm": 1.1484375, "learning_rate": 0.00015450000000000001, "loss": 7.0908, "mean_token_accuracy": 0.08299101889133453, "num_tokens": 576609.0, "step": 310 }, { "entropy": 7.399111747741699, "epoch": 0.026465028355387523, "grad_norm": 1.1875, "learning_rate": 0.000157, "loss": 7.0032, "mean_token_accuracy": 0.09095181971788406, "num_tokens": 586053.0, "step": 315 }, { "entropy": 7.507453203201294, "epoch": 0.026885108170552403, "grad_norm": 1.2265625, "learning_rate": 0.0001595, "loss": 7.203, "mean_token_accuracy": 0.08823259696364402, "num_tokens": 594649.0, "step": 320 }, { "entropy": 7.599713850021362, "epoch": 0.027305187985717287, "grad_norm": 1.34375, "learning_rate": 0.000162, "loss": 7.1383, "mean_token_accuracy": 0.08195743858814239, "num_tokens": 603445.0, "step": 325 }, { "entropy": 7.587759685516358, "epoch": 0.027725267800882167, "grad_norm": 1.3125, "learning_rate": 0.00016450000000000001, "loss": 7.2543, "mean_token_accuracy": 0.07800514288246632, "num_tokens": 613611.0, "step": 330 }, { "entropy": 7.745543384552002, "epoch": 0.028145347616047048, "grad_norm": 1.3515625, "learning_rate": 0.00016700000000000002, "loss": 7.429, "mean_token_accuracy": 0.07839688062667846, "num_tokens": 623024.0, "step": 335 }, { "entropy": 7.4431709289550785, "epoch": 0.02856542743121193, "grad_norm": 1.2265625, "learning_rate": 0.00016950000000000003, "loss": 7.1028, "mean_token_accuracy": 0.08672705665230751, "num_tokens": 631624.0, "step": 340 }, { "entropy": 7.574361371994018, "epoch": 0.028985507246376812, "grad_norm": 1.3515625, "learning_rate": 0.00017199999999999998, "loss": 7.0557, "mean_token_accuracy": 0.08923942148685456, "num_tokens": 640473.0, "step": 345 }, { "entropy": 7.541849613189697, "epoch": 0.029405587061541692, "grad_norm": 1.3125, "learning_rate": 0.00017449999999999999, "loss": 7.2383, "mean_token_accuracy": 0.08173563033342361, "num_tokens": 649692.0, "step": 350 }, { "entropy": 7.571516275405884, "epoch": 0.029825666876706573, "grad_norm": 1.484375, "learning_rate": 0.000177, "loss": 7.1875, "mean_token_accuracy": 0.08110572174191474, "num_tokens": 658236.0, "step": 355 }, { "entropy": 7.34685640335083, "epoch": 0.030245746691871456, "grad_norm": 1.2421875, "learning_rate": 0.0001795, "loss": 6.9645, "mean_token_accuracy": 0.08569629490375519, "num_tokens": 667175.0, "step": 360 }, { "entropy": 7.556408214569092, "epoch": 0.030665826507036337, "grad_norm": 1.3203125, "learning_rate": 0.000182, "loss": 7.2834, "mean_token_accuracy": 0.08148858584463596, "num_tokens": 676456.0, "step": 365 }, { "entropy": 7.606632947921753, "epoch": 0.031085906322201217, "grad_norm": 1.1953125, "learning_rate": 0.0001845, "loss": 7.2448, "mean_token_accuracy": 0.08052070513367653, "num_tokens": 686881.0, "step": 370 }, { "entropy": 7.371811389923096, "epoch": 0.0315059861373661, "grad_norm": 1.125, "learning_rate": 0.000187, "loss": 7.0307, "mean_token_accuracy": 0.08108055517077446, "num_tokens": 696045.0, "step": 375 }, { "entropy": 7.382633686065674, "epoch": 0.03192606595253098, "grad_norm": 1.359375, "learning_rate": 0.0001895, "loss": 7.003, "mean_token_accuracy": 0.09089459106326103, "num_tokens": 704729.0, "step": 380 }, { "entropy": 7.353933048248291, "epoch": 0.032346145767695865, "grad_norm": 1.0546875, "learning_rate": 0.000192, "loss": 7.0639, "mean_token_accuracy": 0.08123919740319252, "num_tokens": 714331.0, "step": 385 }, { "entropy": 7.430750465393066, "epoch": 0.03276622558286074, "grad_norm": 1.2734375, "learning_rate": 0.0001945, "loss": 7.0163, "mean_token_accuracy": 0.08898987770080566, "num_tokens": 722788.0, "step": 390 }, { "entropy": 7.388132476806641, "epoch": 0.033186305398025626, "grad_norm": 1.28125, "learning_rate": 0.00019700000000000002, "loss": 7.0996, "mean_token_accuracy": 0.0889863982796669, "num_tokens": 731417.0, "step": 395 }, { "entropy": 7.394377708435059, "epoch": 0.03360638521319051, "grad_norm": 1.15625, "learning_rate": 0.00019950000000000002, "loss": 7.0686, "mean_token_accuracy": 0.0865507885813713, "num_tokens": 741034.0, "step": 400 }, { "entropy": 7.370957660675049, "epoch": 0.034026465028355386, "grad_norm": 1.1484375, "learning_rate": 0.000202, "loss": 7.063, "mean_token_accuracy": 0.08408316597342491, "num_tokens": 749596.0, "step": 405 }, { "entropy": 7.360737991333008, "epoch": 0.03444654484352027, "grad_norm": 1.0859375, "learning_rate": 0.00020449999999999998, "loss": 7.0166, "mean_token_accuracy": 0.08443826884031295, "num_tokens": 758931.0, "step": 410 }, { "entropy": 7.253893661499023, "epoch": 0.03486662465868515, "grad_norm": 1.1484375, "learning_rate": 0.000207, "loss": 6.9221, "mean_token_accuracy": 0.08874604031443596, "num_tokens": 767534.0, "step": 415 }, { "entropy": 7.336139726638794, "epoch": 0.03528670447385003, "grad_norm": 1.28125, "learning_rate": 0.0002095, "loss": 6.9742, "mean_token_accuracy": 0.08901742175221443, "num_tokens": 776456.0, "step": 420 }, { "entropy": 7.32063570022583, "epoch": 0.035706784289014915, "grad_norm": 1.21875, "learning_rate": 0.000212, "loss": 7.0512, "mean_token_accuracy": 0.0825334556400776, "num_tokens": 786172.0, "step": 425 }, { "entropy": 7.2836973667144775, "epoch": 0.03612686410417979, "grad_norm": 1.328125, "learning_rate": 0.0002145, "loss": 6.9281, "mean_token_accuracy": 0.09393875077366828, "num_tokens": 795081.0, "step": 430 }, { "entropy": 7.279390621185303, "epoch": 0.036546943919344675, "grad_norm": 1.3828125, "learning_rate": 0.00021700000000000002, "loss": 6.9729, "mean_token_accuracy": 0.08336275964975357, "num_tokens": 804259.0, "step": 435 }, { "entropy": 7.3233130931854244, "epoch": 0.03696702373450956, "grad_norm": 1.2265625, "learning_rate": 0.0002195, "loss": 6.9836, "mean_token_accuracy": 0.08346287980675697, "num_tokens": 813463.0, "step": 440 }, { "entropy": 7.265643119812012, "epoch": 0.037387103549674436, "grad_norm": 1.3125, "learning_rate": 0.000222, "loss": 6.915, "mean_token_accuracy": 0.09436434507369995, "num_tokens": 823029.0, "step": 445 }, { "entropy": 7.2830162525177, "epoch": 0.03780718336483932, "grad_norm": 1.2265625, "learning_rate": 0.0002245, "loss": 6.9822, "mean_token_accuracy": 0.08020757511258125, "num_tokens": 832902.0, "step": 450 }, { "entropy": 7.172808027267456, "epoch": 0.0382272631800042, "grad_norm": 1.1015625, "learning_rate": 0.00022700000000000002, "loss": 6.9269, "mean_token_accuracy": 0.08937018439173698, "num_tokens": 842162.0, "step": 455 }, { "entropy": 7.261403322219849, "epoch": 0.03864734299516908, "grad_norm": 1.3046875, "learning_rate": 0.00022950000000000002, "loss": 6.9709, "mean_token_accuracy": 0.09120814129710197, "num_tokens": 852328.0, "step": 460 }, { "entropy": 7.207744789123535, "epoch": 0.039067422810333964, "grad_norm": 1.2109375, "learning_rate": 0.00023200000000000003, "loss": 6.9283, "mean_token_accuracy": 0.08966456726193428, "num_tokens": 860929.0, "step": 465 }, { "entropy": 7.253277540206909, "epoch": 0.03948750262549885, "grad_norm": 1.2734375, "learning_rate": 0.00023449999999999998, "loss": 7.0043, "mean_token_accuracy": 0.0854820430278778, "num_tokens": 869144.0, "step": 470 }, { "entropy": 7.303921031951904, "epoch": 0.039907582440663725, "grad_norm": 1.3671875, "learning_rate": 0.000237, "loss": 6.9451, "mean_token_accuracy": 0.09673570543527603, "num_tokens": 877447.0, "step": 475 }, { "entropy": 7.20126519203186, "epoch": 0.04032766225582861, "grad_norm": 1.1953125, "learning_rate": 0.0002395, "loss": 6.9017, "mean_token_accuracy": 0.08463463708758354, "num_tokens": 887020.0, "step": 480 }, { "entropy": 7.1618622779846195, "epoch": 0.040747742070993485, "grad_norm": 1.3515625, "learning_rate": 0.000242, "loss": 6.9503, "mean_token_accuracy": 0.08903224021196365, "num_tokens": 895937.0, "step": 485 }, { "entropy": 7.172050189971924, "epoch": 0.04116782188615837, "grad_norm": 1.1484375, "learning_rate": 0.0002445, "loss": 6.9573, "mean_token_accuracy": 0.08436014279723167, "num_tokens": 905446.0, "step": 490 }, { "entropy": 7.1261190414428714, "epoch": 0.04158790170132325, "grad_norm": 1.3046875, "learning_rate": 0.000247, "loss": 6.8507, "mean_token_accuracy": 0.09782563373446465, "num_tokens": 914547.0, "step": 495 }, { "entropy": 7.219514274597168, "epoch": 0.04200798151648813, "grad_norm": 1.3515625, "learning_rate": 0.0002495, "loss": 6.8597, "mean_token_accuracy": 0.09429225027561187, "num_tokens": 922900.0, "step": 500 }, { "entropy": 7.174054384231567, "epoch": 0.042428061331653014, "grad_norm": 1.296875, "learning_rate": 0.000252, "loss": 6.9026, "mean_token_accuracy": 0.09461246877908706, "num_tokens": 930876.0, "step": 505 }, { "entropy": 7.149679851531983, "epoch": 0.0428481411468179, "grad_norm": 1.234375, "learning_rate": 0.0002545, "loss": 6.9327, "mean_token_accuracy": 0.09384474828839302, "num_tokens": 939871.0, "step": 510 }, { "entropy": 7.1536510467529295, "epoch": 0.043268220961982774, "grad_norm": 1.3203125, "learning_rate": 0.000257, "loss": 6.9204, "mean_token_accuracy": 0.08957441225647926, "num_tokens": 948673.0, "step": 515 }, { "entropy": 7.07887830734253, "epoch": 0.04368830077714766, "grad_norm": 1.1875, "learning_rate": 0.0002595, "loss": 6.8686, "mean_token_accuracy": 0.08727961704134941, "num_tokens": 957603.0, "step": 520 }, { "entropy": 7.11884388923645, "epoch": 0.04410838059231254, "grad_norm": 1.1484375, "learning_rate": 0.000262, "loss": 6.9378, "mean_token_accuracy": 0.08589621968567371, "num_tokens": 967731.0, "step": 525 }, { "entropy": 7.1688611030578615, "epoch": 0.04452846040747742, "grad_norm": 1.3828125, "learning_rate": 0.00026450000000000003, "loss": 6.9387, "mean_token_accuracy": 0.09485394582152366, "num_tokens": 977427.0, "step": 530 }, { "entropy": 7.146421909332275, "epoch": 0.0449485402226423, "grad_norm": 1.4140625, "learning_rate": 0.00026700000000000004, "loss": 6.9243, "mean_token_accuracy": 0.08625848963856697, "num_tokens": 986758.0, "step": 535 }, { "entropy": 7.25874433517456, "epoch": 0.045368620037807186, "grad_norm": 1.2890625, "learning_rate": 0.00026950000000000005, "loss": 6.92, "mean_token_accuracy": 0.09832347258925438, "num_tokens": 996377.0, "step": 540 }, { "entropy": 7.057836389541626, "epoch": 0.04578869985297206, "grad_norm": 1.2109375, "learning_rate": 0.00027200000000000005, "loss": 6.9742, "mean_token_accuracy": 0.08528567403554917, "num_tokens": 1006483.0, "step": 545 }, { "entropy": 6.995539855957031, "epoch": 0.04620877966813695, "grad_norm": 1.2109375, "learning_rate": 0.0002745, "loss": 6.8574, "mean_token_accuracy": 0.08858747258782387, "num_tokens": 1016132.0, "step": 550 }, { "entropy": 7.106180238723755, "epoch": 0.04662885948330183, "grad_norm": 1.109375, "learning_rate": 0.000277, "loss": 6.7984, "mean_token_accuracy": 0.09407598823308945, "num_tokens": 1024970.0, "step": 555 }, { "entropy": 7.142482328414917, "epoch": 0.04704893929846671, "grad_norm": 1.0625, "learning_rate": 0.0002795, "loss": 6.8936, "mean_token_accuracy": 0.08978619575500488, "num_tokens": 1034335.0, "step": 560 }, { "entropy": 7.139913558959961, "epoch": 0.04746901911363159, "grad_norm": 1.15625, "learning_rate": 0.00028199999999999997, "loss": 6.9495, "mean_token_accuracy": 0.0973325490951538, "num_tokens": 1043954.0, "step": 565 }, { "entropy": 7.08342981338501, "epoch": 0.04788909892879647, "grad_norm": 1.0234375, "learning_rate": 0.0002845, "loss": 6.8806, "mean_token_accuracy": 0.09276892617344856, "num_tokens": 1053554.0, "step": 570 }, { "entropy": 7.0591119766235355, "epoch": 0.04830917874396135, "grad_norm": 1.203125, "learning_rate": 0.000287, "loss": 6.8354, "mean_token_accuracy": 0.09314879402518272, "num_tokens": 1062008.0, "step": 575 }, { "entropy": 7.029165410995484, "epoch": 0.048729258559126236, "grad_norm": 1.3046875, "learning_rate": 0.0002895, "loss": 6.9074, "mean_token_accuracy": 0.09056607261300087, "num_tokens": 1070740.0, "step": 580 }, { "entropy": 7.027670526504517, "epoch": 0.04914933837429111, "grad_norm": 1.2890625, "learning_rate": 0.000292, "loss": 6.8895, "mean_token_accuracy": 0.09351922869682312, "num_tokens": 1079681.0, "step": 585 }, { "entropy": 7.076567363739014, "epoch": 0.049569418189456, "grad_norm": 1.1953125, "learning_rate": 0.0002945, "loss": 6.7669, "mean_token_accuracy": 0.0963557355105877, "num_tokens": 1088979.0, "step": 590 }, { "entropy": 6.955168056488037, "epoch": 0.04998949800462088, "grad_norm": 1.2578125, "learning_rate": 0.000297, "loss": 6.7794, "mean_token_accuracy": 0.09716788977384568, "num_tokens": 1097870.0, "step": 595 }, { "entropy": 7.0498795986175535, "epoch": 0.05040957781978576, "grad_norm": 1.3046875, "learning_rate": 0.0002995, "loss": 6.8985, "mean_token_accuracy": 0.08934849128127098, "num_tokens": 1107948.0, "step": 600 }, { "entropy": 7.038954401016236, "epoch": 0.05082965763495064, "grad_norm": 1.1875, "learning_rate": 0.000302, "loss": 6.8034, "mean_token_accuracy": 0.09711324200034141, "num_tokens": 1117032.0, "step": 605 }, { "entropy": 7.016556072235107, "epoch": 0.051249737450115525, "grad_norm": 1.140625, "learning_rate": 0.0003045, "loss": 6.7736, "mean_token_accuracy": 0.10140406414866447, "num_tokens": 1127834.0, "step": 610 }, { "entropy": 7.053543567657471, "epoch": 0.0516698172652804, "grad_norm": 1.328125, "learning_rate": 0.000307, "loss": 6.8664, "mean_token_accuracy": 0.10583841800689697, "num_tokens": 1137382.0, "step": 615 }, { "entropy": 6.960672283172608, "epoch": 0.052089897080445285, "grad_norm": 1.1875, "learning_rate": 0.0003095, "loss": 6.7295, "mean_token_accuracy": 0.09906250685453415, "num_tokens": 1146095.0, "step": 620 }, { "entropy": 6.916978216171264, "epoch": 0.05250997689561017, "grad_norm": 1.1796875, "learning_rate": 0.000312, "loss": 6.7648, "mean_token_accuracy": 0.1004838652908802, "num_tokens": 1154981.0, "step": 625 }, { "entropy": 6.948708629608154, "epoch": 0.052930056710775046, "grad_norm": 1.5390625, "learning_rate": 0.0003145, "loss": 6.7765, "mean_token_accuracy": 0.10312124192714692, "num_tokens": 1164939.0, "step": 630 }, { "entropy": 7.024917793273926, "epoch": 0.05335013652593993, "grad_norm": 1.2265625, "learning_rate": 0.000317, "loss": 6.8939, "mean_token_accuracy": 0.09090543612837791, "num_tokens": 1174991.0, "step": 635 }, { "entropy": 7.0208131790161135, "epoch": 0.05377021634110481, "grad_norm": 1.09375, "learning_rate": 0.0003195, "loss": 6.9459, "mean_token_accuracy": 0.08811391443014145, "num_tokens": 1184885.0, "step": 640 }, { "entropy": 6.984617424011231, "epoch": 0.05419029615626969, "grad_norm": 1.265625, "learning_rate": 0.000322, "loss": 6.8348, "mean_token_accuracy": 0.09274234399199485, "num_tokens": 1193637.0, "step": 645 }, { "entropy": 6.901879405975341, "epoch": 0.054610375971434574, "grad_norm": 1.203125, "learning_rate": 0.00032450000000000003, "loss": 6.6237, "mean_token_accuracy": 0.10028594210743905, "num_tokens": 1202188.0, "step": 650 }, { "entropy": 6.964693355560303, "epoch": 0.05503045578659945, "grad_norm": 1.25, "learning_rate": 0.00032700000000000003, "loss": 6.7513, "mean_token_accuracy": 0.09297072812914849, "num_tokens": 1210768.0, "step": 655 }, { "entropy": 6.921257066726684, "epoch": 0.055450535601764335, "grad_norm": 1.296875, "learning_rate": 0.00032950000000000004, "loss": 6.7581, "mean_token_accuracy": 0.09513410851359368, "num_tokens": 1219819.0, "step": 660 }, { "entropy": 6.969961500167846, "epoch": 0.05587061541692922, "grad_norm": 0.98828125, "learning_rate": 0.00033200000000000005, "loss": 6.8151, "mean_token_accuracy": 0.08720013573765754, "num_tokens": 1229703.0, "step": 665 }, { "entropy": 7.008356428146362, "epoch": 0.056290695232094096, "grad_norm": 1.2421875, "learning_rate": 0.00033450000000000005, "loss": 6.8385, "mean_token_accuracy": 0.09394309446215629, "num_tokens": 1238942.0, "step": 670 }, { "entropy": 7.041683959960937, "epoch": 0.05671077504725898, "grad_norm": 1.0625, "learning_rate": 0.000337, "loss": 6.8901, "mean_token_accuracy": 0.0907767005264759, "num_tokens": 1248943.0, "step": 675 }, { "entropy": 6.869440269470215, "epoch": 0.05713085486242386, "grad_norm": 1.1640625, "learning_rate": 0.0003395, "loss": 6.7728, "mean_token_accuracy": 0.09719423428177834, "num_tokens": 1257761.0, "step": 680 }, { "entropy": 6.80675859451294, "epoch": 0.05755093467758874, "grad_norm": 1.21875, "learning_rate": 0.000342, "loss": 6.722, "mean_token_accuracy": 0.09433782026171685, "num_tokens": 1267216.0, "step": 685 }, { "entropy": 6.962690448760986, "epoch": 0.057971014492753624, "grad_norm": 1.1640625, "learning_rate": 0.00034449999999999997, "loss": 6.8182, "mean_token_accuracy": 0.09524153247475624, "num_tokens": 1277210.0, "step": 690 }, { "entropy": 6.910012054443359, "epoch": 0.05839109430791851, "grad_norm": 1.15625, "learning_rate": 0.000347, "loss": 6.7268, "mean_token_accuracy": 0.09480128362774849, "num_tokens": 1285310.0, "step": 695 }, { "entropy": 6.9359142780303955, "epoch": 0.058811174123083385, "grad_norm": 1.21875, "learning_rate": 0.0003495, "loss": 6.7418, "mean_token_accuracy": 0.09830545634031296, "num_tokens": 1294421.0, "step": 700 }, { "entropy": 6.773298215866089, "epoch": 0.05923125393824827, "grad_norm": 1.1953125, "learning_rate": 0.000352, "loss": 6.5648, "mean_token_accuracy": 0.10509093776345253, "num_tokens": 1303281.0, "step": 705 }, { "entropy": 6.848818397521972, "epoch": 0.059651333753413145, "grad_norm": 1.234375, "learning_rate": 0.0003545, "loss": 6.7413, "mean_token_accuracy": 0.10247144997119903, "num_tokens": 1312280.0, "step": 710 }, { "entropy": 6.792526483535767, "epoch": 0.06007141356857803, "grad_norm": 1.09375, "learning_rate": 0.000357, "loss": 6.703, "mean_token_accuracy": 0.09476525709033012, "num_tokens": 1321243.0, "step": 715 }, { "entropy": 6.8667539119720455, "epoch": 0.06049149338374291, "grad_norm": 1.15625, "learning_rate": 0.0003595, "loss": 6.8092, "mean_token_accuracy": 0.10024766996502876, "num_tokens": 1330324.0, "step": 720 }, { "entropy": 6.874475002288818, "epoch": 0.06091157319890779, "grad_norm": 1.2265625, "learning_rate": 0.000362, "loss": 6.6476, "mean_token_accuracy": 0.10230677276849746, "num_tokens": 1339485.0, "step": 725 }, { "entropy": 6.930787801742554, "epoch": 0.06133165301407267, "grad_norm": 1.2109375, "learning_rate": 0.0003645, "loss": 6.8065, "mean_token_accuracy": 0.09302590638399125, "num_tokens": 1348640.0, "step": 730 }, { "entropy": 6.799437236785889, "epoch": 0.06175173282923756, "grad_norm": 1.21875, "learning_rate": 0.000367, "loss": 6.6978, "mean_token_accuracy": 0.09949951842427254, "num_tokens": 1357581.0, "step": 735 }, { "entropy": 6.888378238677978, "epoch": 0.062171812644402434, "grad_norm": 1.1953125, "learning_rate": 0.0003695, "loss": 6.7652, "mean_token_accuracy": 0.09876005351543427, "num_tokens": 1367883.0, "step": 740 }, { "entropy": 6.812366771697998, "epoch": 0.06259189245956731, "grad_norm": 1.15625, "learning_rate": 0.000372, "loss": 6.7175, "mean_token_accuracy": 0.09678780436515808, "num_tokens": 1376936.0, "step": 745 }, { "entropy": 6.708990812301636, "epoch": 0.0630119722747322, "grad_norm": 1.1796875, "learning_rate": 0.0003745, "loss": 6.6402, "mean_token_accuracy": 0.09989499375224113, "num_tokens": 1386359.0, "step": 750 }, { "entropy": 6.86722469329834, "epoch": 0.06343205208989708, "grad_norm": 1.125, "learning_rate": 0.000377, "loss": 6.6965, "mean_token_accuracy": 0.10066593587398528, "num_tokens": 1395223.0, "step": 755 }, { "entropy": 6.944450616836548, "epoch": 0.06385213190506196, "grad_norm": 1.0625, "learning_rate": 0.0003795, "loss": 6.847, "mean_token_accuracy": 0.09334802627563477, "num_tokens": 1404917.0, "step": 760 }, { "entropy": 6.823553276062012, "epoch": 0.06427221172022685, "grad_norm": 1.2578125, "learning_rate": 0.000382, "loss": 6.7474, "mean_token_accuracy": 0.10658529698848725, "num_tokens": 1413348.0, "step": 765 }, { "entropy": 6.7500804424285885, "epoch": 0.06469229153539173, "grad_norm": 1.203125, "learning_rate": 0.0003845, "loss": 6.7193, "mean_token_accuracy": 0.09804128184914589, "num_tokens": 1421726.0, "step": 770 }, { "entropy": 6.822430419921875, "epoch": 0.0651123713505566, "grad_norm": 1.109375, "learning_rate": 0.00038700000000000003, "loss": 6.7314, "mean_token_accuracy": 0.09830505326390267, "num_tokens": 1430686.0, "step": 775 }, { "entropy": 6.889693403244019, "epoch": 0.06553245116572148, "grad_norm": 1.109375, "learning_rate": 0.00038950000000000003, "loss": 6.7193, "mean_token_accuracy": 0.1001870684325695, "num_tokens": 1439499.0, "step": 780 }, { "entropy": 6.836849641799927, "epoch": 0.06595253098088637, "grad_norm": 1.328125, "learning_rate": 0.00039200000000000004, "loss": 6.7144, "mean_token_accuracy": 0.10016432479023933, "num_tokens": 1448220.0, "step": 785 }, { "entropy": 6.703166866302491, "epoch": 0.06637261079605125, "grad_norm": 1.0, "learning_rate": 0.00039450000000000005, "loss": 6.7252, "mean_token_accuracy": 0.09049011170864105, "num_tokens": 1458217.0, "step": 790 }, { "entropy": 6.805354738235474, "epoch": 0.06679269061121614, "grad_norm": 1.1171875, "learning_rate": 0.00039700000000000005, "loss": 6.6229, "mean_token_accuracy": 0.0928824745118618, "num_tokens": 1467422.0, "step": 795 }, { "entropy": 6.788901376724243, "epoch": 0.06721277042638102, "grad_norm": 1.1484375, "learning_rate": 0.0003995, "loss": 6.6204, "mean_token_accuracy": 0.10320913046598434, "num_tokens": 1476152.0, "step": 800 }, { "entropy": 6.731419372558594, "epoch": 0.06763285024154589, "grad_norm": 1.15625, "learning_rate": 0.000402, "loss": 6.7128, "mean_token_accuracy": 0.09539571255445481, "num_tokens": 1485248.0, "step": 805 }, { "entropy": 6.7255181789398195, "epoch": 0.06805293005671077, "grad_norm": 1.15625, "learning_rate": 0.0004045, "loss": 6.6711, "mean_token_accuracy": 0.09965705946087837, "num_tokens": 1494248.0, "step": 810 }, { "entropy": 6.825131368637085, "epoch": 0.06847300987187566, "grad_norm": 1.265625, "learning_rate": 0.00040699999999999997, "loss": 6.785, "mean_token_accuracy": 0.09547284319996834, "num_tokens": 1503565.0, "step": 815 }, { "entropy": 6.932170867919922, "epoch": 0.06889308968704054, "grad_norm": 1.109375, "learning_rate": 0.0004095, "loss": 6.8605, "mean_token_accuracy": 0.09502148702740669, "num_tokens": 1513227.0, "step": 820 }, { "entropy": 6.8283134460449215, "epoch": 0.06931316950220542, "grad_norm": 1.2109375, "learning_rate": 0.000412, "loss": 6.6616, "mean_token_accuracy": 0.1039304107427597, "num_tokens": 1522312.0, "step": 825 }, { "entropy": 6.6956737518310545, "epoch": 0.0697332493173703, "grad_norm": 1.125, "learning_rate": 0.0004145, "loss": 6.5989, "mean_token_accuracy": 0.10552669763565063, "num_tokens": 1531720.0, "step": 830 }, { "entropy": 6.70291919708252, "epoch": 0.07015332913253518, "grad_norm": 1.140625, "learning_rate": 0.000417, "loss": 6.7026, "mean_token_accuracy": 0.09495449438691139, "num_tokens": 1541238.0, "step": 835 }, { "entropy": 6.867031812667847, "epoch": 0.07057340894770006, "grad_norm": 1.1953125, "learning_rate": 0.0004195, "loss": 6.7955, "mean_token_accuracy": 0.09560235142707825, "num_tokens": 1550875.0, "step": 840 }, { "entropy": 6.679243516921997, "epoch": 0.07099348876286495, "grad_norm": 1.046875, "learning_rate": 0.000422, "loss": 6.7373, "mean_token_accuracy": 0.10205229669809342, "num_tokens": 1560287.0, "step": 845 }, { "entropy": 6.812178373336792, "epoch": 0.07141356857802983, "grad_norm": 1.09375, "learning_rate": 0.0004245, "loss": 6.6139, "mean_token_accuracy": 0.10624400898814201, "num_tokens": 1569043.0, "step": 850 }, { "entropy": 6.66694450378418, "epoch": 0.07183364839319471, "grad_norm": 1.0625, "learning_rate": 0.000427, "loss": 6.6372, "mean_token_accuracy": 0.10226837545633316, "num_tokens": 1578112.0, "step": 855 }, { "entropy": 6.592900228500366, "epoch": 0.07225372820835958, "grad_norm": 1.109375, "learning_rate": 0.0004295, "loss": 6.5542, "mean_token_accuracy": 0.10482543483376502, "num_tokens": 1586587.0, "step": 860 }, { "entropy": 6.831333017349243, "epoch": 0.07267380802352447, "grad_norm": 1.125, "learning_rate": 0.000432, "loss": 6.7191, "mean_token_accuracy": 0.0988001950085163, "num_tokens": 1595585.0, "step": 865 }, { "entropy": 6.7406104564666744, "epoch": 0.07309388783868935, "grad_norm": 1.109375, "learning_rate": 0.0004345, "loss": 6.6715, "mean_token_accuracy": 0.1029144361615181, "num_tokens": 1605355.0, "step": 870 }, { "entropy": 6.673774909973145, "epoch": 0.07351396765385423, "grad_norm": 1.1640625, "learning_rate": 0.000437, "loss": 6.7087, "mean_token_accuracy": 0.0972638413310051, "num_tokens": 1613637.0, "step": 875 }, { "entropy": 6.780192899703979, "epoch": 0.07393404746901912, "grad_norm": 1.140625, "learning_rate": 0.0004395, "loss": 6.6547, "mean_token_accuracy": 0.10374342575669289, "num_tokens": 1622731.0, "step": 880 }, { "entropy": 6.733386611938476, "epoch": 0.074354127284184, "grad_norm": 1.09375, "learning_rate": 0.000442, "loss": 6.6411, "mean_token_accuracy": 0.09785914570093154, "num_tokens": 1632098.0, "step": 885 }, { "entropy": 6.656809377670288, "epoch": 0.07477420709934887, "grad_norm": 1.015625, "learning_rate": 0.0004445, "loss": 6.6333, "mean_token_accuracy": 0.09908856153488159, "num_tokens": 1641259.0, "step": 890 }, { "entropy": 6.787235689163208, "epoch": 0.07519428691451376, "grad_norm": 1.2109375, "learning_rate": 0.000447, "loss": 6.7023, "mean_token_accuracy": 0.09753435328602791, "num_tokens": 1651362.0, "step": 895 }, { "entropy": 6.644986867904663, "epoch": 0.07561436672967864, "grad_norm": 1.1953125, "learning_rate": 0.00044950000000000003, "loss": 6.6169, "mean_token_accuracy": 0.09910911172628403, "num_tokens": 1660190.0, "step": 900 }, { "entropy": 6.722699403762817, "epoch": 0.07603444654484352, "grad_norm": 1.234375, "learning_rate": 0.00045200000000000004, "loss": 6.659, "mean_token_accuracy": 0.09519267976284027, "num_tokens": 1669020.0, "step": 905 }, { "entropy": 6.747388315200806, "epoch": 0.0764545263600084, "grad_norm": 1.1171875, "learning_rate": 0.00045450000000000004, "loss": 6.6775, "mean_token_accuracy": 0.10076266825199127, "num_tokens": 1678158.0, "step": 910 }, { "entropy": 6.702866649627685, "epoch": 0.07687460617517328, "grad_norm": 1.15625, "learning_rate": 0.00045700000000000005, "loss": 6.6868, "mean_token_accuracy": 0.09906790256500245, "num_tokens": 1687481.0, "step": 915 }, { "entropy": 6.647071504592896, "epoch": 0.07729468599033816, "grad_norm": 1.125, "learning_rate": 0.00045950000000000006, "loss": 6.6511, "mean_token_accuracy": 0.10402323752641678, "num_tokens": 1696782.0, "step": 920 }, { "entropy": 6.6832818508148195, "epoch": 0.07771476580550304, "grad_norm": 1.0859375, "learning_rate": 0.000462, "loss": 6.6575, "mean_token_accuracy": 0.10666462555527687, "num_tokens": 1706153.0, "step": 925 }, { "entropy": 6.698217678070068, "epoch": 0.07813484562066793, "grad_norm": 1.0703125, "learning_rate": 0.0004645, "loss": 6.6895, "mean_token_accuracy": 0.10017500966787338, "num_tokens": 1715585.0, "step": 930 }, { "entropy": 6.823991441726685, "epoch": 0.07855492543583281, "grad_norm": 1.4921875, "learning_rate": 0.000467, "loss": 6.8005, "mean_token_accuracy": 0.09734346494078636, "num_tokens": 1724857.0, "step": 935 }, { "entropy": 6.700028705596924, "epoch": 0.0789750052509977, "grad_norm": 1.1875, "learning_rate": 0.0004695, "loss": 6.6103, "mean_token_accuracy": 0.10624456107616424, "num_tokens": 1733528.0, "step": 940 }, { "entropy": 6.742655563354492, "epoch": 0.07939508506616257, "grad_norm": 0.99609375, "learning_rate": 0.000472, "loss": 6.7304, "mean_token_accuracy": 0.10352228581905365, "num_tokens": 1742953.0, "step": 945 }, { "entropy": 6.669600582122802, "epoch": 0.07981516488132745, "grad_norm": 1.2265625, "learning_rate": 0.0004745, "loss": 6.6746, "mean_token_accuracy": 0.10271603912115097, "num_tokens": 1752155.0, "step": 950 }, { "entropy": 6.660818243026734, "epoch": 0.08023524469649233, "grad_norm": 1.234375, "learning_rate": 0.000477, "loss": 6.5695, "mean_token_accuracy": 0.10144439786672592, "num_tokens": 1760562.0, "step": 955 }, { "entropy": 6.623502588272094, "epoch": 0.08065532451165722, "grad_norm": 1.1875, "learning_rate": 0.0004795, "loss": 6.5902, "mean_token_accuracy": 0.1015326887369156, "num_tokens": 1769631.0, "step": 960 }, { "entropy": 6.647875261306763, "epoch": 0.0810754043268221, "grad_norm": 1.265625, "learning_rate": 0.000482, "loss": 6.624, "mean_token_accuracy": 0.10202456414699554, "num_tokens": 1779080.0, "step": 965 }, { "entropy": 6.654635858535767, "epoch": 0.08149548414198697, "grad_norm": 1.375, "learning_rate": 0.0004845, "loss": 6.6146, "mean_token_accuracy": 0.10121759623289109, "num_tokens": 1787830.0, "step": 970 }, { "entropy": 6.546731615066529, "epoch": 0.08191556395715185, "grad_norm": 1.0859375, "learning_rate": 0.000487, "loss": 6.5331, "mean_token_accuracy": 0.10186785906553268, "num_tokens": 1796998.0, "step": 975 }, { "entropy": 6.6796527862548825, "epoch": 0.08233564377231674, "grad_norm": 1.171875, "learning_rate": 0.0004895, "loss": 6.619, "mean_token_accuracy": 0.10591355115175247, "num_tokens": 1806194.0, "step": 980 }, { "entropy": 6.40926570892334, "epoch": 0.08275572358748162, "grad_norm": 1.046875, "learning_rate": 0.000492, "loss": 6.514, "mean_token_accuracy": 0.10517977550625801, "num_tokens": 1815751.0, "step": 985 }, { "entropy": 6.57440676689148, "epoch": 0.0831758034026465, "grad_norm": 1.0, "learning_rate": 0.0004945, "loss": 6.5942, "mean_token_accuracy": 0.10343918055295945, "num_tokens": 1825379.0, "step": 990 }, { "entropy": 6.637695789337158, "epoch": 0.08359588321781139, "grad_norm": 1.1015625, "learning_rate": 0.000497, "loss": 6.5522, "mean_token_accuracy": 0.10346684157848358, "num_tokens": 1834158.0, "step": 995 }, { "entropy": 6.537919807434082, "epoch": 0.08401596303297626, "grad_norm": 1.1171875, "learning_rate": 0.0004995, "loss": 6.5098, "mean_token_accuracy": 0.10425886288285255, "num_tokens": 1842724.0, "step": 1000 }, { "entropy": 6.62498288154602, "epoch": 0.08443604284814114, "grad_norm": 1.015625, "learning_rate": 0.000499999998724557, "loss": 6.5288, "mean_token_accuracy": 0.10198150128126145, "num_tokens": 1852485.0, "step": 1005 }, { "entropy": 6.57701358795166, "epoch": 0.08485612266330603, "grad_norm": 1.1484375, "learning_rate": 0.0004999999935430703, "loss": 6.5545, "mean_token_accuracy": 0.11041983366012573, "num_tokens": 1861303.0, "step": 1010 }, { "entropy": 6.423639154434204, "epoch": 0.08527620247847091, "grad_norm": 1.203125, "learning_rate": 0.0004999999843758243, "loss": 6.5428, "mean_token_accuracy": 0.11022127270698548, "num_tokens": 1870859.0, "step": 1015 }, { "entropy": 6.760848808288574, "epoch": 0.0856962822936358, "grad_norm": 1.03125, "learning_rate": 0.0004999999712228196, "loss": 6.7105, "mean_token_accuracy": 0.09618140533566474, "num_tokens": 1880295.0, "step": 1020 }, { "entropy": 6.645368003845215, "epoch": 0.08611636210880068, "grad_norm": 1.03125, "learning_rate": 0.0004999999540840562, "loss": 6.6079, "mean_token_accuracy": 0.1056639552116394, "num_tokens": 1889193.0, "step": 1025 }, { "entropy": 6.568785905838013, "epoch": 0.08653644192396555, "grad_norm": 1.0703125, "learning_rate": 0.0004999999329595345, "loss": 6.7096, "mean_token_accuracy": 0.09398577436804771, "num_tokens": 1899437.0, "step": 1030 }, { "entropy": 6.708119821548462, "epoch": 0.08695652173913043, "grad_norm": 1.046875, "learning_rate": 0.0004999999078492548, "loss": 6.5939, "mean_token_accuracy": 0.1046712227165699, "num_tokens": 1907882.0, "step": 1035 }, { "entropy": 6.493611288070679, "epoch": 0.08737660155429532, "grad_norm": 0.99609375, "learning_rate": 0.0004999998787532176, "loss": 6.5021, "mean_token_accuracy": 0.10290396809577942, "num_tokens": 1916872.0, "step": 1040 }, { "entropy": 6.608988046646118, "epoch": 0.0877966813694602, "grad_norm": 1.0625, "learning_rate": 0.0004999998456714234, "loss": 6.675, "mean_token_accuracy": 0.10352342054247857, "num_tokens": 1926636.0, "step": 1045 }, { "entropy": 6.586896228790283, "epoch": 0.08821676118462508, "grad_norm": 1.203125, "learning_rate": 0.0004999998086038729, "loss": 6.5742, "mean_token_accuracy": 0.10714709535241126, "num_tokens": 1935962.0, "step": 1050 }, { "entropy": 6.579021549224853, "epoch": 0.08863684099978995, "grad_norm": 1.0625, "learning_rate": 0.0004999997675505665, "loss": 6.5514, "mean_token_accuracy": 0.10487730801105499, "num_tokens": 1944600.0, "step": 1055 }, { "entropy": 6.625632095336914, "epoch": 0.08905692081495484, "grad_norm": 1.109375, "learning_rate": 0.0004999997225115052, "loss": 6.7269, "mean_token_accuracy": 0.10071012005209923, "num_tokens": 1954234.0, "step": 1060 }, { "entropy": 6.7796577453613285, "epoch": 0.08947700063011972, "grad_norm": 1.1171875, "learning_rate": 0.0004999996734866896, "loss": 6.683, "mean_token_accuracy": 0.09888390973210334, "num_tokens": 1964499.0, "step": 1065 }, { "entropy": 6.377533006668091, "epoch": 0.0898970804452846, "grad_norm": 1.1640625, "learning_rate": 0.0004999996204761206, "loss": 6.3832, "mean_token_accuracy": 0.11216704472899437, "num_tokens": 1973635.0, "step": 1070 }, { "entropy": 6.54502387046814, "epoch": 0.09031716026044949, "grad_norm": 0.96875, "learning_rate": 0.0004999995634797993, "loss": 6.5308, "mean_token_accuracy": 0.11021102443337441, "num_tokens": 1983509.0, "step": 1075 }, { "entropy": 6.567485332489014, "epoch": 0.09073724007561437, "grad_norm": 1.1484375, "learning_rate": 0.0004999995024977265, "loss": 6.5197, "mean_token_accuracy": 0.11247633025050163, "num_tokens": 1992336.0, "step": 1080 }, { "entropy": 6.545616102218628, "epoch": 0.09115731989077924, "grad_norm": 1.0234375, "learning_rate": 0.0004999994375299034, "loss": 6.5532, "mean_token_accuracy": 0.10819393768906593, "num_tokens": 2001931.0, "step": 1085 }, { "entropy": 6.484406518936157, "epoch": 0.09157739970594413, "grad_norm": 1.0234375, "learning_rate": 0.000499999368576331, "loss": 6.4218, "mean_token_accuracy": 0.11132358983159066, "num_tokens": 2010935.0, "step": 1090 }, { "entropy": 6.49219536781311, "epoch": 0.09199747952110901, "grad_norm": 1.0546875, "learning_rate": 0.0004999992956370109, "loss": 6.4842, "mean_token_accuracy": 0.10731736794114113, "num_tokens": 2020587.0, "step": 1095 }, { "entropy": 6.410812473297119, "epoch": 0.0924175593362739, "grad_norm": 1.0703125, "learning_rate": 0.000499999218711944, "loss": 6.5089, "mean_token_accuracy": 0.11067400127649307, "num_tokens": 2029743.0, "step": 1100 }, { "entropy": 6.581059837341309, "epoch": 0.09283763915143878, "grad_norm": 1.1328125, "learning_rate": 0.0004999991378011317, "loss": 6.5257, "mean_token_accuracy": 0.10916591510176658, "num_tokens": 2038468.0, "step": 1105 }, { "entropy": 6.456353855133057, "epoch": 0.09325771896660366, "grad_norm": 1.0703125, "learning_rate": 0.0004999990529045757, "loss": 6.4482, "mean_token_accuracy": 0.10893432199954986, "num_tokens": 2047456.0, "step": 1110 }, { "entropy": 6.627411127090454, "epoch": 0.09367779878176853, "grad_norm": 0.98046875, "learning_rate": 0.0004999989640222771, "loss": 6.7525, "mean_token_accuracy": 0.09431043416261672, "num_tokens": 2056691.0, "step": 1115 }, { "entropy": 6.684362411499023, "epoch": 0.09409787859693342, "grad_norm": 1.015625, "learning_rate": 0.000499998871154238, "loss": 6.5462, "mean_token_accuracy": 0.10591837242245675, "num_tokens": 2066068.0, "step": 1120 }, { "entropy": 6.578407287597656, "epoch": 0.0945179584120983, "grad_norm": 1.0234375, "learning_rate": 0.0004999987743004597, "loss": 6.4733, "mean_token_accuracy": 0.1102992869913578, "num_tokens": 2075113.0, "step": 1125 }, { "entropy": 6.506056404113769, "epoch": 0.09493803822726318, "grad_norm": 1.0390625, "learning_rate": 0.0004999986734609438, "loss": 6.6105, "mean_token_accuracy": 0.10494827926158905, "num_tokens": 2084557.0, "step": 1130 }, { "entropy": 6.6157310009002686, "epoch": 0.09535811804242807, "grad_norm": 1.078125, "learning_rate": 0.0004999985686356923, "loss": 6.5139, "mean_token_accuracy": 0.1062320664525032, "num_tokens": 2093424.0, "step": 1135 }, { "entropy": 6.539625740051269, "epoch": 0.09577819785759294, "grad_norm": 1.078125, "learning_rate": 0.000499998459824707, "loss": 6.6346, "mean_token_accuracy": 0.10304314494132996, "num_tokens": 2103066.0, "step": 1140 }, { "entropy": 6.53157410621643, "epoch": 0.09619827767275782, "grad_norm": 1.0546875, "learning_rate": 0.00049999834702799, "loss": 6.5013, "mean_token_accuracy": 0.10883507803082466, "num_tokens": 2112447.0, "step": 1145 }, { "entropy": 6.507535743713379, "epoch": 0.0966183574879227, "grad_norm": 1.03125, "learning_rate": 0.0004999982302455431, "loss": 6.5269, "mean_token_accuracy": 0.11191204637289047, "num_tokens": 2121949.0, "step": 1150 }, { "entropy": 6.507864904403687, "epoch": 0.09703843730308759, "grad_norm": 1.015625, "learning_rate": 0.0004999981094773683, "loss": 6.4328, "mean_token_accuracy": 0.11216317638754844, "num_tokens": 2130464.0, "step": 1155 }, { "entropy": 6.520567464828491, "epoch": 0.09745851711825247, "grad_norm": 1.140625, "learning_rate": 0.000499997984723468, "loss": 6.5942, "mean_token_accuracy": 0.10294081419706344, "num_tokens": 2139577.0, "step": 1160 }, { "entropy": 6.288797092437744, "epoch": 0.09787859693341736, "grad_norm": 0.97265625, "learning_rate": 0.0004999978559838441, "loss": 6.3204, "mean_token_accuracy": 0.11208199337124825, "num_tokens": 2147919.0, "step": 1165 }, { "entropy": 6.472030353546143, "epoch": 0.09829867674858223, "grad_norm": 1.03125, "learning_rate": 0.0004999977232584991, "loss": 6.4949, "mean_token_accuracy": 0.10832359045743942, "num_tokens": 2156936.0, "step": 1170 }, { "entropy": 6.558899450302124, "epoch": 0.09871875656374711, "grad_norm": 1.046875, "learning_rate": 0.0004999975865474354, "loss": 6.5512, "mean_token_accuracy": 0.10766256302595138, "num_tokens": 2165362.0, "step": 1175 }, { "entropy": 6.469175338745117, "epoch": 0.099138836378912, "grad_norm": 1.1171875, "learning_rate": 0.0004999974458506551, "loss": 6.4643, "mean_token_accuracy": 0.10836688205599784, "num_tokens": 2173665.0, "step": 1180 }, { "entropy": 6.551422071456909, "epoch": 0.09955891619407688, "grad_norm": 1.15625, "learning_rate": 0.000499997301168161, "loss": 6.4532, "mean_token_accuracy": 0.11138271391391755, "num_tokens": 2182222.0, "step": 1185 }, { "entropy": 6.531885147094727, "epoch": 0.09997899600924176, "grad_norm": 1.0078125, "learning_rate": 0.0004999971524999556, "loss": 6.5228, "mean_token_accuracy": 0.11111016869544983, "num_tokens": 2192358.0, "step": 1190 }, { "entropy": 6.534890985488891, "epoch": 0.10039907582440663, "grad_norm": 1.0546875, "learning_rate": 0.0004999969998460414, "loss": 6.5355, "mean_token_accuracy": 0.10454710125923157, "num_tokens": 2201889.0, "step": 1195 }, { "entropy": 6.433488464355468, "epoch": 0.10081915563957151, "grad_norm": 1.328125, "learning_rate": 0.0004999968432064213, "loss": 6.5322, "mean_token_accuracy": 0.1198379322886467, "num_tokens": 2211810.0, "step": 1200 }, { "entropy": 6.474250078201294, "epoch": 0.1012392354547364, "grad_norm": 0.95703125, "learning_rate": 0.0004999966825810979, "loss": 6.4684, "mean_token_accuracy": 0.10700508952140808, "num_tokens": 2221123.0, "step": 1205 }, { "entropy": 6.384520959854126, "epoch": 0.10165931526990128, "grad_norm": 1.0703125, "learning_rate": 0.0004999965179700742, "loss": 6.3986, "mean_token_accuracy": 0.11781087368726731, "num_tokens": 2230129.0, "step": 1210 }, { "entropy": 6.4176534652709964, "epoch": 0.10207939508506617, "grad_norm": 1.0078125, "learning_rate": 0.000499996349373353, "loss": 6.4609, "mean_token_accuracy": 0.10817519575357437, "num_tokens": 2239929.0, "step": 1215 }, { "entropy": 6.5110820770263675, "epoch": 0.10249947490023105, "grad_norm": 1.0703125, "learning_rate": 0.0004999961767909374, "loss": 6.4372, "mean_token_accuracy": 0.1148509480059147, "num_tokens": 2248078.0, "step": 1220 }, { "entropy": 6.4125104427337645, "epoch": 0.10291955471539592, "grad_norm": 1.078125, "learning_rate": 0.0004999960002228303, "loss": 6.5274, "mean_token_accuracy": 0.10999985039234161, "num_tokens": 2256975.0, "step": 1225 }, { "entropy": 6.474673461914063, "epoch": 0.1033396345305608, "grad_norm": 1.1484375, "learning_rate": 0.0004999958196690349, "loss": 6.3849, "mean_token_accuracy": 0.11320202201604843, "num_tokens": 2265797.0, "step": 1230 }, { "entropy": 6.479385900497436, "epoch": 0.10375971434572569, "grad_norm": 1.0703125, "learning_rate": 0.0004999956351295545, "loss": 6.4946, "mean_token_accuracy": 0.11450825035572051, "num_tokens": 2274099.0, "step": 1235 }, { "entropy": 6.3540520668029785, "epoch": 0.10417979416089057, "grad_norm": 1.03125, "learning_rate": 0.0004999954466043922, "loss": 6.3917, "mean_token_accuracy": 0.11258968263864517, "num_tokens": 2282360.0, "step": 1240 }, { "entropy": 6.481705999374389, "epoch": 0.10459987397605545, "grad_norm": 0.98046875, "learning_rate": 0.0004999952540935514, "loss": 6.5009, "mean_token_accuracy": 0.10285271480679511, "num_tokens": 2292714.0, "step": 1245 }, { "entropy": 6.455303287506103, "epoch": 0.10501995379122034, "grad_norm": 1.0625, "learning_rate": 0.0004999950575970356, "loss": 6.426, "mean_token_accuracy": 0.11442826837301254, "num_tokens": 2301633.0, "step": 1250 }, { "entropy": 6.465747499465943, "epoch": 0.10544003360638521, "grad_norm": 1.0234375, "learning_rate": 0.0004999948571148482, "loss": 6.4138, "mean_token_accuracy": 0.11426257789134979, "num_tokens": 2310067.0, "step": 1255 }, { "entropy": 6.466140460968018, "epoch": 0.10586011342155009, "grad_norm": 1.0390625, "learning_rate": 0.0004999946526469927, "loss": 6.4932, "mean_token_accuracy": 0.11244904398918151, "num_tokens": 2320090.0, "step": 1260 }, { "entropy": 6.438083505630493, "epoch": 0.10628019323671498, "grad_norm": 1.078125, "learning_rate": 0.0004999944441934728, "loss": 6.4509, "mean_token_accuracy": 0.11593573912978172, "num_tokens": 2329255.0, "step": 1265 }, { "entropy": 6.467304992675781, "epoch": 0.10670027305187986, "grad_norm": 1.1328125, "learning_rate": 0.0004999942317542922, "loss": 6.5481, "mean_token_accuracy": 0.10965899974107743, "num_tokens": 2339535.0, "step": 1270 }, { "entropy": 6.434674501419067, "epoch": 0.10712035286704474, "grad_norm": 1.046875, "learning_rate": 0.0004999940153294546, "loss": 6.4448, "mean_token_accuracy": 0.11061845496296882, "num_tokens": 2348948.0, "step": 1275 }, { "entropy": 6.447847843170166, "epoch": 0.10754043268220961, "grad_norm": 1.046875, "learning_rate": 0.000499993794918964, "loss": 6.4628, "mean_token_accuracy": 0.10641181394457817, "num_tokens": 2359141.0, "step": 1280 }, { "entropy": 6.401166343688965, "epoch": 0.1079605124973745, "grad_norm": 1.1796875, "learning_rate": 0.0004999935705228241, "loss": 6.5084, "mean_token_accuracy": 0.1094856470823288, "num_tokens": 2368906.0, "step": 1285 }, { "entropy": 6.554097080230713, "epoch": 0.10838059231253938, "grad_norm": 1.125, "learning_rate": 0.0004999933421410389, "loss": 6.4839, "mean_token_accuracy": 0.11065066531300545, "num_tokens": 2377029.0, "step": 1290 }, { "entropy": 6.5027672290802006, "epoch": 0.10880067212770426, "grad_norm": 0.9140625, "learning_rate": 0.0004999931097736125, "loss": 6.5541, "mean_token_accuracy": 0.10604767650365829, "num_tokens": 2387088.0, "step": 1295 }, { "entropy": 6.470385646820068, "epoch": 0.10922075194286915, "grad_norm": 1.0546875, "learning_rate": 0.0004999928734205492, "loss": 6.4468, "mean_token_accuracy": 0.11056585833430291, "num_tokens": 2395596.0, "step": 1300 }, { "entropy": 6.403819370269775, "epoch": 0.10964083175803403, "grad_norm": 1.0703125, "learning_rate": 0.0004999926330818528, "loss": 6.4393, "mean_token_accuracy": 0.11377019882202148, "num_tokens": 2404506.0, "step": 1305 }, { "entropy": 6.469174242019653, "epoch": 0.1100609115731989, "grad_norm": 1.09375, "learning_rate": 0.0004999923887575278, "loss": 6.4777, "mean_token_accuracy": 0.11094499379396439, "num_tokens": 2414342.0, "step": 1310 }, { "entropy": 6.476234006881714, "epoch": 0.11048099138836379, "grad_norm": 1.109375, "learning_rate": 0.0004999921404475785, "loss": 6.4422, "mean_token_accuracy": 0.11336205825209618, "num_tokens": 2423076.0, "step": 1315 }, { "entropy": 6.415568065643311, "epoch": 0.11090107120352867, "grad_norm": 0.9453125, "learning_rate": 0.0004999918881520093, "loss": 6.391, "mean_token_accuracy": 0.11621783077716827, "num_tokens": 2432492.0, "step": 1320 }, { "entropy": 6.362053871154785, "epoch": 0.11132115101869355, "grad_norm": 1.0078125, "learning_rate": 0.0004999916318708246, "loss": 6.354, "mean_token_accuracy": 0.11400164812803268, "num_tokens": 2441916.0, "step": 1325 }, { "entropy": 6.406490755081177, "epoch": 0.11174123083385844, "grad_norm": 1.15625, "learning_rate": 0.0004999913716040291, "loss": 6.4072, "mean_token_accuracy": 0.11762610748410225, "num_tokens": 2450932.0, "step": 1330 }, { "entropy": 6.336502504348755, "epoch": 0.11216131064902331, "grad_norm": 1.15625, "learning_rate": 0.0004999911073516272, "loss": 6.4319, "mean_token_accuracy": 0.11254018545150757, "num_tokens": 2460058.0, "step": 1335 }, { "entropy": 6.392711496353149, "epoch": 0.11258139046418819, "grad_norm": 1.0234375, "learning_rate": 0.0004999908391136237, "loss": 6.3569, "mean_token_accuracy": 0.11563631743192673, "num_tokens": 2469607.0, "step": 1340 }, { "entropy": 6.441662883758545, "epoch": 0.11300147027935308, "grad_norm": 1.0859375, "learning_rate": 0.0004999905668900234, "loss": 6.4002, "mean_token_accuracy": 0.11395884156227112, "num_tokens": 2478345.0, "step": 1345 }, { "entropy": 6.438292360305786, "epoch": 0.11342155009451796, "grad_norm": 1.171875, "learning_rate": 0.000499990290680831, "loss": 6.3261, "mean_token_accuracy": 0.11877992302179337, "num_tokens": 2486662.0, "step": 1350 }, { "entropy": 6.379430055618286, "epoch": 0.11384162990968284, "grad_norm": 1.0703125, "learning_rate": 0.0004999900104860516, "loss": 6.472, "mean_token_accuracy": 0.11443257331848145, "num_tokens": 2495392.0, "step": 1355 }, { "entropy": 6.437303638458252, "epoch": 0.11426170972484773, "grad_norm": 1.1171875, "learning_rate": 0.0004999897263056898, "loss": 6.4969, "mean_token_accuracy": 0.10801200717687606, "num_tokens": 2505254.0, "step": 1360 }, { "entropy": 6.457095766067505, "epoch": 0.1146817895400126, "grad_norm": 1.125, "learning_rate": 0.000499989438139751, "loss": 6.3155, "mean_token_accuracy": 0.11900854557752609, "num_tokens": 2514096.0, "step": 1365 }, { "entropy": 6.339952230453491, "epoch": 0.11510186935517748, "grad_norm": 0.94921875, "learning_rate": 0.0004999891459882401, "loss": 6.3262, "mean_token_accuracy": 0.1178194098174572, "num_tokens": 2523635.0, "step": 1370 }, { "entropy": 6.318808507919312, "epoch": 0.11552194917034236, "grad_norm": 1.0234375, "learning_rate": 0.0004999888498511624, "loss": 6.3954, "mean_token_accuracy": 0.11501155719161034, "num_tokens": 2532528.0, "step": 1375 }, { "entropy": 6.366592121124268, "epoch": 0.11594202898550725, "grad_norm": 1.0625, "learning_rate": 0.0004999885497285229, "loss": 6.307, "mean_token_accuracy": 0.11583952903747559, "num_tokens": 2541893.0, "step": 1380 }, { "entropy": 6.354608488082886, "epoch": 0.11636210880067213, "grad_norm": 1.046875, "learning_rate": 0.0004999882456203273, "loss": 6.3581, "mean_token_accuracy": 0.11632645949721336, "num_tokens": 2551551.0, "step": 1385 }, { "entropy": 6.349077987670898, "epoch": 0.11678218861583702, "grad_norm": 1.140625, "learning_rate": 0.0004999879375265806, "loss": 6.3146, "mean_token_accuracy": 0.1158558964729309, "num_tokens": 2560183.0, "step": 1390 }, { "entropy": 6.344199848175049, "epoch": 0.11720226843100189, "grad_norm": 1.1015625, "learning_rate": 0.0004999876254472886, "loss": 6.1959, "mean_token_accuracy": 0.12459081262350083, "num_tokens": 2568697.0, "step": 1395 }, { "entropy": 6.348653078079224, "epoch": 0.11762234824616677, "grad_norm": 0.9609375, "learning_rate": 0.0004999873093824565, "loss": 6.4194, "mean_token_accuracy": 0.11410524025559425, "num_tokens": 2578151.0, "step": 1400 }, { "entropy": 6.50674262046814, "epoch": 0.11804242806133165, "grad_norm": 1.109375, "learning_rate": 0.0004999869893320902, "loss": 6.5289, "mean_token_accuracy": 0.1147321492433548, "num_tokens": 2585901.0, "step": 1405 }, { "entropy": 6.338491153717041, "epoch": 0.11846250787649654, "grad_norm": 1.046875, "learning_rate": 0.0004999866652961952, "loss": 6.3629, "mean_token_accuracy": 0.11298267319798469, "num_tokens": 2595655.0, "step": 1410 }, { "entropy": 6.389230489730835, "epoch": 0.11888258769166142, "grad_norm": 1.0234375, "learning_rate": 0.0004999863372747773, "loss": 6.3335, "mean_token_accuracy": 0.11225836053490638, "num_tokens": 2604949.0, "step": 1415 }, { "entropy": 6.439256811141968, "epoch": 0.11930266750682629, "grad_norm": 1.140625, "learning_rate": 0.0004999860052678423, "loss": 6.3989, "mean_token_accuracy": 0.11546840667724609, "num_tokens": 2614260.0, "step": 1420 }, { "entropy": 6.299542999267578, "epoch": 0.11972274732199117, "grad_norm": 1.1875, "learning_rate": 0.0004999856692753959, "loss": 6.3905, "mean_token_accuracy": 0.11243033632636071, "num_tokens": 2623740.0, "step": 1425 }, { "entropy": 6.37091474533081, "epoch": 0.12014282713715606, "grad_norm": 1.0703125, "learning_rate": 0.0004999853292974444, "loss": 6.2964, "mean_token_accuracy": 0.1178373210132122, "num_tokens": 2631998.0, "step": 1430 }, { "entropy": 6.372178649902343, "epoch": 0.12056290695232094, "grad_norm": 0.96484375, "learning_rate": 0.0004999849853339936, "loss": 6.4358, "mean_token_accuracy": 0.11526904925704003, "num_tokens": 2641169.0, "step": 1435 }, { "entropy": 6.44800329208374, "epoch": 0.12098298676748583, "grad_norm": 0.9296875, "learning_rate": 0.0004999846373850497, "loss": 6.2945, "mean_token_accuracy": 0.11855239495635032, "num_tokens": 2650576.0, "step": 1440 }, { "entropy": 6.257949161529541, "epoch": 0.12140306658265071, "grad_norm": 1.0546875, "learning_rate": 0.0004999842854506186, "loss": 6.3807, "mean_token_accuracy": 0.11334980726242065, "num_tokens": 2660817.0, "step": 1445 }, { "entropy": 6.38723406791687, "epoch": 0.12182314639781558, "grad_norm": 1.0703125, "learning_rate": 0.0004999839295307069, "loss": 6.3212, "mean_token_accuracy": 0.11455826535820961, "num_tokens": 2669338.0, "step": 1450 }, { "entropy": 6.404263877868653, "epoch": 0.12224322621298046, "grad_norm": 1.109375, "learning_rate": 0.0004999835696253206, "loss": 6.3789, "mean_token_accuracy": 0.11618088632822036, "num_tokens": 2679108.0, "step": 1455 }, { "entropy": 6.435732698440551, "epoch": 0.12266330602814535, "grad_norm": 0.96875, "learning_rate": 0.0004999832057344664, "loss": 6.3325, "mean_token_accuracy": 0.1142914392054081, "num_tokens": 2688126.0, "step": 1460 }, { "entropy": 6.152384519577026, "epoch": 0.12308338584331023, "grad_norm": 1.1171875, "learning_rate": 0.0004999828378581504, "loss": 6.3063, "mean_token_accuracy": 0.12400648295879364, "num_tokens": 2697245.0, "step": 1465 }, { "entropy": 6.425075197219849, "epoch": 0.12350346565847511, "grad_norm": 1.046875, "learning_rate": 0.0004999824659963793, "loss": 6.3465, "mean_token_accuracy": 0.1198640413582325, "num_tokens": 2705934.0, "step": 1470 }, { "entropy": 6.265953540802002, "epoch": 0.12392354547364, "grad_norm": 1.140625, "learning_rate": 0.0004999820901491598, "loss": 6.2796, "mean_token_accuracy": 0.12351771965622901, "num_tokens": 2714367.0, "step": 1475 }, { "entropy": 6.334036827087402, "epoch": 0.12434362528880487, "grad_norm": 1.078125, "learning_rate": 0.0004999817103164983, "loss": 6.3413, "mean_token_accuracy": 0.11931266412138938, "num_tokens": 2724366.0, "step": 1480 }, { "entropy": 6.360864496231079, "epoch": 0.12476370510396975, "grad_norm": 1.03125, "learning_rate": 0.0004999813264984017, "loss": 6.3448, "mean_token_accuracy": 0.11467731669545174, "num_tokens": 2733980.0, "step": 1485 }, { "entropy": 6.366592979431152, "epoch": 0.12518378491913462, "grad_norm": 1.046875, "learning_rate": 0.0004999809386948767, "loss": 6.3342, "mean_token_accuracy": 0.12208072617650031, "num_tokens": 2744013.0, "step": 1490 }, { "entropy": 6.299022817611695, "epoch": 0.12560386473429952, "grad_norm": 1.1484375, "learning_rate": 0.0004999805469059302, "loss": 6.4186, "mean_token_accuracy": 0.11027913689613342, "num_tokens": 2753385.0, "step": 1495 }, { "entropy": 6.366168975830078, "epoch": 0.1260239445494644, "grad_norm": 1.078125, "learning_rate": 0.0004999801511315693, "loss": 6.256, "mean_token_accuracy": 0.11804210916161537, "num_tokens": 2762875.0, "step": 1500 }, { "entropy": 6.342552661895752, "epoch": 0.1264440243646293, "grad_norm": 1.078125, "learning_rate": 0.0004999797513718007, "loss": 6.3108, "mean_token_accuracy": 0.12443676739931106, "num_tokens": 2772182.0, "step": 1505 }, { "entropy": 6.206664896011352, "epoch": 0.12686410417979416, "grad_norm": 1.046875, "learning_rate": 0.0004999793476266317, "loss": 6.2711, "mean_token_accuracy": 0.12031201645731926, "num_tokens": 2780814.0, "step": 1510 }, { "entropy": 6.639998197555542, "epoch": 0.12728418399495905, "grad_norm": 1.078125, "learning_rate": 0.0004999789398960695, "loss": 6.5474, "mean_token_accuracy": 0.1183062419295311, "num_tokens": 2791104.0, "step": 1515 }, { "entropy": 6.19776029586792, "epoch": 0.12770426381012392, "grad_norm": 1.046875, "learning_rate": 0.0004999785281801212, "loss": 6.256, "mean_token_accuracy": 0.11993122175335884, "num_tokens": 2800081.0, "step": 1520 }, { "entropy": 6.334916496276856, "epoch": 0.1281243436252888, "grad_norm": 1.1015625, "learning_rate": 0.000499978112478794, "loss": 6.3835, "mean_token_accuracy": 0.11843734234571457, "num_tokens": 2809096.0, "step": 1525 }, { "entropy": 6.403998374938965, "epoch": 0.1285444234404537, "grad_norm": 1.03125, "learning_rate": 0.0004999776927920955, "loss": 6.3545, "mean_token_accuracy": 0.12085104510188102, "num_tokens": 2818857.0, "step": 1530 }, { "entropy": 6.3299469470977785, "epoch": 0.12896450325561856, "grad_norm": 1.0625, "learning_rate": 0.000499977269120033, "loss": 6.4167, "mean_token_accuracy": 0.11449578031897545, "num_tokens": 2829332.0, "step": 1535 }, { "entropy": 6.3263038158416744, "epoch": 0.12938458307078346, "grad_norm": 0.97265625, "learning_rate": 0.000499976841462614, "loss": 6.3436, "mean_token_accuracy": 0.11686776131391526, "num_tokens": 2839193.0, "step": 1540 }, { "entropy": 6.397625589370728, "epoch": 0.12980466288594833, "grad_norm": 0.95703125, "learning_rate": 0.000499976409819846, "loss": 6.3117, "mean_token_accuracy": 0.11800177842378616, "num_tokens": 2848535.0, "step": 1545 }, { "entropy": 6.116656970977783, "epoch": 0.1302247427011132, "grad_norm": 0.9765625, "learning_rate": 0.0004999759741917369, "loss": 6.2278, "mean_token_accuracy": 0.12729543596506118, "num_tokens": 2858090.0, "step": 1550 }, { "entropy": 6.364631414413452, "epoch": 0.1306448225162781, "grad_norm": 1.1796875, "learning_rate": 0.0004999755345782941, "loss": 6.378, "mean_token_accuracy": 0.11326263695955277, "num_tokens": 2866984.0, "step": 1555 }, { "entropy": 6.246821451187134, "epoch": 0.13106490233144297, "grad_norm": 0.96875, "learning_rate": 0.0004999750909795256, "loss": 6.1885, "mean_token_accuracy": 0.1256905347108841, "num_tokens": 2876550.0, "step": 1560 }, { "entropy": 6.341800737380981, "epoch": 0.13148498214660786, "grad_norm": 0.9921875, "learning_rate": 0.0004999746433954394, "loss": 6.286, "mean_token_accuracy": 0.12146776840090752, "num_tokens": 2885782.0, "step": 1565 }, { "entropy": 6.275845241546631, "epoch": 0.13190506196177273, "grad_norm": 1.0234375, "learning_rate": 0.000499974191826043, "loss": 6.2653, "mean_token_accuracy": 0.13301032781600952, "num_tokens": 2894807.0, "step": 1570 }, { "entropy": 6.351547765731811, "epoch": 0.1323251417769376, "grad_norm": 1.140625, "learning_rate": 0.0004999737362713448, "loss": 6.304, "mean_token_accuracy": 0.12145641520619392, "num_tokens": 2904076.0, "step": 1575 }, { "entropy": 6.267245769500732, "epoch": 0.1327452215921025, "grad_norm": 1.0703125, "learning_rate": 0.0004999732767313527, "loss": 6.2029, "mean_token_accuracy": 0.12209122702479362, "num_tokens": 2913761.0, "step": 1580 }, { "entropy": 6.383308267593383, "epoch": 0.13316530140726737, "grad_norm": 1.1328125, "learning_rate": 0.0004999728132060746, "loss": 6.439, "mean_token_accuracy": 0.12098384723067283, "num_tokens": 2922848.0, "step": 1585 }, { "entropy": 6.364631271362304, "epoch": 0.13358538122243227, "grad_norm": 0.92578125, "learning_rate": 0.0004999723456955192, "loss": 6.3245, "mean_token_accuracy": 0.11949731931090354, "num_tokens": 2932718.0, "step": 1590 }, { "entropy": 6.2494594097137455, "epoch": 0.13400546103759714, "grad_norm": 0.97265625, "learning_rate": 0.0004999718741996945, "loss": 6.2837, "mean_token_accuracy": 0.12003797963261605, "num_tokens": 2942686.0, "step": 1595 }, { "entropy": 6.2547472476959225, "epoch": 0.13442554085276204, "grad_norm": 1.0390625, "learning_rate": 0.000499971398718609, "loss": 6.2407, "mean_token_accuracy": 0.1179835021495819, "num_tokens": 2952096.0, "step": 1600 }, { "entropy": 6.3157384395599365, "epoch": 0.1348456206679269, "grad_norm": 1.03125, "learning_rate": 0.0004999709192522708, "loss": 6.3129, "mean_token_accuracy": 0.12474863901734352, "num_tokens": 2960660.0, "step": 1605 }, { "entropy": 6.379588079452515, "epoch": 0.13526570048309178, "grad_norm": 0.96875, "learning_rate": 0.0004999704358006887, "loss": 6.3158, "mean_token_accuracy": 0.11744728311896324, "num_tokens": 2969834.0, "step": 1610 }, { "entropy": 6.285486459732056, "epoch": 0.13568578029825668, "grad_norm": 1.109375, "learning_rate": 0.0004999699483638712, "loss": 6.311, "mean_token_accuracy": 0.12142582982778549, "num_tokens": 2979023.0, "step": 1615 }, { "entropy": 6.294291210174561, "epoch": 0.13610586011342155, "grad_norm": 1.078125, "learning_rate": 0.0004999694569418269, "loss": 6.3063, "mean_token_accuracy": 0.12201808094978332, "num_tokens": 2988083.0, "step": 1620 }, { "entropy": 6.2657451152801515, "epoch": 0.13652593992858644, "grad_norm": 1.0234375, "learning_rate": 0.0004999689615345645, "loss": 6.2388, "mean_token_accuracy": 0.1231310561299324, "num_tokens": 2997240.0, "step": 1625 }, { "entropy": 6.308252573013306, "epoch": 0.1369460197437513, "grad_norm": 1.0859375, "learning_rate": 0.0004999684621420928, "loss": 6.3111, "mean_token_accuracy": 0.1184695117175579, "num_tokens": 3007077.0, "step": 1630 }, { "entropy": 6.319302654266357, "epoch": 0.13736609955891618, "grad_norm": 1.0546875, "learning_rate": 0.0004999679587644205, "loss": 6.3497, "mean_token_accuracy": 0.11671060770750045, "num_tokens": 3015821.0, "step": 1635 }, { "entropy": 6.236631298065186, "epoch": 0.13778617937408108, "grad_norm": 1.0859375, "learning_rate": 0.0004999674514015568, "loss": 6.2724, "mean_token_accuracy": 0.11908711194992065, "num_tokens": 3025858.0, "step": 1640 }, { "entropy": 6.3658030986785885, "epoch": 0.13820625918924595, "grad_norm": 1.046875, "learning_rate": 0.0004999669400535105, "loss": 6.2416, "mean_token_accuracy": 0.11343135982751847, "num_tokens": 3035537.0, "step": 1645 }, { "entropy": 6.147812271118164, "epoch": 0.13862633900441085, "grad_norm": 1.125, "learning_rate": 0.0004999664247202907, "loss": 6.1617, "mean_token_accuracy": 0.11974595785140991, "num_tokens": 3044204.0, "step": 1650 }, { "entropy": 6.327428913116455, "epoch": 0.13904641881957572, "grad_norm": 1.0625, "learning_rate": 0.0004999659054019066, "loss": 6.3345, "mean_token_accuracy": 0.11974811106920243, "num_tokens": 3053111.0, "step": 1655 }, { "entropy": 6.258665418624878, "epoch": 0.1394664986347406, "grad_norm": 1.0390625, "learning_rate": 0.0004999653820983673, "loss": 6.2415, "mean_token_accuracy": 0.12036412507295609, "num_tokens": 3062456.0, "step": 1660 }, { "entropy": 6.2644579887390135, "epoch": 0.13988657844990549, "grad_norm": 1.015625, "learning_rate": 0.000499964854809682, "loss": 6.2627, "mean_token_accuracy": 0.12668107002973555, "num_tokens": 3071132.0, "step": 1665 }, { "entropy": 6.261227464675903, "epoch": 0.14030665826507036, "grad_norm": 0.99609375, "learning_rate": 0.0004999643235358602, "loss": 6.222, "mean_token_accuracy": 0.125965429097414, "num_tokens": 3080892.0, "step": 1670 }, { "entropy": 6.215318775177002, "epoch": 0.14072673808023525, "grad_norm": 1.046875, "learning_rate": 0.0004999637882769112, "loss": 6.1526, "mean_token_accuracy": 0.12532262802124022, "num_tokens": 3089874.0, "step": 1675 }, { "entropy": 6.308867406845093, "epoch": 0.14114681789540012, "grad_norm": 0.93359375, "learning_rate": 0.0004999632490328447, "loss": 6.3008, "mean_token_accuracy": 0.12098695039749145, "num_tokens": 3099535.0, "step": 1680 }, { "entropy": 6.281496620178222, "epoch": 0.14156689771056502, "grad_norm": 1.0, "learning_rate": 0.0004999627058036699, "loss": 6.2552, "mean_token_accuracy": 0.12044425159692765, "num_tokens": 3108772.0, "step": 1685 }, { "entropy": 6.311051607131958, "epoch": 0.1419869775257299, "grad_norm": 1.0390625, "learning_rate": 0.0004999621585893966, "loss": 6.2799, "mean_token_accuracy": 0.11901640743017197, "num_tokens": 3118333.0, "step": 1690 }, { "entropy": 6.305313062667847, "epoch": 0.14240705734089476, "grad_norm": 1.0546875, "learning_rate": 0.0004999616073900346, "loss": 6.3091, "mean_token_accuracy": 0.12129790410399437, "num_tokens": 3127356.0, "step": 1695 }, { "entropy": 6.2683678150177, "epoch": 0.14282713715605966, "grad_norm": 1.0703125, "learning_rate": 0.0004999610522055935, "loss": 6.2794, "mean_token_accuracy": 0.11691329404711723, "num_tokens": 3136859.0, "step": 1700 }, { "entropy": 6.303126668930053, "epoch": 0.14324721697122453, "grad_norm": 1.0390625, "learning_rate": 0.0004999604930360832, "loss": 6.304, "mean_token_accuracy": 0.11767303720116615, "num_tokens": 3146607.0, "step": 1705 }, { "entropy": 6.214645338058472, "epoch": 0.14366729678638943, "grad_norm": 0.9921875, "learning_rate": 0.0004999599298815136, "loss": 6.2515, "mean_token_accuracy": 0.12662419229745864, "num_tokens": 3156327.0, "step": 1710 }, { "entropy": 6.21446213722229, "epoch": 0.1440873766015543, "grad_norm": 1.5859375, "learning_rate": 0.0004999593627418947, "loss": 6.2009, "mean_token_accuracy": 0.1281860999763012, "num_tokens": 3165559.0, "step": 1715 }, { "entropy": 6.299745416641235, "epoch": 0.14450745641671917, "grad_norm": 1.0546875, "learning_rate": 0.0004999587916172365, "loss": 6.2848, "mean_token_accuracy": 0.11663243547081947, "num_tokens": 3173850.0, "step": 1720 }, { "entropy": 6.324022483825684, "epoch": 0.14492753623188406, "grad_norm": 1.015625, "learning_rate": 0.0004999582165075492, "loss": 6.2353, "mean_token_accuracy": 0.11788406521081925, "num_tokens": 3182838.0, "step": 1725 }, { "entropy": 6.144151782989502, "epoch": 0.14534761604704893, "grad_norm": 1.03125, "learning_rate": 0.0004999576374128429, "loss": 6.2299, "mean_token_accuracy": 0.1223968394100666, "num_tokens": 3191692.0, "step": 1730 }, { "entropy": 6.343899536132812, "epoch": 0.14576769586221383, "grad_norm": 1.0703125, "learning_rate": 0.0004999570543331279, "loss": 6.2507, "mean_token_accuracy": 0.12281694263219833, "num_tokens": 3200069.0, "step": 1735 }, { "entropy": 6.2878196239471436, "epoch": 0.1461877756773787, "grad_norm": 1.1953125, "learning_rate": 0.0004999564672684145, "loss": 6.3406, "mean_token_accuracy": 0.11862553879618645, "num_tokens": 3209653.0, "step": 1740 }, { "entropy": 6.361492061614991, "epoch": 0.14660785549254357, "grad_norm": 1.0546875, "learning_rate": 0.0004999558762187131, "loss": 6.2041, "mean_token_accuracy": 0.12774061411619186, "num_tokens": 3218313.0, "step": 1745 }, { "entropy": 6.146276044845581, "epoch": 0.14702793530770847, "grad_norm": 1.0390625, "learning_rate": 0.0004999552811840342, "loss": 6.1521, "mean_token_accuracy": 0.1273271396756172, "num_tokens": 3227525.0, "step": 1750 }, { "entropy": 6.241751718521118, "epoch": 0.14744801512287334, "grad_norm": 0.98046875, "learning_rate": 0.0004999546821643884, "loss": 6.2657, "mean_token_accuracy": 0.121260417252779, "num_tokens": 3237022.0, "step": 1755 }, { "entropy": 6.169715499877929, "epoch": 0.14786809493803824, "grad_norm": 1.03125, "learning_rate": 0.0004999540791597861, "loss": 6.156, "mean_token_accuracy": 0.12248859778046609, "num_tokens": 3246605.0, "step": 1760 }, { "entropy": 6.1003180027008055, "epoch": 0.1482881747532031, "grad_norm": 1.046875, "learning_rate": 0.0004999534721702383, "loss": 6.1054, "mean_token_accuracy": 0.12855856791138648, "num_tokens": 3255587.0, "step": 1765 }, { "entropy": 6.226248407363892, "epoch": 0.148708254568368, "grad_norm": 1.046875, "learning_rate": 0.0004999528611957553, "loss": 6.2171, "mean_token_accuracy": 0.12187446802854537, "num_tokens": 3265669.0, "step": 1770 }, { "entropy": 6.278449535369873, "epoch": 0.14912833438353287, "grad_norm": 1.1328125, "learning_rate": 0.0004999522462363485, "loss": 6.1919, "mean_token_accuracy": 0.1278035633265972, "num_tokens": 3275013.0, "step": 1775 }, { "entropy": 6.265809679031372, "epoch": 0.14954841419869774, "grad_norm": 0.98828125, "learning_rate": 0.0004999516272920283, "loss": 6.311, "mean_token_accuracy": 0.1240921102464199, "num_tokens": 3284723.0, "step": 1780 }, { "entropy": 6.131893539428711, "epoch": 0.14996849401386264, "grad_norm": 1.0625, "learning_rate": 0.000499951004362806, "loss": 6.1325, "mean_token_accuracy": 0.12936908155679702, "num_tokens": 3293860.0, "step": 1785 }, { "entropy": 6.151740789413452, "epoch": 0.1503885738290275, "grad_norm": 1.0078125, "learning_rate": 0.0004999503774486924, "loss": 6.1833, "mean_token_accuracy": 0.12577988132834433, "num_tokens": 3303158.0, "step": 1790 }, { "entropy": 6.184361696243286, "epoch": 0.1508086536441924, "grad_norm": 1.0390625, "learning_rate": 0.0004999497465496987, "loss": 6.1137, "mean_token_accuracy": 0.11985947787761689, "num_tokens": 3313068.0, "step": 1795 }, { "entropy": 6.191692352294922, "epoch": 0.15122873345935728, "grad_norm": 1.109375, "learning_rate": 0.000499949111665836, "loss": 6.2033, "mean_token_accuracy": 0.12312208265066146, "num_tokens": 3321885.0, "step": 1800 }, { "entropy": 6.25971827507019, "epoch": 0.15164881327452215, "grad_norm": 1.0078125, "learning_rate": 0.0004999484727971158, "loss": 6.1858, "mean_token_accuracy": 0.12474783286452293, "num_tokens": 3330924.0, "step": 1805 }, { "entropy": 6.176667261123657, "epoch": 0.15206889308968705, "grad_norm": 1.03125, "learning_rate": 0.000499947829943549, "loss": 6.2248, "mean_token_accuracy": 0.12161886692047119, "num_tokens": 3340070.0, "step": 1810 }, { "entropy": 6.295008039474487, "epoch": 0.15248897290485192, "grad_norm": 1.0625, "learning_rate": 0.0004999471831051474, "loss": 6.213, "mean_token_accuracy": 0.13358828723430632, "num_tokens": 3349870.0, "step": 1815 }, { "entropy": 6.278341436386109, "epoch": 0.1529090527200168, "grad_norm": 1.0078125, "learning_rate": 0.0004999465322819222, "loss": 6.2576, "mean_token_accuracy": 0.11560158357024193, "num_tokens": 3359573.0, "step": 1820 }, { "entropy": 6.279096603393555, "epoch": 0.15332913253518168, "grad_norm": 1.078125, "learning_rate": 0.0004999458774738851, "loss": 6.1999, "mean_token_accuracy": 0.13126230910420417, "num_tokens": 3368577.0, "step": 1825 }, { "entropy": 6.1456389904022215, "epoch": 0.15374921235034655, "grad_norm": 1.0625, "learning_rate": 0.0004999452186810476, "loss": 6.1662, "mean_token_accuracy": 0.12922282814979552, "num_tokens": 3377801.0, "step": 1830 }, { "entropy": 6.282723903656006, "epoch": 0.15416929216551145, "grad_norm": 1.0859375, "learning_rate": 0.0004999445559034214, "loss": 6.2248, "mean_token_accuracy": 0.12709890604019164, "num_tokens": 3386666.0, "step": 1835 }, { "entropy": 6.3540504455566404, "epoch": 0.15458937198067632, "grad_norm": 1.0078125, "learning_rate": 0.0004999438891410181, "loss": 6.3599, "mean_token_accuracy": 0.12122973501682281, "num_tokens": 3396086.0, "step": 1840 }, { "entropy": 6.2125379085540775, "epoch": 0.15500945179584122, "grad_norm": 1.046875, "learning_rate": 0.0004999432183938496, "loss": 6.2646, "mean_token_accuracy": 0.1275039754807949, "num_tokens": 3404894.0, "step": 1845 }, { "entropy": 6.214909315109253, "epoch": 0.1554295316110061, "grad_norm": 1.046875, "learning_rate": 0.0004999425436619279, "loss": 6.2499, "mean_token_accuracy": 0.12167986705899239, "num_tokens": 3414172.0, "step": 1850 }, { "entropy": 6.310878896713257, "epoch": 0.15584961142617096, "grad_norm": 0.97265625, "learning_rate": 0.000499941864945265, "loss": 6.2176, "mean_token_accuracy": 0.11906537339091301, "num_tokens": 3423409.0, "step": 1855 }, { "entropy": 6.134654092788696, "epoch": 0.15626969124133586, "grad_norm": 1.0234375, "learning_rate": 0.0004999411822438726, "loss": 6.1799, "mean_token_accuracy": 0.12394418343901634, "num_tokens": 3433047.0, "step": 1860 }, { "entropy": 6.2948554992675785, "epoch": 0.15668977105650073, "grad_norm": 1.1484375, "learning_rate": 0.000499940495557763, "loss": 6.173, "mean_token_accuracy": 0.12352384477853776, "num_tokens": 3442490.0, "step": 1865 }, { "entropy": 6.233772277832031, "epoch": 0.15710985087166562, "grad_norm": 1.03125, "learning_rate": 0.0004999398048869485, "loss": 6.2356, "mean_token_accuracy": 0.1239772841334343, "num_tokens": 3451804.0, "step": 1870 }, { "entropy": 6.296554517745972, "epoch": 0.1575299306868305, "grad_norm": 1.0546875, "learning_rate": 0.000499939110231441, "loss": 6.2223, "mean_token_accuracy": 0.12610766440629959, "num_tokens": 3461481.0, "step": 1875 }, { "entropy": 6.218039226531983, "epoch": 0.1579500105019954, "grad_norm": 1.1484375, "learning_rate": 0.0004999384115912531, "loss": 6.2673, "mean_token_accuracy": 0.1208581991493702, "num_tokens": 3471798.0, "step": 1880 }, { "entropy": 6.088755655288696, "epoch": 0.15837009031716026, "grad_norm": 1.0234375, "learning_rate": 0.000499937708966397, "loss": 6.1755, "mean_token_accuracy": 0.12277546525001526, "num_tokens": 3481386.0, "step": 1885 }, { "entropy": 6.257310009002685, "epoch": 0.15879017013232513, "grad_norm": 1.0390625, "learning_rate": 0.0004999370023568853, "loss": 6.1643, "mean_token_accuracy": 0.12328559309244155, "num_tokens": 3489981.0, "step": 1890 }, { "entropy": 6.140112638473511, "epoch": 0.15921024994749003, "grad_norm": 1.0703125, "learning_rate": 0.0004999362917627304, "loss": 6.1438, "mean_token_accuracy": 0.12805134281516076, "num_tokens": 3498551.0, "step": 1895 }, { "entropy": 6.224145746231079, "epoch": 0.1596303297626549, "grad_norm": 1.140625, "learning_rate": 0.0004999355771839448, "loss": 6.1267, "mean_token_accuracy": 0.1276252895593643, "num_tokens": 3507921.0, "step": 1900 }, { "entropy": 6.316604804992676, "epoch": 0.1600504095778198, "grad_norm": 1.1171875, "learning_rate": 0.0004999348586205414, "loss": 6.2984, "mean_token_accuracy": 0.12361158952116966, "num_tokens": 3517570.0, "step": 1905 }, { "entropy": 6.265382909774781, "epoch": 0.16047048939298467, "grad_norm": 1.125, "learning_rate": 0.0004999341360725327, "loss": 6.2786, "mean_token_accuracy": 0.11925147697329522, "num_tokens": 3526774.0, "step": 1910 }, { "entropy": 6.244428873062134, "epoch": 0.16089056920814954, "grad_norm": 1.09375, "learning_rate": 0.0004999334095399317, "loss": 6.2167, "mean_token_accuracy": 0.1289656363427639, "num_tokens": 3535319.0, "step": 1915 }, { "entropy": 6.091944026947021, "epoch": 0.16131064902331443, "grad_norm": 1.03125, "learning_rate": 0.0004999326790227512, "loss": 6.1819, "mean_token_accuracy": 0.12599623277783395, "num_tokens": 3544468.0, "step": 1920 }, { "entropy": 6.069698667526245, "epoch": 0.1617307288384793, "grad_norm": 0.97265625, "learning_rate": 0.0004999319445210041, "loss": 6.0574, "mean_token_accuracy": 0.13135963827371597, "num_tokens": 3553529.0, "step": 1925 }, { "entropy": 6.176232147216797, "epoch": 0.1621508086536442, "grad_norm": 1.03125, "learning_rate": 0.0004999312060347034, "loss": 6.1206, "mean_token_accuracy": 0.12521466836333275, "num_tokens": 3563053.0, "step": 1930 }, { "entropy": 6.155474901199341, "epoch": 0.16257088846880907, "grad_norm": 0.9765625, "learning_rate": 0.0004999304635638621, "loss": 6.0713, "mean_token_accuracy": 0.13156753256917, "num_tokens": 3571877.0, "step": 1935 }, { "entropy": 6.117454576492309, "epoch": 0.16299096828397394, "grad_norm": 0.984375, "learning_rate": 0.0004999297171084935, "loss": 6.1211, "mean_token_accuracy": 0.12843042388558387, "num_tokens": 3581496.0, "step": 1940 }, { "entropy": 6.246276712417602, "epoch": 0.16341104809913884, "grad_norm": 1.0078125, "learning_rate": 0.0004999289666686109, "loss": 6.1408, "mean_token_accuracy": 0.12944318503141403, "num_tokens": 3590752.0, "step": 1945 }, { "entropy": 6.026504850387573, "epoch": 0.1638311279143037, "grad_norm": 1.0234375, "learning_rate": 0.0004999282122442274, "loss": 6.1427, "mean_token_accuracy": 0.12940528690814973, "num_tokens": 3599885.0, "step": 1950 }, { "entropy": 6.306515789031982, "epoch": 0.1642512077294686, "grad_norm": 1.0078125, "learning_rate": 0.0004999274538353564, "loss": 6.2127, "mean_token_accuracy": 0.12124313414096832, "num_tokens": 3610039.0, "step": 1955 }, { "entropy": 6.1400439739227295, "epoch": 0.16467128754463348, "grad_norm": 1.09375, "learning_rate": 0.0004999266914420114, "loss": 6.1432, "mean_token_accuracy": 0.12274663522839546, "num_tokens": 3619954.0, "step": 1960 }, { "entropy": 6.1886210441589355, "epoch": 0.16509136735979837, "grad_norm": 1.03125, "learning_rate": 0.000499925925064206, "loss": 6.0913, "mean_token_accuracy": 0.13008279874920844, "num_tokens": 3628164.0, "step": 1965 }, { "entropy": 6.256851673126221, "epoch": 0.16551144717496324, "grad_norm": 1.0546875, "learning_rate": 0.0004999251547019535, "loss": 6.2411, "mean_token_accuracy": 0.1288958877325058, "num_tokens": 3636778.0, "step": 1970 }, { "entropy": 6.259689378738403, "epoch": 0.16593152699012811, "grad_norm": 1.1484375, "learning_rate": 0.0004999243803552678, "loss": 6.2104, "mean_token_accuracy": 0.1265132576227188, "num_tokens": 3647046.0, "step": 1975 }, { "entropy": 6.134534025192261, "epoch": 0.166351606805293, "grad_norm": 1.09375, "learning_rate": 0.0004999236020241625, "loss": 6.1237, "mean_token_accuracy": 0.1289564423263073, "num_tokens": 3656130.0, "step": 1980 }, { "entropy": 6.189244413375855, "epoch": 0.16677168662045788, "grad_norm": 1.09375, "learning_rate": 0.0004999228197086514, "loss": 6.2018, "mean_token_accuracy": 0.11904976442456246, "num_tokens": 3666145.0, "step": 1985 }, { "entropy": 6.2379295349121096, "epoch": 0.16719176643562278, "grad_norm": 0.921875, "learning_rate": 0.0004999220334087484, "loss": 6.2356, "mean_token_accuracy": 0.12509587332606315, "num_tokens": 3676722.0, "step": 1990 }, { "entropy": 6.233392667770386, "epoch": 0.16761184625078765, "grad_norm": 1.015625, "learning_rate": 0.0004999212431244673, "loss": 6.2382, "mean_token_accuracy": 0.1240171104669571, "num_tokens": 3685880.0, "step": 1995 }, { "entropy": 6.1124889850616455, "epoch": 0.16803192606595252, "grad_norm": 1.015625, "learning_rate": 0.0004999204488558222, "loss": 6.0582, "mean_token_accuracy": 0.13227254450321196, "num_tokens": 3695167.0, "step": 2000 }, { "entropy": 6.222057247161866, "epoch": 0.16845200588111742, "grad_norm": 1.0625, "learning_rate": 0.0004999196506028273, "loss": 6.1797, "mean_token_accuracy": 0.12606113404035568, "num_tokens": 3703700.0, "step": 2005 }, { "entropy": 6.204267930984497, "epoch": 0.1688720856962823, "grad_norm": 1.1015625, "learning_rate": 0.0004999188483654965, "loss": 6.1263, "mean_token_accuracy": 0.12780678346753122, "num_tokens": 3712825.0, "step": 2010 }, { "entropy": 6.068148231506347, "epoch": 0.16929216551144718, "grad_norm": 0.99609375, "learning_rate": 0.0004999180421438442, "loss": 6.0953, "mean_token_accuracy": 0.12944422513246537, "num_tokens": 3721807.0, "step": 2015 }, { "entropy": 6.252347660064697, "epoch": 0.16971224532661205, "grad_norm": 1.1171875, "learning_rate": 0.0004999172319378846, "loss": 6.2617, "mean_token_accuracy": 0.12066083624958993, "num_tokens": 3730502.0, "step": 2020 }, { "entropy": 6.223606538772583, "epoch": 0.17013232514177692, "grad_norm": 1.0546875, "learning_rate": 0.0004999164177476319, "loss": 6.1457, "mean_token_accuracy": 0.13003366217017173, "num_tokens": 3739696.0, "step": 2025 }, { "entropy": 6.0265522480010985, "epoch": 0.17055240495694182, "grad_norm": 1.09375, "learning_rate": 0.0004999155995731009, "loss": 6.1404, "mean_token_accuracy": 0.1299336552619934, "num_tokens": 3748675.0, "step": 2030 }, { "entropy": 6.380355882644653, "epoch": 0.1709724847721067, "grad_norm": 1.1015625, "learning_rate": 0.0004999147774143057, "loss": 6.2221, "mean_token_accuracy": 0.12048738449811935, "num_tokens": 3757714.0, "step": 2035 }, { "entropy": 6.067580938339233, "epoch": 0.1713925645872716, "grad_norm": 1.0078125, "learning_rate": 0.000499913951271261, "loss": 6.0375, "mean_token_accuracy": 0.13202561810612679, "num_tokens": 3767589.0, "step": 2040 }, { "entropy": 6.142302322387695, "epoch": 0.17181264440243646, "grad_norm": 1.296875, "learning_rate": 0.0004999131211439816, "loss": 6.1596, "mean_token_accuracy": 0.12828587144613265, "num_tokens": 3777261.0, "step": 2045 }, { "entropy": 6.232779121398925, "epoch": 0.17223272421760136, "grad_norm": 1.0859375, "learning_rate": 0.000499912287032482, "loss": 6.1001, "mean_token_accuracy": 0.1372594192624092, "num_tokens": 3786658.0, "step": 2050 }, { "entropy": 6.025224256515503, "epoch": 0.17265280403276623, "grad_norm": 1.09375, "learning_rate": 0.000499911448936777, "loss": 6.1026, "mean_token_accuracy": 0.13396917879581452, "num_tokens": 3794977.0, "step": 2055 }, { "entropy": 6.084959363937378, "epoch": 0.1730728838479311, "grad_norm": 0.98828125, "learning_rate": 0.0004999106068568816, "loss": 6.1787, "mean_token_accuracy": 0.12529570311307908, "num_tokens": 3805138.0, "step": 2060 }, { "entropy": 6.263661098480225, "epoch": 0.173492963663096, "grad_norm": 1.0546875, "learning_rate": 0.0004999097607928106, "loss": 6.1258, "mean_token_accuracy": 0.13813115134835244, "num_tokens": 3814444.0, "step": 2065 }, { "entropy": 6.166193580627441, "epoch": 0.17391304347826086, "grad_norm": 1.0625, "learning_rate": 0.0004999089107445788, "loss": 6.0785, "mean_token_accuracy": 0.12874337583780288, "num_tokens": 3822859.0, "step": 2070 }, { "entropy": 6.0040192127227785, "epoch": 0.17433312329342576, "grad_norm": 1.0, "learning_rate": 0.0004999080567122016, "loss": 6.102, "mean_token_accuracy": 0.1266925446689129, "num_tokens": 3833159.0, "step": 2075 }, { "entropy": 6.185031747817993, "epoch": 0.17475320310859063, "grad_norm": 1.1015625, "learning_rate": 0.0004999071986956941, "loss": 6.1269, "mean_token_accuracy": 0.1295515276491642, "num_tokens": 3842136.0, "step": 2080 }, { "entropy": 6.116478013992309, "epoch": 0.1751732829237555, "grad_norm": 1.0546875, "learning_rate": 0.0004999063366950713, "loss": 6.1939, "mean_token_accuracy": 0.1253967322409153, "num_tokens": 3851406.0, "step": 2085 }, { "entropy": 6.1408590316772464, "epoch": 0.1755933627389204, "grad_norm": 1.078125, "learning_rate": 0.0004999054707103486, "loss": 6.1026, "mean_token_accuracy": 0.1274511694908142, "num_tokens": 3861061.0, "step": 2090 }, { "entropy": 6.164148044586182, "epoch": 0.17601344255408527, "grad_norm": 1.0859375, "learning_rate": 0.0004999046007415412, "loss": 6.067, "mean_token_accuracy": 0.12591860070824623, "num_tokens": 3870357.0, "step": 2095 }, { "entropy": 6.192416858673096, "epoch": 0.17643352236925017, "grad_norm": 1.0703125, "learning_rate": 0.0004999037267886646, "loss": 6.0964, "mean_token_accuracy": 0.1299741767346859, "num_tokens": 3879393.0, "step": 2100 }, { "entropy": 6.0785363674163815, "epoch": 0.17685360218441504, "grad_norm": 1.09375, "learning_rate": 0.0004999028488517343, "loss": 6.1037, "mean_token_accuracy": 0.12889744639396666, "num_tokens": 3888030.0, "step": 2105 }, { "entropy": 6.11736216545105, "epoch": 0.1772736819995799, "grad_norm": 1.125, "learning_rate": 0.0004999019669307659, "loss": 6.1275, "mean_token_accuracy": 0.13039418011903764, "num_tokens": 3897430.0, "step": 2110 }, { "entropy": 6.1809111595153805, "epoch": 0.1776937618147448, "grad_norm": 0.98046875, "learning_rate": 0.0004999010810257749, "loss": 6.1428, "mean_token_accuracy": 0.1269817218184471, "num_tokens": 3907711.0, "step": 2115 }, { "entropy": 6.062447786331177, "epoch": 0.17811384162990967, "grad_norm": 1.1015625, "learning_rate": 0.0004999001911367771, "loss": 6.0668, "mean_token_accuracy": 0.1323694571852684, "num_tokens": 3915816.0, "step": 2120 }, { "entropy": 6.1604491710662845, "epoch": 0.17853392144507457, "grad_norm": 1.0546875, "learning_rate": 0.0004998992972637883, "loss": 6.1943, "mean_token_accuracy": 0.1183660313487053, "num_tokens": 3925162.0, "step": 2125 }, { "entropy": 6.203741979598999, "epoch": 0.17895400126023944, "grad_norm": 1.0390625, "learning_rate": 0.0004998983994068242, "loss": 6.0864, "mean_token_accuracy": 0.1282353989779949, "num_tokens": 3934476.0, "step": 2130 }, { "entropy": 6.044822025299072, "epoch": 0.17937408107540434, "grad_norm": 1.015625, "learning_rate": 0.0004998974975659006, "loss": 6.124, "mean_token_accuracy": 0.12441963106393814, "num_tokens": 3943501.0, "step": 2135 }, { "entropy": 6.184865283966064, "epoch": 0.1797941608905692, "grad_norm": 1.0390625, "learning_rate": 0.0004998965917410338, "loss": 6.1111, "mean_token_accuracy": 0.12969196289777757, "num_tokens": 3953663.0, "step": 2140 }, { "entropy": 6.129238748550415, "epoch": 0.18021424070573408, "grad_norm": 1.1015625, "learning_rate": 0.0004998956819322397, "loss": 6.0839, "mean_token_accuracy": 0.13072072938084603, "num_tokens": 3962634.0, "step": 2145 }, { "entropy": 6.135206937789917, "epoch": 0.18063432052089898, "grad_norm": 1.078125, "learning_rate": 0.0004998947681395343, "loss": 6.0859, "mean_token_accuracy": 0.1366378679871559, "num_tokens": 3972496.0, "step": 2150 }, { "entropy": 6.271072053909302, "epoch": 0.18105440033606385, "grad_norm": 1.0859375, "learning_rate": 0.000499893850362934, "loss": 6.3296, "mean_token_accuracy": 0.12187584564089775, "num_tokens": 3980724.0, "step": 2155 }, { "entropy": 6.224115467071533, "epoch": 0.18147448015122875, "grad_norm": 1.078125, "learning_rate": 0.0004998929286024548, "loss": 6.1594, "mean_token_accuracy": 0.12844373360276223, "num_tokens": 3989842.0, "step": 2160 }, { "entropy": 6.123717546463013, "epoch": 0.18189455996639362, "grad_norm": 1.1640625, "learning_rate": 0.0004998920028581133, "loss": 6.0814, "mean_token_accuracy": 0.13656101748347282, "num_tokens": 3998534.0, "step": 2165 }, { "entropy": 6.150679874420166, "epoch": 0.18231463978155849, "grad_norm": 1.046875, "learning_rate": 0.0004998910731299258, "loss": 6.1088, "mean_token_accuracy": 0.12456604689359665, "num_tokens": 4007677.0, "step": 2170 }, { "entropy": 6.126907587051392, "epoch": 0.18273471959672338, "grad_norm": 1.1015625, "learning_rate": 0.0004998901394179085, "loss": 6.1638, "mean_token_accuracy": 0.12525054216384887, "num_tokens": 4016347.0, "step": 2175 }, { "entropy": 6.135372829437256, "epoch": 0.18315479941188825, "grad_norm": 1.1015625, "learning_rate": 0.0004998892017220784, "loss": 6.0213, "mean_token_accuracy": 0.13323480933904647, "num_tokens": 4025199.0, "step": 2180 }, { "entropy": 6.137722158432007, "epoch": 0.18357487922705315, "grad_norm": 1.125, "learning_rate": 0.0004998882600424519, "loss": 6.0876, "mean_token_accuracy": 0.12551357075572014, "num_tokens": 4033933.0, "step": 2185 }, { "entropy": 6.108227968215942, "epoch": 0.18399495904221802, "grad_norm": 1.2109375, "learning_rate": 0.0004998873143790455, "loss": 6.0183, "mean_token_accuracy": 0.1379354938864708, "num_tokens": 4042891.0, "step": 2190 }, { "entropy": 6.1591612815856935, "epoch": 0.1844150388573829, "grad_norm": 1.0546875, "learning_rate": 0.0004998863647318763, "loss": 6.1366, "mean_token_accuracy": 0.1241612270474434, "num_tokens": 4051123.0, "step": 2195 }, { "entropy": 6.089571523666382, "epoch": 0.1848351186725478, "grad_norm": 1.1640625, "learning_rate": 0.0004998854111009608, "loss": 6.113, "mean_token_accuracy": 0.12376126572489739, "num_tokens": 4060025.0, "step": 2200 }, { "entropy": 6.11730580329895, "epoch": 0.18525519848771266, "grad_norm": 1.0234375, "learning_rate": 0.0004998844534863161, "loss": 6.0217, "mean_token_accuracy": 0.12926619052886962, "num_tokens": 4069363.0, "step": 2205 }, { "entropy": 6.176160907745361, "epoch": 0.18567527830287756, "grad_norm": 1.046875, "learning_rate": 0.0004998834918879592, "loss": 6.1692, "mean_token_accuracy": 0.12947654128074645, "num_tokens": 4078855.0, "step": 2210 }, { "entropy": 6.131696176528931, "epoch": 0.18609535811804243, "grad_norm": 1.0234375, "learning_rate": 0.000499882526305907, "loss": 6.1424, "mean_token_accuracy": 0.12837494984269143, "num_tokens": 4087801.0, "step": 2215 }, { "entropy": 6.191353893280029, "epoch": 0.18651543793320732, "grad_norm": 1.0390625, "learning_rate": 0.0004998815567401765, "loss": 6.1351, "mean_token_accuracy": 0.12790770679712296, "num_tokens": 4096949.0, "step": 2220 }, { "entropy": 6.171415328979492, "epoch": 0.1869355177483722, "grad_norm": 1.078125, "learning_rate": 0.0004998805831907851, "loss": 6.084, "mean_token_accuracy": 0.1275387942790985, "num_tokens": 4105399.0, "step": 2225 }, { "entropy": 6.12052903175354, "epoch": 0.18735559756353706, "grad_norm": 1.0859375, "learning_rate": 0.0004998796056577501, "loss": 6.0391, "mean_token_accuracy": 0.1234730213880539, "num_tokens": 4113873.0, "step": 2230 }, { "entropy": 6.033805179595947, "epoch": 0.18777567737870196, "grad_norm": 1.03125, "learning_rate": 0.0004998786241410886, "loss": 6.1003, "mean_token_accuracy": 0.12796764224767684, "num_tokens": 4123528.0, "step": 2235 }, { "entropy": 6.244566345214844, "epoch": 0.18819575719386683, "grad_norm": 1.015625, "learning_rate": 0.000499877638640818, "loss": 6.1131, "mean_token_accuracy": 0.12414761930704117, "num_tokens": 4133370.0, "step": 2240 }, { "entropy": 6.0351306915283205, "epoch": 0.18861583700903173, "grad_norm": 1.0625, "learning_rate": 0.000499876649156956, "loss": 6.0237, "mean_token_accuracy": 0.13068948239088057, "num_tokens": 4142370.0, "step": 2245 }, { "entropy": 6.075446557998657, "epoch": 0.1890359168241966, "grad_norm": 1.0703125, "learning_rate": 0.0004998756556895196, "loss": 6.1176, "mean_token_accuracy": 0.12780525609850885, "num_tokens": 4152367.0, "step": 2250 }, { "entropy": 6.182886552810669, "epoch": 0.18945599663936147, "grad_norm": 1.0546875, "learning_rate": 0.000499874658238527, "loss": 6.0979, "mean_token_accuracy": 0.1277949795126915, "num_tokens": 4161126.0, "step": 2255 }, { "entropy": 6.106898975372315, "epoch": 0.18987607645452637, "grad_norm": 1.1015625, "learning_rate": 0.0004998736568039957, "loss": 6.0094, "mean_token_accuracy": 0.13100193440914154, "num_tokens": 4169910.0, "step": 2260 }, { "entropy": 6.133787775039673, "epoch": 0.19029615626969124, "grad_norm": 1.0625, "learning_rate": 0.0004998726513859432, "loss": 6.1599, "mean_token_accuracy": 0.12446666359901429, "num_tokens": 4179893.0, "step": 2265 }, { "entropy": 6.202354001998901, "epoch": 0.19071623608485613, "grad_norm": 0.984375, "learning_rate": 0.0004998716419843875, "loss": 6.1617, "mean_token_accuracy": 0.1319762259721756, "num_tokens": 4190065.0, "step": 2270 }, { "entropy": 6.011490678787231, "epoch": 0.191136315900021, "grad_norm": 1.125, "learning_rate": 0.0004998706285993465, "loss": 6.069, "mean_token_accuracy": 0.13331144750118257, "num_tokens": 4198395.0, "step": 2275 }, { "entropy": 6.173086833953858, "epoch": 0.19155639571518587, "grad_norm": 1.03125, "learning_rate": 0.0004998696112308381, "loss": 6.093, "mean_token_accuracy": 0.1271330051124096, "num_tokens": 4207555.0, "step": 2280 }, { "entropy": 6.0555767059326175, "epoch": 0.19197647553035077, "grad_norm": 1.0390625, "learning_rate": 0.0004998685898788803, "loss": 6.0375, "mean_token_accuracy": 0.1309538424015045, "num_tokens": 4216533.0, "step": 2285 }, { "entropy": 6.211866235733032, "epoch": 0.19239655534551564, "grad_norm": 1.1328125, "learning_rate": 0.0004998675645434914, "loss": 6.1419, "mean_token_accuracy": 0.1353093557059765, "num_tokens": 4225575.0, "step": 2290 }, { "entropy": 6.018606328964234, "epoch": 0.19281663516068054, "grad_norm": 1.1171875, "learning_rate": 0.0004998665352246891, "loss": 5.9193, "mean_token_accuracy": 0.13810657039284707, "num_tokens": 4234306.0, "step": 2295 }, { "entropy": 6.014672660827637, "epoch": 0.1932367149758454, "grad_norm": 1.0546875, "learning_rate": 0.0004998655019224921, "loss": 6.1267, "mean_token_accuracy": 0.12904786244034766, "num_tokens": 4243998.0, "step": 2300 }, { "entropy": 6.134347867965698, "epoch": 0.19365679479101028, "grad_norm": 1.0859375, "learning_rate": 0.0004998644646369185, "loss": 6.0238, "mean_token_accuracy": 0.12680166810750962, "num_tokens": 4253653.0, "step": 2305 }, { "entropy": 6.066501617431641, "epoch": 0.19407687460617518, "grad_norm": 1.0390625, "learning_rate": 0.0004998634233679865, "loss": 6.0895, "mean_token_accuracy": 0.12311211153864861, "num_tokens": 4263305.0, "step": 2310 }, { "entropy": 6.049868440628051, "epoch": 0.19449695442134005, "grad_norm": 1.078125, "learning_rate": 0.000499862378115715, "loss": 5.983, "mean_token_accuracy": 0.13395097106695175, "num_tokens": 4272212.0, "step": 2315 }, { "entropy": 6.165916633605957, "epoch": 0.19491703423650494, "grad_norm": 1.1796875, "learning_rate": 0.0004998613288801221, "loss": 6.1922, "mean_token_accuracy": 0.1247316338121891, "num_tokens": 4281445.0, "step": 2320 }, { "entropy": 6.179806041717529, "epoch": 0.1953371140516698, "grad_norm": 1.15625, "learning_rate": 0.0004998602756612267, "loss": 6.0898, "mean_token_accuracy": 0.12693395391106604, "num_tokens": 4290938.0, "step": 2325 }, { "entropy": 6.070136451721192, "epoch": 0.1957571938668347, "grad_norm": 1.078125, "learning_rate": 0.0004998592184590471, "loss": 6.1397, "mean_token_accuracy": 0.12676772177219392, "num_tokens": 4300022.0, "step": 2330 }, { "entropy": 6.06673412322998, "epoch": 0.19617727368199958, "grad_norm": 1.0859375, "learning_rate": 0.0004998581572736024, "loss": 6.0179, "mean_token_accuracy": 0.13165862262248992, "num_tokens": 4308910.0, "step": 2335 }, { "entropy": 5.994941234588623, "epoch": 0.19659735349716445, "grad_norm": 1.078125, "learning_rate": 0.0004998570921049112, "loss": 5.9863, "mean_token_accuracy": 0.135918989777565, "num_tokens": 4317136.0, "step": 2340 }, { "entropy": 6.102301931381225, "epoch": 0.19701743331232935, "grad_norm": 1.1484375, "learning_rate": 0.0004998560229529924, "loss": 6.0425, "mean_token_accuracy": 0.13503788635134698, "num_tokens": 4326163.0, "step": 2345 }, { "entropy": 6.227736186981201, "epoch": 0.19743751312749422, "grad_norm": 1.0390625, "learning_rate": 0.0004998549498178649, "loss": 6.1881, "mean_token_accuracy": 0.13264173418283462, "num_tokens": 4335837.0, "step": 2350 }, { "entropy": 6.1506922245025635, "epoch": 0.19785759294265912, "grad_norm": 1.171875, "learning_rate": 0.0004998538726995477, "loss": 6.1094, "mean_token_accuracy": 0.13223380818963051, "num_tokens": 4345108.0, "step": 2355 }, { "entropy": 6.144142389297485, "epoch": 0.198277672757824, "grad_norm": 1.046875, "learning_rate": 0.00049985279159806, "loss": 6.1229, "mean_token_accuracy": 0.1271647334098816, "num_tokens": 4353761.0, "step": 2360 }, { "entropy": 6.1053972244262695, "epoch": 0.19869775257298886, "grad_norm": 1.1015625, "learning_rate": 0.0004998517065134208, "loss": 6.0771, "mean_token_accuracy": 0.1304875746369362, "num_tokens": 4363244.0, "step": 2365 }, { "entropy": 6.125473690032959, "epoch": 0.19911783238815375, "grad_norm": 1.0625, "learning_rate": 0.0004998506174456494, "loss": 6.0856, "mean_token_accuracy": 0.1269718214869499, "num_tokens": 4373034.0, "step": 2370 }, { "entropy": 6.056502437591552, "epoch": 0.19953791220331862, "grad_norm": 1.0546875, "learning_rate": 0.0004998495243947653, "loss": 6.0113, "mean_token_accuracy": 0.12611002326011658, "num_tokens": 4382554.0, "step": 2375 }, { "entropy": 6.116158485412598, "epoch": 0.19995799201848352, "grad_norm": 1.203125, "learning_rate": 0.0004998484273607875, "loss": 6.0324, "mean_token_accuracy": 0.13722692728042601, "num_tokens": 4391001.0, "step": 2380 }, { "entropy": 5.908738136291504, "epoch": 0.2003780718336484, "grad_norm": 1.03125, "learning_rate": 0.0004998473263437356, "loss": 5.9468, "mean_token_accuracy": 0.1328367456793785, "num_tokens": 4400632.0, "step": 2385 }, { "entropy": 6.068370723724366, "epoch": 0.20079815164881326, "grad_norm": 1.078125, "learning_rate": 0.000499846221343629, "loss": 6.0486, "mean_token_accuracy": 0.12969876527786256, "num_tokens": 4409565.0, "step": 2390 }, { "entropy": 6.078929996490478, "epoch": 0.20121823146397816, "grad_norm": 1.1015625, "learning_rate": 0.0004998451123604875, "loss": 5.9972, "mean_token_accuracy": 0.13624220937490464, "num_tokens": 4418384.0, "step": 2395 }, { "entropy": 6.103708171844483, "epoch": 0.20163831127914303, "grad_norm": 1.1171875, "learning_rate": 0.0004998439993943306, "loss": 6.11, "mean_token_accuracy": 0.13608327358961106, "num_tokens": 4427581.0, "step": 2400 }, { "entropy": 6.2018999576568605, "epoch": 0.20205839109430793, "grad_norm": 1.125, "learning_rate": 0.0004998428824451779, "loss": 6.1047, "mean_token_accuracy": 0.1272777199745178, "num_tokens": 4436572.0, "step": 2405 }, { "entropy": 6.056638908386231, "epoch": 0.2024784709094728, "grad_norm": 1.1328125, "learning_rate": 0.0004998417615130495, "loss": 6.1099, "mean_token_accuracy": 0.12568870037794114, "num_tokens": 4445230.0, "step": 2410 }, { "entropy": 6.192966461181641, "epoch": 0.2028985507246377, "grad_norm": 1.1328125, "learning_rate": 0.0004998406365979649, "loss": 6.1712, "mean_token_accuracy": 0.12947247475385665, "num_tokens": 4454251.0, "step": 2415 }, { "entropy": 6.0738544940948485, "epoch": 0.20331863053980256, "grad_norm": 1.0390625, "learning_rate": 0.0004998395076999443, "loss": 6.0246, "mean_token_accuracy": 0.1331735722720623, "num_tokens": 4463949.0, "step": 2420 }, { "entropy": 6.164913845062256, "epoch": 0.20373871035496743, "grad_norm": 1.109375, "learning_rate": 0.0004998383748190076, "loss": 6.2178, "mean_token_accuracy": 0.12642809972167016, "num_tokens": 4473373.0, "step": 2425 }, { "entropy": 6.169246625900269, "epoch": 0.20415879017013233, "grad_norm": 1.1015625, "learning_rate": 0.0004998372379551748, "loss": 6.0443, "mean_token_accuracy": 0.13512365892529488, "num_tokens": 4482303.0, "step": 2430 }, { "entropy": 6.000651454925537, "epoch": 0.2045788699852972, "grad_norm": 1.0234375, "learning_rate": 0.0004998360971084663, "loss": 6.0248, "mean_token_accuracy": 0.1257840245962143, "num_tokens": 4491214.0, "step": 2435 }, { "entropy": 6.060888242721558, "epoch": 0.2049989498004621, "grad_norm": 1.1015625, "learning_rate": 0.0004998349522789019, "loss": 5.9365, "mean_token_accuracy": 0.14086327105760574, "num_tokens": 4500099.0, "step": 2440 }, { "entropy": 6.020166492462158, "epoch": 0.20541902961562697, "grad_norm": 1.0703125, "learning_rate": 0.0004998338034665021, "loss": 6.0199, "mean_token_accuracy": 0.13966668471693994, "num_tokens": 4509893.0, "step": 2445 }, { "entropy": 6.064390420913696, "epoch": 0.20583910943079184, "grad_norm": 1.0703125, "learning_rate": 0.0004998326506712872, "loss": 5.9974, "mean_token_accuracy": 0.13378938734531404, "num_tokens": 4518606.0, "step": 2450 }, { "entropy": 6.097909021377563, "epoch": 0.20625918924595674, "grad_norm": 1.1171875, "learning_rate": 0.0004998314938932778, "loss": 6.0759, "mean_token_accuracy": 0.1298009656369686, "num_tokens": 4528392.0, "step": 2455 }, { "entropy": 6.1035826206207275, "epoch": 0.2066792690611216, "grad_norm": 1.125, "learning_rate": 0.0004998303331324943, "loss": 6.0416, "mean_token_accuracy": 0.13463694974780083, "num_tokens": 4536983.0, "step": 2460 }, { "entropy": 5.9858495712280275, "epoch": 0.2070993488762865, "grad_norm": 1.109375, "learning_rate": 0.0004998291683889571, "loss": 5.9442, "mean_token_accuracy": 0.13662122339010238, "num_tokens": 4544967.0, "step": 2465 }, { "entropy": 6.056029415130615, "epoch": 0.20751942869145137, "grad_norm": 1.1796875, "learning_rate": 0.000499827999662687, "loss": 6.0242, "mean_token_accuracy": 0.12964650020003318, "num_tokens": 4554646.0, "step": 2470 }, { "entropy": 6.118838214874268, "epoch": 0.20793950850661624, "grad_norm": 1.0390625, "learning_rate": 0.0004998268269537046, "loss": 6.0401, "mean_token_accuracy": 0.13539641574025155, "num_tokens": 4564040.0, "step": 2475 }, { "entropy": 6.022972631454468, "epoch": 0.20835958832178114, "grad_norm": 1.03125, "learning_rate": 0.0004998256502620308, "loss": 6.0624, "mean_token_accuracy": 0.13345976546406746, "num_tokens": 4573758.0, "step": 2480 }, { "entropy": 6.193491125106812, "epoch": 0.208779668136946, "grad_norm": 1.078125, "learning_rate": 0.0004998244695876864, "loss": 6.0874, "mean_token_accuracy": 0.13196430653333663, "num_tokens": 4582097.0, "step": 2485 }, { "entropy": 6.018001937866211, "epoch": 0.2091997479521109, "grad_norm": 1.15625, "learning_rate": 0.0004998232849306921, "loss": 6.064, "mean_token_accuracy": 0.1368905283510685, "num_tokens": 4590687.0, "step": 2490 }, { "entropy": 6.152202367782593, "epoch": 0.20961982776727578, "grad_norm": 1.1328125, "learning_rate": 0.0004998220962910693, "loss": 6.0475, "mean_token_accuracy": 0.12533890679478646, "num_tokens": 4599497.0, "step": 2495 }, { "entropy": 6.059301280975342, "epoch": 0.21003990758244068, "grad_norm": 1.140625, "learning_rate": 0.0004998209036688386, "loss": 6.0091, "mean_token_accuracy": 0.12979092076420784, "num_tokens": 4607958.0, "step": 2500 }, { "entropy": 6.12682089805603, "epoch": 0.21045998739760555, "grad_norm": 1.0703125, "learning_rate": 0.0004998197070640216, "loss": 6.1445, "mean_token_accuracy": 0.12323907017707825, "num_tokens": 4617515.0, "step": 2505 }, { "entropy": 6.13975419998169, "epoch": 0.21088006721277042, "grad_norm": 1.0546875, "learning_rate": 0.0004998185064766391, "loss": 6.028, "mean_token_accuracy": 0.13126113414764404, "num_tokens": 4627037.0, "step": 2510 }, { "entropy": 5.999127197265625, "epoch": 0.21130014702793531, "grad_norm": 1.0625, "learning_rate": 0.0004998173019067127, "loss": 6.0335, "mean_token_accuracy": 0.13387575298547744, "num_tokens": 4637393.0, "step": 2515 }, { "entropy": 6.049172449111938, "epoch": 0.21172022684310018, "grad_norm": 1.0703125, "learning_rate": 0.0004998160933542633, "loss": 6.0685, "mean_token_accuracy": 0.12128801420331001, "num_tokens": 4646832.0, "step": 2520 }, { "entropy": 6.16112699508667, "epoch": 0.21214030665826508, "grad_norm": 1.1953125, "learning_rate": 0.0004998148808193128, "loss": 6.095, "mean_token_accuracy": 0.1346332848072052, "num_tokens": 4655719.0, "step": 2525 }, { "entropy": 6.126083850860596, "epoch": 0.21256038647342995, "grad_norm": 1.0703125, "learning_rate": 0.0004998136643018823, "loss": 6.0477, "mean_token_accuracy": 0.12910717576742173, "num_tokens": 4665364.0, "step": 2530 }, { "entropy": 6.087383460998535, "epoch": 0.21298046628859482, "grad_norm": 1.1484375, "learning_rate": 0.0004998124438019935, "loss": 6.0166, "mean_token_accuracy": 0.1316668502986431, "num_tokens": 4674760.0, "step": 2535 }, { "entropy": 5.993421936035157, "epoch": 0.21340054610375972, "grad_norm": 1.0625, "learning_rate": 0.0004998112193196681, "loss": 5.9488, "mean_token_accuracy": 0.13391186147928238, "num_tokens": 4683900.0, "step": 2540 }, { "entropy": 5.969591331481934, "epoch": 0.2138206259189246, "grad_norm": 1.109375, "learning_rate": 0.0004998099908549277, "loss": 5.9886, "mean_token_accuracy": 0.1273488573729992, "num_tokens": 4693915.0, "step": 2545 }, { "entropy": 5.9875883102417, "epoch": 0.2142407057340895, "grad_norm": 1.078125, "learning_rate": 0.000499808758407794, "loss": 5.8619, "mean_token_accuracy": 0.13991126343607901, "num_tokens": 4703102.0, "step": 2550 }, { "entropy": 6.031775951385498, "epoch": 0.21466078554925436, "grad_norm": 1.0859375, "learning_rate": 0.0004998075219782889, "loss": 6.0787, "mean_token_accuracy": 0.1323968604207039, "num_tokens": 4712925.0, "step": 2555 }, { "entropy": 6.099209594726562, "epoch": 0.21508086536441923, "grad_norm": 1.1015625, "learning_rate": 0.0004998062815664344, "loss": 6.0069, "mean_token_accuracy": 0.12949655801057816, "num_tokens": 4722641.0, "step": 2560 }, { "entropy": 6.046544742584229, "epoch": 0.21550094517958412, "grad_norm": 1.09375, "learning_rate": 0.0004998050371722524, "loss": 6.0781, "mean_token_accuracy": 0.12990766763687134, "num_tokens": 4732603.0, "step": 2565 }, { "entropy": 5.932075929641724, "epoch": 0.215921024994749, "grad_norm": 1.0703125, "learning_rate": 0.0004998037887957649, "loss": 5.9211, "mean_token_accuracy": 0.13785294219851493, "num_tokens": 4742644.0, "step": 2570 }, { "entropy": 6.21406192779541, "epoch": 0.2163411048099139, "grad_norm": 1.0859375, "learning_rate": 0.0004998025364369939, "loss": 6.2335, "mean_token_accuracy": 0.1234040841460228, "num_tokens": 4751482.0, "step": 2575 }, { "entropy": 6.237205886840821, "epoch": 0.21676118462507876, "grad_norm": 1.140625, "learning_rate": 0.0004998012800959619, "loss": 6.0891, "mean_token_accuracy": 0.12757375389337539, "num_tokens": 4760593.0, "step": 2580 }, { "entropy": 6.093921661376953, "epoch": 0.21718126444024366, "grad_norm": 1.171875, "learning_rate": 0.0004998000197726909, "loss": 6.0827, "mean_token_accuracy": 0.13335589170455933, "num_tokens": 4769294.0, "step": 2585 }, { "entropy": 6.031546688079834, "epoch": 0.21760134425540853, "grad_norm": 0.98828125, "learning_rate": 0.0004997987554672033, "loss": 6.0081, "mean_token_accuracy": 0.13305121287703514, "num_tokens": 4779239.0, "step": 2590 }, { "entropy": 6.059205436706543, "epoch": 0.2180214240705734, "grad_norm": 1.09375, "learning_rate": 0.0004997974871795215, "loss": 6.0716, "mean_token_accuracy": 0.13057481795549392, "num_tokens": 4788211.0, "step": 2595 }, { "entropy": 6.109251928329468, "epoch": 0.2184415038857383, "grad_norm": 1.0625, "learning_rate": 0.000499796214909668, "loss": 6.0447, "mean_token_accuracy": 0.13531798869371414, "num_tokens": 4797921.0, "step": 2600 }, { "entropy": 6.092241191864014, "epoch": 0.21886158370090317, "grad_norm": 1.125, "learning_rate": 0.0004997949386576653, "loss": 6.0378, "mean_token_accuracy": 0.13213689997792244, "num_tokens": 4807772.0, "step": 2605 }, { "entropy": 6.042962265014649, "epoch": 0.21928166351606806, "grad_norm": 1.0390625, "learning_rate": 0.000499793658423536, "loss": 6.0593, "mean_token_accuracy": 0.13149860948324205, "num_tokens": 4817999.0, "step": 2610 }, { "entropy": 6.057756137847901, "epoch": 0.21970174333123293, "grad_norm": 1.1328125, "learning_rate": 0.0004997923742073028, "loss": 6.0136, "mean_token_accuracy": 0.13949006497859956, "num_tokens": 4826679.0, "step": 2615 }, { "entropy": 5.998235082626342, "epoch": 0.2201218231463978, "grad_norm": 1.15625, "learning_rate": 0.0004997910860089884, "loss": 6.0157, "mean_token_accuracy": 0.13456794619560242, "num_tokens": 4834998.0, "step": 2620 }, { "entropy": 6.064208889007569, "epoch": 0.2205419029615627, "grad_norm": 1.125, "learning_rate": 0.0004997897938286156, "loss": 5.9717, "mean_token_accuracy": 0.1337368108332157, "num_tokens": 4843635.0, "step": 2625 }, { "entropy": 6.085119295120239, "epoch": 0.22096198277672757, "grad_norm": 1.171875, "learning_rate": 0.0004997884976662075, "loss": 6.0919, "mean_token_accuracy": 0.12607687711715698, "num_tokens": 4852027.0, "step": 2630 }, { "entropy": 6.183318328857422, "epoch": 0.22138206259189247, "grad_norm": 1.1015625, "learning_rate": 0.0004997871975217868, "loss": 6.0165, "mean_token_accuracy": 0.1429324761033058, "num_tokens": 4861244.0, "step": 2635 }, { "entropy": 5.912706756591797, "epoch": 0.22180214240705734, "grad_norm": 1.078125, "learning_rate": 0.0004997858933953768, "loss": 5.9326, "mean_token_accuracy": 0.1404939979314804, "num_tokens": 4869902.0, "step": 2640 }, { "entropy": 5.963629674911499, "epoch": 0.2222222222222222, "grad_norm": 1.125, "learning_rate": 0.0004997845852870004, "loss": 5.8982, "mean_token_accuracy": 0.14085923954844476, "num_tokens": 4878502.0, "step": 2645 }, { "entropy": 5.986082458496094, "epoch": 0.2226423020373871, "grad_norm": 1.1875, "learning_rate": 0.0004997832731966806, "loss": 5.964, "mean_token_accuracy": 0.14047276899218558, "num_tokens": 4888348.0, "step": 2650 }, { "entropy": 6.051373815536499, "epoch": 0.22306238185255198, "grad_norm": 1.1171875, "learning_rate": 0.0004997819571244411, "loss": 6.0172, "mean_token_accuracy": 0.13845039829611777, "num_tokens": 4897302.0, "step": 2655 }, { "entropy": 6.01381549835205, "epoch": 0.22348246166771688, "grad_norm": 1.1015625, "learning_rate": 0.0004997806370703049, "loss": 6.0476, "mean_token_accuracy": 0.13289312049746513, "num_tokens": 4907078.0, "step": 2660 }, { "entropy": 5.983912467956543, "epoch": 0.22390254148288175, "grad_norm": 1.0234375, "learning_rate": 0.0004997793130342954, "loss": 5.8784, "mean_token_accuracy": 0.1382697917521, "num_tokens": 4917489.0, "step": 2665 }, { "entropy": 5.94772891998291, "epoch": 0.22432262129804661, "grad_norm": 1.09375, "learning_rate": 0.0004997779850164363, "loss": 5.9836, "mean_token_accuracy": 0.13369291126728058, "num_tokens": 4927073.0, "step": 2670 }, { "entropy": 6.121642923355102, "epoch": 0.2247427011132115, "grad_norm": 1.109375, "learning_rate": 0.0004997766530167508, "loss": 6.0821, "mean_token_accuracy": 0.1270790107548237, "num_tokens": 4935464.0, "step": 2675 }, { "entropy": 6.221409273147583, "epoch": 0.22516278092837638, "grad_norm": 1.1328125, "learning_rate": 0.0004997753170352627, "loss": 6.1649, "mean_token_accuracy": 0.12717002481222153, "num_tokens": 4944718.0, "step": 2680 }, { "entropy": 6.084948205947876, "epoch": 0.22558286074354128, "grad_norm": 1.1875, "learning_rate": 0.0004997739770719955, "loss": 6.0396, "mean_token_accuracy": 0.1332695096731186, "num_tokens": 4954223.0, "step": 2685 }, { "entropy": 6.003955984115601, "epoch": 0.22600294055870615, "grad_norm": 1.1015625, "learning_rate": 0.000499772633126973, "loss": 6.0733, "mean_token_accuracy": 0.1317312702536583, "num_tokens": 4963371.0, "step": 2690 }, { "entropy": 6.013844203948975, "epoch": 0.22642302037387105, "grad_norm": 1.15625, "learning_rate": 0.0004997712852002192, "loss": 5.9358, "mean_token_accuracy": 0.14093514010310174, "num_tokens": 4972973.0, "step": 2695 }, { "entropy": 6.059261226654053, "epoch": 0.22684310018903592, "grad_norm": 1.15625, "learning_rate": 0.0004997699332917578, "loss": 6.1739, "mean_token_accuracy": 0.12389883399009705, "num_tokens": 4982808.0, "step": 2700 }, { "entropy": 6.180717802047729, "epoch": 0.2272631800042008, "grad_norm": 1.109375, "learning_rate": 0.0004997685774016127, "loss": 6.0444, "mean_token_accuracy": 0.13330344706773758, "num_tokens": 4992427.0, "step": 2705 }, { "entropy": 6.1143828392028805, "epoch": 0.22768325981936569, "grad_norm": 0.96875, "learning_rate": 0.000499767217529808, "loss": 6.2262, "mean_token_accuracy": 0.12522902861237525, "num_tokens": 5003562.0, "step": 2710 }, { "entropy": 6.120408248901367, "epoch": 0.22810333963453056, "grad_norm": 1.015625, "learning_rate": 0.0004997658536763678, "loss": 5.9207, "mean_token_accuracy": 0.13713482916355133, "num_tokens": 5013429.0, "step": 2715 }, { "entropy": 6.080751562118531, "epoch": 0.22852341944969545, "grad_norm": 1.2265625, "learning_rate": 0.0004997644858413163, "loss": 6.046, "mean_token_accuracy": 0.13544052764773368, "num_tokens": 5022045.0, "step": 2720 }, { "entropy": 5.984566640853882, "epoch": 0.22894349926486032, "grad_norm": 1.03125, "learning_rate": 0.0004997631140246775, "loss": 5.8853, "mean_token_accuracy": 0.14113514721393586, "num_tokens": 5032260.0, "step": 2725 }, { "entropy": 5.9389331340789795, "epoch": 0.2293635790800252, "grad_norm": 1.0859375, "learning_rate": 0.000499761738226476, "loss": 5.9276, "mean_token_accuracy": 0.13583676218986512, "num_tokens": 5041688.0, "step": 2730 }, { "entropy": 6.007482099533081, "epoch": 0.2297836588951901, "grad_norm": 1.1640625, "learning_rate": 0.000499760358446736, "loss": 6.0417, "mean_token_accuracy": 0.1291549324989319, "num_tokens": 5051005.0, "step": 2735 }, { "entropy": 6.1208288192749025, "epoch": 0.23020373871035496, "grad_norm": 1.15625, "learning_rate": 0.000499758974685482, "loss": 5.9698, "mean_token_accuracy": 0.13492617905139923, "num_tokens": 5060084.0, "step": 2740 }, { "entropy": 6.010481119155884, "epoch": 0.23062381852551986, "grad_norm": 1.34375, "learning_rate": 0.0004997575869427385, "loss": 5.9731, "mean_token_accuracy": 0.14254927188158034, "num_tokens": 5069081.0, "step": 2745 }, { "entropy": 6.021266603469849, "epoch": 0.23104389834068473, "grad_norm": 1.1484375, "learning_rate": 0.00049975619521853, "loss": 5.9703, "mean_token_accuracy": 0.13409337997436524, "num_tokens": 5078597.0, "step": 2750 }, { "entropy": 5.943169069290161, "epoch": 0.2314639781558496, "grad_norm": 1.0546875, "learning_rate": 0.0004997547995128814, "loss": 6.0084, "mean_token_accuracy": 0.13727526888251304, "num_tokens": 5087607.0, "step": 2755 }, { "entropy": 6.111000204086304, "epoch": 0.2318840579710145, "grad_norm": 1.140625, "learning_rate": 0.0004997533998258171, "loss": 6.0123, "mean_token_accuracy": 0.1351937808096409, "num_tokens": 5097412.0, "step": 2760 }, { "entropy": 6.129235696792603, "epoch": 0.23230413778617937, "grad_norm": 1.1640625, "learning_rate": 0.0004997519961573622, "loss": 6.0735, "mean_token_accuracy": 0.1282409645617008, "num_tokens": 5105817.0, "step": 2765 }, { "entropy": 6.1673665046691895, "epoch": 0.23272421760134426, "grad_norm": 1.1640625, "learning_rate": 0.0004997505885075414, "loss": 6.1269, "mean_token_accuracy": 0.12907201573252677, "num_tokens": 5114958.0, "step": 2770 }, { "entropy": 6.069322109222412, "epoch": 0.23314429741650913, "grad_norm": 1.109375, "learning_rate": 0.0004997491768763795, "loss": 6.0425, "mean_token_accuracy": 0.13409897387027742, "num_tokens": 5123728.0, "step": 2775 }, { "entropy": 6.003434944152832, "epoch": 0.23356437723167403, "grad_norm": 1.1328125, "learning_rate": 0.0004997477612639018, "loss": 6.0871, "mean_token_accuracy": 0.12734304070472718, "num_tokens": 5134099.0, "step": 2780 }, { "entropy": 6.186435317993164, "epoch": 0.2339844570468389, "grad_norm": 1.171875, "learning_rate": 0.0004997463416701332, "loss": 6.094, "mean_token_accuracy": 0.1274227410554886, "num_tokens": 5142934.0, "step": 2785 }, { "entropy": 6.043578577041626, "epoch": 0.23440453686200377, "grad_norm": 1.171875, "learning_rate": 0.0004997449180950989, "loss": 5.9298, "mean_token_accuracy": 0.1532392293214798, "num_tokens": 5151835.0, "step": 2790 }, { "entropy": 5.953121995925903, "epoch": 0.23482461667716867, "grad_norm": 1.1796875, "learning_rate": 0.0004997434905388241, "loss": 5.9842, "mean_token_accuracy": 0.1413706734776497, "num_tokens": 5161136.0, "step": 2795 }, { "entropy": 6.0334107875823975, "epoch": 0.23524469649233354, "grad_norm": 1.09375, "learning_rate": 0.000499742059001334, "loss": 5.9191, "mean_token_accuracy": 0.1378956101834774, "num_tokens": 5170741.0, "step": 2800 }, { "entropy": 5.991379880905152, "epoch": 0.23566477630749844, "grad_norm": 1.203125, "learning_rate": 0.0004997406234826541, "loss": 5.9539, "mean_token_accuracy": 0.14059103950858115, "num_tokens": 5180549.0, "step": 2805 }, { "entropy": 5.995284509658814, "epoch": 0.2360848561226633, "grad_norm": 1.0703125, "learning_rate": 0.0004997391839828098, "loss": 5.9249, "mean_token_accuracy": 0.14390118718147277, "num_tokens": 5189486.0, "step": 2810 }, { "entropy": 6.030531978607177, "epoch": 0.23650493593782818, "grad_norm": 1.1640625, "learning_rate": 0.0004997377405018266, "loss": 6.0032, "mean_token_accuracy": 0.13120983093976973, "num_tokens": 5198525.0, "step": 2815 }, { "entropy": 6.0725666046142575, "epoch": 0.23692501575299307, "grad_norm": 1.1328125, "learning_rate": 0.00049973629303973, "loss": 6.0662, "mean_token_accuracy": 0.1294946141541004, "num_tokens": 5207124.0, "step": 2820 }, { "entropy": 5.958557415008545, "epoch": 0.23734509556815794, "grad_norm": 1.1015625, "learning_rate": 0.0004997348415965457, "loss": 5.878, "mean_token_accuracy": 0.13335178643465043, "num_tokens": 5216529.0, "step": 2825 }, { "entropy": 6.007561159133911, "epoch": 0.23776517538332284, "grad_norm": 1.203125, "learning_rate": 0.0004997333861722995, "loss": 6.0169, "mean_token_accuracy": 0.13635273203253745, "num_tokens": 5225796.0, "step": 2830 }, { "entropy": 6.125902462005615, "epoch": 0.2381852551984877, "grad_norm": 1.203125, "learning_rate": 0.000499731926767017, "loss": 6.0359, "mean_token_accuracy": 0.1375264048576355, "num_tokens": 5233876.0, "step": 2835 }, { "entropy": 5.989985036849975, "epoch": 0.23860533501365258, "grad_norm": 1.1015625, "learning_rate": 0.0004997304633807242, "loss": 6.0396, "mean_token_accuracy": 0.12682786211371422, "num_tokens": 5244782.0, "step": 2840 }, { "entropy": 6.019674825668335, "epoch": 0.23902541482881748, "grad_norm": 1.1875, "learning_rate": 0.0004997289960134468, "loss": 5.9886, "mean_token_accuracy": 0.13695719763636588, "num_tokens": 5253453.0, "step": 2845 }, { "entropy": 6.0026778221130375, "epoch": 0.23944549464398235, "grad_norm": 1.1796875, "learning_rate": 0.0004997275246652111, "loss": 6.0149, "mean_token_accuracy": 0.13926383331418038, "num_tokens": 5262355.0, "step": 2850 }, { "entropy": 5.99656400680542, "epoch": 0.23986557445914725, "grad_norm": 1.125, "learning_rate": 0.000499726049336043, "loss": 5.9374, "mean_token_accuracy": 0.13838583379983901, "num_tokens": 5271959.0, "step": 2855 }, { "entropy": 6.058608770370483, "epoch": 0.24028565427431212, "grad_norm": 1.125, "learning_rate": 0.0004997245700259686, "loss": 5.9673, "mean_token_accuracy": 0.1403045229613781, "num_tokens": 5281393.0, "step": 2860 }, { "entropy": 6.061829471588135, "epoch": 0.240705734089477, "grad_norm": 1.109375, "learning_rate": 0.0004997230867350141, "loss": 6.0878, "mean_token_accuracy": 0.1320396728813648, "num_tokens": 5290979.0, "step": 2865 }, { "entropy": 6.128190040588379, "epoch": 0.24112581390464188, "grad_norm": 1.1171875, "learning_rate": 0.0004997215994632059, "loss": 6.0392, "mean_token_accuracy": 0.13521442338824272, "num_tokens": 5300263.0, "step": 2870 }, { "entropy": 6.065250301361084, "epoch": 0.24154589371980675, "grad_norm": 1.0859375, "learning_rate": 0.0004997201082105704, "loss": 6.0654, "mean_token_accuracy": 0.12793515026569366, "num_tokens": 5309522.0, "step": 2875 }, { "entropy": 6.059223175048828, "epoch": 0.24196597353497165, "grad_norm": 1.1484375, "learning_rate": 0.0004997186129771338, "loss": 6.0625, "mean_token_accuracy": 0.13326726630330085, "num_tokens": 5319770.0, "step": 2880 }, { "entropy": 6.18207311630249, "epoch": 0.24238605335013652, "grad_norm": 1.1484375, "learning_rate": 0.0004997171137629226, "loss": 6.0695, "mean_token_accuracy": 0.13562847971916198, "num_tokens": 5328400.0, "step": 2885 }, { "entropy": 5.968668270111084, "epoch": 0.24280613316530142, "grad_norm": 1.1953125, "learning_rate": 0.0004997156105679636, "loss": 5.8716, "mean_token_accuracy": 0.14514228701591492, "num_tokens": 5336338.0, "step": 2890 }, { "entropy": 5.89683952331543, "epoch": 0.2432262129804663, "grad_norm": 1.1640625, "learning_rate": 0.0004997141033922832, "loss": 5.9748, "mean_token_accuracy": 0.1309155747294426, "num_tokens": 5345391.0, "step": 2895 }, { "entropy": 6.103964805603027, "epoch": 0.24364629279563116, "grad_norm": 1.109375, "learning_rate": 0.0004997125922359081, "loss": 6.0044, "mean_token_accuracy": 0.12651756703853606, "num_tokens": 5354709.0, "step": 2900 }, { "entropy": 6.039173555374146, "epoch": 0.24406637261079606, "grad_norm": 1.109375, "learning_rate": 0.0004997110770988652, "loss": 5.9187, "mean_token_accuracy": 0.13533097133040428, "num_tokens": 5363738.0, "step": 2905 }, { "entropy": 6.009365177154541, "epoch": 0.24448645242596093, "grad_norm": 1.34375, "learning_rate": 0.0004997095579811813, "loss": 6.0492, "mean_token_accuracy": 0.13356854170560836, "num_tokens": 5373583.0, "step": 2910 }, { "entropy": 6.10346941947937, "epoch": 0.24490653224112582, "grad_norm": 1.046875, "learning_rate": 0.0004997080348828833, "loss": 6.0964, "mean_token_accuracy": 0.1329493686556816, "num_tokens": 5383486.0, "step": 2915 }, { "entropy": 6.022554492950439, "epoch": 0.2453266120562907, "grad_norm": 1.1640625, "learning_rate": 0.0004997065078039981, "loss": 5.995, "mean_token_accuracy": 0.1254143126308918, "num_tokens": 5391974.0, "step": 2920 }, { "entropy": 6.089977025985718, "epoch": 0.24574669187145556, "grad_norm": 1.1875, "learning_rate": 0.0004997049767445529, "loss": 6.0288, "mean_token_accuracy": 0.12984034791588783, "num_tokens": 5400882.0, "step": 2925 }, { "entropy": 6.110510158538818, "epoch": 0.24616677168662046, "grad_norm": 1.1796875, "learning_rate": 0.0004997034417045746, "loss": 5.9927, "mean_token_accuracy": 0.1267140880227089, "num_tokens": 5410538.0, "step": 2930 }, { "entropy": 5.971307563781738, "epoch": 0.24658685150178533, "grad_norm": 1.15625, "learning_rate": 0.0004997019026840907, "loss": 5.8743, "mean_token_accuracy": 0.13612414821982383, "num_tokens": 5419406.0, "step": 2935 }, { "entropy": 5.88221755027771, "epoch": 0.24700693131695023, "grad_norm": 1.15625, "learning_rate": 0.0004997003596831282, "loss": 5.9978, "mean_token_accuracy": 0.13463943675160409, "num_tokens": 5428817.0, "step": 2940 }, { "entropy": 6.0984635829925535, "epoch": 0.2474270111321151, "grad_norm": 1.109375, "learning_rate": 0.0004996988127017145, "loss": 6.0253, "mean_token_accuracy": 0.13181837573647498, "num_tokens": 5438277.0, "step": 2945 }, { "entropy": 6.0544061183929445, "epoch": 0.24784709094728, "grad_norm": 1.2578125, "learning_rate": 0.0004996972617398772, "loss": 6.042, "mean_token_accuracy": 0.13205936923623085, "num_tokens": 5447440.0, "step": 2950 }, { "entropy": 6.0680958271026615, "epoch": 0.24826717076244487, "grad_norm": 1.0859375, "learning_rate": 0.0004996957067976435, "loss": 5.9541, "mean_token_accuracy": 0.1357963502407074, "num_tokens": 5455988.0, "step": 2955 }, { "entropy": 6.0058001518249515, "epoch": 0.24868725057760974, "grad_norm": 1.3203125, "learning_rate": 0.0004996941478750411, "loss": 5.9769, "mean_token_accuracy": 0.1373401865363121, "num_tokens": 5464996.0, "step": 2960 }, { "entropy": 6.083559465408325, "epoch": 0.24910733039277463, "grad_norm": 1.0234375, "learning_rate": 0.0004996925849720975, "loss": 6.1025, "mean_token_accuracy": 0.12863337025046348, "num_tokens": 5474174.0, "step": 2965 }, { "entropy": 6.146986627578736, "epoch": 0.2495274102079395, "grad_norm": 1.1875, "learning_rate": 0.0004996910180888405, "loss": 5.9994, "mean_token_accuracy": 0.13324794694781303, "num_tokens": 5482838.0, "step": 2970 }, { "entropy": 6.005090427398682, "epoch": 0.2499474900231044, "grad_norm": 1.0390625, "learning_rate": 0.0004996894472252977, "loss": 6.0195, "mean_token_accuracy": 0.13370491713285446, "num_tokens": 5491616.0, "step": 2975 }, { "entropy": 5.99453763961792, "epoch": 0.25036756983826924, "grad_norm": 1.09375, "learning_rate": 0.0004996878723814973, "loss": 5.9972, "mean_token_accuracy": 0.12933446019887923, "num_tokens": 5500942.0, "step": 2980 }, { "entropy": 6.035016107559204, "epoch": 0.25078764965343414, "grad_norm": 1.109375, "learning_rate": 0.0004996862935574667, "loss": 5.9539, "mean_token_accuracy": 0.13152176290750503, "num_tokens": 5510078.0, "step": 2985 }, { "entropy": 5.9494434833526615, "epoch": 0.25120772946859904, "grad_norm": 1.015625, "learning_rate": 0.0004996847107532342, "loss": 5.9763, "mean_token_accuracy": 0.13343006893992423, "num_tokens": 5518924.0, "step": 2990 }, { "entropy": 6.115957880020142, "epoch": 0.25162780928376394, "grad_norm": 1.09375, "learning_rate": 0.0004996831239688277, "loss": 5.9896, "mean_token_accuracy": 0.12950923070311546, "num_tokens": 5527385.0, "step": 2995 }, { "entropy": 5.96525821685791, "epoch": 0.2520478890989288, "grad_norm": 1.1015625, "learning_rate": 0.0004996815332042754, "loss": 5.8456, "mean_token_accuracy": 0.14307771176099776, "num_tokens": 5536781.0, "step": 3000 }, { "epoch": 0.2520478890989288, "eval_entropy": 5.826104599310177, "eval_loss": 6.01594352722168, "eval_mean_token_accuracy": 0.13980411247313787, "eval_num_tokens": 5536781.0, "eval_runtime": 27.3461, "eval_samples_per_second": 1366.412, "eval_steps_per_second": 170.811, "step": 3000 }, { "entropy": 6.008435201644898, "epoch": 0.2524679689140937, "grad_norm": 1.0703125, "learning_rate": 0.0004996799384596054, "loss": 6.0261, "mean_token_accuracy": 0.1376914620399475, "num_tokens": 5545893.0, "step": 3005 }, { "entropy": 6.02188720703125, "epoch": 0.2528880487292586, "grad_norm": 1.0390625, "learning_rate": 0.0004996783397348461, "loss": 5.9762, "mean_token_accuracy": 0.1329520359635353, "num_tokens": 5555818.0, "step": 3010 }, { "entropy": 6.045353794097901, "epoch": 0.2533081285444234, "grad_norm": 1.015625, "learning_rate": 0.0004996767370300256, "loss": 5.9502, "mean_token_accuracy": 0.13486573100090027, "num_tokens": 5565331.0, "step": 3015 }, { "entropy": 6.056732606887818, "epoch": 0.2537282083595883, "grad_norm": 1.1171875, "learning_rate": 0.0004996751303451724, "loss": 5.9577, "mean_token_accuracy": 0.13709068223834037, "num_tokens": 5574003.0, "step": 3020 }, { "entropy": 5.993344259262085, "epoch": 0.2541482881747532, "grad_norm": 1.0625, "learning_rate": 0.0004996735196803149, "loss": 5.8551, "mean_token_accuracy": 0.1428755633533001, "num_tokens": 5582517.0, "step": 3025 }, { "entropy": 5.977582693099976, "epoch": 0.2545683679899181, "grad_norm": 1.0703125, "learning_rate": 0.0004996719050354818, "loss": 6.0686, "mean_token_accuracy": 0.13471986055374147, "num_tokens": 5591952.0, "step": 3030 }, { "entropy": 6.0037376403808596, "epoch": 0.25498844780508295, "grad_norm": 1.0625, "learning_rate": 0.0004996702864107015, "loss": 5.9609, "mean_token_accuracy": 0.1396644115447998, "num_tokens": 5601460.0, "step": 3035 }, { "entropy": 6.176335668563842, "epoch": 0.25540852762024785, "grad_norm": 1.1328125, "learning_rate": 0.0004996686638060028, "loss": 6.0902, "mean_token_accuracy": 0.1306911051273346, "num_tokens": 5610776.0, "step": 3040 }, { "entropy": 5.970763540267944, "epoch": 0.25582860743541275, "grad_norm": 1.09375, "learning_rate": 0.0004996670372214144, "loss": 5.9871, "mean_token_accuracy": 0.13826777338981627, "num_tokens": 5619627.0, "step": 3045 }, { "entropy": 5.914526128768921, "epoch": 0.2562486872505776, "grad_norm": 1.109375, "learning_rate": 0.0004996654066569651, "loss": 5.8622, "mean_token_accuracy": 0.14179132953286172, "num_tokens": 5628969.0, "step": 3050 }, { "entropy": 5.981579828262329, "epoch": 0.2566687670657425, "grad_norm": 1.1484375, "learning_rate": 0.0004996637721126839, "loss": 5.9332, "mean_token_accuracy": 0.13520999103784562, "num_tokens": 5638629.0, "step": 3055 }, { "entropy": 6.005596733093261, "epoch": 0.2570888468809074, "grad_norm": 1.203125, "learning_rate": 0.0004996621335885996, "loss": 5.9991, "mean_token_accuracy": 0.13599340468645096, "num_tokens": 5647571.0, "step": 3060 }, { "entropy": 6.013420534133911, "epoch": 0.2575089266960722, "grad_norm": 1.390625, "learning_rate": 0.0004996604910847413, "loss": 5.916, "mean_token_accuracy": 0.14960622489452363, "num_tokens": 5656709.0, "step": 3065 }, { "entropy": 6.038319206237793, "epoch": 0.2579290065112371, "grad_norm": 1.109375, "learning_rate": 0.000499658844601138, "loss": 6.1017, "mean_token_accuracy": 0.13502436354756356, "num_tokens": 5665714.0, "step": 3070 }, { "entropy": 6.07736644744873, "epoch": 0.258349086326402, "grad_norm": 1.0, "learning_rate": 0.000499657194137819, "loss": 6.0546, "mean_token_accuracy": 0.13854038044810296, "num_tokens": 5675854.0, "step": 3075 }, { "entropy": 6.074629402160644, "epoch": 0.2587691661415669, "grad_norm": 1.1171875, "learning_rate": 0.0004996555396948136, "loss": 5.8721, "mean_token_accuracy": 0.13419756293296814, "num_tokens": 5685690.0, "step": 3080 }, { "entropy": 5.940470170974732, "epoch": 0.25918924595673176, "grad_norm": 1.0546875, "learning_rate": 0.0004996538812721509, "loss": 5.9341, "mean_token_accuracy": 0.14152218475937844, "num_tokens": 5695766.0, "step": 3085 }, { "entropy": 6.018071937561035, "epoch": 0.25960932577189666, "grad_norm": 1.3046875, "learning_rate": 0.0004996522188698603, "loss": 5.9909, "mean_token_accuracy": 0.13503170683979987, "num_tokens": 5704365.0, "step": 3090 }, { "entropy": 6.13015513420105, "epoch": 0.26002940558706156, "grad_norm": 1.265625, "learning_rate": 0.0004996505524879714, "loss": 6.0965, "mean_token_accuracy": 0.13045159131288528, "num_tokens": 5713345.0, "step": 3095 }, { "entropy": 6.053025817871093, "epoch": 0.2604494854022264, "grad_norm": 1.03125, "learning_rate": 0.0004996488821265137, "loss": 5.8921, "mean_token_accuracy": 0.14050639048218727, "num_tokens": 5722907.0, "step": 3100 }, { "entropy": 5.928135585784912, "epoch": 0.2608695652173913, "grad_norm": 1.1171875, "learning_rate": 0.0004996472077855166, "loss": 5.9387, "mean_token_accuracy": 0.13793488591909409, "num_tokens": 5731589.0, "step": 3105 }, { "entropy": 5.923902750015259, "epoch": 0.2612896450325562, "grad_norm": 1.09375, "learning_rate": 0.00049964552946501, "loss": 5.9237, "mean_token_accuracy": 0.1389499545097351, "num_tokens": 5739922.0, "step": 3110 }, { "entropy": 5.905591726303101, "epoch": 0.2617097248477211, "grad_norm": 1.109375, "learning_rate": 0.0004996438471650235, "loss": 5.8397, "mean_token_accuracy": 0.145526784658432, "num_tokens": 5749206.0, "step": 3115 }, { "entropy": 6.01796875, "epoch": 0.26212980466288593, "grad_norm": 1.109375, "learning_rate": 0.0004996421608855869, "loss": 5.8992, "mean_token_accuracy": 0.1419477328658104, "num_tokens": 5758803.0, "step": 3120 }, { "entropy": 5.962277746200561, "epoch": 0.26254988447805083, "grad_norm": 1.109375, "learning_rate": 0.0004996404706267301, "loss": 5.9991, "mean_token_accuracy": 0.1301351211965084, "num_tokens": 5768368.0, "step": 3125 }, { "entropy": 5.935734415054322, "epoch": 0.26296996429321573, "grad_norm": 1.203125, "learning_rate": 0.000499638776388483, "loss": 5.8424, "mean_token_accuracy": 0.14718177318572997, "num_tokens": 5776707.0, "step": 3130 }, { "entropy": 5.992966365814209, "epoch": 0.26339004410838057, "grad_norm": 1.1796875, "learning_rate": 0.0004996370781708757, "loss": 6.0208, "mean_token_accuracy": 0.13097626715898514, "num_tokens": 5787037.0, "step": 3135 }, { "entropy": 6.120069789886474, "epoch": 0.26381012392354547, "grad_norm": 1.375, "learning_rate": 0.0004996353759739382, "loss": 5.9819, "mean_token_accuracy": 0.140574112534523, "num_tokens": 5796630.0, "step": 3140 }, { "entropy": 5.9368353366851805, "epoch": 0.26423020373871037, "grad_norm": 1.171875, "learning_rate": 0.0004996336697977007, "loss": 5.978, "mean_token_accuracy": 0.13346768617630006, "num_tokens": 5806402.0, "step": 3145 }, { "entropy": 5.97723422050476, "epoch": 0.2646502835538752, "grad_norm": 1.2578125, "learning_rate": 0.0004996319596421933, "loss": 5.9278, "mean_token_accuracy": 0.13734676092863082, "num_tokens": 5815742.0, "step": 3150 }, { "entropy": 5.945355033874511, "epoch": 0.2650703633690401, "grad_norm": 1.09375, "learning_rate": 0.0004996302455074466, "loss": 5.9322, "mean_token_accuracy": 0.1382609039545059, "num_tokens": 5824915.0, "step": 3155 }, { "entropy": 6.0514014720916744, "epoch": 0.265490443184205, "grad_norm": 1.2265625, "learning_rate": 0.0004996285273934906, "loss": 5.9852, "mean_token_accuracy": 0.13715496361255647, "num_tokens": 5834978.0, "step": 3160 }, { "entropy": 6.052202987670898, "epoch": 0.2659105229993699, "grad_norm": 1.09375, "learning_rate": 0.000499626805300356, "loss": 6.1228, "mean_token_accuracy": 0.1326017878949642, "num_tokens": 5845684.0, "step": 3165 }, { "entropy": 6.146022653579712, "epoch": 0.26633060281453474, "grad_norm": 1.1640625, "learning_rate": 0.0004996250792280732, "loss": 5.9964, "mean_token_accuracy": 0.13485243916511536, "num_tokens": 5854905.0, "step": 3170 }, { "entropy": 6.040951061248779, "epoch": 0.26675068262969964, "grad_norm": 1.2265625, "learning_rate": 0.0004996233491766727, "loss": 6.0164, "mean_token_accuracy": 0.1350037656724453, "num_tokens": 5863654.0, "step": 3175 }, { "entropy": 6.058253955841065, "epoch": 0.26717076244486454, "grad_norm": 1.1796875, "learning_rate": 0.0004996216151461854, "loss": 6.0152, "mean_token_accuracy": 0.13996267989277839, "num_tokens": 5872442.0, "step": 3180 }, { "entropy": 6.012804937362671, "epoch": 0.2675908422600294, "grad_norm": 1.140625, "learning_rate": 0.0004996198771366417, "loss": 5.9378, "mean_token_accuracy": 0.13716716319322586, "num_tokens": 5882372.0, "step": 3185 }, { "entropy": 5.8219091415405275, "epoch": 0.2680109220751943, "grad_norm": 1.1015625, "learning_rate": 0.0004996181351480726, "loss": 5.7487, "mean_token_accuracy": 0.14560527056455613, "num_tokens": 5891113.0, "step": 3190 }, { "entropy": 5.941916608810425, "epoch": 0.2684310018903592, "grad_norm": 1.109375, "learning_rate": 0.0004996163891805089, "loss": 5.9892, "mean_token_accuracy": 0.14109294563531877, "num_tokens": 5899582.0, "step": 3195 }, { "entropy": 6.037355852127075, "epoch": 0.2688510817055241, "grad_norm": 1.0546875, "learning_rate": 0.0004996146392339815, "loss": 5.9353, "mean_token_accuracy": 0.1392637461423874, "num_tokens": 5908938.0, "step": 3200 }, { "entropy": 5.9513650894165036, "epoch": 0.2692711615206889, "grad_norm": 1.125, "learning_rate": 0.0004996128853085215, "loss": 5.9041, "mean_token_accuracy": 0.13895752876996995, "num_tokens": 5918055.0, "step": 3205 }, { "entropy": 5.997664451599121, "epoch": 0.2696912413358538, "grad_norm": 1.0625, "learning_rate": 0.0004996111274041598, "loss": 5.8986, "mean_token_accuracy": 0.13369553461670874, "num_tokens": 5926744.0, "step": 3210 }, { "entropy": 5.959716939926148, "epoch": 0.2701113211510187, "grad_norm": 1.0390625, "learning_rate": 0.0004996093655209277, "loss": 5.9958, "mean_token_accuracy": 0.1349453993141651, "num_tokens": 5936521.0, "step": 3215 }, { "entropy": 6.088764905929565, "epoch": 0.27053140096618356, "grad_norm": 1.1953125, "learning_rate": 0.0004996075996588563, "loss": 6.0616, "mean_token_accuracy": 0.13318859413266182, "num_tokens": 5945010.0, "step": 3220 }, { "entropy": 6.052014112472534, "epoch": 0.27095148078134845, "grad_norm": 1.1171875, "learning_rate": 0.000499605829817977, "loss": 5.9638, "mean_token_accuracy": 0.14223103746771812, "num_tokens": 5953766.0, "step": 3225 }, { "entropy": 5.979779624938965, "epoch": 0.27137156059651335, "grad_norm": 1.0859375, "learning_rate": 0.000499604055998321, "loss": 5.875, "mean_token_accuracy": 0.13957174718379975, "num_tokens": 5962168.0, "step": 3230 }, { "entropy": 5.906911420822143, "epoch": 0.2717916404116782, "grad_norm": 1.09375, "learning_rate": 0.0004996022781999198, "loss": 5.9063, "mean_token_accuracy": 0.13852998465299607, "num_tokens": 5971627.0, "step": 3235 }, { "entropy": 5.9631248950958256, "epoch": 0.2722117202268431, "grad_norm": 1.125, "learning_rate": 0.000499600496422805, "loss": 5.9925, "mean_token_accuracy": 0.13308593779802322, "num_tokens": 5981775.0, "step": 3240 }, { "entropy": 5.993693208694458, "epoch": 0.272631800042008, "grad_norm": 1.15625, "learning_rate": 0.000499598710667008, "loss": 5.9061, "mean_token_accuracy": 0.1379516489803791, "num_tokens": 5991097.0, "step": 3245 }, { "entropy": 5.984791469573975, "epoch": 0.2730518798571729, "grad_norm": 1.2265625, "learning_rate": 0.0004995969209325604, "loss": 5.9693, "mean_token_accuracy": 0.13060558065772057, "num_tokens": 5999517.0, "step": 3250 }, { "entropy": 5.930228567123413, "epoch": 0.2734719596723377, "grad_norm": 1.1953125, "learning_rate": 0.0004995951272194941, "loss": 5.9479, "mean_token_accuracy": 0.12969653084874153, "num_tokens": 6008545.0, "step": 3255 }, { "entropy": 6.119350004196167, "epoch": 0.2738920394875026, "grad_norm": 1.203125, "learning_rate": 0.0004995933295278407, "loss": 5.9365, "mean_token_accuracy": 0.1350548431277275, "num_tokens": 6017366.0, "step": 3260 }, { "entropy": 5.9179764747619625, "epoch": 0.2743121193026675, "grad_norm": 1.1875, "learning_rate": 0.0004995915278576321, "loss": 5.8875, "mean_token_accuracy": 0.14413413256406785, "num_tokens": 6025597.0, "step": 3265 }, { "entropy": 5.981735897064209, "epoch": 0.27473219911783237, "grad_norm": 1.0703125, "learning_rate": 0.0004995897222089004, "loss": 5.9867, "mean_token_accuracy": 0.13929954469203948, "num_tokens": 6034239.0, "step": 3270 }, { "entropy": 6.11962890625, "epoch": 0.27515227893299726, "grad_norm": 1.1171875, "learning_rate": 0.0004995879125816772, "loss": 6.0068, "mean_token_accuracy": 0.13686064183712005, "num_tokens": 6043837.0, "step": 3275 }, { "entropy": 5.9640697002410885, "epoch": 0.27557235874816216, "grad_norm": 1.0234375, "learning_rate": 0.0004995860989759949, "loss": 5.956, "mean_token_accuracy": 0.1416999839246273, "num_tokens": 6053217.0, "step": 3280 }, { "entropy": 6.0521222114562985, "epoch": 0.27599243856332706, "grad_norm": 1.1171875, "learning_rate": 0.0004995842813918855, "loss": 5.9551, "mean_token_accuracy": 0.13722361102700234, "num_tokens": 6061553.0, "step": 3285 }, { "entropy": 5.9697545051574705, "epoch": 0.2764125183784919, "grad_norm": 1.1640625, "learning_rate": 0.0004995824598293812, "loss": 5.8601, "mean_token_accuracy": 0.14069184213876723, "num_tokens": 6070080.0, "step": 3290 }, { "entropy": 5.995730686187744, "epoch": 0.2768325981936568, "grad_norm": 1.0703125, "learning_rate": 0.0004995806342885142, "loss": 5.9852, "mean_token_accuracy": 0.14142092764377595, "num_tokens": 6078438.0, "step": 3295 }, { "entropy": 6.019344282150269, "epoch": 0.2772526780088217, "grad_norm": 1.1484375, "learning_rate": 0.000499578804769317, "loss": 5.9771, "mean_token_accuracy": 0.13406604304909706, "num_tokens": 6087794.0, "step": 3300 }, { "entropy": 6.085688066482544, "epoch": 0.27767275782398654, "grad_norm": 1.1015625, "learning_rate": 0.0004995769712718218, "loss": 6.0065, "mean_token_accuracy": 0.13597604855895043, "num_tokens": 6096709.0, "step": 3305 }, { "entropy": 5.9711473941802975, "epoch": 0.27809283763915144, "grad_norm": 1.109375, "learning_rate": 0.0004995751337960613, "loss": 5.9269, "mean_token_accuracy": 0.13786234930157662, "num_tokens": 6105866.0, "step": 3310 }, { "entropy": 6.074538946151733, "epoch": 0.27851291745431633, "grad_norm": 1.078125, "learning_rate": 0.0004995732923420679, "loss": 5.8813, "mean_token_accuracy": 0.13884977921843528, "num_tokens": 6114882.0, "step": 3315 }, { "entropy": 5.857705545425415, "epoch": 0.2789329972694812, "grad_norm": 1.1953125, "learning_rate": 0.0004995714469098743, "loss": 5.8412, "mean_token_accuracy": 0.13618046417832375, "num_tokens": 6123978.0, "step": 3320 }, { "entropy": 5.886438226699829, "epoch": 0.2793530770846461, "grad_norm": 1.171875, "learning_rate": 0.000499569597499513, "loss": 5.9946, "mean_token_accuracy": 0.1375075623393059, "num_tokens": 6133246.0, "step": 3325 }, { "entropy": 5.993762636184693, "epoch": 0.27977315689981097, "grad_norm": 1.0390625, "learning_rate": 0.0004995677441110172, "loss": 5.8559, "mean_token_accuracy": 0.14045721143484116, "num_tokens": 6142865.0, "step": 3330 }, { "entropy": 6.025714874267578, "epoch": 0.28019323671497587, "grad_norm": 1.0625, "learning_rate": 0.0004995658867444192, "loss": 5.9512, "mean_token_accuracy": 0.13522876128554345, "num_tokens": 6152492.0, "step": 3335 }, { "entropy": 5.981087923049927, "epoch": 0.2806133165301407, "grad_norm": 1.1484375, "learning_rate": 0.0004995640253997523, "loss": 5.959, "mean_token_accuracy": 0.1329936422407627, "num_tokens": 6161953.0, "step": 3340 }, { "entropy": 5.841523504257202, "epoch": 0.2810333963453056, "grad_norm": 1.0625, "learning_rate": 0.0004995621600770492, "loss": 5.8129, "mean_token_accuracy": 0.1412846788764, "num_tokens": 6171467.0, "step": 3345 }, { "entropy": 5.90531325340271, "epoch": 0.2814534761604705, "grad_norm": 1.015625, "learning_rate": 0.0004995602907763431, "loss": 5.8859, "mean_token_accuracy": 0.13736898675560952, "num_tokens": 6180646.0, "step": 3350 }, { "entropy": 5.981820106506348, "epoch": 0.28187355597563535, "grad_norm": 1.28125, "learning_rate": 0.0004995584174976672, "loss": 5.9116, "mean_token_accuracy": 0.13150710314512254, "num_tokens": 6189832.0, "step": 3355 }, { "entropy": 5.980225324630737, "epoch": 0.28229363579080025, "grad_norm": 1.09375, "learning_rate": 0.0004995565402410544, "loss": 5.7994, "mean_token_accuracy": 0.14472294151782988, "num_tokens": 6198339.0, "step": 3360 }, { "entropy": 5.924914312362671, "epoch": 0.28271371560596514, "grad_norm": 1.25, "learning_rate": 0.0004995546590065383, "loss": 5.8935, "mean_token_accuracy": 0.1394026793539524, "num_tokens": 6207564.0, "step": 3365 }, { "entropy": 5.931164789199829, "epoch": 0.28313379542113004, "grad_norm": 1.1875, "learning_rate": 0.0004995527737941518, "loss": 5.9781, "mean_token_accuracy": 0.13914698138833045, "num_tokens": 6216056.0, "step": 3370 }, { "entropy": 5.968091154098511, "epoch": 0.2835538752362949, "grad_norm": 1.1015625, "learning_rate": 0.0004995508846039287, "loss": 5.9114, "mean_token_accuracy": 0.13818917274475098, "num_tokens": 6225573.0, "step": 3375 }, { "entropy": 6.069493198394776, "epoch": 0.2839739550514598, "grad_norm": 1.125, "learning_rate": 0.0004995489914359023, "loss": 6.0417, "mean_token_accuracy": 0.13078732788562775, "num_tokens": 6235057.0, "step": 3380 }, { "entropy": 6.030756092071533, "epoch": 0.2843940348666247, "grad_norm": 1.2734375, "learning_rate": 0.0004995470942901061, "loss": 5.9557, "mean_token_accuracy": 0.13645285964012147, "num_tokens": 6244164.0, "step": 3385 }, { "entropy": 6.068174362182617, "epoch": 0.2848141146817895, "grad_norm": 1.265625, "learning_rate": 0.0004995451931665738, "loss": 5.9588, "mean_token_accuracy": 0.13424528315663337, "num_tokens": 6253095.0, "step": 3390 }, { "entropy": 5.918725109100341, "epoch": 0.2852341944969544, "grad_norm": 1.21875, "learning_rate": 0.000499543288065339, "loss": 5.9038, "mean_token_accuracy": 0.13533290028572081, "num_tokens": 6261134.0, "step": 3395 }, { "entropy": 5.926444101333618, "epoch": 0.2856542743121193, "grad_norm": 1.3125, "learning_rate": 0.0004995413789864354, "loss": 5.9066, "mean_token_accuracy": 0.1413659855723381, "num_tokens": 6270384.0, "step": 3400 }, { "entropy": 5.974505090713501, "epoch": 0.28607435412728416, "grad_norm": 1.078125, "learning_rate": 0.0004995394659298971, "loss": 5.842, "mean_token_accuracy": 0.14783402383327485, "num_tokens": 6279702.0, "step": 3405 }, { "entropy": 5.924916839599609, "epoch": 0.28649443394244906, "grad_norm": 1.1796875, "learning_rate": 0.0004995375488957576, "loss": 5.8871, "mean_token_accuracy": 0.1403558671474457, "num_tokens": 6288297.0, "step": 3410 }, { "entropy": 5.979348230361938, "epoch": 0.28691451375761395, "grad_norm": 1.1328125, "learning_rate": 0.000499535627884051, "loss": 5.983, "mean_token_accuracy": 0.12937102988362312, "num_tokens": 6297288.0, "step": 3415 }, { "entropy": 6.12882170677185, "epoch": 0.28733459357277885, "grad_norm": 1.1171875, "learning_rate": 0.0004995337028948115, "loss": 6.0094, "mean_token_accuracy": 0.13142260611057283, "num_tokens": 6306719.0, "step": 3420 }, { "entropy": 5.93622145652771, "epoch": 0.2877546733879437, "grad_norm": 1.1328125, "learning_rate": 0.0004995317739280731, "loss": 5.8256, "mean_token_accuracy": 0.14748729318380355, "num_tokens": 6316639.0, "step": 3425 }, { "entropy": 5.951609373092651, "epoch": 0.2881747532031086, "grad_norm": 1.125, "learning_rate": 0.0004995298409838699, "loss": 5.9555, "mean_token_accuracy": 0.1391440898180008, "num_tokens": 6326879.0, "step": 3430 }, { "entropy": 5.9383097171783445, "epoch": 0.2885948330182735, "grad_norm": 1.140625, "learning_rate": 0.000499527904062236, "loss": 5.8671, "mean_token_accuracy": 0.139659284055233, "num_tokens": 6335729.0, "step": 3435 }, { "entropy": 5.971969127655029, "epoch": 0.28901491283343833, "grad_norm": 1.1328125, "learning_rate": 0.0004995259631632061, "loss": 5.9185, "mean_token_accuracy": 0.1310904636979103, "num_tokens": 6345154.0, "step": 3440 }, { "entropy": 5.977327823638916, "epoch": 0.28943499264860323, "grad_norm": 1.09375, "learning_rate": 0.0004995240182868143, "loss": 5.8858, "mean_token_accuracy": 0.14063168689608574, "num_tokens": 6354309.0, "step": 3445 }, { "entropy": 5.8834575653076175, "epoch": 0.2898550724637681, "grad_norm": 1.0625, "learning_rate": 0.0004995220694330951, "loss": 5.8586, "mean_token_accuracy": 0.14082162082195282, "num_tokens": 6363389.0, "step": 3450 }, { "entropy": 5.92822527885437, "epoch": 0.290275152278933, "grad_norm": 1.1015625, "learning_rate": 0.0004995201166020832, "loss": 5.9065, "mean_token_accuracy": 0.13562884032726288, "num_tokens": 6372475.0, "step": 3455 }, { "entropy": 6.024522161483764, "epoch": 0.29069523209409787, "grad_norm": 1.1953125, "learning_rate": 0.000499518159793813, "loss": 5.8677, "mean_token_accuracy": 0.14305904358625413, "num_tokens": 6380906.0, "step": 3460 }, { "entropy": 5.884508085250855, "epoch": 0.29111531190926276, "grad_norm": 1.125, "learning_rate": 0.000499516199008319, "loss": 5.8659, "mean_token_accuracy": 0.14293192625045775, "num_tokens": 6390085.0, "step": 3465 }, { "entropy": 6.008301162719727, "epoch": 0.29153539172442766, "grad_norm": 1.203125, "learning_rate": 0.0004995142342456364, "loss": 5.9391, "mean_token_accuracy": 0.13623592853546143, "num_tokens": 6399441.0, "step": 3470 }, { "entropy": 6.066584539413452, "epoch": 0.2919554715395925, "grad_norm": 1.1640625, "learning_rate": 0.0004995122655057997, "loss": 6.0208, "mean_token_accuracy": 0.13953343629837037, "num_tokens": 6408995.0, "step": 3475 }, { "entropy": 5.888063764572143, "epoch": 0.2923755513547574, "grad_norm": 1.1171875, "learning_rate": 0.0004995102927888437, "loss": 5.7722, "mean_token_accuracy": 0.1459358014166355, "num_tokens": 6418080.0, "step": 3480 }, { "entropy": 5.952468156814575, "epoch": 0.2927956311699223, "grad_norm": 1.2421875, "learning_rate": 0.0004995083160948036, "loss": 5.9318, "mean_token_accuracy": 0.14023924767971038, "num_tokens": 6426732.0, "step": 3485 }, { "entropy": 5.971553039550781, "epoch": 0.29321571098508714, "grad_norm": 1.15625, "learning_rate": 0.0004995063354237141, "loss": 5.9538, "mean_token_accuracy": 0.14043337404727935, "num_tokens": 6435957.0, "step": 3490 }, { "entropy": 5.94589900970459, "epoch": 0.29363579080025204, "grad_norm": 1.3671875, "learning_rate": 0.0004995043507756107, "loss": 5.9069, "mean_token_accuracy": 0.133124540746212, "num_tokens": 6445642.0, "step": 3495 }, { "entropy": 5.974902820587158, "epoch": 0.29405587061541694, "grad_norm": 1.3203125, "learning_rate": 0.0004995023621505282, "loss": 5.9363, "mean_token_accuracy": 0.1418766610324383, "num_tokens": 6454664.0, "step": 3500 }, { "entropy": 5.940143728256226, "epoch": 0.29447595043058183, "grad_norm": 1.265625, "learning_rate": 0.000499500369548502, "loss": 5.8583, "mean_token_accuracy": 0.1379205584526062, "num_tokens": 6463224.0, "step": 3505 }, { "entropy": 6.120481824874878, "epoch": 0.2948960302457467, "grad_norm": 1.09375, "learning_rate": 0.0004994983729695674, "loss": 6.0926, "mean_token_accuracy": 0.1296972803771496, "num_tokens": 6473112.0, "step": 3510 }, { "entropy": 5.980841064453125, "epoch": 0.2953161100609116, "grad_norm": 1.4453125, "learning_rate": 0.0004994963724137595, "loss": 5.9214, "mean_token_accuracy": 0.1389226034283638, "num_tokens": 6482062.0, "step": 3515 }, { "entropy": 5.932737588882446, "epoch": 0.29573618987607647, "grad_norm": 1.5390625, "learning_rate": 0.0004994943678811142, "loss": 5.9004, "mean_token_accuracy": 0.13374803215265274, "num_tokens": 6490568.0, "step": 3520 }, { "entropy": 5.997820091247559, "epoch": 0.2961562696912413, "grad_norm": 1.1015625, "learning_rate": 0.0004994923593716667, "loss": 5.963, "mean_token_accuracy": 0.14052257165312768, "num_tokens": 6500815.0, "step": 3525 }, { "entropy": 5.916243839263916, "epoch": 0.2965763495064062, "grad_norm": 1.1484375, "learning_rate": 0.0004994903468854527, "loss": 5.8376, "mean_token_accuracy": 0.14926647543907165, "num_tokens": 6509529.0, "step": 3530 }, { "entropy": 5.922206735610962, "epoch": 0.2969964293215711, "grad_norm": 1.1171875, "learning_rate": 0.0004994883304225077, "loss": 5.8937, "mean_token_accuracy": 0.13852014467120172, "num_tokens": 6517934.0, "step": 3535 }, { "entropy": 5.9876025199890135, "epoch": 0.297416509136736, "grad_norm": 1.1328125, "learning_rate": 0.0004994863099828675, "loss": 5.8695, "mean_token_accuracy": 0.14087166935205458, "num_tokens": 6526098.0, "step": 3540 }, { "entropy": 5.935700082778931, "epoch": 0.29783658895190085, "grad_norm": 1.1328125, "learning_rate": 0.000499484285566568, "loss": 5.906, "mean_token_accuracy": 0.13566448390483857, "num_tokens": 6535831.0, "step": 3545 }, { "entropy": 5.939550399780273, "epoch": 0.29825666876706575, "grad_norm": 1.125, "learning_rate": 0.0004994822571736449, "loss": 5.8255, "mean_token_accuracy": 0.13489115089178086, "num_tokens": 6545704.0, "step": 3550 }, { "entropy": 5.947116851806641, "epoch": 0.29867674858223064, "grad_norm": 1.1953125, "learning_rate": 0.0004994802248041342, "loss": 5.8548, "mean_token_accuracy": 0.14142827019095422, "num_tokens": 6554423.0, "step": 3555 }, { "entropy": 5.969081258773803, "epoch": 0.2990968283973955, "grad_norm": 1.2109375, "learning_rate": 0.000499478188458072, "loss": 5.9073, "mean_token_accuracy": 0.13533755540847778, "num_tokens": 6563989.0, "step": 3560 }, { "entropy": 5.9689305305480955, "epoch": 0.2995169082125604, "grad_norm": 1.296875, "learning_rate": 0.0004994761481354943, "loss": 6.0328, "mean_token_accuracy": 0.13800237625837325, "num_tokens": 6572745.0, "step": 3565 }, { "entropy": 6.133339929580688, "epoch": 0.2999369880277253, "grad_norm": 1.1875, "learning_rate": 0.0004994741038364371, "loss": 6.0333, "mean_token_accuracy": 0.13616435453295708, "num_tokens": 6581723.0, "step": 3570 }, { "entropy": 5.896167135238647, "epoch": 0.3003570678428901, "grad_norm": 1.2421875, "learning_rate": 0.0004994720555609369, "loss": 5.7604, "mean_token_accuracy": 0.1434899814426899, "num_tokens": 6590342.0, "step": 3575 }, { "entropy": 5.878182983398437, "epoch": 0.300777147658055, "grad_norm": 1.3125, "learning_rate": 0.0004994700033090297, "loss": 5.8344, "mean_token_accuracy": 0.14836035221815108, "num_tokens": 6599206.0, "step": 3580 }, { "entropy": 6.036917591094971, "epoch": 0.3011972274732199, "grad_norm": 1.2421875, "learning_rate": 0.000499467947080752, "loss": 6.1289, "mean_token_accuracy": 0.13054108917713164, "num_tokens": 6608947.0, "step": 3585 }, { "entropy": 6.017320966720581, "epoch": 0.3016173072883848, "grad_norm": 1.2421875, "learning_rate": 0.0004994658868761402, "loss": 5.9128, "mean_token_accuracy": 0.14748418629169463, "num_tokens": 6618378.0, "step": 3590 }, { "entropy": 5.987727546691895, "epoch": 0.30203738710354966, "grad_norm": 1.1953125, "learning_rate": 0.0004994638226952307, "loss": 5.9681, "mean_token_accuracy": 0.13054394274950026, "num_tokens": 6627527.0, "step": 3595 }, { "entropy": 5.996758890151978, "epoch": 0.30245746691871456, "grad_norm": 1.2734375, "learning_rate": 0.0004994617545380604, "loss": 5.8919, "mean_token_accuracy": 0.13826094195246696, "num_tokens": 6636964.0, "step": 3600 }, { "entropy": 5.905787420272827, "epoch": 0.30287754673387945, "grad_norm": 1.3046875, "learning_rate": 0.0004994596824046656, "loss": 5.8569, "mean_token_accuracy": 0.141887067258358, "num_tokens": 6646074.0, "step": 3605 }, { "entropy": 5.99219708442688, "epoch": 0.3032976265490443, "grad_norm": 1.3359375, "learning_rate": 0.000499457606295083, "loss": 5.9311, "mean_token_accuracy": 0.13836071118712426, "num_tokens": 6655027.0, "step": 3610 }, { "entropy": 5.7845015048980715, "epoch": 0.3037177063642092, "grad_norm": 1.453125, "learning_rate": 0.0004994555262093495, "loss": 5.713, "mean_token_accuracy": 0.15755455046892167, "num_tokens": 6663747.0, "step": 3615 }, { "entropy": 6.036468362808227, "epoch": 0.3041377861793741, "grad_norm": 1.234375, "learning_rate": 0.000499453442147502, "loss": 6.0392, "mean_token_accuracy": 0.13115543723106385, "num_tokens": 6672922.0, "step": 3620 }, { "entropy": 5.979010963439942, "epoch": 0.304557865994539, "grad_norm": 1.234375, "learning_rate": 0.0004994513541095773, "loss": 5.8654, "mean_token_accuracy": 0.14586904942989348, "num_tokens": 6682233.0, "step": 3625 }, { "entropy": 5.928103733062744, "epoch": 0.30497794580970383, "grad_norm": 1.15625, "learning_rate": 0.0004994492620956126, "loss": 5.9125, "mean_token_accuracy": 0.14258120208978653, "num_tokens": 6691593.0, "step": 3630 }, { "entropy": 5.953917217254639, "epoch": 0.30539802562486873, "grad_norm": 1.0703125, "learning_rate": 0.0004994471661056445, "loss": 5.9125, "mean_token_accuracy": 0.14142323583364486, "num_tokens": 6701318.0, "step": 3635 }, { "entropy": 5.986124277114868, "epoch": 0.3058181054400336, "grad_norm": 1.078125, "learning_rate": 0.0004994450661397106, "loss": 5.9176, "mean_token_accuracy": 0.14466760009527208, "num_tokens": 6710059.0, "step": 3640 }, { "entropy": 6.110535717010498, "epoch": 0.30623818525519847, "grad_norm": 1.203125, "learning_rate": 0.000499442962197848, "loss": 6.0091, "mean_token_accuracy": 0.1349786825478077, "num_tokens": 6719811.0, "step": 3645 }, { "entropy": 5.885643482208252, "epoch": 0.30665826507036337, "grad_norm": 1.1171875, "learning_rate": 0.0004994408542800937, "loss": 5.8848, "mean_token_accuracy": 0.13900379538536073, "num_tokens": 6728789.0, "step": 3650 }, { "entropy": 5.929373550415039, "epoch": 0.30707834488552826, "grad_norm": 1.1796875, "learning_rate": 0.0004994387423864855, "loss": 5.8632, "mean_token_accuracy": 0.1396006353199482, "num_tokens": 6737706.0, "step": 3655 }, { "entropy": 5.928421974182129, "epoch": 0.3074984247006931, "grad_norm": 1.171875, "learning_rate": 0.0004994366265170603, "loss": 5.8269, "mean_token_accuracy": 0.1530800625681877, "num_tokens": 6746861.0, "step": 3660 }, { "entropy": 6.01959867477417, "epoch": 0.307918504515858, "grad_norm": 1.1953125, "learning_rate": 0.0004994345066718558, "loss": 6.0207, "mean_token_accuracy": 0.13322951793670654, "num_tokens": 6755242.0, "step": 3665 }, { "entropy": 6.026466798782349, "epoch": 0.3083385843310229, "grad_norm": 1.1171875, "learning_rate": 0.0004994323828509098, "loss": 5.954, "mean_token_accuracy": 0.13347591310739518, "num_tokens": 6764549.0, "step": 3670 }, { "entropy": 5.915293598175049, "epoch": 0.3087586641461878, "grad_norm": 1.359375, "learning_rate": 0.0004994302550542596, "loss": 5.9418, "mean_token_accuracy": 0.14316236823797227, "num_tokens": 6774123.0, "step": 3675 }, { "entropy": 5.850841808319092, "epoch": 0.30917874396135264, "grad_norm": 1.265625, "learning_rate": 0.000499428123281943, "loss": 5.7122, "mean_token_accuracy": 0.1474112629890442, "num_tokens": 6782922.0, "step": 3680 }, { "entropy": 5.9184730052948, "epoch": 0.30959882377651754, "grad_norm": 1.1953125, "learning_rate": 0.0004994259875339978, "loss": 5.9611, "mean_token_accuracy": 0.13746373876929283, "num_tokens": 6792042.0, "step": 3685 }, { "entropy": 6.05865330696106, "epoch": 0.31001890359168244, "grad_norm": 1.2109375, "learning_rate": 0.0004994238478104617, "loss": 5.9598, "mean_token_accuracy": 0.1366279661655426, "num_tokens": 6800994.0, "step": 3690 }, { "entropy": 5.93690128326416, "epoch": 0.3104389834068473, "grad_norm": 1.1015625, "learning_rate": 0.0004994217041113727, "loss": 5.8868, "mean_token_accuracy": 0.14316150173544884, "num_tokens": 6809938.0, "step": 3695 }, { "entropy": 6.014241790771484, "epoch": 0.3108590632220122, "grad_norm": 1.0703125, "learning_rate": 0.0004994195564367688, "loss": 6.0213, "mean_token_accuracy": 0.13116879239678383, "num_tokens": 6820289.0, "step": 3700 }, { "entropy": 6.002475690841675, "epoch": 0.3112791430371771, "grad_norm": 1.2265625, "learning_rate": 0.0004994174047866882, "loss": 5.8424, "mean_token_accuracy": 0.14203700423240662, "num_tokens": 6830068.0, "step": 3705 }, { "entropy": 5.788861274719238, "epoch": 0.3116992228523419, "grad_norm": 1.1328125, "learning_rate": 0.0004994152491611686, "loss": 5.8813, "mean_token_accuracy": 0.13960717990994453, "num_tokens": 6838591.0, "step": 3710 }, { "entropy": 5.89765567779541, "epoch": 0.3121193026675068, "grad_norm": 1.1171875, "learning_rate": 0.0004994130895602485, "loss": 5.8505, "mean_token_accuracy": 0.13729089125990868, "num_tokens": 6847796.0, "step": 3715 }, { "entropy": 6.010899591445923, "epoch": 0.3125393824826717, "grad_norm": 1.171875, "learning_rate": 0.000499410925983966, "loss": 5.941, "mean_token_accuracy": 0.13994767293334007, "num_tokens": 6856585.0, "step": 3720 }, { "entropy": 5.889919090270996, "epoch": 0.3129594622978366, "grad_norm": 1.3125, "learning_rate": 0.0004994087584323596, "loss": 5.8502, "mean_token_accuracy": 0.14524889141321182, "num_tokens": 6865757.0, "step": 3725 }, { "entropy": 5.9244975566864015, "epoch": 0.31337954211300145, "grad_norm": 1.21875, "learning_rate": 0.0004994065869054676, "loss": 5.9051, "mean_token_accuracy": 0.13346855491399764, "num_tokens": 6875371.0, "step": 3730 }, { "entropy": 5.990236139297485, "epoch": 0.31379962192816635, "grad_norm": 1.3046875, "learning_rate": 0.0004994044114033283, "loss": 5.9445, "mean_token_accuracy": 0.13406403809785844, "num_tokens": 6884050.0, "step": 3735 }, { "entropy": 6.023118162155152, "epoch": 0.31421970174333125, "grad_norm": 1.2265625, "learning_rate": 0.0004994022319259806, "loss": 5.9236, "mean_token_accuracy": 0.1428280971944332, "num_tokens": 6893079.0, "step": 3740 }, { "entropy": 5.977470397949219, "epoch": 0.3146397815584961, "grad_norm": 1.2109375, "learning_rate": 0.0004994000484734629, "loss": 6.0157, "mean_token_accuracy": 0.14197005555033684, "num_tokens": 6903100.0, "step": 3745 }, { "entropy": 5.968418455123901, "epoch": 0.315059861373661, "grad_norm": 1.1015625, "learning_rate": 0.0004993978610458137, "loss": 5.8564, "mean_token_accuracy": 0.1436561480164528, "num_tokens": 6912164.0, "step": 3750 }, { "entropy": 5.8913768291473385, "epoch": 0.3154799411888259, "grad_norm": 1.125, "learning_rate": 0.0004993956696430721, "loss": 5.8793, "mean_token_accuracy": 0.13736136257648468, "num_tokens": 6921183.0, "step": 3755 }, { "entropy": 6.017658281326294, "epoch": 0.3159000210039908, "grad_norm": 1.140625, "learning_rate": 0.0004993934742652768, "loss": 5.9616, "mean_token_accuracy": 0.1389385998249054, "num_tokens": 6931325.0, "step": 3760 }, { "entropy": 6.002210426330566, "epoch": 0.3163201008191556, "grad_norm": 1.21875, "learning_rate": 0.0004993912749124665, "loss": 5.8433, "mean_token_accuracy": 0.1487124353647232, "num_tokens": 6940234.0, "step": 3765 }, { "entropy": 5.929537010192871, "epoch": 0.3167401806343205, "grad_norm": 1.1484375, "learning_rate": 0.0004993890715846804, "loss": 5.9507, "mean_token_accuracy": 0.14044182747602463, "num_tokens": 6949067.0, "step": 3770 }, { "entropy": 5.998405647277832, "epoch": 0.3171602604494854, "grad_norm": 1.171875, "learning_rate": 0.0004993868642819574, "loss": 5.9194, "mean_token_accuracy": 0.13718469440937042, "num_tokens": 6959085.0, "step": 3775 }, { "entropy": 5.961022281646729, "epoch": 0.31758034026465026, "grad_norm": 1.1640625, "learning_rate": 0.0004993846530043367, "loss": 5.9451, "mean_token_accuracy": 0.13289572075009345, "num_tokens": 6967392.0, "step": 3780 }, { "entropy": 5.938811779022217, "epoch": 0.31800042007981516, "grad_norm": 1.1875, "learning_rate": 0.0004993824377518574, "loss": 5.8794, "mean_token_accuracy": 0.14492053985595704, "num_tokens": 6976369.0, "step": 3785 }, { "entropy": 6.007278203964233, "epoch": 0.31842049989498006, "grad_norm": 1.234375, "learning_rate": 0.0004993802185245587, "loss": 5.8979, "mean_token_accuracy": 0.14349642321467398, "num_tokens": 6985889.0, "step": 3790 }, { "entropy": 5.902310371398926, "epoch": 0.3188405797101449, "grad_norm": 1.265625, "learning_rate": 0.00049937799532248, "loss": 5.9155, "mean_token_accuracy": 0.13254671469330787, "num_tokens": 6995396.0, "step": 3795 }, { "entropy": 6.108139371871948, "epoch": 0.3192606595253098, "grad_norm": 1.125, "learning_rate": 0.0004993757681456607, "loss": 5.974, "mean_token_accuracy": 0.13683522641658782, "num_tokens": 7004666.0, "step": 3800 }, { "entropy": 5.993764448165893, "epoch": 0.3196807393404747, "grad_norm": 1.09375, "learning_rate": 0.0004993735369941401, "loss": 6.0094, "mean_token_accuracy": 0.13341464176774026, "num_tokens": 7014608.0, "step": 3805 }, { "entropy": 5.958604240417481, "epoch": 0.3201008191556396, "grad_norm": 1.0546875, "learning_rate": 0.0004993713018679579, "loss": 5.866, "mean_token_accuracy": 0.14026129618287086, "num_tokens": 7023671.0, "step": 3810 }, { "entropy": 5.995219898223877, "epoch": 0.32052089897080444, "grad_norm": 1.140625, "learning_rate": 0.0004993690627671536, "loss": 5.9253, "mean_token_accuracy": 0.13401568681001663, "num_tokens": 7033786.0, "step": 3815 }, { "entropy": 5.926336812973022, "epoch": 0.32094097878596933, "grad_norm": 1.28125, "learning_rate": 0.0004993668196917669, "loss": 5.8311, "mean_token_accuracy": 0.14573807418346404, "num_tokens": 7042162.0, "step": 3820 }, { "entropy": 5.96917757987976, "epoch": 0.32136105860113423, "grad_norm": 1.1875, "learning_rate": 0.0004993645726418375, "loss": 5.981, "mean_token_accuracy": 0.13832971975207328, "num_tokens": 7051903.0, "step": 3825 }, { "entropy": 5.879901790618897, "epoch": 0.3217811384162991, "grad_norm": 1.125, "learning_rate": 0.0004993623216174053, "loss": 5.8013, "mean_token_accuracy": 0.15186585038900374, "num_tokens": 7060229.0, "step": 3830 }, { "entropy": 5.918556547164917, "epoch": 0.32220121823146397, "grad_norm": 1.25, "learning_rate": 0.00049936006661851, "loss": 5.8909, "mean_token_accuracy": 0.13876768276095391, "num_tokens": 7069040.0, "step": 3835 }, { "entropy": 5.9392224788665775, "epoch": 0.32262129804662887, "grad_norm": 1.1875, "learning_rate": 0.0004993578076451917, "loss": 5.7726, "mean_token_accuracy": 0.14143876731395721, "num_tokens": 7078409.0, "step": 3840 }, { "entropy": 5.779048347473145, "epoch": 0.32304137786179377, "grad_norm": 1.3046875, "learning_rate": 0.0004993555446974903, "loss": 5.8733, "mean_token_accuracy": 0.13716461956501008, "num_tokens": 7087983.0, "step": 3845 }, { "entropy": 5.941289329528809, "epoch": 0.3234614576769586, "grad_norm": 1.21875, "learning_rate": 0.000499353277775446, "loss": 5.8228, "mean_token_accuracy": 0.14281788170337678, "num_tokens": 7097277.0, "step": 3850 }, { "entropy": 5.894749402999878, "epoch": 0.3238815374921235, "grad_norm": 1.2265625, "learning_rate": 0.0004993510068790989, "loss": 5.7164, "mean_token_accuracy": 0.15665216147899627, "num_tokens": 7105918.0, "step": 3855 }, { "entropy": 5.773345851898194, "epoch": 0.3243016173072884, "grad_norm": 1.1171875, "learning_rate": 0.0004993487320084892, "loss": 5.7838, "mean_token_accuracy": 0.15064965635538102, "num_tokens": 7115049.0, "step": 3860 }, { "entropy": 5.944450235366821, "epoch": 0.32472169712245325, "grad_norm": 1.1328125, "learning_rate": 0.0004993464531636573, "loss": 5.8883, "mean_token_accuracy": 0.13874924927949905, "num_tokens": 7124862.0, "step": 3865 }, { "entropy": 5.947724437713623, "epoch": 0.32514177693761814, "grad_norm": 1.15625, "learning_rate": 0.0004993441703446435, "loss": 5.7816, "mean_token_accuracy": 0.1445206731557846, "num_tokens": 7133280.0, "step": 3870 }, { "entropy": 6.020012712478637, "epoch": 0.32556185675278304, "grad_norm": 1.1015625, "learning_rate": 0.0004993418835514882, "loss": 5.9743, "mean_token_accuracy": 0.1368774726986885, "num_tokens": 7142446.0, "step": 3875 }, { "entropy": 5.944014692306519, "epoch": 0.3259819365679479, "grad_norm": 1.078125, "learning_rate": 0.0004993395927842321, "loss": 5.8824, "mean_token_accuracy": 0.1359010323882103, "num_tokens": 7152143.0, "step": 3880 }, { "entropy": 5.993379163742065, "epoch": 0.3264020163831128, "grad_norm": 1.203125, "learning_rate": 0.0004993372980429155, "loss": 5.9617, "mean_token_accuracy": 0.13282209262251854, "num_tokens": 7162046.0, "step": 3885 }, { "entropy": 5.989493370056152, "epoch": 0.3268220961982777, "grad_norm": 1.1171875, "learning_rate": 0.0004993349993275792, "loss": 5.8488, "mean_token_accuracy": 0.14026510193943978, "num_tokens": 7171557.0, "step": 3890 }, { "entropy": 5.754479789733887, "epoch": 0.3272421760134426, "grad_norm": 1.1640625, "learning_rate": 0.0004993326966382639, "loss": 5.7423, "mean_token_accuracy": 0.14871106296777725, "num_tokens": 7180927.0, "step": 3895 }, { "entropy": 5.8972282886505125, "epoch": 0.3276622558286074, "grad_norm": 1.2578125, "learning_rate": 0.0004993303899750104, "loss": 5.8311, "mean_token_accuracy": 0.1395234152674675, "num_tokens": 7189552.0, "step": 3900 }, { "entropy": 6.021924352645874, "epoch": 0.3280823356437723, "grad_norm": 1.4375, "learning_rate": 0.0004993280793378595, "loss": 5.8549, "mean_token_accuracy": 0.13788855373859404, "num_tokens": 7197857.0, "step": 3905 }, { "entropy": 5.914785861968994, "epoch": 0.3285024154589372, "grad_norm": 1.171875, "learning_rate": 0.0004993257647268522, "loss": 5.8281, "mean_token_accuracy": 0.14489276185631753, "num_tokens": 7206785.0, "step": 3910 }, { "entropy": 5.945201826095581, "epoch": 0.32892249527410206, "grad_norm": 1.1796875, "learning_rate": 0.0004993234461420295, "loss": 5.9003, "mean_token_accuracy": 0.1415283761918545, "num_tokens": 7216360.0, "step": 3915 }, { "entropy": 5.844962692260742, "epoch": 0.32934257508926695, "grad_norm": 1.265625, "learning_rate": 0.0004993211235834326, "loss": 5.7122, "mean_token_accuracy": 0.15939737260341644, "num_tokens": 7224890.0, "step": 3920 }, { "entropy": 5.77975697517395, "epoch": 0.32976265490443185, "grad_norm": 1.359375, "learning_rate": 0.0004993187970511023, "loss": 5.7707, "mean_token_accuracy": 0.16336829960346222, "num_tokens": 7234442.0, "step": 3925 }, { "entropy": 5.964393234252929, "epoch": 0.33018273471959675, "grad_norm": 1.3203125, "learning_rate": 0.0004993164665450801, "loss": 5.9279, "mean_token_accuracy": 0.1439814858138561, "num_tokens": 7244023.0, "step": 3930 }, { "entropy": 5.916021871566772, "epoch": 0.3306028145347616, "grad_norm": 1.1953125, "learning_rate": 0.0004993141320654072, "loss": 5.7793, "mean_token_accuracy": 0.14671456664800644, "num_tokens": 7253548.0, "step": 3935 }, { "entropy": 5.898174810409546, "epoch": 0.3310228943499265, "grad_norm": 1.2578125, "learning_rate": 0.000499311793612125, "loss": 5.8402, "mean_token_accuracy": 0.1421785496175289, "num_tokens": 7262962.0, "step": 3940 }, { "entropy": 5.964570426940918, "epoch": 0.3314429741650914, "grad_norm": 1.140625, "learning_rate": 0.0004993094511852748, "loss": 5.863, "mean_token_accuracy": 0.14184453189373017, "num_tokens": 7272234.0, "step": 3945 }, { "entropy": 5.929952716827392, "epoch": 0.33186305398025623, "grad_norm": 1.15625, "learning_rate": 0.0004993071047848983, "loss": 5.8493, "mean_token_accuracy": 0.1383821338415146, "num_tokens": 7281524.0, "step": 3950 }, { "entropy": 5.838898372650147, "epoch": 0.3322831337954211, "grad_norm": 1.34375, "learning_rate": 0.0004993047544110368, "loss": 5.7384, "mean_token_accuracy": 0.14712240919470787, "num_tokens": 7289601.0, "step": 3955 }, { "entropy": 5.791057062149048, "epoch": 0.332703213610586, "grad_norm": 1.3203125, "learning_rate": 0.0004993024000637321, "loss": 5.7137, "mean_token_accuracy": 0.15096415132284163, "num_tokens": 7298508.0, "step": 3960 }, { "entropy": 5.892502069473267, "epoch": 0.33312329342575087, "grad_norm": 1.1328125, "learning_rate": 0.0004993000417430259, "loss": 5.9339, "mean_token_accuracy": 0.1390118695795536, "num_tokens": 7309065.0, "step": 3965 }, { "entropy": 6.066646718978882, "epoch": 0.33354337324091576, "grad_norm": 1.125, "learning_rate": 0.00049929767944896, "loss": 5.953, "mean_token_accuracy": 0.1411003813147545, "num_tokens": 7319669.0, "step": 3970 }, { "entropy": 6.000399112701416, "epoch": 0.33396345305608066, "grad_norm": 1.2421875, "learning_rate": 0.0004992953131815761, "loss": 5.9022, "mean_token_accuracy": 0.1418354742228985, "num_tokens": 7328425.0, "step": 3975 }, { "entropy": 5.8749700546264645, "epoch": 0.33438353287124556, "grad_norm": 1.1875, "learning_rate": 0.0004992929429409164, "loss": 5.775, "mean_token_accuracy": 0.1469979852437973, "num_tokens": 7337369.0, "step": 3980 }, { "entropy": 5.913109064102173, "epoch": 0.3348036126864104, "grad_norm": 1.1640625, "learning_rate": 0.0004992905687270225, "loss": 5.8411, "mean_token_accuracy": 0.1466023862361908, "num_tokens": 7346829.0, "step": 3985 }, { "entropy": 5.973616456985473, "epoch": 0.3352236925015753, "grad_norm": 1.1875, "learning_rate": 0.0004992881905399368, "loss": 5.9044, "mean_token_accuracy": 0.14303565323352813, "num_tokens": 7355976.0, "step": 3990 }, { "entropy": 5.9362890243530275, "epoch": 0.3356437723167402, "grad_norm": 1.296875, "learning_rate": 0.0004992858083797013, "loss": 5.8555, "mean_token_accuracy": 0.13833607137203216, "num_tokens": 7365210.0, "step": 3995 }, { "entropy": 5.910732650756836, "epoch": 0.33606385213190504, "grad_norm": 1.2265625, "learning_rate": 0.0004992834222463581, "loss": 5.9097, "mean_token_accuracy": 0.13066598325967788, "num_tokens": 7374175.0, "step": 4000 }, { "entropy": 6.022627830505371, "epoch": 0.33648393194706994, "grad_norm": 1.1171875, "learning_rate": 0.0004992810321399496, "loss": 5.936, "mean_token_accuracy": 0.13869498372077943, "num_tokens": 7383302.0, "step": 4005 }, { "entropy": 6.006158876419067, "epoch": 0.33690401176223483, "grad_norm": 1.296875, "learning_rate": 0.0004992786380605182, "loss": 5.9162, "mean_token_accuracy": 0.13912810906767845, "num_tokens": 7392746.0, "step": 4010 }, { "entropy": 5.839102506637573, "epoch": 0.33732409157739973, "grad_norm": 1.1640625, "learning_rate": 0.0004992762400081062, "loss": 5.7562, "mean_token_accuracy": 0.1469271421432495, "num_tokens": 7401604.0, "step": 4015 }, { "entropy": 5.856449317932129, "epoch": 0.3377441713925646, "grad_norm": 1.15625, "learning_rate": 0.0004992738379827559, "loss": 5.8677, "mean_token_accuracy": 0.13804834261536597, "num_tokens": 7410594.0, "step": 4020 }, { "entropy": 5.922429132461548, "epoch": 0.33816425120772947, "grad_norm": 1.2421875, "learning_rate": 0.0004992714319845101, "loss": 5.7704, "mean_token_accuracy": 0.15343396067619325, "num_tokens": 7418831.0, "step": 4025 }, { "entropy": 5.8475088596344, "epoch": 0.33858433102289437, "grad_norm": 1.1015625, "learning_rate": 0.0004992690220134116, "loss": 5.8188, "mean_token_accuracy": 0.144370898604393, "num_tokens": 7427731.0, "step": 4030 }, { "entropy": 6.030502510070801, "epoch": 0.3390044108380592, "grad_norm": 1.203125, "learning_rate": 0.0004992666080695027, "loss": 5.9373, "mean_token_accuracy": 0.13586149737238884, "num_tokens": 7436447.0, "step": 4035 }, { "entropy": 5.901221179962159, "epoch": 0.3394244906532241, "grad_norm": 1.140625, "learning_rate": 0.0004992641901528262, "loss": 5.8156, "mean_token_accuracy": 0.14270046576857567, "num_tokens": 7445352.0, "step": 4040 }, { "entropy": 5.946398782730102, "epoch": 0.339844570468389, "grad_norm": 1.125, "learning_rate": 0.0004992617682634252, "loss": 5.8858, "mean_token_accuracy": 0.1441212549805641, "num_tokens": 7454298.0, "step": 4045 }, { "entropy": 5.920703315734864, "epoch": 0.34026465028355385, "grad_norm": 1.1328125, "learning_rate": 0.0004992593424013424, "loss": 5.8948, "mean_token_accuracy": 0.13869627565145493, "num_tokens": 7463543.0, "step": 4050 }, { "entropy": 5.9791840553283695, "epoch": 0.34068473009871875, "grad_norm": 1.1875, "learning_rate": 0.0004992569125666209, "loss": 5.9195, "mean_token_accuracy": 0.14178480133414267, "num_tokens": 7472701.0, "step": 4055 }, { "entropy": 6.054230260848999, "epoch": 0.34110480991388364, "grad_norm": 1.2265625, "learning_rate": 0.0004992544787593037, "loss": 5.9062, "mean_token_accuracy": 0.13785406127572059, "num_tokens": 7481123.0, "step": 4060 }, { "entropy": 5.989615964889526, "epoch": 0.34152488972904854, "grad_norm": 1.1953125, "learning_rate": 0.0004992520409794338, "loss": 5.9555, "mean_token_accuracy": 0.14264528974890708, "num_tokens": 7490439.0, "step": 4065 }, { "entropy": 5.894261217117309, "epoch": 0.3419449695442134, "grad_norm": 1.140625, "learning_rate": 0.0004992495992270544, "loss": 5.8444, "mean_token_accuracy": 0.1425054393708706, "num_tokens": 7499326.0, "step": 4070 }, { "entropy": 5.95070858001709, "epoch": 0.3423650493593783, "grad_norm": 1.265625, "learning_rate": 0.0004992471535022089, "loss": 5.8947, "mean_token_accuracy": 0.14209673926234245, "num_tokens": 7509407.0, "step": 4075 }, { "entropy": 5.978242111206055, "epoch": 0.3427851291745432, "grad_norm": 1.203125, "learning_rate": 0.0004992447038049405, "loss": 5.9368, "mean_token_accuracy": 0.1432798534631729, "num_tokens": 7518443.0, "step": 4080 }, { "entropy": 5.854420137405396, "epoch": 0.343205208989708, "grad_norm": 1.3125, "learning_rate": 0.0004992422501352927, "loss": 5.7979, "mean_token_accuracy": 0.15148040205240249, "num_tokens": 7527609.0, "step": 4085 }, { "entropy": 5.958763885498047, "epoch": 0.3436252888048729, "grad_norm": 1.2578125, "learning_rate": 0.0004992397924933089, "loss": 5.8829, "mean_token_accuracy": 0.14002160280942916, "num_tokens": 7536890.0, "step": 4090 }, { "entropy": 5.984218978881836, "epoch": 0.3440453686200378, "grad_norm": 1.2578125, "learning_rate": 0.0004992373308790325, "loss": 5.8445, "mean_token_accuracy": 0.14879057705402374, "num_tokens": 7546509.0, "step": 4095 }, { "entropy": 5.8121418952941895, "epoch": 0.3444654484352027, "grad_norm": 1.2578125, "learning_rate": 0.0004992348652925074, "loss": 5.8814, "mean_token_accuracy": 0.13877593278884887, "num_tokens": 7555336.0, "step": 4100 }, { "entropy": 5.959460878372193, "epoch": 0.34488552825036756, "grad_norm": 1.1875, "learning_rate": 0.0004992323957337771, "loss": 5.8217, "mean_token_accuracy": 0.14075680449604988, "num_tokens": 7565210.0, "step": 4105 }, { "entropy": 5.997728681564331, "epoch": 0.34530560806553245, "grad_norm": 1.125, "learning_rate": 0.0004992299222028855, "loss": 5.9177, "mean_token_accuracy": 0.14632946625351906, "num_tokens": 7574516.0, "step": 4110 }, { "entropy": 5.837478542327881, "epoch": 0.34572568788069735, "grad_norm": 1.1875, "learning_rate": 0.0004992274446998761, "loss": 5.7701, "mean_token_accuracy": 0.14613791555166245, "num_tokens": 7583219.0, "step": 4115 }, { "entropy": 5.990570783615112, "epoch": 0.3461457676958622, "grad_norm": 1.171875, "learning_rate": 0.0004992249632247929, "loss": 5.9898, "mean_token_accuracy": 0.13541294783353805, "num_tokens": 7592050.0, "step": 4120 }, { "entropy": 6.017976236343384, "epoch": 0.3465658475110271, "grad_norm": 1.171875, "learning_rate": 0.0004992224777776802, "loss": 5.8269, "mean_token_accuracy": 0.1406927302479744, "num_tokens": 7600718.0, "step": 4125 }, { "entropy": 5.928384780883789, "epoch": 0.346985927326192, "grad_norm": 1.1953125, "learning_rate": 0.0004992199883585816, "loss": 5.8623, "mean_token_accuracy": 0.14485160112380982, "num_tokens": 7609191.0, "step": 4130 }, { "entropy": 5.958423805236817, "epoch": 0.34740600714135683, "grad_norm": 1.1953125, "learning_rate": 0.0004992174949675413, "loss": 5.8819, "mean_token_accuracy": 0.14174177944660188, "num_tokens": 7618509.0, "step": 4135 }, { "entropy": 5.890047216415406, "epoch": 0.34782608695652173, "grad_norm": 1.1796875, "learning_rate": 0.0004992149976046037, "loss": 5.8117, "mean_token_accuracy": 0.14391598626971244, "num_tokens": 7627851.0, "step": 4140 }, { "entropy": 5.892529726028442, "epoch": 0.3482461667716866, "grad_norm": 1.1171875, "learning_rate": 0.0004992124962698128, "loss": 5.8894, "mean_token_accuracy": 0.13846235871315002, "num_tokens": 7636748.0, "step": 4145 }, { "entropy": 5.952128744125366, "epoch": 0.3486662465868515, "grad_norm": 1.203125, "learning_rate": 0.000499209990963213, "loss": 5.7996, "mean_token_accuracy": 0.14363356158137322, "num_tokens": 7645436.0, "step": 4150 }, { "entropy": 5.9340009689331055, "epoch": 0.34908632640201637, "grad_norm": 1.3125, "learning_rate": 0.0004992074816848487, "loss": 5.9287, "mean_token_accuracy": 0.13951508998870848, "num_tokens": 7655414.0, "step": 4155 }, { "entropy": 5.832207345962525, "epoch": 0.34950640621718126, "grad_norm": 1.203125, "learning_rate": 0.0004992049684347642, "loss": 5.7094, "mean_token_accuracy": 0.14780430346727372, "num_tokens": 7664295.0, "step": 4160 }, { "entropy": 5.929846525192261, "epoch": 0.34992648603234616, "grad_norm": 1.21875, "learning_rate": 0.0004992024512130042, "loss": 5.8569, "mean_token_accuracy": 0.14193690866231917, "num_tokens": 7673295.0, "step": 4165 }, { "entropy": 5.905185222625732, "epoch": 0.350346565847511, "grad_norm": 1.0546875, "learning_rate": 0.0004991999300196132, "loss": 5.8475, "mean_token_accuracy": 0.13919475451111793, "num_tokens": 7682932.0, "step": 4170 }, { "entropy": 6.005189561843872, "epoch": 0.3507666456626759, "grad_norm": 1.546875, "learning_rate": 0.0004991974048546359, "loss": 5.8699, "mean_token_accuracy": 0.13765867426991463, "num_tokens": 7692105.0, "step": 4175 }, { "entropy": 5.873351955413819, "epoch": 0.3511867254778408, "grad_norm": 1.171875, "learning_rate": 0.000499194875718117, "loss": 5.859, "mean_token_accuracy": 0.1459092453122139, "num_tokens": 7701294.0, "step": 4180 }, { "entropy": 5.976405239105224, "epoch": 0.3516068052930057, "grad_norm": 1.1015625, "learning_rate": 0.0004991923426101013, "loss": 5.8556, "mean_token_accuracy": 0.14097452014684678, "num_tokens": 7710964.0, "step": 4185 }, { "entropy": 5.988002777099609, "epoch": 0.35202688510817054, "grad_norm": 1.1328125, "learning_rate": 0.0004991898055306337, "loss": 5.9768, "mean_token_accuracy": 0.13131897300481796, "num_tokens": 7719938.0, "step": 4190 }, { "entropy": 5.942753410339355, "epoch": 0.35244696492333544, "grad_norm": 1.0390625, "learning_rate": 0.0004991872644797591, "loss": 5.8921, "mean_token_accuracy": 0.13939437940716742, "num_tokens": 7729129.0, "step": 4195 }, { "entropy": 5.955871152877807, "epoch": 0.35286704473850034, "grad_norm": 1.2421875, "learning_rate": 0.0004991847194575226, "loss": 5.8881, "mean_token_accuracy": 0.13834249898791312, "num_tokens": 7738506.0, "step": 4200 }, { "entropy": 6.041079711914063, "epoch": 0.3532871245536652, "grad_norm": 1.1171875, "learning_rate": 0.0004991821704639693, "loss": 5.9968, "mean_token_accuracy": 0.13867756947875023, "num_tokens": 7749320.0, "step": 4205 }, { "entropy": 6.0422234535217285, "epoch": 0.3537072043688301, "grad_norm": 1.140625, "learning_rate": 0.0004991796174991443, "loss": 5.8516, "mean_token_accuracy": 0.14419358000159263, "num_tokens": 7758735.0, "step": 4210 }, { "entropy": 5.810104942321777, "epoch": 0.354127284183995, "grad_norm": 2.09375, "learning_rate": 0.0004991770605630927, "loss": 5.8115, "mean_token_accuracy": 0.14199010655283928, "num_tokens": 7767556.0, "step": 4215 }, { "entropy": 5.862843370437622, "epoch": 0.3545473639991598, "grad_norm": 1.328125, "learning_rate": 0.0004991744996558599, "loss": 5.839, "mean_token_accuracy": 0.14548772126436232, "num_tokens": 7776615.0, "step": 4220 }, { "entropy": 5.955168771743774, "epoch": 0.3549674438143247, "grad_norm": 1.25, "learning_rate": 0.0004991719347774913, "loss": 5.8885, "mean_token_accuracy": 0.14509620741009713, "num_tokens": 7785288.0, "step": 4225 }, { "entropy": 5.897441482543945, "epoch": 0.3553875236294896, "grad_norm": 1.2578125, "learning_rate": 0.0004991693659280324, "loss": 5.7878, "mean_token_accuracy": 0.1456679493188858, "num_tokens": 7794381.0, "step": 4230 }, { "entropy": 5.895413112640381, "epoch": 0.3558076034446545, "grad_norm": 1.3359375, "learning_rate": 0.0004991667931075284, "loss": 5.7548, "mean_token_accuracy": 0.14165765419602394, "num_tokens": 7803265.0, "step": 4235 }, { "entropy": 5.8606267929077145, "epoch": 0.35622768325981935, "grad_norm": 1.15625, "learning_rate": 0.0004991642163160252, "loss": 5.8796, "mean_token_accuracy": 0.13830938637256623, "num_tokens": 7812445.0, "step": 4240 }, { "entropy": 5.941714191436768, "epoch": 0.35664776307498425, "grad_norm": 1.140625, "learning_rate": 0.0004991616355535684, "loss": 5.8695, "mean_token_accuracy": 0.1441208615899086, "num_tokens": 7822073.0, "step": 4245 }, { "entropy": 6.004122114181518, "epoch": 0.35706784289014915, "grad_norm": 1.265625, "learning_rate": 0.0004991590508202036, "loss": 5.8472, "mean_token_accuracy": 0.13856493979692458, "num_tokens": 7831193.0, "step": 4250 }, { "entropy": 5.952021503448487, "epoch": 0.357487922705314, "grad_norm": 1.21875, "learning_rate": 0.0004991564621159766, "loss": 5.8909, "mean_token_accuracy": 0.1399833530187607, "num_tokens": 7840311.0, "step": 4255 }, { "entropy": 5.902349615097046, "epoch": 0.3579080025204789, "grad_norm": 1.2109375, "learning_rate": 0.0004991538694409334, "loss": 5.8981, "mean_token_accuracy": 0.13640205860137938, "num_tokens": 7849622.0, "step": 4260 }, { "entropy": 5.93274884223938, "epoch": 0.3583280823356438, "grad_norm": 1.3515625, "learning_rate": 0.0004991512727951198, "loss": 5.8639, "mean_token_accuracy": 0.1423584371805191, "num_tokens": 7859494.0, "step": 4265 }, { "entropy": 6.066871976852417, "epoch": 0.3587481621508087, "grad_norm": 1.1171875, "learning_rate": 0.0004991486721785818, "loss": 5.9611, "mean_token_accuracy": 0.13798293545842172, "num_tokens": 7868526.0, "step": 4270 }, { "entropy": 5.916080617904663, "epoch": 0.3591682419659735, "grad_norm": 1.15625, "learning_rate": 0.0004991460675913655, "loss": 5.7946, "mean_token_accuracy": 0.1431095890700817, "num_tokens": 7877631.0, "step": 4275 }, { "entropy": 5.9288982391357425, "epoch": 0.3595883217811384, "grad_norm": 1.1796875, "learning_rate": 0.000499143459033517, "loss": 5.8525, "mean_token_accuracy": 0.14929330348968506, "num_tokens": 7886814.0, "step": 4280 }, { "entropy": 5.835088777542114, "epoch": 0.3600084015963033, "grad_norm": 1.328125, "learning_rate": 0.0004991408465050825, "loss": 5.6819, "mean_token_accuracy": 0.15145567432045937, "num_tokens": 7896337.0, "step": 4285 }, { "entropy": 5.841267919540405, "epoch": 0.36042848141146816, "grad_norm": 1.046875, "learning_rate": 0.0004991382300061084, "loss": 5.9429, "mean_token_accuracy": 0.13477055355906487, "num_tokens": 7906071.0, "step": 4290 }, { "entropy": 6.013036108016967, "epoch": 0.36084856122663306, "grad_norm": 1.046875, "learning_rate": 0.0004991356095366409, "loss": 5.9236, "mean_token_accuracy": 0.14087440073490143, "num_tokens": 7915003.0, "step": 4295 }, { "entropy": 5.964684629440308, "epoch": 0.36126864104179796, "grad_norm": 1.140625, "learning_rate": 0.0004991329850967266, "loss": 5.7748, "mean_token_accuracy": 0.14612130969762802, "num_tokens": 7924408.0, "step": 4300 }, { "entropy": 5.857362222671509, "epoch": 0.3616887208569628, "grad_norm": 1.0859375, "learning_rate": 0.0004991303566864118, "loss": 5.752, "mean_token_accuracy": 0.14585833102464676, "num_tokens": 7934717.0, "step": 4305 }, { "entropy": 5.800111103057861, "epoch": 0.3621088006721277, "grad_norm": 1.2421875, "learning_rate": 0.0004991277243057431, "loss": 5.8176, "mean_token_accuracy": 0.14245440661907197, "num_tokens": 7944278.0, "step": 4310 }, { "entropy": 5.853901958465576, "epoch": 0.3625288804872926, "grad_norm": 1.1640625, "learning_rate": 0.0004991250879547673, "loss": 5.8345, "mean_token_accuracy": 0.14364267513155937, "num_tokens": 7953344.0, "step": 4315 }, { "entropy": 5.9053857803344725, "epoch": 0.3629489603024575, "grad_norm": 1.1015625, "learning_rate": 0.0004991224476335309, "loss": 5.8601, "mean_token_accuracy": 0.1401130437850952, "num_tokens": 7962869.0, "step": 4320 }, { "entropy": 5.988316392898559, "epoch": 0.36336904011762233, "grad_norm": 1.2109375, "learning_rate": 0.0004991198033420807, "loss": 5.8527, "mean_token_accuracy": 0.14232899993658066, "num_tokens": 7971981.0, "step": 4325 }, { "entropy": 5.870962715148925, "epoch": 0.36378911993278723, "grad_norm": 1.1484375, "learning_rate": 0.0004991171550804636, "loss": 5.8073, "mean_token_accuracy": 0.139846058934927, "num_tokens": 7980979.0, "step": 4330 }, { "entropy": 5.898285436630249, "epoch": 0.36420919974795213, "grad_norm": 1.28125, "learning_rate": 0.0004991145028487266, "loss": 5.8963, "mean_token_accuracy": 0.14070027470588684, "num_tokens": 7989607.0, "step": 4335 }, { "entropy": 5.864823675155639, "epoch": 0.36462927956311697, "grad_norm": 1.140625, "learning_rate": 0.0004991118466469165, "loss": 5.713, "mean_token_accuracy": 0.14677212983369828, "num_tokens": 7998356.0, "step": 4340 }, { "entropy": 5.8904320240020756, "epoch": 0.36504935937828187, "grad_norm": 1.1875, "learning_rate": 0.0004991091864750805, "loss": 5.818, "mean_token_accuracy": 0.14362581819295883, "num_tokens": 8007596.0, "step": 4345 }, { "entropy": 5.893006706237793, "epoch": 0.36546943919344677, "grad_norm": 1.1640625, "learning_rate": 0.0004991065223332655, "loss": 5.8754, "mean_token_accuracy": 0.13881655633449555, "num_tokens": 8016493.0, "step": 4350 }, { "entropy": 5.957713174819946, "epoch": 0.36588951900861166, "grad_norm": 1.15625, "learning_rate": 0.0004991038542215191, "loss": 5.8451, "mean_token_accuracy": 0.1374589078128338, "num_tokens": 8025867.0, "step": 4355 }, { "entropy": 5.831826066970825, "epoch": 0.3663095988237765, "grad_norm": 1.1953125, "learning_rate": 0.0004991011821398882, "loss": 5.8861, "mean_token_accuracy": 0.1465972438454628, "num_tokens": 8036251.0, "step": 4360 }, { "entropy": 6.003261423110962, "epoch": 0.3667296786389414, "grad_norm": 1.21875, "learning_rate": 0.0004990985060884202, "loss": 5.8444, "mean_token_accuracy": 0.1452535480260849, "num_tokens": 8045647.0, "step": 4365 }, { "entropy": 5.943668365478516, "epoch": 0.3671497584541063, "grad_norm": 1.234375, "learning_rate": 0.0004990958260671627, "loss": 5.8987, "mean_token_accuracy": 0.13597789257764817, "num_tokens": 8056025.0, "step": 4370 }, { "entropy": 5.898333263397217, "epoch": 0.36756983826927114, "grad_norm": 1.4140625, "learning_rate": 0.0004990931420761629, "loss": 5.8364, "mean_token_accuracy": 0.14677493423223495, "num_tokens": 8065029.0, "step": 4375 }, { "entropy": 5.953028678894043, "epoch": 0.36798991808443604, "grad_norm": 1.3046875, "learning_rate": 0.0004990904541154685, "loss": 5.7841, "mean_token_accuracy": 0.15241612046957015, "num_tokens": 8073249.0, "step": 4380 }, { "entropy": 5.914327716827392, "epoch": 0.36840999789960094, "grad_norm": 1.2578125, "learning_rate": 0.0004990877621851271, "loss": 5.9274, "mean_token_accuracy": 0.13789283782243728, "num_tokens": 8082039.0, "step": 4385 }, { "entropy": 5.818746089935303, "epoch": 0.3688300777147658, "grad_norm": 1.3671875, "learning_rate": 0.0004990850662851863, "loss": 5.7546, "mean_token_accuracy": 0.14923306405544282, "num_tokens": 8090011.0, "step": 4390 }, { "entropy": 5.97280101776123, "epoch": 0.3692501575299307, "grad_norm": 1.265625, "learning_rate": 0.0004990823664156941, "loss": 5.8789, "mean_token_accuracy": 0.1489357531070709, "num_tokens": 8099934.0, "step": 4395 }, { "entropy": 5.970620107650757, "epoch": 0.3696702373450956, "grad_norm": 1.2578125, "learning_rate": 0.0004990796625766981, "loss": 5.8822, "mean_token_accuracy": 0.13866196647286416, "num_tokens": 8108969.0, "step": 4400 }, { "entropy": 5.857716226577759, "epoch": 0.3700903171602605, "grad_norm": 1.1796875, "learning_rate": 0.0004990769547682462, "loss": 5.798, "mean_token_accuracy": 0.14401047080755233, "num_tokens": 8117372.0, "step": 4405 }, { "entropy": 6.015813732147217, "epoch": 0.3705103969754253, "grad_norm": 1.3046875, "learning_rate": 0.0004990742429903866, "loss": 5.9812, "mean_token_accuracy": 0.13605612963438035, "num_tokens": 8127108.0, "step": 4410 }, { "entropy": 6.0110640048980715, "epoch": 0.3709304767905902, "grad_norm": 1.2734375, "learning_rate": 0.000499071527243167, "loss": 5.9774, "mean_token_accuracy": 0.13931988626718522, "num_tokens": 8137392.0, "step": 4415 }, { "entropy": 5.916806697845459, "epoch": 0.3713505566057551, "grad_norm": 1.28125, "learning_rate": 0.0004990688075266357, "loss": 5.8172, "mean_token_accuracy": 0.14630230888724327, "num_tokens": 8146257.0, "step": 4420 }, { "entropy": 5.90497236251831, "epoch": 0.37177063642091995, "grad_norm": 1.234375, "learning_rate": 0.0004990660838408409, "loss": 5.7894, "mean_token_accuracy": 0.14007715433835982, "num_tokens": 8154952.0, "step": 4425 }, { "entropy": 5.948085355758667, "epoch": 0.37219071623608485, "grad_norm": 1.1171875, "learning_rate": 0.0004990633561858308, "loss": 5.8263, "mean_token_accuracy": 0.14142653867602348, "num_tokens": 8164365.0, "step": 4430 }, { "entropy": 5.9057210922241214, "epoch": 0.37261079605124975, "grad_norm": 1.2421875, "learning_rate": 0.0004990606245616537, "loss": 5.8405, "mean_token_accuracy": 0.13960912972688674, "num_tokens": 8172614.0, "step": 4435 }, { "entropy": 6.0053239345550535, "epoch": 0.37303087586641465, "grad_norm": 1.265625, "learning_rate": 0.0004990578889683579, "loss": 5.8993, "mean_token_accuracy": 0.13672763109207153, "num_tokens": 8182445.0, "step": 4440 }, { "entropy": 5.912483501434326, "epoch": 0.3734509556815795, "grad_norm": 1.2421875, "learning_rate": 0.0004990551494059921, "loss": 5.7912, "mean_token_accuracy": 0.14882408380508422, "num_tokens": 8191871.0, "step": 4445 }, { "entropy": 5.91331787109375, "epoch": 0.3738710354967444, "grad_norm": 1.2578125, "learning_rate": 0.0004990524058746047, "loss": 5.9292, "mean_token_accuracy": 0.14731585383415222, "num_tokens": 8200658.0, "step": 4450 }, { "entropy": 5.922462463378906, "epoch": 0.3742911153119093, "grad_norm": 1.296875, "learning_rate": 0.0004990496583742443, "loss": 5.8609, "mean_token_accuracy": 0.13896840661764145, "num_tokens": 8209776.0, "step": 4455 }, { "entropy": 5.8580132007598875, "epoch": 0.3747111951270741, "grad_norm": 1.3984375, "learning_rate": 0.0004990469069049596, "loss": 5.7933, "mean_token_accuracy": 0.14876351952552797, "num_tokens": 8219401.0, "step": 4460 }, { "entropy": 5.9017116069793705, "epoch": 0.375131274942239, "grad_norm": 1.328125, "learning_rate": 0.0004990441514667993, "loss": 5.8399, "mean_token_accuracy": 0.1457892268896103, "num_tokens": 8228762.0, "step": 4465 }, { "entropy": 5.960052967071533, "epoch": 0.3755513547574039, "grad_norm": 1.2109375, "learning_rate": 0.0004990413920598121, "loss": 5.8364, "mean_token_accuracy": 0.1444413885474205, "num_tokens": 8236612.0, "step": 4470 }, { "entropy": 5.957969760894775, "epoch": 0.37597143457256876, "grad_norm": 1.296875, "learning_rate": 0.0004990386286840471, "loss": 5.8452, "mean_token_accuracy": 0.14290711134672165, "num_tokens": 8245043.0, "step": 4475 }, { "entropy": 6.0023870944976805, "epoch": 0.37639151438773366, "grad_norm": 1.2109375, "learning_rate": 0.0004990358613395532, "loss": 5.9381, "mean_token_accuracy": 0.13609616905450822, "num_tokens": 8255270.0, "step": 4480 }, { "entropy": 5.976658725738526, "epoch": 0.37681159420289856, "grad_norm": 1.203125, "learning_rate": 0.0004990330900263792, "loss": 5.896, "mean_token_accuracy": 0.13675653785467148, "num_tokens": 8264761.0, "step": 4485 }, { "entropy": 5.991942405700684, "epoch": 0.37723167401806346, "grad_norm": 1.1640625, "learning_rate": 0.0004990303147445745, "loss": 5.8568, "mean_token_accuracy": 0.14412947744131088, "num_tokens": 8274308.0, "step": 4490 }, { "entropy": 5.831737422943116, "epoch": 0.3776517538332283, "grad_norm": 1.265625, "learning_rate": 0.0004990275354941881, "loss": 5.751, "mean_token_accuracy": 0.15253113806247712, "num_tokens": 8283323.0, "step": 4495 }, { "entropy": 5.965500402450561, "epoch": 0.3780718336483932, "grad_norm": 1.2109375, "learning_rate": 0.0004990247522752694, "loss": 6.0719, "mean_token_accuracy": 0.12804851979017257, "num_tokens": 8293452.0, "step": 4500 }, { "entropy": 5.9973039627075195, "epoch": 0.3784919134635581, "grad_norm": 1.171875, "learning_rate": 0.0004990219650878674, "loss": 5.7459, "mean_token_accuracy": 0.14813876897096634, "num_tokens": 8302941.0, "step": 4505 }, { "entropy": 5.840318632125855, "epoch": 0.37891199327872294, "grad_norm": 1.5859375, "learning_rate": 0.0004990191739320318, "loss": 5.7706, "mean_token_accuracy": 0.15119873285293578, "num_tokens": 8311811.0, "step": 4510 }, { "entropy": 5.808368587493897, "epoch": 0.37933207309388783, "grad_norm": 1.1796875, "learning_rate": 0.0004990163788078117, "loss": 5.6889, "mean_token_accuracy": 0.1518329106271267, "num_tokens": 8321130.0, "step": 4515 }, { "entropy": 5.834763097763061, "epoch": 0.37975215290905273, "grad_norm": 1.21875, "learning_rate": 0.0004990135797152569, "loss": 5.7997, "mean_token_accuracy": 0.14402930140495301, "num_tokens": 8330233.0, "step": 4520 }, { "entropy": 5.881337881088257, "epoch": 0.3801722327242176, "grad_norm": 2.15625, "learning_rate": 0.0004990107766544169, "loss": 5.7852, "mean_token_accuracy": 0.144415046274662, "num_tokens": 8338585.0, "step": 4525 }, { "entropy": 5.83257737159729, "epoch": 0.38059231253938247, "grad_norm": 1.2109375, "learning_rate": 0.0004990079696253413, "loss": 5.8118, "mean_token_accuracy": 0.14888912737369536, "num_tokens": 8346618.0, "step": 4530 }, { "entropy": 5.908400917053223, "epoch": 0.38101239235454737, "grad_norm": 1.1484375, "learning_rate": 0.0004990051586280799, "loss": 5.7942, "mean_token_accuracy": 0.14552049711346626, "num_tokens": 8356273.0, "step": 4535 }, { "entropy": 5.918098402023316, "epoch": 0.38143247216971227, "grad_norm": 1.1796875, "learning_rate": 0.0004990023436626824, "loss": 5.7951, "mean_token_accuracy": 0.14602155163884162, "num_tokens": 8366668.0, "step": 4540 }, { "entropy": 5.982459354400635, "epoch": 0.3818525519848771, "grad_norm": 1.2734375, "learning_rate": 0.0004989995247291988, "loss": 5.9163, "mean_token_accuracy": 0.14120357036590575, "num_tokens": 8375610.0, "step": 4545 }, { "entropy": 5.895563316345215, "epoch": 0.382272631800042, "grad_norm": 1.15625, "learning_rate": 0.0004989967018276789, "loss": 5.774, "mean_token_accuracy": 0.15064741671085358, "num_tokens": 8384455.0, "step": 4550 }, { "entropy": 5.79692234992981, "epoch": 0.3826927116152069, "grad_norm": 1.171875, "learning_rate": 0.0004989938749581727, "loss": 5.8123, "mean_token_accuracy": 0.14297219812870027, "num_tokens": 8393868.0, "step": 4555 }, { "entropy": 5.923454284667969, "epoch": 0.38311279143037175, "grad_norm": 1.140625, "learning_rate": 0.0004989910441207305, "loss": 5.8328, "mean_token_accuracy": 0.1404195971786976, "num_tokens": 8402916.0, "step": 4560 }, { "entropy": 5.898684453964234, "epoch": 0.38353287124553664, "grad_norm": 1.328125, "learning_rate": 0.0004989882093154023, "loss": 5.7638, "mean_token_accuracy": 0.14875229001045226, "num_tokens": 8411649.0, "step": 4565 }, { "entropy": 5.880671072006225, "epoch": 0.38395295106070154, "grad_norm": 1.1796875, "learning_rate": 0.0004989853705422381, "loss": 5.8801, "mean_token_accuracy": 0.13631365299224854, "num_tokens": 8420393.0, "step": 4570 }, { "entropy": 5.883023405075074, "epoch": 0.38437303087586644, "grad_norm": 1.28125, "learning_rate": 0.0004989825278012886, "loss": 5.7743, "mean_token_accuracy": 0.14661871045827865, "num_tokens": 8429404.0, "step": 4575 }, { "entropy": 5.882754182815551, "epoch": 0.3847931106910313, "grad_norm": 1.3125, "learning_rate": 0.000498979681092604, "loss": 5.8106, "mean_token_accuracy": 0.14257726520299913, "num_tokens": 8438299.0, "step": 4580 }, { "entropy": 5.837142848968506, "epoch": 0.3852131905061962, "grad_norm": 1.0703125, "learning_rate": 0.0004989768304162345, "loss": 5.7554, "mean_token_accuracy": 0.14974153488874437, "num_tokens": 8447392.0, "step": 4585 }, { "entropy": 5.9916746616363525, "epoch": 0.3856332703213611, "grad_norm": 1.1171875, "learning_rate": 0.0004989739757722308, "loss": 5.8625, "mean_token_accuracy": 0.13722902536392212, "num_tokens": 8456361.0, "step": 4590 }, { "entropy": 5.905898475646973, "epoch": 0.3860533501365259, "grad_norm": 1.25, "learning_rate": 0.0004989711171606436, "loss": 5.7858, "mean_token_accuracy": 0.14541147351264955, "num_tokens": 8465548.0, "step": 4595 }, { "entropy": 5.921667671203613, "epoch": 0.3864734299516908, "grad_norm": 1.3359375, "learning_rate": 0.0004989682545815232, "loss": 5.8109, "mean_token_accuracy": 0.1411545142531395, "num_tokens": 8474454.0, "step": 4600 }, { "entropy": 5.837777233123779, "epoch": 0.3868935097668557, "grad_norm": 1.328125, "learning_rate": 0.0004989653880349207, "loss": 5.7277, "mean_token_accuracy": 0.14593051224946976, "num_tokens": 8482694.0, "step": 4605 }, { "entropy": 5.864150905609131, "epoch": 0.38731358958202056, "grad_norm": 1.328125, "learning_rate": 0.0004989625175208864, "loss": 5.8308, "mean_token_accuracy": 0.14381687343120575, "num_tokens": 8491162.0, "step": 4610 }, { "entropy": 5.819499731063843, "epoch": 0.38773366939718545, "grad_norm": 1.203125, "learning_rate": 0.0004989596430394717, "loss": 5.6983, "mean_token_accuracy": 0.1608663707971573, "num_tokens": 8500716.0, "step": 4615 }, { "entropy": 5.8265057563781735, "epoch": 0.38815374921235035, "grad_norm": 1.2421875, "learning_rate": 0.000498956764590727, "loss": 5.7384, "mean_token_accuracy": 0.14157627001404763, "num_tokens": 8508871.0, "step": 4620 }, { "entropy": 5.979275703430176, "epoch": 0.38857382902751525, "grad_norm": 1.28125, "learning_rate": 0.0004989538821747037, "loss": 5.9482, "mean_token_accuracy": 0.1420240134000778, "num_tokens": 8518450.0, "step": 4625 }, { "entropy": 5.9397321224212645, "epoch": 0.3889939088426801, "grad_norm": 1.2421875, "learning_rate": 0.0004989509957914527, "loss": 5.8528, "mean_token_accuracy": 0.1380702592432499, "num_tokens": 8528238.0, "step": 4630 }, { "entropy": 5.852479600906372, "epoch": 0.389413988657845, "grad_norm": 1.21875, "learning_rate": 0.0004989481054410251, "loss": 5.7431, "mean_token_accuracy": 0.14131385385990142, "num_tokens": 8537587.0, "step": 4635 }, { "entropy": 5.9004875183105465, "epoch": 0.3898340684730099, "grad_norm": 1.25, "learning_rate": 0.0004989452111234721, "loss": 5.854, "mean_token_accuracy": 0.14011769965291024, "num_tokens": 8547703.0, "step": 4640 }, { "entropy": 5.860686302185059, "epoch": 0.39025414828817473, "grad_norm": 1.25, "learning_rate": 0.000498942312838845, "loss": 5.7958, "mean_token_accuracy": 0.14458008110523224, "num_tokens": 8557001.0, "step": 4645 }, { "entropy": 5.8804422378540036, "epoch": 0.3906742281033396, "grad_norm": 1.3125, "learning_rate": 0.0004989394105871952, "loss": 5.692, "mean_token_accuracy": 0.15489965081214904, "num_tokens": 8565638.0, "step": 4650 }, { "entropy": 5.966875410079956, "epoch": 0.3910943079185045, "grad_norm": 1.734375, "learning_rate": 0.000498936504368574, "loss": 5.866, "mean_token_accuracy": 0.14225341156125068, "num_tokens": 8574428.0, "step": 4655 }, { "entropy": 5.759807777404785, "epoch": 0.3915143877336694, "grad_norm": 1.2109375, "learning_rate": 0.0004989335941830329, "loss": 5.816, "mean_token_accuracy": 0.14541401863098144, "num_tokens": 8583157.0, "step": 4660 }, { "entropy": 5.834117889404297, "epoch": 0.39193446754883426, "grad_norm": 1.40625, "learning_rate": 0.0004989306800306236, "loss": 5.7781, "mean_token_accuracy": 0.14344885647296907, "num_tokens": 8592382.0, "step": 4665 }, { "entropy": 5.8663976192474365, "epoch": 0.39235454736399916, "grad_norm": 1.8984375, "learning_rate": 0.0004989277619113975, "loss": 5.7604, "mean_token_accuracy": 0.15097892433404922, "num_tokens": 8601058.0, "step": 4670 }, { "entropy": 5.956953763961792, "epoch": 0.39277462717916406, "grad_norm": 1.75, "learning_rate": 0.0004989248398254065, "loss": 5.8591, "mean_token_accuracy": 0.1437965750694275, "num_tokens": 8609479.0, "step": 4675 }, { "entropy": 5.92048830986023, "epoch": 0.3931947069943289, "grad_norm": 1.1640625, "learning_rate": 0.0004989219137727021, "loss": 5.8058, "mean_token_accuracy": 0.14700522273778915, "num_tokens": 8618860.0, "step": 4680 }, { "entropy": 5.8700724124908445, "epoch": 0.3936147868094938, "grad_norm": 1.21875, "learning_rate": 0.0004989189837533365, "loss": 5.7572, "mean_token_accuracy": 0.14664537757635115, "num_tokens": 8627462.0, "step": 4685 }, { "entropy": 5.981065273284912, "epoch": 0.3940348666246587, "grad_norm": 1.21875, "learning_rate": 0.0004989160497673613, "loss": 5.9387, "mean_token_accuracy": 0.13696896955370902, "num_tokens": 8637569.0, "step": 4690 }, { "entropy": 5.918409252166748, "epoch": 0.39445494643982354, "grad_norm": 1.40625, "learning_rate": 0.0004989131118148286, "loss": 5.7353, "mean_token_accuracy": 0.14450196400284768, "num_tokens": 8645440.0, "step": 4695 }, { "entropy": 5.836373901367187, "epoch": 0.39487502625498844, "grad_norm": 1.578125, "learning_rate": 0.0004989101698957904, "loss": 5.9023, "mean_token_accuracy": 0.14248489439487458, "num_tokens": 8655077.0, "step": 4700 }, { "entropy": 5.941747808456421, "epoch": 0.39529510607015333, "grad_norm": 1.2578125, "learning_rate": 0.0004989072240102988, "loss": 5.8142, "mean_token_accuracy": 0.14740578532218934, "num_tokens": 8663126.0, "step": 4705 }, { "entropy": 5.973061513900757, "epoch": 0.39571518588531823, "grad_norm": 1.3125, "learning_rate": 0.0004989042741584061, "loss": 5.7952, "mean_token_accuracy": 0.14338430240750313, "num_tokens": 8672386.0, "step": 4710 }, { "entropy": 5.720412731170654, "epoch": 0.3961352657004831, "grad_norm": 1.6015625, "learning_rate": 0.0004989013203401645, "loss": 5.7388, "mean_token_accuracy": 0.1476906917989254, "num_tokens": 8681930.0, "step": 4715 }, { "entropy": 5.883289384841919, "epoch": 0.396555345515648, "grad_norm": 1.234375, "learning_rate": 0.0004988983625556264, "loss": 5.7919, "mean_token_accuracy": 0.14368573501706122, "num_tokens": 8690993.0, "step": 4720 }, { "entropy": 5.890859937667846, "epoch": 0.39697542533081287, "grad_norm": 1.25, "learning_rate": 0.0004988954008048438, "loss": 5.7809, "mean_token_accuracy": 0.14698703289031984, "num_tokens": 8699497.0, "step": 4725 }, { "entropy": 6.004160451889038, "epoch": 0.3973955051459777, "grad_norm": 1.2890625, "learning_rate": 0.0004988924350878697, "loss": 5.986, "mean_token_accuracy": 0.1333600528538227, "num_tokens": 8709274.0, "step": 4730 }, { "entropy": 5.947705507278442, "epoch": 0.3978155849611426, "grad_norm": 1.2421875, "learning_rate": 0.0004988894654047563, "loss": 5.8378, "mean_token_accuracy": 0.13920372053980828, "num_tokens": 8718158.0, "step": 4735 }, { "entropy": 5.82051944732666, "epoch": 0.3982356647763075, "grad_norm": 1.3359375, "learning_rate": 0.0004988864917555562, "loss": 5.7239, "mean_token_accuracy": 0.14391618072986603, "num_tokens": 8727459.0, "step": 4740 }, { "entropy": 5.940366458892822, "epoch": 0.3986557445914724, "grad_norm": 1.6484375, "learning_rate": 0.0004988835141403224, "loss": 5.8538, "mean_token_accuracy": 0.14721113741397857, "num_tokens": 8737614.0, "step": 4745 }, { "entropy": 5.819404935836792, "epoch": 0.39907582440663725, "grad_norm": 1.4453125, "learning_rate": 0.0004988805325591073, "loss": 5.6874, "mean_token_accuracy": 0.14453882575035096, "num_tokens": 8746799.0, "step": 4750 }, { "entropy": 5.84985032081604, "epoch": 0.39949590422180214, "grad_norm": 1.3203125, "learning_rate": 0.0004988775470119639, "loss": 5.8628, "mean_token_accuracy": 0.14014028683304786, "num_tokens": 8756555.0, "step": 4755 }, { "entropy": 5.867576169967651, "epoch": 0.39991598403696704, "grad_norm": 1.2265625, "learning_rate": 0.0004988745574989451, "loss": 5.8851, "mean_token_accuracy": 0.1480340264737606, "num_tokens": 8765849.0, "step": 4760 }, { "entropy": 6.094280099868774, "epoch": 0.4003360638521319, "grad_norm": 1.21875, "learning_rate": 0.0004988715640201036, "loss": 5.954, "mean_token_accuracy": 0.13378295823931693, "num_tokens": 8775713.0, "step": 4765 }, { "entropy": 5.884061288833618, "epoch": 0.4007561436672968, "grad_norm": 1.3125, "learning_rate": 0.0004988685665754928, "loss": 5.7775, "mean_token_accuracy": 0.14666623920202254, "num_tokens": 8784717.0, "step": 4770 }, { "entropy": 5.8814960479736325, "epoch": 0.4011762234824617, "grad_norm": 1.21875, "learning_rate": 0.0004988655651651656, "loss": 5.7911, "mean_token_accuracy": 0.14413672238588332, "num_tokens": 8794388.0, "step": 4775 }, { "entropy": 5.836367225646972, "epoch": 0.4015963032976265, "grad_norm": 1.234375, "learning_rate": 0.0004988625597891751, "loss": 5.8093, "mean_token_accuracy": 0.14697518199682236, "num_tokens": 8802436.0, "step": 4780 }, { "entropy": 5.912711811065674, "epoch": 0.4020163831127914, "grad_norm": 1.21875, "learning_rate": 0.0004988595504475746, "loss": 5.7636, "mean_token_accuracy": 0.1465681880712509, "num_tokens": 8811184.0, "step": 4785 }, { "entropy": 5.9507347583770756, "epoch": 0.4024364629279563, "grad_norm": 1.3515625, "learning_rate": 0.0004988565371404175, "loss": 5.8423, "mean_token_accuracy": 0.14505148231983184, "num_tokens": 8820525.0, "step": 4790 }, { "entropy": 5.830136728286743, "epoch": 0.4028565427431212, "grad_norm": 1.3984375, "learning_rate": 0.0004988535198677571, "loss": 5.7011, "mean_token_accuracy": 0.153212571144104, "num_tokens": 8828928.0, "step": 4795 }, { "entropy": 5.90922179222107, "epoch": 0.40327662255828606, "grad_norm": 1.390625, "learning_rate": 0.0004988504986296469, "loss": 5.907, "mean_token_accuracy": 0.1371180810034275, "num_tokens": 8838615.0, "step": 4800 }, { "entropy": 5.942590522766113, "epoch": 0.40369670237345096, "grad_norm": 1.1796875, "learning_rate": 0.0004988474734261404, "loss": 5.9047, "mean_token_accuracy": 0.13416762948036193, "num_tokens": 8848709.0, "step": 4805 }, { "entropy": 5.973557710647583, "epoch": 0.40411678218861585, "grad_norm": 1.1796875, "learning_rate": 0.0004988444442572911, "loss": 5.8479, "mean_token_accuracy": 0.1310623273253441, "num_tokens": 8858277.0, "step": 4810 }, { "entropy": 5.891769552230835, "epoch": 0.4045368620037807, "grad_norm": 1.2421875, "learning_rate": 0.0004988414111231528, "loss": 5.8161, "mean_token_accuracy": 0.14670211374759673, "num_tokens": 8868436.0, "step": 4815 }, { "entropy": 5.925015592575074, "epoch": 0.4049569418189456, "grad_norm": 1.2578125, "learning_rate": 0.000498838374023779, "loss": 5.7888, "mean_token_accuracy": 0.13960602283477783, "num_tokens": 8877740.0, "step": 4820 }, { "entropy": 5.908780908584594, "epoch": 0.4053770216341105, "grad_norm": 1.2578125, "learning_rate": 0.0004988353329592239, "loss": 5.7761, "mean_token_accuracy": 0.14475535228848457, "num_tokens": 8887408.0, "step": 4825 }, { "entropy": 5.893645095825195, "epoch": 0.4057971014492754, "grad_norm": 1.2109375, "learning_rate": 0.0004988322879295409, "loss": 5.929, "mean_token_accuracy": 0.13994188457727433, "num_tokens": 8897141.0, "step": 4830 }, { "entropy": 5.865872049331665, "epoch": 0.40621718126444023, "grad_norm": 1.2109375, "learning_rate": 0.0004988292389347844, "loss": 5.7105, "mean_token_accuracy": 0.15417256727814674, "num_tokens": 8905747.0, "step": 4835 }, { "entropy": 5.965148115158081, "epoch": 0.40663726107960513, "grad_norm": 1.265625, "learning_rate": 0.000498826185975008, "loss": 5.8673, "mean_token_accuracy": 0.14333693608641623, "num_tokens": 8914926.0, "step": 4840 }, { "entropy": 5.872843933105469, "epoch": 0.40705734089477, "grad_norm": 1.3203125, "learning_rate": 0.0004988231290502662, "loss": 5.8806, "mean_token_accuracy": 0.14108002185821533, "num_tokens": 8923956.0, "step": 4845 }, { "entropy": 5.925130224227905, "epoch": 0.40747742070993487, "grad_norm": 1.2578125, "learning_rate": 0.0004988200681606127, "loss": 5.7542, "mean_token_accuracy": 0.1388688787817955, "num_tokens": 8932654.0, "step": 4850 }, { "entropy": 5.9108325958251955, "epoch": 0.40789750052509977, "grad_norm": 1.1875, "learning_rate": 0.000498817003306102, "loss": 5.7364, "mean_token_accuracy": 0.1501722030341625, "num_tokens": 8941716.0, "step": 4855 }, { "entropy": 5.846788120269776, "epoch": 0.40831758034026466, "grad_norm": 1.2734375, "learning_rate": 0.0004988139344867884, "loss": 5.8122, "mean_token_accuracy": 0.14448407515883446, "num_tokens": 8950377.0, "step": 4860 }, { "entropy": 5.848782968521118, "epoch": 0.4087376601554295, "grad_norm": 1.2578125, "learning_rate": 0.0004988108617027261, "loss": 5.7679, "mean_token_accuracy": 0.14761658608913422, "num_tokens": 8959857.0, "step": 4865 }, { "entropy": 5.834667444229126, "epoch": 0.4091577399705944, "grad_norm": 1.2890625, "learning_rate": 0.0004988077849539698, "loss": 5.7183, "mean_token_accuracy": 0.1485067203640938, "num_tokens": 8968272.0, "step": 4870 }, { "entropy": 5.923686075210571, "epoch": 0.4095778197857593, "grad_norm": 1.4609375, "learning_rate": 0.0004988047042405736, "loss": 5.7969, "mean_token_accuracy": 0.14762237221002578, "num_tokens": 8977445.0, "step": 4875 }, { "entropy": 5.964400959014893, "epoch": 0.4099978996009242, "grad_norm": 1.2890625, "learning_rate": 0.0004988016195625924, "loss": 5.8644, "mean_token_accuracy": 0.13916484266519547, "num_tokens": 8987315.0, "step": 4880 }, { "entropy": 5.8641290187835695, "epoch": 0.41041797941608904, "grad_norm": 1.421875, "learning_rate": 0.0004987985309200807, "loss": 5.8568, "mean_token_accuracy": 0.1417423367500305, "num_tokens": 8998119.0, "step": 4885 }, { "entropy": 5.7576408863067625, "epoch": 0.41083805923125394, "grad_norm": 1.421875, "learning_rate": 0.0004987954383130934, "loss": 5.7477, "mean_token_accuracy": 0.1535985603928566, "num_tokens": 9007167.0, "step": 4890 }, { "entropy": 5.866803312301636, "epoch": 0.41125813904641884, "grad_norm": 1.203125, "learning_rate": 0.000498792341741685, "loss": 5.8006, "mean_token_accuracy": 0.13756236732006072, "num_tokens": 9016690.0, "step": 4895 }, { "entropy": 5.996728754043579, "epoch": 0.4116782188615837, "grad_norm": 1.296875, "learning_rate": 0.0004987892412059106, "loss": 5.8881, "mean_token_accuracy": 0.1421562008559704, "num_tokens": 9026117.0, "step": 4900 }, { "entropy": 5.823458862304688, "epoch": 0.4120982986767486, "grad_norm": 1.28125, "learning_rate": 0.0004987861367058251, "loss": 5.7583, "mean_token_accuracy": 0.1456121936440468, "num_tokens": 9035754.0, "step": 4905 }, { "entropy": 5.91724009513855, "epoch": 0.4125183784919135, "grad_norm": 1.3203125, "learning_rate": 0.0004987830282414833, "loss": 5.7614, "mean_token_accuracy": 0.15125717446208, "num_tokens": 9045453.0, "step": 4910 }, { "entropy": 5.882875871658325, "epoch": 0.41293845830707837, "grad_norm": 1.265625, "learning_rate": 0.0004987799158129404, "loss": 5.8736, "mean_token_accuracy": 0.14322762489318847, "num_tokens": 9056045.0, "step": 4915 }, { "entropy": 5.822021722793579, "epoch": 0.4133585381222432, "grad_norm": 1.2421875, "learning_rate": 0.0004987767994202516, "loss": 5.7652, "mean_token_accuracy": 0.14132684618234634, "num_tokens": 9065728.0, "step": 4920 }, { "entropy": 5.874257898330688, "epoch": 0.4137786179374081, "grad_norm": 1.2265625, "learning_rate": 0.0004987736790634719, "loss": 5.7867, "mean_token_accuracy": 0.14259056150913238, "num_tokens": 9075522.0, "step": 4925 }, { "entropy": 5.868446731567383, "epoch": 0.414198697752573, "grad_norm": 1.3125, "learning_rate": 0.0004987705547426568, "loss": 5.7633, "mean_token_accuracy": 0.14451717659831048, "num_tokens": 9084412.0, "step": 4930 }, { "entropy": 5.86938099861145, "epoch": 0.41461877756773785, "grad_norm": 1.3203125, "learning_rate": 0.0004987674264578615, "loss": 5.8382, "mean_token_accuracy": 0.1410167396068573, "num_tokens": 9094289.0, "step": 4935 }, { "entropy": 5.902176809310913, "epoch": 0.41503885738290275, "grad_norm": 1.1875, "learning_rate": 0.0004987642942091414, "loss": 5.7413, "mean_token_accuracy": 0.14698186367750168, "num_tokens": 9103124.0, "step": 4940 }, { "entropy": 5.898521900177002, "epoch": 0.41545893719806765, "grad_norm": 1.4296875, "learning_rate": 0.0004987611579965523, "loss": 5.6945, "mean_token_accuracy": 0.1453884869813919, "num_tokens": 9112794.0, "step": 4945 }, { "entropy": 5.867249441146851, "epoch": 0.4158790170132325, "grad_norm": 1.1484375, "learning_rate": 0.0004987580178201492, "loss": 5.8508, "mean_token_accuracy": 0.15215325057506562, "num_tokens": 9122718.0, "step": 4950 }, { "entropy": 5.877714014053344, "epoch": 0.4162990968283974, "grad_norm": 1.2734375, "learning_rate": 0.0004987548736799882, "loss": 5.8851, "mean_token_accuracy": 0.13938734084367752, "num_tokens": 9131855.0, "step": 4955 }, { "entropy": 5.866538429260254, "epoch": 0.4167191766435623, "grad_norm": 1.1875, "learning_rate": 0.0004987517255761248, "loss": 5.7248, "mean_token_accuracy": 0.14940666258335114, "num_tokens": 9141102.0, "step": 4960 }, { "entropy": 5.806973934173584, "epoch": 0.4171392564587272, "grad_norm": 1.2578125, "learning_rate": 0.0004987485735086148, "loss": 5.8043, "mean_token_accuracy": 0.14497776329517365, "num_tokens": 9150552.0, "step": 4965 }, { "entropy": 5.940771627426147, "epoch": 0.417559336273892, "grad_norm": 1.1953125, "learning_rate": 0.000498745417477514, "loss": 5.7927, "mean_token_accuracy": 0.14460284858942032, "num_tokens": 9160105.0, "step": 4970 }, { "entropy": 5.864925670623779, "epoch": 0.4179794160890569, "grad_norm": 1.203125, "learning_rate": 0.0004987422574828784, "loss": 5.7728, "mean_token_accuracy": 0.14519683197140693, "num_tokens": 9169367.0, "step": 4975 }, { "entropy": 5.846901607513428, "epoch": 0.4183994959042218, "grad_norm": 1.265625, "learning_rate": 0.0004987390935247639, "loss": 5.6568, "mean_token_accuracy": 0.15195999220013617, "num_tokens": 9177872.0, "step": 4980 }, { "entropy": 5.892278623580933, "epoch": 0.41881957571938666, "grad_norm": 1.234375, "learning_rate": 0.0004987359256032265, "loss": 5.8728, "mean_token_accuracy": 0.1392049200832844, "num_tokens": 9187879.0, "step": 4985 }, { "entropy": 5.834523773193359, "epoch": 0.41923965553455156, "grad_norm": 1.140625, "learning_rate": 0.0004987327537183225, "loss": 5.7865, "mean_token_accuracy": 0.14359964653849602, "num_tokens": 9198281.0, "step": 4990 }, { "entropy": 5.898417997360229, "epoch": 0.41965973534971646, "grad_norm": 1.1015625, "learning_rate": 0.0004987295778701078, "loss": 5.7784, "mean_token_accuracy": 0.1480983316898346, "num_tokens": 9207670.0, "step": 4995 }, { "entropy": 5.903277587890625, "epoch": 0.42007981516488135, "grad_norm": 1.3828125, "learning_rate": 0.000498726398058639, "loss": 5.7986, "mean_token_accuracy": 0.1475730612874031, "num_tokens": 9216995.0, "step": 5000 }, { "entropy": 5.920054292678833, "epoch": 0.4204998949800462, "grad_norm": 1.28125, "learning_rate": 0.0004987232142839723, "loss": 5.8785, "mean_token_accuracy": 0.13731264397501947, "num_tokens": 9227330.0, "step": 5005 }, { "entropy": 5.861970615386963, "epoch": 0.4209199747952111, "grad_norm": 1.1953125, "learning_rate": 0.0004987200265461638, "loss": 5.7885, "mean_token_accuracy": 0.15134866386651993, "num_tokens": 9236666.0, "step": 5010 }, { "entropy": 5.934697484970092, "epoch": 0.421340054610376, "grad_norm": 1.1953125, "learning_rate": 0.0004987168348452705, "loss": 5.7864, "mean_token_accuracy": 0.144124399125576, "num_tokens": 9246388.0, "step": 5015 }, { "entropy": 5.8499044418334964, "epoch": 0.42176013442554083, "grad_norm": 1.2109375, "learning_rate": 0.0004987136391813485, "loss": 5.7404, "mean_token_accuracy": 0.15391666144132615, "num_tokens": 9255239.0, "step": 5020 }, { "entropy": 5.773643350601196, "epoch": 0.42218021424070573, "grad_norm": 1.2265625, "learning_rate": 0.0004987104395544547, "loss": 5.7252, "mean_token_accuracy": 0.14332954734563827, "num_tokens": 9264468.0, "step": 5025 }, { "entropy": 5.859898376464844, "epoch": 0.42260029405587063, "grad_norm": 1.1953125, "learning_rate": 0.0004987072359646455, "loss": 5.7927, "mean_token_accuracy": 0.15058641731739045, "num_tokens": 9274140.0, "step": 5030 }, { "entropy": 5.917972660064697, "epoch": 0.42302037387103547, "grad_norm": 1.2265625, "learning_rate": 0.0004987040284119778, "loss": 5.7586, "mean_token_accuracy": 0.1428128033876419, "num_tokens": 9283539.0, "step": 5035 }, { "entropy": 5.781129264831543, "epoch": 0.42344045368620037, "grad_norm": 1.3046875, "learning_rate": 0.0004987008168965087, "loss": 5.7728, "mean_token_accuracy": 0.14332580342888832, "num_tokens": 9292664.0, "step": 5040 }, { "entropy": 5.946068525314331, "epoch": 0.42386053350136527, "grad_norm": 1.1640625, "learning_rate": 0.0004986976014182946, "loss": 5.8657, "mean_token_accuracy": 0.14432715028524398, "num_tokens": 9302814.0, "step": 5045 }, { "entropy": 5.980961608886719, "epoch": 0.42428061331653016, "grad_norm": 1.203125, "learning_rate": 0.0004986943819773927, "loss": 5.858, "mean_token_accuracy": 0.14330325573682784, "num_tokens": 9312654.0, "step": 5050 }, { "entropy": 5.9505743980407715, "epoch": 0.424700693131695, "grad_norm": 1.1953125, "learning_rate": 0.00049869115857386, "loss": 5.8737, "mean_token_accuracy": 0.13669376373291015, "num_tokens": 9322271.0, "step": 5055 }, { "entropy": 5.951388359069824, "epoch": 0.4251207729468599, "grad_norm": 1.125, "learning_rate": 0.0004986879312077536, "loss": 5.8193, "mean_token_accuracy": 0.14102528542280196, "num_tokens": 9331341.0, "step": 5060 }, { "entropy": 5.834031820297241, "epoch": 0.4255408527620248, "grad_norm": 1.2578125, "learning_rate": 0.0004986846998791308, "loss": 5.7561, "mean_token_accuracy": 0.1436670668423176, "num_tokens": 9339863.0, "step": 5065 }, { "entropy": 5.811039066314697, "epoch": 0.42596093257718964, "grad_norm": 1.203125, "learning_rate": 0.0004986814645880485, "loss": 5.7236, "mean_token_accuracy": 0.14669884666800498, "num_tokens": 9349488.0, "step": 5070 }, { "entropy": 5.830924463272095, "epoch": 0.42638101239235454, "grad_norm": 1.140625, "learning_rate": 0.0004986782253345645, "loss": 5.7333, "mean_token_accuracy": 0.14323149994015694, "num_tokens": 9357977.0, "step": 5075 }, { "entropy": 5.839050388336181, "epoch": 0.42680109220751944, "grad_norm": 1.171875, "learning_rate": 0.0004986749821187358, "loss": 5.8394, "mean_token_accuracy": 0.14253177791833876, "num_tokens": 9367449.0, "step": 5080 }, { "entropy": 5.939317226409912, "epoch": 0.42722117202268434, "grad_norm": 1.375, "learning_rate": 0.00049867173494062, "loss": 5.8681, "mean_token_accuracy": 0.14768607616424562, "num_tokens": 9377070.0, "step": 5085 }, { "entropy": 5.813904285430908, "epoch": 0.4276412518378492, "grad_norm": 1.203125, "learning_rate": 0.0004986684838002744, "loss": 5.6526, "mean_token_accuracy": 0.14204483926296235, "num_tokens": 9385881.0, "step": 5090 }, { "entropy": 5.823819637298584, "epoch": 0.4280613316530141, "grad_norm": 1.1953125, "learning_rate": 0.0004986652286977569, "loss": 5.7905, "mean_token_accuracy": 0.14255458265542983, "num_tokens": 9395159.0, "step": 5095 }, { "entropy": 5.877113628387451, "epoch": 0.428481411468179, "grad_norm": 1.1875, "learning_rate": 0.0004986619696331252, "loss": 5.7486, "mean_token_accuracy": 0.14601895585656166, "num_tokens": 9404590.0, "step": 5100 }, { "entropy": 5.856746768951416, "epoch": 0.4289014912833438, "grad_norm": 1.1875, "learning_rate": 0.0004986587066064367, "loss": 5.7708, "mean_token_accuracy": 0.1473971426486969, "num_tokens": 9414452.0, "step": 5105 }, { "entropy": 5.868241453170777, "epoch": 0.4293215710985087, "grad_norm": 1.25, "learning_rate": 0.0004986554396177494, "loss": 5.894, "mean_token_accuracy": 0.1396991342306137, "num_tokens": 9424004.0, "step": 5110 }, { "entropy": 5.933579587936402, "epoch": 0.4297416509136736, "grad_norm": 1.1875, "learning_rate": 0.0004986521686671212, "loss": 5.7713, "mean_token_accuracy": 0.1551983118057251, "num_tokens": 9433487.0, "step": 5115 }, { "entropy": 5.856822824478149, "epoch": 0.43016173072883845, "grad_norm": 1.2265625, "learning_rate": 0.00049864889375461, "loss": 5.8359, "mean_token_accuracy": 0.13958305045962333, "num_tokens": 9442742.0, "step": 5120 }, { "entropy": 5.880755043029785, "epoch": 0.43058181054400335, "grad_norm": 1.1015625, "learning_rate": 0.0004986456148802738, "loss": 5.8957, "mean_token_accuracy": 0.14121335968375207, "num_tokens": 9452550.0, "step": 5125 }, { "entropy": 6.039326620101929, "epoch": 0.43100189035916825, "grad_norm": 1.15625, "learning_rate": 0.0004986423320441707, "loss": 5.8546, "mean_token_accuracy": 0.13762183710932732, "num_tokens": 9461920.0, "step": 5130 }, { "entropy": 5.904562616348267, "epoch": 0.43142197017433315, "grad_norm": 1.2890625, "learning_rate": 0.0004986390452463588, "loss": 5.7682, "mean_token_accuracy": 0.14276604056358339, "num_tokens": 9470817.0, "step": 5135 }, { "entropy": 5.710296773910523, "epoch": 0.431842049989498, "grad_norm": 1.234375, "learning_rate": 0.0004986357544868964, "loss": 5.7258, "mean_token_accuracy": 0.15019231289625168, "num_tokens": 9479936.0, "step": 5140 }, { "entropy": 5.892205905914307, "epoch": 0.4322621298046629, "grad_norm": 1.25, "learning_rate": 0.0004986324597658418, "loss": 5.7581, "mean_token_accuracy": 0.15196042209863664, "num_tokens": 9489818.0, "step": 5145 }, { "entropy": 5.733763742446899, "epoch": 0.4326822096198278, "grad_norm": 1.2421875, "learning_rate": 0.0004986291610832533, "loss": 5.7455, "mean_token_accuracy": 0.14281522929668428, "num_tokens": 9499688.0, "step": 5150 }, { "entropy": 5.960237169265747, "epoch": 0.4331022894349926, "grad_norm": 1.21875, "learning_rate": 0.0004986258584391892, "loss": 5.8063, "mean_token_accuracy": 0.14208860471844673, "num_tokens": 9509581.0, "step": 5155 }, { "entropy": 6.0035475730896, "epoch": 0.4335223692501575, "grad_norm": 1.3671875, "learning_rate": 0.0004986225518337084, "loss": 5.89, "mean_token_accuracy": 0.143732051551342, "num_tokens": 9518556.0, "step": 5160 }, { "entropy": 5.81024432182312, "epoch": 0.4339424490653224, "grad_norm": 1.1484375, "learning_rate": 0.0004986192412668692, "loss": 5.7931, "mean_token_accuracy": 0.14318298548460007, "num_tokens": 9527612.0, "step": 5165 }, { "entropy": 5.847835922241211, "epoch": 0.4343625288804873, "grad_norm": 1.2421875, "learning_rate": 0.0004986159267387302, "loss": 5.6856, "mean_token_accuracy": 0.1560652643442154, "num_tokens": 9535882.0, "step": 5170 }, { "entropy": 5.862061595916748, "epoch": 0.43478260869565216, "grad_norm": 1.2578125, "learning_rate": 0.0004986126082493502, "loss": 5.7914, "mean_token_accuracy": 0.14822041988372803, "num_tokens": 9544799.0, "step": 5175 }, { "entropy": 5.794046545028687, "epoch": 0.43520268851081706, "grad_norm": 1.171875, "learning_rate": 0.0004986092857987881, "loss": 5.6968, "mean_token_accuracy": 0.15352533906698226, "num_tokens": 9553805.0, "step": 5180 }, { "entropy": 5.832414722442627, "epoch": 0.43562276832598196, "grad_norm": 1.421875, "learning_rate": 0.0004986059593871026, "loss": 5.7414, "mean_token_accuracy": 0.14509093537926673, "num_tokens": 9563493.0, "step": 5185 }, { "entropy": 5.899970149993896, "epoch": 0.4360428481411468, "grad_norm": 2.0, "learning_rate": 0.0004986026290143527, "loss": 5.8201, "mean_token_accuracy": 0.14310061410069466, "num_tokens": 9572297.0, "step": 5190 }, { "entropy": 5.985169315338135, "epoch": 0.4364629279563117, "grad_norm": 1.4609375, "learning_rate": 0.0004985992946805973, "loss": 5.9499, "mean_token_accuracy": 0.1373360723257065, "num_tokens": 9581967.0, "step": 5195 }, { "entropy": 5.853709316253662, "epoch": 0.4368830077714766, "grad_norm": 1.2890625, "learning_rate": 0.0004985959563858955, "loss": 5.8611, "mean_token_accuracy": 0.14648908525705337, "num_tokens": 9590885.0, "step": 5200 }, { "entropy": 5.920672750473022, "epoch": 0.43730308758664144, "grad_norm": 1.6015625, "learning_rate": 0.0004985926141303066, "loss": 5.7766, "mean_token_accuracy": 0.14383909106254578, "num_tokens": 9599247.0, "step": 5205 }, { "entropy": 5.823170852661133, "epoch": 0.43772316740180633, "grad_norm": 1.4453125, "learning_rate": 0.0004985892679138896, "loss": 5.709, "mean_token_accuracy": 0.15263715162873268, "num_tokens": 9608296.0, "step": 5210 }, { "entropy": 5.922242307662964, "epoch": 0.43814324721697123, "grad_norm": 1.34375, "learning_rate": 0.0004985859177367038, "loss": 5.7539, "mean_token_accuracy": 0.14295759946107864, "num_tokens": 9616734.0, "step": 5215 }, { "entropy": 5.933417272567749, "epoch": 0.43856332703213613, "grad_norm": 2.25, "learning_rate": 0.0004985825635988087, "loss": 5.839, "mean_token_accuracy": 0.14136623740196227, "num_tokens": 9626246.0, "step": 5220 }, { "entropy": 5.840227174758911, "epoch": 0.43898340684730097, "grad_norm": 1.359375, "learning_rate": 0.0004985792055002635, "loss": 5.7156, "mean_token_accuracy": 0.1447908401489258, "num_tokens": 9634963.0, "step": 5225 }, { "entropy": 5.864311695098877, "epoch": 0.43940348666246587, "grad_norm": 1.2734375, "learning_rate": 0.0004985758434411278, "loss": 5.7954, "mean_token_accuracy": 0.1492132991552353, "num_tokens": 9643615.0, "step": 5230 }, { "entropy": 5.824445819854736, "epoch": 0.43982356647763077, "grad_norm": 1.3046875, "learning_rate": 0.0004985724774214613, "loss": 5.7572, "mean_token_accuracy": 0.14679911136627197, "num_tokens": 9653306.0, "step": 5235 }, { "entropy": 5.8889368057250975, "epoch": 0.4402436462927956, "grad_norm": 1.3671875, "learning_rate": 0.0004985691074413233, "loss": 5.7966, "mean_token_accuracy": 0.1408935308456421, "num_tokens": 9662389.0, "step": 5240 }, { "entropy": 5.806066703796387, "epoch": 0.4406637261079605, "grad_norm": 1.2578125, "learning_rate": 0.0004985657335007739, "loss": 5.7659, "mean_token_accuracy": 0.14551339596509932, "num_tokens": 9671183.0, "step": 5245 }, { "entropy": 5.852633047103882, "epoch": 0.4410838059231254, "grad_norm": 1.1953125, "learning_rate": 0.0004985623555998725, "loss": 5.778, "mean_token_accuracy": 0.1539351999759674, "num_tokens": 9680544.0, "step": 5250 }, { "entropy": 5.867886209487915, "epoch": 0.4415038857382903, "grad_norm": 1.34375, "learning_rate": 0.0004985589737386791, "loss": 5.8053, "mean_token_accuracy": 0.1449089080095291, "num_tokens": 9690137.0, "step": 5255 }, { "entropy": 5.847021532058716, "epoch": 0.44192396555345514, "grad_norm": 1.203125, "learning_rate": 0.0004985555879172535, "loss": 5.7433, "mean_token_accuracy": 0.14687602072954178, "num_tokens": 9699149.0, "step": 5260 }, { "entropy": 5.898943853378296, "epoch": 0.44234404536862004, "grad_norm": 1.2578125, "learning_rate": 0.000498552198135656, "loss": 5.8097, "mean_token_accuracy": 0.15019679218530654, "num_tokens": 9709308.0, "step": 5265 }, { "entropy": 5.844637632369995, "epoch": 0.44276412518378494, "grad_norm": 1.4921875, "learning_rate": 0.0004985488043939462, "loss": 5.7573, "mean_token_accuracy": 0.1442711167037487, "num_tokens": 9718462.0, "step": 5270 }, { "entropy": 5.853937387466431, "epoch": 0.4431842049989498, "grad_norm": 1.4140625, "learning_rate": 0.0004985454066921846, "loss": 5.6905, "mean_token_accuracy": 0.1537187710404396, "num_tokens": 9727626.0, "step": 5275 }, { "entropy": 5.747472763061523, "epoch": 0.4436042848141147, "grad_norm": 1.2421875, "learning_rate": 0.0004985420050304312, "loss": 5.7068, "mean_token_accuracy": 0.1498991407454014, "num_tokens": 9737091.0, "step": 5280 }, { "entropy": 5.846937942504883, "epoch": 0.4440243646292796, "grad_norm": 1.484375, "learning_rate": 0.0004985385994087462, "loss": 5.7867, "mean_token_accuracy": 0.14585647359490395, "num_tokens": 9746135.0, "step": 5285 }, { "entropy": 5.949729108810425, "epoch": 0.4444444444444444, "grad_norm": 1.5703125, "learning_rate": 0.0004985351898271901, "loss": 5.719, "mean_token_accuracy": 0.1520434781908989, "num_tokens": 9754549.0, "step": 5290 }, { "entropy": 5.887947463989258, "epoch": 0.4448645242596093, "grad_norm": 1.28125, "learning_rate": 0.0004985317762858231, "loss": 5.8567, "mean_token_accuracy": 0.14025997146964073, "num_tokens": 9764219.0, "step": 5295 }, { "entropy": 5.871951913833618, "epoch": 0.4452846040747742, "grad_norm": 1.5078125, "learning_rate": 0.000498528358784706, "loss": 5.6972, "mean_token_accuracy": 0.15001460164785385, "num_tokens": 9772234.0, "step": 5300 }, { "entropy": 5.811316633224488, "epoch": 0.4457046838899391, "grad_norm": 1.203125, "learning_rate": 0.000498524937323899, "loss": 5.7622, "mean_token_accuracy": 0.15125853270292283, "num_tokens": 9781417.0, "step": 5305 }, { "entropy": 5.981836175918579, "epoch": 0.44612476370510395, "grad_norm": 1.2265625, "learning_rate": 0.0004985215119034628, "loss": 5.8763, "mean_token_accuracy": 0.13692381381988525, "num_tokens": 9791286.0, "step": 5310 }, { "entropy": 5.866169118881226, "epoch": 0.44654484352026885, "grad_norm": 1.34375, "learning_rate": 0.0004985180825234582, "loss": 5.8755, "mean_token_accuracy": 0.13873762115836144, "num_tokens": 9802157.0, "step": 5315 }, { "entropy": 5.981353807449341, "epoch": 0.44696492333543375, "grad_norm": 1.2890625, "learning_rate": 0.0004985146491839459, "loss": 5.8547, "mean_token_accuracy": 0.1320488214492798, "num_tokens": 9812646.0, "step": 5320 }, { "entropy": 5.9978625774383545, "epoch": 0.4473850031505986, "grad_norm": 1.359375, "learning_rate": 0.0004985112118849865, "loss": 5.8664, "mean_token_accuracy": 0.13918881937861444, "num_tokens": 9822274.0, "step": 5325 }, { "entropy": 5.781670093536377, "epoch": 0.4478050829657635, "grad_norm": 1.4609375, "learning_rate": 0.0004985077706266412, "loss": 5.6507, "mean_token_accuracy": 0.14431787207722663, "num_tokens": 9831337.0, "step": 5330 }, { "entropy": 5.797645950317383, "epoch": 0.4482251627809284, "grad_norm": 1.21875, "learning_rate": 0.0004985043254089708, "loss": 5.8111, "mean_token_accuracy": 0.13542471826076508, "num_tokens": 9840798.0, "step": 5335 }, { "entropy": 5.871469783782959, "epoch": 0.44864524259609323, "grad_norm": 1.2109375, "learning_rate": 0.0004985008762320364, "loss": 5.7666, "mean_token_accuracy": 0.14363950192928315, "num_tokens": 9850117.0, "step": 5340 }, { "entropy": 5.885560655593872, "epoch": 0.4490653224112581, "grad_norm": 1.328125, "learning_rate": 0.000498497423095899, "loss": 5.7176, "mean_token_accuracy": 0.15319354236125945, "num_tokens": 9858227.0, "step": 5345 }, { "entropy": 5.810570764541626, "epoch": 0.449485402226423, "grad_norm": 1.21875, "learning_rate": 0.0004984939660006199, "loss": 5.8079, "mean_token_accuracy": 0.14338937029242516, "num_tokens": 9867157.0, "step": 5350 }, { "entropy": 5.811974906921387, "epoch": 0.4499054820415879, "grad_norm": 1.3046875, "learning_rate": 0.0004984905049462602, "loss": 5.7349, "mean_token_accuracy": 0.144259013235569, "num_tokens": 9877045.0, "step": 5355 }, { "entropy": 5.959705638885498, "epoch": 0.45032556185675277, "grad_norm": 1.4453125, "learning_rate": 0.0004984870399328814, "loss": 5.8617, "mean_token_accuracy": 0.14245471283793448, "num_tokens": 9886637.0, "step": 5360 }, { "entropy": 5.816979646682739, "epoch": 0.45074564167191766, "grad_norm": 1.171875, "learning_rate": 0.0004984835709605446, "loss": 5.7271, "mean_token_accuracy": 0.15511318892240525, "num_tokens": 9895601.0, "step": 5365 }, { "entropy": 5.86139702796936, "epoch": 0.45116572148708256, "grad_norm": 1.3203125, "learning_rate": 0.0004984800980293116, "loss": 5.8807, "mean_token_accuracy": 0.14196527227759362, "num_tokens": 9904775.0, "step": 5370 }, { "entropy": 5.883301210403443, "epoch": 0.4515858013022474, "grad_norm": 1.328125, "learning_rate": 0.0004984766211392435, "loss": 5.8184, "mean_token_accuracy": 0.13878512308001517, "num_tokens": 9913795.0, "step": 5375 }, { "entropy": 5.856382942199707, "epoch": 0.4520058811174123, "grad_norm": 1.4765625, "learning_rate": 0.0004984731402904024, "loss": 5.6546, "mean_token_accuracy": 0.15193988084793092, "num_tokens": 9922576.0, "step": 5380 }, { "entropy": 5.768913459777832, "epoch": 0.4524259609325772, "grad_norm": 1.3828125, "learning_rate": 0.0004984696554828496, "loss": 5.6446, "mean_token_accuracy": 0.15225213021039963, "num_tokens": 9930971.0, "step": 5385 }, { "entropy": 5.856381464004516, "epoch": 0.4528460407477421, "grad_norm": 1.5703125, "learning_rate": 0.0004984661667166468, "loss": 5.7606, "mean_token_accuracy": 0.1514030024409294, "num_tokens": 9939628.0, "step": 5390 }, { "entropy": 5.887900066375733, "epoch": 0.45326612056290694, "grad_norm": 1.203125, "learning_rate": 0.0004984626739918561, "loss": 5.7294, "mean_token_accuracy": 0.15370103269815444, "num_tokens": 9948397.0, "step": 5395 }, { "entropy": 5.8639452934265135, "epoch": 0.45368620037807184, "grad_norm": 1.125, "learning_rate": 0.0004984591773085391, "loss": 5.8108, "mean_token_accuracy": 0.14718640744686126, "num_tokens": 9957683.0, "step": 5400 }, { "entropy": 5.911360502243042, "epoch": 0.45410628019323673, "grad_norm": 1.3359375, "learning_rate": 0.0004984556766667578, "loss": 5.7938, "mean_token_accuracy": 0.14773029685020447, "num_tokens": 9966756.0, "step": 5405 }, { "entropy": 5.876928043365479, "epoch": 0.4545263600084016, "grad_norm": 1.15625, "learning_rate": 0.0004984521720665743, "loss": 5.7996, "mean_token_accuracy": 0.1499388188123703, "num_tokens": 9976000.0, "step": 5410 }, { "entropy": 5.9389279842376705, "epoch": 0.4549464398235665, "grad_norm": 1.296875, "learning_rate": 0.0004984486635080507, "loss": 5.7922, "mean_token_accuracy": 0.146384534239769, "num_tokens": 9985509.0, "step": 5415 }, { "entropy": 5.7951904296875, "epoch": 0.45536651963873137, "grad_norm": 1.34375, "learning_rate": 0.0004984451509912489, "loss": 5.744, "mean_token_accuracy": 0.1474005714058876, "num_tokens": 9994342.0, "step": 5420 }, { "entropy": 5.838972473144532, "epoch": 0.4557865994538962, "grad_norm": 1.28125, "learning_rate": 0.0004984416345162315, "loss": 5.7889, "mean_token_accuracy": 0.14537926837801934, "num_tokens": 10004249.0, "step": 5425 }, { "entropy": 5.8457417488098145, "epoch": 0.4562066792690611, "grad_norm": 1.3359375, "learning_rate": 0.0004984381140830605, "loss": 5.7485, "mean_token_accuracy": 0.14723600521683694, "num_tokens": 10012430.0, "step": 5430 }, { "entropy": 5.878772354125976, "epoch": 0.456626759084226, "grad_norm": 1.1875, "learning_rate": 0.0004984345896917984, "loss": 5.7605, "mean_token_accuracy": 0.14340553283691407, "num_tokens": 10021434.0, "step": 5435 }, { "entropy": 5.859716320037842, "epoch": 0.4570468388993909, "grad_norm": 1.3359375, "learning_rate": 0.0004984310613425076, "loss": 5.7662, "mean_token_accuracy": 0.1505170688033104, "num_tokens": 10030473.0, "step": 5440 }, { "entropy": 5.890053796768188, "epoch": 0.45746691871455575, "grad_norm": 1.3984375, "learning_rate": 0.0004984275290352506, "loss": 5.7347, "mean_token_accuracy": 0.1503530338406563, "num_tokens": 10039057.0, "step": 5445 }, { "entropy": 5.906252813339234, "epoch": 0.45788699852972065, "grad_norm": 1.3046875, "learning_rate": 0.0004984239927700899, "loss": 5.8309, "mean_token_accuracy": 0.14800925105810164, "num_tokens": 10047998.0, "step": 5450 }, { "entropy": 5.96235499382019, "epoch": 0.45830707834488554, "grad_norm": 1.1796875, "learning_rate": 0.0004984204525470883, "loss": 5.7626, "mean_token_accuracy": 0.14305243864655495, "num_tokens": 10057479.0, "step": 5455 }, { "entropy": 5.773991537094116, "epoch": 0.4587271581600504, "grad_norm": 1.484375, "learning_rate": 0.0004984169083663084, "loss": 5.7318, "mean_token_accuracy": 0.14002140685915948, "num_tokens": 10067754.0, "step": 5460 }, { "entropy": 5.805001163482666, "epoch": 0.4591472379752153, "grad_norm": 1.1484375, "learning_rate": 0.0004984133602278129, "loss": 5.8253, "mean_token_accuracy": 0.1421283006668091, "num_tokens": 10076815.0, "step": 5465 }, { "entropy": 6.033328580856323, "epoch": 0.4595673177903802, "grad_norm": 1.25, "learning_rate": 0.000498409808131665, "loss": 5.8269, "mean_token_accuracy": 0.14671371206641198, "num_tokens": 10086300.0, "step": 5470 }, { "entropy": 5.823101377487182, "epoch": 0.4599873976055451, "grad_norm": 1.296875, "learning_rate": 0.0004984062520779272, "loss": 5.7259, "mean_token_accuracy": 0.1552243560552597, "num_tokens": 10095383.0, "step": 5475 }, { "entropy": 5.773621034622193, "epoch": 0.4604074774207099, "grad_norm": 1.296875, "learning_rate": 0.0004984026920666628, "loss": 5.7019, "mean_token_accuracy": 0.1514463573694229, "num_tokens": 10103971.0, "step": 5480 }, { "entropy": 5.798014068603516, "epoch": 0.4608275572358748, "grad_norm": 1.1484375, "learning_rate": 0.0004983991280979347, "loss": 5.6971, "mean_token_accuracy": 0.1502104952931404, "num_tokens": 10113028.0, "step": 5485 }, { "entropy": 5.823189973831177, "epoch": 0.4612476370510397, "grad_norm": 1.2734375, "learning_rate": 0.0004983955601718061, "loss": 5.6819, "mean_token_accuracy": 0.14814986884593964, "num_tokens": 10121890.0, "step": 5490 }, { "entropy": 5.896232748031617, "epoch": 0.46166771686620456, "grad_norm": 1.3984375, "learning_rate": 0.0004983919882883401, "loss": 5.8089, "mean_token_accuracy": 0.1452305495738983, "num_tokens": 10131655.0, "step": 5495 }, { "entropy": 5.876237583160401, "epoch": 0.46208779668136946, "grad_norm": 1.296875, "learning_rate": 0.0004983884124476, "loss": 5.8051, "mean_token_accuracy": 0.14433109760284424, "num_tokens": 10140778.0, "step": 5500 }, { "entropy": 5.897982120513916, "epoch": 0.46250787649653435, "grad_norm": 1.2421875, "learning_rate": 0.0004983848326496494, "loss": 5.8699, "mean_token_accuracy": 0.1391661711037159, "num_tokens": 10150229.0, "step": 5505 }, { "entropy": 5.943829345703125, "epoch": 0.4629279563116992, "grad_norm": 1.2421875, "learning_rate": 0.0004983812488945513, "loss": 5.7502, "mean_token_accuracy": 0.14314467534422876, "num_tokens": 10158939.0, "step": 5510 }, { "entropy": 5.819750833511352, "epoch": 0.4633480361268641, "grad_norm": 1.1796875, "learning_rate": 0.0004983776611823696, "loss": 5.7489, "mean_token_accuracy": 0.14325918182730674, "num_tokens": 10168383.0, "step": 5515 }, { "entropy": 5.7525170803070065, "epoch": 0.463768115942029, "grad_norm": 1.4765625, "learning_rate": 0.0004983740695131676, "loss": 5.7483, "mean_token_accuracy": 0.1506567046046257, "num_tokens": 10178678.0, "step": 5520 }, { "entropy": 5.8393933296203615, "epoch": 0.4641881957571939, "grad_norm": 1.453125, "learning_rate": 0.000498370473887009, "loss": 5.7404, "mean_token_accuracy": 0.1451387256383896, "num_tokens": 10188964.0, "step": 5525 }, { "entropy": 5.9242652416229244, "epoch": 0.46460827557235873, "grad_norm": 1.34375, "learning_rate": 0.0004983668743039573, "loss": 5.7722, "mean_token_accuracy": 0.15323825627565385, "num_tokens": 10198333.0, "step": 5530 }, { "entropy": 5.789677238464355, "epoch": 0.46502835538752363, "grad_norm": 1.3984375, "learning_rate": 0.0004983632707640766, "loss": 5.7876, "mean_token_accuracy": 0.14813560321927072, "num_tokens": 10207876.0, "step": 5535 }, { "entropy": 5.812788200378418, "epoch": 0.4654484352026885, "grad_norm": 1.2265625, "learning_rate": 0.0004983596632674306, "loss": 5.7229, "mean_token_accuracy": 0.14903474599123, "num_tokens": 10216822.0, "step": 5540 }, { "entropy": 5.883552932739258, "epoch": 0.46586851501785337, "grad_norm": 1.2890625, "learning_rate": 0.0004983560518140831, "loss": 5.8344, "mean_token_accuracy": 0.139993616938591, "num_tokens": 10226887.0, "step": 5545 }, { "entropy": 5.850424337387085, "epoch": 0.46628859483301827, "grad_norm": 1.34375, "learning_rate": 0.0004983524364040982, "loss": 5.7004, "mean_token_accuracy": 0.1548854097723961, "num_tokens": 10235935.0, "step": 5550 }, { "entropy": 5.844246101379395, "epoch": 0.46670867464818316, "grad_norm": 1.2890625, "learning_rate": 0.0004983488170375399, "loss": 5.6405, "mean_token_accuracy": 0.1503463476896286, "num_tokens": 10245590.0, "step": 5555 }, { "entropy": 5.735381555557251, "epoch": 0.46712875446334806, "grad_norm": 1.2421875, "learning_rate": 0.0004983451937144723, "loss": 5.7345, "mean_token_accuracy": 0.1456381857395172, "num_tokens": 10255104.0, "step": 5560 }, { "entropy": 5.7118124008178714, "epoch": 0.4675488342785129, "grad_norm": 1.265625, "learning_rate": 0.0004983415664349595, "loss": 5.6004, "mean_token_accuracy": 0.16290194243192674, "num_tokens": 10264236.0, "step": 5565 }, { "entropy": 5.817228507995606, "epoch": 0.4679689140936778, "grad_norm": 1.4453125, "learning_rate": 0.0004983379351990659, "loss": 5.7056, "mean_token_accuracy": 0.1503439575433731, "num_tokens": 10273335.0, "step": 5570 }, { "entropy": 5.7475629329681395, "epoch": 0.4683889939088427, "grad_norm": 1.546875, "learning_rate": 0.0004983343000068559, "loss": 5.6682, "mean_token_accuracy": 0.1495598793029785, "num_tokens": 10282206.0, "step": 5575 }, { "entropy": 5.688462829589843, "epoch": 0.46880907372400754, "grad_norm": 1.4296875, "learning_rate": 0.0004983306608583937, "loss": 5.6189, "mean_token_accuracy": 0.16340474039316177, "num_tokens": 10290056.0, "step": 5580 }, { "entropy": 5.7730052947998045, "epoch": 0.46922915353917244, "grad_norm": 1.25, "learning_rate": 0.0004983270177537438, "loss": 5.7028, "mean_token_accuracy": 0.14809525161981582, "num_tokens": 10299726.0, "step": 5585 }, { "entropy": 5.84525089263916, "epoch": 0.46964923335433734, "grad_norm": 1.3984375, "learning_rate": 0.0004983233706929708, "loss": 5.7725, "mean_token_accuracy": 0.1471342384815216, "num_tokens": 10308696.0, "step": 5590 }, { "entropy": 5.880400562286377, "epoch": 0.4700693131695022, "grad_norm": 1.3046875, "learning_rate": 0.0004983197196761392, "loss": 5.8412, "mean_token_accuracy": 0.14054280817508696, "num_tokens": 10317845.0, "step": 5595 }, { "entropy": 5.84756875038147, "epoch": 0.4704893929846671, "grad_norm": 1.3828125, "learning_rate": 0.0004983160647033139, "loss": 5.737, "mean_token_accuracy": 0.150573068857193, "num_tokens": 10326563.0, "step": 5600 }, { "entropy": 5.826395320892334, "epoch": 0.470909472799832, "grad_norm": 1.2265625, "learning_rate": 0.0004983124057745595, "loss": 5.7235, "mean_token_accuracy": 0.14374103918671607, "num_tokens": 10335931.0, "step": 5605 }, { "entropy": 5.76983675956726, "epoch": 0.47132955261499687, "grad_norm": 1.3359375, "learning_rate": 0.0004983087428899408, "loss": 5.7216, "mean_token_accuracy": 0.1377339854836464, "num_tokens": 10344984.0, "step": 5610 }, { "entropy": 5.842723369598389, "epoch": 0.4717496324301617, "grad_norm": 1.3203125, "learning_rate": 0.0004983050760495227, "loss": 5.7638, "mean_token_accuracy": 0.14885966181755067, "num_tokens": 10353522.0, "step": 5615 }, { "entropy": 5.915482044219971, "epoch": 0.4721697122453266, "grad_norm": 1.2890625, "learning_rate": 0.0004983014052533702, "loss": 5.7678, "mean_token_accuracy": 0.14949656873941422, "num_tokens": 10363527.0, "step": 5620 }, { "entropy": 5.765365362167358, "epoch": 0.4725897920604915, "grad_norm": 1.1796875, "learning_rate": 0.0004982977305015481, "loss": 5.6942, "mean_token_accuracy": 0.1467475950717926, "num_tokens": 10372040.0, "step": 5625 }, { "entropy": 5.808851623535157, "epoch": 0.47300987187565635, "grad_norm": 1.234375, "learning_rate": 0.0004982940517941219, "loss": 5.6732, "mean_token_accuracy": 0.14801965281367302, "num_tokens": 10381279.0, "step": 5630 }, { "entropy": 5.891337108612061, "epoch": 0.47342995169082125, "grad_norm": 1.3359375, "learning_rate": 0.0004982903691311564, "loss": 5.8457, "mean_token_accuracy": 0.1401650868356228, "num_tokens": 10390608.0, "step": 5635 }, { "entropy": 5.811560487747192, "epoch": 0.47385003150598615, "grad_norm": 1.4765625, "learning_rate": 0.0004982866825127172, "loss": 5.6437, "mean_token_accuracy": 0.1533919870853424, "num_tokens": 10399851.0, "step": 5640 }, { "entropy": 5.952455997467041, "epoch": 0.47427011132115104, "grad_norm": 1.296875, "learning_rate": 0.0004982829919388692, "loss": 5.9303, "mean_token_accuracy": 0.1413193352520466, "num_tokens": 10410425.0, "step": 5645 }, { "entropy": 5.829264545440674, "epoch": 0.4746901911363159, "grad_norm": 1.2265625, "learning_rate": 0.0004982792974096781, "loss": 5.6844, "mean_token_accuracy": 0.15058013647794724, "num_tokens": 10418783.0, "step": 5650 }, { "entropy": 5.883219861984253, "epoch": 0.4751102709514808, "grad_norm": 1.375, "learning_rate": 0.000498275598925209, "loss": 5.8575, "mean_token_accuracy": 0.14019499495625495, "num_tokens": 10427360.0, "step": 5655 }, { "entropy": 5.982011365890503, "epoch": 0.4755303507666457, "grad_norm": 1.2265625, "learning_rate": 0.0004982718964855277, "loss": 5.8116, "mean_token_accuracy": 0.14399669840931892, "num_tokens": 10436613.0, "step": 5660 }, { "entropy": 5.872733783721924, "epoch": 0.4759504305818105, "grad_norm": 1.265625, "learning_rate": 0.0004982681900907, "loss": 5.8526, "mean_token_accuracy": 0.1458025962114334, "num_tokens": 10445055.0, "step": 5665 }, { "entropy": 5.826623582839966, "epoch": 0.4763705103969754, "grad_norm": 1.1796875, "learning_rate": 0.000498264479740791, "loss": 5.6666, "mean_token_accuracy": 0.15394981056451798, "num_tokens": 10454516.0, "step": 5670 }, { "entropy": 5.948064708709717, "epoch": 0.4767905902121403, "grad_norm": 1.1953125, "learning_rate": 0.0004982607654358668, "loss": 5.8096, "mean_token_accuracy": 0.147859063744545, "num_tokens": 10463771.0, "step": 5675 }, { "entropy": 5.835044527053833, "epoch": 0.47721067002730516, "grad_norm": 1.15625, "learning_rate": 0.000498257047175993, "loss": 5.7488, "mean_token_accuracy": 0.142615008354187, "num_tokens": 10473783.0, "step": 5680 }, { "entropy": 5.83440375328064, "epoch": 0.47763074984247006, "grad_norm": 1.234375, "learning_rate": 0.0004982533249612357, "loss": 5.6997, "mean_token_accuracy": 0.14993957430124283, "num_tokens": 10483424.0, "step": 5685 }, { "entropy": 5.763900947570801, "epoch": 0.47805082965763496, "grad_norm": 1.171875, "learning_rate": 0.0004982495987916607, "loss": 5.6455, "mean_token_accuracy": 0.15347654670476912, "num_tokens": 10492536.0, "step": 5690 }, { "entropy": 5.8370520114898685, "epoch": 0.47847090947279985, "grad_norm": 1.4296875, "learning_rate": 0.0004982458686673339, "loss": 5.7578, "mean_token_accuracy": 0.14936625212430954, "num_tokens": 10501616.0, "step": 5695 }, { "entropy": 5.956824541091919, "epoch": 0.4788909892879647, "grad_norm": 1.3984375, "learning_rate": 0.0004982421345883217, "loss": 5.8031, "mean_token_accuracy": 0.14071496576070786, "num_tokens": 10511190.0, "step": 5700 }, { "entropy": 5.793789196014404, "epoch": 0.4793110691031296, "grad_norm": 1.4453125, "learning_rate": 0.0004982383965546898, "loss": 5.7381, "mean_token_accuracy": 0.144473847001791, "num_tokens": 10520310.0, "step": 5705 }, { "entropy": 5.833015632629395, "epoch": 0.4797311489182945, "grad_norm": 1.1875, "learning_rate": 0.0004982346545665048, "loss": 5.6941, "mean_token_accuracy": 0.1467716298997402, "num_tokens": 10528711.0, "step": 5710 }, { "entropy": 5.8455291271209715, "epoch": 0.48015122873345933, "grad_norm": 1.4609375, "learning_rate": 0.0004982309086238328, "loss": 5.8016, "mean_token_accuracy": 0.14259516224265098, "num_tokens": 10538484.0, "step": 5715 }, { "entropy": 5.898940181732177, "epoch": 0.48057130854862423, "grad_norm": 1.21875, "learning_rate": 0.0004982271587267403, "loss": 5.747, "mean_token_accuracy": 0.14794613867998124, "num_tokens": 10547623.0, "step": 5720 }, { "entropy": 5.868904733657837, "epoch": 0.48099138836378913, "grad_norm": 1.2109375, "learning_rate": 0.0004982234048752935, "loss": 5.6997, "mean_token_accuracy": 0.14849727526307105, "num_tokens": 10556234.0, "step": 5725 }, { "entropy": 5.9389198303222654, "epoch": 0.481411468178954, "grad_norm": 1.578125, "learning_rate": 0.000498219647069559, "loss": 5.9273, "mean_token_accuracy": 0.13982586190104485, "num_tokens": 10566308.0, "step": 5730 }, { "entropy": 5.836957883834839, "epoch": 0.48183154799411887, "grad_norm": 1.5234375, "learning_rate": 0.0004982158853096035, "loss": 5.8519, "mean_token_accuracy": 0.1417085811495781, "num_tokens": 10575212.0, "step": 5735 }, { "entropy": 5.8836267471313475, "epoch": 0.48225162780928377, "grad_norm": 1.265625, "learning_rate": 0.0004982121195954935, "loss": 5.6287, "mean_token_accuracy": 0.15638786405324936, "num_tokens": 10584590.0, "step": 5740 }, { "entropy": 5.817459297180176, "epoch": 0.48267170762444866, "grad_norm": 1.359375, "learning_rate": 0.0004982083499272957, "loss": 5.7007, "mean_token_accuracy": 0.14900539070367813, "num_tokens": 10593997.0, "step": 5745 }, { "entropy": 5.799760389328003, "epoch": 0.4830917874396135, "grad_norm": 1.2890625, "learning_rate": 0.0004982045763050768, "loss": 5.8291, "mean_token_accuracy": 0.1467505380511284, "num_tokens": 10603299.0, "step": 5750 }, { "entropy": 5.825570392608642, "epoch": 0.4835118672547784, "grad_norm": 1.296875, "learning_rate": 0.0004982007987289041, "loss": 5.7641, "mean_token_accuracy": 0.14574431553483008, "num_tokens": 10613546.0, "step": 5755 }, { "entropy": 5.833213567733765, "epoch": 0.4839319470699433, "grad_norm": 1.359375, "learning_rate": 0.0004981970171988439, "loss": 5.7267, "mean_token_accuracy": 0.15680563673377038, "num_tokens": 10622966.0, "step": 5760 }, { "entropy": 5.918120956420898, "epoch": 0.48435202688510814, "grad_norm": 1.984375, "learning_rate": 0.0004981932317149636, "loss": 5.8074, "mean_token_accuracy": 0.14230270087718963, "num_tokens": 10633441.0, "step": 5765 }, { "entropy": 5.926499748229981, "epoch": 0.48477210670027304, "grad_norm": 1.296875, "learning_rate": 0.00049818944227733, "loss": 5.7829, "mean_token_accuracy": 0.145944182574749, "num_tokens": 10643124.0, "step": 5770 }, { "entropy": 5.8368360042572025, "epoch": 0.48519218651543794, "grad_norm": 1.46875, "learning_rate": 0.0004981856488860105, "loss": 5.75, "mean_token_accuracy": 0.14405592083930968, "num_tokens": 10652517.0, "step": 5775 }, { "entropy": 5.827040672302246, "epoch": 0.48561226633060284, "grad_norm": 1.34375, "learning_rate": 0.0004981818515410721, "loss": 5.8018, "mean_token_accuracy": 0.14195797815918923, "num_tokens": 10663352.0, "step": 5780 }, { "entropy": 5.911312675476074, "epoch": 0.4860323461457677, "grad_norm": 1.3046875, "learning_rate": 0.0004981780502425821, "loss": 5.8228, "mean_token_accuracy": 0.14514586478471755, "num_tokens": 10672430.0, "step": 5785 }, { "entropy": 5.858085298538208, "epoch": 0.4864524259609326, "grad_norm": 1.5390625, "learning_rate": 0.0004981742449906079, "loss": 5.7778, "mean_token_accuracy": 0.15105650201439857, "num_tokens": 10681908.0, "step": 5790 }, { "entropy": 5.876479959487915, "epoch": 0.4868725057760975, "grad_norm": 1.359375, "learning_rate": 0.0004981704357852168, "loss": 5.7501, "mean_token_accuracy": 0.1459008663892746, "num_tokens": 10691259.0, "step": 5795 }, { "entropy": 5.803030967712402, "epoch": 0.4872925855912623, "grad_norm": 1.515625, "learning_rate": 0.0004981666226264764, "loss": 5.6514, "mean_token_accuracy": 0.14785986095666886, "num_tokens": 10699668.0, "step": 5800 }, { "entropy": 5.827937030792237, "epoch": 0.4877126654064272, "grad_norm": 1.4296875, "learning_rate": 0.0004981628055144542, "loss": 5.7065, "mean_token_accuracy": 0.15127545595169067, "num_tokens": 10709146.0, "step": 5805 }, { "entropy": 5.876874828338623, "epoch": 0.4881327452215921, "grad_norm": 1.140625, "learning_rate": 0.0004981589844492177, "loss": 5.8008, "mean_token_accuracy": 0.13951031863689423, "num_tokens": 10718724.0, "step": 5810 }, { "entropy": 5.814950895309448, "epoch": 0.488552825036757, "grad_norm": 1.5625, "learning_rate": 0.0004981551594308349, "loss": 5.7424, "mean_token_accuracy": 0.14747670367360116, "num_tokens": 10728101.0, "step": 5815 }, { "entropy": 5.938137483596802, "epoch": 0.48897290485192185, "grad_norm": 1.3046875, "learning_rate": 0.0004981513304593733, "loss": 5.7721, "mean_token_accuracy": 0.15057093650102615, "num_tokens": 10736750.0, "step": 5820 }, { "entropy": 5.9004603862762455, "epoch": 0.48939298466708675, "grad_norm": 1.2265625, "learning_rate": 0.0004981474975349006, "loss": 5.9573, "mean_token_accuracy": 0.143083293735981, "num_tokens": 10746914.0, "step": 5825 }, { "entropy": 5.944899702072144, "epoch": 0.48981306448225165, "grad_norm": 1.3515625, "learning_rate": 0.000498143660657485, "loss": 5.7841, "mean_token_accuracy": 0.14469311460852624, "num_tokens": 10755786.0, "step": 5830 }, { "entropy": 5.719291877746582, "epoch": 0.4902331442974165, "grad_norm": 1.21875, "learning_rate": 0.0004981398198271944, "loss": 5.6544, "mean_token_accuracy": 0.15054057389497758, "num_tokens": 10764821.0, "step": 5835 }, { "entropy": 5.821346855163574, "epoch": 0.4906532241125814, "grad_norm": 1.5, "learning_rate": 0.0004981359750440968, "loss": 5.7381, "mean_token_accuracy": 0.14619418531656264, "num_tokens": 10773569.0, "step": 5840 }, { "entropy": 5.812557601928711, "epoch": 0.4910733039277463, "grad_norm": 1.265625, "learning_rate": 0.0004981321263082603, "loss": 5.7233, "mean_token_accuracy": 0.14379709362983703, "num_tokens": 10782298.0, "step": 5845 }, { "entropy": 5.7633030891418455, "epoch": 0.4914933837429111, "grad_norm": 1.3125, "learning_rate": 0.000498128273619753, "loss": 5.6964, "mean_token_accuracy": 0.15067172646522523, "num_tokens": 10792087.0, "step": 5850 }, { "entropy": 5.826433086395264, "epoch": 0.491913463558076, "grad_norm": 1.28125, "learning_rate": 0.0004981244169786433, "loss": 5.7863, "mean_token_accuracy": 0.14527801647782326, "num_tokens": 10801641.0, "step": 5855 }, { "entropy": 5.962628364562988, "epoch": 0.4923335433732409, "grad_norm": 1.2578125, "learning_rate": 0.0004981205563849994, "loss": 5.8636, "mean_token_accuracy": 0.1445979543030262, "num_tokens": 10811612.0, "step": 5860 }, { "entropy": 5.84666166305542, "epoch": 0.4927536231884058, "grad_norm": 1.3203125, "learning_rate": 0.0004981166918388897, "loss": 5.6721, "mean_token_accuracy": 0.1496157467365265, "num_tokens": 10821608.0, "step": 5865 }, { "entropy": 5.758074522018433, "epoch": 0.49317370300357066, "grad_norm": 1.4609375, "learning_rate": 0.0004981128233403828, "loss": 5.6341, "mean_token_accuracy": 0.15541895031929015, "num_tokens": 10830679.0, "step": 5870 }, { "entropy": 5.810383653640747, "epoch": 0.49359378281873556, "grad_norm": 1.2109375, "learning_rate": 0.000498108950889547, "loss": 5.7028, "mean_token_accuracy": 0.15059976279735565, "num_tokens": 10839669.0, "step": 5875 }, { "entropy": 5.813056564331054, "epoch": 0.49401386263390046, "grad_norm": 1.25, "learning_rate": 0.0004981050744864512, "loss": 5.6876, "mean_token_accuracy": 0.14685238003730774, "num_tokens": 10849666.0, "step": 5880 }, { "entropy": 5.78202338218689, "epoch": 0.4944339424490653, "grad_norm": 1.3828125, "learning_rate": 0.0004981011941311638, "loss": 5.6093, "mean_token_accuracy": 0.1536119759082794, "num_tokens": 10858225.0, "step": 5885 }, { "entropy": 5.7550591945648195, "epoch": 0.4948540222642302, "grad_norm": 1.3984375, "learning_rate": 0.0004980973098237535, "loss": 5.7246, "mean_token_accuracy": 0.14252085834741593, "num_tokens": 10867466.0, "step": 5890 }, { "entropy": 5.849875020980835, "epoch": 0.4952741020793951, "grad_norm": 1.3671875, "learning_rate": 0.0004980934215642894, "loss": 5.7463, "mean_token_accuracy": 0.151506906747818, "num_tokens": 10875850.0, "step": 5895 }, { "entropy": 5.780202579498291, "epoch": 0.49569418189456, "grad_norm": 1.375, "learning_rate": 0.00049808952935284, "loss": 5.6809, "mean_token_accuracy": 0.15422153174877168, "num_tokens": 10885154.0, "step": 5900 }, { "entropy": 5.7728334903717045, "epoch": 0.49611426170972484, "grad_norm": 1.3828125, "learning_rate": 0.0004980856331894747, "loss": 5.7714, "mean_token_accuracy": 0.14351727366447448, "num_tokens": 10894080.0, "step": 5905 }, { "entropy": 5.794958066940308, "epoch": 0.49653434152488973, "grad_norm": 1.3671875, "learning_rate": 0.0004980817330742621, "loss": 5.7728, "mean_token_accuracy": 0.1406318761408329, "num_tokens": 10903248.0, "step": 5910 }, { "entropy": 5.890414190292359, "epoch": 0.49695442134005463, "grad_norm": 1.234375, "learning_rate": 0.0004980778290072716, "loss": 5.7344, "mean_token_accuracy": 0.1520361930131912, "num_tokens": 10912939.0, "step": 5915 }, { "entropy": 5.844255971908569, "epoch": 0.4973745011552195, "grad_norm": 1.484375, "learning_rate": 0.0004980739209885722, "loss": 5.7519, "mean_token_accuracy": 0.14798953309655188, "num_tokens": 10921505.0, "step": 5920 }, { "entropy": 5.894140291213989, "epoch": 0.49779458097038437, "grad_norm": 1.3359375, "learning_rate": 0.0004980700090182331, "loss": 5.8334, "mean_token_accuracy": 0.14881108254194259, "num_tokens": 10931861.0, "step": 5925 }, { "entropy": 5.870219659805298, "epoch": 0.49821466078554927, "grad_norm": 1.671875, "learning_rate": 0.0004980660930963238, "loss": 5.7625, "mean_token_accuracy": 0.14495279788970947, "num_tokens": 10940810.0, "step": 5930 }, { "entropy": 5.808070087432862, "epoch": 0.4986347406007141, "grad_norm": 1.2890625, "learning_rate": 0.0004980621732229133, "loss": 5.6263, "mean_token_accuracy": 0.15171189308166505, "num_tokens": 10949514.0, "step": 5935 }, { "entropy": 5.853536224365234, "epoch": 0.499054820415879, "grad_norm": 1.6015625, "learning_rate": 0.0004980582493980714, "loss": 5.8402, "mean_token_accuracy": 0.13668815642595292, "num_tokens": 10959161.0, "step": 5940 }, { "entropy": 5.811306715011597, "epoch": 0.4994749002310439, "grad_norm": 1.2578125, "learning_rate": 0.0004980543216218674, "loss": 5.7084, "mean_token_accuracy": 0.1605042815208435, "num_tokens": 10968983.0, "step": 5945 }, { "entropy": 5.838724660873413, "epoch": 0.4998949800462088, "grad_norm": 1.3203125, "learning_rate": 0.0004980503898943711, "loss": 5.8486, "mean_token_accuracy": 0.14541933685541153, "num_tokens": 10978044.0, "step": 5950 }, { "entropy": 5.919149684906006, "epoch": 0.5003150598613737, "grad_norm": 1.3046875, "learning_rate": 0.0004980464542156519, "loss": 5.7474, "mean_token_accuracy": 0.15162651985883713, "num_tokens": 10986980.0, "step": 5955 }, { "entropy": 5.8385172367095945, "epoch": 0.5007351396765385, "grad_norm": 1.4453125, "learning_rate": 0.0004980425145857796, "loss": 5.6939, "mean_token_accuracy": 0.15786231756210328, "num_tokens": 10995163.0, "step": 5960 }, { "entropy": 5.755066156387329, "epoch": 0.5011552194917034, "grad_norm": 1.375, "learning_rate": 0.000498038571004824, "loss": 5.6211, "mean_token_accuracy": 0.159263913333416, "num_tokens": 11003722.0, "step": 5965 }, { "entropy": 5.732334613800049, "epoch": 0.5015752993068683, "grad_norm": 1.28125, "learning_rate": 0.0004980346234728549, "loss": 5.6829, "mean_token_accuracy": 0.15636452287435532, "num_tokens": 11013176.0, "step": 5970 }, { "entropy": 5.856866264343262, "epoch": 0.5019953791220332, "grad_norm": 1.484375, "learning_rate": 0.0004980306719899424, "loss": 5.7417, "mean_token_accuracy": 0.1482336312532425, "num_tokens": 11022636.0, "step": 5975 }, { "entropy": 5.81472544670105, "epoch": 0.5024154589371981, "grad_norm": 1.3359375, "learning_rate": 0.0004980267165561564, "loss": 5.6994, "mean_token_accuracy": 0.15061589032411576, "num_tokens": 11031896.0, "step": 5980 }, { "entropy": 5.8317889213562015, "epoch": 0.502835538752363, "grad_norm": 1.3203125, "learning_rate": 0.0004980227571715669, "loss": 5.7442, "mean_token_accuracy": 0.14868111461400985, "num_tokens": 11040802.0, "step": 5985 }, { "entropy": 5.817817497253418, "epoch": 0.5032556185675279, "grad_norm": 1.7265625, "learning_rate": 0.0004980187938362441, "loss": 5.6616, "mean_token_accuracy": 0.14449788331985475, "num_tokens": 11049701.0, "step": 5990 }, { "entropy": 5.8403524398803714, "epoch": 0.5036756983826927, "grad_norm": 1.265625, "learning_rate": 0.0004980148265502581, "loss": 5.8553, "mean_token_accuracy": 0.1392398163676262, "num_tokens": 11059555.0, "step": 5995 }, { "entropy": 5.883025121688843, "epoch": 0.5040957781978576, "grad_norm": 1.3359375, "learning_rate": 0.0004980108553136795, "loss": 5.7762, "mean_token_accuracy": 0.14863402545452117, "num_tokens": 11068940.0, "step": 6000 }, { "epoch": 0.5040957781978576, "eval_entropy": 5.732787127158954, "eval_loss": 5.7686614990234375, "eval_mean_token_accuracy": 0.15331337192289018, "eval_num_tokens": 11068940.0, "eval_runtime": 27.3892, "eval_samples_per_second": 1364.261, "eval_steps_per_second": 170.542, "step": 6000 }, { "entropy": 5.908424186706543, "epoch": 0.5045158580130225, "grad_norm": 1.3046875, "learning_rate": 0.0004980068801265783, "loss": 5.7414, "mean_token_accuracy": 0.14692858532071112, "num_tokens": 11079014.0, "step": 6005 }, { "entropy": 5.866373205184937, "epoch": 0.5049359378281874, "grad_norm": 1.375, "learning_rate": 0.0004980029009890251, "loss": 5.8378, "mean_token_accuracy": 0.1466228261590004, "num_tokens": 11089526.0, "step": 6010 }, { "entropy": 5.839123296737671, "epoch": 0.5053560176433523, "grad_norm": 1.421875, "learning_rate": 0.0004979989179010904, "loss": 5.7197, "mean_token_accuracy": 0.15178524404764177, "num_tokens": 11099156.0, "step": 6015 }, { "entropy": 5.760820007324218, "epoch": 0.5057760974585171, "grad_norm": 1.359375, "learning_rate": 0.0004979949308628445, "loss": 5.7078, "mean_token_accuracy": 0.15017148554325105, "num_tokens": 11108242.0, "step": 6020 }, { "entropy": 5.7764500141143795, "epoch": 0.506196177273682, "grad_norm": 1.203125, "learning_rate": 0.0004979909398743584, "loss": 5.7066, "mean_token_accuracy": 0.15099107772111892, "num_tokens": 11118076.0, "step": 6025 }, { "entropy": 5.893146562576294, "epoch": 0.5066162570888468, "grad_norm": 1.1875, "learning_rate": 0.0004979869449357026, "loss": 5.7766, "mean_token_accuracy": 0.15781906694173814, "num_tokens": 11127265.0, "step": 6030 }, { "entropy": 5.810907363891602, "epoch": 0.5070363369040117, "grad_norm": 1.3203125, "learning_rate": 0.0004979829460469478, "loss": 5.6965, "mean_token_accuracy": 0.1483650103211403, "num_tokens": 11136429.0, "step": 6035 }, { "entropy": 5.813454437255859, "epoch": 0.5074564167191766, "grad_norm": 1.328125, "learning_rate": 0.0004979789432081649, "loss": 5.7139, "mean_token_accuracy": 0.1487409368157387, "num_tokens": 11146201.0, "step": 6040 }, { "entropy": 5.864733123779297, "epoch": 0.5078764965343415, "grad_norm": 1.3984375, "learning_rate": 0.000497974936419425, "loss": 5.7222, "mean_token_accuracy": 0.15236361622810363, "num_tokens": 11154867.0, "step": 6045 }, { "entropy": 5.746392869949341, "epoch": 0.5082965763495064, "grad_norm": 1.4453125, "learning_rate": 0.0004979709256807989, "loss": 5.758, "mean_token_accuracy": 0.1480425164103508, "num_tokens": 11164092.0, "step": 6050 }, { "entropy": 5.840289688110351, "epoch": 0.5087166561646713, "grad_norm": 1.3671875, "learning_rate": 0.0004979669109923575, "loss": 5.7754, "mean_token_accuracy": 0.14666769057512283, "num_tokens": 11173176.0, "step": 6055 }, { "entropy": 5.953520202636719, "epoch": 0.5091367359798362, "grad_norm": 1.171875, "learning_rate": 0.0004979628923541721, "loss": 5.7491, "mean_token_accuracy": 0.1458544984459877, "num_tokens": 11182397.0, "step": 6060 }, { "entropy": 5.871777105331421, "epoch": 0.509556815795001, "grad_norm": 1.296875, "learning_rate": 0.000497958869766314, "loss": 5.7938, "mean_token_accuracy": 0.14472762495279312, "num_tokens": 11191790.0, "step": 6065 }, { "entropy": 5.785938310623169, "epoch": 0.5099768956101659, "grad_norm": 1.328125, "learning_rate": 0.0004979548432288543, "loss": 5.7104, "mean_token_accuracy": 0.1533594697713852, "num_tokens": 11201104.0, "step": 6070 }, { "entropy": 5.850540256500244, "epoch": 0.5103969754253308, "grad_norm": 1.2734375, "learning_rate": 0.0004979508127418643, "loss": 5.7179, "mean_token_accuracy": 0.1509293831884861, "num_tokens": 11209578.0, "step": 6075 }, { "entropy": 5.824426078796387, "epoch": 0.5108170552404957, "grad_norm": 1.3515625, "learning_rate": 0.0004979467783054155, "loss": 5.6559, "mean_token_accuracy": 0.15454075038433074, "num_tokens": 11218380.0, "step": 6080 }, { "entropy": 5.734690237045288, "epoch": 0.5112371350556606, "grad_norm": 1.328125, "learning_rate": 0.0004979427399195793, "loss": 5.6795, "mean_token_accuracy": 0.1466882646083832, "num_tokens": 11227810.0, "step": 6085 }, { "entropy": 5.784052991867066, "epoch": 0.5116572148708255, "grad_norm": 1.3359375, "learning_rate": 0.0004979386975844274, "loss": 5.6925, "mean_token_accuracy": 0.1516873687505722, "num_tokens": 11236631.0, "step": 6090 }, { "entropy": 5.811602210998535, "epoch": 0.5120772946859904, "grad_norm": 1.125, "learning_rate": 0.0004979346513000311, "loss": 5.7643, "mean_token_accuracy": 0.14228157997131347, "num_tokens": 11247418.0, "step": 6095 }, { "entropy": 5.801711654663086, "epoch": 0.5124973745011552, "grad_norm": 1.2421875, "learning_rate": 0.0004979306010664623, "loss": 5.6482, "mean_token_accuracy": 0.15656405985355376, "num_tokens": 11256246.0, "step": 6100 }, { "entropy": 5.709601259231567, "epoch": 0.5129174543163201, "grad_norm": 1.1484375, "learning_rate": 0.0004979265468837927, "loss": 5.6377, "mean_token_accuracy": 0.15466838777065278, "num_tokens": 11265980.0, "step": 6105 }, { "entropy": 5.778408575057983, "epoch": 0.513337534131485, "grad_norm": 1.4375, "learning_rate": 0.000497922488752094, "loss": 5.6873, "mean_token_accuracy": 0.1463077425956726, "num_tokens": 11276158.0, "step": 6110 }, { "entropy": 5.757645797729492, "epoch": 0.5137576139466499, "grad_norm": 1.2265625, "learning_rate": 0.0004979184266714383, "loss": 5.6121, "mean_token_accuracy": 0.1554221287369728, "num_tokens": 11284957.0, "step": 6115 }, { "entropy": 5.694925689697266, "epoch": 0.5141776937618148, "grad_norm": 1.265625, "learning_rate": 0.0004979143606418974, "loss": 5.6283, "mean_token_accuracy": 0.1562877871096134, "num_tokens": 11294340.0, "step": 6120 }, { "entropy": 5.903133296966553, "epoch": 0.5145977735769797, "grad_norm": 1.375, "learning_rate": 0.0004979102906635435, "loss": 5.8808, "mean_token_accuracy": 0.14421921372413635, "num_tokens": 11303344.0, "step": 6125 }, { "entropy": 5.9017737865447994, "epoch": 0.5150178533921445, "grad_norm": 1.375, "learning_rate": 0.0004979062167364486, "loss": 5.7468, "mean_token_accuracy": 0.15465227216482164, "num_tokens": 11311338.0, "step": 6130 }, { "entropy": 5.760764503479004, "epoch": 0.5154379332073094, "grad_norm": 1.375, "learning_rate": 0.0004979021388606847, "loss": 5.5793, "mean_token_accuracy": 0.16053801253437996, "num_tokens": 11320194.0, "step": 6135 }, { "entropy": 5.783118629455567, "epoch": 0.5158580130224742, "grad_norm": 1.2578125, "learning_rate": 0.0004978980570363243, "loss": 5.7606, "mean_token_accuracy": 0.15072498917579652, "num_tokens": 11329952.0, "step": 6140 }, { "entropy": 5.807923793792725, "epoch": 0.5162780928376391, "grad_norm": 1.375, "learning_rate": 0.0004978939712634396, "loss": 5.7097, "mean_token_accuracy": 0.1485825777053833, "num_tokens": 11339384.0, "step": 6145 }, { "entropy": 5.927007532119751, "epoch": 0.516698172652804, "grad_norm": 1.2734375, "learning_rate": 0.0004978898815421029, "loss": 5.882, "mean_token_accuracy": 0.14463590383529662, "num_tokens": 11348409.0, "step": 6150 }, { "entropy": 5.948485612869263, "epoch": 0.5171182524679689, "grad_norm": 1.296875, "learning_rate": 0.0004978857878723867, "loss": 5.7826, "mean_token_accuracy": 0.1465214103460312, "num_tokens": 11357478.0, "step": 6155 }, { "entropy": 5.871764278411865, "epoch": 0.5175383322831338, "grad_norm": 1.46875, "learning_rate": 0.0004978816902543636, "loss": 5.7924, "mean_token_accuracy": 0.14824822992086412, "num_tokens": 11366379.0, "step": 6160 }, { "entropy": 5.857372522354126, "epoch": 0.5179584120982986, "grad_norm": 1.328125, "learning_rate": 0.0004978775886881062, "loss": 5.8228, "mean_token_accuracy": 0.144633187353611, "num_tokens": 11376357.0, "step": 6165 }, { "entropy": 5.790678644180298, "epoch": 0.5183784919134635, "grad_norm": 1.453125, "learning_rate": 0.000497873483173687, "loss": 5.682, "mean_token_accuracy": 0.1550826385617256, "num_tokens": 11384995.0, "step": 6170 }, { "entropy": 5.803675746917724, "epoch": 0.5187985717286284, "grad_norm": 1.328125, "learning_rate": 0.0004978693737111787, "loss": 5.691, "mean_token_accuracy": 0.14901078641414642, "num_tokens": 11395363.0, "step": 6175 }, { "entropy": 5.773939752578736, "epoch": 0.5192186515437933, "grad_norm": 1.3515625, "learning_rate": 0.0004978652603006543, "loss": 5.6785, "mean_token_accuracy": 0.14922358542680741, "num_tokens": 11404511.0, "step": 6180 }, { "entropy": 5.83831205368042, "epoch": 0.5196387313589582, "grad_norm": 1.25, "learning_rate": 0.0004978611429421866, "loss": 5.7376, "mean_token_accuracy": 0.14898759126663208, "num_tokens": 11413400.0, "step": 6185 }, { "entropy": 5.867534255981445, "epoch": 0.5200588111741231, "grad_norm": 1.28125, "learning_rate": 0.0004978570216358485, "loss": 5.7719, "mean_token_accuracy": 0.14096312299370767, "num_tokens": 11423693.0, "step": 6190 }, { "entropy": 5.85771164894104, "epoch": 0.520478890989288, "grad_norm": 1.3515625, "learning_rate": 0.000497852896381713, "loss": 5.7317, "mean_token_accuracy": 0.14528233110904692, "num_tokens": 11433195.0, "step": 6195 }, { "entropy": 5.8870384216308596, "epoch": 0.5208989708044528, "grad_norm": 1.3359375, "learning_rate": 0.0004978487671798531, "loss": 5.8604, "mean_token_accuracy": 0.13629197254776954, "num_tokens": 11443416.0, "step": 6200 }, { "entropy": 5.938678550720215, "epoch": 0.5213190506196177, "grad_norm": 1.421875, "learning_rate": 0.0004978446340303422, "loss": 5.7271, "mean_token_accuracy": 0.15116187259554864, "num_tokens": 11452487.0, "step": 6205 }, { "entropy": 5.809211301803589, "epoch": 0.5217391304347826, "grad_norm": 1.3984375, "learning_rate": 0.0004978404969332533, "loss": 5.7517, "mean_token_accuracy": 0.15704237520694733, "num_tokens": 11461893.0, "step": 6210 }, { "entropy": 5.73575005531311, "epoch": 0.5221592102499475, "grad_norm": 1.203125, "learning_rate": 0.0004978363558886597, "loss": 5.6754, "mean_token_accuracy": 0.14295373037457465, "num_tokens": 11471238.0, "step": 6215 }, { "entropy": 5.850252771377564, "epoch": 0.5225792900651124, "grad_norm": 1.28125, "learning_rate": 0.0004978322108966348, "loss": 5.7739, "mean_token_accuracy": 0.14141838401556014, "num_tokens": 11480571.0, "step": 6220 }, { "entropy": 5.817096996307373, "epoch": 0.5229993698802773, "grad_norm": 1.2734375, "learning_rate": 0.0004978280619572521, "loss": 5.7567, "mean_token_accuracy": 0.14793166518211365, "num_tokens": 11489552.0, "step": 6225 }, { "entropy": 5.864131927490234, "epoch": 0.5234194496954422, "grad_norm": 1.375, "learning_rate": 0.000497823909070585, "loss": 5.8087, "mean_token_accuracy": 0.1432569444179535, "num_tokens": 11498715.0, "step": 6230 }, { "entropy": 5.847290849685669, "epoch": 0.523839529510607, "grad_norm": 1.328125, "learning_rate": 0.0004978197522367071, "loss": 5.7472, "mean_token_accuracy": 0.14424416646361352, "num_tokens": 11508472.0, "step": 6235 }, { "entropy": 5.939693546295166, "epoch": 0.5242596093257719, "grad_norm": 1.234375, "learning_rate": 0.0004978155914556919, "loss": 5.6864, "mean_token_accuracy": 0.15637651830911636, "num_tokens": 11517620.0, "step": 6240 }, { "entropy": 5.744783592224121, "epoch": 0.5246796891409368, "grad_norm": 1.2890625, "learning_rate": 0.0004978114267276134, "loss": 5.7336, "mean_token_accuracy": 0.14782111793756486, "num_tokens": 11526106.0, "step": 6245 }, { "entropy": 5.853097581863404, "epoch": 0.5250997689561017, "grad_norm": 1.3515625, "learning_rate": 0.0004978072580525451, "loss": 5.7751, "mean_token_accuracy": 0.14963556379079818, "num_tokens": 11535840.0, "step": 6250 }, { "entropy": 5.883814191818237, "epoch": 0.5255198487712666, "grad_norm": 1.359375, "learning_rate": 0.000497803085430561, "loss": 5.7622, "mean_token_accuracy": 0.15003612414002418, "num_tokens": 11545110.0, "step": 6255 }, { "entropy": 5.879300594329834, "epoch": 0.5259399285864315, "grad_norm": 1.6328125, "learning_rate": 0.0004977989088617349, "loss": 5.7805, "mean_token_accuracy": 0.1432628057897091, "num_tokens": 11554382.0, "step": 6260 }, { "entropy": 5.77400393486023, "epoch": 0.5263600084015964, "grad_norm": 1.546875, "learning_rate": 0.000497794728346141, "loss": 5.632, "mean_token_accuracy": 0.1552414707839489, "num_tokens": 11562821.0, "step": 6265 }, { "entropy": 5.952142190933228, "epoch": 0.5267800882167611, "grad_norm": 1.34375, "learning_rate": 0.0004977905438838531, "loss": 5.8474, "mean_token_accuracy": 0.14172168597579002, "num_tokens": 11571705.0, "step": 6270 }, { "entropy": 5.71492829322815, "epoch": 0.527200168031926, "grad_norm": 2.296875, "learning_rate": 0.0004977863554749453, "loss": 5.6778, "mean_token_accuracy": 0.14525432735681534, "num_tokens": 11580692.0, "step": 6275 }, { "entropy": 5.727636861801147, "epoch": 0.5276202478470909, "grad_norm": 1.3359375, "learning_rate": 0.0004977821631194922, "loss": 5.686, "mean_token_accuracy": 0.14509947448968888, "num_tokens": 11589966.0, "step": 6280 }, { "entropy": 5.8679040431976315, "epoch": 0.5280403276622558, "grad_norm": 1.2578125, "learning_rate": 0.0004977779668175677, "loss": 5.7627, "mean_token_accuracy": 0.1469483494758606, "num_tokens": 11599627.0, "step": 6285 }, { "entropy": 5.856904077529907, "epoch": 0.5284604074774207, "grad_norm": 1.25, "learning_rate": 0.0004977737665692461, "loss": 5.7366, "mean_token_accuracy": 0.15558115839958192, "num_tokens": 11608431.0, "step": 6290 }, { "entropy": 5.841502332687378, "epoch": 0.5288804872925856, "grad_norm": 1.28125, "learning_rate": 0.0004977695623746021, "loss": 5.6142, "mean_token_accuracy": 0.14905260503292084, "num_tokens": 11617552.0, "step": 6295 }, { "entropy": 5.712338972091675, "epoch": 0.5293005671077504, "grad_norm": 1.65625, "learning_rate": 0.0004977653542337099, "loss": 5.6645, "mean_token_accuracy": 0.15581920593976975, "num_tokens": 11626828.0, "step": 6300 }, { "entropy": 5.804640913009644, "epoch": 0.5297206469229153, "grad_norm": 1.4140625, "learning_rate": 0.0004977611421466443, "loss": 5.746, "mean_token_accuracy": 0.14610961824655533, "num_tokens": 11635867.0, "step": 6305 }, { "entropy": 5.886562156677246, "epoch": 0.5301407267380802, "grad_norm": 1.3046875, "learning_rate": 0.0004977569261134797, "loss": 5.6601, "mean_token_accuracy": 0.15055324360728264, "num_tokens": 11644711.0, "step": 6310 }, { "entropy": 5.830437183380127, "epoch": 0.5305608065532451, "grad_norm": 1.484375, "learning_rate": 0.0004977527061342908, "loss": 5.7385, "mean_token_accuracy": 0.15071533769369125, "num_tokens": 11653320.0, "step": 6315 }, { "entropy": 5.832324886322022, "epoch": 0.53098088636841, "grad_norm": 1.3125, "learning_rate": 0.0004977484822091524, "loss": 5.703, "mean_token_accuracy": 0.15310411900281906, "num_tokens": 11662753.0, "step": 6320 }, { "entropy": 5.879701805114746, "epoch": 0.5314009661835749, "grad_norm": 1.6484375, "learning_rate": 0.0004977442543381394, "loss": 5.7395, "mean_token_accuracy": 0.1498982183635235, "num_tokens": 11671622.0, "step": 6325 }, { "entropy": 5.854084539413452, "epoch": 0.5318210459987398, "grad_norm": 1.796875, "learning_rate": 0.0004977400225213266, "loss": 5.7196, "mean_token_accuracy": 0.14721598774194716, "num_tokens": 11679964.0, "step": 6330 }, { "entropy": 5.763905620574951, "epoch": 0.5322411258139046, "grad_norm": 1.375, "learning_rate": 0.000497735786758789, "loss": 5.6842, "mean_token_accuracy": 0.1521085247397423, "num_tokens": 11688700.0, "step": 6335 }, { "entropy": 5.846723842620849, "epoch": 0.5326612056290695, "grad_norm": 1.5625, "learning_rate": 0.0004977315470506016, "loss": 5.8056, "mean_token_accuracy": 0.14883239492774009, "num_tokens": 11698425.0, "step": 6340 }, { "entropy": 5.966537141799927, "epoch": 0.5330812854442344, "grad_norm": 1.4921875, "learning_rate": 0.0004977273033968397, "loss": 5.791, "mean_token_accuracy": 0.13928466588258742, "num_tokens": 11707705.0, "step": 6345 }, { "entropy": 5.8435125827789305, "epoch": 0.5335013652593993, "grad_norm": 1.4296875, "learning_rate": 0.0004977230557975782, "loss": 5.6783, "mean_token_accuracy": 0.1494770586490631, "num_tokens": 11717079.0, "step": 6350 }, { "entropy": 5.791642379760742, "epoch": 0.5339214450745642, "grad_norm": 1.96875, "learning_rate": 0.0004977188042528923, "loss": 5.6678, "mean_token_accuracy": 0.14970564991235732, "num_tokens": 11725504.0, "step": 6355 }, { "entropy": 5.847938060760498, "epoch": 0.5343415248897291, "grad_norm": 1.4375, "learning_rate": 0.0004977145487628576, "loss": 5.7572, "mean_token_accuracy": 0.14778463244438172, "num_tokens": 11735282.0, "step": 6360 }, { "entropy": 5.854086971282959, "epoch": 0.534761604704894, "grad_norm": 1.8125, "learning_rate": 0.0004977102893275494, "loss": 5.7377, "mean_token_accuracy": 0.14616001397371292, "num_tokens": 11744827.0, "step": 6365 }, { "entropy": 5.835380983352661, "epoch": 0.5351816845200588, "grad_norm": 1.4140625, "learning_rate": 0.000497706025947043, "loss": 5.7012, "mean_token_accuracy": 0.14849554300308226, "num_tokens": 11753066.0, "step": 6370 }, { "entropy": 5.829690742492676, "epoch": 0.5356017643352237, "grad_norm": 1.5078125, "learning_rate": 0.0004977017586214142, "loss": 5.7175, "mean_token_accuracy": 0.14658187404274942, "num_tokens": 11761190.0, "step": 6375 }, { "entropy": 5.845994329452514, "epoch": 0.5360218441503886, "grad_norm": 1.5078125, "learning_rate": 0.0004976974873507382, "loss": 5.6947, "mean_token_accuracy": 0.15390099734067916, "num_tokens": 11770321.0, "step": 6380 }, { "entropy": 5.7918110370635985, "epoch": 0.5364419239655535, "grad_norm": 1.3046875, "learning_rate": 0.000497693212135091, "loss": 5.7547, "mean_token_accuracy": 0.14563888013362886, "num_tokens": 11778388.0, "step": 6385 }, { "entropy": 5.857013368606568, "epoch": 0.5368620037807184, "grad_norm": 1.484375, "learning_rate": 0.0004976889329745482, "loss": 5.6164, "mean_token_accuracy": 0.15133741348981858, "num_tokens": 11786250.0, "step": 6390 }, { "entropy": 5.720251989364624, "epoch": 0.5372820835958833, "grad_norm": 2.28125, "learning_rate": 0.0004976846498691857, "loss": 5.579, "mean_token_accuracy": 0.15662760883569718, "num_tokens": 11794831.0, "step": 6395 }, { "entropy": 5.777666759490967, "epoch": 0.5377021634110482, "grad_norm": 1.34375, "learning_rate": 0.0004976803628190792, "loss": 5.6537, "mean_token_accuracy": 0.15591528862714768, "num_tokens": 11803550.0, "step": 6400 }, { "entropy": 5.767534923553467, "epoch": 0.5381222432262129, "grad_norm": 1.5703125, "learning_rate": 0.0004976760718243047, "loss": 5.7165, "mean_token_accuracy": 0.14894714206457138, "num_tokens": 11812478.0, "step": 6405 }, { "entropy": 5.8361043453216555, "epoch": 0.5385423230413778, "grad_norm": 2.109375, "learning_rate": 0.0004976717768849383, "loss": 5.6892, "mean_token_accuracy": 0.14339745715260505, "num_tokens": 11822463.0, "step": 6410 }, { "entropy": 5.79760046005249, "epoch": 0.5389624028565427, "grad_norm": 2.59375, "learning_rate": 0.0004976674780010561, "loss": 5.7244, "mean_token_accuracy": 0.13902894631028176, "num_tokens": 11831853.0, "step": 6415 }, { "entropy": 5.824806070327758, "epoch": 0.5393824826717076, "grad_norm": 1.59375, "learning_rate": 0.000497663175172734, "loss": 5.7457, "mean_token_accuracy": 0.1442998580634594, "num_tokens": 11841574.0, "step": 6420 }, { "entropy": 5.9099555015563965, "epoch": 0.5398025624868725, "grad_norm": 1.6953125, "learning_rate": 0.0004976588684000486, "loss": 5.8432, "mean_token_accuracy": 0.13176233023405076, "num_tokens": 11852489.0, "step": 6425 }, { "entropy": 5.846707534790039, "epoch": 0.5402226423020374, "grad_norm": 1.4609375, "learning_rate": 0.0004976545576830759, "loss": 5.6999, "mean_token_accuracy": 0.1471443608403206, "num_tokens": 11861499.0, "step": 6430 }, { "entropy": 5.810786867141724, "epoch": 0.5406427221172023, "grad_norm": 1.1953125, "learning_rate": 0.0004976502430218924, "loss": 5.776, "mean_token_accuracy": 0.14316292852163315, "num_tokens": 11871685.0, "step": 6435 }, { "entropy": 5.8063677787780765, "epoch": 0.5410628019323671, "grad_norm": 1.5234375, "learning_rate": 0.0004976459244165744, "loss": 5.6983, "mean_token_accuracy": 0.14863400161266327, "num_tokens": 11881340.0, "step": 6440 }, { "entropy": 5.772097444534301, "epoch": 0.541482881747532, "grad_norm": 1.46875, "learning_rate": 0.0004976416018671986, "loss": 5.7131, "mean_token_accuracy": 0.14742937684059143, "num_tokens": 11890700.0, "step": 6445 }, { "entropy": 5.814801359176636, "epoch": 0.5419029615626969, "grad_norm": 1.5, "learning_rate": 0.0004976372753738415, "loss": 5.7129, "mean_token_accuracy": 0.14111651703715325, "num_tokens": 11900329.0, "step": 6450 }, { "entropy": 5.9360603332519535, "epoch": 0.5423230413778618, "grad_norm": 1.4296875, "learning_rate": 0.0004976329449365795, "loss": 5.754, "mean_token_accuracy": 0.1429471679031849, "num_tokens": 11909915.0, "step": 6455 }, { "entropy": 5.787397623062134, "epoch": 0.5427431211930267, "grad_norm": 1.4140625, "learning_rate": 0.0004976286105554897, "loss": 5.7645, "mean_token_accuracy": 0.14958669245243073, "num_tokens": 11918302.0, "step": 6460 }, { "entropy": 5.77375168800354, "epoch": 0.5431632010081916, "grad_norm": 1.4921875, "learning_rate": 0.0004976242722306487, "loss": 5.7198, "mean_token_accuracy": 0.14630756974220277, "num_tokens": 11927794.0, "step": 6465 }, { "entropy": 5.919241952896118, "epoch": 0.5435832808233564, "grad_norm": 1.2421875, "learning_rate": 0.0004976199299621333, "loss": 5.747, "mean_token_accuracy": 0.14924167543649675, "num_tokens": 11937701.0, "step": 6470 }, { "entropy": 5.725202035903931, "epoch": 0.5440033606385213, "grad_norm": 1.3828125, "learning_rate": 0.0004976155837500205, "loss": 5.6509, "mean_token_accuracy": 0.15285194665193558, "num_tokens": 11946106.0, "step": 6475 }, { "entropy": 5.793752574920655, "epoch": 0.5444234404536862, "grad_norm": 1.546875, "learning_rate": 0.0004976112335943872, "loss": 5.5899, "mean_token_accuracy": 0.15264788568019866, "num_tokens": 11954604.0, "step": 6480 }, { "entropy": 5.727561092376709, "epoch": 0.5448435202688511, "grad_norm": 1.375, "learning_rate": 0.0004976068794953106, "loss": 5.655, "mean_token_accuracy": 0.15496142357587814, "num_tokens": 11963664.0, "step": 6485 }, { "entropy": 5.800908708572388, "epoch": 0.545263600084016, "grad_norm": 1.4296875, "learning_rate": 0.0004976025214528677, "loss": 5.6569, "mean_token_accuracy": 0.15130768865346908, "num_tokens": 11973426.0, "step": 6490 }, { "entropy": 5.773944950103759, "epoch": 0.5456836798991809, "grad_norm": 1.4921875, "learning_rate": 0.0004975981594671359, "loss": 5.6981, "mean_token_accuracy": 0.14681158736348152, "num_tokens": 11982339.0, "step": 6495 }, { "entropy": 5.846315574645996, "epoch": 0.5461037597143458, "grad_norm": 1.3203125, "learning_rate": 0.0004975937935381921, "loss": 5.7408, "mean_token_accuracy": 0.15329586565494538, "num_tokens": 11992016.0, "step": 6500 }, { "entropy": 5.7528393268585205, "epoch": 0.5465238395295106, "grad_norm": 1.40625, "learning_rate": 0.000497589423666114, "loss": 5.7341, "mean_token_accuracy": 0.1440807357430458, "num_tokens": 12000616.0, "step": 6505 }, { "entropy": 5.6946946144104, "epoch": 0.5469439193446755, "grad_norm": 1.390625, "learning_rate": 0.0004975850498509789, "loss": 5.6253, "mean_token_accuracy": 0.15553901046514512, "num_tokens": 12009717.0, "step": 6510 }, { "entropy": 5.767681360244751, "epoch": 0.5473639991598404, "grad_norm": 1.484375, "learning_rate": 0.0004975806720928642, "loss": 5.713, "mean_token_accuracy": 0.1479937508702278, "num_tokens": 12018020.0, "step": 6515 }, { "entropy": 5.797775173187256, "epoch": 0.5477840789750053, "grad_norm": 1.28125, "learning_rate": 0.0004975762903918475, "loss": 5.7163, "mean_token_accuracy": 0.14613735526800156, "num_tokens": 12027119.0, "step": 6520 }, { "entropy": 5.875396728515625, "epoch": 0.5482041587901701, "grad_norm": 1.359375, "learning_rate": 0.0004975719047480064, "loss": 5.6829, "mean_token_accuracy": 0.15304642170667648, "num_tokens": 12035566.0, "step": 6525 }, { "entropy": 5.761675643920898, "epoch": 0.548624238605335, "grad_norm": 1.421875, "learning_rate": 0.0004975675151614187, "loss": 5.6105, "mean_token_accuracy": 0.15602717846632003, "num_tokens": 12044505.0, "step": 6530 }, { "entropy": 5.709016609191894, "epoch": 0.5490443184204999, "grad_norm": 1.5234375, "learning_rate": 0.000497563121632162, "loss": 5.6827, "mean_token_accuracy": 0.15345038324594498, "num_tokens": 12053338.0, "step": 6535 }, { "entropy": 5.784457445144653, "epoch": 0.5494643982356647, "grad_norm": 1.4453125, "learning_rate": 0.0004975587241603142, "loss": 5.676, "mean_token_accuracy": 0.14854272603988647, "num_tokens": 12063235.0, "step": 6540 }, { "entropy": 5.909809684753418, "epoch": 0.5498844780508296, "grad_norm": 1.375, "learning_rate": 0.0004975543227459533, "loss": 5.7491, "mean_token_accuracy": 0.1429952785372734, "num_tokens": 12072490.0, "step": 6545 }, { "entropy": 5.8736042976379395, "epoch": 0.5503045578659945, "grad_norm": 1.328125, "learning_rate": 0.0004975499173891571, "loss": 5.818, "mean_token_accuracy": 0.14217820167541503, "num_tokens": 12081474.0, "step": 6550 }, { "entropy": 5.804098796844483, "epoch": 0.5507246376811594, "grad_norm": 1.359375, "learning_rate": 0.0004975455080900037, "loss": 5.6739, "mean_token_accuracy": 0.15498915761709214, "num_tokens": 12090963.0, "step": 6555 }, { "entropy": 5.811689233779907, "epoch": 0.5511447174963243, "grad_norm": 1.390625, "learning_rate": 0.0004975410948485713, "loss": 5.6853, "mean_token_accuracy": 0.1526065543293953, "num_tokens": 12099786.0, "step": 6560 }, { "entropy": 5.74642539024353, "epoch": 0.5515647973114892, "grad_norm": 1.3828125, "learning_rate": 0.0004975366776649379, "loss": 5.695, "mean_token_accuracy": 0.14672838300466537, "num_tokens": 12108469.0, "step": 6565 }, { "entropy": 5.774152183532715, "epoch": 0.5519848771266541, "grad_norm": 1.6640625, "learning_rate": 0.0004975322565391818, "loss": 5.6804, "mean_token_accuracy": 0.1517785020172596, "num_tokens": 12118287.0, "step": 6570 }, { "entropy": 5.879052972793579, "epoch": 0.5524049569418189, "grad_norm": 1.6796875, "learning_rate": 0.0004975278314713814, "loss": 5.8381, "mean_token_accuracy": 0.14230698868632316, "num_tokens": 12127122.0, "step": 6575 }, { "entropy": 5.914984178543091, "epoch": 0.5528250367569838, "grad_norm": 1.4765625, "learning_rate": 0.0004975234024616152, "loss": 5.731, "mean_token_accuracy": 0.15133389160037042, "num_tokens": 12136395.0, "step": 6580 }, { "entropy": 5.734422016143799, "epoch": 0.5532451165721487, "grad_norm": 1.84375, "learning_rate": 0.0004975189695099613, "loss": 5.6943, "mean_token_accuracy": 0.15051371306180955, "num_tokens": 12145025.0, "step": 6585 }, { "entropy": 5.800812196731568, "epoch": 0.5536651963873136, "grad_norm": 1.6015625, "learning_rate": 0.0004975145326164985, "loss": 5.7429, "mean_token_accuracy": 0.1447499178349972, "num_tokens": 12154352.0, "step": 6590 }, { "entropy": 5.8064220428466795, "epoch": 0.5540852762024785, "grad_norm": 1.59375, "learning_rate": 0.0004975100917813055, "loss": 5.6588, "mean_token_accuracy": 0.15041681826114656, "num_tokens": 12163802.0, "step": 6595 }, { "entropy": 5.750297594070434, "epoch": 0.5545053560176434, "grad_norm": 1.5078125, "learning_rate": 0.0004975056470044606, "loss": 5.682, "mean_token_accuracy": 0.14631521701812744, "num_tokens": 12173111.0, "step": 6600 }, { "entropy": 5.8171515464782715, "epoch": 0.5549254358328082, "grad_norm": 1.4140625, "learning_rate": 0.0004975011982860428, "loss": 5.7383, "mean_token_accuracy": 0.14391349628567696, "num_tokens": 12182048.0, "step": 6605 }, { "entropy": 5.812657642364502, "epoch": 0.5553455156479731, "grad_norm": 1.5, "learning_rate": 0.0004974967456261309, "loss": 5.7159, "mean_token_accuracy": 0.15039578825235367, "num_tokens": 12191501.0, "step": 6610 }, { "entropy": 5.857609844207763, "epoch": 0.555765595463138, "grad_norm": 1.5390625, "learning_rate": 0.0004974922890248036, "loss": 5.7249, "mean_token_accuracy": 0.15451397448778154, "num_tokens": 12201132.0, "step": 6615 }, { "entropy": 5.899567031860352, "epoch": 0.5561856752783029, "grad_norm": 1.6953125, "learning_rate": 0.00049748782848214, "loss": 5.8549, "mean_token_accuracy": 0.14553611800074578, "num_tokens": 12211082.0, "step": 6620 }, { "entropy": 5.807045125961304, "epoch": 0.5566057550934678, "grad_norm": 1.5390625, "learning_rate": 0.0004974833639982192, "loss": 5.6909, "mean_token_accuracy": 0.15329068303108215, "num_tokens": 12219946.0, "step": 6625 }, { "entropy": 5.925949478149414, "epoch": 0.5570258349086327, "grad_norm": 1.484375, "learning_rate": 0.00049747889557312, "loss": 5.7931, "mean_token_accuracy": 0.14512094482779503, "num_tokens": 12229668.0, "step": 6630 }, { "entropy": 5.886264276504517, "epoch": 0.5574459147237976, "grad_norm": 1.4765625, "learning_rate": 0.0004974744232069219, "loss": 5.7574, "mean_token_accuracy": 0.14679303765296936, "num_tokens": 12238750.0, "step": 6635 }, { "entropy": 5.809984493255615, "epoch": 0.5578659945389624, "grad_norm": 2.109375, "learning_rate": 0.0004974699468997038, "loss": 5.7017, "mean_token_accuracy": 0.14905162900686264, "num_tokens": 12246825.0, "step": 6640 }, { "entropy": 5.811229848861695, "epoch": 0.5582860743541272, "grad_norm": 1.546875, "learning_rate": 0.0004974654666515452, "loss": 5.6602, "mean_token_accuracy": 0.14834603071212768, "num_tokens": 12256413.0, "step": 6645 }, { "entropy": 5.882418012619018, "epoch": 0.5587061541692921, "grad_norm": 1.6171875, "learning_rate": 0.0004974609824625254, "loss": 5.6729, "mean_token_accuracy": 0.1607891857624054, "num_tokens": 12265458.0, "step": 6650 }, { "entropy": 5.649556875228882, "epoch": 0.559126233984457, "grad_norm": 1.3984375, "learning_rate": 0.0004974564943327239, "loss": 5.6227, "mean_token_accuracy": 0.15252939462661744, "num_tokens": 12274124.0, "step": 6655 }, { "entropy": 5.668555736541748, "epoch": 0.5595463137996219, "grad_norm": 1.390625, "learning_rate": 0.00049745200226222, "loss": 5.5888, "mean_token_accuracy": 0.16476203203201295, "num_tokens": 12283513.0, "step": 6660 }, { "entropy": 5.861951494216919, "epoch": 0.5599663936147868, "grad_norm": 1.4140625, "learning_rate": 0.0004974475062510936, "loss": 5.7171, "mean_token_accuracy": 0.15322822630405425, "num_tokens": 12292396.0, "step": 6665 }, { "entropy": 5.834360265731812, "epoch": 0.5603864734299517, "grad_norm": 1.5703125, "learning_rate": 0.0004974430062994242, "loss": 5.754, "mean_token_accuracy": 0.1490551695227623, "num_tokens": 12301604.0, "step": 6670 }, { "entropy": 5.901991987228394, "epoch": 0.5608065532451165, "grad_norm": 1.4375, "learning_rate": 0.0004974385024072912, "loss": 5.7881, "mean_token_accuracy": 0.14175782203674317, "num_tokens": 12310458.0, "step": 6675 }, { "entropy": 5.967726707458496, "epoch": 0.5612266330602814, "grad_norm": 1.4296875, "learning_rate": 0.000497433994574775, "loss": 5.7835, "mean_token_accuracy": 0.1453966811299324, "num_tokens": 12319620.0, "step": 6680 }, { "entropy": 5.85808310508728, "epoch": 0.5616467128754463, "grad_norm": 1.4453125, "learning_rate": 0.000497429482801955, "loss": 5.8356, "mean_token_accuracy": 0.1476121611893177, "num_tokens": 12329518.0, "step": 6685 }, { "entropy": 5.773319292068481, "epoch": 0.5620667926906112, "grad_norm": 1.421875, "learning_rate": 0.0004974249670889111, "loss": 5.6512, "mean_token_accuracy": 0.15055545866489412, "num_tokens": 12338244.0, "step": 6690 }, { "entropy": 5.965986871719361, "epoch": 0.5624868725057761, "grad_norm": 1.5703125, "learning_rate": 0.0004974204474357237, "loss": 5.8233, "mean_token_accuracy": 0.14185196608304979, "num_tokens": 12347962.0, "step": 6695 }, { "entropy": 5.896701097488403, "epoch": 0.562906952320941, "grad_norm": 1.625, "learning_rate": 0.0004974159238424723, "loss": 5.7434, "mean_token_accuracy": 0.14349103569984437, "num_tokens": 12357020.0, "step": 6700 }, { "entropy": 5.812654113769531, "epoch": 0.5633270321361059, "grad_norm": 1.5, "learning_rate": 0.0004974113963092376, "loss": 5.7151, "mean_token_accuracy": 0.1478872776031494, "num_tokens": 12366108.0, "step": 6705 }, { "entropy": 5.879363203048706, "epoch": 0.5637471119512707, "grad_norm": 1.4921875, "learning_rate": 0.0004974068648360995, "loss": 5.646, "mean_token_accuracy": 0.15770871341228485, "num_tokens": 12374508.0, "step": 6710 }, { "entropy": 5.793216609954834, "epoch": 0.5641671917664356, "grad_norm": 1.5703125, "learning_rate": 0.0004974023294231383, "loss": 5.652, "mean_token_accuracy": 0.15676265954971313, "num_tokens": 12383555.0, "step": 6715 }, { "entropy": 5.762006092071533, "epoch": 0.5645872715816005, "grad_norm": 1.390625, "learning_rate": 0.0004973977900704342, "loss": 5.7612, "mean_token_accuracy": 0.1457872360944748, "num_tokens": 12392680.0, "step": 6720 }, { "entropy": 5.872710561752319, "epoch": 0.5650073513967654, "grad_norm": 1.40625, "learning_rate": 0.0004973932467780679, "loss": 5.7963, "mean_token_accuracy": 0.14350106567144394, "num_tokens": 12401881.0, "step": 6725 }, { "entropy": 5.897738790512085, "epoch": 0.5654274312119303, "grad_norm": 1.5, "learning_rate": 0.0004973886995461197, "loss": 5.7755, "mean_token_accuracy": 0.14316605031490326, "num_tokens": 12411487.0, "step": 6730 }, { "entropy": 5.799207353591919, "epoch": 0.5658475110270952, "grad_norm": 1.328125, "learning_rate": 0.0004973841483746703, "loss": 5.594, "mean_token_accuracy": 0.16017859652638436, "num_tokens": 12420376.0, "step": 6735 }, { "entropy": 5.6296477794647215, "epoch": 0.5662675908422601, "grad_norm": 1.421875, "learning_rate": 0.0004973795932638001, "loss": 5.639, "mean_token_accuracy": 0.15424187034368514, "num_tokens": 12429518.0, "step": 6740 }, { "entropy": 5.768233728408814, "epoch": 0.5666876706574249, "grad_norm": 1.3125, "learning_rate": 0.00049737503421359, "loss": 5.6208, "mean_token_accuracy": 0.15618278905749322, "num_tokens": 12438952.0, "step": 6745 }, { "entropy": 5.762353801727295, "epoch": 0.5671077504725898, "grad_norm": 1.515625, "learning_rate": 0.0004973704712241206, "loss": 5.6399, "mean_token_accuracy": 0.14973016381263732, "num_tokens": 12448576.0, "step": 6750 }, { "entropy": 5.758606004714966, "epoch": 0.5675278302877547, "grad_norm": 1.3984375, "learning_rate": 0.0004973659042954729, "loss": 5.666, "mean_token_accuracy": 0.15317632332444192, "num_tokens": 12458166.0, "step": 6755 }, { "entropy": 5.703948211669922, "epoch": 0.5679479101029196, "grad_norm": 1.4765625, "learning_rate": 0.0004973613334277277, "loss": 5.5962, "mean_token_accuracy": 0.15764016062021255, "num_tokens": 12467271.0, "step": 6760 }, { "entropy": 5.815484666824341, "epoch": 0.5683679899180845, "grad_norm": 1.3515625, "learning_rate": 0.0004973567586209658, "loss": 5.7679, "mean_token_accuracy": 0.1427201583981514, "num_tokens": 12476255.0, "step": 6765 }, { "entropy": 5.838050889968872, "epoch": 0.5687880697332494, "grad_norm": 1.5625, "learning_rate": 0.0004973521798752686, "loss": 5.7306, "mean_token_accuracy": 0.1476944074034691, "num_tokens": 12485096.0, "step": 6770 }, { "entropy": 5.906451845169068, "epoch": 0.5692081495484141, "grad_norm": 1.5234375, "learning_rate": 0.000497347597190717, "loss": 5.7558, "mean_token_accuracy": 0.1506843164563179, "num_tokens": 12494405.0, "step": 6775 }, { "entropy": 5.792209434509277, "epoch": 0.569628229363579, "grad_norm": 1.328125, "learning_rate": 0.0004973430105673921, "loss": 5.6821, "mean_token_accuracy": 0.14848777875304223, "num_tokens": 12503349.0, "step": 6780 }, { "entropy": 5.828717470169067, "epoch": 0.5700483091787439, "grad_norm": 1.5625, "learning_rate": 0.0004973384200053754, "loss": 5.7518, "mean_token_accuracy": 0.15347943902015687, "num_tokens": 12513122.0, "step": 6785 }, { "entropy": 5.784585285186767, "epoch": 0.5704683889939088, "grad_norm": 1.3515625, "learning_rate": 0.000497333825504748, "loss": 5.695, "mean_token_accuracy": 0.14986882135272026, "num_tokens": 12523614.0, "step": 6790 }, { "entropy": 5.838396644592285, "epoch": 0.5708884688090737, "grad_norm": 1.359375, "learning_rate": 0.0004973292270655914, "loss": 5.7434, "mean_token_accuracy": 0.143761482834816, "num_tokens": 12532031.0, "step": 6795 }, { "entropy": 5.926707601547241, "epoch": 0.5713085486242386, "grad_norm": 1.3671875, "learning_rate": 0.000497324624687987, "loss": 5.8378, "mean_token_accuracy": 0.1392517074942589, "num_tokens": 12542239.0, "step": 6800 }, { "entropy": 5.917767190933228, "epoch": 0.5717286284394035, "grad_norm": 1.515625, "learning_rate": 0.0004973200183720164, "loss": 5.7483, "mean_token_accuracy": 0.14240999147295952, "num_tokens": 12552608.0, "step": 6805 }, { "entropy": 5.775180721282959, "epoch": 0.5721487082545683, "grad_norm": 1.546875, "learning_rate": 0.0004973154081177611, "loss": 5.593, "mean_token_accuracy": 0.15000374913215636, "num_tokens": 12562020.0, "step": 6810 }, { "entropy": 5.760695695877075, "epoch": 0.5725687880697332, "grad_norm": 1.578125, "learning_rate": 0.0004973107939253027, "loss": 5.6762, "mean_token_accuracy": 0.1592295289039612, "num_tokens": 12570519.0, "step": 6815 }, { "entropy": 5.706324434280395, "epoch": 0.5729888678848981, "grad_norm": 1.4140625, "learning_rate": 0.0004973061757947233, "loss": 5.6616, "mean_token_accuracy": 0.15384514778852462, "num_tokens": 12579324.0, "step": 6820 }, { "entropy": 5.790519523620605, "epoch": 0.573408947700063, "grad_norm": 1.3984375, "learning_rate": 0.0004973015537261043, "loss": 5.7372, "mean_token_accuracy": 0.1493046186864376, "num_tokens": 12588014.0, "step": 6825 }, { "entropy": 5.890619230270386, "epoch": 0.5738290275152279, "grad_norm": 1.4296875, "learning_rate": 0.0004972969277195279, "loss": 5.7305, "mean_token_accuracy": 0.15202558934688568, "num_tokens": 12596882.0, "step": 6830 }, { "entropy": 5.819242668151856, "epoch": 0.5742491073303928, "grad_norm": 1.703125, "learning_rate": 0.0004972922977750757, "loss": 5.6515, "mean_token_accuracy": 0.1478489086031914, "num_tokens": 12606069.0, "step": 6835 }, { "entropy": 5.829999208450317, "epoch": 0.5746691871455577, "grad_norm": 2.21875, "learning_rate": 0.00049728766389283, "loss": 5.6783, "mean_token_accuracy": 0.1460999220609665, "num_tokens": 12615167.0, "step": 6840 }, { "entropy": 5.775484275817871, "epoch": 0.5750892669607225, "grad_norm": 1.6328125, "learning_rate": 0.0004972830260728729, "loss": 5.7111, "mean_token_accuracy": 0.15089115351438523, "num_tokens": 12624230.0, "step": 6845 }, { "entropy": 5.81471266746521, "epoch": 0.5755093467758874, "grad_norm": 1.390625, "learning_rate": 0.0004972783843152863, "loss": 5.6964, "mean_token_accuracy": 0.15319516360759736, "num_tokens": 12633158.0, "step": 6850 }, { "entropy": 5.742516231536865, "epoch": 0.5759294265910523, "grad_norm": 1.3203125, "learning_rate": 0.0004972737386201527, "loss": 5.6358, "mean_token_accuracy": 0.1493402510881424, "num_tokens": 12641465.0, "step": 6855 }, { "entropy": 5.772433757781982, "epoch": 0.5763495064062172, "grad_norm": 1.7890625, "learning_rate": 0.0004972690889875541, "loss": 5.6115, "mean_token_accuracy": 0.15269945561885834, "num_tokens": 12650437.0, "step": 6860 }, { "entropy": 5.9466852188110355, "epoch": 0.5767695862213821, "grad_norm": 1.5, "learning_rate": 0.0004972644354175732, "loss": 5.8321, "mean_token_accuracy": 0.14773827642202378, "num_tokens": 12660072.0, "step": 6865 }, { "entropy": 5.8965418338775635, "epoch": 0.577189666036547, "grad_norm": 1.5703125, "learning_rate": 0.0004972597779102922, "loss": 5.844, "mean_token_accuracy": 0.14816712588071823, "num_tokens": 12670405.0, "step": 6870 }, { "entropy": 5.826220703125, "epoch": 0.5776097458517119, "grad_norm": 1.515625, "learning_rate": 0.0004972551164657937, "loss": 5.7126, "mean_token_accuracy": 0.15028751343488694, "num_tokens": 12679992.0, "step": 6875 }, { "entropy": 5.9022228717803955, "epoch": 0.5780298256668767, "grad_norm": 1.5390625, "learning_rate": 0.0004972504510841602, "loss": 5.7796, "mean_token_accuracy": 0.14697190523147582, "num_tokens": 12690289.0, "step": 6880 }, { "entropy": 5.883794593811035, "epoch": 0.5784499054820416, "grad_norm": 1.359375, "learning_rate": 0.0004972457817654745, "loss": 5.7709, "mean_token_accuracy": 0.14337689578533172, "num_tokens": 12700518.0, "step": 6885 }, { "entropy": 5.896582746505738, "epoch": 0.5788699852972065, "grad_norm": 1.3125, "learning_rate": 0.0004972411085098191, "loss": 5.8202, "mean_token_accuracy": 0.138790999352932, "num_tokens": 12710603.0, "step": 6890 }, { "entropy": 5.896594381332397, "epoch": 0.5792900651123714, "grad_norm": 1.296875, "learning_rate": 0.000497236431317277, "loss": 5.7086, "mean_token_accuracy": 0.14955383241176606, "num_tokens": 12719298.0, "step": 6895 }, { "entropy": 5.828510808944702, "epoch": 0.5797101449275363, "grad_norm": 1.484375, "learning_rate": 0.000497231750187931, "loss": 5.7051, "mean_token_accuracy": 0.1494380295276642, "num_tokens": 12728368.0, "step": 6900 }, { "entropy": 5.847594785690307, "epoch": 0.5801302247427012, "grad_norm": 1.59375, "learning_rate": 0.0004972270651218638, "loss": 5.769, "mean_token_accuracy": 0.15052054449915886, "num_tokens": 12737898.0, "step": 6905 }, { "entropy": 5.896743059158325, "epoch": 0.580550304557866, "grad_norm": 1.4921875, "learning_rate": 0.0004972223761191587, "loss": 5.7024, "mean_token_accuracy": 0.1484552301466465, "num_tokens": 12746761.0, "step": 6910 }, { "entropy": 5.748441457748413, "epoch": 0.5809703843730308, "grad_norm": 1.4296875, "learning_rate": 0.0004972176831798986, "loss": 5.6317, "mean_token_accuracy": 0.1558982439339161, "num_tokens": 12755128.0, "step": 6915 }, { "entropy": 5.8237542629241945, "epoch": 0.5813904641881957, "grad_norm": 1.2890625, "learning_rate": 0.0004972129863041667, "loss": 5.8145, "mean_token_accuracy": 0.1419169031083584, "num_tokens": 12764727.0, "step": 6920 }, { "entropy": 5.825289487838745, "epoch": 0.5818105440033606, "grad_norm": 1.34375, "learning_rate": 0.0004972082854920462, "loss": 5.6682, "mean_token_accuracy": 0.15212180316448212, "num_tokens": 12773557.0, "step": 6925 }, { "entropy": 5.780522108078003, "epoch": 0.5822306238185255, "grad_norm": 1.375, "learning_rate": 0.0004972035807436203, "loss": 5.6741, "mean_token_accuracy": 0.15388695299625396, "num_tokens": 12782525.0, "step": 6930 }, { "entropy": 5.874711608886718, "epoch": 0.5826507036336904, "grad_norm": 1.4765625, "learning_rate": 0.0004971988720589723, "loss": 5.7714, "mean_token_accuracy": 0.14911144897341727, "num_tokens": 12791534.0, "step": 6935 }, { "entropy": 5.865447235107422, "epoch": 0.5830707834488553, "grad_norm": 1.4140625, "learning_rate": 0.0004971941594381858, "loss": 5.6622, "mean_token_accuracy": 0.1520915597677231, "num_tokens": 12800662.0, "step": 6940 }, { "entropy": 5.833262968063354, "epoch": 0.5834908632640201, "grad_norm": 1.421875, "learning_rate": 0.0004971894428813441, "loss": 5.7134, "mean_token_accuracy": 0.15022262334823608, "num_tokens": 12809440.0, "step": 6945 }, { "entropy": 5.89053783416748, "epoch": 0.583910943079185, "grad_norm": 1.453125, "learning_rate": 0.000497184722388531, "loss": 5.7974, "mean_token_accuracy": 0.14950450211763383, "num_tokens": 12818560.0, "step": 6950 }, { "entropy": 5.910626697540283, "epoch": 0.5843310228943499, "grad_norm": 1.3828125, "learning_rate": 0.0004971799979598297, "loss": 5.7158, "mean_token_accuracy": 0.15047362595796585, "num_tokens": 12827898.0, "step": 6955 }, { "entropy": 5.736415719985962, "epoch": 0.5847511027095148, "grad_norm": 1.6875, "learning_rate": 0.0004971752695953243, "loss": 5.6673, "mean_token_accuracy": 0.15286629199981688, "num_tokens": 12837199.0, "step": 6960 }, { "entropy": 5.841268587112427, "epoch": 0.5851711825246797, "grad_norm": 1.484375, "learning_rate": 0.0004971705372950984, "loss": 5.6889, "mean_token_accuracy": 0.14883269965648652, "num_tokens": 12846493.0, "step": 6965 }, { "entropy": 5.862727975845337, "epoch": 0.5855912623398446, "grad_norm": 1.5078125, "learning_rate": 0.0004971658010592358, "loss": 5.7059, "mean_token_accuracy": 0.14308914840221404, "num_tokens": 12855026.0, "step": 6970 }, { "entropy": 5.807987260818481, "epoch": 0.5860113421550095, "grad_norm": 1.4609375, "learning_rate": 0.0004971610608878205, "loss": 5.7711, "mean_token_accuracy": 0.14490452259778977, "num_tokens": 12864563.0, "step": 6975 }, { "entropy": 5.884010982513428, "epoch": 0.5864314219701743, "grad_norm": 1.671875, "learning_rate": 0.0004971563167809363, "loss": 5.7237, "mean_token_accuracy": 0.15075904428958892, "num_tokens": 12874358.0, "step": 6980 }, { "entropy": 5.7711278915405275, "epoch": 0.5868515017853392, "grad_norm": 1.359375, "learning_rate": 0.0004971515687386674, "loss": 5.7117, "mean_token_accuracy": 0.1473625972867012, "num_tokens": 12883110.0, "step": 6985 }, { "entropy": 5.803575611114502, "epoch": 0.5872715816005041, "grad_norm": 1.40625, "learning_rate": 0.0004971468167610978, "loss": 5.7851, "mean_token_accuracy": 0.15010628029704093, "num_tokens": 12892977.0, "step": 6990 }, { "entropy": 5.790566396713257, "epoch": 0.587691661415669, "grad_norm": 1.390625, "learning_rate": 0.0004971420608483117, "loss": 5.6004, "mean_token_accuracy": 0.1545809641480446, "num_tokens": 12902327.0, "step": 6995 }, { "entropy": 5.741348314285278, "epoch": 0.5881117412308339, "grad_norm": 1.640625, "learning_rate": 0.0004971373010003936, "loss": 5.6022, "mean_token_accuracy": 0.16168920323252678, "num_tokens": 12911957.0, "step": 7000 }, { "entropy": 5.8003096103668215, "epoch": 0.5885318210459988, "grad_norm": 1.3828125, "learning_rate": 0.0004971325372174274, "loss": 5.6907, "mean_token_accuracy": 0.14657490849494934, "num_tokens": 12920380.0, "step": 7005 }, { "entropy": 5.811933612823486, "epoch": 0.5889519008611637, "grad_norm": 1.2578125, "learning_rate": 0.0004971277694994976, "loss": 5.7533, "mean_token_accuracy": 0.15078987032175065, "num_tokens": 12929670.0, "step": 7010 }, { "entropy": 5.819301414489746, "epoch": 0.5893719806763285, "grad_norm": 1.5390625, "learning_rate": 0.000497122997846689, "loss": 5.6612, "mean_token_accuracy": 0.1566910207271576, "num_tokens": 12938185.0, "step": 7015 }, { "entropy": 5.85056962966919, "epoch": 0.5897920604914934, "grad_norm": 1.4609375, "learning_rate": 0.0004971182222590857, "loss": 5.6984, "mean_token_accuracy": 0.15590957552194595, "num_tokens": 12947706.0, "step": 7020 }, { "entropy": 5.766946744918823, "epoch": 0.5902121403066583, "grad_norm": 1.34375, "learning_rate": 0.0004971134427367725, "loss": 5.6836, "mean_token_accuracy": 0.14876563102006912, "num_tokens": 12957393.0, "step": 7025 }, { "entropy": 5.863473749160766, "epoch": 0.5906322201218231, "grad_norm": 1.375, "learning_rate": 0.000497108659279834, "loss": 5.5813, "mean_token_accuracy": 0.1580106034874916, "num_tokens": 12967165.0, "step": 7030 }, { "entropy": 5.893796777725219, "epoch": 0.591052299936988, "grad_norm": 1.4921875, "learning_rate": 0.0004971038718883551, "loss": 5.7311, "mean_token_accuracy": 0.14258148968219758, "num_tokens": 12976490.0, "step": 7035 }, { "entropy": 5.8169300079345705, "epoch": 0.5914723797521529, "grad_norm": 1.796875, "learning_rate": 0.0004970990805624203, "loss": 5.7245, "mean_token_accuracy": 0.1458576127886772, "num_tokens": 12985423.0, "step": 7040 }, { "entropy": 5.806120443344116, "epoch": 0.5918924595673178, "grad_norm": 1.34375, "learning_rate": 0.0004970942853021147, "loss": 5.6187, "mean_token_accuracy": 0.15678810328245163, "num_tokens": 12994510.0, "step": 7045 }, { "entropy": 5.8349559783935545, "epoch": 0.5923125393824826, "grad_norm": 1.5859375, "learning_rate": 0.0004970894861075232, "loss": 5.734, "mean_token_accuracy": 0.1486038699746132, "num_tokens": 13003383.0, "step": 7050 }, { "entropy": 5.833832693099976, "epoch": 0.5927326191976475, "grad_norm": 1.34375, "learning_rate": 0.0004970846829787309, "loss": 5.6695, "mean_token_accuracy": 0.15129955112934113, "num_tokens": 13012550.0, "step": 7055 }, { "entropy": 5.845009517669678, "epoch": 0.5931526990128124, "grad_norm": 1.40625, "learning_rate": 0.0004970798759158227, "loss": 5.7421, "mean_token_accuracy": 0.14426639974117278, "num_tokens": 13022066.0, "step": 7060 }, { "entropy": 5.804647397994995, "epoch": 0.5935727788279773, "grad_norm": 1.515625, "learning_rate": 0.0004970750649188839, "loss": 5.711, "mean_token_accuracy": 0.15260717198252677, "num_tokens": 13031008.0, "step": 7065 }, { "entropy": 5.774487495422363, "epoch": 0.5939928586431422, "grad_norm": 1.4140625, "learning_rate": 0.0004970702499879998, "loss": 5.6978, "mean_token_accuracy": 0.14794613867998124, "num_tokens": 13040366.0, "step": 7070 }, { "entropy": 5.774663066864013, "epoch": 0.5944129384583071, "grad_norm": 1.5390625, "learning_rate": 0.0004970654311232554, "loss": 5.7282, "mean_token_accuracy": 0.14623787105083466, "num_tokens": 13051140.0, "step": 7075 }, { "entropy": 5.849271965026856, "epoch": 0.594833018273472, "grad_norm": 1.96875, "learning_rate": 0.0004970606083247362, "loss": 5.6443, "mean_token_accuracy": 0.15294349193572998, "num_tokens": 13059835.0, "step": 7080 }, { "entropy": 5.7127063274383545, "epoch": 0.5952530980886368, "grad_norm": 1.5078125, "learning_rate": 0.0004970557815925278, "loss": 5.5898, "mean_token_accuracy": 0.14923029839992524, "num_tokens": 13068909.0, "step": 7085 }, { "entropy": 5.729467248916626, "epoch": 0.5956731779038017, "grad_norm": 3.078125, "learning_rate": 0.0004970509509267155, "loss": 5.6618, "mean_token_accuracy": 0.14696715027093887, "num_tokens": 13078380.0, "step": 7090 }, { "entropy": 5.90779447555542, "epoch": 0.5960932577189666, "grad_norm": 1.71875, "learning_rate": 0.0004970461163273849, "loss": 5.7102, "mean_token_accuracy": 0.15209844410419465, "num_tokens": 13087774.0, "step": 7095 }, { "entropy": 5.781322765350342, "epoch": 0.5965133375341315, "grad_norm": 1.578125, "learning_rate": 0.0004970412777946219, "loss": 5.5491, "mean_token_accuracy": 0.1548515573143959, "num_tokens": 13095938.0, "step": 7100 }, { "entropy": 5.7372105598449705, "epoch": 0.5969334173492964, "grad_norm": 1.546875, "learning_rate": 0.0004970364353285117, "loss": 5.6888, "mean_token_accuracy": 0.15444473102688788, "num_tokens": 13104661.0, "step": 7105 }, { "entropy": 5.844806241989136, "epoch": 0.5973534971644613, "grad_norm": 1.4375, "learning_rate": 0.0004970315889291405, "loss": 5.6731, "mean_token_accuracy": 0.1474146157503128, "num_tokens": 13114505.0, "step": 7110 }, { "entropy": 5.694882488250732, "epoch": 0.5977735769796261, "grad_norm": 1.65625, "learning_rate": 0.0004970267385965941, "loss": 5.6245, "mean_token_accuracy": 0.15627836883068086, "num_tokens": 13124590.0, "step": 7115 }, { "entropy": 5.715419483184815, "epoch": 0.598193656794791, "grad_norm": 1.5390625, "learning_rate": 0.0004970218843309583, "loss": 5.6087, "mean_token_accuracy": 0.1559140369296074, "num_tokens": 13134026.0, "step": 7120 }, { "entropy": 5.890923166275025, "epoch": 0.5986137366099559, "grad_norm": 1.578125, "learning_rate": 0.0004970170261323192, "loss": 5.7662, "mean_token_accuracy": 0.15187639147043228, "num_tokens": 13142654.0, "step": 7125 }, { "entropy": 5.7584481716156, "epoch": 0.5990338164251208, "grad_norm": 1.4765625, "learning_rate": 0.0004970121640007627, "loss": 5.6728, "mean_token_accuracy": 0.1504793107509613, "num_tokens": 13151177.0, "step": 7130 }, { "entropy": 5.807246541976928, "epoch": 0.5994538962402857, "grad_norm": 1.4140625, "learning_rate": 0.0004970072979363751, "loss": 5.6657, "mean_token_accuracy": 0.1458762139081955, "num_tokens": 13159689.0, "step": 7135 }, { "entropy": 5.796993541717529, "epoch": 0.5998739760554506, "grad_norm": 1.3984375, "learning_rate": 0.0004970024279392425, "loss": 5.7087, "mean_token_accuracy": 0.1491813488304615, "num_tokens": 13168601.0, "step": 7140 }, { "entropy": 5.799499607086181, "epoch": 0.6002940558706155, "grad_norm": 1.578125, "learning_rate": 0.0004969975540094513, "loss": 5.6911, "mean_token_accuracy": 0.1491454616189003, "num_tokens": 13177035.0, "step": 7145 }, { "entropy": 5.840288925170898, "epoch": 0.6007141356857802, "grad_norm": 1.546875, "learning_rate": 0.0004969926761470876, "loss": 5.6471, "mean_token_accuracy": 0.15894681811332703, "num_tokens": 13185444.0, "step": 7150 }, { "entropy": 5.787335777282715, "epoch": 0.6011342155009451, "grad_norm": 1.5234375, "learning_rate": 0.000496987794352238, "loss": 5.6543, "mean_token_accuracy": 0.15718057453632356, "num_tokens": 13194987.0, "step": 7155 }, { "entropy": 5.711384534835815, "epoch": 0.60155429531611, "grad_norm": 1.34375, "learning_rate": 0.0004969829086249889, "loss": 5.6887, "mean_token_accuracy": 0.14929505437612534, "num_tokens": 13203807.0, "step": 7160 }, { "entropy": 5.874243068695068, "epoch": 0.6019743751312749, "grad_norm": 1.3984375, "learning_rate": 0.000496978018965427, "loss": 5.7803, "mean_token_accuracy": 0.14797215312719345, "num_tokens": 13214362.0, "step": 7165 }, { "entropy": 5.938519763946533, "epoch": 0.6023944549464398, "grad_norm": 1.78125, "learning_rate": 0.0004969731253736387, "loss": 5.7816, "mean_token_accuracy": 0.14409856349229813, "num_tokens": 13224192.0, "step": 7170 }, { "entropy": 5.824232769012451, "epoch": 0.6028145347616047, "grad_norm": 1.6171875, "learning_rate": 0.0004969682278497109, "loss": 5.7438, "mean_token_accuracy": 0.149906075745821, "num_tokens": 13234430.0, "step": 7175 }, { "entropy": 5.766725778579712, "epoch": 0.6032346145767696, "grad_norm": 1.5625, "learning_rate": 0.0004969633263937301, "loss": 5.6477, "mean_token_accuracy": 0.15190263986587524, "num_tokens": 13243681.0, "step": 7180 }, { "entropy": 5.959778547286987, "epoch": 0.6036546943919344, "grad_norm": 1.4609375, "learning_rate": 0.0004969584210057832, "loss": 5.9315, "mean_token_accuracy": 0.13914565443992616, "num_tokens": 13254334.0, "step": 7185 }, { "entropy": 5.908876419067383, "epoch": 0.6040747742070993, "grad_norm": 1.25, "learning_rate": 0.0004969535116859573, "loss": 5.7233, "mean_token_accuracy": 0.15498362332582474, "num_tokens": 13263781.0, "step": 7190 }, { "entropy": 5.757447004318237, "epoch": 0.6044948540222642, "grad_norm": 1.421875, "learning_rate": 0.0004969485984343392, "loss": 5.633, "mean_token_accuracy": 0.15214563608169557, "num_tokens": 13272831.0, "step": 7195 }, { "entropy": 5.840635204315186, "epoch": 0.6049149338374291, "grad_norm": 1.671875, "learning_rate": 0.000496943681251016, "loss": 5.6943, "mean_token_accuracy": 0.15125853568315506, "num_tokens": 13281621.0, "step": 7200 }, { "entropy": 5.772113513946533, "epoch": 0.605335013652594, "grad_norm": 1.484375, "learning_rate": 0.0004969387601360747, "loss": 5.6754, "mean_token_accuracy": 0.1471445269882679, "num_tokens": 13291021.0, "step": 7205 }, { "entropy": 5.837057733535767, "epoch": 0.6057550934677589, "grad_norm": 1.34375, "learning_rate": 0.0004969338350896026, "loss": 5.6877, "mean_token_accuracy": 0.15487841069698333, "num_tokens": 13299752.0, "step": 7210 }, { "entropy": 5.855220079421997, "epoch": 0.6061751732829238, "grad_norm": 1.328125, "learning_rate": 0.0004969289061116869, "loss": 5.7219, "mean_token_accuracy": 0.14336248189210893, "num_tokens": 13309112.0, "step": 7215 }, { "entropy": 5.829800653457641, "epoch": 0.6065952530980886, "grad_norm": 1.46875, "learning_rate": 0.0004969239732024148, "loss": 5.7305, "mean_token_accuracy": 0.15485918670892715, "num_tokens": 13318328.0, "step": 7220 }, { "entropy": 5.693413162231446, "epoch": 0.6070153329132535, "grad_norm": 1.34375, "learning_rate": 0.0004969190363618739, "loss": 5.6063, "mean_token_accuracy": 0.149900983273983, "num_tokens": 13328940.0, "step": 7225 }, { "entropy": 5.717437219619751, "epoch": 0.6074354127284184, "grad_norm": 1.5625, "learning_rate": 0.0004969140955901516, "loss": 5.6137, "mean_token_accuracy": 0.15410374999046325, "num_tokens": 13337829.0, "step": 7230 }, { "entropy": 5.903831624984742, "epoch": 0.6078554925435833, "grad_norm": 1.5, "learning_rate": 0.0004969091508873352, "loss": 5.804, "mean_token_accuracy": 0.14683766812086105, "num_tokens": 13348289.0, "step": 7235 }, { "entropy": 5.835478973388672, "epoch": 0.6082755723587482, "grad_norm": 1.421875, "learning_rate": 0.0004969042022535126, "loss": 5.708, "mean_token_accuracy": 0.15235030949115752, "num_tokens": 13357292.0, "step": 7240 }, { "entropy": 5.843629169464111, "epoch": 0.6086956521739131, "grad_norm": 1.4140625, "learning_rate": 0.0004968992496887713, "loss": 5.7554, "mean_token_accuracy": 0.14912576526403426, "num_tokens": 13366640.0, "step": 7245 }, { "entropy": 5.844546985626221, "epoch": 0.609115731989078, "grad_norm": 1.390625, "learning_rate": 0.0004968942931931989, "loss": 5.6594, "mean_token_accuracy": 0.1629155233502388, "num_tokens": 13377509.0, "step": 7250 }, { "entropy": 5.813440895080566, "epoch": 0.6095358118042428, "grad_norm": 1.53125, "learning_rate": 0.0004968893327668835, "loss": 5.749, "mean_token_accuracy": 0.14384781569242477, "num_tokens": 13386573.0, "step": 7255 }, { "entropy": 5.739164876937866, "epoch": 0.6099558916194077, "grad_norm": 1.234375, "learning_rate": 0.0004968843684099128, "loss": 5.607, "mean_token_accuracy": 0.1540288582444191, "num_tokens": 13395790.0, "step": 7260 }, { "entropy": 5.730731964111328, "epoch": 0.6103759714345726, "grad_norm": 1.375, "learning_rate": 0.0004968794001223747, "loss": 5.658, "mean_token_accuracy": 0.1504225805401802, "num_tokens": 13405265.0, "step": 7265 }, { "entropy": 5.77107720375061, "epoch": 0.6107960512497375, "grad_norm": 1.5859375, "learning_rate": 0.0004968744279043574, "loss": 5.6733, "mean_token_accuracy": 0.15312831848859787, "num_tokens": 13413796.0, "step": 7270 }, { "entropy": 5.86907844543457, "epoch": 0.6112161310649024, "grad_norm": 1.515625, "learning_rate": 0.0004968694517559488, "loss": 5.7213, "mean_token_accuracy": 0.15257197394967079, "num_tokens": 13423299.0, "step": 7275 }, { "entropy": 5.709070634841919, "epoch": 0.6116362108800673, "grad_norm": 1.7265625, "learning_rate": 0.0004968644716772371, "loss": 5.6292, "mean_token_accuracy": 0.15693681687116623, "num_tokens": 13432267.0, "step": 7280 }, { "entropy": 5.737072992324829, "epoch": 0.612056290695232, "grad_norm": 1.765625, "learning_rate": 0.0004968594876683105, "loss": 5.733, "mean_token_accuracy": 0.14609354361891747, "num_tokens": 13442332.0, "step": 7285 }, { "entropy": 5.8117883682250975, "epoch": 0.6124763705103969, "grad_norm": 1.4921875, "learning_rate": 0.0004968544997292572, "loss": 5.6747, "mean_token_accuracy": 0.15259024500846863, "num_tokens": 13451700.0, "step": 7290 }, { "entropy": 5.812619876861572, "epoch": 0.6128964503255618, "grad_norm": 1.7421875, "learning_rate": 0.0004968495078601659, "loss": 5.7774, "mean_token_accuracy": 0.14332814291119575, "num_tokens": 13461009.0, "step": 7295 }, { "entropy": 5.858203887939453, "epoch": 0.6133165301407267, "grad_norm": 1.5078125, "learning_rate": 0.0004968445120611247, "loss": 5.7707, "mean_token_accuracy": 0.15080213099718093, "num_tokens": 13470341.0, "step": 7300 }, { "entropy": 5.905436229705811, "epoch": 0.6137366099558916, "grad_norm": 1.578125, "learning_rate": 0.0004968395123322223, "loss": 5.7003, "mean_token_accuracy": 0.1523931697010994, "num_tokens": 13479898.0, "step": 7305 }, { "entropy": 5.742975854873658, "epoch": 0.6141566897710565, "grad_norm": 1.4140625, "learning_rate": 0.000496834508673547, "loss": 5.6046, "mean_token_accuracy": 0.15081604719161987, "num_tokens": 13488116.0, "step": 7310 }, { "entropy": 5.723895263671875, "epoch": 0.6145767695862214, "grad_norm": 1.6796875, "learning_rate": 0.0004968295010851877, "loss": 5.6474, "mean_token_accuracy": 0.15416487902402878, "num_tokens": 13497814.0, "step": 7315 }, { "entropy": 5.786228084564209, "epoch": 0.6149968494013862, "grad_norm": 1.90625, "learning_rate": 0.0004968244895672331, "loss": 5.6679, "mean_token_accuracy": 0.14462938904762268, "num_tokens": 13506617.0, "step": 7320 }, { "entropy": 5.833630132675171, "epoch": 0.6154169292165511, "grad_norm": 1.578125, "learning_rate": 0.0004968194741197718, "loss": 5.8051, "mean_token_accuracy": 0.1436678983271122, "num_tokens": 13516632.0, "step": 7325 }, { "entropy": 5.897484588623047, "epoch": 0.615837009031716, "grad_norm": 1.484375, "learning_rate": 0.0004968144547428927, "loss": 5.7291, "mean_token_accuracy": 0.15222294852137566, "num_tokens": 13526452.0, "step": 7330 }, { "entropy": 5.792807674407959, "epoch": 0.6162570888468809, "grad_norm": 1.4609375, "learning_rate": 0.0004968094314366848, "loss": 5.6406, "mean_token_accuracy": 0.150718155503273, "num_tokens": 13535663.0, "step": 7335 }, { "entropy": 5.687614870071411, "epoch": 0.6166771686620458, "grad_norm": 1.6484375, "learning_rate": 0.000496804404201237, "loss": 5.558, "mean_token_accuracy": 0.16134363710880278, "num_tokens": 13544574.0, "step": 7340 }, { "entropy": 5.88130555152893, "epoch": 0.6170972484772107, "grad_norm": 1.375, "learning_rate": 0.0004967993730366385, "loss": 5.7309, "mean_token_accuracy": 0.15020160600543023, "num_tokens": 13553041.0, "step": 7345 }, { "entropy": 5.799270153045654, "epoch": 0.6175173282923756, "grad_norm": 1.5546875, "learning_rate": 0.0004967943379429781, "loss": 5.7106, "mean_token_accuracy": 0.14654484167695045, "num_tokens": 13562108.0, "step": 7350 }, { "entropy": 5.930500316619873, "epoch": 0.6179374081075404, "grad_norm": 1.5234375, "learning_rate": 0.0004967892989203454, "loss": 5.8659, "mean_token_accuracy": 0.14354829862713814, "num_tokens": 13571500.0, "step": 7355 }, { "entropy": 5.872519779205322, "epoch": 0.6183574879227053, "grad_norm": 1.5, "learning_rate": 0.0004967842559688295, "loss": 5.7577, "mean_token_accuracy": 0.14510439038276673, "num_tokens": 13581304.0, "step": 7360 }, { "entropy": 5.81227593421936, "epoch": 0.6187775677378702, "grad_norm": 1.9140625, "learning_rate": 0.0004967792090885195, "loss": 5.6444, "mean_token_accuracy": 0.15179503858089446, "num_tokens": 13590734.0, "step": 7365 }, { "entropy": 5.746864557266235, "epoch": 0.6191976475530351, "grad_norm": 1.671875, "learning_rate": 0.0004967741582795052, "loss": 5.6924, "mean_token_accuracy": 0.14929923564195632, "num_tokens": 13600486.0, "step": 7370 }, { "entropy": 5.881101942062378, "epoch": 0.6196177273682, "grad_norm": 1.4921875, "learning_rate": 0.0004967691035418758, "loss": 5.7268, "mean_token_accuracy": 0.14389215558767318, "num_tokens": 13610542.0, "step": 7375 }, { "entropy": 5.792819786071777, "epoch": 0.6200378071833649, "grad_norm": 2.25, "learning_rate": 0.000496764044875721, "loss": 5.6759, "mean_token_accuracy": 0.15460289865732194, "num_tokens": 13619431.0, "step": 7380 }, { "entropy": 5.761080598831176, "epoch": 0.6204578869985298, "grad_norm": 1.4921875, "learning_rate": 0.0004967589822811303, "loss": 5.6957, "mean_token_accuracy": 0.14801864922046662, "num_tokens": 13629930.0, "step": 7385 }, { "entropy": 5.956879663467407, "epoch": 0.6208779668136946, "grad_norm": 1.4140625, "learning_rate": 0.0004967539157581934, "loss": 5.8424, "mean_token_accuracy": 0.14267176687717437, "num_tokens": 13639439.0, "step": 7390 }, { "entropy": 5.9114847660064695, "epoch": 0.6212980466288595, "grad_norm": 1.4140625, "learning_rate": 0.000496748845307, "loss": 5.7476, "mean_token_accuracy": 0.15258604139089585, "num_tokens": 13648548.0, "step": 7395 }, { "entropy": 5.858182144165039, "epoch": 0.6217181264440244, "grad_norm": 1.6484375, "learning_rate": 0.0004967437709276401, "loss": 5.7985, "mean_token_accuracy": 0.15154744163155556, "num_tokens": 13657658.0, "step": 7400 }, { "entropy": 5.721544599533081, "epoch": 0.6221382062591893, "grad_norm": 1.6953125, "learning_rate": 0.0004967386926202034, "loss": 5.5518, "mean_token_accuracy": 0.15903828144073487, "num_tokens": 13666763.0, "step": 7405 }, { "entropy": 5.837467288970947, "epoch": 0.6225582860743542, "grad_norm": 1.4296875, "learning_rate": 0.00049673361038478, "loss": 5.8103, "mean_token_accuracy": 0.14174049571156502, "num_tokens": 13676527.0, "step": 7410 }, { "entropy": 5.855217123031617, "epoch": 0.622978365889519, "grad_norm": 1.5, "learning_rate": 0.0004967285242214599, "loss": 5.7674, "mean_token_accuracy": 0.149812014400959, "num_tokens": 13685404.0, "step": 7415 }, { "entropy": 5.782896041870117, "epoch": 0.6233984457046838, "grad_norm": 1.4609375, "learning_rate": 0.000496723434130333, "loss": 5.5821, "mean_token_accuracy": 0.15357585549354552, "num_tokens": 13693118.0, "step": 7420 }, { "entropy": 5.7227521419525145, "epoch": 0.6238185255198487, "grad_norm": 1.46875, "learning_rate": 0.0004967183401114898, "loss": 5.6601, "mean_token_accuracy": 0.15249475762248038, "num_tokens": 13702015.0, "step": 7425 }, { "entropy": 5.806180191040039, "epoch": 0.6242386053350136, "grad_norm": 2.203125, "learning_rate": 0.0004967132421650203, "loss": 5.6877, "mean_token_accuracy": 0.14611244574189186, "num_tokens": 13711658.0, "step": 7430 }, { "entropy": 5.766854763031006, "epoch": 0.6246586851501785, "grad_norm": 1.625, "learning_rate": 0.0004967081402910149, "loss": 5.6979, "mean_token_accuracy": 0.14979787766933442, "num_tokens": 13720718.0, "step": 7435 }, { "entropy": 5.728975391387939, "epoch": 0.6250787649653434, "grad_norm": 1.359375, "learning_rate": 0.000496703034489564, "loss": 5.5606, "mean_token_accuracy": 0.1568959876894951, "num_tokens": 13729364.0, "step": 7440 }, { "entropy": 5.909390020370483, "epoch": 0.6254988447805083, "grad_norm": 1.71875, "learning_rate": 0.0004966979247607579, "loss": 5.8725, "mean_token_accuracy": 0.14035747721791267, "num_tokens": 13739436.0, "step": 7445 }, { "entropy": 5.9296684741973875, "epoch": 0.6259189245956732, "grad_norm": 1.3515625, "learning_rate": 0.0004966928111046873, "loss": 5.7708, "mean_token_accuracy": 0.15743647813796996, "num_tokens": 13749196.0, "step": 7450 }, { "entropy": 5.783377313613892, "epoch": 0.626339004410838, "grad_norm": 1.40625, "learning_rate": 0.0004966876935214426, "loss": 5.6254, "mean_token_accuracy": 0.15206747651100158, "num_tokens": 13758414.0, "step": 7455 }, { "entropy": 5.766037368774414, "epoch": 0.6267590842260029, "grad_norm": 2.5625, "learning_rate": 0.0004966825720111147, "loss": 5.6562, "mean_token_accuracy": 0.14928966909646987, "num_tokens": 13767496.0, "step": 7460 }, { "entropy": 5.811860084533691, "epoch": 0.6271791640411678, "grad_norm": 1.546875, "learning_rate": 0.0004966774465737942, "loss": 5.8047, "mean_token_accuracy": 0.15070491954684256, "num_tokens": 13777033.0, "step": 7465 }, { "entropy": 5.844302463531494, "epoch": 0.6275992438563327, "grad_norm": 1.796875, "learning_rate": 0.0004966723172095717, "loss": 5.7583, "mean_token_accuracy": 0.14748911708593368, "num_tokens": 13786313.0, "step": 7470 }, { "entropy": 5.826303386688233, "epoch": 0.6280193236714976, "grad_norm": 1.5078125, "learning_rate": 0.0004966671839185384, "loss": 5.691, "mean_token_accuracy": 0.1544649474322796, "num_tokens": 13795257.0, "step": 7475 }, { "entropy": 5.733129787445068, "epoch": 0.6284394034866625, "grad_norm": 1.3984375, "learning_rate": 0.0004966620467007851, "loss": 5.6151, "mean_token_accuracy": 0.15482667088508606, "num_tokens": 13804582.0, "step": 7480 }, { "entropy": 5.708710527420044, "epoch": 0.6288594833018274, "grad_norm": 1.859375, "learning_rate": 0.0004966569055564027, "loss": 5.5858, "mean_token_accuracy": 0.1517590843141079, "num_tokens": 13813248.0, "step": 7485 }, { "entropy": 5.892451477050781, "epoch": 0.6292795631169922, "grad_norm": 1.703125, "learning_rate": 0.0004966517604854823, "loss": 5.8557, "mean_token_accuracy": 0.13463475033640862, "num_tokens": 13823301.0, "step": 7490 }, { "entropy": 5.816387891769409, "epoch": 0.6296996429321571, "grad_norm": 1.5625, "learning_rate": 0.0004966466114881152, "loss": 5.5904, "mean_token_accuracy": 0.15593330711126327, "num_tokens": 13832040.0, "step": 7495 }, { "entropy": 5.830536413192749, "epoch": 0.630119722747322, "grad_norm": 1.390625, "learning_rate": 0.0004966414585643925, "loss": 5.7743, "mean_token_accuracy": 0.14742243885993958, "num_tokens": 13841874.0, "step": 7500 }, { "entropy": 5.7584226608276365, "epoch": 0.6305398025624869, "grad_norm": 1.5, "learning_rate": 0.0004966363017144055, "loss": 5.6126, "mean_token_accuracy": 0.15902097374200821, "num_tokens": 13850755.0, "step": 7505 }, { "entropy": 5.788242483139038, "epoch": 0.6309598823776518, "grad_norm": 1.3203125, "learning_rate": 0.0004966311409382455, "loss": 5.6797, "mean_token_accuracy": 0.14931050986051558, "num_tokens": 13860009.0, "step": 7510 }, { "entropy": 5.736308908462524, "epoch": 0.6313799621928167, "grad_norm": 1.421875, "learning_rate": 0.0004966259762360039, "loss": 5.5946, "mean_token_accuracy": 0.15429836511611938, "num_tokens": 13868476.0, "step": 7515 }, { "entropy": 5.711131143569946, "epoch": 0.6318000420079816, "grad_norm": 1.4921875, "learning_rate": 0.0004966208076077723, "loss": 5.6093, "mean_token_accuracy": 0.15463593304157258, "num_tokens": 13877367.0, "step": 7520 }, { "entropy": 5.750036096572876, "epoch": 0.6322201218231464, "grad_norm": 1.3671875, "learning_rate": 0.0004966156350536422, "loss": 5.6935, "mean_token_accuracy": 0.14963461458683014, "num_tokens": 13885985.0, "step": 7525 }, { "entropy": 5.755751752853394, "epoch": 0.6326402016383113, "grad_norm": 1.28125, "learning_rate": 0.0004966104585737054, "loss": 5.61, "mean_token_accuracy": 0.15479331612586975, "num_tokens": 13895059.0, "step": 7530 }, { "entropy": 5.780548143386841, "epoch": 0.6330602814534761, "grad_norm": 1.53125, "learning_rate": 0.0004966052781680534, "loss": 5.6767, "mean_token_accuracy": 0.14704100489616395, "num_tokens": 13903789.0, "step": 7535 }, { "entropy": 5.845569133758545, "epoch": 0.633480361268641, "grad_norm": 1.3671875, "learning_rate": 0.0004966000938367778, "loss": 5.6591, "mean_token_accuracy": 0.15396612286567687, "num_tokens": 13913377.0, "step": 7540 }, { "entropy": 5.6942973136901855, "epoch": 0.6339004410838059, "grad_norm": 1.6171875, "learning_rate": 0.0004965949055799708, "loss": 5.6186, "mean_token_accuracy": 0.1588241770863533, "num_tokens": 13922141.0, "step": 7545 }, { "entropy": 5.787711143493652, "epoch": 0.6343205208989708, "grad_norm": 1.71875, "learning_rate": 0.0004965897133977241, "loss": 5.6597, "mean_token_accuracy": 0.1402692511677742, "num_tokens": 13930717.0, "step": 7550 }, { "entropy": 5.825317001342773, "epoch": 0.6347406007141357, "grad_norm": 1.6796875, "learning_rate": 0.0004965845172901298, "loss": 5.7464, "mean_token_accuracy": 0.14808339700102807, "num_tokens": 13940344.0, "step": 7555 }, { "entropy": 5.7218469142913815, "epoch": 0.6351606805293005, "grad_norm": 1.6171875, "learning_rate": 0.0004965793172572798, "loss": 5.58, "mean_token_accuracy": 0.15380775630474092, "num_tokens": 13948400.0, "step": 7560 }, { "entropy": 5.710135746002197, "epoch": 0.6355807603444654, "grad_norm": 1.3984375, "learning_rate": 0.0004965741132992663, "loss": 5.6947, "mean_token_accuracy": 0.14487617537379266, "num_tokens": 13957939.0, "step": 7565 }, { "entropy": 5.832439231872558, "epoch": 0.6360008401596303, "grad_norm": 1.390625, "learning_rate": 0.0004965689054161814, "loss": 5.6573, "mean_token_accuracy": 0.1547864407300949, "num_tokens": 13966943.0, "step": 7570 }, { "entropy": 5.738895320892334, "epoch": 0.6364209199747952, "grad_norm": 1.5390625, "learning_rate": 0.0004965636936081176, "loss": 5.5722, "mean_token_accuracy": 0.1546689599752426, "num_tokens": 13975850.0, "step": 7575 }, { "entropy": 5.806326103210449, "epoch": 0.6368409997899601, "grad_norm": 1.5390625, "learning_rate": 0.000496558477875167, "loss": 5.6725, "mean_token_accuracy": 0.15719727128744126, "num_tokens": 13985059.0, "step": 7580 }, { "entropy": 5.77093358039856, "epoch": 0.637261079605125, "grad_norm": 1.359375, "learning_rate": 0.000496553258217422, "loss": 5.7215, "mean_token_accuracy": 0.1449730947613716, "num_tokens": 13993571.0, "step": 7585 }, { "entropy": 5.842133378982544, "epoch": 0.6376811594202898, "grad_norm": 1.5078125, "learning_rate": 0.0004965480346349751, "loss": 5.7185, "mean_token_accuracy": 0.15069702565670012, "num_tokens": 14002326.0, "step": 7590 }, { "entropy": 5.9778131484985355, "epoch": 0.6381012392354547, "grad_norm": 1.984375, "learning_rate": 0.000496542807127919, "loss": 5.8686, "mean_token_accuracy": 0.14351749792695045, "num_tokens": 14012002.0, "step": 7595 }, { "entropy": 5.788293838500977, "epoch": 0.6385213190506196, "grad_norm": 1.359375, "learning_rate": 0.000496537575696346, "loss": 5.7363, "mean_token_accuracy": 0.14434802830219268, "num_tokens": 14022085.0, "step": 7600 }, { "entropy": 5.704484844207764, "epoch": 0.6389413988657845, "grad_norm": 1.5234375, "learning_rate": 0.0004965323403403488, "loss": 5.6045, "mean_token_accuracy": 0.15442810356616973, "num_tokens": 14030706.0, "step": 7605 }, { "entropy": 5.77836651802063, "epoch": 0.6393614786809494, "grad_norm": 1.3828125, "learning_rate": 0.0004965271010600205, "loss": 5.6262, "mean_token_accuracy": 0.15519261509180068, "num_tokens": 14039520.0, "step": 7610 }, { "entropy": 5.822714900970459, "epoch": 0.6397815584961143, "grad_norm": 1.53125, "learning_rate": 0.0004965218578554535, "loss": 5.7178, "mean_token_accuracy": 0.15360228195786477, "num_tokens": 14048407.0, "step": 7615 }, { "entropy": 5.711956024169922, "epoch": 0.6402016383112792, "grad_norm": 1.375, "learning_rate": 0.000496516610726741, "loss": 5.6573, "mean_token_accuracy": 0.158063705265522, "num_tokens": 14057534.0, "step": 7620 }, { "entropy": 5.765710496902466, "epoch": 0.640621718126444, "grad_norm": 1.390625, "learning_rate": 0.0004965113596739759, "loss": 5.6129, "mean_token_accuracy": 0.1602526545524597, "num_tokens": 14065992.0, "step": 7625 }, { "entropy": 5.712855339050293, "epoch": 0.6410417979416089, "grad_norm": 1.625, "learning_rate": 0.0004965061046972508, "loss": 5.6062, "mean_token_accuracy": 0.15307263806462287, "num_tokens": 14074806.0, "step": 7630 }, { "entropy": 5.752716493606568, "epoch": 0.6414618777567738, "grad_norm": 1.5859375, "learning_rate": 0.0004965008457966594, "loss": 5.6501, "mean_token_accuracy": 0.15263762921094895, "num_tokens": 14083813.0, "step": 7635 }, { "entropy": 5.762417888641357, "epoch": 0.6418819575719387, "grad_norm": 1.8359375, "learning_rate": 0.0004964955829722945, "loss": 5.5858, "mean_token_accuracy": 0.1599087104201317, "num_tokens": 14092193.0, "step": 7640 }, { "entropy": 5.84725341796875, "epoch": 0.6423020373871036, "grad_norm": 1.7578125, "learning_rate": 0.0004964903162242493, "loss": 5.7916, "mean_token_accuracy": 0.14413690567016602, "num_tokens": 14102797.0, "step": 7645 }, { "entropy": 5.76859679222107, "epoch": 0.6427221172022685, "grad_norm": 1.6640625, "learning_rate": 0.0004964850455526173, "loss": 5.6637, "mean_token_accuracy": 0.15364854410290718, "num_tokens": 14112226.0, "step": 7650 }, { "entropy": 5.661821556091309, "epoch": 0.6431421970174334, "grad_norm": 1.4140625, "learning_rate": 0.0004964797709574917, "loss": 5.5939, "mean_token_accuracy": 0.15402402132749557, "num_tokens": 14121775.0, "step": 7655 }, { "entropy": 5.719243478775025, "epoch": 0.6435622768325981, "grad_norm": 1.4453125, "learning_rate": 0.000496474492438966, "loss": 5.5856, "mean_token_accuracy": 0.15579498410224915, "num_tokens": 14130415.0, "step": 7660 }, { "entropy": 5.75182991027832, "epoch": 0.643982356647763, "grad_norm": 1.4453125, "learning_rate": 0.0004964692099971338, "loss": 5.6058, "mean_token_accuracy": 0.1568465366959572, "num_tokens": 14140204.0, "step": 7665 }, { "entropy": 5.736771440505981, "epoch": 0.6444024364629279, "grad_norm": 1.4453125, "learning_rate": 0.0004964639236320885, "loss": 5.567, "mean_token_accuracy": 0.15371138900518416, "num_tokens": 14149595.0, "step": 7670 }, { "entropy": 5.714345407485962, "epoch": 0.6448225162780928, "grad_norm": 1.6015625, "learning_rate": 0.0004964586333439239, "loss": 5.6346, "mean_token_accuracy": 0.15398874282836914, "num_tokens": 14158865.0, "step": 7675 }, { "entropy": 5.78523097038269, "epoch": 0.6452425960932577, "grad_norm": 1.546875, "learning_rate": 0.0004964533391327335, "loss": 5.5938, "mean_token_accuracy": 0.158450847864151, "num_tokens": 14167962.0, "step": 7680 }, { "entropy": 5.816212701797485, "epoch": 0.6456626759084226, "grad_norm": 1.578125, "learning_rate": 0.0004964480409986113, "loss": 5.6465, "mean_token_accuracy": 0.1606015980243683, "num_tokens": 14176479.0, "step": 7685 }, { "entropy": 5.829603910446167, "epoch": 0.6460827557235875, "grad_norm": 1.5, "learning_rate": 0.0004964427389416512, "loss": 5.6739, "mean_token_accuracy": 0.14969076216220856, "num_tokens": 14185408.0, "step": 7690 }, { "entropy": 5.702767419815063, "epoch": 0.6465028355387523, "grad_norm": 1.5390625, "learning_rate": 0.000496437432961947, "loss": 5.6745, "mean_token_accuracy": 0.15580256432294845, "num_tokens": 14194155.0, "step": 7695 }, { "entropy": 5.729840040206909, "epoch": 0.6469229153539172, "grad_norm": 1.4453125, "learning_rate": 0.0004964321230595925, "loss": 5.6916, "mean_token_accuracy": 0.1505993440747261, "num_tokens": 14202779.0, "step": 7700 }, { "entropy": 5.923639154434204, "epoch": 0.6473429951690821, "grad_norm": 1.5234375, "learning_rate": 0.0004964268092346821, "loss": 5.868, "mean_token_accuracy": 0.14160000756382943, "num_tokens": 14212552.0, "step": 7705 }, { "entropy": 5.925770807266235, "epoch": 0.647763074984247, "grad_norm": 1.484375, "learning_rate": 0.0004964214914873098, "loss": 5.6684, "mean_token_accuracy": 0.14924321398139, "num_tokens": 14222783.0, "step": 7710 }, { "entropy": 5.70919623374939, "epoch": 0.6481831547994119, "grad_norm": 1.578125, "learning_rate": 0.0004964161698175697, "loss": 5.5477, "mean_token_accuracy": 0.15285850167274476, "num_tokens": 14232085.0, "step": 7715 }, { "entropy": 5.768083095550537, "epoch": 0.6486032346145768, "grad_norm": 1.5234375, "learning_rate": 0.0004964108442255562, "loss": 5.7039, "mean_token_accuracy": 0.14666701555252076, "num_tokens": 14241969.0, "step": 7720 }, { "entropy": 5.75738754272461, "epoch": 0.6490233144297417, "grad_norm": 1.75, "learning_rate": 0.0004964055147113637, "loss": 5.616, "mean_token_accuracy": 0.1562434285879135, "num_tokens": 14251012.0, "step": 7725 }, { "entropy": 5.841613340377807, "epoch": 0.6494433942449065, "grad_norm": 1.4296875, "learning_rate": 0.0004964001812750864, "loss": 5.7414, "mean_token_accuracy": 0.15030983835458755, "num_tokens": 14261110.0, "step": 7730 }, { "entropy": 5.793753337860108, "epoch": 0.6498634740600714, "grad_norm": 1.6953125, "learning_rate": 0.000496394843916819, "loss": 5.7123, "mean_token_accuracy": 0.15001400411128998, "num_tokens": 14270869.0, "step": 7735 }, { "entropy": 5.8021101474761965, "epoch": 0.6502835538752363, "grad_norm": 1.625, "learning_rate": 0.0004963895026366558, "loss": 5.6624, "mean_token_accuracy": 0.14703597128391266, "num_tokens": 14279607.0, "step": 7740 }, { "entropy": 5.798326921463013, "epoch": 0.6507036336904012, "grad_norm": 1.5859375, "learning_rate": 0.0004963841574346917, "loss": 5.6664, "mean_token_accuracy": 0.15177475959062575, "num_tokens": 14289282.0, "step": 7745 }, { "entropy": 5.785371494293213, "epoch": 0.6511237135055661, "grad_norm": 1.4375, "learning_rate": 0.0004963788083110212, "loss": 5.5947, "mean_token_accuracy": 0.15618948638439178, "num_tokens": 14298658.0, "step": 7750 }, { "entropy": 5.867933845520019, "epoch": 0.651543793320731, "grad_norm": 1.4375, "learning_rate": 0.000496373455265739, "loss": 5.6715, "mean_token_accuracy": 0.15167464911937714, "num_tokens": 14307832.0, "step": 7755 }, { "entropy": 5.737640428543091, "epoch": 0.6519638731358958, "grad_norm": 1.7265625, "learning_rate": 0.0004963680982989402, "loss": 5.5745, "mean_token_accuracy": 0.15618224889039994, "num_tokens": 14317122.0, "step": 7760 }, { "entropy": 5.728768348693848, "epoch": 0.6523839529510607, "grad_norm": 1.625, "learning_rate": 0.0004963627374107195, "loss": 5.624, "mean_token_accuracy": 0.15685338973999025, "num_tokens": 14326069.0, "step": 7765 }, { "entropy": 5.735061359405518, "epoch": 0.6528040327662256, "grad_norm": 1.6640625, "learning_rate": 0.0004963573726011717, "loss": 5.6154, "mean_token_accuracy": 0.152651646733284, "num_tokens": 14335260.0, "step": 7770 }, { "entropy": 5.89712963104248, "epoch": 0.6532241125813905, "grad_norm": 1.421875, "learning_rate": 0.0004963520038703922, "loss": 5.7147, "mean_token_accuracy": 0.14169859886169434, "num_tokens": 14345823.0, "step": 7775 }, { "entropy": 5.8055966854095455, "epoch": 0.6536441923965554, "grad_norm": 1.4296875, "learning_rate": 0.000496346631218476, "loss": 5.5901, "mean_token_accuracy": 0.151746928691864, "num_tokens": 14354316.0, "step": 7780 }, { "entropy": 5.731487655639649, "epoch": 0.6540642722117203, "grad_norm": 1.8515625, "learning_rate": 0.000496341254645518, "loss": 5.637, "mean_token_accuracy": 0.15558102428913118, "num_tokens": 14364539.0, "step": 7785 }, { "entropy": 5.791000318527222, "epoch": 0.6544843520268852, "grad_norm": 1.5234375, "learning_rate": 0.0004963358741516138, "loss": 5.7568, "mean_token_accuracy": 0.14070456251502036, "num_tokens": 14374081.0, "step": 7790 }, { "entropy": 5.791856861114502, "epoch": 0.6549044318420499, "grad_norm": 1.5859375, "learning_rate": 0.0004963304897368585, "loss": 5.6421, "mean_token_accuracy": 0.14869485646486283, "num_tokens": 14383255.0, "step": 7795 }, { "entropy": 5.887608623504638, "epoch": 0.6553245116572148, "grad_norm": 1.7734375, "learning_rate": 0.0004963251014013475, "loss": 5.7709, "mean_token_accuracy": 0.14988299310207367, "num_tokens": 14392417.0, "step": 7800 }, { "entropy": 5.925739812850952, "epoch": 0.6557445914723797, "grad_norm": 2.265625, "learning_rate": 0.0004963197091451763, "loss": 5.8171, "mean_token_accuracy": 0.14091493040323258, "num_tokens": 14401899.0, "step": 7805 }, { "entropy": 5.8610601902008055, "epoch": 0.6561646712875446, "grad_norm": 1.5078125, "learning_rate": 0.0004963143129684405, "loss": 5.7865, "mean_token_accuracy": 0.14567770585417747, "num_tokens": 14411245.0, "step": 7810 }, { "entropy": 5.733341979980469, "epoch": 0.6565847511027095, "grad_norm": 2.09375, "learning_rate": 0.0004963089128712355, "loss": 5.6357, "mean_token_accuracy": 0.15616341382265092, "num_tokens": 14419710.0, "step": 7815 }, { "entropy": 5.761330413818359, "epoch": 0.6570048309178744, "grad_norm": 1.6640625, "learning_rate": 0.0004963035088536571, "loss": 5.6196, "mean_token_accuracy": 0.16149473637342454, "num_tokens": 14430266.0, "step": 7820 }, { "entropy": 5.832095336914063, "epoch": 0.6574249107330393, "grad_norm": 1.53125, "learning_rate": 0.0004962981009158012, "loss": 5.5946, "mean_token_accuracy": 0.14890647828578948, "num_tokens": 14439515.0, "step": 7825 }, { "entropy": 5.783193588256836, "epoch": 0.6578449905482041, "grad_norm": 1.53125, "learning_rate": 0.0004962926890577632, "loss": 5.6537, "mean_token_accuracy": 0.1543855309486389, "num_tokens": 14448091.0, "step": 7830 }, { "entropy": 5.762275314331054, "epoch": 0.658265070363369, "grad_norm": 1.4296875, "learning_rate": 0.000496287273279639, "loss": 5.6831, "mean_token_accuracy": 0.14809218272566796, "num_tokens": 14457744.0, "step": 7835 }, { "entropy": 5.830176925659179, "epoch": 0.6586851501785339, "grad_norm": 1.375, "learning_rate": 0.000496281853581525, "loss": 5.6747, "mean_token_accuracy": 0.15542599856853484, "num_tokens": 14467597.0, "step": 7840 }, { "entropy": 5.816223096847534, "epoch": 0.6591052299936988, "grad_norm": 1.4296875, "learning_rate": 0.0004962764299635168, "loss": 5.6557, "mean_token_accuracy": 0.15143783688545226, "num_tokens": 14476662.0, "step": 7845 }, { "entropy": 5.868206977844238, "epoch": 0.6595253098088637, "grad_norm": 1.8203125, "learning_rate": 0.0004962710024257105, "loss": 5.7365, "mean_token_accuracy": 0.15013337954878808, "num_tokens": 14486583.0, "step": 7850 }, { "entropy": 5.866771793365478, "epoch": 0.6599453896240286, "grad_norm": 1.4375, "learning_rate": 0.0004962655709682025, "loss": 5.7422, "mean_token_accuracy": 0.14670923799276353, "num_tokens": 14496528.0, "step": 7855 }, { "entropy": 5.847543859481812, "epoch": 0.6603654694391935, "grad_norm": 1.3046875, "learning_rate": 0.0004962601355910887, "loss": 5.7216, "mean_token_accuracy": 0.14750941842794418, "num_tokens": 14507026.0, "step": 7860 }, { "entropy": 5.714229869842529, "epoch": 0.6607855492543583, "grad_norm": 1.796875, "learning_rate": 0.0004962546962944656, "loss": 5.5896, "mean_token_accuracy": 0.1554133415222168, "num_tokens": 14516480.0, "step": 7865 }, { "entropy": 5.7652284622192385, "epoch": 0.6612056290695232, "grad_norm": 1.7265625, "learning_rate": 0.0004962492530784295, "loss": 5.5384, "mean_token_accuracy": 0.16685622930526733, "num_tokens": 14525068.0, "step": 7870 }, { "entropy": 5.764181613922119, "epoch": 0.6616257088846881, "grad_norm": 1.546875, "learning_rate": 0.0004962438059430768, "loss": 5.6811, "mean_token_accuracy": 0.15448692589998245, "num_tokens": 14534441.0, "step": 7875 }, { "entropy": 5.791794538497925, "epoch": 0.662045788699853, "grad_norm": 1.515625, "learning_rate": 0.0004962383548885039, "loss": 5.7416, "mean_token_accuracy": 0.15312327668070794, "num_tokens": 14543026.0, "step": 7880 }, { "entropy": 5.810564088821411, "epoch": 0.6624658685150179, "grad_norm": 1.71875, "learning_rate": 0.0004962328999148075, "loss": 5.6235, "mean_token_accuracy": 0.15815748721361161, "num_tokens": 14552068.0, "step": 7885 }, { "entropy": 5.795226907730102, "epoch": 0.6628859483301828, "grad_norm": 1.5234375, "learning_rate": 0.0004962274410220842, "loss": 5.748, "mean_token_accuracy": 0.14739178717136384, "num_tokens": 14561587.0, "step": 7890 }, { "entropy": 5.840717220306397, "epoch": 0.6633060281453477, "grad_norm": 1.7265625, "learning_rate": 0.0004962219782104308, "loss": 5.7455, "mean_token_accuracy": 0.15566187649965285, "num_tokens": 14571020.0, "step": 7895 }, { "entropy": 5.857281494140625, "epoch": 0.6637261079605125, "grad_norm": 1.484375, "learning_rate": 0.0004962165114799439, "loss": 5.7013, "mean_token_accuracy": 0.14193924963474275, "num_tokens": 14580638.0, "step": 7900 }, { "entropy": 5.753746509552002, "epoch": 0.6641461877756774, "grad_norm": 1.578125, "learning_rate": 0.0004962110408307204, "loss": 5.6411, "mean_token_accuracy": 0.1508389577269554, "num_tokens": 14590173.0, "step": 7905 }, { "entropy": 5.771540355682373, "epoch": 0.6645662675908423, "grad_norm": 1.3046875, "learning_rate": 0.0004962055662628571, "loss": 5.6088, "mean_token_accuracy": 0.1546558991074562, "num_tokens": 14598635.0, "step": 7910 }, { "entropy": 5.824790573120117, "epoch": 0.6649863474060071, "grad_norm": 1.4921875, "learning_rate": 0.0004962000877764513, "loss": 5.6465, "mean_token_accuracy": 0.15380171239376067, "num_tokens": 14607233.0, "step": 7915 }, { "entropy": 5.900277614593506, "epoch": 0.665406427221172, "grad_norm": 1.4296875, "learning_rate": 0.0004961946053715998, "loss": 5.811, "mean_token_accuracy": 0.14116770774126053, "num_tokens": 14617483.0, "step": 7920 }, { "entropy": 5.774311876296997, "epoch": 0.665826507036337, "grad_norm": 1.5078125, "learning_rate": 0.0004961891190483997, "loss": 5.6337, "mean_token_accuracy": 0.15262163281440735, "num_tokens": 14625805.0, "step": 7925 }, { "entropy": 5.750567626953125, "epoch": 0.6662465868515017, "grad_norm": 1.53125, "learning_rate": 0.0004961836288069483, "loss": 5.56, "mean_token_accuracy": 0.15181114226579667, "num_tokens": 14634605.0, "step": 7930 }, { "entropy": 5.866780996322632, "epoch": 0.6666666666666666, "grad_norm": 1.5234375, "learning_rate": 0.0004961781346473428, "loss": 5.754, "mean_token_accuracy": 0.1443464897572994, "num_tokens": 14644970.0, "step": 7935 }, { "entropy": 5.8288147926330565, "epoch": 0.6670867464818315, "grad_norm": 1.3359375, "learning_rate": 0.0004961726365696805, "loss": 5.6444, "mean_token_accuracy": 0.1512111656367779, "num_tokens": 14655043.0, "step": 7940 }, { "entropy": 5.81706018447876, "epoch": 0.6675068262969964, "grad_norm": 1.4296875, "learning_rate": 0.0004961671345740589, "loss": 5.624, "mean_token_accuracy": 0.1498358130455017, "num_tokens": 14663994.0, "step": 7945 }, { "entropy": 5.73077392578125, "epoch": 0.6679269061121613, "grad_norm": 1.3984375, "learning_rate": 0.0004961616286605753, "loss": 5.6285, "mean_token_accuracy": 0.14595297276973723, "num_tokens": 14674101.0, "step": 7950 }, { "entropy": 5.793763732910156, "epoch": 0.6683469859273262, "grad_norm": 1.3984375, "learning_rate": 0.0004961561188293273, "loss": 5.7245, "mean_token_accuracy": 0.14435067921876907, "num_tokens": 14684156.0, "step": 7955 }, { "entropy": 5.726213026046753, "epoch": 0.6687670657424911, "grad_norm": 1.515625, "learning_rate": 0.0004961506050804126, "loss": 5.6178, "mean_token_accuracy": 0.15918601751327516, "num_tokens": 14693223.0, "step": 7960 }, { "entropy": 5.852010822296142, "epoch": 0.6691871455576559, "grad_norm": 1.34375, "learning_rate": 0.000496145087413929, "loss": 5.6258, "mean_token_accuracy": 0.14910822063684465, "num_tokens": 14702959.0, "step": 7965 }, { "entropy": 5.876345634460449, "epoch": 0.6696072253728208, "grad_norm": 1.5625, "learning_rate": 0.0004961395658299737, "loss": 5.737, "mean_token_accuracy": 0.1483006753027439, "num_tokens": 14712146.0, "step": 7970 }, { "entropy": 5.710770320892334, "epoch": 0.6700273051879857, "grad_norm": 1.515625, "learning_rate": 0.0004961340403286451, "loss": 5.6515, "mean_token_accuracy": 0.14912314414978028, "num_tokens": 14721932.0, "step": 7975 }, { "entropy": 5.775924396514893, "epoch": 0.6704473850031506, "grad_norm": 1.4921875, "learning_rate": 0.0004961285109100408, "loss": 5.5857, "mean_token_accuracy": 0.15742873400449753, "num_tokens": 14731080.0, "step": 7980 }, { "entropy": 5.719264698028565, "epoch": 0.6708674648183155, "grad_norm": 1.5390625, "learning_rate": 0.0004961229775742587, "loss": 5.5991, "mean_token_accuracy": 0.16006802767515182, "num_tokens": 14740057.0, "step": 7985 }, { "entropy": 5.813319492340088, "epoch": 0.6712875446334804, "grad_norm": 1.4453125, "learning_rate": 0.000496117440321397, "loss": 5.6828, "mean_token_accuracy": 0.15654956847429274, "num_tokens": 14748399.0, "step": 7990 }, { "entropy": 5.8324696063995365, "epoch": 0.6717076244486453, "grad_norm": 1.4921875, "learning_rate": 0.0004961118991515537, "loss": 5.6881, "mean_token_accuracy": 0.14406146556138993, "num_tokens": 14757215.0, "step": 7995 }, { "entropy": 5.786386203765869, "epoch": 0.6721277042638101, "grad_norm": 1.609375, "learning_rate": 0.000496106354064827, "loss": 5.6868, "mean_token_accuracy": 0.15685203224420546, "num_tokens": 14766191.0, "step": 8000 }, { "entropy": 5.8651642322540285, "epoch": 0.672547784078975, "grad_norm": 1.59375, "learning_rate": 0.0004961008050613149, "loss": 5.7521, "mean_token_accuracy": 0.14210513085126877, "num_tokens": 14775220.0, "step": 8005 }, { "entropy": 5.838468170166015, "epoch": 0.6729678638941399, "grad_norm": 1.546875, "learning_rate": 0.0004960952521411161, "loss": 5.7078, "mean_token_accuracy": 0.14716721177101136, "num_tokens": 14784287.0, "step": 8010 }, { "entropy": 5.932072496414184, "epoch": 0.6733879437093048, "grad_norm": 1.3828125, "learning_rate": 0.0004960896953043287, "loss": 5.7759, "mean_token_accuracy": 0.14442920163273812, "num_tokens": 14794219.0, "step": 8015 }, { "entropy": 5.824687051773071, "epoch": 0.6738080235244697, "grad_norm": 1.671875, "learning_rate": 0.0004960841345510511, "loss": 5.6703, "mean_token_accuracy": 0.1518692597746849, "num_tokens": 14803324.0, "step": 8020 }, { "entropy": 5.7951741218566895, "epoch": 0.6742281033396346, "grad_norm": 1.5859375, "learning_rate": 0.000496078569881382, "loss": 5.6876, "mean_token_accuracy": 0.1539413034915924, "num_tokens": 14811963.0, "step": 8025 }, { "entropy": 5.747313785552978, "epoch": 0.6746481831547995, "grad_norm": 1.5546875, "learning_rate": 0.0004960730012954198, "loss": 5.6526, "mean_token_accuracy": 0.14589986428618432, "num_tokens": 14821903.0, "step": 8030 }, { "entropy": 5.716427040100098, "epoch": 0.6750682629699643, "grad_norm": 1.328125, "learning_rate": 0.0004960674287932634, "loss": 5.6271, "mean_token_accuracy": 0.14554727971553802, "num_tokens": 14831215.0, "step": 8035 }, { "entropy": 5.827300643920898, "epoch": 0.6754883427851291, "grad_norm": 1.390625, "learning_rate": 0.0004960618523750111, "loss": 5.5552, "mean_token_accuracy": 0.1551190733909607, "num_tokens": 14840354.0, "step": 8040 }, { "entropy": 5.817133188247681, "epoch": 0.675908422600294, "grad_norm": 1.59375, "learning_rate": 0.000496056272040762, "loss": 5.7402, "mean_token_accuracy": 0.14943507611751555, "num_tokens": 14849660.0, "step": 8045 }, { "entropy": 5.807599830627441, "epoch": 0.6763285024154589, "grad_norm": 1.4921875, "learning_rate": 0.0004960506877906149, "loss": 5.6648, "mean_token_accuracy": 0.14764449894428253, "num_tokens": 14859819.0, "step": 8050 }, { "entropy": 5.801334857940674, "epoch": 0.6767485822306238, "grad_norm": 1.5078125, "learning_rate": 0.0004960450996246686, "loss": 5.6585, "mean_token_accuracy": 0.15806604847311972, "num_tokens": 14869260.0, "step": 8055 }, { "entropy": 5.7306236743927, "epoch": 0.6771686620457887, "grad_norm": 1.40625, "learning_rate": 0.0004960395075430222, "loss": 5.6336, "mean_token_accuracy": 0.15279667675495148, "num_tokens": 14878685.0, "step": 8060 }, { "entropy": 5.749643182754516, "epoch": 0.6775887418609536, "grad_norm": 1.5625, "learning_rate": 0.0004960339115457748, "loss": 5.6372, "mean_token_accuracy": 0.1503060542047024, "num_tokens": 14888456.0, "step": 8065 }, { "entropy": 5.7973710060119625, "epoch": 0.6780088216761184, "grad_norm": 1.890625, "learning_rate": 0.0004960283116330255, "loss": 5.731, "mean_token_accuracy": 0.14978916943073273, "num_tokens": 14897401.0, "step": 8070 }, { "entropy": 5.807585668563843, "epoch": 0.6784289014912833, "grad_norm": 1.421875, "learning_rate": 0.0004960227078048735, "loss": 5.6567, "mean_token_accuracy": 0.15412394553422928, "num_tokens": 14906741.0, "step": 8075 }, { "entropy": 5.760078573226929, "epoch": 0.6788489813064482, "grad_norm": 1.53125, "learning_rate": 0.0004960171000614179, "loss": 5.5427, "mean_token_accuracy": 0.16074198186397554, "num_tokens": 14916002.0, "step": 8080 }, { "entropy": 5.638378715515136, "epoch": 0.6792690611216131, "grad_norm": 1.4296875, "learning_rate": 0.0004960114884027583, "loss": 5.4776, "mean_token_accuracy": 0.16621290147304535, "num_tokens": 14925247.0, "step": 8085 }, { "entropy": 5.708978319168091, "epoch": 0.679689140936778, "grad_norm": 1.5390625, "learning_rate": 0.0004960058728289939, "loss": 5.608, "mean_token_accuracy": 0.15026133954524995, "num_tokens": 14933925.0, "step": 8090 }, { "entropy": 5.904026126861572, "epoch": 0.6801092207519429, "grad_norm": 1.515625, "learning_rate": 0.0004960002533402243, "loss": 5.6881, "mean_token_accuracy": 0.15241528823971748, "num_tokens": 14943368.0, "step": 8095 }, { "entropy": 5.790306043624878, "epoch": 0.6805293005671077, "grad_norm": 1.375, "learning_rate": 0.0004959946299365491, "loss": 5.6953, "mean_token_accuracy": 0.14710961580276488, "num_tokens": 14953710.0, "step": 8100 }, { "entropy": 5.816765403747558, "epoch": 0.6809493803822726, "grad_norm": 1.609375, "learning_rate": 0.0004959890026180677, "loss": 5.7182, "mean_token_accuracy": 0.14748610258102418, "num_tokens": 14962814.0, "step": 8105 }, { "entropy": 5.688648128509522, "epoch": 0.6813694601974375, "grad_norm": 1.3359375, "learning_rate": 0.00049598337138488, "loss": 5.5964, "mean_token_accuracy": 0.16184311360120773, "num_tokens": 14971631.0, "step": 8110 }, { "entropy": 5.8211281299591064, "epoch": 0.6817895400126024, "grad_norm": 1.78125, "learning_rate": 0.0004959777362370855, "loss": 5.5884, "mean_token_accuracy": 0.15286847501993178, "num_tokens": 14980528.0, "step": 8115 }, { "entropy": 5.87521915435791, "epoch": 0.6822096198277673, "grad_norm": 2.3125, "learning_rate": 0.0004959720971747843, "loss": 5.6149, "mean_token_accuracy": 0.15216847509145737, "num_tokens": 14989331.0, "step": 8120 }, { "entropy": 5.713017272949219, "epoch": 0.6826296996429322, "grad_norm": 1.421875, "learning_rate": 0.0004959664541980762, "loss": 5.598, "mean_token_accuracy": 0.15774561017751693, "num_tokens": 14999403.0, "step": 8125 }, { "entropy": 5.737113285064697, "epoch": 0.6830497794580971, "grad_norm": 1.578125, "learning_rate": 0.0004959608073070612, "loss": 5.6958, "mean_token_accuracy": 0.14559513479471206, "num_tokens": 15009388.0, "step": 8130 }, { "entropy": 5.837254619598388, "epoch": 0.6834698592732619, "grad_norm": 1.5859375, "learning_rate": 0.0004959551565018392, "loss": 5.6286, "mean_token_accuracy": 0.15535787492990494, "num_tokens": 15018586.0, "step": 8135 }, { "entropy": 5.778875064849854, "epoch": 0.6838899390884268, "grad_norm": 1.3359375, "learning_rate": 0.0004959495017825104, "loss": 5.6407, "mean_token_accuracy": 0.15465399324893953, "num_tokens": 15027982.0, "step": 8140 }, { "entropy": 5.739845132827758, "epoch": 0.6843100189035917, "grad_norm": 1.734375, "learning_rate": 0.0004959438431491749, "loss": 5.6278, "mean_token_accuracy": 0.15651622265577317, "num_tokens": 15037103.0, "step": 8145 }, { "entropy": 5.728132820129394, "epoch": 0.6847300987187566, "grad_norm": 1.453125, "learning_rate": 0.000495938180601933, "loss": 5.7184, "mean_token_accuracy": 0.14796946495771407, "num_tokens": 15046739.0, "step": 8150 }, { "entropy": 5.822361660003662, "epoch": 0.6851501785339215, "grad_norm": 1.5703125, "learning_rate": 0.0004959325141408851, "loss": 5.666, "mean_token_accuracy": 0.15593857914209366, "num_tokens": 15056586.0, "step": 8155 }, { "entropy": 5.768631410598755, "epoch": 0.6855702583490864, "grad_norm": 1.5703125, "learning_rate": 0.0004959268437661313, "loss": 5.641, "mean_token_accuracy": 0.15448189303278922, "num_tokens": 15066622.0, "step": 8160 }, { "entropy": 5.767803955078125, "epoch": 0.6859903381642513, "grad_norm": 1.4609375, "learning_rate": 0.0004959211694777724, "loss": 5.6293, "mean_token_accuracy": 0.15781602412462234, "num_tokens": 15075415.0, "step": 8165 }, { "entropy": 5.731510210037231, "epoch": 0.686410417979416, "grad_norm": 1.6796875, "learning_rate": 0.0004959154912759086, "loss": 5.6134, "mean_token_accuracy": 0.15285183787345885, "num_tokens": 15085087.0, "step": 8170 }, { "entropy": 5.772061681747436, "epoch": 0.6868304977945809, "grad_norm": 2.0, "learning_rate": 0.0004959098091606406, "loss": 5.6231, "mean_token_accuracy": 0.1562209889292717, "num_tokens": 15093580.0, "step": 8175 }, { "entropy": 5.681428337097168, "epoch": 0.6872505776097458, "grad_norm": 1.71875, "learning_rate": 0.0004959041231320692, "loss": 5.5996, "mean_token_accuracy": 0.15760979950428008, "num_tokens": 15104033.0, "step": 8180 }, { "entropy": 5.769718980789184, "epoch": 0.6876706574249107, "grad_norm": 1.4765625, "learning_rate": 0.0004958984331902951, "loss": 5.6773, "mean_token_accuracy": 0.14753246530890465, "num_tokens": 15113164.0, "step": 8185 }, { "entropy": 5.745969009399414, "epoch": 0.6880907372400756, "grad_norm": 1.75, "learning_rate": 0.0004958927393354188, "loss": 5.6297, "mean_token_accuracy": 0.15737390518188477, "num_tokens": 15122215.0, "step": 8190 }, { "entropy": 5.765387773513794, "epoch": 0.6885108170552405, "grad_norm": 1.4609375, "learning_rate": 0.0004958870415675415, "loss": 5.6091, "mean_token_accuracy": 0.15159644484519957, "num_tokens": 15130877.0, "step": 8195 }, { "entropy": 5.7833487033844, "epoch": 0.6889308968704054, "grad_norm": 1.40625, "learning_rate": 0.0004958813398867639, "loss": 5.5909, "mean_token_accuracy": 0.1610761597752571, "num_tokens": 15140227.0, "step": 8200 }, { "entropy": 5.874035358428955, "epoch": 0.6893509766855702, "grad_norm": 1.21875, "learning_rate": 0.0004958756342931872, "loss": 5.7618, "mean_token_accuracy": 0.14578953385353088, "num_tokens": 15150006.0, "step": 8205 }, { "entropy": 5.7979443073272705, "epoch": 0.6897710565007351, "grad_norm": 1.2890625, "learning_rate": 0.0004958699247869122, "loss": 5.6734, "mean_token_accuracy": 0.15173593461513518, "num_tokens": 15160032.0, "step": 8210 }, { "entropy": 5.775300407409668, "epoch": 0.6901911363159, "grad_norm": 1.40625, "learning_rate": 0.0004958642113680404, "loss": 5.607, "mean_token_accuracy": 0.15672277957201003, "num_tokens": 15168966.0, "step": 8215 }, { "entropy": 5.886404323577881, "epoch": 0.6906112161310649, "grad_norm": 1.7734375, "learning_rate": 0.0004958584940366727, "loss": 5.7931, "mean_token_accuracy": 0.1462364301085472, "num_tokens": 15179337.0, "step": 8220 }, { "entropy": 5.845329141616821, "epoch": 0.6910312959462298, "grad_norm": 1.4765625, "learning_rate": 0.0004958527727929106, "loss": 5.6901, "mean_token_accuracy": 0.15126113295555116, "num_tokens": 15188395.0, "step": 8225 }, { "entropy": 5.777632856369019, "epoch": 0.6914513757613947, "grad_norm": 1.359375, "learning_rate": 0.0004958470476368552, "loss": 5.6175, "mean_token_accuracy": 0.1590783603489399, "num_tokens": 15198669.0, "step": 8230 }, { "entropy": 5.717659664154053, "epoch": 0.6918714555765595, "grad_norm": 2.0625, "learning_rate": 0.0004958413185686082, "loss": 5.637, "mean_token_accuracy": 0.15654054433107376, "num_tokens": 15207371.0, "step": 8235 }, { "entropy": 5.771133661270142, "epoch": 0.6922915353917244, "grad_norm": 1.5625, "learning_rate": 0.0004958355855882709, "loss": 5.6623, "mean_token_accuracy": 0.15609176307916642, "num_tokens": 15215694.0, "step": 8240 }, { "entropy": 5.838139247894287, "epoch": 0.6927116152068893, "grad_norm": 1.59375, "learning_rate": 0.000495829848695945, "loss": 5.6462, "mean_token_accuracy": 0.15314621180295945, "num_tokens": 15224963.0, "step": 8245 }, { "entropy": 5.6792638301849365, "epoch": 0.6931316950220542, "grad_norm": 1.5078125, "learning_rate": 0.000495824107891732, "loss": 5.4601, "mean_token_accuracy": 0.16161370724439622, "num_tokens": 15233569.0, "step": 8250 }, { "entropy": 5.702935647964478, "epoch": 0.6935517748372191, "grad_norm": 1.421875, "learning_rate": 0.0004958183631757336, "loss": 5.6456, "mean_token_accuracy": 0.15384626239538193, "num_tokens": 15242671.0, "step": 8255 }, { "entropy": 5.757969760894776, "epoch": 0.693971854652384, "grad_norm": 1.5078125, "learning_rate": 0.0004958126145480517, "loss": 5.6062, "mean_token_accuracy": 0.15589472502470017, "num_tokens": 15251698.0, "step": 8260 }, { "entropy": 5.881031131744384, "epoch": 0.6943919344675489, "grad_norm": 2.265625, "learning_rate": 0.0004958068620087879, "loss": 5.7131, "mean_token_accuracy": 0.15278587341308594, "num_tokens": 15260608.0, "step": 8265 }, { "entropy": 5.7654228687286375, "epoch": 0.6948120142827137, "grad_norm": 1.78125, "learning_rate": 0.0004958011055580443, "loss": 5.5824, "mean_token_accuracy": 0.1566091775894165, "num_tokens": 15268866.0, "step": 8270 }, { "entropy": 5.691988468170166, "epoch": 0.6952320940978786, "grad_norm": 1.4609375, "learning_rate": 0.0004957953451959229, "loss": 5.5428, "mean_token_accuracy": 0.1687786027789116, "num_tokens": 15277600.0, "step": 8275 }, { "entropy": 5.712690019607544, "epoch": 0.6956521739130435, "grad_norm": 1.3203125, "learning_rate": 0.0004957895809225254, "loss": 5.577, "mean_token_accuracy": 0.15904618948698043, "num_tokens": 15286016.0, "step": 8280 }, { "entropy": 5.791261529922485, "epoch": 0.6960722537282084, "grad_norm": 1.46875, "learning_rate": 0.0004957838127379544, "loss": 5.6203, "mean_token_accuracy": 0.15775981694459915, "num_tokens": 15294676.0, "step": 8285 }, { "entropy": 5.787760162353516, "epoch": 0.6964923335433733, "grad_norm": 1.7890625, "learning_rate": 0.0004957780406423118, "loss": 5.6093, "mean_token_accuracy": 0.1520596593618393, "num_tokens": 15304084.0, "step": 8290 }, { "entropy": 5.732133674621582, "epoch": 0.6969124133585382, "grad_norm": 1.515625, "learning_rate": 0.0004957722646356999, "loss": 5.6145, "mean_token_accuracy": 0.15437885522842407, "num_tokens": 15314182.0, "step": 8295 }, { "entropy": 5.82383394241333, "epoch": 0.697332493173703, "grad_norm": 1.4921875, "learning_rate": 0.0004957664847182209, "loss": 5.7321, "mean_token_accuracy": 0.14916351363062857, "num_tokens": 15324213.0, "step": 8300 }, { "entropy": 5.901606464385987, "epoch": 0.6977525729888678, "grad_norm": 1.515625, "learning_rate": 0.0004957607008899774, "loss": 5.6654, "mean_token_accuracy": 0.14808408319950103, "num_tokens": 15333122.0, "step": 8305 }, { "entropy": 5.821764516830444, "epoch": 0.6981726528040327, "grad_norm": 1.75, "learning_rate": 0.0004957549131510717, "loss": 5.7587, "mean_token_accuracy": 0.14488900303840638, "num_tokens": 15342199.0, "step": 8310 }, { "entropy": 5.85214409828186, "epoch": 0.6985927326191976, "grad_norm": 1.578125, "learning_rate": 0.0004957491215016065, "loss": 5.7068, "mean_token_accuracy": 0.14899201691150665, "num_tokens": 15352463.0, "step": 8315 }, { "entropy": 5.7340789318084715, "epoch": 0.6990128124343625, "grad_norm": 1.8828125, "learning_rate": 0.0004957433259416841, "loss": 5.5519, "mean_token_accuracy": 0.15695535391569138, "num_tokens": 15361815.0, "step": 8320 }, { "entropy": 5.829116296768189, "epoch": 0.6994328922495274, "grad_norm": 1.4296875, "learning_rate": 0.0004957375264714075, "loss": 5.6665, "mean_token_accuracy": 0.14441719949245452, "num_tokens": 15371773.0, "step": 8325 }, { "entropy": 5.731393432617187, "epoch": 0.6998529720646923, "grad_norm": 1.46875, "learning_rate": 0.0004957317230908792, "loss": 5.6078, "mean_token_accuracy": 0.153985595703125, "num_tokens": 15380881.0, "step": 8330 }, { "entropy": 5.69814658164978, "epoch": 0.7002730518798572, "grad_norm": 1.4765625, "learning_rate": 0.0004957259158002022, "loss": 5.4853, "mean_token_accuracy": 0.16338536590337754, "num_tokens": 15389310.0, "step": 8335 }, { "entropy": 5.65314564704895, "epoch": 0.700693131695022, "grad_norm": 1.6328125, "learning_rate": 0.0004957201045994791, "loss": 5.585, "mean_token_accuracy": 0.15192776024341584, "num_tokens": 15398584.0, "step": 8340 }, { "entropy": 5.752124881744384, "epoch": 0.7011132115101869, "grad_norm": 1.328125, "learning_rate": 0.0004957142894888131, "loss": 5.6244, "mean_token_accuracy": 0.1605387285351753, "num_tokens": 15407208.0, "step": 8345 }, { "entropy": 5.781596279144287, "epoch": 0.7015332913253518, "grad_norm": 1.5546875, "learning_rate": 0.0004957084704683071, "loss": 5.6552, "mean_token_accuracy": 0.15119443833827972, "num_tokens": 15416474.0, "step": 8350 }, { "entropy": 5.796496915817261, "epoch": 0.7019533711405167, "grad_norm": 1.6171875, "learning_rate": 0.0004957026475380642, "loss": 5.6589, "mean_token_accuracy": 0.1581042394042015, "num_tokens": 15426101.0, "step": 8355 }, { "entropy": 5.8482013702392575, "epoch": 0.7023734509556816, "grad_norm": 1.4140625, "learning_rate": 0.0004956968206981875, "loss": 5.6866, "mean_token_accuracy": 0.1528375506401062, "num_tokens": 15435910.0, "step": 8360 }, { "entropy": 5.838450860977173, "epoch": 0.7027935307708465, "grad_norm": 1.8359375, "learning_rate": 0.0004956909899487803, "loss": 5.7297, "mean_token_accuracy": 0.14721868485212325, "num_tokens": 15445494.0, "step": 8365 }, { "entropy": 5.773874664306641, "epoch": 0.7032136105860114, "grad_norm": 1.484375, "learning_rate": 0.0004956851552899459, "loss": 5.6133, "mean_token_accuracy": 0.15867630988359452, "num_tokens": 15455332.0, "step": 8370 }, { "entropy": 5.7730677127838135, "epoch": 0.7036336904011762, "grad_norm": 1.546875, "learning_rate": 0.0004956793167217874, "loss": 5.6813, "mean_token_accuracy": 0.1490170478820801, "num_tokens": 15464241.0, "step": 8375 }, { "entropy": 5.8777241706848145, "epoch": 0.7040537702163411, "grad_norm": 2.0625, "learning_rate": 0.0004956734742444087, "loss": 5.6821, "mean_token_accuracy": 0.15121965557336808, "num_tokens": 15473473.0, "step": 8380 }, { "entropy": 5.744890403747559, "epoch": 0.704473850031506, "grad_norm": 1.4375, "learning_rate": 0.0004956676278579129, "loss": 5.563, "mean_token_accuracy": 0.15540574193000795, "num_tokens": 15482494.0, "step": 8385 }, { "entropy": 5.676463556289673, "epoch": 0.7048939298466709, "grad_norm": 1.4375, "learning_rate": 0.0004956617775624037, "loss": 5.5724, "mean_token_accuracy": 0.15146812200546264, "num_tokens": 15491180.0, "step": 8390 }, { "entropy": 5.786671447753906, "epoch": 0.7053140096618358, "grad_norm": 2.078125, "learning_rate": 0.0004956559233579848, "loss": 5.6148, "mean_token_accuracy": 0.15258617997169494, "num_tokens": 15501035.0, "step": 8395 }, { "entropy": 5.7913405418396, "epoch": 0.7057340894770007, "grad_norm": 1.3671875, "learning_rate": 0.0004956500652447598, "loss": 5.5994, "mean_token_accuracy": 0.15323785319924355, "num_tokens": 15510191.0, "step": 8400 }, { "entropy": 5.706702041625976, "epoch": 0.7061541692921655, "grad_norm": 1.5, "learning_rate": 0.0004956442032228324, "loss": 5.6875, "mean_token_accuracy": 0.15146460086107255, "num_tokens": 15519253.0, "step": 8405 }, { "entropy": 5.7468561172485355, "epoch": 0.7065742491073304, "grad_norm": 1.4453125, "learning_rate": 0.0004956383372923067, "loss": 5.6573, "mean_token_accuracy": 0.15219423472881316, "num_tokens": 15528348.0, "step": 8410 }, { "entropy": 5.909702920913697, "epoch": 0.7069943289224953, "grad_norm": 1.4765625, "learning_rate": 0.0004956324674532864, "loss": 5.7312, "mean_token_accuracy": 0.14496915340423583, "num_tokens": 15537557.0, "step": 8415 }, { "entropy": 5.853457021713257, "epoch": 0.7074144087376601, "grad_norm": 1.375, "learning_rate": 0.0004956265937058757, "loss": 5.6662, "mean_token_accuracy": 0.14985378384590148, "num_tokens": 15546745.0, "step": 8420 }, { "entropy": 5.753704071044922, "epoch": 0.707834488552825, "grad_norm": 1.65625, "learning_rate": 0.0004956207160501784, "loss": 5.5646, "mean_token_accuracy": 0.15850543081760407, "num_tokens": 15555532.0, "step": 8425 }, { "entropy": 5.728769159317016, "epoch": 0.70825456836799, "grad_norm": 1.4921875, "learning_rate": 0.0004956148344862987, "loss": 5.6209, "mean_token_accuracy": 0.1560587242245674, "num_tokens": 15564189.0, "step": 8430 }, { "entropy": 5.664771509170532, "epoch": 0.7086746481831548, "grad_norm": 1.7265625, "learning_rate": 0.0004956089490143408, "loss": 5.6492, "mean_token_accuracy": 0.15197667628526687, "num_tokens": 15574116.0, "step": 8435 }, { "entropy": 5.824323844909668, "epoch": 0.7090947279983196, "grad_norm": 1.484375, "learning_rate": 0.0004956030596344089, "loss": 5.6473, "mean_token_accuracy": 0.149012803286314, "num_tokens": 15583031.0, "step": 8440 }, { "entropy": 5.836510467529297, "epoch": 0.7095148078134845, "grad_norm": 1.6953125, "learning_rate": 0.0004955971663466075, "loss": 5.7671, "mean_token_accuracy": 0.15028237402439118, "num_tokens": 15592576.0, "step": 8445 }, { "entropy": 5.823656129837036, "epoch": 0.7099348876286494, "grad_norm": 1.78125, "learning_rate": 0.0004955912691510407, "loss": 5.697, "mean_token_accuracy": 0.15281013548374175, "num_tokens": 15601065.0, "step": 8450 }, { "entropy": 5.751941967010498, "epoch": 0.7103549674438143, "grad_norm": 1.890625, "learning_rate": 0.0004955853680478134, "loss": 5.633, "mean_token_accuracy": 0.14754925668239594, "num_tokens": 15610112.0, "step": 8455 }, { "entropy": 5.778195095062256, "epoch": 0.7107750472589792, "grad_norm": 1.859375, "learning_rate": 0.0004955794630370297, "loss": 5.6139, "mean_token_accuracy": 0.15469905436038972, "num_tokens": 15618890.0, "step": 8460 }, { "entropy": 5.750346851348877, "epoch": 0.7111951270741441, "grad_norm": 1.7578125, "learning_rate": 0.0004955735541187945, "loss": 5.6397, "mean_token_accuracy": 0.15139740109443664, "num_tokens": 15627678.0, "step": 8465 }, { "entropy": 5.838537120819092, "epoch": 0.711615206889309, "grad_norm": 2.265625, "learning_rate": 0.0004955676412932124, "loss": 5.6254, "mean_token_accuracy": 0.15495479255914688, "num_tokens": 15636833.0, "step": 8470 }, { "entropy": 5.758643341064453, "epoch": 0.7120352867044738, "grad_norm": 2.390625, "learning_rate": 0.0004955617245603881, "loss": 5.6441, "mean_token_accuracy": 0.1475740984082222, "num_tokens": 15646571.0, "step": 8475 }, { "entropy": 5.771809864044189, "epoch": 0.7124553665196387, "grad_norm": 1.546875, "learning_rate": 0.0004955558039204263, "loss": 5.6883, "mean_token_accuracy": 0.1559377834200859, "num_tokens": 15654907.0, "step": 8480 }, { "entropy": 5.87169828414917, "epoch": 0.7128754463348036, "grad_norm": 1.5859375, "learning_rate": 0.0004955498793734321, "loss": 5.6259, "mean_token_accuracy": 0.15253366231918336, "num_tokens": 15664336.0, "step": 8485 }, { "entropy": 5.775359678268432, "epoch": 0.7132955261499685, "grad_norm": 1.6640625, "learning_rate": 0.0004955439509195103, "loss": 5.6818, "mean_token_accuracy": 0.15552834868431092, "num_tokens": 15674000.0, "step": 8490 }, { "entropy": 5.817126750946045, "epoch": 0.7137156059651334, "grad_norm": 1.5625, "learning_rate": 0.0004955380185587661, "loss": 5.6655, "mean_token_accuracy": 0.15541905909776688, "num_tokens": 15684214.0, "step": 8495 }, { "entropy": 5.823128080368042, "epoch": 0.7141356857802983, "grad_norm": 2.65625, "learning_rate": 0.0004955320822913043, "loss": 5.695, "mean_token_accuracy": 0.14909214079380034, "num_tokens": 15693546.0, "step": 8500 }, { "entropy": 5.796035861968994, "epoch": 0.7145557655954632, "grad_norm": 1.328125, "learning_rate": 0.0004955261421172302, "loss": 5.6006, "mean_token_accuracy": 0.15094921365380287, "num_tokens": 15702310.0, "step": 8505 }, { "entropy": 5.765657234191894, "epoch": 0.714975845410628, "grad_norm": 1.59375, "learning_rate": 0.0004955201980366493, "loss": 5.6549, "mean_token_accuracy": 0.1583261877298355, "num_tokens": 15711544.0, "step": 8510 }, { "entropy": 5.701775074005127, "epoch": 0.7153959252257929, "grad_norm": 1.7109375, "learning_rate": 0.0004955142500496665, "loss": 5.5378, "mean_token_accuracy": 0.15932040065526962, "num_tokens": 15720914.0, "step": 8515 }, { "entropy": 5.806231927871704, "epoch": 0.7158160050409578, "grad_norm": 1.6328125, "learning_rate": 0.0004955082981563872, "loss": 5.636, "mean_token_accuracy": 0.1497705653309822, "num_tokens": 15729825.0, "step": 8520 }, { "entropy": 5.731112813949585, "epoch": 0.7162360848561227, "grad_norm": 1.984375, "learning_rate": 0.000495502342356917, "loss": 5.6407, "mean_token_accuracy": 0.15358344316482545, "num_tokens": 15739649.0, "step": 8525 }, { "entropy": 5.775957298278809, "epoch": 0.7166561646712876, "grad_norm": 1.7265625, "learning_rate": 0.0004954963826513614, "loss": 5.5312, "mean_token_accuracy": 0.15533651560544967, "num_tokens": 15747805.0, "step": 8530 }, { "entropy": 5.848172760009765, "epoch": 0.7170762444864525, "grad_norm": 1.7421875, "learning_rate": 0.000495490419039826, "loss": 5.6763, "mean_token_accuracy": 0.15182012543082238, "num_tokens": 15757267.0, "step": 8535 }, { "entropy": 5.734999704360962, "epoch": 0.7174963243016174, "grad_norm": 1.703125, "learning_rate": 0.0004954844515224162, "loss": 5.6442, "mean_token_accuracy": 0.15498089045286179, "num_tokens": 15767412.0, "step": 8540 }, { "entropy": 5.702851438522339, "epoch": 0.7179164041167821, "grad_norm": 1.6953125, "learning_rate": 0.0004954784800992379, "loss": 5.6434, "mean_token_accuracy": 0.1511929914355278, "num_tokens": 15776813.0, "step": 8545 }, { "entropy": 5.8534894466400145, "epoch": 0.718336483931947, "grad_norm": 1.6484375, "learning_rate": 0.0004954725047703969, "loss": 5.6771, "mean_token_accuracy": 0.152647565305233, "num_tokens": 15786258.0, "step": 8550 }, { "entropy": 5.836289310455323, "epoch": 0.7187565637471119, "grad_norm": 2.09375, "learning_rate": 0.000495466525535999, "loss": 5.6667, "mean_token_accuracy": 0.15143323093652725, "num_tokens": 15795673.0, "step": 8555 }, { "entropy": 5.811659526824951, "epoch": 0.7191766435622768, "grad_norm": 2.234375, "learning_rate": 0.0004954605423961501, "loss": 5.6561, "mean_token_accuracy": 0.15157762318849563, "num_tokens": 15805050.0, "step": 8560 }, { "entropy": 5.681427240371704, "epoch": 0.7195967233774417, "grad_norm": 1.703125, "learning_rate": 0.0004954545553509562, "loss": 5.606, "mean_token_accuracy": 0.16409880369901658, "num_tokens": 15813347.0, "step": 8565 }, { "entropy": 5.839797496795654, "epoch": 0.7200168031926066, "grad_norm": 1.4765625, "learning_rate": 0.0004954485644005235, "loss": 5.7266, "mean_token_accuracy": 0.1489485539495945, "num_tokens": 15823528.0, "step": 8570 }, { "entropy": 5.8334362506866455, "epoch": 0.7204368830077714, "grad_norm": 1.703125, "learning_rate": 0.0004954425695449578, "loss": 5.6173, "mean_token_accuracy": 0.15086468532681466, "num_tokens": 15832727.0, "step": 8575 }, { "entropy": 5.822533702850341, "epoch": 0.7208569628229363, "grad_norm": 1.71875, "learning_rate": 0.0004954365707843657, "loss": 5.6976, "mean_token_accuracy": 0.14436446502804756, "num_tokens": 15842402.0, "step": 8580 }, { "entropy": 5.748192930221558, "epoch": 0.7212770426381012, "grad_norm": 1.4140625, "learning_rate": 0.0004954305681188531, "loss": 5.5623, "mean_token_accuracy": 0.1519525095820427, "num_tokens": 15850886.0, "step": 8585 }, { "entropy": 5.9683891296386715, "epoch": 0.7216971224532661, "grad_norm": 1.6015625, "learning_rate": 0.0004954245615485265, "loss": 5.8576, "mean_token_accuracy": 0.14881062209606172, "num_tokens": 15860093.0, "step": 8590 }, { "entropy": 5.825228261947632, "epoch": 0.722117202268431, "grad_norm": 1.4921875, "learning_rate": 0.0004954185510734924, "loss": 5.5603, "mean_token_accuracy": 0.15691882967948914, "num_tokens": 15868681.0, "step": 8595 }, { "entropy": 5.775141906738281, "epoch": 0.7225372820835959, "grad_norm": 1.4296875, "learning_rate": 0.0004954125366938571, "loss": 5.6425, "mean_token_accuracy": 0.15889365077018738, "num_tokens": 15878041.0, "step": 8600 }, { "entropy": 5.759042358398437, "epoch": 0.7229573618987608, "grad_norm": 2.328125, "learning_rate": 0.0004954065184097271, "loss": 5.6357, "mean_token_accuracy": 0.15483569353818893, "num_tokens": 15887562.0, "step": 8605 }, { "entropy": 5.751525020599365, "epoch": 0.7233774417139256, "grad_norm": 1.8046875, "learning_rate": 0.0004954004962212092, "loss": 5.5541, "mean_token_accuracy": 0.1643654190003872, "num_tokens": 15896480.0, "step": 8610 }, { "entropy": 5.911052465438843, "epoch": 0.7237975215290905, "grad_norm": 1.65625, "learning_rate": 0.0004953944701284101, "loss": 5.7752, "mean_token_accuracy": 0.1463731437921524, "num_tokens": 15906743.0, "step": 8615 }, { "entropy": 5.830478382110596, "epoch": 0.7242176013442554, "grad_norm": 1.640625, "learning_rate": 0.0004953884401314363, "loss": 5.7213, "mean_token_accuracy": 0.13995275720953942, "num_tokens": 15915981.0, "step": 8620 }, { "entropy": 5.8113525867462155, "epoch": 0.7246376811594203, "grad_norm": 1.53125, "learning_rate": 0.0004953824062303949, "loss": 5.5765, "mean_token_accuracy": 0.1530995100736618, "num_tokens": 15924117.0, "step": 8625 }, { "entropy": 5.7734462261199955, "epoch": 0.7250577609745852, "grad_norm": 1.5546875, "learning_rate": 0.0004953763684253926, "loss": 5.6054, "mean_token_accuracy": 0.16219132840633393, "num_tokens": 15933124.0, "step": 8630 }, { "entropy": 5.7224249839782715, "epoch": 0.7254778407897501, "grad_norm": 1.96875, "learning_rate": 0.0004953703267165364, "loss": 5.5024, "mean_token_accuracy": 0.1558832585811615, "num_tokens": 15942422.0, "step": 8635 }, { "entropy": 5.749732875823975, "epoch": 0.725897920604915, "grad_norm": 1.515625, "learning_rate": 0.0004953642811039332, "loss": 5.7128, "mean_token_accuracy": 0.14854123890399934, "num_tokens": 15950989.0, "step": 8640 }, { "entropy": 5.855362319946289, "epoch": 0.7263180004200798, "grad_norm": 1.6640625, "learning_rate": 0.0004953582315876904, "loss": 5.7185, "mean_token_accuracy": 0.15013131573796273, "num_tokens": 15959659.0, "step": 8645 }, { "entropy": 5.837911462783813, "epoch": 0.7267380802352447, "grad_norm": 1.859375, "learning_rate": 0.000495352178167915, "loss": 5.5977, "mean_token_accuracy": 0.16410948783159257, "num_tokens": 15968102.0, "step": 8650 }, { "entropy": 5.854554653167725, "epoch": 0.7271581600504096, "grad_norm": 2.28125, "learning_rate": 0.0004953461208447143, "loss": 5.7132, "mean_token_accuracy": 0.14808624759316444, "num_tokens": 15977705.0, "step": 8655 }, { "entropy": 5.801808023452759, "epoch": 0.7275782398655745, "grad_norm": 2.328125, "learning_rate": 0.0004953400596181953, "loss": 5.7244, "mean_token_accuracy": 0.1447308510541916, "num_tokens": 15986703.0, "step": 8660 }, { "entropy": 5.839752292633056, "epoch": 0.7279983196807394, "grad_norm": 1.6796875, "learning_rate": 0.0004953339944884657, "loss": 5.6309, "mean_token_accuracy": 0.15707603991031646, "num_tokens": 15995672.0, "step": 8665 }, { "entropy": 5.702234554290771, "epoch": 0.7284183994959043, "grad_norm": 1.6328125, "learning_rate": 0.0004953279254556329, "loss": 5.5683, "mean_token_accuracy": 0.16529579162597657, "num_tokens": 16004437.0, "step": 8670 }, { "entropy": 5.786400604248047, "epoch": 0.7288384793110692, "grad_norm": 1.5, "learning_rate": 0.0004953218525198043, "loss": 5.6136, "mean_token_accuracy": 0.1482889771461487, "num_tokens": 16012847.0, "step": 8675 }, { "entropy": 5.820078039169312, "epoch": 0.7292585591262339, "grad_norm": 9.3125, "learning_rate": 0.0004953157756810876, "loss": 5.6444, "mean_token_accuracy": 0.15196260213851928, "num_tokens": 16022213.0, "step": 8680 }, { "entropy": 5.784472417831421, "epoch": 0.7296786389413988, "grad_norm": 1.7421875, "learning_rate": 0.0004953096949395902, "loss": 5.6938, "mean_token_accuracy": 0.15605147629976274, "num_tokens": 16031411.0, "step": 8685 }, { "entropy": 5.822618913650513, "epoch": 0.7300987187565637, "grad_norm": 1.59375, "learning_rate": 0.0004953036102954202, "loss": 5.7282, "mean_token_accuracy": 0.14967211931943894, "num_tokens": 16041227.0, "step": 8690 }, { "entropy": 5.778734588623047, "epoch": 0.7305187985717286, "grad_norm": 1.671875, "learning_rate": 0.0004952975217486852, "loss": 5.5479, "mean_token_accuracy": 0.1602558448910713, "num_tokens": 16049777.0, "step": 8695 }, { "entropy": 5.83000955581665, "epoch": 0.7309388783868935, "grad_norm": 2.609375, "learning_rate": 0.0004952914292994928, "loss": 5.659, "mean_token_accuracy": 0.15439933240413667, "num_tokens": 16059093.0, "step": 8700 }, { "entropy": 5.840744495391846, "epoch": 0.7313589582020584, "grad_norm": 1.5546875, "learning_rate": 0.0004952853329479514, "loss": 5.6861, "mean_token_accuracy": 0.15537820011377335, "num_tokens": 16068550.0, "step": 8705 }, { "entropy": 5.810123777389526, "epoch": 0.7317790380172233, "grad_norm": 2.015625, "learning_rate": 0.0004952792326941686, "loss": 5.7191, "mean_token_accuracy": 0.14849043488502503, "num_tokens": 16078286.0, "step": 8710 }, { "entropy": 5.814086198806763, "epoch": 0.7321991178323881, "grad_norm": 2.03125, "learning_rate": 0.0004952731285382527, "loss": 5.6667, "mean_token_accuracy": 0.15178068578243256, "num_tokens": 16087560.0, "step": 8715 }, { "entropy": 5.787434864044189, "epoch": 0.732619197647553, "grad_norm": 2.515625, "learning_rate": 0.0004952670204803118, "loss": 5.6204, "mean_token_accuracy": 0.1559364140033722, "num_tokens": 16097478.0, "step": 8720 }, { "entropy": 5.850944232940674, "epoch": 0.7330392774627179, "grad_norm": 1.7734375, "learning_rate": 0.0004952609085204539, "loss": 5.7189, "mean_token_accuracy": 0.15533626079559326, "num_tokens": 16106884.0, "step": 8725 }, { "entropy": 5.731724834442138, "epoch": 0.7334593572778828, "grad_norm": 1.8515625, "learning_rate": 0.0004952547926587876, "loss": 5.6334, "mean_token_accuracy": 0.15004593282938003, "num_tokens": 16115689.0, "step": 8730 }, { "entropy": 5.7415611743927, "epoch": 0.7338794370930477, "grad_norm": 2.375, "learning_rate": 0.0004952486728954209, "loss": 5.5761, "mean_token_accuracy": 0.1599406212568283, "num_tokens": 16125237.0, "step": 8735 }, { "entropy": 5.7435039520263675, "epoch": 0.7342995169082126, "grad_norm": 1.5859375, "learning_rate": 0.0004952425492304624, "loss": 5.5816, "mean_token_accuracy": 0.15830608755350112, "num_tokens": 16133940.0, "step": 8740 }, { "entropy": 5.803058242797851, "epoch": 0.7347195967233774, "grad_norm": 2.546875, "learning_rate": 0.0004952364216640207, "loss": 5.6865, "mean_token_accuracy": 0.15288463681936265, "num_tokens": 16143256.0, "step": 8745 }, { "entropy": 5.834009265899658, "epoch": 0.7351396765385423, "grad_norm": 1.609375, "learning_rate": 0.000495230290196204, "loss": 5.5648, "mean_token_accuracy": 0.15222593396902084, "num_tokens": 16153259.0, "step": 8750 }, { "entropy": 5.86444673538208, "epoch": 0.7355597563537072, "grad_norm": 1.9765625, "learning_rate": 0.0004952241548271212, "loss": 5.8055, "mean_token_accuracy": 0.14142679050564766, "num_tokens": 16162125.0, "step": 8755 }, { "entropy": 5.84849967956543, "epoch": 0.7359798361688721, "grad_norm": 1.6015625, "learning_rate": 0.0004952180155568809, "loss": 5.7224, "mean_token_accuracy": 0.14703101068735122, "num_tokens": 16171680.0, "step": 8760 }, { "entropy": 5.853292989730835, "epoch": 0.736399915984037, "grad_norm": 1.6015625, "learning_rate": 0.0004952118723855919, "loss": 5.7153, "mean_token_accuracy": 0.15350899547338487, "num_tokens": 16181559.0, "step": 8765 }, { "entropy": 5.755408191680909, "epoch": 0.7368199957992019, "grad_norm": 1.53125, "learning_rate": 0.0004952057253133628, "loss": 5.668, "mean_token_accuracy": 0.15180395692586898, "num_tokens": 16190611.0, "step": 8770 }, { "entropy": 5.833858060836792, "epoch": 0.7372400756143668, "grad_norm": 1.6640625, "learning_rate": 0.0004951995743403028, "loss": 5.6769, "mean_token_accuracy": 0.15253981202840805, "num_tokens": 16200156.0, "step": 8775 }, { "entropy": 5.824840307235718, "epoch": 0.7376601554295316, "grad_norm": 1.859375, "learning_rate": 0.0004951934194665208, "loss": 5.6458, "mean_token_accuracy": 0.14709821194410325, "num_tokens": 16209808.0, "step": 8780 }, { "entropy": 5.756002902984619, "epoch": 0.7380802352446965, "grad_norm": 1.609375, "learning_rate": 0.0004951872606921257, "loss": 5.6136, "mean_token_accuracy": 0.15270906686782837, "num_tokens": 16219243.0, "step": 8785 }, { "entropy": 5.72284197807312, "epoch": 0.7385003150598614, "grad_norm": 1.7421875, "learning_rate": 0.0004951810980172265, "loss": 5.627, "mean_token_accuracy": 0.1641955330967903, "num_tokens": 16228180.0, "step": 8790 }, { "entropy": 5.785319805145264, "epoch": 0.7389203948750263, "grad_norm": 1.515625, "learning_rate": 0.0004951749314419327, "loss": 5.6417, "mean_token_accuracy": 0.15115589275956154, "num_tokens": 16237045.0, "step": 8795 }, { "entropy": 5.791619110107422, "epoch": 0.7393404746901912, "grad_norm": 1.7734375, "learning_rate": 0.0004951687609663533, "loss": 5.5589, "mean_token_accuracy": 0.15952047407627107, "num_tokens": 16245307.0, "step": 8800 }, { "entropy": 5.765593528747559, "epoch": 0.739760554505356, "grad_norm": 1.6015625, "learning_rate": 0.0004951625865905977, "loss": 5.5974, "mean_token_accuracy": 0.14921371787786483, "num_tokens": 16255047.0, "step": 8805 }, { "entropy": 5.749333095550537, "epoch": 0.740180634320521, "grad_norm": 1.640625, "learning_rate": 0.0004951564083147753, "loss": 5.6447, "mean_token_accuracy": 0.1600167080760002, "num_tokens": 16264969.0, "step": 8810 }, { "entropy": 5.81842737197876, "epoch": 0.7406007141356857, "grad_norm": 1.46875, "learning_rate": 0.0004951502261389953, "loss": 5.7327, "mean_token_accuracy": 0.14656912833452224, "num_tokens": 16274757.0, "step": 8815 }, { "entropy": 5.780880069732666, "epoch": 0.7410207939508506, "grad_norm": 1.7421875, "learning_rate": 0.0004951440400633677, "loss": 5.6351, "mean_token_accuracy": 0.16265199482440948, "num_tokens": 16283409.0, "step": 8820 }, { "entropy": 5.687593412399292, "epoch": 0.7414408737660155, "grad_norm": 1.6875, "learning_rate": 0.0004951378500880015, "loss": 5.5962, "mean_token_accuracy": 0.1549723207950592, "num_tokens": 16293206.0, "step": 8825 }, { "entropy": 5.82498950958252, "epoch": 0.7418609535811804, "grad_norm": 1.609375, "learning_rate": 0.0004951316562130067, "loss": 5.6332, "mean_token_accuracy": 0.15318880528211593, "num_tokens": 16303121.0, "step": 8830 }, { "entropy": 5.778778553009033, "epoch": 0.7422810333963453, "grad_norm": 1.46875, "learning_rate": 0.000495125458438493, "loss": 5.5975, "mean_token_accuracy": 0.16230110377073287, "num_tokens": 16312710.0, "step": 8835 }, { "entropy": 5.8864704132080075, "epoch": 0.7427011132115102, "grad_norm": 1.6875, "learning_rate": 0.0004951192567645702, "loss": 5.7853, "mean_token_accuracy": 0.14685365781188012, "num_tokens": 16322280.0, "step": 8840 }, { "entropy": 5.721866273880005, "epoch": 0.7431211930266751, "grad_norm": 1.625, "learning_rate": 0.0004951130511913481, "loss": 5.635, "mean_token_accuracy": 0.15453375428915023, "num_tokens": 16331656.0, "step": 8845 }, { "entropy": 5.7635541439056395, "epoch": 0.7435412728418399, "grad_norm": 1.4921875, "learning_rate": 0.0004951068417189366, "loss": 5.6607, "mean_token_accuracy": 0.15400536656379699, "num_tokens": 16341074.0, "step": 8850 }, { "entropy": 5.806599426269531, "epoch": 0.7439613526570048, "grad_norm": 1.6484375, "learning_rate": 0.0004951006283474457, "loss": 5.6525, "mean_token_accuracy": 0.15177395343780517, "num_tokens": 16350097.0, "step": 8855 }, { "entropy": 5.6168114185333256, "epoch": 0.7443814324721697, "grad_norm": 1.5625, "learning_rate": 0.0004950944110769856, "loss": 5.5518, "mean_token_accuracy": 0.16385273784399032, "num_tokens": 16359274.0, "step": 8860 }, { "entropy": 5.655103158950806, "epoch": 0.7448015122873346, "grad_norm": 1.8046875, "learning_rate": 0.0004950881899076663, "loss": 5.5365, "mean_token_accuracy": 0.1682687819004059, "num_tokens": 16368445.0, "step": 8865 }, { "entropy": 5.878038167953491, "epoch": 0.7452215921024995, "grad_norm": 1.6796875, "learning_rate": 0.0004950819648395979, "loss": 5.6423, "mean_token_accuracy": 0.1565190926194191, "num_tokens": 16377689.0, "step": 8870 }, { "entropy": 5.772777366638183, "epoch": 0.7456416719176644, "grad_norm": 1.5390625, "learning_rate": 0.000495075735872891, "loss": 5.5949, "mean_token_accuracy": 0.1571029394865036, "num_tokens": 16386713.0, "step": 8875 }, { "entropy": 5.772426748275757, "epoch": 0.7460617517328293, "grad_norm": 1.8671875, "learning_rate": 0.0004950695030076557, "loss": 5.6116, "mean_token_accuracy": 0.152817103266716, "num_tokens": 16395390.0, "step": 8880 }, { "entropy": 5.862038803100586, "epoch": 0.7464818315479941, "grad_norm": 1.59375, "learning_rate": 0.0004950632662440027, "loss": 5.6909, "mean_token_accuracy": 0.15143778100609778, "num_tokens": 16404531.0, "step": 8885 }, { "entropy": 5.734190988540649, "epoch": 0.746901911363159, "grad_norm": 1.578125, "learning_rate": 0.0004950570255820419, "loss": 5.5892, "mean_token_accuracy": 0.15557831078767775, "num_tokens": 16413649.0, "step": 8890 }, { "entropy": 5.679434442520142, "epoch": 0.7473219911783239, "grad_norm": 1.96875, "learning_rate": 0.0004950507810218843, "loss": 5.7074, "mean_token_accuracy": 0.14878712072968484, "num_tokens": 16423247.0, "step": 8895 }, { "entropy": 5.8338196754455565, "epoch": 0.7477420709934888, "grad_norm": 1.515625, "learning_rate": 0.0004950445325636405, "loss": 5.649, "mean_token_accuracy": 0.14864842891693114, "num_tokens": 16432190.0, "step": 8900 }, { "entropy": 5.864486503601074, "epoch": 0.7481621508086537, "grad_norm": 2.15625, "learning_rate": 0.0004950382802074211, "loss": 5.6038, "mean_token_accuracy": 0.15934911370277405, "num_tokens": 16443091.0, "step": 8905 }, { "entropy": 5.711412811279297, "epoch": 0.7485822306238186, "grad_norm": 1.703125, "learning_rate": 0.0004950320239533369, "loss": 5.6338, "mean_token_accuracy": 0.15670278668403625, "num_tokens": 16452077.0, "step": 8910 }, { "entropy": 5.8399248123168945, "epoch": 0.7490023104389834, "grad_norm": 1.859375, "learning_rate": 0.0004950257638014986, "loss": 5.7602, "mean_token_accuracy": 0.14474717825651168, "num_tokens": 16461893.0, "step": 8915 }, { "entropy": 5.905817985534668, "epoch": 0.7494223902541483, "grad_norm": 1.484375, "learning_rate": 0.0004950194997520172, "loss": 5.5814, "mean_token_accuracy": 0.1564013957977295, "num_tokens": 16470904.0, "step": 8920 }, { "entropy": 5.779659080505371, "epoch": 0.7498424700693131, "grad_norm": 2.078125, "learning_rate": 0.0004950132318050037, "loss": 5.6502, "mean_token_accuracy": 0.14872682839632034, "num_tokens": 16480130.0, "step": 8925 }, { "entropy": 5.735926008224487, "epoch": 0.750262549884478, "grad_norm": 1.6328125, "learning_rate": 0.0004950069599605691, "loss": 5.7004, "mean_token_accuracy": 0.1561155989766121, "num_tokens": 16489485.0, "step": 8930 }, { "entropy": 5.7690812110900875, "epoch": 0.750682629699643, "grad_norm": 1.5859375, "learning_rate": 0.0004950006842188245, "loss": 5.6526, "mean_token_accuracy": 0.15704655051231384, "num_tokens": 16498529.0, "step": 8935 }, { "entropy": 5.776333618164062, "epoch": 0.7511027095148078, "grad_norm": 1.6875, "learning_rate": 0.000494994404579881, "loss": 5.5733, "mean_token_accuracy": 0.1540952205657959, "num_tokens": 16508094.0, "step": 8940 }, { "entropy": 5.810970735549927, "epoch": 0.7515227893299727, "grad_norm": 1.5859375, "learning_rate": 0.00049498812104385, "loss": 5.6854, "mean_token_accuracy": 0.14840709492564202, "num_tokens": 16517620.0, "step": 8945 }, { "entropy": 5.717817068099976, "epoch": 0.7519428691451375, "grad_norm": 1.9140625, "learning_rate": 0.0004949818336108425, "loss": 5.6743, "mean_token_accuracy": 0.1453969433903694, "num_tokens": 16526720.0, "step": 8950 }, { "entropy": 5.782077789306641, "epoch": 0.7523629489603024, "grad_norm": 1.8125, "learning_rate": 0.0004949755422809703, "loss": 5.6349, "mean_token_accuracy": 0.15297809839248658, "num_tokens": 16535979.0, "step": 8955 }, { "entropy": 5.789309072494507, "epoch": 0.7527830287754673, "grad_norm": 1.9453125, "learning_rate": 0.0004949692470543446, "loss": 5.518, "mean_token_accuracy": 0.16405045241117477, "num_tokens": 16544538.0, "step": 8960 }, { "entropy": 5.700740957260132, "epoch": 0.7532031085906322, "grad_norm": 1.640625, "learning_rate": 0.0004949629479310769, "loss": 5.6021, "mean_token_accuracy": 0.15271754264831544, "num_tokens": 16553962.0, "step": 8965 }, { "entropy": 5.7723414421081545, "epoch": 0.7536231884057971, "grad_norm": 1.5546875, "learning_rate": 0.0004949566449112788, "loss": 5.5341, "mean_token_accuracy": 0.1600716605782509, "num_tokens": 16562652.0, "step": 8970 }, { "entropy": 5.816875839233399, "epoch": 0.754043268220962, "grad_norm": 1.6796875, "learning_rate": 0.0004949503379950621, "loss": 5.6381, "mean_token_accuracy": 0.15340977758169175, "num_tokens": 16570887.0, "step": 8975 }, { "entropy": 5.825795125961304, "epoch": 0.7544633480361269, "grad_norm": 2.140625, "learning_rate": 0.0004949440271825385, "loss": 5.7669, "mean_token_accuracy": 0.15065207779407502, "num_tokens": 16581469.0, "step": 8980 }, { "entropy": 5.783386135101319, "epoch": 0.7548834278512917, "grad_norm": 1.6015625, "learning_rate": 0.0004949377124738196, "loss": 5.6376, "mean_token_accuracy": 0.15028667375445365, "num_tokens": 16590213.0, "step": 8985 }, { "entropy": 5.759113931655884, "epoch": 0.7553035076664566, "grad_norm": 1.5, "learning_rate": 0.0004949313938690174, "loss": 5.6301, "mean_token_accuracy": 0.1542770192027092, "num_tokens": 16598384.0, "step": 8990 }, { "entropy": 5.692385244369507, "epoch": 0.7557235874816215, "grad_norm": 1.953125, "learning_rate": 0.0004949250713682438, "loss": 5.6114, "mean_token_accuracy": 0.15893905013799667, "num_tokens": 16607670.0, "step": 8995 }, { "entropy": 5.830786418914795, "epoch": 0.7561436672967864, "grad_norm": 1.4375, "learning_rate": 0.0004949187449716107, "loss": 5.6932, "mean_token_accuracy": 0.15244348496198654, "num_tokens": 16617560.0, "step": 9000 }, { "epoch": 0.7561436672967864, "eval_entropy": 5.638838640603793, "eval_loss": 5.66161584854126, "eval_mean_token_accuracy": 0.1600216546673523, "eval_num_tokens": 16617560.0, "eval_runtime": 27.3107, "eval_samples_per_second": 1368.184, "eval_steps_per_second": 171.032, "step": 9000 }, { "entropy": 5.768628692626953, "epoch": 0.7565637471119513, "grad_norm": 1.4140625, "learning_rate": 0.0004949124146792304, "loss": 5.6053, "mean_token_accuracy": 0.15778433308005332, "num_tokens": 16626038.0, "step": 9005 }, { "entropy": 5.731417560577393, "epoch": 0.7569838269271162, "grad_norm": 1.6015625, "learning_rate": 0.0004949060804912149, "loss": 5.6189, "mean_token_accuracy": 0.15456314831972123, "num_tokens": 16636490.0, "step": 9010 }, { "entropy": 5.799277114868164, "epoch": 0.7574039067422811, "grad_norm": 1.5, "learning_rate": 0.0004948997424076764, "loss": 5.6171, "mean_token_accuracy": 0.15356937795877457, "num_tokens": 16645369.0, "step": 9015 }, { "entropy": 5.897463607788086, "epoch": 0.7578239865574459, "grad_norm": 1.4296875, "learning_rate": 0.0004948934004287272, "loss": 5.7045, "mean_token_accuracy": 0.15171536356210708, "num_tokens": 16654348.0, "step": 9020 }, { "entropy": 5.868229866027832, "epoch": 0.7582440663726108, "grad_norm": 1.3203125, "learning_rate": 0.0004948870545544796, "loss": 5.6922, "mean_token_accuracy": 0.14805838614702224, "num_tokens": 16664009.0, "step": 9025 }, { "entropy": 5.773172187805176, "epoch": 0.7586641461877757, "grad_norm": 1.6171875, "learning_rate": 0.000494880704785046, "loss": 5.7196, "mean_token_accuracy": 0.14663708806037903, "num_tokens": 16674079.0, "step": 9030 }, { "entropy": 5.847867155075074, "epoch": 0.7590842260029406, "grad_norm": 1.6796875, "learning_rate": 0.0004948743511205392, "loss": 5.6426, "mean_token_accuracy": 0.1503751888871193, "num_tokens": 16683687.0, "step": 9035 }, { "entropy": 5.763606691360474, "epoch": 0.7595043058181055, "grad_norm": 1.6015625, "learning_rate": 0.0004948679935610712, "loss": 5.5392, "mean_token_accuracy": 0.1664429262280464, "num_tokens": 16693311.0, "step": 9040 }, { "entropy": 5.767797994613647, "epoch": 0.7599243856332704, "grad_norm": 1.34375, "learning_rate": 0.000494861632106755, "loss": 5.5897, "mean_token_accuracy": 0.15403168946504592, "num_tokens": 16702121.0, "step": 9045 }, { "entropy": 5.778069067001343, "epoch": 0.7603444654484351, "grad_norm": 1.7265625, "learning_rate": 0.0004948552667577033, "loss": 5.6211, "mean_token_accuracy": 0.1538814291357994, "num_tokens": 16711883.0, "step": 9050 }, { "entropy": 5.786386919021607, "epoch": 0.7607645452636, "grad_norm": 2.34375, "learning_rate": 0.0004948488975140286, "loss": 5.6847, "mean_token_accuracy": 0.1501935139298439, "num_tokens": 16721449.0, "step": 9055 }, { "entropy": 5.747914838790893, "epoch": 0.7611846250787649, "grad_norm": 1.75, "learning_rate": 0.000494842524375844, "loss": 5.6196, "mean_token_accuracy": 0.15546474158763884, "num_tokens": 16730068.0, "step": 9060 }, { "entropy": 5.748115968704224, "epoch": 0.7616047048939298, "grad_norm": 1.484375, "learning_rate": 0.0004948361473432623, "loss": 5.6365, "mean_token_accuracy": 0.15265990495681764, "num_tokens": 16739970.0, "step": 9065 }, { "entropy": 5.84217677116394, "epoch": 0.7620247847090947, "grad_norm": 1.953125, "learning_rate": 0.0004948297664163964, "loss": 5.7024, "mean_token_accuracy": 0.1518349438905716, "num_tokens": 16749461.0, "step": 9070 }, { "entropy": 5.866002225875855, "epoch": 0.7624448645242596, "grad_norm": 1.71875, "learning_rate": 0.0004948233815953593, "loss": 5.7617, "mean_token_accuracy": 0.15022132098674773, "num_tokens": 16758747.0, "step": 9075 }, { "entropy": 5.733387041091919, "epoch": 0.7628649443394245, "grad_norm": 1.6015625, "learning_rate": 0.0004948169928802643, "loss": 5.4962, "mean_token_accuracy": 0.1622622489929199, "num_tokens": 16767212.0, "step": 9080 }, { "entropy": 5.798452520370484, "epoch": 0.7632850241545893, "grad_norm": 1.6875, "learning_rate": 0.0004948106002712245, "loss": 5.6462, "mean_token_accuracy": 0.15284293740987778, "num_tokens": 16776514.0, "step": 9085 }, { "entropy": 5.7934057235717775, "epoch": 0.7637051039697542, "grad_norm": 1.7890625, "learning_rate": 0.0004948042037683529, "loss": 5.6197, "mean_token_accuracy": 0.15112117901444436, "num_tokens": 16786310.0, "step": 9090 }, { "entropy": 5.78909387588501, "epoch": 0.7641251837849191, "grad_norm": 1.5, "learning_rate": 0.0004947978033717632, "loss": 5.6515, "mean_token_accuracy": 0.1501218557357788, "num_tokens": 16795551.0, "step": 9095 }, { "entropy": 5.803013610839844, "epoch": 0.764545263600084, "grad_norm": 1.8515625, "learning_rate": 0.0004947913990815684, "loss": 5.6264, "mean_token_accuracy": 0.15242374390363694, "num_tokens": 16805099.0, "step": 9100 }, { "entropy": 5.776360607147216, "epoch": 0.7649653434152489, "grad_norm": 1.375, "learning_rate": 0.0004947849908978824, "loss": 5.6647, "mean_token_accuracy": 0.15367067903280257, "num_tokens": 16813963.0, "step": 9105 }, { "entropy": 5.838750600814819, "epoch": 0.7653854232304138, "grad_norm": 1.5, "learning_rate": 0.0004947785788208182, "loss": 5.6921, "mean_token_accuracy": 0.1523756965994835, "num_tokens": 16822814.0, "step": 9110 }, { "entropy": 5.851981353759766, "epoch": 0.7658055030455787, "grad_norm": 1.46875, "learning_rate": 0.0004947721628504898, "loss": 5.7322, "mean_token_accuracy": 0.15000171959400177, "num_tokens": 16831906.0, "step": 9115 }, { "entropy": 5.700230932235717, "epoch": 0.7662255828607435, "grad_norm": 1.640625, "learning_rate": 0.0004947657429870108, "loss": 5.5446, "mean_token_accuracy": 0.1595570996403694, "num_tokens": 16840050.0, "step": 9120 }, { "entropy": 5.705719661712647, "epoch": 0.7666456626759084, "grad_norm": 1.4296875, "learning_rate": 0.0004947593192304946, "loss": 5.5713, "mean_token_accuracy": 0.15321452915668488, "num_tokens": 16848404.0, "step": 9125 }, { "entropy": 5.759864425659179, "epoch": 0.7670657424910733, "grad_norm": 1.4453125, "learning_rate": 0.0004947528915810554, "loss": 5.5722, "mean_token_accuracy": 0.1579087942838669, "num_tokens": 16856568.0, "step": 9130 }, { "entropy": 5.756204128265381, "epoch": 0.7674858223062382, "grad_norm": 1.6640625, "learning_rate": 0.0004947464600388066, "loss": 5.6034, "mean_token_accuracy": 0.15562164336442946, "num_tokens": 16864936.0, "step": 9135 }, { "entropy": 5.9225013732910154, "epoch": 0.7679059021214031, "grad_norm": 1.5234375, "learning_rate": 0.0004947400246038627, "loss": 5.7416, "mean_token_accuracy": 0.14872185736894608, "num_tokens": 16874504.0, "step": 9140 }, { "entropy": 5.684078311920166, "epoch": 0.768325981936568, "grad_norm": 1.3046875, "learning_rate": 0.0004947335852763374, "loss": 5.4846, "mean_token_accuracy": 0.15627673268318176, "num_tokens": 16883365.0, "step": 9145 }, { "entropy": 5.801791000366211, "epoch": 0.7687460617517329, "grad_norm": 1.59375, "learning_rate": 0.0004947271420563447, "loss": 5.7415, "mean_token_accuracy": 0.14208680838346482, "num_tokens": 16892701.0, "step": 9150 }, { "entropy": 5.769952487945557, "epoch": 0.7691661415668977, "grad_norm": 1.421875, "learning_rate": 0.0004947206949439989, "loss": 5.553, "mean_token_accuracy": 0.15000357180833818, "num_tokens": 16901864.0, "step": 9155 }, { "entropy": 5.736040306091309, "epoch": 0.7695862213820626, "grad_norm": 1.4609375, "learning_rate": 0.000494714243939414, "loss": 5.608, "mean_token_accuracy": 0.15939729958772658, "num_tokens": 16910908.0, "step": 9160 }, { "entropy": 5.721309995651245, "epoch": 0.7700063011972275, "grad_norm": 1.53125, "learning_rate": 0.0004947077890427045, "loss": 5.6325, "mean_token_accuracy": 0.15240202248096466, "num_tokens": 16920299.0, "step": 9165 }, { "entropy": 5.880091524124145, "epoch": 0.7704263810123924, "grad_norm": 1.4609375, "learning_rate": 0.0004947013302539846, "loss": 5.7698, "mean_token_accuracy": 0.1435159295797348, "num_tokens": 16930027.0, "step": 9170 }, { "entropy": 5.877429723739624, "epoch": 0.7708464608275573, "grad_norm": 2.84375, "learning_rate": 0.0004946948675733688, "loss": 5.6626, "mean_token_accuracy": 0.15365543216466904, "num_tokens": 16939387.0, "step": 9175 }, { "entropy": 5.743030834197998, "epoch": 0.7712665406427222, "grad_norm": 1.4921875, "learning_rate": 0.0004946884010009714, "loss": 5.6303, "mean_token_accuracy": 0.15409868359565734, "num_tokens": 16950024.0, "step": 9180 }, { "entropy": 5.677621221542358, "epoch": 0.771686620457887, "grad_norm": 1.46875, "learning_rate": 0.0004946819305369073, "loss": 5.525, "mean_token_accuracy": 0.16152018159627915, "num_tokens": 16958219.0, "step": 9185 }, { "entropy": 5.728807067871093, "epoch": 0.7721067002730518, "grad_norm": 1.40625, "learning_rate": 0.0004946754561812909, "loss": 5.5102, "mean_token_accuracy": 0.16226852238178252, "num_tokens": 16966829.0, "step": 9190 }, { "entropy": 5.733729887008667, "epoch": 0.7725267800882167, "grad_norm": 1.4453125, "learning_rate": 0.0004946689779342367, "loss": 5.6145, "mean_token_accuracy": 0.15137282758951187, "num_tokens": 16975585.0, "step": 9195 }, { "entropy": 5.743972539901733, "epoch": 0.7729468599033816, "grad_norm": 1.5078125, "learning_rate": 0.0004946624957958599, "loss": 5.5951, "mean_token_accuracy": 0.15704918652772903, "num_tokens": 16984848.0, "step": 9200 }, { "entropy": 5.737395524978638, "epoch": 0.7733669397185465, "grad_norm": 1.4765625, "learning_rate": 0.000494656009766275, "loss": 5.5839, "mean_token_accuracy": 0.16066163033246994, "num_tokens": 16993179.0, "step": 9205 }, { "entropy": 5.7287391185760494, "epoch": 0.7737870195337114, "grad_norm": 1.8828125, "learning_rate": 0.000494649519845597, "loss": 5.62, "mean_token_accuracy": 0.15363839864730836, "num_tokens": 17002563.0, "step": 9210 }, { "entropy": 5.80807056427002, "epoch": 0.7742070993488763, "grad_norm": 1.640625, "learning_rate": 0.0004946430260339409, "loss": 5.6284, "mean_token_accuracy": 0.15289961099624633, "num_tokens": 17011805.0, "step": 9215 }, { "entropy": 5.779461622238159, "epoch": 0.7746271791640411, "grad_norm": 1.6328125, "learning_rate": 0.0004946365283314216, "loss": 5.5989, "mean_token_accuracy": 0.1561885267496109, "num_tokens": 17020398.0, "step": 9220 }, { "entropy": 5.694942331314087, "epoch": 0.775047258979206, "grad_norm": 1.484375, "learning_rate": 0.0004946300267381545, "loss": 5.5753, "mean_token_accuracy": 0.15811678916215896, "num_tokens": 17030805.0, "step": 9225 }, { "entropy": 5.794308614730835, "epoch": 0.7754673387943709, "grad_norm": 1.9140625, "learning_rate": 0.0004946235212542544, "loss": 5.597, "mean_token_accuracy": 0.1565954014658928, "num_tokens": 17040164.0, "step": 9230 }, { "entropy": 5.77291522026062, "epoch": 0.7758874186095358, "grad_norm": 1.5078125, "learning_rate": 0.0004946170118798367, "loss": 5.67, "mean_token_accuracy": 0.14761753827333451, "num_tokens": 17049519.0, "step": 9235 }, { "entropy": 5.802110385894776, "epoch": 0.7763074984247007, "grad_norm": 1.4375, "learning_rate": 0.0004946104986150167, "loss": 5.5979, "mean_token_accuracy": 0.15635768324136734, "num_tokens": 17058042.0, "step": 9240 }, { "entropy": 5.77113904953003, "epoch": 0.7767275782398656, "grad_norm": 1.828125, "learning_rate": 0.0004946039814599099, "loss": 5.624, "mean_token_accuracy": 0.15740283727645873, "num_tokens": 17067107.0, "step": 9245 }, { "entropy": 5.784947872161865, "epoch": 0.7771476580550305, "grad_norm": 1.5546875, "learning_rate": 0.0004945974604146316, "loss": 5.7176, "mean_token_accuracy": 0.15673644915223123, "num_tokens": 17076975.0, "step": 9250 }, { "entropy": 5.760613203048706, "epoch": 0.7775677378701953, "grad_norm": 1.65625, "learning_rate": 0.0004945909354792974, "loss": 5.5674, "mean_token_accuracy": 0.15634535551071166, "num_tokens": 17086405.0, "step": 9255 }, { "entropy": 5.718491649627685, "epoch": 0.7779878176853602, "grad_norm": 1.6640625, "learning_rate": 0.0004945844066540229, "loss": 5.6449, "mean_token_accuracy": 0.1455477386713028, "num_tokens": 17095333.0, "step": 9260 }, { "entropy": 5.7345335483551025, "epoch": 0.7784078975005251, "grad_norm": 1.5078125, "learning_rate": 0.0004945778739389236, "loss": 5.684, "mean_token_accuracy": 0.150144724547863, "num_tokens": 17103631.0, "step": 9265 }, { "entropy": 5.794864368438721, "epoch": 0.77882797731569, "grad_norm": 1.8203125, "learning_rate": 0.0004945713373341152, "loss": 5.5715, "mean_token_accuracy": 0.15383470058441162, "num_tokens": 17112612.0, "step": 9270 }, { "entropy": 5.823299360275269, "epoch": 0.7792480571308549, "grad_norm": 3.109375, "learning_rate": 0.0004945647968397139, "loss": 5.6242, "mean_token_accuracy": 0.15435410290956497, "num_tokens": 17121592.0, "step": 9275 }, { "entropy": 5.742037677764893, "epoch": 0.7796681369460198, "grad_norm": 1.75, "learning_rate": 0.0004945582524558352, "loss": 5.6497, "mean_token_accuracy": 0.15522131621837615, "num_tokens": 17131003.0, "step": 9280 }, { "entropy": 5.8117687702178955, "epoch": 0.7800882167611847, "grad_norm": 1.546875, "learning_rate": 0.000494551704182595, "loss": 5.6434, "mean_token_accuracy": 0.1501818783581257, "num_tokens": 17140013.0, "step": 9285 }, { "entropy": 5.904456377029419, "epoch": 0.7805082965763495, "grad_norm": 1.53125, "learning_rate": 0.0004945451520201095, "loss": 5.7995, "mean_token_accuracy": 0.1440419152379036, "num_tokens": 17150406.0, "step": 9290 }, { "entropy": 5.804939079284668, "epoch": 0.7809283763915144, "grad_norm": 1.578125, "learning_rate": 0.0004945385959684947, "loss": 5.643, "mean_token_accuracy": 0.15583974719047547, "num_tokens": 17159757.0, "step": 9295 }, { "entropy": 5.787489128112793, "epoch": 0.7813484562066793, "grad_norm": 1.859375, "learning_rate": 0.0004945320360278667, "loss": 5.6665, "mean_token_accuracy": 0.15916707813739778, "num_tokens": 17169317.0, "step": 9300 }, { "entropy": 5.814616775512695, "epoch": 0.7817685360218442, "grad_norm": 2.203125, "learning_rate": 0.0004945254721983416, "loss": 5.6676, "mean_token_accuracy": 0.1608291007578373, "num_tokens": 17178410.0, "step": 9305 }, { "entropy": 5.825447988510132, "epoch": 0.782188615837009, "grad_norm": 1.9296875, "learning_rate": 0.000494518904480036, "loss": 5.5898, "mean_token_accuracy": 0.15593952387571336, "num_tokens": 17186922.0, "step": 9310 }, { "entropy": 5.802917385101319, "epoch": 0.782608695652174, "grad_norm": 1.734375, "learning_rate": 0.0004945123328730659, "loss": 5.6666, "mean_token_accuracy": 0.1478397913277149, "num_tokens": 17197125.0, "step": 9315 }, { "entropy": 5.739556694030762, "epoch": 0.7830287754673388, "grad_norm": 2.09375, "learning_rate": 0.000494505757377548, "loss": 5.597, "mean_token_accuracy": 0.15432032942771912, "num_tokens": 17206169.0, "step": 9320 }, { "entropy": 5.679258155822754, "epoch": 0.7834488552825036, "grad_norm": 1.8125, "learning_rate": 0.0004944991779935985, "loss": 5.538, "mean_token_accuracy": 0.15561339557170867, "num_tokens": 17214607.0, "step": 9325 }, { "entropy": 5.689110612869262, "epoch": 0.7838689350976685, "grad_norm": 1.7890625, "learning_rate": 0.000494492594721334, "loss": 5.5188, "mean_token_accuracy": 0.15666710287332536, "num_tokens": 17223616.0, "step": 9330 }, { "entropy": 5.764066362380982, "epoch": 0.7842890149128334, "grad_norm": 1.4765625, "learning_rate": 0.0004944860075608715, "loss": 5.607, "mean_token_accuracy": 0.15148743987083435, "num_tokens": 17232729.0, "step": 9335 }, { "entropy": 5.747860622406006, "epoch": 0.7847090947279983, "grad_norm": 1.84375, "learning_rate": 0.0004944794165123272, "loss": 5.6633, "mean_token_accuracy": 0.1552363008260727, "num_tokens": 17242128.0, "step": 9340 }, { "entropy": 5.7937798500061035, "epoch": 0.7851291745431632, "grad_norm": 1.8984375, "learning_rate": 0.000494472821575818, "loss": 5.572, "mean_token_accuracy": 0.15619071274995805, "num_tokens": 17250806.0, "step": 9345 }, { "entropy": 5.884761095046997, "epoch": 0.7855492543583281, "grad_norm": 1.5234375, "learning_rate": 0.0004944662227514609, "loss": 5.796, "mean_token_accuracy": 0.14290329068899155, "num_tokens": 17260888.0, "step": 9350 }, { "entropy": 5.765118503570557, "epoch": 0.785969334173493, "grad_norm": 1.921875, "learning_rate": 0.0004944596200393726, "loss": 5.5632, "mean_token_accuracy": 0.1571262151002884, "num_tokens": 17270387.0, "step": 9355 }, { "entropy": 5.790839576721192, "epoch": 0.7863894139886578, "grad_norm": 1.6875, "learning_rate": 0.0004944530134396702, "loss": 5.5971, "mean_token_accuracy": 0.1504202328622341, "num_tokens": 17279866.0, "step": 9360 }, { "entropy": 5.764979267120362, "epoch": 0.7868094938038227, "grad_norm": 1.578125, "learning_rate": 0.0004944464029524707, "loss": 5.5927, "mean_token_accuracy": 0.15793014466762542, "num_tokens": 17289233.0, "step": 9365 }, { "entropy": 5.78815860748291, "epoch": 0.7872295736189876, "grad_norm": 1.59375, "learning_rate": 0.000494439788577891, "loss": 5.6811, "mean_token_accuracy": 0.15233502089977263, "num_tokens": 17298705.0, "step": 9370 }, { "entropy": 5.803197431564331, "epoch": 0.7876496534341525, "grad_norm": 1.8671875, "learning_rate": 0.0004944331703160486, "loss": 5.6262, "mean_token_accuracy": 0.1556847333908081, "num_tokens": 17307793.0, "step": 9375 }, { "entropy": 5.768749332427978, "epoch": 0.7880697332493174, "grad_norm": 1.6953125, "learning_rate": 0.0004944265481670605, "loss": 5.7109, "mean_token_accuracy": 0.14565183371305465, "num_tokens": 17318248.0, "step": 9380 }, { "entropy": 5.781773900985717, "epoch": 0.7884898130644823, "grad_norm": 2.328125, "learning_rate": 0.0004944199221310441, "loss": 5.6174, "mean_token_accuracy": 0.15221924781799318, "num_tokens": 17327281.0, "step": 9385 }, { "entropy": 5.823486852645874, "epoch": 0.7889098928796471, "grad_norm": 1.640625, "learning_rate": 0.0004944132922081168, "loss": 5.6269, "mean_token_accuracy": 0.15858044922351838, "num_tokens": 17336805.0, "step": 9390 }, { "entropy": 5.736378765106201, "epoch": 0.789329972694812, "grad_norm": 1.5234375, "learning_rate": 0.0004944066583983961, "loss": 5.5747, "mean_token_accuracy": 0.15340599566698074, "num_tokens": 17346024.0, "step": 9395 }, { "entropy": 5.739033269882202, "epoch": 0.7897500525099769, "grad_norm": 1.6015625, "learning_rate": 0.0004944000207019992, "loss": 5.6743, "mean_token_accuracy": 0.15382137894630432, "num_tokens": 17355100.0, "step": 9400 }, { "entropy": 5.865094900131226, "epoch": 0.7901701323251418, "grad_norm": 1.7265625, "learning_rate": 0.0004943933791190441, "loss": 5.7171, "mean_token_accuracy": 0.14582199305295945, "num_tokens": 17364769.0, "step": 9405 }, { "entropy": 5.805460023880005, "epoch": 0.7905902121403067, "grad_norm": 1.7109375, "learning_rate": 0.0004943867336496482, "loss": 5.5593, "mean_token_accuracy": 0.156871497631073, "num_tokens": 17374082.0, "step": 9410 }, { "entropy": 5.704965591430664, "epoch": 0.7910102919554716, "grad_norm": 1.640625, "learning_rate": 0.0004943800842939293, "loss": 5.6061, "mean_token_accuracy": 0.1573358103632927, "num_tokens": 17383570.0, "step": 9415 }, { "entropy": 5.762260246276855, "epoch": 0.7914303717706365, "grad_norm": 1.8671875, "learning_rate": 0.000494373431052005, "loss": 5.6136, "mean_token_accuracy": 0.15585907325148582, "num_tokens": 17392105.0, "step": 9420 }, { "entropy": 5.754047203063965, "epoch": 0.7918504515858013, "grad_norm": 1.5859375, "learning_rate": 0.0004943667739239935, "loss": 5.5694, "mean_token_accuracy": 0.1567780628800392, "num_tokens": 17401363.0, "step": 9425 }, { "entropy": 5.831571578979492, "epoch": 0.7922705314009661, "grad_norm": 1.6171875, "learning_rate": 0.0004943601129100125, "loss": 5.5907, "mean_token_accuracy": 0.15667269229888917, "num_tokens": 17411333.0, "step": 9430 }, { "entropy": 5.808466386795044, "epoch": 0.792690611216131, "grad_norm": 1.7734375, "learning_rate": 0.0004943534480101801, "loss": 5.6449, "mean_token_accuracy": 0.1564931645989418, "num_tokens": 17421162.0, "step": 9435 }, { "entropy": 5.764466953277588, "epoch": 0.793110691031296, "grad_norm": 2.625, "learning_rate": 0.0004943467792246142, "loss": 5.5917, "mean_token_accuracy": 0.1545848786830902, "num_tokens": 17430119.0, "step": 9440 }, { "entropy": 5.799530792236328, "epoch": 0.7935307708464608, "grad_norm": 1.484375, "learning_rate": 0.0004943401065534332, "loss": 5.6028, "mean_token_accuracy": 0.1538163974881172, "num_tokens": 17439617.0, "step": 9445 }, { "entropy": 5.715310573577881, "epoch": 0.7939508506616257, "grad_norm": 2.0, "learning_rate": 0.0004943334299967551, "loss": 5.7132, "mean_token_accuracy": 0.14998757019639014, "num_tokens": 17448720.0, "step": 9450 }, { "entropy": 5.697250175476074, "epoch": 0.7943709304767906, "grad_norm": 1.84375, "learning_rate": 0.0004943267495546982, "loss": 5.5917, "mean_token_accuracy": 0.16172372549772263, "num_tokens": 17457458.0, "step": 9455 }, { "entropy": 5.81586275100708, "epoch": 0.7947910102919554, "grad_norm": 1.6796875, "learning_rate": 0.0004943200652273809, "loss": 5.6191, "mean_token_accuracy": 0.15560947209596634, "num_tokens": 17467095.0, "step": 9460 }, { "entropy": 5.75570330619812, "epoch": 0.7952110901071203, "grad_norm": 1.7734375, "learning_rate": 0.0004943133770149216, "loss": 5.657, "mean_token_accuracy": 0.14877953082323075, "num_tokens": 17476247.0, "step": 9465 }, { "entropy": 5.799701309204101, "epoch": 0.7956311699222852, "grad_norm": 1.5859375, "learning_rate": 0.0004943066849174386, "loss": 5.6635, "mean_token_accuracy": 0.1575782373547554, "num_tokens": 17486352.0, "step": 9470 }, { "entropy": 5.821471929550171, "epoch": 0.7960512497374501, "grad_norm": 1.65625, "learning_rate": 0.0004942999889350508, "loss": 5.6216, "mean_token_accuracy": 0.15541253834962845, "num_tokens": 17495633.0, "step": 9475 }, { "entropy": 5.826534175872803, "epoch": 0.796471329552615, "grad_norm": 1.8359375, "learning_rate": 0.0004942932890678765, "loss": 5.6665, "mean_token_accuracy": 0.14694230481982232, "num_tokens": 17504325.0, "step": 9480 }, { "entropy": 5.7822521209716795, "epoch": 0.7968914093677799, "grad_norm": 1.8046875, "learning_rate": 0.0004942865853160346, "loss": 5.6862, "mean_token_accuracy": 0.1536302775144577, "num_tokens": 17513265.0, "step": 9485 }, { "entropy": 5.799659156799317, "epoch": 0.7973114891829448, "grad_norm": 2.03125, "learning_rate": 0.0004942798776796436, "loss": 5.6811, "mean_token_accuracy": 0.1501378260552883, "num_tokens": 17522939.0, "step": 9490 }, { "entropy": 5.848496150970459, "epoch": 0.7977315689981096, "grad_norm": 1.890625, "learning_rate": 0.0004942731661588226, "loss": 5.699, "mean_token_accuracy": 0.1455768197774887, "num_tokens": 17532250.0, "step": 9495 }, { "entropy": 5.859736204147339, "epoch": 0.7981516488132745, "grad_norm": 1.8984375, "learning_rate": 0.0004942664507536904, "loss": 5.7145, "mean_token_accuracy": 0.1528845690190792, "num_tokens": 17541368.0, "step": 9500 }, { "entropy": 5.775320148468017, "epoch": 0.7985717286284394, "grad_norm": 2.109375, "learning_rate": 0.0004942597314643659, "loss": 5.6473, "mean_token_accuracy": 0.15444121211767198, "num_tokens": 17550871.0, "step": 9505 }, { "entropy": 5.797231960296631, "epoch": 0.7989918084436043, "grad_norm": 1.609375, "learning_rate": 0.0004942530082909681, "loss": 5.5808, "mean_token_accuracy": 0.16195199489593506, "num_tokens": 17559683.0, "step": 9510 }, { "entropy": 5.815419673919678, "epoch": 0.7994118882587692, "grad_norm": 2.578125, "learning_rate": 0.0004942462812336163, "loss": 5.5933, "mean_token_accuracy": 0.1550535589456558, "num_tokens": 17568877.0, "step": 9515 }, { "entropy": 5.879213762283325, "epoch": 0.7998319680739341, "grad_norm": 2.09375, "learning_rate": 0.0004942395502924293, "loss": 5.7466, "mean_token_accuracy": 0.14571947157382964, "num_tokens": 17578202.0, "step": 9520 }, { "entropy": 5.797115516662598, "epoch": 0.800252047889099, "grad_norm": 1.859375, "learning_rate": 0.0004942328154675268, "loss": 5.577, "mean_token_accuracy": 0.15988959819078447, "num_tokens": 17587342.0, "step": 9525 }, { "entropy": 5.750249338150025, "epoch": 0.8006721277042638, "grad_norm": 1.8984375, "learning_rate": 0.0004942260767590277, "loss": 5.4334, "mean_token_accuracy": 0.16428305059671403, "num_tokens": 17595671.0, "step": 9530 }, { "entropy": 5.749629020690918, "epoch": 0.8010922075194287, "grad_norm": 2.3125, "learning_rate": 0.0004942193341670516, "loss": 5.7607, "mean_token_accuracy": 0.1483020693063736, "num_tokens": 17605649.0, "step": 9535 }, { "entropy": 5.763780164718628, "epoch": 0.8015122873345936, "grad_norm": 3.171875, "learning_rate": 0.0004942125876917178, "loss": 5.6478, "mean_token_accuracy": 0.1507388584315777, "num_tokens": 17615286.0, "step": 9540 }, { "entropy": 5.7505041599273685, "epoch": 0.8019323671497585, "grad_norm": 2.046875, "learning_rate": 0.000494205837333146, "loss": 5.636, "mean_token_accuracy": 0.1552906632423401, "num_tokens": 17624583.0, "step": 9545 }, { "entropy": 5.812001085281372, "epoch": 0.8023524469649234, "grad_norm": 1.84375, "learning_rate": 0.0004941990830914557, "loss": 5.6149, "mean_token_accuracy": 0.15935958474874495, "num_tokens": 17633894.0, "step": 9550 }, { "entropy": 5.8303131580352785, "epoch": 0.8027725267800883, "grad_norm": 1.7890625, "learning_rate": 0.0004941923249667663, "loss": 5.71, "mean_token_accuracy": 0.149199178814888, "num_tokens": 17643172.0, "step": 9555 }, { "entropy": 5.764499855041504, "epoch": 0.803192606595253, "grad_norm": 1.828125, "learning_rate": 0.0004941855629591979, "loss": 5.5945, "mean_token_accuracy": 0.15305460765957832, "num_tokens": 17651901.0, "step": 9560 }, { "entropy": 5.755572938919068, "epoch": 0.8036126864104179, "grad_norm": 2.03125, "learning_rate": 0.0004941787970688701, "loss": 5.5957, "mean_token_accuracy": 0.15799273997545243, "num_tokens": 17660806.0, "step": 9565 }, { "entropy": 5.837345361709595, "epoch": 0.8040327662255828, "grad_norm": 3.4375, "learning_rate": 0.0004941720272959027, "loss": 5.6559, "mean_token_accuracy": 0.16126096546649932, "num_tokens": 17669157.0, "step": 9570 }, { "entropy": 5.719307231903076, "epoch": 0.8044528460407477, "grad_norm": 1.9375, "learning_rate": 0.0004941652536404157, "loss": 5.5632, "mean_token_accuracy": 0.15555428415536882, "num_tokens": 17678664.0, "step": 9575 }, { "entropy": 5.79267258644104, "epoch": 0.8048729258559126, "grad_norm": 4.625, "learning_rate": 0.0004941584761025291, "loss": 5.6044, "mean_token_accuracy": 0.15480156391859054, "num_tokens": 17688252.0, "step": 9580 }, { "entropy": 5.7286498069763185, "epoch": 0.8052930056710775, "grad_norm": 2.171875, "learning_rate": 0.000494151694682363, "loss": 5.6334, "mean_token_accuracy": 0.1567763715982437, "num_tokens": 17696473.0, "step": 9585 }, { "entropy": 5.752206754684448, "epoch": 0.8057130854862424, "grad_norm": 1.4921875, "learning_rate": 0.0004941449093800374, "loss": 5.6529, "mean_token_accuracy": 0.15852190256118776, "num_tokens": 17706177.0, "step": 9590 }, { "entropy": 5.7514872550964355, "epoch": 0.8061331653014072, "grad_norm": 1.6640625, "learning_rate": 0.0004941381201956726, "loss": 5.5015, "mean_token_accuracy": 0.16315654218196868, "num_tokens": 17715355.0, "step": 9595 }, { "entropy": 5.747640895843506, "epoch": 0.8065532451165721, "grad_norm": 1.90625, "learning_rate": 0.0004941313271293889, "loss": 5.6008, "mean_token_accuracy": 0.1622050292789936, "num_tokens": 17724345.0, "step": 9600 }, { "entropy": 5.7168864727020265, "epoch": 0.806973324931737, "grad_norm": 1.859375, "learning_rate": 0.0004941245301813065, "loss": 5.5143, "mean_token_accuracy": 0.16414132565259934, "num_tokens": 17732805.0, "step": 9605 }, { "entropy": 5.737041282653808, "epoch": 0.8073934047469019, "grad_norm": 1.75, "learning_rate": 0.0004941177293515459, "loss": 5.5799, "mean_token_accuracy": 0.157880100607872, "num_tokens": 17741963.0, "step": 9610 }, { "entropy": 5.705282735824585, "epoch": 0.8078134845620668, "grad_norm": 2.1875, "learning_rate": 0.0004941109246402275, "loss": 5.5938, "mean_token_accuracy": 0.151243394613266, "num_tokens": 17751858.0, "step": 9615 }, { "entropy": 5.866819715499878, "epoch": 0.8082335643772317, "grad_norm": 2.65625, "learning_rate": 0.0004941041160474721, "loss": 5.7059, "mean_token_accuracy": 0.1499703124165535, "num_tokens": 17761152.0, "step": 9620 }, { "entropy": 5.859082937240601, "epoch": 0.8086536441923966, "grad_norm": 1.921875, "learning_rate": 0.0004940973035733999, "loss": 5.6428, "mean_token_accuracy": 0.15314959064126016, "num_tokens": 17770493.0, "step": 9625 }, { "entropy": 5.887163877487183, "epoch": 0.8090737240075614, "grad_norm": 1.5390625, "learning_rate": 0.0004940904872181318, "loss": 5.6534, "mean_token_accuracy": 0.15107578188180923, "num_tokens": 17779871.0, "step": 9630 }, { "entropy": 5.8648134708404545, "epoch": 0.8094938038227263, "grad_norm": 1.4921875, "learning_rate": 0.0004940836669817887, "loss": 5.6633, "mean_token_accuracy": 0.1496044009923935, "num_tokens": 17788606.0, "step": 9635 }, { "entropy": 5.72215781211853, "epoch": 0.8099138836378912, "grad_norm": 4.96875, "learning_rate": 0.0004940768428644911, "loss": 5.5938, "mean_token_accuracy": 0.1555838018655777, "num_tokens": 17797458.0, "step": 9640 }, { "entropy": 5.666493082046509, "epoch": 0.8103339634530561, "grad_norm": 1.71875, "learning_rate": 0.0004940700148663601, "loss": 5.5519, "mean_token_accuracy": 0.1551619812846184, "num_tokens": 17806902.0, "step": 9645 }, { "entropy": 5.774869537353515, "epoch": 0.810754043268221, "grad_norm": 1.71875, "learning_rate": 0.0004940631829875165, "loss": 5.6878, "mean_token_accuracy": 0.1476306848227978, "num_tokens": 17816374.0, "step": 9650 }, { "entropy": 5.793194580078125, "epoch": 0.8111741230833859, "grad_norm": 1.796875, "learning_rate": 0.0004940563472280815, "loss": 5.6585, "mean_token_accuracy": 0.15614343285560608, "num_tokens": 17825267.0, "step": 9655 }, { "entropy": 5.768211507797242, "epoch": 0.8115942028985508, "grad_norm": 2.078125, "learning_rate": 0.0004940495075881761, "loss": 5.5722, "mean_token_accuracy": 0.15710717141628266, "num_tokens": 17834027.0, "step": 9660 }, { "entropy": 5.728369903564453, "epoch": 0.8120142827137156, "grad_norm": 2.3125, "learning_rate": 0.0004940426640679214, "loss": 5.5753, "mean_token_accuracy": 0.15249805226922036, "num_tokens": 17843587.0, "step": 9665 }, { "entropy": 5.8064950466156, "epoch": 0.8124343625288805, "grad_norm": 1.8359375, "learning_rate": 0.0004940358166674388, "loss": 5.6147, "mean_token_accuracy": 0.15565043687820435, "num_tokens": 17852284.0, "step": 9670 }, { "entropy": 5.845684242248535, "epoch": 0.8128544423440454, "grad_norm": 1.6484375, "learning_rate": 0.0004940289653868494, "loss": 5.6262, "mean_token_accuracy": 0.15545963644981384, "num_tokens": 17860896.0, "step": 9675 }, { "entropy": 5.703367519378662, "epoch": 0.8132745221592103, "grad_norm": 2.0625, "learning_rate": 0.0004940221102262747, "loss": 5.5942, "mean_token_accuracy": 0.15159963369369506, "num_tokens": 17870796.0, "step": 9680 }, { "entropy": 5.789257049560547, "epoch": 0.8136946019743752, "grad_norm": 2.859375, "learning_rate": 0.0004940152511858361, "loss": 5.6788, "mean_token_accuracy": 0.14908051788806914, "num_tokens": 17880016.0, "step": 9685 }, { "entropy": 5.866326093673706, "epoch": 0.81411468178954, "grad_norm": 1.609375, "learning_rate": 0.0004940083882656551, "loss": 5.7101, "mean_token_accuracy": 0.14765079468488693, "num_tokens": 17889348.0, "step": 9690 }, { "entropy": 5.818946790695191, "epoch": 0.814534761604705, "grad_norm": 1.6015625, "learning_rate": 0.0004940015214658532, "loss": 5.5647, "mean_token_accuracy": 0.16243199706077577, "num_tokens": 17898392.0, "step": 9695 }, { "entropy": 5.796739816665649, "epoch": 0.8149548414198697, "grad_norm": 2.21875, "learning_rate": 0.0004939946507865522, "loss": 5.6743, "mean_token_accuracy": 0.1524437814950943, "num_tokens": 17907141.0, "step": 9700 }, { "entropy": 5.688076829910278, "epoch": 0.8153749212350346, "grad_norm": 1.7421875, "learning_rate": 0.0004939877762278737, "loss": 5.5548, "mean_token_accuracy": 0.15888291895389556, "num_tokens": 17915792.0, "step": 9705 }, { "entropy": 5.819617366790771, "epoch": 0.8157950010501995, "grad_norm": 1.6875, "learning_rate": 0.0004939808977899396, "loss": 5.7061, "mean_token_accuracy": 0.14910464882850646, "num_tokens": 17925603.0, "step": 9710 }, { "entropy": 5.840267324447632, "epoch": 0.8162150808653644, "grad_norm": 1.78125, "learning_rate": 0.0004939740154728716, "loss": 5.6424, "mean_token_accuracy": 0.15840867161750793, "num_tokens": 17934436.0, "step": 9715 }, { "entropy": 5.819521951675415, "epoch": 0.8166351606805293, "grad_norm": 1.703125, "learning_rate": 0.0004939671292767915, "loss": 5.595, "mean_token_accuracy": 0.16303292959928511, "num_tokens": 17942969.0, "step": 9720 }, { "entropy": 5.824506616592407, "epoch": 0.8170552404956942, "grad_norm": 1.4765625, "learning_rate": 0.0004939602392018216, "loss": 5.6782, "mean_token_accuracy": 0.15368429720401763, "num_tokens": 17952053.0, "step": 9725 }, { "entropy": 5.7638860702514645, "epoch": 0.817475320310859, "grad_norm": 1.5078125, "learning_rate": 0.0004939533452480839, "loss": 5.6463, "mean_token_accuracy": 0.15662853494286538, "num_tokens": 17960707.0, "step": 9730 }, { "entropy": 5.850724697113037, "epoch": 0.8178954001260239, "grad_norm": 2.21875, "learning_rate": 0.0004939464474157003, "loss": 5.7485, "mean_token_accuracy": 0.143310609459877, "num_tokens": 17971035.0, "step": 9735 }, { "entropy": 5.811854696273803, "epoch": 0.8183154799411888, "grad_norm": 1.7265625, "learning_rate": 0.0004939395457047932, "loss": 5.6147, "mean_token_accuracy": 0.1503463938832283, "num_tokens": 17980656.0, "step": 9740 }, { "entropy": 5.849935054779053, "epoch": 0.8187355597563537, "grad_norm": 1.765625, "learning_rate": 0.0004939326401154847, "loss": 5.6425, "mean_token_accuracy": 0.14753958508372306, "num_tokens": 17990977.0, "step": 9745 }, { "entropy": 5.72039909362793, "epoch": 0.8191556395715186, "grad_norm": 1.921875, "learning_rate": 0.0004939257306478973, "loss": 5.6386, "mean_token_accuracy": 0.15436331778764725, "num_tokens": 18000186.0, "step": 9750 }, { "entropy": 5.743032836914063, "epoch": 0.8195757193866835, "grad_norm": 1.71875, "learning_rate": 0.0004939188173021532, "loss": 5.6294, "mean_token_accuracy": 0.15535037443041802, "num_tokens": 18010269.0, "step": 9755 }, { "entropy": 5.837440872192383, "epoch": 0.8199957992018484, "grad_norm": 1.71875, "learning_rate": 0.0004939119000783751, "loss": 5.5548, "mean_token_accuracy": 0.1628822222352028, "num_tokens": 18018461.0, "step": 9760 }, { "entropy": 5.726272964477539, "epoch": 0.8204158790170132, "grad_norm": 1.625, "learning_rate": 0.0004939049789766855, "loss": 5.5727, "mean_token_accuracy": 0.1559150367975235, "num_tokens": 18027173.0, "step": 9765 }, { "entropy": 5.681005191802979, "epoch": 0.8208359588321781, "grad_norm": 1.953125, "learning_rate": 0.0004938980539972068, "loss": 5.6704, "mean_token_accuracy": 0.15305837988853455, "num_tokens": 18036791.0, "step": 9770 }, { "entropy": 5.741180467605591, "epoch": 0.821256038647343, "grad_norm": 1.453125, "learning_rate": 0.0004938911251400617, "loss": 5.6164, "mean_token_accuracy": 0.15979565382003785, "num_tokens": 18046908.0, "step": 9775 }, { "entropy": 5.690343570709229, "epoch": 0.8216761184625079, "grad_norm": 1.546875, "learning_rate": 0.0004938841924053731, "loss": 5.5305, "mean_token_accuracy": 0.166619610786438, "num_tokens": 18055825.0, "step": 9780 }, { "entropy": 5.83678789138794, "epoch": 0.8220961982776728, "grad_norm": 1.5078125, "learning_rate": 0.0004938772557932637, "loss": 5.7218, "mean_token_accuracy": 0.1443665809929371, "num_tokens": 18065334.0, "step": 9785 }, { "entropy": 5.830995225906372, "epoch": 0.8225162780928377, "grad_norm": 1.71875, "learning_rate": 0.0004938703153038565, "loss": 5.585, "mean_token_accuracy": 0.15924161821603774, "num_tokens": 18073999.0, "step": 9790 }, { "entropy": 5.664001035690307, "epoch": 0.8229363579080026, "grad_norm": 1.5703125, "learning_rate": 0.0004938633709372744, "loss": 5.6106, "mean_token_accuracy": 0.15344761908054352, "num_tokens": 18083665.0, "step": 9795 }, { "entropy": 5.735060787200927, "epoch": 0.8233564377231674, "grad_norm": 1.7890625, "learning_rate": 0.0004938564226936403, "loss": 5.6081, "mean_token_accuracy": 0.15541263967752456, "num_tokens": 18092501.0, "step": 9800 }, { "entropy": 5.726347208023071, "epoch": 0.8237765175383323, "grad_norm": 1.6171875, "learning_rate": 0.0004938494705730773, "loss": 5.5879, "mean_token_accuracy": 0.15256380438804626, "num_tokens": 18101320.0, "step": 9805 }, { "entropy": 5.76941032409668, "epoch": 0.8241965973534972, "grad_norm": 1.578125, "learning_rate": 0.0004938425145757087, "loss": 5.6155, "mean_token_accuracy": 0.15062929540872574, "num_tokens": 18110190.0, "step": 9810 }, { "entropy": 5.77293291091919, "epoch": 0.824616677168662, "grad_norm": 1.40625, "learning_rate": 0.0004938355547016577, "loss": 5.6121, "mean_token_accuracy": 0.15612404122948648, "num_tokens": 18119301.0, "step": 9815 }, { "entropy": 5.836658191680908, "epoch": 0.825036756983827, "grad_norm": 1.6796875, "learning_rate": 0.0004938285909510474, "loss": 5.6581, "mean_token_accuracy": 0.15127312690019606, "num_tokens": 18128959.0, "step": 9820 }, { "entropy": 5.728819894790649, "epoch": 0.8254568367989918, "grad_norm": 1.859375, "learning_rate": 0.0004938216233240014, "loss": 5.6313, "mean_token_accuracy": 0.15713003724813462, "num_tokens": 18138156.0, "step": 9825 }, { "entropy": 5.834373140335083, "epoch": 0.8258769166141567, "grad_norm": 1.578125, "learning_rate": 0.000493814651820643, "loss": 5.6643, "mean_token_accuracy": 0.14810227751731872, "num_tokens": 18147244.0, "step": 9830 }, { "entropy": 5.870449686050415, "epoch": 0.8262969964293215, "grad_norm": 1.65625, "learning_rate": 0.0004938076764410956, "loss": 5.6655, "mean_token_accuracy": 0.15398952662944793, "num_tokens": 18156040.0, "step": 9835 }, { "entropy": 5.834972286224366, "epoch": 0.8267170762444864, "grad_norm": 1.8359375, "learning_rate": 0.000493800697185483, "loss": 5.5916, "mean_token_accuracy": 0.14924859553575515, "num_tokens": 18165210.0, "step": 9840 }, { "entropy": 5.787695646286011, "epoch": 0.8271371560596513, "grad_norm": 1.5390625, "learning_rate": 0.0004937937140539288, "loss": 5.6591, "mean_token_accuracy": 0.15217285007238388, "num_tokens": 18174841.0, "step": 9845 }, { "entropy": 5.705031299591065, "epoch": 0.8275572358748162, "grad_norm": 1.9296875, "learning_rate": 0.0004937867270465564, "loss": 5.5282, "mean_token_accuracy": 0.1546058475971222, "num_tokens": 18184112.0, "step": 9850 }, { "entropy": 5.810121345520019, "epoch": 0.8279773156899811, "grad_norm": 1.8046875, "learning_rate": 0.0004937797361634899, "loss": 5.7327, "mean_token_accuracy": 0.15010684877634048, "num_tokens": 18193564.0, "step": 9855 }, { "entropy": 5.696271514892578, "epoch": 0.828397395505146, "grad_norm": 1.5859375, "learning_rate": 0.000493772741404853, "loss": 5.4681, "mean_token_accuracy": 0.1613880753517151, "num_tokens": 18202836.0, "step": 9860 }, { "entropy": 5.766516923904419, "epoch": 0.8288174753203108, "grad_norm": 1.453125, "learning_rate": 0.0004937657427707698, "loss": 5.5918, "mean_token_accuracy": 0.16525972336530687, "num_tokens": 18212098.0, "step": 9865 }, { "entropy": 5.790452575683593, "epoch": 0.8292375551354757, "grad_norm": 1.8203125, "learning_rate": 0.0004937587402613639, "loss": 5.6181, "mean_token_accuracy": 0.15452115386724471, "num_tokens": 18221541.0, "step": 9870 }, { "entropy": 5.693503141403198, "epoch": 0.8296576349506406, "grad_norm": 1.828125, "learning_rate": 0.0004937517338767597, "loss": 5.6181, "mean_token_accuracy": 0.14959986433386802, "num_tokens": 18231015.0, "step": 9875 }, { "entropy": 5.776920127868652, "epoch": 0.8300777147658055, "grad_norm": 1.828125, "learning_rate": 0.0004937447236170811, "loss": 5.6442, "mean_token_accuracy": 0.15097325891256333, "num_tokens": 18239729.0, "step": 9880 }, { "entropy": 5.846532917022705, "epoch": 0.8304977945809704, "grad_norm": 1.6953125, "learning_rate": 0.0004937377094824523, "loss": 5.6934, "mean_token_accuracy": 0.14850014224648475, "num_tokens": 18249773.0, "step": 9885 }, { "entropy": 5.829236078262329, "epoch": 0.8309178743961353, "grad_norm": 1.5859375, "learning_rate": 0.0004937306914729977, "loss": 5.6466, "mean_token_accuracy": 0.14962287619709969, "num_tokens": 18259179.0, "step": 9890 }, { "entropy": 5.640655469894409, "epoch": 0.8313379542113002, "grad_norm": 1.625, "learning_rate": 0.0004937236695888416, "loss": 5.5285, "mean_token_accuracy": 0.16359366923570634, "num_tokens": 18268164.0, "step": 9895 }, { "entropy": 5.7921144485473635, "epoch": 0.831758034026465, "grad_norm": 1.765625, "learning_rate": 0.0004937166438301082, "loss": 5.7047, "mean_token_accuracy": 0.15264711230993272, "num_tokens": 18276259.0, "step": 9900 }, { "entropy": 5.795594167709351, "epoch": 0.8321781138416299, "grad_norm": 1.953125, "learning_rate": 0.0004937096141969221, "loss": 5.6749, "mean_token_accuracy": 0.15689299032092094, "num_tokens": 18285729.0, "step": 9905 }, { "entropy": 5.905335474014282, "epoch": 0.8325981936567948, "grad_norm": 1.65625, "learning_rate": 0.0004937025806894077, "loss": 5.8351, "mean_token_accuracy": 0.139414294809103, "num_tokens": 18295873.0, "step": 9910 }, { "entropy": 5.881864213943482, "epoch": 0.8330182734719597, "grad_norm": 1.9453125, "learning_rate": 0.0004936955433076899, "loss": 5.6606, "mean_token_accuracy": 0.15778864026069642, "num_tokens": 18305135.0, "step": 9915 }, { "entropy": 5.854172706604004, "epoch": 0.8334383532871246, "grad_norm": 2.015625, "learning_rate": 0.000493688502051893, "loss": 5.7077, "mean_token_accuracy": 0.15348225384950637, "num_tokens": 18314251.0, "step": 9920 }, { "entropy": 5.7477837085723875, "epoch": 0.8338584331022895, "grad_norm": 1.703125, "learning_rate": 0.0004936814569221421, "loss": 5.5373, "mean_token_accuracy": 0.16807708740234376, "num_tokens": 18322863.0, "step": 9925 }, { "entropy": 5.710943984985351, "epoch": 0.8342785129174544, "grad_norm": 2.046875, "learning_rate": 0.0004936744079185616, "loss": 5.5515, "mean_token_accuracy": 0.15136271864175796, "num_tokens": 18332129.0, "step": 9930 }, { "entropy": 5.780642127990722, "epoch": 0.8346985927326191, "grad_norm": 1.6640625, "learning_rate": 0.0004936673550412767, "loss": 5.6502, "mean_token_accuracy": 0.15562164932489395, "num_tokens": 18341457.0, "step": 9935 }, { "entropy": 5.817247200012207, "epoch": 0.835118672547784, "grad_norm": 1.515625, "learning_rate": 0.000493660298290412, "loss": 5.6352, "mean_token_accuracy": 0.14964016079902648, "num_tokens": 18351397.0, "step": 9940 }, { "entropy": 5.7535981178283695, "epoch": 0.8355387523629489, "grad_norm": 1.4609375, "learning_rate": 0.0004936532376660929, "loss": 5.5601, "mean_token_accuracy": 0.15686817914247514, "num_tokens": 18360005.0, "step": 9945 }, { "entropy": 5.856048727035523, "epoch": 0.8359588321781138, "grad_norm": 1.59375, "learning_rate": 0.0004936461731684442, "loss": 5.6621, "mean_token_accuracy": 0.15645960420370103, "num_tokens": 18369707.0, "step": 9950 }, { "entropy": 5.904961681365966, "epoch": 0.8363789119932787, "grad_norm": 2.875, "learning_rate": 0.0004936391047975912, "loss": 5.7951, "mean_token_accuracy": 0.14975984990596772, "num_tokens": 18379514.0, "step": 9955 }, { "entropy": 5.728058910369873, "epoch": 0.8367989918084436, "grad_norm": 1.4921875, "learning_rate": 0.0004936320325536589, "loss": 5.4989, "mean_token_accuracy": 0.15699619948863983, "num_tokens": 18388854.0, "step": 9960 }, { "entropy": 5.80841555595398, "epoch": 0.8372190716236085, "grad_norm": 1.59375, "learning_rate": 0.0004936249564367729, "loss": 5.6713, "mean_token_accuracy": 0.15378804504871368, "num_tokens": 18397806.0, "step": 9965 }, { "entropy": 5.713347768783569, "epoch": 0.8376391514387733, "grad_norm": 1.7109375, "learning_rate": 0.0004936178764470583, "loss": 5.5296, "mean_token_accuracy": 0.1534825384616852, "num_tokens": 18406645.0, "step": 9970 }, { "entropy": 5.6835887908935545, "epoch": 0.8380592312539382, "grad_norm": 1.625, "learning_rate": 0.0004936107925846405, "loss": 5.5458, "mean_token_accuracy": 0.15742876827716829, "num_tokens": 18415730.0, "step": 9975 }, { "entropy": 5.7930676460266115, "epoch": 0.8384793110691031, "grad_norm": 1.6484375, "learning_rate": 0.0004936037048496452, "loss": 5.6499, "mean_token_accuracy": 0.1560029774904251, "num_tokens": 18424638.0, "step": 9980 }, { "entropy": 5.799233627319336, "epoch": 0.838899390884268, "grad_norm": 1.75, "learning_rate": 0.0004935966132421977, "loss": 5.6852, "mean_token_accuracy": 0.14873172864317893, "num_tokens": 18434090.0, "step": 9985 }, { "entropy": 5.67788405418396, "epoch": 0.8393194706994329, "grad_norm": 1.6796875, "learning_rate": 0.0004935895177624239, "loss": 5.5532, "mean_token_accuracy": 0.1584454283118248, "num_tokens": 18442965.0, "step": 9990 }, { "entropy": 5.811638116836548, "epoch": 0.8397395505145978, "grad_norm": 1.4453125, "learning_rate": 0.0004935824184104493, "loss": 5.5789, "mean_token_accuracy": 0.1549446702003479, "num_tokens": 18451553.0, "step": 9995 }, { "entropy": 5.778439950942993, "epoch": 0.8401596303297627, "grad_norm": 2.046875, "learning_rate": 0.0004935753151863997, "loss": 5.6168, "mean_token_accuracy": 0.15213518738746643, "num_tokens": 18461325.0, "step": 10000 }, { "entropy": 5.781700515747071, "epoch": 0.8405797101449275, "grad_norm": 1.828125, "learning_rate": 0.0004935682080904009, "loss": 5.6206, "mean_token_accuracy": 0.16005493104457855, "num_tokens": 18469977.0, "step": 10005 }, { "entropy": 5.758043384552002, "epoch": 0.8409997899600924, "grad_norm": 1.7890625, "learning_rate": 0.0004935610971225789, "loss": 5.5862, "mean_token_accuracy": 0.1575999900698662, "num_tokens": 18479534.0, "step": 10010 }, { "entropy": 5.688985300064087, "epoch": 0.8414198697752573, "grad_norm": 1.53125, "learning_rate": 0.0004935539822830597, "loss": 5.6943, "mean_token_accuracy": 0.14613962322473525, "num_tokens": 18488800.0, "step": 10015 }, { "entropy": 5.7671685218811035, "epoch": 0.8418399495904222, "grad_norm": 1.5078125, "learning_rate": 0.000493546863571969, "loss": 5.6557, "mean_token_accuracy": 0.1554260805249214, "num_tokens": 18498083.0, "step": 10020 }, { "entropy": 5.817663335800171, "epoch": 0.8422600294055871, "grad_norm": 1.6484375, "learning_rate": 0.0004935397409894333, "loss": 5.6099, "mean_token_accuracy": 0.14785023778676987, "num_tokens": 18508265.0, "step": 10025 }, { "entropy": 5.810160112380982, "epoch": 0.842680109220752, "grad_norm": 1.7421875, "learning_rate": 0.0004935326145355787, "loss": 5.6445, "mean_token_accuracy": 0.15227773338556289, "num_tokens": 18517283.0, "step": 10030 }, { "entropy": 5.775955724716186, "epoch": 0.8431001890359168, "grad_norm": 1.7109375, "learning_rate": 0.0004935254842105311, "loss": 5.6577, "mean_token_accuracy": 0.158540278673172, "num_tokens": 18526482.0, "step": 10035 }, { "entropy": 5.6810362339019775, "epoch": 0.8435202688510817, "grad_norm": 1.609375, "learning_rate": 0.0004935183500144173, "loss": 5.4966, "mean_token_accuracy": 0.16830503046512604, "num_tokens": 18536150.0, "step": 10040 }, { "entropy": 5.821089220046997, "epoch": 0.8439403486662466, "grad_norm": 1.796875, "learning_rate": 0.0004935112119473634, "loss": 5.6978, "mean_token_accuracy": 0.15066490024328233, "num_tokens": 18545168.0, "step": 10045 }, { "entropy": 5.785538864135742, "epoch": 0.8443604284814115, "grad_norm": 1.546875, "learning_rate": 0.0004935040700094959, "loss": 5.6256, "mean_token_accuracy": 0.15842598676681519, "num_tokens": 18553363.0, "step": 10050 }, { "entropy": 5.740128374099731, "epoch": 0.8447805082965764, "grad_norm": 1.640625, "learning_rate": 0.0004934969242009412, "loss": 5.5817, "mean_token_accuracy": 0.15919749736785888, "num_tokens": 18562546.0, "step": 10055 }, { "entropy": 5.705161762237549, "epoch": 0.8452005881117413, "grad_norm": 1.96875, "learning_rate": 0.0004934897745218262, "loss": 5.6338, "mean_token_accuracy": 0.15164628773927688, "num_tokens": 18572149.0, "step": 10060 }, { "entropy": 5.729842662811279, "epoch": 0.8456206679269062, "grad_norm": 1.828125, "learning_rate": 0.0004934826209722772, "loss": 5.5077, "mean_token_accuracy": 0.1547485738992691, "num_tokens": 18580842.0, "step": 10065 }, { "entropy": 5.7600654602050785, "epoch": 0.8460407477420709, "grad_norm": 1.8828125, "learning_rate": 0.0004934754635524211, "loss": 5.6115, "mean_token_accuracy": 0.15985522121191026, "num_tokens": 18589765.0, "step": 10070 }, { "entropy": 5.762496757507324, "epoch": 0.8464608275572358, "grad_norm": 1.5390625, "learning_rate": 0.0004934683022623847, "loss": 5.6401, "mean_token_accuracy": 0.15011052042245865, "num_tokens": 18599532.0, "step": 10075 }, { "entropy": 5.685576343536377, "epoch": 0.8468809073724007, "grad_norm": 1.65625, "learning_rate": 0.0004934611371022947, "loss": 5.5281, "mean_token_accuracy": 0.16043669879436492, "num_tokens": 18608438.0, "step": 10080 }, { "entropy": 5.787454748153687, "epoch": 0.8473009871875656, "grad_norm": 1.5078125, "learning_rate": 0.0004934539680722783, "loss": 5.6793, "mean_token_accuracy": 0.1521899461746216, "num_tokens": 18617313.0, "step": 10085 }, { "entropy": 5.731491613388061, "epoch": 0.8477210670027305, "grad_norm": 1.5546875, "learning_rate": 0.0004934467951724622, "loss": 5.5123, "mean_token_accuracy": 0.1605857416987419, "num_tokens": 18625880.0, "step": 10090 }, { "entropy": 5.730096912384033, "epoch": 0.8481411468178954, "grad_norm": 1.40625, "learning_rate": 0.0004934396184029737, "loss": 5.6046, "mean_token_accuracy": 0.15527373552322388, "num_tokens": 18635727.0, "step": 10095 }, { "entropy": 5.769042825698852, "epoch": 0.8485612266330603, "grad_norm": 1.5078125, "learning_rate": 0.0004934324377639398, "loss": 5.662, "mean_token_accuracy": 0.15308721214532853, "num_tokens": 18645619.0, "step": 10100 }, { "entropy": 5.736938428878784, "epoch": 0.8489813064482251, "grad_norm": 1.6015625, "learning_rate": 0.0004934252532554878, "loss": 5.5544, "mean_token_accuracy": 0.1575164332985878, "num_tokens": 18654901.0, "step": 10105 }, { "entropy": 5.844228029251099, "epoch": 0.84940138626339, "grad_norm": 2.171875, "learning_rate": 0.0004934180648777449, "loss": 5.8122, "mean_token_accuracy": 0.15224194526672363, "num_tokens": 18664523.0, "step": 10110 }, { "entropy": 5.8306056499481205, "epoch": 0.8498214660785549, "grad_norm": 1.4375, "learning_rate": 0.0004934108726308384, "loss": 5.6362, "mean_token_accuracy": 0.14759955704212188, "num_tokens": 18673685.0, "step": 10115 }, { "entropy": 5.767707586288452, "epoch": 0.8502415458937198, "grad_norm": 1.78125, "learning_rate": 0.0004934036765148958, "loss": 5.6142, "mean_token_accuracy": 0.14617660790681838, "num_tokens": 18682889.0, "step": 10120 }, { "entropy": 5.758945083618164, "epoch": 0.8506616257088847, "grad_norm": 1.734375, "learning_rate": 0.0004933964765300446, "loss": 5.6533, "mean_token_accuracy": 0.15302490592002868, "num_tokens": 18692978.0, "step": 10125 }, { "entropy": 5.750522422790527, "epoch": 0.8510817055240496, "grad_norm": 1.7265625, "learning_rate": 0.000493389272676412, "loss": 5.5705, "mean_token_accuracy": 0.1600403904914856, "num_tokens": 18701846.0, "step": 10130 }, { "entropy": 5.79836106300354, "epoch": 0.8515017853392145, "grad_norm": 1.859375, "learning_rate": 0.0004933820649541262, "loss": 5.5935, "mean_token_accuracy": 0.16571370661258697, "num_tokens": 18711492.0, "step": 10135 }, { "entropy": 5.670457267761231, "epoch": 0.8519218651543793, "grad_norm": 1.65625, "learning_rate": 0.0004933748533633145, "loss": 5.5244, "mean_token_accuracy": 0.16938419491052628, "num_tokens": 18720407.0, "step": 10140 }, { "entropy": 5.713903999328613, "epoch": 0.8523419449695442, "grad_norm": 1.921875, "learning_rate": 0.0004933676379041045, "loss": 5.5771, "mean_token_accuracy": 0.1604509249329567, "num_tokens": 18729968.0, "step": 10145 }, { "entropy": 5.8019672393798825, "epoch": 0.8527620247847091, "grad_norm": 1.6328125, "learning_rate": 0.0004933604185766245, "loss": 5.6939, "mean_token_accuracy": 0.1484614282846451, "num_tokens": 18739525.0, "step": 10150 }, { "entropy": 5.755314731597901, "epoch": 0.853182104599874, "grad_norm": 1.53125, "learning_rate": 0.0004933531953810019, "loss": 5.5984, "mean_token_accuracy": 0.15788624286651612, "num_tokens": 18749087.0, "step": 10155 }, { "entropy": 5.818537855148316, "epoch": 0.8536021844150389, "grad_norm": 1.6640625, "learning_rate": 0.0004933459683173652, "loss": 5.6259, "mean_token_accuracy": 0.1562245801091194, "num_tokens": 18758174.0, "step": 10160 }, { "entropy": 5.796029376983642, "epoch": 0.8540222642302038, "grad_norm": 1.6171875, "learning_rate": 0.0004933387373858418, "loss": 5.6637, "mean_token_accuracy": 0.15472310557961463, "num_tokens": 18767679.0, "step": 10165 }, { "entropy": 5.743490171432495, "epoch": 0.8544423440453687, "grad_norm": 2.140625, "learning_rate": 0.0004933315025865602, "loss": 5.5875, "mean_token_accuracy": 0.15303896814584733, "num_tokens": 18776749.0, "step": 10170 }, { "entropy": 5.814285850524902, "epoch": 0.8548624238605335, "grad_norm": 1.4609375, "learning_rate": 0.0004933242639196485, "loss": 5.7667, "mean_token_accuracy": 0.14032013416290284, "num_tokens": 18786313.0, "step": 10175 }, { "entropy": 5.87596173286438, "epoch": 0.8552825036756984, "grad_norm": 2.171875, "learning_rate": 0.0004933170213852348, "loss": 5.632, "mean_token_accuracy": 0.15269517451524733, "num_tokens": 18795340.0, "step": 10180 }, { "entropy": 5.749491739273071, "epoch": 0.8557025834908633, "grad_norm": 1.5234375, "learning_rate": 0.0004933097749834476, "loss": 5.5675, "mean_token_accuracy": 0.1547122523188591, "num_tokens": 18804114.0, "step": 10185 }, { "entropy": 5.750264501571655, "epoch": 0.8561226633060282, "grad_norm": 1.9921875, "learning_rate": 0.000493302524714415, "loss": 5.5798, "mean_token_accuracy": 0.1528068631887436, "num_tokens": 18813797.0, "step": 10190 }, { "entropy": 5.751224422454834, "epoch": 0.856542743121193, "grad_norm": 1.6640625, "learning_rate": 0.0004932952705782657, "loss": 5.631, "mean_token_accuracy": 0.15325366854667663, "num_tokens": 18822410.0, "step": 10195 }, { "entropy": 5.709691667556763, "epoch": 0.856962822936358, "grad_norm": 1.546875, "learning_rate": 0.000493288012575128, "loss": 5.5632, "mean_token_accuracy": 0.1608235776424408, "num_tokens": 18832091.0, "step": 10200 }, { "entropy": 5.747391223907471, "epoch": 0.8573829027515227, "grad_norm": 1.890625, "learning_rate": 0.0004932807507051307, "loss": 5.5981, "mean_token_accuracy": 0.14849429577589035, "num_tokens": 18841298.0, "step": 10205 }, { "entropy": 5.7065764427185055, "epoch": 0.8578029825666876, "grad_norm": 1.5390625, "learning_rate": 0.0004932734849684022, "loss": 5.5663, "mean_token_accuracy": 0.15466026067733765, "num_tokens": 18849683.0, "step": 10210 }, { "entropy": 5.744755029678345, "epoch": 0.8582230623818525, "grad_norm": 1.8359375, "learning_rate": 0.0004932662153650712, "loss": 5.5082, "mean_token_accuracy": 0.15981326550245284, "num_tokens": 18858832.0, "step": 10215 }, { "entropy": 5.647493553161621, "epoch": 0.8586431421970174, "grad_norm": 2.171875, "learning_rate": 0.0004932589418952668, "loss": 5.5438, "mean_token_accuracy": 0.15799610018730165, "num_tokens": 18867652.0, "step": 10220 }, { "entropy": 5.78511266708374, "epoch": 0.8590632220121823, "grad_norm": 1.984375, "learning_rate": 0.0004932516645591175, "loss": 5.6315, "mean_token_accuracy": 0.1554282858967781, "num_tokens": 18877282.0, "step": 10225 }, { "entropy": 5.833698844909668, "epoch": 0.8594833018273472, "grad_norm": 1.765625, "learning_rate": 0.0004932443833567524, "loss": 5.7462, "mean_token_accuracy": 0.1505351722240448, "num_tokens": 18886565.0, "step": 10230 }, { "entropy": 5.777234220504761, "epoch": 0.8599033816425121, "grad_norm": 1.609375, "learning_rate": 0.0004932370982883003, "loss": 5.6656, "mean_token_accuracy": 0.15549270063638687, "num_tokens": 18896440.0, "step": 10235 }, { "entropy": 5.8239048480987545, "epoch": 0.8603234614576769, "grad_norm": 2.25, "learning_rate": 0.0004932298093538905, "loss": 5.6887, "mean_token_accuracy": 0.15299588292837143, "num_tokens": 18906246.0, "step": 10240 }, { "entropy": 5.746791028976441, "epoch": 0.8607435412728418, "grad_norm": 1.4375, "learning_rate": 0.000493222516553652, "loss": 5.5925, "mean_token_accuracy": 0.1533835083246231, "num_tokens": 18915108.0, "step": 10245 }, { "entropy": 5.781469821929932, "epoch": 0.8611636210880067, "grad_norm": 1.765625, "learning_rate": 0.0004932152198877139, "loss": 5.6, "mean_token_accuracy": 0.15372219830751419, "num_tokens": 18923664.0, "step": 10250 }, { "entropy": 5.7778332233428955, "epoch": 0.8615837009031716, "grad_norm": 1.7734375, "learning_rate": 0.0004932079193562057, "loss": 5.697, "mean_token_accuracy": 0.15252179205417632, "num_tokens": 18933496.0, "step": 10255 }, { "entropy": 5.733058881759644, "epoch": 0.8620037807183365, "grad_norm": 1.7109375, "learning_rate": 0.0004932006149592564, "loss": 5.5788, "mean_token_accuracy": 0.15552108436822892, "num_tokens": 18942222.0, "step": 10260 }, { "entropy": 5.810169363021851, "epoch": 0.8624238605335014, "grad_norm": 1.9609375, "learning_rate": 0.0004931933066969957, "loss": 5.5888, "mean_token_accuracy": 0.15849068462848664, "num_tokens": 18952057.0, "step": 10265 }, { "entropy": 5.738401651382446, "epoch": 0.8628439403486663, "grad_norm": 1.609375, "learning_rate": 0.0004931859945695528, "loss": 5.6356, "mean_token_accuracy": 0.15441264659166337, "num_tokens": 18961664.0, "step": 10270 }, { "entropy": 5.665639925003052, "epoch": 0.8632640201638311, "grad_norm": 2.296875, "learning_rate": 0.0004931786785770575, "loss": 5.429, "mean_token_accuracy": 0.16940231174230574, "num_tokens": 18969900.0, "step": 10275 }, { "entropy": 5.793166017532348, "epoch": 0.863684099978996, "grad_norm": 1.65625, "learning_rate": 0.0004931713587196392, "loss": 5.7206, "mean_token_accuracy": 0.1475231796503067, "num_tokens": 18979286.0, "step": 10280 }, { "entropy": 5.855304002761841, "epoch": 0.8641041797941609, "grad_norm": 1.53125, "learning_rate": 0.0004931640349974275, "loss": 5.603, "mean_token_accuracy": 0.1532246984541416, "num_tokens": 18987553.0, "step": 10285 }, { "entropy": 5.77991795539856, "epoch": 0.8645242596093258, "grad_norm": 1.71875, "learning_rate": 0.0004931567074105524, "loss": 5.6872, "mean_token_accuracy": 0.15210114121437074, "num_tokens": 18996354.0, "step": 10290 }, { "entropy": 5.688443899154663, "epoch": 0.8649443394244907, "grad_norm": 3.109375, "learning_rate": 0.0004931493759591435, "loss": 5.5749, "mean_token_accuracy": 0.15452788174152374, "num_tokens": 19005150.0, "step": 10295 }, { "entropy": 5.801825380325317, "epoch": 0.8653644192396556, "grad_norm": 1.8984375, "learning_rate": 0.0004931420406433308, "loss": 5.5793, "mean_token_accuracy": 0.15020548403263093, "num_tokens": 19014572.0, "step": 10300 }, { "entropy": 5.703862047195434, "epoch": 0.8657844990548205, "grad_norm": 1.6171875, "learning_rate": 0.000493134701463244, "loss": 5.4508, "mean_token_accuracy": 0.16280461698770524, "num_tokens": 19023462.0, "step": 10305 }, { "entropy": 5.649288606643677, "epoch": 0.8662045788699853, "grad_norm": 1.71875, "learning_rate": 0.0004931273584190135, "loss": 5.5405, "mean_token_accuracy": 0.15991990268230438, "num_tokens": 19032460.0, "step": 10310 }, { "entropy": 5.731163692474365, "epoch": 0.8666246586851502, "grad_norm": 1.8671875, "learning_rate": 0.0004931200115107691, "loss": 5.579, "mean_token_accuracy": 0.16041069328784943, "num_tokens": 19041734.0, "step": 10315 }, { "entropy": 5.697036027908325, "epoch": 0.867044738500315, "grad_norm": 1.7109375, "learning_rate": 0.000493112660738641, "loss": 5.5608, "mean_token_accuracy": 0.15314172506332396, "num_tokens": 19050867.0, "step": 10320 }, { "entropy": 5.708456945419312, "epoch": 0.86746481831548, "grad_norm": 1.90625, "learning_rate": 0.0004931053061027594, "loss": 5.5539, "mean_token_accuracy": 0.15272417664527893, "num_tokens": 19060518.0, "step": 10325 }, { "entropy": 5.742541694641114, "epoch": 0.8678848981306448, "grad_norm": 1.5625, "learning_rate": 0.0004930979476032546, "loss": 5.5539, "mean_token_accuracy": 0.15664585381746293, "num_tokens": 19069588.0, "step": 10330 }, { "entropy": 5.725212717056275, "epoch": 0.8683049779458097, "grad_norm": 1.65625, "learning_rate": 0.000493090585240257, "loss": 5.6005, "mean_token_accuracy": 0.14247507825493813, "num_tokens": 19079060.0, "step": 10335 }, { "entropy": 5.6803240299224855, "epoch": 0.8687250577609746, "grad_norm": 1.9375, "learning_rate": 0.0004930832190138969, "loss": 5.533, "mean_token_accuracy": 0.15190561562776567, "num_tokens": 19087721.0, "step": 10340 }, { "entropy": 5.769875383377075, "epoch": 0.8691451375761394, "grad_norm": 1.5078125, "learning_rate": 0.000493075848924305, "loss": 5.5676, "mean_token_accuracy": 0.1551969662308693, "num_tokens": 19096800.0, "step": 10345 }, { "entropy": 5.790397357940674, "epoch": 0.8695652173913043, "grad_norm": 1.4921875, "learning_rate": 0.0004930684749716117, "loss": 5.6411, "mean_token_accuracy": 0.15215054303407669, "num_tokens": 19106774.0, "step": 10350 }, { "entropy": 5.751374912261963, "epoch": 0.8699852972064692, "grad_norm": 1.7578125, "learning_rate": 0.0004930610971559476, "loss": 5.5861, "mean_token_accuracy": 0.1551279380917549, "num_tokens": 19116413.0, "step": 10355 }, { "entropy": 5.739291095733643, "epoch": 0.8704053770216341, "grad_norm": 1.65625, "learning_rate": 0.0004930537154774436, "loss": 5.6015, "mean_token_accuracy": 0.15086202025413514, "num_tokens": 19125363.0, "step": 10360 }, { "entropy": 5.794745826721192, "epoch": 0.870825456836799, "grad_norm": 1.6171875, "learning_rate": 0.0004930463299362302, "loss": 5.6984, "mean_token_accuracy": 0.14360912814736365, "num_tokens": 19135461.0, "step": 10365 }, { "entropy": 5.806246614456176, "epoch": 0.8712455366519639, "grad_norm": 2.296875, "learning_rate": 0.0004930389405324383, "loss": 5.5582, "mean_token_accuracy": 0.16600679904222487, "num_tokens": 19144085.0, "step": 10370 }, { "entropy": 5.762925720214843, "epoch": 0.8716656164671287, "grad_norm": 2.15625, "learning_rate": 0.0004930315472661987, "loss": 5.5741, "mean_token_accuracy": 0.15904655829071998, "num_tokens": 19153291.0, "step": 10375 }, { "entropy": 5.732652473449707, "epoch": 0.8720856962822936, "grad_norm": 1.796875, "learning_rate": 0.0004930241501376428, "loss": 5.5947, "mean_token_accuracy": 0.15122335851192475, "num_tokens": 19163514.0, "step": 10380 }, { "entropy": 5.602568197250366, "epoch": 0.8725057760974585, "grad_norm": 1.59375, "learning_rate": 0.0004930167491469013, "loss": 5.4792, "mean_token_accuracy": 0.1624978721141815, "num_tokens": 19172103.0, "step": 10385 }, { "entropy": 5.75473918914795, "epoch": 0.8729258559126234, "grad_norm": 1.5078125, "learning_rate": 0.0004930093442941053, "loss": 5.5509, "mean_token_accuracy": 0.15365159437060355, "num_tokens": 19180893.0, "step": 10390 }, { "entropy": 5.764384841918945, "epoch": 0.8733459357277883, "grad_norm": 1.375, "learning_rate": 0.0004930019355793858, "loss": 5.4714, "mean_token_accuracy": 0.1572717860341072, "num_tokens": 19190495.0, "step": 10395 }, { "entropy": 5.709274530410767, "epoch": 0.8737660155429532, "grad_norm": 1.53125, "learning_rate": 0.0004929945230028746, "loss": 5.5633, "mean_token_accuracy": 0.16117294877767563, "num_tokens": 19198988.0, "step": 10400 }, { "entropy": 5.656596994400024, "epoch": 0.8741860953581181, "grad_norm": 1.8671875, "learning_rate": 0.0004929871065647024, "loss": 5.4723, "mean_token_accuracy": 0.1623318910598755, "num_tokens": 19208014.0, "step": 10405 }, { "entropy": 5.754249525070191, "epoch": 0.8746061751732829, "grad_norm": 1.578125, "learning_rate": 0.0004929796862650011, "loss": 5.6686, "mean_token_accuracy": 0.15798502415418625, "num_tokens": 19218220.0, "step": 10410 }, { "entropy": 5.750339126586914, "epoch": 0.8750262549884478, "grad_norm": 1.5625, "learning_rate": 0.0004929722621039018, "loss": 5.5613, "mean_token_accuracy": 0.1570570647716522, "num_tokens": 19227176.0, "step": 10415 }, { "entropy": 5.721258115768433, "epoch": 0.8754463348036127, "grad_norm": 1.6171875, "learning_rate": 0.0004929648340815362, "loss": 5.5929, "mean_token_accuracy": 0.15091799348592758, "num_tokens": 19236085.0, "step": 10420 }, { "entropy": 5.767314195632935, "epoch": 0.8758664146187776, "grad_norm": 1.6875, "learning_rate": 0.0004929574021980355, "loss": 5.643, "mean_token_accuracy": 0.1486381933093071, "num_tokens": 19246671.0, "step": 10425 }, { "entropy": 5.76701602935791, "epoch": 0.8762864944339425, "grad_norm": 1.609375, "learning_rate": 0.0004929499664535319, "loss": 5.5492, "mean_token_accuracy": 0.15346565693616868, "num_tokens": 19256321.0, "step": 10430 }, { "entropy": 5.763290786743164, "epoch": 0.8767065742491074, "grad_norm": 2.296875, "learning_rate": 0.0004929425268481569, "loss": 5.5126, "mean_token_accuracy": 0.1608709618449211, "num_tokens": 19265518.0, "step": 10435 }, { "entropy": 5.718894052505493, "epoch": 0.8771266540642723, "grad_norm": 1.671875, "learning_rate": 0.0004929350833820422, "loss": 5.5147, "mean_token_accuracy": 0.15873141810297967, "num_tokens": 19274120.0, "step": 10440 }, { "entropy": 5.731625127792358, "epoch": 0.877546733879437, "grad_norm": 1.5625, "learning_rate": 0.0004929276360553197, "loss": 5.5882, "mean_token_accuracy": 0.16043589189648627, "num_tokens": 19284377.0, "step": 10445 }, { "entropy": 5.711872720718384, "epoch": 0.8779668136946019, "grad_norm": 1.6953125, "learning_rate": 0.0004929201848681213, "loss": 5.4576, "mean_token_accuracy": 0.15541169792413712, "num_tokens": 19293326.0, "step": 10450 }, { "entropy": 5.690513658523559, "epoch": 0.8783868935097668, "grad_norm": 1.796875, "learning_rate": 0.0004929127298205792, "loss": 5.5079, "mean_token_accuracy": 0.1659105733036995, "num_tokens": 19302086.0, "step": 10455 }, { "entropy": 5.804715394973755, "epoch": 0.8788069733249317, "grad_norm": 1.5703125, "learning_rate": 0.0004929052709128251, "loss": 5.5488, "mean_token_accuracy": 0.1627936765551567, "num_tokens": 19310124.0, "step": 10460 }, { "entropy": 5.633396434783935, "epoch": 0.8792270531400966, "grad_norm": 1.7109375, "learning_rate": 0.0004928978081449914, "loss": 5.5709, "mean_token_accuracy": 0.15216370820999145, "num_tokens": 19321269.0, "step": 10465 }, { "entropy": 5.696399784088134, "epoch": 0.8796471329552615, "grad_norm": 1.5546875, "learning_rate": 0.0004928903415172103, "loss": 5.5728, "mean_token_accuracy": 0.15912040174007416, "num_tokens": 19330390.0, "step": 10470 }, { "entropy": 5.818605709075928, "epoch": 0.8800672127704264, "grad_norm": 1.6015625, "learning_rate": 0.000492882871029614, "loss": 5.5743, "mean_token_accuracy": 0.15722174644470216, "num_tokens": 19339457.0, "step": 10475 }, { "entropy": 5.749679517745972, "epoch": 0.8804872925855912, "grad_norm": 1.65625, "learning_rate": 0.0004928753966823348, "loss": 5.638, "mean_token_accuracy": 0.15191923528909684, "num_tokens": 19348710.0, "step": 10480 }, { "entropy": 5.747959899902344, "epoch": 0.8809073724007561, "grad_norm": 1.640625, "learning_rate": 0.0004928679184755051, "loss": 5.6689, "mean_token_accuracy": 0.15637236088514328, "num_tokens": 19357215.0, "step": 10485 }, { "entropy": 5.747460222244262, "epoch": 0.881327452215921, "grad_norm": 1.71875, "learning_rate": 0.0004928604364092574, "loss": 5.6071, "mean_token_accuracy": 0.15696584284305573, "num_tokens": 19366043.0, "step": 10490 }, { "entropy": 5.8075761795043945, "epoch": 0.8817475320310859, "grad_norm": 2.078125, "learning_rate": 0.0004928529504837243, "loss": 5.6882, "mean_token_accuracy": 0.15294934064149857, "num_tokens": 19375468.0, "step": 10495 }, { "entropy": 5.845993375778198, "epoch": 0.8821676118462508, "grad_norm": 1.40625, "learning_rate": 0.0004928454606990383, "loss": 5.5475, "mean_token_accuracy": 0.16165847033262254, "num_tokens": 19384467.0, "step": 10500 }, { "entropy": 5.70394549369812, "epoch": 0.8825876916614157, "grad_norm": 1.5078125, "learning_rate": 0.0004928379670553322, "loss": 5.5885, "mean_token_accuracy": 0.15876393169164657, "num_tokens": 19393618.0, "step": 10505 }, { "entropy": 5.758576488494873, "epoch": 0.8830077714765806, "grad_norm": 1.5, "learning_rate": 0.0004928304695527387, "loss": 5.6432, "mean_token_accuracy": 0.15267120897769929, "num_tokens": 19402921.0, "step": 10510 }, { "entropy": 5.864232301712036, "epoch": 0.8834278512917454, "grad_norm": 1.890625, "learning_rate": 0.0004928229681913905, "loss": 5.6261, "mean_token_accuracy": 0.15496253222227097, "num_tokens": 19412048.0, "step": 10515 }, { "entropy": 5.862086200714112, "epoch": 0.8838479311069103, "grad_norm": 2.703125, "learning_rate": 0.0004928154629714207, "loss": 5.6081, "mean_token_accuracy": 0.15387734174728393, "num_tokens": 19420993.0, "step": 10520 }, { "entropy": 5.727069139480591, "epoch": 0.8842680109220752, "grad_norm": 2.1875, "learning_rate": 0.000492807953892962, "loss": 5.5841, "mean_token_accuracy": 0.15330443829298018, "num_tokens": 19430145.0, "step": 10525 }, { "entropy": 5.723509407043457, "epoch": 0.8846880907372401, "grad_norm": 1.453125, "learning_rate": 0.0004928004409561476, "loss": 5.4892, "mean_token_accuracy": 0.15867023319005966, "num_tokens": 19438918.0, "step": 10530 }, { "entropy": 5.691130256652832, "epoch": 0.885108170552405, "grad_norm": 1.46875, "learning_rate": 0.0004927929241611106, "loss": 5.5303, "mean_token_accuracy": 0.1610460638999939, "num_tokens": 19448490.0, "step": 10535 }, { "entropy": 5.709879350662232, "epoch": 0.8855282503675699, "grad_norm": 1.625, "learning_rate": 0.000492785403507984, "loss": 5.6012, "mean_token_accuracy": 0.1556025877594948, "num_tokens": 19457098.0, "step": 10540 }, { "entropy": 5.761733865737915, "epoch": 0.8859483301827347, "grad_norm": 1.953125, "learning_rate": 0.0004927778789969012, "loss": 5.5863, "mean_token_accuracy": 0.15728465467691422, "num_tokens": 19466419.0, "step": 10545 }, { "entropy": 5.740839338302612, "epoch": 0.8863684099978996, "grad_norm": 1.515625, "learning_rate": 0.0004927703506279955, "loss": 5.6421, "mean_token_accuracy": 0.14617049992084502, "num_tokens": 19475882.0, "step": 10550 }, { "entropy": 5.88862247467041, "epoch": 0.8867884898130645, "grad_norm": 2.0625, "learning_rate": 0.0004927628184014, "loss": 5.6836, "mean_token_accuracy": 0.15036097317934036, "num_tokens": 19485917.0, "step": 10555 }, { "entropy": 5.807638216018677, "epoch": 0.8872085696282294, "grad_norm": 1.84375, "learning_rate": 0.0004927552823172483, "loss": 5.608, "mean_token_accuracy": 0.1534525066614151, "num_tokens": 19494984.0, "step": 10560 }, { "entropy": 5.803097820281982, "epoch": 0.8876286494433943, "grad_norm": 1.6484375, "learning_rate": 0.000492747742375674, "loss": 5.5521, "mean_token_accuracy": 0.16029339879751206, "num_tokens": 19504087.0, "step": 10565 }, { "entropy": 5.809068632125855, "epoch": 0.8880487292585592, "grad_norm": 1.71875, "learning_rate": 0.0004927401985768106, "loss": 5.6142, "mean_token_accuracy": 0.15856605321168898, "num_tokens": 19512880.0, "step": 10570 }, { "entropy": 5.732918643951416, "epoch": 0.888468809073724, "grad_norm": 2.296875, "learning_rate": 0.0004927326509207915, "loss": 5.5741, "mean_token_accuracy": 0.1594431221485138, "num_tokens": 19521723.0, "step": 10575 }, { "entropy": 5.782747840881347, "epoch": 0.8888888888888888, "grad_norm": 1.7265625, "learning_rate": 0.0004927250994077508, "loss": 5.66, "mean_token_accuracy": 0.15072188079357146, "num_tokens": 19531352.0, "step": 10580 }, { "entropy": 5.858024024963379, "epoch": 0.8893089687040537, "grad_norm": 2.046875, "learning_rate": 0.000492717544037822, "loss": 5.7545, "mean_token_accuracy": 0.15927736610174179, "num_tokens": 19540943.0, "step": 10585 }, { "entropy": 5.770633697509766, "epoch": 0.8897290485192186, "grad_norm": 1.640625, "learning_rate": 0.000492709984811139, "loss": 5.5227, "mean_token_accuracy": 0.1598847970366478, "num_tokens": 19550527.0, "step": 10590 }, { "entropy": 5.72091121673584, "epoch": 0.8901491283343835, "grad_norm": 2.078125, "learning_rate": 0.0004927024217278358, "loss": 5.5219, "mean_token_accuracy": 0.16189746409654618, "num_tokens": 19559746.0, "step": 10595 }, { "entropy": 5.759682607650757, "epoch": 0.8905692081495484, "grad_norm": 1.5546875, "learning_rate": 0.0004926948547880462, "loss": 5.6816, "mean_token_accuracy": 0.14713766053318977, "num_tokens": 19569286.0, "step": 10600 }, { "entropy": 5.684707307815552, "epoch": 0.8909892879647133, "grad_norm": 1.9140625, "learning_rate": 0.0004926872839919044, "loss": 5.5681, "mean_token_accuracy": 0.15598509460687637, "num_tokens": 19578245.0, "step": 10605 }, { "entropy": 5.722670841217041, "epoch": 0.8914093677798782, "grad_norm": 1.7890625, "learning_rate": 0.0004926797093395446, "loss": 5.5325, "mean_token_accuracy": 0.16016458123922347, "num_tokens": 19587244.0, "step": 10610 }, { "entropy": 5.762173748016357, "epoch": 0.891829447595043, "grad_norm": 2.953125, "learning_rate": 0.0004926721308311006, "loss": 5.615, "mean_token_accuracy": 0.15994844064116479, "num_tokens": 19596932.0, "step": 10615 }, { "entropy": 5.879995727539063, "epoch": 0.8922495274102079, "grad_norm": 1.7578125, "learning_rate": 0.0004926645484667069, "loss": 5.7186, "mean_token_accuracy": 0.14976566582918166, "num_tokens": 19606256.0, "step": 10620 }, { "entropy": 5.882073593139649, "epoch": 0.8926696072253728, "grad_norm": 2.0625, "learning_rate": 0.0004926569622464979, "loss": 5.7089, "mean_token_accuracy": 0.15212067142128943, "num_tokens": 19615726.0, "step": 10625 }, { "entropy": 5.8041211605072025, "epoch": 0.8930896870405377, "grad_norm": 2.84375, "learning_rate": 0.0004926493721706079, "loss": 5.5764, "mean_token_accuracy": 0.1547590583562851, "num_tokens": 19624037.0, "step": 10630 }, { "entropy": 5.756782007217407, "epoch": 0.8935097668557026, "grad_norm": 1.78125, "learning_rate": 0.0004926417782391713, "loss": 5.5781, "mean_token_accuracy": 0.16269729286432266, "num_tokens": 19632882.0, "step": 10635 }, { "entropy": 5.793702459335327, "epoch": 0.8939298466708675, "grad_norm": 1.734375, "learning_rate": 0.0004926341804523227, "loss": 5.6828, "mean_token_accuracy": 0.15286366492509842, "num_tokens": 19642686.0, "step": 10640 }, { "entropy": 5.759325933456421, "epoch": 0.8943499264860324, "grad_norm": 1.6640625, "learning_rate": 0.0004926265788101966, "loss": 5.5821, "mean_token_accuracy": 0.15535581558942796, "num_tokens": 19651380.0, "step": 10645 }, { "entropy": 5.718085050582886, "epoch": 0.8947700063011972, "grad_norm": 1.6484375, "learning_rate": 0.0004926189733129278, "loss": 5.5035, "mean_token_accuracy": 0.15965501517057418, "num_tokens": 19660136.0, "step": 10650 }, { "entropy": 5.696755981445312, "epoch": 0.8951900861163621, "grad_norm": 2.1875, "learning_rate": 0.0004926113639606509, "loss": 5.5569, "mean_token_accuracy": 0.16951919198036194, "num_tokens": 19669146.0, "step": 10655 }, { "entropy": 5.8493866443634035, "epoch": 0.895610165931527, "grad_norm": 1.875, "learning_rate": 0.0004926037507535008, "loss": 5.6893, "mean_token_accuracy": 0.15577448457479476, "num_tokens": 19678627.0, "step": 10660 }, { "entropy": 5.76816759109497, "epoch": 0.8960302457466919, "grad_norm": 1.8046875, "learning_rate": 0.0004925961336916122, "loss": 5.6246, "mean_token_accuracy": 0.15917440131306648, "num_tokens": 19688033.0, "step": 10665 }, { "entropy": 5.772870635986328, "epoch": 0.8964503255618568, "grad_norm": 1.8671875, "learning_rate": 0.0004925885127751202, "loss": 5.6191, "mean_token_accuracy": 0.15711403042078018, "num_tokens": 19696523.0, "step": 10670 }, { "entropy": 5.815629243850708, "epoch": 0.8968704053770217, "grad_norm": 2.09375, "learning_rate": 0.0004925808880041596, "loss": 5.5466, "mean_token_accuracy": 0.1619081273674965, "num_tokens": 19706339.0, "step": 10675 }, { "entropy": 5.771422576904297, "epoch": 0.8972904851921865, "grad_norm": 1.9296875, "learning_rate": 0.0004925732593788658, "loss": 5.5756, "mean_token_accuracy": 0.15582350715994836, "num_tokens": 19714779.0, "step": 10680 }, { "entropy": 5.788242959976197, "epoch": 0.8977105650073514, "grad_norm": 2.109375, "learning_rate": 0.0004925656268993737, "loss": 5.6434, "mean_token_accuracy": 0.15538930594921113, "num_tokens": 19723727.0, "step": 10685 }, { "entropy": 5.679297971725464, "epoch": 0.8981306448225163, "grad_norm": 1.7578125, "learning_rate": 0.0004925579905658185, "loss": 5.6078, "mean_token_accuracy": 0.15833698809146882, "num_tokens": 19732783.0, "step": 10690 }, { "entropy": 5.815406656265258, "epoch": 0.8985507246376812, "grad_norm": 1.8828125, "learning_rate": 0.0004925503503783355, "loss": 5.5923, "mean_token_accuracy": 0.14969452172517778, "num_tokens": 19741268.0, "step": 10695 }, { "entropy": 5.8421392917633055, "epoch": 0.898970804452846, "grad_norm": 2.03125, "learning_rate": 0.0004925427063370601, "loss": 5.5229, "mean_token_accuracy": 0.1585152953863144, "num_tokens": 19751490.0, "step": 10700 }, { "entropy": 5.75557165145874, "epoch": 0.899390884268011, "grad_norm": 1.4921875, "learning_rate": 0.0004925350584421278, "loss": 5.5722, "mean_token_accuracy": 0.15308883041143417, "num_tokens": 19760487.0, "step": 10705 }, { "entropy": 5.820067501068115, "epoch": 0.8998109640831758, "grad_norm": 1.78125, "learning_rate": 0.0004925274066936738, "loss": 5.5441, "mean_token_accuracy": 0.16286628544330597, "num_tokens": 19768984.0, "step": 10710 }, { "entropy": 5.693412828445434, "epoch": 0.9002310438983406, "grad_norm": 1.7109375, "learning_rate": 0.0004925197510918339, "loss": 5.5163, "mean_token_accuracy": 0.1612228661775589, "num_tokens": 19778335.0, "step": 10715 }, { "entropy": 5.740248203277588, "epoch": 0.9006511237135055, "grad_norm": 1.78125, "learning_rate": 0.0004925120916367435, "loss": 5.66, "mean_token_accuracy": 0.14562905877828597, "num_tokens": 19789082.0, "step": 10720 }, { "entropy": 5.676235198974609, "epoch": 0.9010712035286704, "grad_norm": 1.5078125, "learning_rate": 0.0004925044283285384, "loss": 5.3958, "mean_token_accuracy": 0.17226272374391555, "num_tokens": 19797902.0, "step": 10725 }, { "entropy": 5.674381303787231, "epoch": 0.9014912833438353, "grad_norm": 1.515625, "learning_rate": 0.0004924967611673544, "loss": 5.567, "mean_token_accuracy": 0.15973830968141556, "num_tokens": 19806481.0, "step": 10730 }, { "entropy": 5.625586986541748, "epoch": 0.9019113631590002, "grad_norm": 1.59375, "learning_rate": 0.0004924890901533273, "loss": 5.4518, "mean_token_accuracy": 0.16687363982200623, "num_tokens": 19815226.0, "step": 10735 }, { "entropy": 5.865736722946167, "epoch": 0.9023314429741651, "grad_norm": 1.3984375, "learning_rate": 0.0004924814152865929, "loss": 5.6794, "mean_token_accuracy": 0.14995559379458429, "num_tokens": 19824577.0, "step": 10740 }, { "entropy": 5.814121675491333, "epoch": 0.90275152278933, "grad_norm": 1.5, "learning_rate": 0.0004924737365672873, "loss": 5.5908, "mean_token_accuracy": 0.15056767463684081, "num_tokens": 19832936.0, "step": 10745 }, { "entropy": 5.820723390579223, "epoch": 0.9031716026044948, "grad_norm": 1.46875, "learning_rate": 0.0004924660539955463, "loss": 5.7351, "mean_token_accuracy": 0.15998328030109404, "num_tokens": 19841946.0, "step": 10750 }, { "entropy": 5.750902080535889, "epoch": 0.9035916824196597, "grad_norm": 1.484375, "learning_rate": 0.0004924583675715063, "loss": 5.6077, "mean_token_accuracy": 0.15404654592275618, "num_tokens": 19851469.0, "step": 10755 }, { "entropy": 5.799461030960083, "epoch": 0.9040117622348246, "grad_norm": 1.578125, "learning_rate": 0.0004924506772953031, "loss": 5.678, "mean_token_accuracy": 0.15529222413897514, "num_tokens": 19860731.0, "step": 10760 }, { "entropy": 5.758323049545288, "epoch": 0.9044318420499895, "grad_norm": 1.7109375, "learning_rate": 0.0004924429831670733, "loss": 5.6852, "mean_token_accuracy": 0.14765787720680237, "num_tokens": 19869717.0, "step": 10765 }, { "entropy": 5.825065422058105, "epoch": 0.9048519218651544, "grad_norm": 1.609375, "learning_rate": 0.000492435285186953, "loss": 5.6377, "mean_token_accuracy": 0.15890030115842818, "num_tokens": 19879229.0, "step": 10770 }, { "entropy": 5.873213052749634, "epoch": 0.9052720016803193, "grad_norm": 1.7109375, "learning_rate": 0.0004924275833550785, "loss": 5.6228, "mean_token_accuracy": 0.1515662133693695, "num_tokens": 19888260.0, "step": 10775 }, { "entropy": 5.827171325683594, "epoch": 0.9056920814954842, "grad_norm": 1.828125, "learning_rate": 0.0004924198776715865, "loss": 5.6436, "mean_token_accuracy": 0.16024628281593323, "num_tokens": 19897070.0, "step": 10780 }, { "entropy": 5.7876802444458, "epoch": 0.906112161310649, "grad_norm": 1.7421875, "learning_rate": 0.0004924121681366132, "loss": 5.6284, "mean_token_accuracy": 0.15037994906306268, "num_tokens": 19907170.0, "step": 10785 }, { "entropy": 5.804350471496582, "epoch": 0.9065322411258139, "grad_norm": 1.859375, "learning_rate": 0.0004924044547502951, "loss": 5.5682, "mean_token_accuracy": 0.1583652213215828, "num_tokens": 19917220.0, "step": 10790 }, { "entropy": 5.744189023971558, "epoch": 0.9069523209409788, "grad_norm": 1.84375, "learning_rate": 0.0004923967375127692, "loss": 5.6334, "mean_token_accuracy": 0.15887839794158937, "num_tokens": 19926724.0, "step": 10795 }, { "entropy": 5.845341348648072, "epoch": 0.9073724007561437, "grad_norm": 1.9140625, "learning_rate": 0.000492389016424172, "loss": 5.7404, "mean_token_accuracy": 0.15144012570381166, "num_tokens": 19936429.0, "step": 10800 }, { "entropy": 5.758127307891845, "epoch": 0.9077924805713086, "grad_norm": 1.7421875, "learning_rate": 0.0004923812914846404, "loss": 5.5099, "mean_token_accuracy": 0.15872399806976317, "num_tokens": 19945096.0, "step": 10805 }, { "entropy": 5.708646059036255, "epoch": 0.9082125603864735, "grad_norm": 2.078125, "learning_rate": 0.0004923735626943111, "loss": 5.5856, "mean_token_accuracy": 0.16495954543352126, "num_tokens": 19953560.0, "step": 10810 }, { "entropy": 5.765500879287719, "epoch": 0.9086326402016384, "grad_norm": 1.4296875, "learning_rate": 0.0004923658300533211, "loss": 5.5682, "mean_token_accuracy": 0.1555124580860138, "num_tokens": 19962669.0, "step": 10815 }, { "entropy": 5.802539348602295, "epoch": 0.9090527200168032, "grad_norm": 1.625, "learning_rate": 0.0004923580935618073, "loss": 5.612, "mean_token_accuracy": 0.1580589756369591, "num_tokens": 19971990.0, "step": 10820 }, { "entropy": 5.759839391708374, "epoch": 0.909472799831968, "grad_norm": 1.6640625, "learning_rate": 0.0004923503532199069, "loss": 5.6108, "mean_token_accuracy": 0.15835360288619996, "num_tokens": 19981850.0, "step": 10825 }, { "entropy": 5.804291439056397, "epoch": 0.909892879647133, "grad_norm": 1.4765625, "learning_rate": 0.0004923426090277567, "loss": 5.6433, "mean_token_accuracy": 0.15101254507899284, "num_tokens": 19991574.0, "step": 10830 }, { "entropy": 5.788902282714844, "epoch": 0.9103129594622978, "grad_norm": 1.5234375, "learning_rate": 0.0004923348609854943, "loss": 5.6121, "mean_token_accuracy": 0.16281114518642426, "num_tokens": 20001392.0, "step": 10835 }, { "entropy": 5.778925085067749, "epoch": 0.9107330392774627, "grad_norm": 1.8515625, "learning_rate": 0.0004923271090932566, "loss": 5.6512, "mean_token_accuracy": 0.1461693450808525, "num_tokens": 20011277.0, "step": 10840 }, { "entropy": 5.704980707168579, "epoch": 0.9111531190926276, "grad_norm": 1.640625, "learning_rate": 0.0004923193533511812, "loss": 5.5568, "mean_token_accuracy": 0.1563573181629181, "num_tokens": 20021171.0, "step": 10845 }, { "entropy": 5.873466444015503, "epoch": 0.9115731989077924, "grad_norm": 1.96875, "learning_rate": 0.0004923115937594053, "loss": 5.6403, "mean_token_accuracy": 0.15872172266244888, "num_tokens": 20030189.0, "step": 10850 }, { "entropy": 5.826998519897461, "epoch": 0.9119932787229573, "grad_norm": 1.875, "learning_rate": 0.0004923038303180664, "loss": 5.6089, "mean_token_accuracy": 0.16154826879501344, "num_tokens": 20038287.0, "step": 10855 }, { "entropy": 5.704780101776123, "epoch": 0.9124133585381222, "grad_norm": 1.5859375, "learning_rate": 0.000492296063027302, "loss": 5.6242, "mean_token_accuracy": 0.1486751489341259, "num_tokens": 20047653.0, "step": 10860 }, { "entropy": 5.720272779464722, "epoch": 0.9128334383532871, "grad_norm": 2.296875, "learning_rate": 0.0004922882918872498, "loss": 5.611, "mean_token_accuracy": 0.15257783234119415, "num_tokens": 20057415.0, "step": 10865 }, { "entropy": 5.843525409698486, "epoch": 0.913253518168452, "grad_norm": 1.8203125, "learning_rate": 0.0004922805168980475, "loss": 5.6436, "mean_token_accuracy": 0.1583248570561409, "num_tokens": 20065996.0, "step": 10870 }, { "entropy": 5.765225791931153, "epoch": 0.9136735979836169, "grad_norm": 1.9296875, "learning_rate": 0.0004922727380598326, "loss": 5.5794, "mean_token_accuracy": 0.15503590703010559, "num_tokens": 20075376.0, "step": 10875 }, { "entropy": 5.7751857280731205, "epoch": 0.9140936777987818, "grad_norm": 1.71875, "learning_rate": 0.000492264955372743, "loss": 5.6108, "mean_token_accuracy": 0.14910593926906585, "num_tokens": 20084950.0, "step": 10880 }, { "entropy": 5.851974725723267, "epoch": 0.9145137576139466, "grad_norm": 1.4921875, "learning_rate": 0.0004922571688369165, "loss": 5.5881, "mean_token_accuracy": 0.1583369717001915, "num_tokens": 20094011.0, "step": 10885 }, { "entropy": 5.7428583145141605, "epoch": 0.9149338374291115, "grad_norm": 1.640625, "learning_rate": 0.0004922493784524914, "loss": 5.56, "mean_token_accuracy": 0.1584095723927021, "num_tokens": 20103037.0, "step": 10890 }, { "entropy": 5.741644382476807, "epoch": 0.9153539172442764, "grad_norm": 1.625, "learning_rate": 0.0004922415842196052, "loss": 5.7116, "mean_token_accuracy": 0.14545977264642715, "num_tokens": 20112727.0, "step": 10895 }, { "entropy": 5.699281311035156, "epoch": 0.9157739970594413, "grad_norm": 2.15625, "learning_rate": 0.0004922337861383963, "loss": 5.522, "mean_token_accuracy": 0.1605138972401619, "num_tokens": 20122341.0, "step": 10900 }, { "entropy": 5.7933906555175785, "epoch": 0.9161940768746062, "grad_norm": 1.765625, "learning_rate": 0.0004922259842090027, "loss": 5.5088, "mean_token_accuracy": 0.15630880296230315, "num_tokens": 20131354.0, "step": 10905 }, { "entropy": 5.752344179153442, "epoch": 0.9166141566897711, "grad_norm": 1.953125, "learning_rate": 0.0004922181784315627, "loss": 5.5565, "mean_token_accuracy": 0.1608913227915764, "num_tokens": 20140440.0, "step": 10910 }, { "entropy": 5.673103618621826, "epoch": 0.917034236504936, "grad_norm": 1.6015625, "learning_rate": 0.0004922103688062145, "loss": 5.556, "mean_token_accuracy": 0.1585061579942703, "num_tokens": 20149331.0, "step": 10915 }, { "entropy": 5.721803379058838, "epoch": 0.9174543163201008, "grad_norm": 1.6015625, "learning_rate": 0.0004922025553330964, "loss": 5.5308, "mean_token_accuracy": 0.16434049159288405, "num_tokens": 20158566.0, "step": 10920 }, { "entropy": 5.820386266708374, "epoch": 0.9178743961352657, "grad_norm": 1.59375, "learning_rate": 0.000492194738012347, "loss": 5.6422, "mean_token_accuracy": 0.15888736993074418, "num_tokens": 20168339.0, "step": 10925 }, { "entropy": 5.8344615459442135, "epoch": 0.9182944759504306, "grad_norm": 2.0625, "learning_rate": 0.0004921869168441045, "loss": 5.6482, "mean_token_accuracy": 0.15219517126679422, "num_tokens": 20177967.0, "step": 10930 }, { "entropy": 5.748171138763428, "epoch": 0.9187145557655955, "grad_norm": 1.875, "learning_rate": 0.0004921790918285077, "loss": 5.6339, "mean_token_accuracy": 0.1555405542254448, "num_tokens": 20187279.0, "step": 10935 }, { "entropy": 5.775131797790527, "epoch": 0.9191346355807604, "grad_norm": 1.6015625, "learning_rate": 0.0004921712629656951, "loss": 5.7308, "mean_token_accuracy": 0.16442956626415253, "num_tokens": 20195324.0, "step": 10940 }, { "entropy": 5.864813995361328, "epoch": 0.9195547153959253, "grad_norm": 1.6640625, "learning_rate": 0.0004921634302558054, "loss": 5.6618, "mean_token_accuracy": 0.1532442182302475, "num_tokens": 20204985.0, "step": 10945 }, { "entropy": 5.7605233669281, "epoch": 0.9199747952110902, "grad_norm": 1.59375, "learning_rate": 0.0004921555936989773, "loss": 5.6693, "mean_token_accuracy": 0.15000807642936706, "num_tokens": 20214553.0, "step": 10950 }, { "entropy": 5.815750789642334, "epoch": 0.9203948750262549, "grad_norm": 1.6953125, "learning_rate": 0.0004921477532953497, "loss": 5.5867, "mean_token_accuracy": 0.15734840780496598, "num_tokens": 20224118.0, "step": 10955 }, { "entropy": 5.770743799209595, "epoch": 0.9208149548414198, "grad_norm": 1.3984375, "learning_rate": 0.0004921399090450616, "loss": 5.5348, "mean_token_accuracy": 0.15028709322214126, "num_tokens": 20233719.0, "step": 10960 }, { "entropy": 5.767902183532715, "epoch": 0.9212350346565847, "grad_norm": 1.5625, "learning_rate": 0.0004921320609482517, "loss": 5.6305, "mean_token_accuracy": 0.15462984144687653, "num_tokens": 20242311.0, "step": 10965 }, { "entropy": 5.807157325744629, "epoch": 0.9216551144717496, "grad_norm": 1.34375, "learning_rate": 0.0004921242090050591, "loss": 5.6595, "mean_token_accuracy": 0.14994974732398986, "num_tokens": 20252998.0, "step": 10970 }, { "entropy": 5.818349123001099, "epoch": 0.9220751942869145, "grad_norm": 1.890625, "learning_rate": 0.000492116353215623, "loss": 5.7205, "mean_token_accuracy": 0.15557870492339135, "num_tokens": 20262456.0, "step": 10975 }, { "entropy": 5.695267009735107, "epoch": 0.9224952741020794, "grad_norm": 1.640625, "learning_rate": 0.0004921084935800825, "loss": 5.4788, "mean_token_accuracy": 0.16470759660005568, "num_tokens": 20271516.0, "step": 10980 }, { "entropy": 5.735180997848511, "epoch": 0.9229153539172443, "grad_norm": 1.4375, "learning_rate": 0.0004921006300985768, "loss": 5.5278, "mean_token_accuracy": 0.1622763454914093, "num_tokens": 20280373.0, "step": 10985 }, { "entropy": 5.715144777297974, "epoch": 0.9233354337324091, "grad_norm": 1.3359375, "learning_rate": 0.0004920927627712453, "loss": 5.5267, "mean_token_accuracy": 0.1575745850801468, "num_tokens": 20289426.0, "step": 10990 }, { "entropy": 5.809565830230713, "epoch": 0.923755513547574, "grad_norm": 1.546875, "learning_rate": 0.0004920848915982273, "loss": 5.6718, "mean_token_accuracy": 0.15313809663057326, "num_tokens": 20298045.0, "step": 10995 }, { "entropy": 5.710923767089843, "epoch": 0.9241755933627389, "grad_norm": 1.359375, "learning_rate": 0.0004920770165796622, "loss": 5.5569, "mean_token_accuracy": 0.1600003331899643, "num_tokens": 20307352.0, "step": 11000 }, { "entropy": 5.757216310501098, "epoch": 0.9245956731779038, "grad_norm": 1.625, "learning_rate": 0.0004920691377156895, "loss": 5.5865, "mean_token_accuracy": 0.15644698292016984, "num_tokens": 20316448.0, "step": 11005 }, { "entropy": 5.867019748687744, "epoch": 0.9250157529930687, "grad_norm": 1.5703125, "learning_rate": 0.0004920612550064488, "loss": 5.7449, "mean_token_accuracy": 0.1475832186639309, "num_tokens": 20326440.0, "step": 11010 }, { "entropy": 5.769907808303833, "epoch": 0.9254358328082336, "grad_norm": 1.359375, "learning_rate": 0.0004920533684520797, "loss": 5.5086, "mean_token_accuracy": 0.15823858827352524, "num_tokens": 20335447.0, "step": 11015 }, { "entropy": 5.750536823272705, "epoch": 0.9258559126233984, "grad_norm": 1.6796875, "learning_rate": 0.000492045478052722, "loss": 5.6596, "mean_token_accuracy": 0.15351206958293914, "num_tokens": 20344523.0, "step": 11020 }, { "entropy": 5.741793203353882, "epoch": 0.9262759924385633, "grad_norm": 1.921875, "learning_rate": 0.0004920375838085154, "loss": 5.6171, "mean_token_accuracy": 0.1559000790119171, "num_tokens": 20354267.0, "step": 11025 }, { "entropy": 5.798118543624878, "epoch": 0.9266960722537282, "grad_norm": 1.4140625, "learning_rate": 0.0004920296857195998, "loss": 5.6771, "mean_token_accuracy": 0.15482696294784545, "num_tokens": 20364137.0, "step": 11030 }, { "entropy": 5.799237871170044, "epoch": 0.9271161520688931, "grad_norm": 1.53125, "learning_rate": 0.000492021783786115, "loss": 5.5804, "mean_token_accuracy": 0.16075632423162461, "num_tokens": 20372583.0, "step": 11035 }, { "entropy": 5.6686241149902346, "epoch": 0.927536231884058, "grad_norm": 1.7734375, "learning_rate": 0.0004920138780082011, "loss": 5.5397, "mean_token_accuracy": 0.15648741349577905, "num_tokens": 20382050.0, "step": 11040 }, { "entropy": 5.725726461410522, "epoch": 0.9279563116992229, "grad_norm": 1.875, "learning_rate": 0.0004920059683859981, "loss": 5.4955, "mean_token_accuracy": 0.1606592372059822, "num_tokens": 20391425.0, "step": 11045 }, { "entropy": 5.798936271667481, "epoch": 0.9283763915143878, "grad_norm": 1.921875, "learning_rate": 0.0004919980549196461, "loss": 5.6647, "mean_token_accuracy": 0.15349570661783218, "num_tokens": 20400559.0, "step": 11050 }, { "entropy": 5.767499828338623, "epoch": 0.9287964713295526, "grad_norm": 1.875, "learning_rate": 0.0004919901376092853, "loss": 5.5783, "mean_token_accuracy": 0.16081294864416124, "num_tokens": 20408985.0, "step": 11055 }, { "entropy": 5.7440389633178714, "epoch": 0.9292165511447175, "grad_norm": 1.453125, "learning_rate": 0.0004919822164550559, "loss": 5.6773, "mean_token_accuracy": 0.14321673214435576, "num_tokens": 20417855.0, "step": 11060 }, { "entropy": 5.744246864318848, "epoch": 0.9296366309598824, "grad_norm": 1.484375, "learning_rate": 0.0004919742914570983, "loss": 5.6304, "mean_token_accuracy": 0.1557525396347046, "num_tokens": 20426191.0, "step": 11065 }, { "entropy": 5.765243244171143, "epoch": 0.9300567107750473, "grad_norm": 1.53125, "learning_rate": 0.000491966362615553, "loss": 5.6006, "mean_token_accuracy": 0.15035101026296616, "num_tokens": 20435592.0, "step": 11070 }, { "entropy": 5.85240740776062, "epoch": 0.9304767905902122, "grad_norm": 1.578125, "learning_rate": 0.00049195842993056, "loss": 5.634, "mean_token_accuracy": 0.15329790860414505, "num_tokens": 20445504.0, "step": 11075 }, { "entropy": 5.803719425201416, "epoch": 0.930896870405377, "grad_norm": 1.6015625, "learning_rate": 0.0004919504934022604, "loss": 5.578, "mean_token_accuracy": 0.15457095801830292, "num_tokens": 20455153.0, "step": 11080 }, { "entropy": 5.7237049579620365, "epoch": 0.931316950220542, "grad_norm": 1.5546875, "learning_rate": 0.0004919425530307943, "loss": 5.5681, "mean_token_accuracy": 0.15656672269105912, "num_tokens": 20465101.0, "step": 11085 }, { "entropy": 5.742412662506103, "epoch": 0.9317370300357067, "grad_norm": 2.015625, "learning_rate": 0.0004919346088163028, "loss": 5.615, "mean_token_accuracy": 0.1582319989800453, "num_tokens": 20474700.0, "step": 11090 }, { "entropy": 5.835652637481689, "epoch": 0.9321571098508716, "grad_norm": 1.546875, "learning_rate": 0.0004919266607589263, "loss": 5.6564, "mean_token_accuracy": 0.15037914365530014, "num_tokens": 20483945.0, "step": 11095 }, { "entropy": 5.8025538444519045, "epoch": 0.9325771896660365, "grad_norm": 1.828125, "learning_rate": 0.0004919187088588057, "loss": 5.6307, "mean_token_accuracy": 0.15815725028514863, "num_tokens": 20493307.0, "step": 11100 }, { "entropy": 5.722408819198608, "epoch": 0.9329972694812014, "grad_norm": 1.578125, "learning_rate": 0.0004919107531160819, "loss": 5.5552, "mean_token_accuracy": 0.1643086478114128, "num_tokens": 20501889.0, "step": 11105 }, { "entropy": 5.729394769668579, "epoch": 0.9334173492963663, "grad_norm": 1.8125, "learning_rate": 0.0004919027935308957, "loss": 5.5785, "mean_token_accuracy": 0.15731487423181534, "num_tokens": 20510577.0, "step": 11110 }, { "entropy": 5.651753997802734, "epoch": 0.9338374291115312, "grad_norm": 1.75, "learning_rate": 0.0004918948301033884, "loss": 5.5583, "mean_token_accuracy": 0.15677412003278732, "num_tokens": 20520025.0, "step": 11115 }, { "entropy": 5.799631404876709, "epoch": 0.9342575089266961, "grad_norm": 1.390625, "learning_rate": 0.0004918868628337007, "loss": 5.6042, "mean_token_accuracy": 0.15233962684869767, "num_tokens": 20528989.0, "step": 11120 }, { "entropy": 5.779157257080078, "epoch": 0.9346775887418609, "grad_norm": 1.6875, "learning_rate": 0.0004918788917219739, "loss": 5.5609, "mean_token_accuracy": 0.15591868460178376, "num_tokens": 20538328.0, "step": 11125 }, { "entropy": 5.714973402023316, "epoch": 0.9350976685570258, "grad_norm": 1.46875, "learning_rate": 0.0004918709167683493, "loss": 5.686, "mean_token_accuracy": 0.15123260617256165, "num_tokens": 20548069.0, "step": 11130 }, { "entropy": 5.690325927734375, "epoch": 0.9355177483721907, "grad_norm": 1.7109375, "learning_rate": 0.0004918629379729681, "loss": 5.4379, "mean_token_accuracy": 0.16827901899814607, "num_tokens": 20557128.0, "step": 11135 }, { "entropy": 5.725959730148316, "epoch": 0.9359378281873556, "grad_norm": 1.5703125, "learning_rate": 0.0004918549553359715, "loss": 5.5616, "mean_token_accuracy": 0.15457266718149185, "num_tokens": 20566352.0, "step": 11140 }, { "entropy": 5.780063915252685, "epoch": 0.9363579080025205, "grad_norm": 2.0, "learning_rate": 0.0004918469688575012, "loss": 5.6077, "mean_token_accuracy": 0.1547131732106209, "num_tokens": 20575814.0, "step": 11145 }, { "entropy": 5.752800464630127, "epoch": 0.9367779878176854, "grad_norm": 1.5390625, "learning_rate": 0.0004918389785376983, "loss": 5.4704, "mean_token_accuracy": 0.16297883689403533, "num_tokens": 20584715.0, "step": 11150 }, { "entropy": 5.691038417816162, "epoch": 0.9371980676328503, "grad_norm": 1.484375, "learning_rate": 0.0004918309843767047, "loss": 5.563, "mean_token_accuracy": 0.15457476824522018, "num_tokens": 20594630.0, "step": 11155 }, { "entropy": 5.705981302261352, "epoch": 0.9376181474480151, "grad_norm": 1.5234375, "learning_rate": 0.0004918229863746618, "loss": 5.5344, "mean_token_accuracy": 0.15329102724790572, "num_tokens": 20603653.0, "step": 11160 }, { "entropy": 5.809178400039673, "epoch": 0.93803822726318, "grad_norm": 1.7890625, "learning_rate": 0.0004918149845317114, "loss": 5.6041, "mean_token_accuracy": 0.15675780922174454, "num_tokens": 20612188.0, "step": 11165 }, { "entropy": 5.743681907653809, "epoch": 0.9384583070783449, "grad_norm": 1.390625, "learning_rate": 0.0004918069788479952, "loss": 5.5291, "mean_token_accuracy": 0.16411179453134536, "num_tokens": 20620933.0, "step": 11170 }, { "entropy": 5.689119815826416, "epoch": 0.9388783868935098, "grad_norm": 1.625, "learning_rate": 0.0004917989693236549, "loss": 5.5733, "mean_token_accuracy": 0.1595962554216385, "num_tokens": 20629919.0, "step": 11175 }, { "entropy": 5.739494895935058, "epoch": 0.9392984667086747, "grad_norm": 1.4921875, "learning_rate": 0.0004917909559588326, "loss": 5.5402, "mean_token_accuracy": 0.1560191825032234, "num_tokens": 20638475.0, "step": 11180 }, { "entropy": 5.911345434188843, "epoch": 0.9397185465238396, "grad_norm": 1.4921875, "learning_rate": 0.00049178293875367, "loss": 5.6769, "mean_token_accuracy": 0.1469906136393547, "num_tokens": 20648105.0, "step": 11185 }, { "entropy": 5.764797687530518, "epoch": 0.9401386263390044, "grad_norm": 1.7421875, "learning_rate": 0.0004917749177083094, "loss": 5.5703, "mean_token_accuracy": 0.1515391141176224, "num_tokens": 20657527.0, "step": 11190 }, { "entropy": 5.723624420166016, "epoch": 0.9405587061541693, "grad_norm": 1.7421875, "learning_rate": 0.0004917668928228927, "loss": 5.5763, "mean_token_accuracy": 0.1612919121980667, "num_tokens": 20666375.0, "step": 11195 }, { "entropy": 5.723942565917969, "epoch": 0.9409787859693342, "grad_norm": 1.4375, "learning_rate": 0.0004917588640975622, "loss": 5.5232, "mean_token_accuracy": 0.1613648310303688, "num_tokens": 20675350.0, "step": 11200 }, { "entropy": 5.659457588195801, "epoch": 0.941398865784499, "grad_norm": 2.03125, "learning_rate": 0.00049175083153246, "loss": 5.4574, "mean_token_accuracy": 0.15883690416812896, "num_tokens": 20684072.0, "step": 11205 }, { "entropy": 5.678450441360473, "epoch": 0.941818945599664, "grad_norm": 1.734375, "learning_rate": 0.0004917427951277284, "loss": 5.5619, "mean_token_accuracy": 0.16161370575428008, "num_tokens": 20692989.0, "step": 11210 }, { "entropy": 5.763214254379273, "epoch": 0.9422390254148288, "grad_norm": 1.578125, "learning_rate": 0.0004917347548835097, "loss": 5.5035, "mean_token_accuracy": 0.16200087666511537, "num_tokens": 20701269.0, "step": 11215 }, { "entropy": 5.759133005142212, "epoch": 0.9426591052299937, "grad_norm": 1.4375, "learning_rate": 0.0004917267107999466, "loss": 5.6106, "mean_token_accuracy": 0.15289842039346696, "num_tokens": 20709739.0, "step": 11220 }, { "entropy": 5.739570665359497, "epoch": 0.9430791850451585, "grad_norm": 1.515625, "learning_rate": 0.0004917186628771812, "loss": 5.5576, "mean_token_accuracy": 0.16139040291309356, "num_tokens": 20718950.0, "step": 11225 }, { "entropy": 5.755300760269165, "epoch": 0.9434992648603234, "grad_norm": 1.5078125, "learning_rate": 0.0004917106111153565, "loss": 5.5673, "mean_token_accuracy": 0.1547436758875847, "num_tokens": 20729469.0, "step": 11230 }, { "entropy": 5.775959253311157, "epoch": 0.9439193446754883, "grad_norm": 1.9296875, "learning_rate": 0.0004917025555146148, "loss": 5.5744, "mean_token_accuracy": 0.1662562906742096, "num_tokens": 20738231.0, "step": 11235 }, { "entropy": 5.756017684936523, "epoch": 0.9443394244906532, "grad_norm": 1.3203125, "learning_rate": 0.000491694496075099, "loss": 5.7704, "mean_token_accuracy": 0.14358580783009528, "num_tokens": 20748578.0, "step": 11240 }, { "entropy": 5.837352752685547, "epoch": 0.9447595043058181, "grad_norm": 1.5703125, "learning_rate": 0.0004916864327969517, "loss": 5.7026, "mean_token_accuracy": 0.14462515115737914, "num_tokens": 20759284.0, "step": 11245 }, { "entropy": 5.8536529541015625, "epoch": 0.945179584120983, "grad_norm": 1.9140625, "learning_rate": 0.0004916783656803158, "loss": 5.6316, "mean_token_accuracy": 0.15945006310939788, "num_tokens": 20768186.0, "step": 11250 }, { "entropy": 5.695327425003052, "epoch": 0.9455996639361479, "grad_norm": 1.6484375, "learning_rate": 0.0004916702947253342, "loss": 5.5009, "mean_token_accuracy": 0.16529642790555954, "num_tokens": 20776711.0, "step": 11255 }, { "entropy": 5.7685582637786865, "epoch": 0.9460197437513127, "grad_norm": 2.234375, "learning_rate": 0.0004916622199321501, "loss": 5.5766, "mean_token_accuracy": 0.15894216895103455, "num_tokens": 20785154.0, "step": 11260 }, { "entropy": 5.80894103050232, "epoch": 0.9464398235664776, "grad_norm": 1.578125, "learning_rate": 0.0004916541413009062, "loss": 5.5194, "mean_token_accuracy": 0.16128009110689162, "num_tokens": 20794114.0, "step": 11265 }, { "entropy": 5.7742784976959225, "epoch": 0.9468599033816425, "grad_norm": 1.953125, "learning_rate": 0.0004916460588317458, "loss": 5.6258, "mean_token_accuracy": 0.14817884638905526, "num_tokens": 20803892.0, "step": 11270 }, { "entropy": 5.641349744796753, "epoch": 0.9472799831968074, "grad_norm": 1.609375, "learning_rate": 0.0004916379725248118, "loss": 5.511, "mean_token_accuracy": 0.16303833425045014, "num_tokens": 20812892.0, "step": 11275 }, { "entropy": 5.743069410324097, "epoch": 0.9477000630119723, "grad_norm": 1.5546875, "learning_rate": 0.0004916298823802479, "loss": 5.5676, "mean_token_accuracy": 0.1500309720635414, "num_tokens": 20821934.0, "step": 11280 }, { "entropy": 5.725360774993897, "epoch": 0.9481201428271372, "grad_norm": 1.3984375, "learning_rate": 0.0004916217883981971, "loss": 5.4977, "mean_token_accuracy": 0.15707524865865707, "num_tokens": 20830100.0, "step": 11285 }, { "entropy": 5.692885828018189, "epoch": 0.9485402226423021, "grad_norm": 1.703125, "learning_rate": 0.0004916136905788029, "loss": 5.5708, "mean_token_accuracy": 0.15760752707719802, "num_tokens": 20839890.0, "step": 11290 }, { "entropy": 5.779399299621582, "epoch": 0.9489603024574669, "grad_norm": 1.78125, "learning_rate": 0.0004916055889222087, "loss": 5.6962, "mean_token_accuracy": 0.14309152886271476, "num_tokens": 20848670.0, "step": 11295 }, { "entropy": 5.753837442398071, "epoch": 0.9493803822726318, "grad_norm": 1.796875, "learning_rate": 0.000491597483428558, "loss": 5.5372, "mean_token_accuracy": 0.16605689823627473, "num_tokens": 20857291.0, "step": 11300 }, { "entropy": 5.653014183044434, "epoch": 0.9498004620877967, "grad_norm": 1.9140625, "learning_rate": 0.0004915893740979944, "loss": 5.4998, "mean_token_accuracy": 0.16381447315216063, "num_tokens": 20865341.0, "step": 11305 }, { "entropy": 5.805274391174317, "epoch": 0.9502205419029616, "grad_norm": 1.3671875, "learning_rate": 0.0004915812609306617, "loss": 5.6431, "mean_token_accuracy": 0.15660493373870848, "num_tokens": 20875194.0, "step": 11310 }, { "entropy": 5.8244353294372555, "epoch": 0.9506406217181265, "grad_norm": 1.4609375, "learning_rate": 0.0004915731439267034, "loss": 5.5483, "mean_token_accuracy": 0.1535589724779129, "num_tokens": 20884831.0, "step": 11315 }, { "entropy": 5.66036376953125, "epoch": 0.9510607015332914, "grad_norm": 1.3984375, "learning_rate": 0.0004915650230862634, "loss": 5.431, "mean_token_accuracy": 0.16502011716365814, "num_tokens": 20893790.0, "step": 11320 }, { "entropy": 5.638264322280884, "epoch": 0.9514807813484563, "grad_norm": 1.609375, "learning_rate": 0.0004915568984094854, "loss": 5.5594, "mean_token_accuracy": 0.15512819588184357, "num_tokens": 20902175.0, "step": 11325 }, { "entropy": 5.824262189865112, "epoch": 0.951900861163621, "grad_norm": 1.453125, "learning_rate": 0.0004915487698965136, "loss": 5.694, "mean_token_accuracy": 0.14529131203889847, "num_tokens": 20911484.0, "step": 11330 }, { "entropy": 5.88162055015564, "epoch": 0.952320940978786, "grad_norm": 1.6484375, "learning_rate": 0.0004915406375474917, "loss": 5.6445, "mean_token_accuracy": 0.14643194004893303, "num_tokens": 20920916.0, "step": 11335 }, { "entropy": 5.807900476455688, "epoch": 0.9527410207939508, "grad_norm": 1.4453125, "learning_rate": 0.000491532501362564, "loss": 5.6522, "mean_token_accuracy": 0.15773532688617706, "num_tokens": 20930219.0, "step": 11340 }, { "entropy": 5.679394388198853, "epoch": 0.9531611006091157, "grad_norm": 1.453125, "learning_rate": 0.0004915243613418745, "loss": 5.482, "mean_token_accuracy": 0.16191438734531402, "num_tokens": 20939591.0, "step": 11345 }, { "entropy": 5.767440366744995, "epoch": 0.9535811804242806, "grad_norm": 1.78125, "learning_rate": 0.0004915162174855675, "loss": 5.6543, "mean_token_accuracy": 0.15383701771497726, "num_tokens": 20950035.0, "step": 11350 }, { "entropy": 5.7626283168792725, "epoch": 0.9540012602394455, "grad_norm": 1.8125, "learning_rate": 0.0004915080697937872, "loss": 5.5616, "mean_token_accuracy": 0.15663446485996246, "num_tokens": 20959168.0, "step": 11355 }, { "entropy": 5.7188849449157715, "epoch": 0.9544213400546103, "grad_norm": 1.53125, "learning_rate": 0.0004914999182666779, "loss": 5.4866, "mean_token_accuracy": 0.1626068413257599, "num_tokens": 20967887.0, "step": 11360 }, { "entropy": 5.763808012008667, "epoch": 0.9548414198697752, "grad_norm": 1.9140625, "learning_rate": 0.0004914917629043839, "loss": 5.5862, "mean_token_accuracy": 0.15319453924894333, "num_tokens": 20977558.0, "step": 11365 }, { "entropy": 5.650288105010986, "epoch": 0.9552614996849401, "grad_norm": 1.5859375, "learning_rate": 0.00049148360370705, "loss": 5.5436, "mean_token_accuracy": 0.16097380816936493, "num_tokens": 20986118.0, "step": 11370 }, { "entropy": 5.754954528808594, "epoch": 0.955681579500105, "grad_norm": 1.6796875, "learning_rate": 0.0004914754406748204, "loss": 5.4839, "mean_token_accuracy": 0.16297108978033065, "num_tokens": 20994623.0, "step": 11375 }, { "entropy": 5.77275652885437, "epoch": 0.9561016593152699, "grad_norm": 1.46875, "learning_rate": 0.00049146727380784, "loss": 5.6615, "mean_token_accuracy": 0.15102900862693786, "num_tokens": 21004193.0, "step": 11380 }, { "entropy": 5.690639591217041, "epoch": 0.9565217391304348, "grad_norm": 1.7734375, "learning_rate": 0.0004914591031062531, "loss": 5.4908, "mean_token_accuracy": 0.16743318736553192, "num_tokens": 21013125.0, "step": 11385 }, { "entropy": 5.655840444564819, "epoch": 0.9569418189455997, "grad_norm": 1.46875, "learning_rate": 0.0004914509285702048, "loss": 5.4135, "mean_token_accuracy": 0.16690310835838318, "num_tokens": 21021402.0, "step": 11390 }, { "entropy": 5.710651922225952, "epoch": 0.9573618987607645, "grad_norm": 1.390625, "learning_rate": 0.0004914427501998397, "loss": 5.5028, "mean_token_accuracy": 0.15886269211769105, "num_tokens": 21029639.0, "step": 11395 }, { "entropy": 5.7445274829864506, "epoch": 0.9577819785759294, "grad_norm": 1.796875, "learning_rate": 0.0004914345679953027, "loss": 5.5347, "mean_token_accuracy": 0.16092265099287034, "num_tokens": 21037525.0, "step": 11400 }, { "entropy": 5.767073345184326, "epoch": 0.9582020583910943, "grad_norm": 1.75, "learning_rate": 0.0004914263819567388, "loss": 5.6295, "mean_token_accuracy": 0.1497710943222046, "num_tokens": 21047702.0, "step": 11405 }, { "entropy": 5.790897989273072, "epoch": 0.9586221382062592, "grad_norm": 1.53125, "learning_rate": 0.000491418192084293, "loss": 5.5474, "mean_token_accuracy": 0.16184664964675904, "num_tokens": 21056379.0, "step": 11410 }, { "entropy": 5.7481053352355955, "epoch": 0.9590422180214241, "grad_norm": 1.6171875, "learning_rate": 0.0004914099983781104, "loss": 5.5489, "mean_token_accuracy": 0.16056970357894898, "num_tokens": 21065283.0, "step": 11415 }, { "entropy": 5.769461059570313, "epoch": 0.959462297836589, "grad_norm": 1.8125, "learning_rate": 0.000491401800838336, "loss": 5.6633, "mean_token_accuracy": 0.15242091715335845, "num_tokens": 21074938.0, "step": 11420 }, { "entropy": 5.7138519287109375, "epoch": 0.9598823776517539, "grad_norm": 1.484375, "learning_rate": 0.0004913935994651153, "loss": 5.514, "mean_token_accuracy": 0.16224966198205948, "num_tokens": 21084729.0, "step": 11425 }, { "entropy": 5.642538785934448, "epoch": 0.9603024574669187, "grad_norm": 1.796875, "learning_rate": 0.0004913853942585932, "loss": 5.4117, "mean_token_accuracy": 0.16488994657993317, "num_tokens": 21093456.0, "step": 11430 }, { "entropy": 5.686456680297852, "epoch": 0.9607225372820836, "grad_norm": 1.6796875, "learning_rate": 0.0004913771852189155, "loss": 5.5451, "mean_token_accuracy": 0.15687460005283355, "num_tokens": 21102980.0, "step": 11435 }, { "entropy": 5.848186016082764, "epoch": 0.9611426170972485, "grad_norm": 1.3515625, "learning_rate": 0.0004913689723462271, "loss": 5.7858, "mean_token_accuracy": 0.16201310455799103, "num_tokens": 21112777.0, "step": 11440 }, { "entropy": 5.803880500793457, "epoch": 0.9615626969124134, "grad_norm": 1.625, "learning_rate": 0.000491360755640674, "loss": 5.6636, "mean_token_accuracy": 0.15397086888551711, "num_tokens": 21122139.0, "step": 11445 }, { "entropy": 5.7530255794525145, "epoch": 0.9619827767275783, "grad_norm": 1.4609375, "learning_rate": 0.0004913525351024014, "loss": 5.5361, "mean_token_accuracy": 0.15754189491271972, "num_tokens": 21131425.0, "step": 11450 }, { "entropy": 5.708436107635498, "epoch": 0.9624028565427432, "grad_norm": 1.4140625, "learning_rate": 0.0004913443107315552, "loss": 5.5081, "mean_token_accuracy": 0.15728521049022676, "num_tokens": 21140784.0, "step": 11455 }, { "entropy": 5.700329685211182, "epoch": 0.962822936357908, "grad_norm": 1.515625, "learning_rate": 0.0004913360825282807, "loss": 5.5271, "mean_token_accuracy": 0.16611011624336242, "num_tokens": 21150408.0, "step": 11460 }, { "entropy": 5.716848659515381, "epoch": 0.9632430161730728, "grad_norm": 1.6640625, "learning_rate": 0.000491327850492724, "loss": 5.6151, "mean_token_accuracy": 0.1602442279458046, "num_tokens": 21158915.0, "step": 11465 }, { "entropy": 5.630804204940796, "epoch": 0.9636630959882377, "grad_norm": 1.421875, "learning_rate": 0.0004913196146250309, "loss": 5.4172, "mean_token_accuracy": 0.16883472204208375, "num_tokens": 21167336.0, "step": 11470 }, { "entropy": 5.830715799331665, "epoch": 0.9640831758034026, "grad_norm": 1.6328125, "learning_rate": 0.0004913113749253472, "loss": 5.7333, "mean_token_accuracy": 0.15114703625440598, "num_tokens": 21177499.0, "step": 11475 }, { "entropy": 5.8909022331237795, "epoch": 0.9645032556185675, "grad_norm": 1.8984375, "learning_rate": 0.0004913031313938188, "loss": 5.6341, "mean_token_accuracy": 0.15465849339962007, "num_tokens": 21186961.0, "step": 11480 }, { "entropy": 5.749186849594116, "epoch": 0.9649233354337324, "grad_norm": 1.359375, "learning_rate": 0.0004912948840305919, "loss": 5.5207, "mean_token_accuracy": 0.1647267997264862, "num_tokens": 21196364.0, "step": 11485 }, { "entropy": 5.710730838775635, "epoch": 0.9653434152488973, "grad_norm": 1.671875, "learning_rate": 0.0004912866328358125, "loss": 5.5995, "mean_token_accuracy": 0.15677765160799026, "num_tokens": 21206376.0, "step": 11490 }, { "entropy": 5.779667091369629, "epoch": 0.9657634950640621, "grad_norm": 1.53125, "learning_rate": 0.0004912783778096266, "loss": 5.5689, "mean_token_accuracy": 0.16532181650400163, "num_tokens": 21215889.0, "step": 11495 }, { "entropy": 5.756943035125732, "epoch": 0.966183574879227, "grad_norm": 1.4609375, "learning_rate": 0.0004912701189521808, "loss": 5.5847, "mean_token_accuracy": 0.16167923510074617, "num_tokens": 21224959.0, "step": 11500 }, { "entropy": 5.824455404281617, "epoch": 0.9666036546943919, "grad_norm": 1.5859375, "learning_rate": 0.0004912618562636211, "loss": 5.6996, "mean_token_accuracy": 0.1518427163362503, "num_tokens": 21234495.0, "step": 11505 }, { "entropy": 5.704730606079101, "epoch": 0.9670237345095568, "grad_norm": 1.3984375, "learning_rate": 0.000491253589744094, "loss": 5.5344, "mean_token_accuracy": 0.1582339495420456, "num_tokens": 21244555.0, "step": 11510 }, { "entropy": 5.786595106124878, "epoch": 0.9674438143247217, "grad_norm": 1.9453125, "learning_rate": 0.0004912453193937459, "loss": 5.6929, "mean_token_accuracy": 0.1545358881354332, "num_tokens": 21254199.0, "step": 11515 }, { "entropy": 5.79626088142395, "epoch": 0.9678638941398866, "grad_norm": 1.3984375, "learning_rate": 0.0004912370452127234, "loss": 5.5811, "mean_token_accuracy": 0.15614356994628906, "num_tokens": 21262723.0, "step": 11520 }, { "entropy": 5.749001598358154, "epoch": 0.9682839739550515, "grad_norm": 1.6953125, "learning_rate": 0.0004912287672011728, "loss": 5.498, "mean_token_accuracy": 0.16639503091573715, "num_tokens": 21271283.0, "step": 11525 }, { "entropy": 5.694228219985962, "epoch": 0.9687040537702163, "grad_norm": 1.515625, "learning_rate": 0.0004912204853592411, "loss": 5.5549, "mean_token_accuracy": 0.1661546677350998, "num_tokens": 21279542.0, "step": 11530 }, { "entropy": 5.738241815567017, "epoch": 0.9691241335853812, "grad_norm": 1.4375, "learning_rate": 0.0004912121996870748, "loss": 5.5345, "mean_token_accuracy": 0.16057475954294204, "num_tokens": 21288678.0, "step": 11535 }, { "entropy": 5.781418895721435, "epoch": 0.9695442134005461, "grad_norm": 1.25, "learning_rate": 0.0004912039101848207, "loss": 5.6681, "mean_token_accuracy": 0.1558816574513912, "num_tokens": 21298982.0, "step": 11540 }, { "entropy": 5.759183168411255, "epoch": 0.969964293215711, "grad_norm": 2.015625, "learning_rate": 0.0004911956168526257, "loss": 5.609, "mean_token_accuracy": 0.1565386489033699, "num_tokens": 21307663.0, "step": 11545 }, { "entropy": 5.845695209503174, "epoch": 0.9703843730308759, "grad_norm": 1.5390625, "learning_rate": 0.0004911873196906366, "loss": 5.6214, "mean_token_accuracy": 0.1554511606693268, "num_tokens": 21318004.0, "step": 11550 }, { "entropy": 5.676923847198486, "epoch": 0.9708044528460408, "grad_norm": 1.40625, "learning_rate": 0.0004911790186990005, "loss": 5.4377, "mean_token_accuracy": 0.16938215047121047, "num_tokens": 21327373.0, "step": 11555 }, { "entropy": 5.664393568038941, "epoch": 0.9712245326612057, "grad_norm": 1.390625, "learning_rate": 0.0004911707138778643, "loss": 5.5261, "mean_token_accuracy": 0.15850266367197036, "num_tokens": 21335654.0, "step": 11560 }, { "entropy": 5.805261135101318, "epoch": 0.9716446124763705, "grad_norm": 1.3046875, "learning_rate": 0.0004911624052273754, "loss": 5.5917, "mean_token_accuracy": 0.15714938044548035, "num_tokens": 21344464.0, "step": 11565 }, { "entropy": 5.811971664428711, "epoch": 0.9720646922915354, "grad_norm": 1.7421875, "learning_rate": 0.0004911540927476807, "loss": 5.6846, "mean_token_accuracy": 0.15539554506540298, "num_tokens": 21354121.0, "step": 11570 }, { "entropy": 5.761470699310303, "epoch": 0.9724847721067003, "grad_norm": 1.5234375, "learning_rate": 0.0004911457764389275, "loss": 5.6129, "mean_token_accuracy": 0.16058044135570526, "num_tokens": 21363395.0, "step": 11575 }, { "entropy": 5.740648984909058, "epoch": 0.9729048519218652, "grad_norm": 1.484375, "learning_rate": 0.0004911374563012633, "loss": 5.5736, "mean_token_accuracy": 0.15647933781147003, "num_tokens": 21372126.0, "step": 11580 }, { "entropy": 5.771029758453369, "epoch": 0.97332493173703, "grad_norm": 1.515625, "learning_rate": 0.0004911291323348352, "loss": 5.6557, "mean_token_accuracy": 0.14915687441825867, "num_tokens": 21380554.0, "step": 11585 }, { "entropy": 5.70338454246521, "epoch": 0.973745011552195, "grad_norm": 1.6171875, "learning_rate": 0.0004911208045397909, "loss": 5.5306, "mean_token_accuracy": 0.15759393125772475, "num_tokens": 21389317.0, "step": 11590 }, { "entropy": 5.784313058853149, "epoch": 0.9741650913673598, "grad_norm": 1.71875, "learning_rate": 0.0004911124729162778, "loss": 5.66, "mean_token_accuracy": 0.1539946123957634, "num_tokens": 21398926.0, "step": 11595 }, { "entropy": 5.741526746749878, "epoch": 0.9745851711825246, "grad_norm": 1.8359375, "learning_rate": 0.0004911041374644435, "loss": 5.4636, "mean_token_accuracy": 0.16005127429962157, "num_tokens": 21406962.0, "step": 11600 }, { "entropy": 5.734314489364624, "epoch": 0.9750052509976895, "grad_norm": 1.59375, "learning_rate": 0.0004910957981844357, "loss": 5.5654, "mean_token_accuracy": 0.16124276220798492, "num_tokens": 21415868.0, "step": 11605 }, { "entropy": 5.803146696090698, "epoch": 0.9754253308128544, "grad_norm": 1.6171875, "learning_rate": 0.0004910874550764022, "loss": 5.6967, "mean_token_accuracy": 0.15788596943020822, "num_tokens": 21424544.0, "step": 11610 }, { "entropy": 5.653631067276001, "epoch": 0.9758454106280193, "grad_norm": 1.3984375, "learning_rate": 0.0004910791081404907, "loss": 5.5587, "mean_token_accuracy": 0.16439975649118424, "num_tokens": 21433589.0, "step": 11615 }, { "entropy": 5.75174469947815, "epoch": 0.9762654904431842, "grad_norm": 1.4765625, "learning_rate": 0.0004910707573768489, "loss": 5.6188, "mean_token_accuracy": 0.15351523384451865, "num_tokens": 21442084.0, "step": 11620 }, { "entropy": 5.711339998245239, "epoch": 0.9766855702583491, "grad_norm": 1.4375, "learning_rate": 0.0004910624027856251, "loss": 5.5242, "mean_token_accuracy": 0.15779978781938553, "num_tokens": 21450962.0, "step": 11625 }, { "entropy": 5.761394453048706, "epoch": 0.977105650073514, "grad_norm": 1.5390625, "learning_rate": 0.0004910540443669669, "loss": 5.616, "mean_token_accuracy": 0.15358125492930413, "num_tokens": 21461322.0, "step": 11630 }, { "entropy": 5.790155363082886, "epoch": 0.9775257298886788, "grad_norm": 1.5078125, "learning_rate": 0.0004910456821210227, "loss": 5.5963, "mean_token_accuracy": 0.16139813885092735, "num_tokens": 21470800.0, "step": 11635 }, { "entropy": 5.705955171585083, "epoch": 0.9779458097038437, "grad_norm": 1.3359375, "learning_rate": 0.0004910373160479404, "loss": 5.4578, "mean_token_accuracy": 0.1623155578970909, "num_tokens": 21479707.0, "step": 11640 }, { "entropy": 5.705592966079712, "epoch": 0.9783658895190086, "grad_norm": 1.765625, "learning_rate": 0.0004910289461478683, "loss": 5.6531, "mean_token_accuracy": 0.14903590232133865, "num_tokens": 21489469.0, "step": 11645 }, { "entropy": 5.782165670394898, "epoch": 0.9787859693341735, "grad_norm": 1.46875, "learning_rate": 0.0004910205724209547, "loss": 5.6102, "mean_token_accuracy": 0.15439205691218377, "num_tokens": 21499226.0, "step": 11650 }, { "entropy": 5.662615633010864, "epoch": 0.9792060491493384, "grad_norm": 1.40625, "learning_rate": 0.0004910121948673478, "loss": 5.4725, "mean_token_accuracy": 0.16271869242191314, "num_tokens": 21508129.0, "step": 11655 }, { "entropy": 5.677742385864258, "epoch": 0.9796261289645033, "grad_norm": 1.5234375, "learning_rate": 0.0004910038134871962, "loss": 5.5133, "mean_token_accuracy": 0.16307872533798218, "num_tokens": 21516293.0, "step": 11660 }, { "entropy": 5.8114800453186035, "epoch": 0.9800462087796681, "grad_norm": 1.453125, "learning_rate": 0.0004909954282806482, "loss": 5.663, "mean_token_accuracy": 0.15625039413571357, "num_tokens": 21525393.0, "step": 11665 }, { "entropy": 5.650265026092529, "epoch": 0.980466288594833, "grad_norm": 1.328125, "learning_rate": 0.0004909870392478524, "loss": 5.5162, "mean_token_accuracy": 0.15820949375629426, "num_tokens": 21534585.0, "step": 11670 }, { "entropy": 5.637864255905152, "epoch": 0.9808863684099979, "grad_norm": 1.6484375, "learning_rate": 0.0004909786463889575, "loss": 5.4578, "mean_token_accuracy": 0.16383379697799683, "num_tokens": 21542947.0, "step": 11675 }, { "entropy": 5.737944889068603, "epoch": 0.9813064482251628, "grad_norm": 1.734375, "learning_rate": 0.0004909702497041121, "loss": 5.5743, "mean_token_accuracy": 0.16033673286437988, "num_tokens": 21552168.0, "step": 11680 }, { "entropy": 5.723841714859009, "epoch": 0.9817265280403277, "grad_norm": 1.59375, "learning_rate": 0.0004909618491934648, "loss": 5.577, "mean_token_accuracy": 0.16168997883796693, "num_tokens": 21562131.0, "step": 11685 }, { "entropy": 5.690407085418701, "epoch": 0.9821466078554926, "grad_norm": 1.4921875, "learning_rate": 0.0004909534448571647, "loss": 5.5295, "mean_token_accuracy": 0.1657412603497505, "num_tokens": 21571363.0, "step": 11690 }, { "entropy": 5.723976564407349, "epoch": 0.9825666876706575, "grad_norm": 1.625, "learning_rate": 0.0004909450366953604, "loss": 5.5015, "mean_token_accuracy": 0.16331232860684394, "num_tokens": 21580754.0, "step": 11695 }, { "entropy": 5.728280305862427, "epoch": 0.9829867674858223, "grad_norm": 1.390625, "learning_rate": 0.000490936624708201, "loss": 5.6055, "mean_token_accuracy": 0.15559826791286469, "num_tokens": 21590053.0, "step": 11700 }, { "entropy": 5.720153570175171, "epoch": 0.9834068473009872, "grad_norm": 1.546875, "learning_rate": 0.0004909282088958356, "loss": 5.5648, "mean_token_accuracy": 0.1572035074234009, "num_tokens": 21598681.0, "step": 11705 }, { "entropy": 5.809522724151611, "epoch": 0.983826927116152, "grad_norm": 1.6953125, "learning_rate": 0.000490919789258413, "loss": 5.5901, "mean_token_accuracy": 0.1646919757127762, "num_tokens": 21607465.0, "step": 11710 }, { "entropy": 5.760197687149048, "epoch": 0.984247006931317, "grad_norm": 2.0, "learning_rate": 0.0004909113657960826, "loss": 5.6859, "mean_token_accuracy": 0.1438727371394634, "num_tokens": 21617480.0, "step": 11715 }, { "entropy": 5.747771978378296, "epoch": 0.9846670867464818, "grad_norm": 1.421875, "learning_rate": 0.0004909029385089935, "loss": 5.5799, "mean_token_accuracy": 0.16191355288028716, "num_tokens": 21626434.0, "step": 11720 }, { "entropy": 5.759758377075196, "epoch": 0.9850871665616467, "grad_norm": 1.5546875, "learning_rate": 0.000490894507397295, "loss": 5.5507, "mean_token_accuracy": 0.1621351957321167, "num_tokens": 21635627.0, "step": 11725 }, { "entropy": 5.738065910339356, "epoch": 0.9855072463768116, "grad_norm": 1.671875, "learning_rate": 0.0004908860724611365, "loss": 5.5608, "mean_token_accuracy": 0.1566981017589569, "num_tokens": 21644789.0, "step": 11730 }, { "entropy": 5.63734655380249, "epoch": 0.9859273261919764, "grad_norm": 1.3984375, "learning_rate": 0.0004908776337006675, "loss": 5.5664, "mean_token_accuracy": 0.15962323546409607, "num_tokens": 21653696.0, "step": 11735 }, { "entropy": 5.737686443328857, "epoch": 0.9863474060071413, "grad_norm": 1.609375, "learning_rate": 0.0004908691911160373, "loss": 5.5614, "mean_token_accuracy": 0.15139272063970566, "num_tokens": 21664420.0, "step": 11740 }, { "entropy": 5.753671407699585, "epoch": 0.9867674858223062, "grad_norm": 1.625, "learning_rate": 0.0004908607447073954, "loss": 5.5481, "mean_token_accuracy": 0.15880379527807237, "num_tokens": 21673716.0, "step": 11745 }, { "entropy": 5.73064112663269, "epoch": 0.9871875656374711, "grad_norm": 1.484375, "learning_rate": 0.0004908522944748917, "loss": 5.5493, "mean_token_accuracy": 0.16386302858591079, "num_tokens": 21682860.0, "step": 11750 }, { "entropy": 5.609640121459961, "epoch": 0.987607645452636, "grad_norm": 1.7265625, "learning_rate": 0.0004908438404186758, "loss": 5.5444, "mean_token_accuracy": 0.1676987513899803, "num_tokens": 21691915.0, "step": 11755 }, { "entropy": 5.773650169372559, "epoch": 0.9880277252678009, "grad_norm": 1.875, "learning_rate": 0.0004908353825388973, "loss": 5.6686, "mean_token_accuracy": 0.1477293998003006, "num_tokens": 21701666.0, "step": 11760 }, { "entropy": 5.837761163711548, "epoch": 0.9884478050829658, "grad_norm": 1.8203125, "learning_rate": 0.0004908269208357062, "loss": 5.6005, "mean_token_accuracy": 0.16498832553625106, "num_tokens": 21709267.0, "step": 11765 }, { "entropy": 5.687007045745849, "epoch": 0.9888678848981306, "grad_norm": 2.171875, "learning_rate": 0.0004908184553092523, "loss": 5.4664, "mean_token_accuracy": 0.16219521760940553, "num_tokens": 21718117.0, "step": 11770 }, { "entropy": 5.74579439163208, "epoch": 0.9892879647132955, "grad_norm": 1.7578125, "learning_rate": 0.0004908099859596856, "loss": 5.6226, "mean_token_accuracy": 0.16140222251415254, "num_tokens": 21727952.0, "step": 11775 }, { "entropy": 5.798332405090332, "epoch": 0.9897080445284604, "grad_norm": 1.7578125, "learning_rate": 0.0004908015127871561, "loss": 5.5076, "mean_token_accuracy": 0.15737968385219575, "num_tokens": 21737878.0, "step": 11780 }, { "entropy": 5.66026086807251, "epoch": 0.9901281243436253, "grad_norm": 1.40625, "learning_rate": 0.000490793035791814, "loss": 5.4333, "mean_token_accuracy": 0.16479237079620362, "num_tokens": 21747391.0, "step": 11785 }, { "entropy": 5.639067459106445, "epoch": 0.9905482041587902, "grad_norm": 1.765625, "learning_rate": 0.0004907845549738093, "loss": 5.4825, "mean_token_accuracy": 0.1608181118965149, "num_tokens": 21756791.0, "step": 11790 }, { "entropy": 5.626802778244018, "epoch": 0.9909682839739551, "grad_norm": 1.6875, "learning_rate": 0.0004907760703332923, "loss": 5.514, "mean_token_accuracy": 0.16045965999364853, "num_tokens": 21766020.0, "step": 11795 }, { "entropy": 5.7946771621704105, "epoch": 0.99138836378912, "grad_norm": 1.5703125, "learning_rate": 0.0004907675818704134, "loss": 5.6332, "mean_token_accuracy": 0.15098711997270584, "num_tokens": 21775895.0, "step": 11800 }, { "entropy": 5.720692729949951, "epoch": 0.9918084436042848, "grad_norm": 1.6328125, "learning_rate": 0.0004907590895853228, "loss": 5.5368, "mean_token_accuracy": 0.16272979229688644, "num_tokens": 21784543.0, "step": 11805 }, { "entropy": 5.734677982330322, "epoch": 0.9922285234194497, "grad_norm": 1.890625, "learning_rate": 0.0004907505934781712, "loss": 5.5898, "mean_token_accuracy": 0.15340466499328614, "num_tokens": 21793938.0, "step": 11810 }, { "entropy": 5.73793478012085, "epoch": 0.9926486032346146, "grad_norm": 1.484375, "learning_rate": 0.0004907420935491087, "loss": 5.5694, "mean_token_accuracy": 0.15643597394227982, "num_tokens": 21803641.0, "step": 11815 }, { "entropy": 5.734190273284912, "epoch": 0.9930686830497795, "grad_norm": 1.875, "learning_rate": 0.0004907335897982862, "loss": 5.4978, "mean_token_accuracy": 0.1619450032711029, "num_tokens": 21812542.0, "step": 11820 }, { "entropy": 5.653626728057861, "epoch": 0.9934887628649444, "grad_norm": 1.5390625, "learning_rate": 0.0004907250822258543, "loss": 5.5806, "mean_token_accuracy": 0.15819441080093383, "num_tokens": 21821847.0, "step": 11825 }, { "entropy": 5.8374409675598145, "epoch": 0.9939088426801093, "grad_norm": 1.7734375, "learning_rate": 0.0004907165708319637, "loss": 5.6198, "mean_token_accuracy": 0.15984491556882857, "num_tokens": 21830799.0, "step": 11830 }, { "entropy": 5.780798053741455, "epoch": 0.994328922495274, "grad_norm": 1.4765625, "learning_rate": 0.0004907080556167651, "loss": 5.5597, "mean_token_accuracy": 0.15932203084230423, "num_tokens": 21840202.0, "step": 11835 }, { "entropy": 5.827149820327759, "epoch": 0.994749002310439, "grad_norm": 1.671875, "learning_rate": 0.0004906995365804093, "loss": 5.665, "mean_token_accuracy": 0.15373467579483985, "num_tokens": 21849701.0, "step": 11840 }, { "entropy": 5.745590162277222, "epoch": 0.9951690821256038, "grad_norm": 1.5546875, "learning_rate": 0.0004906910137230472, "loss": 5.5375, "mean_token_accuracy": 0.161653570830822, "num_tokens": 21859191.0, "step": 11845 }, { "entropy": 5.722856521606445, "epoch": 0.9955891619407687, "grad_norm": 1.34375, "learning_rate": 0.00049068248704483, "loss": 5.5202, "mean_token_accuracy": 0.157266703248024, "num_tokens": 21867944.0, "step": 11850 }, { "entropy": 5.644532155990601, "epoch": 0.9960092417559336, "grad_norm": 1.5546875, "learning_rate": 0.0004906739565459085, "loss": 5.5632, "mean_token_accuracy": 0.15848701894283296, "num_tokens": 21876368.0, "step": 11855 }, { "entropy": 5.863846969604492, "epoch": 0.9964293215710985, "grad_norm": 1.375, "learning_rate": 0.000490665422226434, "loss": 5.6436, "mean_token_accuracy": 0.1514528512954712, "num_tokens": 21885634.0, "step": 11860 }, { "entropy": 5.6821434020996096, "epoch": 0.9968494013862634, "grad_norm": 1.65625, "learning_rate": 0.0004906568840865576, "loss": 5.4504, "mean_token_accuracy": 0.16308265626430513, "num_tokens": 21894315.0, "step": 11865 }, { "entropy": 5.626799726486206, "epoch": 0.9972694812014282, "grad_norm": 1.9453125, "learning_rate": 0.0004906483421264305, "loss": 5.5695, "mean_token_accuracy": 0.159691222012043, "num_tokens": 21903342.0, "step": 11870 }, { "entropy": 5.7634326934814455, "epoch": 0.9976895610165931, "grad_norm": 1.9296875, "learning_rate": 0.000490639796346204, "loss": 5.686, "mean_token_accuracy": 0.15302741080522536, "num_tokens": 21914158.0, "step": 11875 }, { "entropy": 5.901743459701538, "epoch": 0.998109640831758, "grad_norm": 1.5234375, "learning_rate": 0.0004906312467460297, "loss": 5.5633, "mean_token_accuracy": 0.16004915833473204, "num_tokens": 21922639.0, "step": 11880 }, { "entropy": 5.736720323562622, "epoch": 0.9985297206469229, "grad_norm": 1.53125, "learning_rate": 0.0004906226933260588, "loss": 5.5645, "mean_token_accuracy": 0.15823576152324675, "num_tokens": 21931385.0, "step": 11885 }, { "entropy": 5.78201150894165, "epoch": 0.9989498004620878, "grad_norm": 1.859375, "learning_rate": 0.0004906141360864429, "loss": 5.5746, "mean_token_accuracy": 0.15795834213495255, "num_tokens": 21940788.0, "step": 11890 }, { "entropy": 5.749546051025391, "epoch": 0.9993698802772527, "grad_norm": 1.7734375, "learning_rate": 0.0004906055750273336, "loss": 5.5854, "mean_token_accuracy": 0.15715595483779907, "num_tokens": 21950309.0, "step": 11895 }, { "entropy": 5.691565322875976, "epoch": 0.9997899600924176, "grad_norm": 2.03125, "learning_rate": 0.0004905970101488826, "loss": 5.5724, "mean_token_accuracy": 0.15797929465770721, "num_tokens": 21959141.0, "step": 11900 }, { "entropy": 5.813778877258301, "epoch": 1.000168031926066, "grad_norm": 1.8828125, "learning_rate": 0.0004905884414512416, "loss": 5.6073, "mean_token_accuracy": 0.15993836356533897, "num_tokens": 21966665.0, "step": 11905 }, { "entropy": 5.747717571258545, "epoch": 1.0005881117412307, "grad_norm": 1.65625, "learning_rate": 0.0004905798689345623, "loss": 5.5985, "mean_token_accuracy": 0.15958280488848686, "num_tokens": 21976728.0, "step": 11910 }, { "entropy": 5.70471978187561, "epoch": 1.0010081915563958, "grad_norm": 1.53125, "learning_rate": 0.0004905712925989968, "loss": 5.4321, "mean_token_accuracy": 0.1577399954199791, "num_tokens": 21985915.0, "step": 11915 }, { "entropy": 5.717014789581299, "epoch": 1.0014282713715605, "grad_norm": 2.15625, "learning_rate": 0.0004905627124446967, "loss": 5.4817, "mean_token_accuracy": 0.16125397384166718, "num_tokens": 21995826.0, "step": 11920 }, { "entropy": 5.673809146881103, "epoch": 1.0018483511867255, "grad_norm": 1.625, "learning_rate": 0.0004905541284718142, "loss": 5.441, "mean_token_accuracy": 0.16078125834465026, "num_tokens": 22005299.0, "step": 11925 }, { "entropy": 5.688680934906006, "epoch": 1.0022684310018903, "grad_norm": 1.9609375, "learning_rate": 0.0004905455406805011, "loss": 5.48, "mean_token_accuracy": 0.160285322368145, "num_tokens": 22014499.0, "step": 11930 }, { "entropy": 5.827605724334717, "epoch": 1.0026885108170553, "grad_norm": 1.4765625, "learning_rate": 0.00049053694907091, "loss": 5.6404, "mean_token_accuracy": 0.15083224773406984, "num_tokens": 22024531.0, "step": 11935 }, { "entropy": 5.765188217163086, "epoch": 1.0031085906322201, "grad_norm": 1.5390625, "learning_rate": 0.0004905283536431928, "loss": 5.546, "mean_token_accuracy": 0.16389428079128265, "num_tokens": 22034036.0, "step": 11940 }, { "entropy": 5.673288774490357, "epoch": 1.003528670447385, "grad_norm": 1.6484375, "learning_rate": 0.0004905197543975017, "loss": 5.4413, "mean_token_accuracy": 0.16298594772815705, "num_tokens": 22042910.0, "step": 11945 }, { "entropy": 5.742687463760376, "epoch": 1.00394875026255, "grad_norm": 1.4453125, "learning_rate": 0.0004905111513339892, "loss": 5.5236, "mean_token_accuracy": 0.16467590481042862, "num_tokens": 22052242.0, "step": 11950 }, { "entropy": 5.723882246017456, "epoch": 1.0043688300777147, "grad_norm": 1.578125, "learning_rate": 0.0004905025444528076, "loss": 5.4865, "mean_token_accuracy": 0.15788668096065522, "num_tokens": 22061467.0, "step": 11955 }, { "entropy": 5.6063799381256105, "epoch": 1.0047889098928797, "grad_norm": 1.6328125, "learning_rate": 0.0004904939337541093, "loss": 5.3608, "mean_token_accuracy": 0.1663319230079651, "num_tokens": 22070300.0, "step": 11960 }, { "entropy": 5.7507532119750975, "epoch": 1.0052089897080445, "grad_norm": 1.625, "learning_rate": 0.0004904853192380472, "loss": 5.5215, "mean_token_accuracy": 0.158057052642107, "num_tokens": 22078960.0, "step": 11965 }, { "entropy": 5.719160795211792, "epoch": 1.0056290695232095, "grad_norm": 1.6171875, "learning_rate": 0.0004904767009047733, "loss": 5.458, "mean_token_accuracy": 0.1630512699484825, "num_tokens": 22088135.0, "step": 11970 }, { "entropy": 5.731142950057984, "epoch": 1.0060491493383743, "grad_norm": 1.3984375, "learning_rate": 0.0004904680787544408, "loss": 5.582, "mean_token_accuracy": 0.15549475252628325, "num_tokens": 22098004.0, "step": 11975 }, { "entropy": 5.818147802352906, "epoch": 1.006469229153539, "grad_norm": 1.5, "learning_rate": 0.0004904594527872022, "loss": 5.5522, "mean_token_accuracy": 0.15604811310768127, "num_tokens": 22107680.0, "step": 11980 }, { "entropy": 5.7786630153656, "epoch": 1.006889308968704, "grad_norm": 1.4296875, "learning_rate": 0.0004904508230032103, "loss": 5.5677, "mean_token_accuracy": 0.1585972711443901, "num_tokens": 22118004.0, "step": 11985 }, { "entropy": 5.7285055160522464, "epoch": 1.0073093887838689, "grad_norm": 1.7109375, "learning_rate": 0.000490442189402618, "loss": 5.5151, "mean_token_accuracy": 0.17011249363422393, "num_tokens": 22127825.0, "step": 11990 }, { "entropy": 5.711953926086426, "epoch": 1.007729468599034, "grad_norm": 1.5, "learning_rate": 0.0004904335519855783, "loss": 5.4227, "mean_token_accuracy": 0.16442998498678207, "num_tokens": 22136448.0, "step": 11995 }, { "entropy": 5.657416820526123, "epoch": 1.0081495484141987, "grad_norm": 1.4609375, "learning_rate": 0.0004904249107522442, "loss": 5.5436, "mean_token_accuracy": 0.15949945598840715, "num_tokens": 22146415.0, "step": 12000 }, { "epoch": 1.0081495484141987, "eval_entropy": 5.525661500662507, "eval_loss": 5.590455532073975, "eval_mean_token_accuracy": 0.16449697244313435, "eval_num_tokens": 22146415.0, "eval_runtime": 27.36, "eval_samples_per_second": 1365.715, "eval_steps_per_second": 170.724, "step": 12000 }, { "entropy": 5.816870403289795, "epoch": 1.0085696282293637, "grad_norm": 1.4921875, "learning_rate": 0.0004904162657027685, "loss": 5.6473, "mean_token_accuracy": 0.1565300554037094, "num_tokens": 22156327.0, "step": 12005 }, { "entropy": 5.738042402267456, "epoch": 1.0089897080445285, "grad_norm": 1.484375, "learning_rate": 0.0004904076168373049, "loss": 5.4672, "mean_token_accuracy": 0.1601177304983139, "num_tokens": 22165677.0, "step": 12010 }, { "entropy": 5.727717494964599, "epoch": 1.0094097878596933, "grad_norm": 1.34375, "learning_rate": 0.0004903989641560061, "loss": 5.5975, "mean_token_accuracy": 0.1590371698141098, "num_tokens": 22175232.0, "step": 12015 }, { "entropy": 5.758626651763916, "epoch": 1.0098298676748583, "grad_norm": 1.515625, "learning_rate": 0.0004903903076590256, "loss": 5.473, "mean_token_accuracy": 0.15314906388521193, "num_tokens": 22184026.0, "step": 12020 }, { "entropy": 5.663096857070923, "epoch": 1.010249947490023, "grad_norm": 1.421875, "learning_rate": 0.0004903816473465167, "loss": 5.3778, "mean_token_accuracy": 0.1727016821503639, "num_tokens": 22192020.0, "step": 12025 }, { "entropy": 5.613332319259643, "epoch": 1.010670027305188, "grad_norm": 1.421875, "learning_rate": 0.0004903729832186328, "loss": 5.3511, "mean_token_accuracy": 0.16883303374052047, "num_tokens": 22200060.0, "step": 12030 }, { "entropy": 5.620872068405151, "epoch": 1.0110901071203529, "grad_norm": 1.2421875, "learning_rate": 0.0004903643152755274, "loss": 5.407, "mean_token_accuracy": 0.1603987216949463, "num_tokens": 22208625.0, "step": 12035 }, { "entropy": 5.685234689712525, "epoch": 1.0115101869355176, "grad_norm": 1.5703125, "learning_rate": 0.0004903556435173541, "loss": 5.3922, "mean_token_accuracy": 0.1666228473186493, "num_tokens": 22217781.0, "step": 12040 }, { "entropy": 5.746535110473633, "epoch": 1.0119302667506826, "grad_norm": 1.5625, "learning_rate": 0.0004903469679442665, "loss": 5.5318, "mean_token_accuracy": 0.16123737245798112, "num_tokens": 22226432.0, "step": 12045 }, { "entropy": 5.652414417266845, "epoch": 1.0123503465658474, "grad_norm": 1.5078125, "learning_rate": 0.0004903382885564181, "loss": 5.5297, "mean_token_accuracy": 0.16408767104148864, "num_tokens": 22234811.0, "step": 12050 }, { "entropy": 5.5869992733001705, "epoch": 1.0127704263810124, "grad_norm": 1.6328125, "learning_rate": 0.000490329605353963, "loss": 5.4075, "mean_token_accuracy": 0.17152390927076339, "num_tokens": 22242808.0, "step": 12055 }, { "entropy": 5.75869345664978, "epoch": 1.0131905061961772, "grad_norm": 1.53125, "learning_rate": 0.0004903209183370547, "loss": 5.4738, "mean_token_accuracy": 0.1645299270749092, "num_tokens": 22251371.0, "step": 12060 }, { "entropy": 5.830525541305542, "epoch": 1.0136105860113422, "grad_norm": 1.515625, "learning_rate": 0.0004903122275058472, "loss": 5.5546, "mean_token_accuracy": 0.16162935346364976, "num_tokens": 22260868.0, "step": 12065 }, { "entropy": 5.650126838684082, "epoch": 1.014030665826507, "grad_norm": 1.5546875, "learning_rate": 0.0004903035328604944, "loss": 5.4551, "mean_token_accuracy": 0.16388770192861557, "num_tokens": 22270554.0, "step": 12070 }, { "entropy": 5.599603319168091, "epoch": 1.0144507456416718, "grad_norm": 1.6796875, "learning_rate": 0.0004902948344011506, "loss": 5.4471, "mean_token_accuracy": 0.16133227497339248, "num_tokens": 22279170.0, "step": 12075 }, { "entropy": 5.739398241043091, "epoch": 1.0148708254568368, "grad_norm": 1.46875, "learning_rate": 0.0004902861321279694, "loss": 5.6051, "mean_token_accuracy": 0.1532390832901001, "num_tokens": 22288788.0, "step": 12080 }, { "entropy": 5.6841353416442875, "epoch": 1.0152909052720016, "grad_norm": 1.40625, "learning_rate": 0.0004902774260411055, "loss": 5.385, "mean_token_accuracy": 0.1635892152786255, "num_tokens": 22297501.0, "step": 12085 }, { "entropy": 5.612368249893189, "epoch": 1.0157109850871666, "grad_norm": 1.75, "learning_rate": 0.0004902687161407126, "loss": 5.3466, "mean_token_accuracy": 0.17515814155340195, "num_tokens": 22306181.0, "step": 12090 }, { "entropy": 5.670634174346924, "epoch": 1.0161310649023314, "grad_norm": 1.671875, "learning_rate": 0.0004902600024269454, "loss": 5.5072, "mean_token_accuracy": 0.16697340905666352, "num_tokens": 22315762.0, "step": 12095 }, { "entropy": 5.626059675216675, "epoch": 1.0165511447174964, "grad_norm": 1.84375, "learning_rate": 0.000490251284899958, "loss": 5.439, "mean_token_accuracy": 0.16588278263807296, "num_tokens": 22325127.0, "step": 12100 }, { "entropy": 5.649977350234986, "epoch": 1.0169712245326612, "grad_norm": 1.5234375, "learning_rate": 0.000490242563559905, "loss": 5.5278, "mean_token_accuracy": 0.15909326523542405, "num_tokens": 22334038.0, "step": 12105 }, { "entropy": 5.681149196624756, "epoch": 1.017391304347826, "grad_norm": 1.359375, "learning_rate": 0.0004902338384069408, "loss": 5.3772, "mean_token_accuracy": 0.16700164079666138, "num_tokens": 22342658.0, "step": 12110 }, { "entropy": 5.748837232589722, "epoch": 1.017811384162991, "grad_norm": 1.3515625, "learning_rate": 0.00049022510944122, "loss": 5.5592, "mean_token_accuracy": 0.1559050902724266, "num_tokens": 22352559.0, "step": 12115 }, { "entropy": 5.741272211074829, "epoch": 1.0182314639781558, "grad_norm": 1.6171875, "learning_rate": 0.0004902163766628972, "loss": 5.4663, "mean_token_accuracy": 0.16664180606603624, "num_tokens": 22361455.0, "step": 12120 }, { "entropy": 5.761194944381714, "epoch": 1.0186515437933208, "grad_norm": 1.578125, "learning_rate": 0.0004902076400721271, "loss": 5.5025, "mean_token_accuracy": 0.15924629420042039, "num_tokens": 22371163.0, "step": 12125 }, { "entropy": 5.786735534667969, "epoch": 1.0190716236084856, "grad_norm": 1.890625, "learning_rate": 0.0004901988996690645, "loss": 5.4939, "mean_token_accuracy": 0.16901676952838898, "num_tokens": 22379975.0, "step": 12130 }, { "entropy": 5.794359588623047, "epoch": 1.0194917034236506, "grad_norm": 1.390625, "learning_rate": 0.0004901901554538641, "loss": 5.5351, "mean_token_accuracy": 0.16184651851654053, "num_tokens": 22389657.0, "step": 12135 }, { "entropy": 5.626089334487915, "epoch": 1.0199117832388154, "grad_norm": 1.5625, "learning_rate": 0.000490181407426681, "loss": 5.3773, "mean_token_accuracy": 0.16764698773622513, "num_tokens": 22398320.0, "step": 12140 }, { "entropy": 5.705850219726562, "epoch": 1.0203318630539802, "grad_norm": 1.9140625, "learning_rate": 0.0004901726555876701, "loss": 5.573, "mean_token_accuracy": 0.1539936549961567, "num_tokens": 22406634.0, "step": 12145 }, { "entropy": 5.800102376937867, "epoch": 1.0207519428691452, "grad_norm": 1.4609375, "learning_rate": 0.0004901638999369862, "loss": 5.6111, "mean_token_accuracy": 0.15667299777269364, "num_tokens": 22415939.0, "step": 12150 }, { "entropy": 5.758721494674683, "epoch": 1.02117202268431, "grad_norm": 1.671875, "learning_rate": 0.0004901551404747847, "loss": 5.5353, "mean_token_accuracy": 0.1576780617237091, "num_tokens": 22425256.0, "step": 12155 }, { "entropy": 5.758379936218262, "epoch": 1.021592102499475, "grad_norm": 1.625, "learning_rate": 0.0004901463772012209, "loss": 5.6105, "mean_token_accuracy": 0.15414702594280244, "num_tokens": 22434750.0, "step": 12160 }, { "entropy": 5.7319268703460695, "epoch": 1.0220121823146397, "grad_norm": 1.796875, "learning_rate": 0.0004901376101164495, "loss": 5.4788, "mean_token_accuracy": 0.16012528240680696, "num_tokens": 22443426.0, "step": 12165 }, { "entropy": 5.718150901794433, "epoch": 1.0224322621298048, "grad_norm": 2.53125, "learning_rate": 0.0004901288392206263, "loss": 5.496, "mean_token_accuracy": 0.15628497451543807, "num_tokens": 22452778.0, "step": 12170 }, { "entropy": 5.683122968673706, "epoch": 1.0228523419449695, "grad_norm": 1.828125, "learning_rate": 0.0004901200645139064, "loss": 5.4532, "mean_token_accuracy": 0.1657660871744156, "num_tokens": 22462864.0, "step": 12175 }, { "entropy": 5.715426301956176, "epoch": 1.0232724217601343, "grad_norm": 2.3125, "learning_rate": 0.0004901112859964454, "loss": 5.515, "mean_token_accuracy": 0.1562432289123535, "num_tokens": 22472849.0, "step": 12180 }, { "entropy": 5.6747640609741214, "epoch": 1.0236925015752993, "grad_norm": 1.6796875, "learning_rate": 0.0004901025036683987, "loss": 5.4378, "mean_token_accuracy": 0.15990415960550308, "num_tokens": 22481693.0, "step": 12185 }, { "entropy": 5.717993688583374, "epoch": 1.0241125813904641, "grad_norm": 1.5546875, "learning_rate": 0.0004900937175299219, "loss": 5.4283, "mean_token_accuracy": 0.16626838445663453, "num_tokens": 22490934.0, "step": 12190 }, { "entropy": 5.723482513427735, "epoch": 1.0245326612056291, "grad_norm": 1.5546875, "learning_rate": 0.0004900849275811707, "loss": 5.488, "mean_token_accuracy": 0.16016919761896134, "num_tokens": 22500457.0, "step": 12195 }, { "entropy": 5.739189004898071, "epoch": 1.024952741020794, "grad_norm": 1.875, "learning_rate": 0.0004900761338223007, "loss": 5.4461, "mean_token_accuracy": 0.15878349542617798, "num_tokens": 22509641.0, "step": 12200 }, { "entropy": 5.648697996139527, "epoch": 1.025372820835959, "grad_norm": 1.4296875, "learning_rate": 0.0004900673362534677, "loss": 5.3597, "mean_token_accuracy": 0.16653590351343156, "num_tokens": 22518616.0, "step": 12205 }, { "entropy": 5.73726167678833, "epoch": 1.0257929006511237, "grad_norm": 1.53125, "learning_rate": 0.0004900585348748277, "loss": 5.5152, "mean_token_accuracy": 0.1678289592266083, "num_tokens": 22527599.0, "step": 12210 }, { "entropy": 5.699249696731568, "epoch": 1.0262129804662885, "grad_norm": 1.609375, "learning_rate": 0.0004900497296865365, "loss": 5.522, "mean_token_accuracy": 0.15160454586148261, "num_tokens": 22537399.0, "step": 12215 }, { "entropy": 5.909937381744385, "epoch": 1.0266330602814535, "grad_norm": 1.484375, "learning_rate": 0.0004900409206887499, "loss": 5.7361, "mean_token_accuracy": 0.1503012202680111, "num_tokens": 22546746.0, "step": 12220 }, { "entropy": 5.758369112014771, "epoch": 1.0270531400966183, "grad_norm": 2.25, "learning_rate": 0.0004900321078816243, "loss": 5.4986, "mean_token_accuracy": 0.16833491176366805, "num_tokens": 22555735.0, "step": 12225 }, { "entropy": 5.747727394104004, "epoch": 1.0274732199117833, "grad_norm": 1.609375, "learning_rate": 0.0004900232912653156, "loss": 5.5011, "mean_token_accuracy": 0.16442441418766976, "num_tokens": 22565010.0, "step": 12230 }, { "entropy": 5.755198526382446, "epoch": 1.027893299726948, "grad_norm": 1.796875, "learning_rate": 0.00049001447083998, "loss": 5.4897, "mean_token_accuracy": 0.15753853023052217, "num_tokens": 22573565.0, "step": 12235 }, { "entropy": 5.751472043991089, "epoch": 1.028313379542113, "grad_norm": 1.6015625, "learning_rate": 0.0004900056466057737, "loss": 5.4754, "mean_token_accuracy": 0.15711085349321366, "num_tokens": 22582549.0, "step": 12240 }, { "entropy": 5.6773108959198, "epoch": 1.028733459357278, "grad_norm": 1.5703125, "learning_rate": 0.0004899968185628531, "loss": 5.5407, "mean_token_accuracy": 0.15574416965246202, "num_tokens": 22592112.0, "step": 12245 }, { "entropy": 5.632958936691284, "epoch": 1.0291535391724427, "grad_norm": 1.6328125, "learning_rate": 0.0004899879867113746, "loss": 5.3852, "mean_token_accuracy": 0.16619622707366943, "num_tokens": 22600581.0, "step": 12250 }, { "entropy": 5.809205341339111, "epoch": 1.0295736189876077, "grad_norm": 1.625, "learning_rate": 0.0004899791510514945, "loss": 5.5897, "mean_token_accuracy": 0.15455610007047654, "num_tokens": 22610822.0, "step": 12255 }, { "entropy": 5.765147113800049, "epoch": 1.0299936988027725, "grad_norm": 1.3828125, "learning_rate": 0.0004899703115833696, "loss": 5.5785, "mean_token_accuracy": 0.1628772124648094, "num_tokens": 22619484.0, "step": 12260 }, { "entropy": 5.693212890625, "epoch": 1.0304137786179375, "grad_norm": 1.8671875, "learning_rate": 0.0004899614683071563, "loss": 5.4248, "mean_token_accuracy": 0.16597820073366165, "num_tokens": 22629038.0, "step": 12265 }, { "entropy": 5.7034484386444095, "epoch": 1.0308338584331023, "grad_norm": 1.7265625, "learning_rate": 0.0004899526212230112, "loss": 5.516, "mean_token_accuracy": 0.15537500530481338, "num_tokens": 22638619.0, "step": 12270 }, { "entropy": 5.658185815811157, "epoch": 1.0312539382482673, "grad_norm": 2.296875, "learning_rate": 0.0004899437703310912, "loss": 5.5003, "mean_token_accuracy": 0.16062938123941423, "num_tokens": 22648065.0, "step": 12275 }, { "entropy": 5.798764753341675, "epoch": 1.031674018063432, "grad_norm": 1.8515625, "learning_rate": 0.0004899349156315529, "loss": 5.5658, "mean_token_accuracy": 0.15393222272396087, "num_tokens": 22658107.0, "step": 12280 }, { "entropy": 5.730508184432983, "epoch": 1.0320940978785969, "grad_norm": 1.8046875, "learning_rate": 0.0004899260571245533, "loss": 5.4466, "mean_token_accuracy": 0.16231588870286942, "num_tokens": 22667103.0, "step": 12285 }, { "entropy": 5.6667787551879885, "epoch": 1.0325141776937619, "grad_norm": 1.5078125, "learning_rate": 0.0004899171948102492, "loss": 5.4168, "mean_token_accuracy": 0.16460922211408616, "num_tokens": 22676792.0, "step": 12290 }, { "entropy": 5.648167705535888, "epoch": 1.0329342575089266, "grad_norm": 1.5546875, "learning_rate": 0.0004899083286887977, "loss": 5.452, "mean_token_accuracy": 0.1632228210568428, "num_tokens": 22685344.0, "step": 12295 }, { "entropy": 5.778263664245605, "epoch": 1.0333543373240917, "grad_norm": 1.515625, "learning_rate": 0.0004898994587603559, "loss": 5.5131, "mean_token_accuracy": 0.16273818016052247, "num_tokens": 22694387.0, "step": 12300 }, { "entropy": 5.695818853378296, "epoch": 1.0337744171392564, "grad_norm": 1.4453125, "learning_rate": 0.0004898905850250807, "loss": 5.542, "mean_token_accuracy": 0.16002353727817537, "num_tokens": 22704203.0, "step": 12305 }, { "entropy": 5.760842561721802, "epoch": 1.0341944969544214, "grad_norm": 1.578125, "learning_rate": 0.0004898817074831295, "loss": 5.5913, "mean_token_accuracy": 0.1574055314064026, "num_tokens": 22713518.0, "step": 12310 }, { "entropy": 5.756874465942383, "epoch": 1.0346145767695862, "grad_norm": 1.53125, "learning_rate": 0.0004898728261346595, "loss": 5.593, "mean_token_accuracy": 0.15683950930833818, "num_tokens": 22722997.0, "step": 12315 }, { "entropy": 5.767385387420655, "epoch": 1.035034656584751, "grad_norm": 1.53125, "learning_rate": 0.000489863940979828, "loss": 5.534, "mean_token_accuracy": 0.15978951305150985, "num_tokens": 22732385.0, "step": 12320 }, { "entropy": 5.693596649169922, "epoch": 1.035454736399916, "grad_norm": 1.609375, "learning_rate": 0.0004898550520187925, "loss": 5.4096, "mean_token_accuracy": 0.16672670543193818, "num_tokens": 22741148.0, "step": 12325 }, { "entropy": 5.675939607620239, "epoch": 1.0358748162150808, "grad_norm": 1.578125, "learning_rate": 0.0004898461592517103, "loss": 5.4109, "mean_token_accuracy": 0.16389408260583876, "num_tokens": 22750239.0, "step": 12330 }, { "entropy": 5.783782148361206, "epoch": 1.0362948960302458, "grad_norm": 1.4609375, "learning_rate": 0.0004898372626787391, "loss": 5.5538, "mean_token_accuracy": 0.15868894159793853, "num_tokens": 22759290.0, "step": 12335 }, { "entropy": 5.806813049316406, "epoch": 1.0367149758454106, "grad_norm": 1.4609375, "learning_rate": 0.0004898283623000364, "loss": 5.5762, "mean_token_accuracy": 0.15626893192529678, "num_tokens": 22768450.0, "step": 12340 }, { "entropy": 5.7313658714294435, "epoch": 1.0371350556605754, "grad_norm": 1.8203125, "learning_rate": 0.0004898194581157598, "loss": 5.434, "mean_token_accuracy": 0.15717112123966218, "num_tokens": 22777711.0, "step": 12345 }, { "entropy": 5.695150518417359, "epoch": 1.0375551354757404, "grad_norm": 1.453125, "learning_rate": 0.0004898105501260671, "loss": 5.5217, "mean_token_accuracy": 0.16438411176204681, "num_tokens": 22787153.0, "step": 12350 }, { "entropy": 5.763386631011963, "epoch": 1.0379752152909052, "grad_norm": 1.453125, "learning_rate": 0.0004898016383311163, "loss": 5.525, "mean_token_accuracy": 0.1668378531932831, "num_tokens": 22797125.0, "step": 12355 }, { "entropy": 5.720566844940185, "epoch": 1.0383952951060702, "grad_norm": 1.5546875, "learning_rate": 0.000489792722731065, "loss": 5.5131, "mean_token_accuracy": 0.1580943688750267, "num_tokens": 22806478.0, "step": 12360 }, { "entropy": 5.739205694198608, "epoch": 1.038815374921235, "grad_norm": 1.4453125, "learning_rate": 0.0004897838033260712, "loss": 5.5264, "mean_token_accuracy": 0.1504399910569191, "num_tokens": 22815375.0, "step": 12365 }, { "entropy": 5.783780193328857, "epoch": 1.0392354547364, "grad_norm": 1.515625, "learning_rate": 0.0004897748801162929, "loss": 5.4899, "mean_token_accuracy": 0.16633763164281845, "num_tokens": 22824401.0, "step": 12370 }, { "entropy": 5.735202741622925, "epoch": 1.0396555345515648, "grad_norm": 1.5, "learning_rate": 0.0004897659531018882, "loss": 5.6045, "mean_token_accuracy": 0.16063894852995872, "num_tokens": 22833933.0, "step": 12375 }, { "entropy": 5.695873117446899, "epoch": 1.0400756143667296, "grad_norm": 1.8515625, "learning_rate": 0.0004897570222830152, "loss": 5.4862, "mean_token_accuracy": 0.15706607103347778, "num_tokens": 22843779.0, "step": 12380 }, { "entropy": 5.765497493743896, "epoch": 1.0404956941818946, "grad_norm": 1.453125, "learning_rate": 0.0004897480876598322, "loss": 5.5739, "mean_token_accuracy": 0.15584344267845154, "num_tokens": 22852951.0, "step": 12385 }, { "entropy": 5.787726879119873, "epoch": 1.0409157739970594, "grad_norm": 2.078125, "learning_rate": 0.0004897391492324974, "loss": 5.5851, "mean_token_accuracy": 0.15543406456708908, "num_tokens": 22861398.0, "step": 12390 }, { "entropy": 5.713971185684204, "epoch": 1.0413358538122244, "grad_norm": 1.546875, "learning_rate": 0.0004897302070011691, "loss": 5.4643, "mean_token_accuracy": 0.1621120572090149, "num_tokens": 22870518.0, "step": 12395 }, { "entropy": 5.669089317321777, "epoch": 1.0417559336273892, "grad_norm": 1.6015625, "learning_rate": 0.0004897212609660058, "loss": 5.533, "mean_token_accuracy": 0.15619430541992188, "num_tokens": 22879389.0, "step": 12400 }, { "entropy": 5.724472951889038, "epoch": 1.0421760134425542, "grad_norm": 1.609375, "learning_rate": 0.0004897123111271659, "loss": 5.5315, "mean_token_accuracy": 0.16127124577760696, "num_tokens": 22888977.0, "step": 12405 }, { "entropy": 5.853266382217408, "epoch": 1.042596093257719, "grad_norm": 1.8046875, "learning_rate": 0.0004897033574848079, "loss": 5.548, "mean_token_accuracy": 0.16196577847003937, "num_tokens": 22898446.0, "step": 12410 }, { "entropy": 5.70566611289978, "epoch": 1.0430161730728837, "grad_norm": 1.46875, "learning_rate": 0.0004896944000390907, "loss": 5.5245, "mean_token_accuracy": 0.16506237536668777, "num_tokens": 22908044.0, "step": 12415 }, { "entropy": 5.778344535827637, "epoch": 1.0434362528880488, "grad_norm": 1.6328125, "learning_rate": 0.0004896854387901725, "loss": 5.5804, "mean_token_accuracy": 0.15366130471229553, "num_tokens": 22917330.0, "step": 12420 }, { "entropy": 5.807542943954468, "epoch": 1.0438563327032135, "grad_norm": 1.453125, "learning_rate": 0.0004896764737382124, "loss": 5.5466, "mean_token_accuracy": 0.16622493267059327, "num_tokens": 22927160.0, "step": 12425 }, { "entropy": 5.792239236831665, "epoch": 1.0442764125183785, "grad_norm": 1.609375, "learning_rate": 0.0004896675048833691, "loss": 5.4966, "mean_token_accuracy": 0.1603910431265831, "num_tokens": 22936755.0, "step": 12430 }, { "entropy": 5.707068347930909, "epoch": 1.0446964923335433, "grad_norm": 1.5546875, "learning_rate": 0.0004896585322258014, "loss": 5.4971, "mean_token_accuracy": 0.16156308948993683, "num_tokens": 22945699.0, "step": 12435 }, { "entropy": 5.714017152786255, "epoch": 1.0451165721487083, "grad_norm": 1.6875, "learning_rate": 0.0004896495557656685, "loss": 5.4759, "mean_token_accuracy": 0.17006804645061493, "num_tokens": 22954001.0, "step": 12440 }, { "entropy": 5.808580160140991, "epoch": 1.0455366519638731, "grad_norm": 1.359375, "learning_rate": 0.0004896405755031293, "loss": 5.5673, "mean_token_accuracy": 0.15997690260410308, "num_tokens": 22963805.0, "step": 12445 }, { "entropy": 5.6800487518310545, "epoch": 1.045956731779038, "grad_norm": 1.515625, "learning_rate": 0.0004896315914383427, "loss": 5.5063, "mean_token_accuracy": 0.15431105494499206, "num_tokens": 22973542.0, "step": 12450 }, { "entropy": 5.643172407150269, "epoch": 1.046376811594203, "grad_norm": 1.7890625, "learning_rate": 0.0004896226035714679, "loss": 5.3786, "mean_token_accuracy": 0.16473590731620788, "num_tokens": 22982417.0, "step": 12455 }, { "entropy": 5.706238555908203, "epoch": 1.0467968914093677, "grad_norm": 1.46875, "learning_rate": 0.0004896136119026642, "loss": 5.5078, "mean_token_accuracy": 0.15882690697908403, "num_tokens": 22992879.0, "step": 12460 }, { "entropy": 5.697173643112182, "epoch": 1.0472169712245327, "grad_norm": 1.6796875, "learning_rate": 0.0004896046164320911, "loss": 5.3948, "mean_token_accuracy": 0.16620510965585708, "num_tokens": 23001344.0, "step": 12465 }, { "entropy": 5.659090280532837, "epoch": 1.0476370510396975, "grad_norm": 1.5625, "learning_rate": 0.0004895956171599075, "loss": 5.4216, "mean_token_accuracy": 0.1704336553812027, "num_tokens": 23010007.0, "step": 12470 }, { "entropy": 5.747759056091309, "epoch": 1.0480571308548625, "grad_norm": 1.546875, "learning_rate": 0.0004895866140862731, "loss": 5.557, "mean_token_accuracy": 0.15872435867786408, "num_tokens": 23019120.0, "step": 12475 }, { "entropy": 5.71089186668396, "epoch": 1.0484772106700273, "grad_norm": 1.34375, "learning_rate": 0.0004895776072113473, "loss": 5.5359, "mean_token_accuracy": 0.16418685615062714, "num_tokens": 23028562.0, "step": 12480 }, { "entropy": 5.689389705657959, "epoch": 1.048897290485192, "grad_norm": 1.4921875, "learning_rate": 0.0004895685965352898, "loss": 5.4731, "mean_token_accuracy": 0.16231704950332643, "num_tokens": 23037687.0, "step": 12485 }, { "entropy": 5.757169103622436, "epoch": 1.049317370300357, "grad_norm": 1.546875, "learning_rate": 0.0004895595820582601, "loss": 5.4789, "mean_token_accuracy": 0.15927850753068923, "num_tokens": 23047475.0, "step": 12490 }, { "entropy": 5.649786186218262, "epoch": 1.0497374501155219, "grad_norm": 1.3984375, "learning_rate": 0.0004895505637804177, "loss": 5.5069, "mean_token_accuracy": 0.15920701920986174, "num_tokens": 23057475.0, "step": 12495 }, { "entropy": 5.599431371688842, "epoch": 1.050157529930687, "grad_norm": 1.5546875, "learning_rate": 0.0004895415417019227, "loss": 5.4847, "mean_token_accuracy": 0.15794518887996672, "num_tokens": 23066419.0, "step": 12500 }, { "entropy": 5.765432214736938, "epoch": 1.0505776097458517, "grad_norm": 1.453125, "learning_rate": 0.0004895325158229346, "loss": 5.5385, "mean_token_accuracy": 0.1619092509150505, "num_tokens": 23075516.0, "step": 12505 }, { "entropy": 5.721098899841309, "epoch": 1.0509976895610167, "grad_norm": 1.875, "learning_rate": 0.0004895234861436136, "loss": 5.4198, "mean_token_accuracy": 0.1697925642132759, "num_tokens": 23084132.0, "step": 12510 }, { "entropy": 5.783330011367798, "epoch": 1.0514177693761815, "grad_norm": 1.9921875, "learning_rate": 0.0004895144526641194, "loss": 5.5043, "mean_token_accuracy": 0.16086599081754685, "num_tokens": 23093958.0, "step": 12515 }, { "entropy": 5.783671569824219, "epoch": 1.0518378491913463, "grad_norm": 1.5546875, "learning_rate": 0.0004895054153846123, "loss": 5.5409, "mean_token_accuracy": 0.1583005540072918, "num_tokens": 23103524.0, "step": 12520 }, { "entropy": 5.631361865997315, "epoch": 1.0522579290065113, "grad_norm": 1.4609375, "learning_rate": 0.0004894963743052521, "loss": 5.451, "mean_token_accuracy": 0.15645991861820222, "num_tokens": 23112445.0, "step": 12525 }, { "entropy": 5.749525880813598, "epoch": 1.052678008821676, "grad_norm": 1.6796875, "learning_rate": 0.0004894873294261991, "loss": 5.5179, "mean_token_accuracy": 0.15921320170164108, "num_tokens": 23121299.0, "step": 12530 }, { "entropy": 5.767481660842895, "epoch": 1.053098088636841, "grad_norm": 1.4140625, "learning_rate": 0.0004894782807476134, "loss": 5.5333, "mean_token_accuracy": 0.1522089034318924, "num_tokens": 23130260.0, "step": 12535 }, { "entropy": 5.74699182510376, "epoch": 1.0535181684520059, "grad_norm": 1.4140625, "learning_rate": 0.0004894692282696555, "loss": 5.4622, "mean_token_accuracy": 0.16261095851659774, "num_tokens": 23139335.0, "step": 12540 }, { "entropy": 5.656941652297974, "epoch": 1.0539382482671709, "grad_norm": 1.609375, "learning_rate": 0.0004894601719924857, "loss": 5.4648, "mean_token_accuracy": 0.16428751796483992, "num_tokens": 23149299.0, "step": 12545 }, { "entropy": 5.598066186904907, "epoch": 1.0543583280823356, "grad_norm": 1.4609375, "learning_rate": 0.0004894511119162644, "loss": 5.414, "mean_token_accuracy": 0.16812524497509002, "num_tokens": 23158651.0, "step": 12550 }, { "entropy": 5.759066200256347, "epoch": 1.0547784078975004, "grad_norm": 1.375, "learning_rate": 0.000489442048041152, "loss": 5.5022, "mean_token_accuracy": 0.15415302515029908, "num_tokens": 23167629.0, "step": 12555 }, { "entropy": 5.7550407409667965, "epoch": 1.0551984877126654, "grad_norm": 1.640625, "learning_rate": 0.0004894329803673092, "loss": 5.4926, "mean_token_accuracy": 0.15900574922561644, "num_tokens": 23177026.0, "step": 12560 }, { "entropy": 5.696121501922607, "epoch": 1.0556185675278302, "grad_norm": 1.6328125, "learning_rate": 0.0004894239088948964, "loss": 5.4628, "mean_token_accuracy": 0.1633963868021965, "num_tokens": 23185297.0, "step": 12565 }, { "entropy": 5.640616607666016, "epoch": 1.0560386473429952, "grad_norm": 1.5546875, "learning_rate": 0.0004894148336240747, "loss": 5.4745, "mean_token_accuracy": 0.1665568009018898, "num_tokens": 23194804.0, "step": 12570 }, { "entropy": 5.749676895141602, "epoch": 1.05645872715816, "grad_norm": 1.5390625, "learning_rate": 0.0004894057545550045, "loss": 5.5094, "mean_token_accuracy": 0.15964649617671967, "num_tokens": 23205063.0, "step": 12575 }, { "entropy": 5.691761779785156, "epoch": 1.056878806973325, "grad_norm": 1.515625, "learning_rate": 0.0004893966716878467, "loss": 5.4411, "mean_token_accuracy": 0.15895105600357057, "num_tokens": 23215038.0, "step": 12580 }, { "entropy": 5.763622140884399, "epoch": 1.0572988867884898, "grad_norm": 1.4140625, "learning_rate": 0.0004893875850227624, "loss": 5.614, "mean_token_accuracy": 0.151802134513855, "num_tokens": 23223530.0, "step": 12585 }, { "entropy": 5.740535259246826, "epoch": 1.0577189666036546, "grad_norm": 1.609375, "learning_rate": 0.0004893784945599124, "loss": 5.5385, "mean_token_accuracy": 0.16195468753576278, "num_tokens": 23232547.0, "step": 12590 }, { "entropy": 5.704318332672119, "epoch": 1.0581390464188196, "grad_norm": 1.609375, "learning_rate": 0.0004893694002994577, "loss": 5.5753, "mean_token_accuracy": 0.16065402403473855, "num_tokens": 23241305.0, "step": 12595 }, { "entropy": 5.854096460342407, "epoch": 1.0585591262339844, "grad_norm": 1.5234375, "learning_rate": 0.0004893603022415595, "loss": 5.6043, "mean_token_accuracy": 0.1608058363199234, "num_tokens": 23250708.0, "step": 12600 }, { "entropy": 5.792645645141602, "epoch": 1.0589792060491494, "grad_norm": 1.609375, "learning_rate": 0.0004893512003863788, "loss": 5.5117, "mean_token_accuracy": 0.15551188662648202, "num_tokens": 23260161.0, "step": 12605 }, { "entropy": 5.682678604125977, "epoch": 1.0593992858643142, "grad_norm": 1.3984375, "learning_rate": 0.0004893420947340771, "loss": 5.4161, "mean_token_accuracy": 0.1580376446247101, "num_tokens": 23268932.0, "step": 12610 }, { "entropy": 5.680995082855224, "epoch": 1.0598193656794792, "grad_norm": 1.6640625, "learning_rate": 0.0004893329852848155, "loss": 5.5111, "mean_token_accuracy": 0.16238304674625398, "num_tokens": 23277741.0, "step": 12615 }, { "entropy": 5.706674957275391, "epoch": 1.060239445494644, "grad_norm": 1.5546875, "learning_rate": 0.0004893238720387555, "loss": 5.5094, "mean_token_accuracy": 0.16012922972440718, "num_tokens": 23286982.0, "step": 12620 }, { "entropy": 5.713710308074951, "epoch": 1.0606595253098088, "grad_norm": 1.8671875, "learning_rate": 0.0004893147549960584, "loss": 5.4361, "mean_token_accuracy": 0.16573359668254853, "num_tokens": 23296902.0, "step": 12625 }, { "entropy": 5.671449041366577, "epoch": 1.0610796051249738, "grad_norm": 1.6171875, "learning_rate": 0.0004893056341568857, "loss": 5.4432, "mean_token_accuracy": 0.16855929046869278, "num_tokens": 23305443.0, "step": 12630 }, { "entropy": 5.694199895858764, "epoch": 1.0614996849401386, "grad_norm": 1.796875, "learning_rate": 0.0004892965095213992, "loss": 5.4203, "mean_token_accuracy": 0.16460745334625243, "num_tokens": 23315420.0, "step": 12635 }, { "entropy": 5.759862661361694, "epoch": 1.0619197647553036, "grad_norm": 1.375, "learning_rate": 0.0004892873810897604, "loss": 5.5089, "mean_token_accuracy": 0.1558899015188217, "num_tokens": 23324540.0, "step": 12640 }, { "entropy": 5.739316987991333, "epoch": 1.0623398445704684, "grad_norm": 2.25, "learning_rate": 0.0004892782488621308, "loss": 5.4567, "mean_token_accuracy": 0.16644190847873688, "num_tokens": 23334282.0, "step": 12645 }, { "entropy": 5.712379074096679, "epoch": 1.0627599243856332, "grad_norm": 1.59375, "learning_rate": 0.0004892691128386725, "loss": 5.453, "mean_token_accuracy": 0.1627206951379776, "num_tokens": 23342836.0, "step": 12650 }, { "entropy": 5.702242517471314, "epoch": 1.0631800042007982, "grad_norm": 1.71875, "learning_rate": 0.0004892599730195471, "loss": 5.4406, "mean_token_accuracy": 0.16527725458145143, "num_tokens": 23351863.0, "step": 12655 }, { "entropy": 5.809025621414184, "epoch": 1.063600084015963, "grad_norm": 2.0, "learning_rate": 0.0004892508294049167, "loss": 5.6074, "mean_token_accuracy": 0.1642581820487976, "num_tokens": 23361788.0, "step": 12660 }, { "entropy": 5.716249179840088, "epoch": 1.064020163831128, "grad_norm": 1.421875, "learning_rate": 0.0004892416819949431, "loss": 5.4403, "mean_token_accuracy": 0.15782576352357863, "num_tokens": 23370175.0, "step": 12665 }, { "entropy": 5.668329477310181, "epoch": 1.0644402436462927, "grad_norm": 1.671875, "learning_rate": 0.0004892325307897886, "loss": 5.4826, "mean_token_accuracy": 0.16445921808481218, "num_tokens": 23378835.0, "step": 12670 }, { "entropy": 5.684893798828125, "epoch": 1.0648603234614578, "grad_norm": 1.578125, "learning_rate": 0.0004892233757896149, "loss": 5.4898, "mean_token_accuracy": 0.16239043474197387, "num_tokens": 23389390.0, "step": 12675 }, { "entropy": 5.731085300445557, "epoch": 1.0652804032766225, "grad_norm": 1.4765625, "learning_rate": 0.0004892142169945845, "loss": 5.4812, "mean_token_accuracy": 0.15869970321655275, "num_tokens": 23398802.0, "step": 12680 }, { "entropy": 5.663789510726929, "epoch": 1.0657004830917876, "grad_norm": 1.6640625, "learning_rate": 0.0004892050544048596, "loss": 5.4592, "mean_token_accuracy": 0.16194516718387603, "num_tokens": 23407731.0, "step": 12685 }, { "entropy": 5.708717679977417, "epoch": 1.0661205629069523, "grad_norm": 1.59375, "learning_rate": 0.0004891958880206024, "loss": 5.5059, "mean_token_accuracy": 0.15976526141166686, "num_tokens": 23417046.0, "step": 12690 }, { "entropy": 5.7145740509033205, "epoch": 1.0665406427221171, "grad_norm": 1.7890625, "learning_rate": 0.0004891867178419753, "loss": 5.5009, "mean_token_accuracy": 0.1623055413365364, "num_tokens": 23426107.0, "step": 12695 }, { "entropy": 5.758947944641113, "epoch": 1.0669607225372821, "grad_norm": 1.53125, "learning_rate": 0.0004891775438691408, "loss": 5.5391, "mean_token_accuracy": 0.1586405709385872, "num_tokens": 23435523.0, "step": 12700 }, { "entropy": 5.691416501998901, "epoch": 1.067380802352447, "grad_norm": 2.515625, "learning_rate": 0.0004891683661022615, "loss": 5.4907, "mean_token_accuracy": 0.16506600081920625, "num_tokens": 23444185.0, "step": 12705 }, { "entropy": 5.812458419799805, "epoch": 1.067800882167612, "grad_norm": 1.7578125, "learning_rate": 0.0004891591845414997, "loss": 5.678, "mean_token_accuracy": 0.14658654034137725, "num_tokens": 23454100.0, "step": 12710 }, { "entropy": 5.816659593582154, "epoch": 1.0682209619827767, "grad_norm": 1.5703125, "learning_rate": 0.0004891499991870184, "loss": 5.5766, "mean_token_accuracy": 0.15168848782777786, "num_tokens": 23463415.0, "step": 12715 }, { "entropy": 5.723210430145263, "epoch": 1.0686410417979415, "grad_norm": 1.5859375, "learning_rate": 0.00048914081003898, "loss": 5.4731, "mean_token_accuracy": 0.15874896347522735, "num_tokens": 23471515.0, "step": 12720 }, { "entropy": 5.743414497375488, "epoch": 1.0690611216131065, "grad_norm": 1.6015625, "learning_rate": 0.0004891316170975475, "loss": 5.5173, "mean_token_accuracy": 0.15784869194030762, "num_tokens": 23481696.0, "step": 12725 }, { "entropy": 5.7783526420593265, "epoch": 1.0694812014282713, "grad_norm": 1.609375, "learning_rate": 0.0004891224203628836, "loss": 5.4774, "mean_token_accuracy": 0.16449615508317947, "num_tokens": 23490714.0, "step": 12730 }, { "entropy": 5.63666844367981, "epoch": 1.0699012812434363, "grad_norm": 1.734375, "learning_rate": 0.0004891132198351514, "loss": 5.4621, "mean_token_accuracy": 0.1659099578857422, "num_tokens": 23500368.0, "step": 12735 }, { "entropy": 5.526670217514038, "epoch": 1.070321361058601, "grad_norm": 1.90625, "learning_rate": 0.0004891040155145137, "loss": 5.4048, "mean_token_accuracy": 0.17042581588029862, "num_tokens": 23508857.0, "step": 12740 }, { "entropy": 5.627542209625244, "epoch": 1.070741440873766, "grad_norm": 2.328125, "learning_rate": 0.0004890948074011335, "loss": 5.3897, "mean_token_accuracy": 0.17012525349855423, "num_tokens": 23518128.0, "step": 12745 }, { "entropy": 5.748180818557739, "epoch": 1.071161520688931, "grad_norm": 1.84375, "learning_rate": 0.0004890855954951741, "loss": 5.4948, "mean_token_accuracy": 0.16303456127643584, "num_tokens": 23527292.0, "step": 12750 }, { "entropy": 5.744745492935181, "epoch": 1.0715816005040957, "grad_norm": 1.7578125, "learning_rate": 0.0004890763797967987, "loss": 5.4885, "mean_token_accuracy": 0.16271119713783264, "num_tokens": 23535694.0, "step": 12755 }, { "entropy": 5.706960821151734, "epoch": 1.0720016803192607, "grad_norm": 1.9140625, "learning_rate": 0.0004890671603061704, "loss": 5.4966, "mean_token_accuracy": 0.15939076095819474, "num_tokens": 23544766.0, "step": 12760 }, { "entropy": 5.706810760498047, "epoch": 1.0724217601344255, "grad_norm": 1.59375, "learning_rate": 0.0004890579370234526, "loss": 5.4554, "mean_token_accuracy": 0.1673600748181343, "num_tokens": 23554037.0, "step": 12765 }, { "entropy": 5.774952697753906, "epoch": 1.0728418399495905, "grad_norm": 1.5703125, "learning_rate": 0.0004890487099488086, "loss": 5.5179, "mean_token_accuracy": 0.15788703113794328, "num_tokens": 23562282.0, "step": 12770 }, { "entropy": 5.792991018295288, "epoch": 1.0732619197647553, "grad_norm": 1.71875, "learning_rate": 0.000489039479082402, "loss": 5.5865, "mean_token_accuracy": 0.15591855943202973, "num_tokens": 23571955.0, "step": 12775 }, { "entropy": 5.676628351211548, "epoch": 1.0736819995799203, "grad_norm": 2.265625, "learning_rate": 0.0004890302444243962, "loss": 5.4755, "mean_token_accuracy": 0.15936234593391418, "num_tokens": 23580996.0, "step": 12780 }, { "entropy": 5.745807313919068, "epoch": 1.074102079395085, "grad_norm": 1.4609375, "learning_rate": 0.0004890210059749549, "loss": 5.5674, "mean_token_accuracy": 0.1499895855784416, "num_tokens": 23589618.0, "step": 12785 }, { "entropy": 5.733888244628906, "epoch": 1.0745221592102498, "grad_norm": 1.6171875, "learning_rate": 0.0004890117637342416, "loss": 5.4154, "mean_token_accuracy": 0.1605689197778702, "num_tokens": 23599574.0, "step": 12790 }, { "entropy": 5.7341142177581785, "epoch": 1.0749422390254149, "grad_norm": 1.8984375, "learning_rate": 0.0004890025177024202, "loss": 5.486, "mean_token_accuracy": 0.15659692734479905, "num_tokens": 23609205.0, "step": 12795 }, { "entropy": 5.678049373626709, "epoch": 1.0753623188405796, "grad_norm": 1.875, "learning_rate": 0.0004889932678796543, "loss": 5.5044, "mean_token_accuracy": 0.15572902113199233, "num_tokens": 23617554.0, "step": 12800 }, { "entropy": 5.7471010208129885, "epoch": 1.0757823986557447, "grad_norm": 1.5234375, "learning_rate": 0.0004889840142661078, "loss": 5.5599, "mean_token_accuracy": 0.1572861537337303, "num_tokens": 23626757.0, "step": 12805 }, { "entropy": 5.770623016357422, "epoch": 1.0762024784709094, "grad_norm": 1.6875, "learning_rate": 0.0004889747568619447, "loss": 5.5106, "mean_token_accuracy": 0.1615568682551384, "num_tokens": 23636111.0, "step": 12810 }, { "entropy": 5.72378830909729, "epoch": 1.0766225582860744, "grad_norm": 1.5546875, "learning_rate": 0.0004889654956673291, "loss": 5.494, "mean_token_accuracy": 0.16236085295677186, "num_tokens": 23644579.0, "step": 12815 }, { "entropy": 5.700385427474975, "epoch": 1.0770426381012392, "grad_norm": 1.4609375, "learning_rate": 0.0004889562306824248, "loss": 5.4095, "mean_token_accuracy": 0.1597435638308525, "num_tokens": 23653263.0, "step": 12820 }, { "entropy": 5.591032648086548, "epoch": 1.077462717916404, "grad_norm": 1.7265625, "learning_rate": 0.000488946961907396, "loss": 5.3843, "mean_token_accuracy": 0.1746201902627945, "num_tokens": 23662529.0, "step": 12825 }, { "entropy": 5.608241891860962, "epoch": 1.077882797731569, "grad_norm": 1.75, "learning_rate": 0.0004889376893424071, "loss": 5.421, "mean_token_accuracy": 0.1713373154401779, "num_tokens": 23671491.0, "step": 12830 }, { "entropy": 5.640907621383667, "epoch": 1.0783028775467338, "grad_norm": 1.703125, "learning_rate": 0.0004889284129876221, "loss": 5.4005, "mean_token_accuracy": 0.15988982617855071, "num_tokens": 23680121.0, "step": 12835 }, { "entropy": 5.662772226333618, "epoch": 1.0787229573618988, "grad_norm": 1.453125, "learning_rate": 0.0004889191328432054, "loss": 5.4614, "mean_token_accuracy": 0.16260750889778136, "num_tokens": 23689008.0, "step": 12840 }, { "entropy": 5.742505121231079, "epoch": 1.0791430371770636, "grad_norm": 1.7421875, "learning_rate": 0.0004889098489093215, "loss": 5.5053, "mean_token_accuracy": 0.1597042962908745, "num_tokens": 23698551.0, "step": 12845 }, { "entropy": 5.8218427181243895, "epoch": 1.0795631169922286, "grad_norm": 1.6875, "learning_rate": 0.0004889005611861347, "loss": 5.6635, "mean_token_accuracy": 0.15463445335626602, "num_tokens": 23707438.0, "step": 12850 }, { "entropy": 5.734436941146851, "epoch": 1.0799831968073934, "grad_norm": 2.046875, "learning_rate": 0.0004888912696738096, "loss": 5.5045, "mean_token_accuracy": 0.16258185505867004, "num_tokens": 23715822.0, "step": 12855 }, { "entropy": 5.743537902832031, "epoch": 1.0804032766225582, "grad_norm": 1.6796875, "learning_rate": 0.0004888819743725108, "loss": 5.5265, "mean_token_accuracy": 0.1599157154560089, "num_tokens": 23725426.0, "step": 12860 }, { "entropy": 5.762011289596558, "epoch": 1.0808233564377232, "grad_norm": 1.46875, "learning_rate": 0.000488872675282403, "loss": 5.5143, "mean_token_accuracy": 0.16166198402643203, "num_tokens": 23735092.0, "step": 12865 }, { "entropy": 5.754138803482055, "epoch": 1.081243436252888, "grad_norm": 1.7734375, "learning_rate": 0.0004888633724036509, "loss": 5.5018, "mean_token_accuracy": 0.16174346208572388, "num_tokens": 23744255.0, "step": 12870 }, { "entropy": 5.657329463958741, "epoch": 1.081663516068053, "grad_norm": 1.546875, "learning_rate": 0.0004888540657364192, "loss": 5.3593, "mean_token_accuracy": 0.1702010914683342, "num_tokens": 23752978.0, "step": 12875 }, { "entropy": 5.6576941967010494, "epoch": 1.0820835958832178, "grad_norm": 1.5625, "learning_rate": 0.0004888447552808729, "loss": 5.4421, "mean_token_accuracy": 0.16415699273347856, "num_tokens": 23761051.0, "step": 12880 }, { "entropy": 5.763893032073975, "epoch": 1.0825036756983828, "grad_norm": 2.15625, "learning_rate": 0.0004888354410371768, "loss": 5.5546, "mean_token_accuracy": 0.15805445313453675, "num_tokens": 23770818.0, "step": 12885 }, { "entropy": 5.810835695266723, "epoch": 1.0829237555135476, "grad_norm": 1.9375, "learning_rate": 0.000488826123005496, "loss": 5.5714, "mean_token_accuracy": 0.16120226234197615, "num_tokens": 23780597.0, "step": 12890 }, { "entropy": 5.69043231010437, "epoch": 1.0833438353287124, "grad_norm": 3.21875, "learning_rate": 0.0004888168011859957, "loss": 5.4083, "mean_token_accuracy": 0.16143542230129243, "num_tokens": 23790119.0, "step": 12895 }, { "entropy": 5.687187528610229, "epoch": 1.0837639151438774, "grad_norm": 2.109375, "learning_rate": 0.0004888074755788407, "loss": 5.4772, "mean_token_accuracy": 0.16725920587778093, "num_tokens": 23798972.0, "step": 12900 }, { "entropy": 5.722570514678955, "epoch": 1.0841839949590422, "grad_norm": 2.578125, "learning_rate": 0.0004887981461841963, "loss": 5.4527, "mean_token_accuracy": 0.17206404507160186, "num_tokens": 23808685.0, "step": 12905 }, { "entropy": 5.765744590759278, "epoch": 1.0846040747742072, "grad_norm": 1.6875, "learning_rate": 0.0004887888130022279, "loss": 5.4663, "mean_token_accuracy": 0.16214465647935866, "num_tokens": 23817721.0, "step": 12910 }, { "entropy": 5.631449890136719, "epoch": 1.085024154589372, "grad_norm": 1.6875, "learning_rate": 0.0004887794760331008, "loss": 5.4193, "mean_token_accuracy": 0.16689784675836564, "num_tokens": 23826892.0, "step": 12915 }, { "entropy": 5.683791780471802, "epoch": 1.085444234404537, "grad_norm": 1.8359375, "learning_rate": 0.0004887701352769804, "loss": 5.3724, "mean_token_accuracy": 0.17175290137529373, "num_tokens": 23835717.0, "step": 12920 }, { "entropy": 5.697872066497803, "epoch": 1.0858643142197018, "grad_norm": 1.625, "learning_rate": 0.000488760790734032, "loss": 5.472, "mean_token_accuracy": 0.16542189866304396, "num_tokens": 23845814.0, "step": 12925 }, { "entropy": 5.738125276565552, "epoch": 1.0862843940348665, "grad_norm": 1.8046875, "learning_rate": 0.0004887514424044214, "loss": 5.4563, "mean_token_accuracy": 0.153540675342083, "num_tokens": 23854779.0, "step": 12930 }, { "entropy": 5.688271474838257, "epoch": 1.0867044738500315, "grad_norm": 2.09375, "learning_rate": 0.000488742090288314, "loss": 5.5074, "mean_token_accuracy": 0.16052113920450212, "num_tokens": 23863533.0, "step": 12935 }, { "entropy": 5.7345654487609865, "epoch": 1.0871245536651963, "grad_norm": 2.09375, "learning_rate": 0.0004887327343858755, "loss": 5.5325, "mean_token_accuracy": 0.1583286091685295, "num_tokens": 23872725.0, "step": 12940 }, { "entropy": 5.735647916793823, "epoch": 1.0875446334803613, "grad_norm": 1.890625, "learning_rate": 0.0004887233746972717, "loss": 5.5094, "mean_token_accuracy": 0.1608467683196068, "num_tokens": 23881799.0, "step": 12945 }, { "entropy": 5.736598634719849, "epoch": 1.0879647132955261, "grad_norm": 1.75, "learning_rate": 0.0004887140112226684, "loss": 5.5438, "mean_token_accuracy": 0.15989564061164857, "num_tokens": 23890628.0, "step": 12950 }, { "entropy": 5.667224788665772, "epoch": 1.088384793110691, "grad_norm": 3.46875, "learning_rate": 0.0004887046439622314, "loss": 5.5216, "mean_token_accuracy": 0.16750244051218033, "num_tokens": 23899968.0, "step": 12955 }, { "entropy": 5.747261238098145, "epoch": 1.088804872925856, "grad_norm": 2.421875, "learning_rate": 0.0004886952729161267, "loss": 5.3932, "mean_token_accuracy": 0.16512321233749389, "num_tokens": 23908634.0, "step": 12960 }, { "entropy": 5.779636716842651, "epoch": 1.0892249527410207, "grad_norm": 5.5625, "learning_rate": 0.0004886858980845202, "loss": 5.5616, "mean_token_accuracy": 0.15966024100780488, "num_tokens": 23917925.0, "step": 12965 }, { "entropy": 5.65394434928894, "epoch": 1.0896450325561857, "grad_norm": 2.90625, "learning_rate": 0.0004886765194675782, "loss": 5.4445, "mean_token_accuracy": 0.1655475303530693, "num_tokens": 23927173.0, "step": 12970 }, { "entropy": 5.667041397094726, "epoch": 1.0900651123713505, "grad_norm": 1.546875, "learning_rate": 0.0004886671370654665, "loss": 5.4196, "mean_token_accuracy": 0.1653660088777542, "num_tokens": 23936258.0, "step": 12975 }, { "entropy": 5.676847219467163, "epoch": 1.0904851921865155, "grad_norm": 1.5703125, "learning_rate": 0.0004886577508783516, "loss": 5.3862, "mean_token_accuracy": 0.16707207411527633, "num_tokens": 23944215.0, "step": 12980 }, { "entropy": 5.730111455917358, "epoch": 1.0909052720016803, "grad_norm": 3.203125, "learning_rate": 0.0004886483609063997, "loss": 5.4505, "mean_token_accuracy": 0.16068692207336427, "num_tokens": 23953151.0, "step": 12985 }, { "entropy": 5.592217302322387, "epoch": 1.0913253518168453, "grad_norm": 2.359375, "learning_rate": 0.0004886389671497769, "loss": 5.4724, "mean_token_accuracy": 0.16959808766841888, "num_tokens": 23962919.0, "step": 12990 }, { "entropy": 5.735597896575928, "epoch": 1.09174543163201, "grad_norm": 2.53125, "learning_rate": 0.00048862956960865, "loss": 5.4779, "mean_token_accuracy": 0.15886924266815186, "num_tokens": 23971900.0, "step": 12995 }, { "entropy": 5.7348557472229, "epoch": 1.0921655114471749, "grad_norm": 1.75, "learning_rate": 0.0004886201682831852, "loss": 5.4471, "mean_token_accuracy": 0.16426561921834945, "num_tokens": 23980945.0, "step": 13000 }, { "entropy": 5.678046464920044, "epoch": 1.09258559126234, "grad_norm": 1.7421875, "learning_rate": 0.0004886107631735491, "loss": 5.4056, "mean_token_accuracy": 0.16405817568302156, "num_tokens": 23990460.0, "step": 13005 }, { "entropy": 5.714896297454834, "epoch": 1.0930056710775047, "grad_norm": 1.6796875, "learning_rate": 0.0004886013542799083, "loss": 5.5804, "mean_token_accuracy": 0.15213673710823059, "num_tokens": 23999925.0, "step": 13010 }, { "entropy": 5.658804130554199, "epoch": 1.0934257508926697, "grad_norm": 1.6171875, "learning_rate": 0.0004885919416024296, "loss": 5.4217, "mean_token_accuracy": 0.1613025948405266, "num_tokens": 24009039.0, "step": 13015 }, { "entropy": 5.759115076065063, "epoch": 1.0938458307078345, "grad_norm": 1.7421875, "learning_rate": 0.0004885825251412796, "loss": 5.4736, "mean_token_accuracy": 0.16182312816381456, "num_tokens": 24017725.0, "step": 13020 }, { "entropy": 5.735840749740601, "epoch": 1.0942659105229993, "grad_norm": 1.71875, "learning_rate": 0.0004885731048966252, "loss": 5.503, "mean_token_accuracy": 0.1575954094529152, "num_tokens": 24027158.0, "step": 13025 }, { "entropy": 5.6926501274108885, "epoch": 1.0946859903381643, "grad_norm": 1.890625, "learning_rate": 0.0004885636808686331, "loss": 5.5293, "mean_token_accuracy": 0.16384944021701814, "num_tokens": 24037224.0, "step": 13030 }, { "entropy": 5.738328456878662, "epoch": 1.095106070153329, "grad_norm": 1.6328125, "learning_rate": 0.0004885542530574705, "loss": 5.5052, "mean_token_accuracy": 0.1625734105706215, "num_tokens": 24046097.0, "step": 13035 }, { "entropy": 5.693251371383667, "epoch": 1.095526149968494, "grad_norm": 2.1875, "learning_rate": 0.0004885448214633042, "loss": 5.4044, "mean_token_accuracy": 0.1620977535843849, "num_tokens": 24055270.0, "step": 13040 }, { "entropy": 5.73248519897461, "epoch": 1.0959462297836589, "grad_norm": 1.6953125, "learning_rate": 0.0004885353860863013, "loss": 5.5641, "mean_token_accuracy": 0.15346422791481018, "num_tokens": 24064995.0, "step": 13045 }, { "entropy": 5.779075717926025, "epoch": 1.0963663095988239, "grad_norm": 1.921875, "learning_rate": 0.000488525946926629, "loss": 5.6107, "mean_token_accuracy": 0.15505203902721404, "num_tokens": 24075523.0, "step": 13050 }, { "entropy": 5.737312889099121, "epoch": 1.0967863894139886, "grad_norm": 1.8359375, "learning_rate": 0.0004885165039844545, "loss": 5.4789, "mean_token_accuracy": 0.16420630365610123, "num_tokens": 24084933.0, "step": 13055 }, { "entropy": 5.698319673538208, "epoch": 1.0972064692291534, "grad_norm": 1.5703125, "learning_rate": 0.0004885070572599452, "loss": 5.503, "mean_token_accuracy": 0.15282038301229478, "num_tokens": 24093964.0, "step": 13060 }, { "entropy": 5.724235200881958, "epoch": 1.0976265490443184, "grad_norm": 1.7265625, "learning_rate": 0.0004884976067532681, "loss": 5.452, "mean_token_accuracy": 0.15377498120069505, "num_tokens": 24103951.0, "step": 13065 }, { "entropy": 5.679253768920899, "epoch": 1.0980466288594832, "grad_norm": 1.5234375, "learning_rate": 0.000488488152464591, "loss": 5.5711, "mean_token_accuracy": 0.15378451496362686, "num_tokens": 24113392.0, "step": 13070 }, { "entropy": 5.718753099441528, "epoch": 1.0984667086746482, "grad_norm": 2.171875, "learning_rate": 0.0004884786943940812, "loss": 5.4403, "mean_token_accuracy": 0.15815389901399612, "num_tokens": 24123165.0, "step": 13075 }, { "entropy": 5.713952112197876, "epoch": 1.098886788489813, "grad_norm": 1.5390625, "learning_rate": 0.0004884692325419063, "loss": 5.479, "mean_token_accuracy": 0.15968940854072572, "num_tokens": 24132176.0, "step": 13080 }, { "entropy": 5.682787561416626, "epoch": 1.099306868304978, "grad_norm": 1.6484375, "learning_rate": 0.0004884597669082336, "loss": 5.5387, "mean_token_accuracy": 0.15351806879043578, "num_tokens": 24141737.0, "step": 13085 }, { "entropy": 5.712856578826904, "epoch": 1.0997269481201428, "grad_norm": 1.5078125, "learning_rate": 0.0004884502974932313, "loss": 5.4785, "mean_token_accuracy": 0.16513199657201766, "num_tokens": 24150477.0, "step": 13090 }, { "entropy": 5.806832218170166, "epoch": 1.1001470279353076, "grad_norm": 1.9453125, "learning_rate": 0.0004884408242970668, "loss": 5.5721, "mean_token_accuracy": 0.15941140204668044, "num_tokens": 24158739.0, "step": 13095 }, { "entropy": 5.651869440078736, "epoch": 1.1005671077504726, "grad_norm": 1.4375, "learning_rate": 0.0004884313473199081, "loss": 5.4125, "mean_token_accuracy": 0.16672947108745576, "num_tokens": 24167511.0, "step": 13100 }, { "entropy": 5.65992579460144, "epoch": 1.1009871875656374, "grad_norm": 1.3984375, "learning_rate": 0.0004884218665619229, "loss": 5.4252, "mean_token_accuracy": 0.1618572935461998, "num_tokens": 24176413.0, "step": 13105 }, { "entropy": 5.691173410415649, "epoch": 1.1014072673808024, "grad_norm": 1.4296875, "learning_rate": 0.0004884123820232792, "loss": 5.3662, "mean_token_accuracy": 0.17088967561721802, "num_tokens": 24185135.0, "step": 13110 }, { "entropy": 5.688163566589355, "epoch": 1.1018273471959672, "grad_norm": 1.390625, "learning_rate": 0.0004884028937041451, "loss": 5.4519, "mean_token_accuracy": 0.16832585632801056, "num_tokens": 24193273.0, "step": 13115 }, { "entropy": 5.6593669891357425, "epoch": 1.1022474270111322, "grad_norm": 1.5078125, "learning_rate": 0.0004883934016046886, "loss": 5.5176, "mean_token_accuracy": 0.15427347868680955, "num_tokens": 24202509.0, "step": 13120 }, { "entropy": 5.783310222625732, "epoch": 1.102667506826297, "grad_norm": 1.3984375, "learning_rate": 0.000488383905725078, "loss": 5.5096, "mean_token_accuracy": 0.15751553177833558, "num_tokens": 24212644.0, "step": 13125 }, { "entropy": 5.7312768459320065, "epoch": 1.1030875866414618, "grad_norm": 1.453125, "learning_rate": 0.0004883744060654811, "loss": 5.4135, "mean_token_accuracy": 0.16013285517692566, "num_tokens": 24221838.0, "step": 13130 }, { "entropy": 5.651692819595337, "epoch": 1.1035076664566268, "grad_norm": 1.96875, "learning_rate": 0.0004883649026260667, "loss": 5.4813, "mean_token_accuracy": 0.16545673757791518, "num_tokens": 24230987.0, "step": 13135 }, { "entropy": 5.6557066440582275, "epoch": 1.1039277462717916, "grad_norm": 1.5546875, "learning_rate": 0.0004883553954070028, "loss": 5.4491, "mean_token_accuracy": 0.16192631274461747, "num_tokens": 24240523.0, "step": 13140 }, { "entropy": 5.724129486083984, "epoch": 1.1043478260869566, "grad_norm": 1.5078125, "learning_rate": 0.000488345884408458, "loss": 5.5421, "mean_token_accuracy": 0.1672690689563751, "num_tokens": 24249799.0, "step": 13145 }, { "entropy": 5.723703002929687, "epoch": 1.1047679059021214, "grad_norm": 1.5859375, "learning_rate": 0.0004883363696306007, "loss": 5.4621, "mean_token_accuracy": 0.1656269609928131, "num_tokens": 24259361.0, "step": 13150 }, { "entropy": 5.718085622787475, "epoch": 1.1051879857172864, "grad_norm": 1.8984375, "learning_rate": 0.0004883268510735995, "loss": 5.4368, "mean_token_accuracy": 0.15831930935382843, "num_tokens": 24268010.0, "step": 13155 }, { "entropy": 5.5756614208221436, "epoch": 1.1056080655324512, "grad_norm": 1.7109375, "learning_rate": 0.0004883173287376229, "loss": 5.4839, "mean_token_accuracy": 0.1586616076529026, "num_tokens": 24277416.0, "step": 13160 }, { "entropy": 5.7934998035430905, "epoch": 1.106028145347616, "grad_norm": 1.578125, "learning_rate": 0.0004883078026228397, "loss": 5.5608, "mean_token_accuracy": 0.16097336113452912, "num_tokens": 24286185.0, "step": 13165 }, { "entropy": 5.741655588150024, "epoch": 1.106448225162781, "grad_norm": 1.75, "learning_rate": 0.0004882982727294187, "loss": 5.428, "mean_token_accuracy": 0.1603280246257782, "num_tokens": 24295382.0, "step": 13170 }, { "entropy": 5.676276683807373, "epoch": 1.1068683049779457, "grad_norm": 5.4375, "learning_rate": 0.0004882887390575284, "loss": 5.4468, "mean_token_accuracy": 0.1647188439965248, "num_tokens": 24305197.0, "step": 13175 }, { "entropy": 5.706903171539307, "epoch": 1.1072883847931108, "grad_norm": 1.9765625, "learning_rate": 0.0004882792016073381, "loss": 5.541, "mean_token_accuracy": 0.15018792897462846, "num_tokens": 24314149.0, "step": 13180 }, { "entropy": 5.755481195449829, "epoch": 1.1077084646082755, "grad_norm": 1.5078125, "learning_rate": 0.00048826966037901655, "loss": 5.4681, "mean_token_accuracy": 0.1623881921172142, "num_tokens": 24323737.0, "step": 13185 }, { "entropy": 5.675417709350586, "epoch": 1.1081285444234406, "grad_norm": 2.5, "learning_rate": 0.00048826011537273276, "loss": 5.4406, "mean_token_accuracy": 0.1623774915933609, "num_tokens": 24332853.0, "step": 13190 }, { "entropy": 5.705647706985474, "epoch": 1.1085486242386053, "grad_norm": 1.7421875, "learning_rate": 0.0004882505665886558, "loss": 5.5693, "mean_token_accuracy": 0.15558527559041976, "num_tokens": 24342632.0, "step": 13195 }, { "entropy": 5.6826183795928955, "epoch": 1.1089687040537701, "grad_norm": 1.5546875, "learning_rate": 0.00048824101402695493, "loss": 5.4113, "mean_token_accuracy": 0.16228149831295013, "num_tokens": 24351659.0, "step": 13200 }, { "entropy": 5.612444162368774, "epoch": 1.1093887838689351, "grad_norm": 2.171875, "learning_rate": 0.0004882314576877993, "loss": 5.4479, "mean_token_accuracy": 0.1650165230035782, "num_tokens": 24360938.0, "step": 13205 }, { "entropy": 5.7091968059539795, "epoch": 1.1098088636841, "grad_norm": 1.5390625, "learning_rate": 0.0004882218975713581, "loss": 5.5041, "mean_token_accuracy": 0.1613766685128212, "num_tokens": 24369603.0, "step": 13210 }, { "entropy": 5.702196216583252, "epoch": 1.110228943499265, "grad_norm": 1.5703125, "learning_rate": 0.0004882123336778009, "loss": 5.4355, "mean_token_accuracy": 0.16338066160678863, "num_tokens": 24377605.0, "step": 13215 }, { "entropy": 5.725568962097168, "epoch": 1.1106490233144297, "grad_norm": 1.6015625, "learning_rate": 0.0004882027660072969, "loss": 5.5007, "mean_token_accuracy": 0.15481040328741075, "num_tokens": 24386930.0, "step": 13220 }, { "entropy": 5.700316143035889, "epoch": 1.1110691031295947, "grad_norm": 1.9140625, "learning_rate": 0.0004881931945600157, "loss": 5.4679, "mean_token_accuracy": 0.16834752559661864, "num_tokens": 24396473.0, "step": 13225 }, { "entropy": 5.72724027633667, "epoch": 1.1114891829447595, "grad_norm": 1.8046875, "learning_rate": 0.0004881836193361269, "loss": 5.5465, "mean_token_accuracy": 0.1676660493016243, "num_tokens": 24405461.0, "step": 13230 }, { "entropy": 5.7355544090271, "epoch": 1.1119092627599243, "grad_norm": 1.4609375, "learning_rate": 0.0004881740403358, "loss": 5.4901, "mean_token_accuracy": 0.16505587697029114, "num_tokens": 24414138.0, "step": 13235 }, { "entropy": 5.717983341217041, "epoch": 1.1123293425750893, "grad_norm": 1.671875, "learning_rate": 0.00048816445755920474, "loss": 5.5038, "mean_token_accuracy": 0.15973408818244933, "num_tokens": 24423386.0, "step": 13240 }, { "entropy": 5.685654735565185, "epoch": 1.112749422390254, "grad_norm": 2.125, "learning_rate": 0.0004881548710065109, "loss": 5.4944, "mean_token_accuracy": 0.15903386771678923, "num_tokens": 24433637.0, "step": 13245 }, { "entropy": 5.740741491317749, "epoch": 1.113169502205419, "grad_norm": 2.328125, "learning_rate": 0.0004881452806778883, "loss": 5.5311, "mean_token_accuracy": 0.16349861323833464, "num_tokens": 24443677.0, "step": 13250 }, { "entropy": 5.709890747070313, "epoch": 1.113589582020584, "grad_norm": 1.53125, "learning_rate": 0.00048813568657350676, "loss": 5.4317, "mean_token_accuracy": 0.16741538047790527, "num_tokens": 24452317.0, "step": 13255 }, { "entropy": 5.704727077484131, "epoch": 1.1140096618357487, "grad_norm": 1.578125, "learning_rate": 0.0004881260886935363, "loss": 5.449, "mean_token_accuracy": 0.16238080710172653, "num_tokens": 24460626.0, "step": 13260 }, { "entropy": 5.757587671279907, "epoch": 1.1144297416509137, "grad_norm": 2.03125, "learning_rate": 0.00048811648703814693, "loss": 5.546, "mean_token_accuracy": 0.1519101120531559, "num_tokens": 24469583.0, "step": 13265 }, { "entropy": 5.741657829284668, "epoch": 1.1148498214660785, "grad_norm": 1.671875, "learning_rate": 0.0004881068816075087, "loss": 5.4811, "mean_token_accuracy": 0.15867555439472197, "num_tokens": 24478811.0, "step": 13270 }, { "entropy": 5.706976461410522, "epoch": 1.1152699012812435, "grad_norm": 1.78125, "learning_rate": 0.00048809727240179193, "loss": 5.5147, "mean_token_accuracy": 0.1607096463441849, "num_tokens": 24487818.0, "step": 13275 }, { "entropy": 5.676044464111328, "epoch": 1.1156899810964083, "grad_norm": 2.0625, "learning_rate": 0.0004880876594211665, "loss": 5.4882, "mean_token_accuracy": 0.15960678607225418, "num_tokens": 24497087.0, "step": 13280 }, { "entropy": 5.743713235855102, "epoch": 1.1161100609115733, "grad_norm": 2.109375, "learning_rate": 0.00048807804266580304, "loss": 5.4398, "mean_token_accuracy": 0.15841995030641556, "num_tokens": 24505347.0, "step": 13285 }, { "entropy": 5.774560213088989, "epoch": 1.116530140726738, "grad_norm": 1.6484375, "learning_rate": 0.0004880684221358717, "loss": 5.4756, "mean_token_accuracy": 0.16267163306474686, "num_tokens": 24514732.0, "step": 13290 }, { "entropy": 5.7229407787322994, "epoch": 1.116950220541903, "grad_norm": 1.671875, "learning_rate": 0.00048805879783154305, "loss": 5.5052, "mean_token_accuracy": 0.16089607030153275, "num_tokens": 24523295.0, "step": 13295 }, { "entropy": 5.651921367645263, "epoch": 1.1173703003570679, "grad_norm": 1.53125, "learning_rate": 0.00048804916975298744, "loss": 5.4123, "mean_token_accuracy": 0.16294008493423462, "num_tokens": 24532415.0, "step": 13300 }, { "entropy": 5.739264678955078, "epoch": 1.1177903801722326, "grad_norm": 1.7109375, "learning_rate": 0.0004880395379003755, "loss": 5.5434, "mean_token_accuracy": 0.15819203555583955, "num_tokens": 24541856.0, "step": 13305 }, { "entropy": 5.685423040390015, "epoch": 1.1182104599873977, "grad_norm": 1.8125, "learning_rate": 0.00048802990227387797, "loss": 5.5277, "mean_token_accuracy": 0.1538828618824482, "num_tokens": 24550982.0, "step": 13310 }, { "entropy": 5.771675062179566, "epoch": 1.1186305398025624, "grad_norm": 1.59375, "learning_rate": 0.00048802026287366525, "loss": 5.5966, "mean_token_accuracy": 0.1531897470355034, "num_tokens": 24561176.0, "step": 13315 }, { "entropy": 5.749803829193115, "epoch": 1.1190506196177274, "grad_norm": 1.984375, "learning_rate": 0.00048801061969990834, "loss": 5.47, "mean_token_accuracy": 0.16135310828685762, "num_tokens": 24570741.0, "step": 13320 }, { "entropy": 5.661540126800537, "epoch": 1.1194706994328922, "grad_norm": 1.5234375, "learning_rate": 0.00048800097275277795, "loss": 5.4795, "mean_token_accuracy": 0.16684099435806274, "num_tokens": 24580175.0, "step": 13325 }, { "entropy": 5.715025186538696, "epoch": 1.119890779248057, "grad_norm": 1.578125, "learning_rate": 0.000487991322032445, "loss": 5.4763, "mean_token_accuracy": 0.16523855775594712, "num_tokens": 24588754.0, "step": 13330 }, { "entropy": 5.864963054656982, "epoch": 1.120310859063222, "grad_norm": 1.75, "learning_rate": 0.0004879816675390805, "loss": 5.6524, "mean_token_accuracy": 0.15361952036619186, "num_tokens": 24599429.0, "step": 13335 }, { "entropy": 5.661528491973877, "epoch": 1.1207309388783868, "grad_norm": 1.8046875, "learning_rate": 0.00048797200927285547, "loss": 5.3917, "mean_token_accuracy": 0.1662903368473053, "num_tokens": 24608767.0, "step": 13340 }, { "entropy": 5.678159713745117, "epoch": 1.1211510186935518, "grad_norm": 1.859375, "learning_rate": 0.0004879623472339409, "loss": 5.5641, "mean_token_accuracy": 0.16006904989480972, "num_tokens": 24618232.0, "step": 13345 }, { "entropy": 5.752752017974854, "epoch": 1.1215710985087166, "grad_norm": 1.6640625, "learning_rate": 0.000487952681422508, "loss": 5.4368, "mean_token_accuracy": 0.16255403459072112, "num_tokens": 24626986.0, "step": 13350 }, { "entropy": 5.588898992538452, "epoch": 1.1219911783238816, "grad_norm": 1.4453125, "learning_rate": 0.000487943011838728, "loss": 5.3223, "mean_token_accuracy": 0.16933453232049941, "num_tokens": 24635283.0, "step": 13355 }, { "entropy": 5.555433702468872, "epoch": 1.1224112581390464, "grad_norm": 1.703125, "learning_rate": 0.0004879333384827722, "loss": 5.4317, "mean_token_accuracy": 0.1646237164735794, "num_tokens": 24644451.0, "step": 13360 }, { "entropy": 5.796985626220703, "epoch": 1.1228313379542114, "grad_norm": 1.5859375, "learning_rate": 0.0004879236613548119, "loss": 5.5727, "mean_token_accuracy": 0.15768791288137435, "num_tokens": 24654811.0, "step": 13365 }, { "entropy": 5.751317977905273, "epoch": 1.1232514177693762, "grad_norm": 1.6015625, "learning_rate": 0.0004879139804550187, "loss": 5.4907, "mean_token_accuracy": 0.15994445979595184, "num_tokens": 24663712.0, "step": 13370 }, { "entropy": 5.733260011672973, "epoch": 1.123671497584541, "grad_norm": 1.5078125, "learning_rate": 0.00048790429578356387, "loss": 5.588, "mean_token_accuracy": 0.15311638191342353, "num_tokens": 24672518.0, "step": 13375 }, { "entropy": 5.719970655441284, "epoch": 1.124091577399706, "grad_norm": 1.453125, "learning_rate": 0.00048789460734061915, "loss": 5.5207, "mean_token_accuracy": 0.160324390232563, "num_tokens": 24681900.0, "step": 13380 }, { "entropy": 5.720213317871094, "epoch": 1.1245116572148708, "grad_norm": 4.53125, "learning_rate": 0.0004878849151263561, "loss": 5.4909, "mean_token_accuracy": 0.16072850972414016, "num_tokens": 24691760.0, "step": 13385 }, { "entropy": 5.71978440284729, "epoch": 1.1249317370300358, "grad_norm": 1.4609375, "learning_rate": 0.0004878752191409463, "loss": 5.4247, "mean_token_accuracy": 0.16750899255275725, "num_tokens": 24700742.0, "step": 13390 }, { "entropy": 5.660094261169434, "epoch": 1.1253518168452006, "grad_norm": 1.53125, "learning_rate": 0.0004878655193845616, "loss": 5.5156, "mean_token_accuracy": 0.15948394387960435, "num_tokens": 24709329.0, "step": 13395 }, { "entropy": 5.693703031539917, "epoch": 1.1257718966603654, "grad_norm": 1.921875, "learning_rate": 0.00048785581585737394, "loss": 5.6359, "mean_token_accuracy": 0.15693159401416779, "num_tokens": 24718475.0, "step": 13400 }, { "entropy": 5.770649480819702, "epoch": 1.1261919764755304, "grad_norm": 2.375, "learning_rate": 0.000487846108559555, "loss": 5.5083, "mean_token_accuracy": 0.16890775114297868, "num_tokens": 24727817.0, "step": 13405 }, { "entropy": 5.703707599639893, "epoch": 1.1266120562906952, "grad_norm": 1.515625, "learning_rate": 0.00048783639749127694, "loss": 5.4892, "mean_token_accuracy": 0.16033429354429246, "num_tokens": 24737057.0, "step": 13410 }, { "entropy": 5.686640310287475, "epoch": 1.1270321361058602, "grad_norm": 1.734375, "learning_rate": 0.0004878266826527116, "loss": 5.5297, "mean_token_accuracy": 0.15543637573719024, "num_tokens": 24746016.0, "step": 13415 }, { "entropy": 5.779524898529052, "epoch": 1.127452215921025, "grad_norm": 1.578125, "learning_rate": 0.00048781696404403126, "loss": 5.527, "mean_token_accuracy": 0.163545098900795, "num_tokens": 24755978.0, "step": 13420 }, { "entropy": 5.694488048553467, "epoch": 1.12787229573619, "grad_norm": 1.5625, "learning_rate": 0.00048780724166540794, "loss": 5.423, "mean_token_accuracy": 0.1599399358034134, "num_tokens": 24765255.0, "step": 13425 }, { "entropy": 5.662991142272949, "epoch": 1.1282923755513548, "grad_norm": 1.5078125, "learning_rate": 0.0004877975155170139, "loss": 5.4922, "mean_token_accuracy": 0.15767267495393752, "num_tokens": 24774339.0, "step": 13430 }, { "entropy": 5.680190658569336, "epoch": 1.1287124553665198, "grad_norm": 1.546875, "learning_rate": 0.0004877877855990215, "loss": 5.4979, "mean_token_accuracy": 0.1563847467303276, "num_tokens": 24783236.0, "step": 13435 }, { "entropy": 5.642968368530274, "epoch": 1.1291325351816845, "grad_norm": 1.609375, "learning_rate": 0.000487778051911603, "loss": 5.4033, "mean_token_accuracy": 0.1693968042731285, "num_tokens": 24792168.0, "step": 13440 }, { "entropy": 5.761270141601562, "epoch": 1.1295526149968493, "grad_norm": 1.8046875, "learning_rate": 0.0004877683144549308, "loss": 5.5611, "mean_token_accuracy": 0.16145953834056853, "num_tokens": 24800843.0, "step": 13445 }, { "entropy": 5.7103941440582275, "epoch": 1.1299726948120143, "grad_norm": 1.59375, "learning_rate": 0.00048775857322917753, "loss": 5.4436, "mean_token_accuracy": 0.15832821130752564, "num_tokens": 24810475.0, "step": 13450 }, { "entropy": 5.657360696792603, "epoch": 1.1303927746271791, "grad_norm": 1.828125, "learning_rate": 0.0004877488282345158, "loss": 5.5202, "mean_token_accuracy": 0.16295383870601654, "num_tokens": 24820486.0, "step": 13455 }, { "entropy": 5.752575635910034, "epoch": 1.1308128544423441, "grad_norm": 1.6953125, "learning_rate": 0.000487739079471118, "loss": 5.5749, "mean_token_accuracy": 0.16365474909543992, "num_tokens": 24830243.0, "step": 13460 }, { "entropy": 5.7682483196258545, "epoch": 1.131232934257509, "grad_norm": 1.75, "learning_rate": 0.000487729326939157, "loss": 5.4805, "mean_token_accuracy": 0.16092797219753266, "num_tokens": 24839090.0, "step": 13465 }, { "entropy": 5.685783910751343, "epoch": 1.1316530140726737, "grad_norm": 2.203125, "learning_rate": 0.00048771957063880553, "loss": 5.4632, "mean_token_accuracy": 0.1614797055721283, "num_tokens": 24847933.0, "step": 13470 }, { "entropy": 5.772010850906372, "epoch": 1.1320730938878387, "grad_norm": 2.015625, "learning_rate": 0.0004877098105702363, "loss": 5.4886, "mean_token_accuracy": 0.163765586912632, "num_tokens": 24857037.0, "step": 13475 }, { "entropy": 5.617125749588013, "epoch": 1.1324931737030035, "grad_norm": 1.59375, "learning_rate": 0.00048770004673362243, "loss": 5.3125, "mean_token_accuracy": 0.1722439780831337, "num_tokens": 24866042.0, "step": 13480 }, { "entropy": 5.561356925964356, "epoch": 1.1329132535181685, "grad_norm": 1.734375, "learning_rate": 0.00048769027912913673, "loss": 5.2843, "mean_token_accuracy": 0.1734999194741249, "num_tokens": 24873735.0, "step": 13485 }, { "entropy": 5.528507661819458, "epoch": 1.1333333333333333, "grad_norm": 1.8515625, "learning_rate": 0.0004876805077569522, "loss": 5.353, "mean_token_accuracy": 0.16299628913402558, "num_tokens": 24882277.0, "step": 13490 }, { "entropy": 5.604131412506104, "epoch": 1.133753413148498, "grad_norm": 2.03125, "learning_rate": 0.00048767073261724204, "loss": 5.4774, "mean_token_accuracy": 0.16074343770742416, "num_tokens": 24891354.0, "step": 13495 }, { "entropy": 5.686602067947388, "epoch": 1.134173492963663, "grad_norm": 1.875, "learning_rate": 0.0004876609537101793, "loss": 5.4689, "mean_token_accuracy": 0.1579518973827362, "num_tokens": 24899887.0, "step": 13500 }, { "entropy": 5.832871198654175, "epoch": 1.1345935727788279, "grad_norm": 1.9296875, "learning_rate": 0.0004876511710359374, "loss": 5.4771, "mean_token_accuracy": 0.16068532615900039, "num_tokens": 24908616.0, "step": 13505 }, { "entropy": 5.792671775817871, "epoch": 1.135013652593993, "grad_norm": 1.71875, "learning_rate": 0.00048764138459468935, "loss": 5.5377, "mean_token_accuracy": 0.16124322265386581, "num_tokens": 24917864.0, "step": 13510 }, { "entropy": 5.755936479568481, "epoch": 1.1354337324091577, "grad_norm": 1.7734375, "learning_rate": 0.00048763159438660876, "loss": 5.551, "mean_token_accuracy": 0.1572817325592041, "num_tokens": 24927864.0, "step": 13515 }, { "entropy": 5.621814107894897, "epoch": 1.1358538122243227, "grad_norm": 1.8125, "learning_rate": 0.00048762180041186893, "loss": 5.4411, "mean_token_accuracy": 0.16689430475234984, "num_tokens": 24937146.0, "step": 13520 }, { "entropy": 5.737927103042603, "epoch": 1.1362738920394875, "grad_norm": 1.7421875, "learning_rate": 0.0004876120026706434, "loss": 5.5174, "mean_token_accuracy": 0.16046024858951569, "num_tokens": 24945694.0, "step": 13525 }, { "entropy": 5.7014954566955565, "epoch": 1.1366939718546525, "grad_norm": 1.7421875, "learning_rate": 0.0004876022011631057, "loss": 5.4271, "mean_token_accuracy": 0.165780770778656, "num_tokens": 24955325.0, "step": 13530 }, { "entropy": 5.640952110290527, "epoch": 1.1371140516698173, "grad_norm": 1.8046875, "learning_rate": 0.0004875923958894295, "loss": 5.2981, "mean_token_accuracy": 0.1672575891017914, "num_tokens": 24964028.0, "step": 13535 }, { "entropy": 5.672315645217895, "epoch": 1.137534131484982, "grad_norm": 2.078125, "learning_rate": 0.00048758258684978846, "loss": 5.498, "mean_token_accuracy": 0.1611057698726654, "num_tokens": 24972923.0, "step": 13540 }, { "entropy": 5.699390411376953, "epoch": 1.137954211300147, "grad_norm": 1.9140625, "learning_rate": 0.00048757277404435636, "loss": 5.3845, "mean_token_accuracy": 0.16409458816051484, "num_tokens": 24982156.0, "step": 13545 }, { "entropy": 5.678975343704224, "epoch": 1.1383742911153119, "grad_norm": 1.7109375, "learning_rate": 0.000487562957473307, "loss": 5.4364, "mean_token_accuracy": 0.16643529236316681, "num_tokens": 24991616.0, "step": 13550 }, { "entropy": 5.650065231323242, "epoch": 1.1387943709304769, "grad_norm": 1.953125, "learning_rate": 0.0004875531371368144, "loss": 5.5046, "mean_token_accuracy": 0.1579531379044056, "num_tokens": 25001140.0, "step": 13555 }, { "entropy": 5.72753210067749, "epoch": 1.1392144507456416, "grad_norm": 1.9765625, "learning_rate": 0.00048754331303505236, "loss": 5.4148, "mean_token_accuracy": 0.16427789330482484, "num_tokens": 25010863.0, "step": 13560 }, { "entropy": 5.72626485824585, "epoch": 1.1396345305608064, "grad_norm": 1.703125, "learning_rate": 0.00048753348516819496, "loss": 5.5148, "mean_token_accuracy": 0.15984421372413635, "num_tokens": 25019770.0, "step": 13565 }, { "entropy": 5.761800861358642, "epoch": 1.1400546103759714, "grad_norm": 1.4921875, "learning_rate": 0.0004875236535364163, "loss": 5.5556, "mean_token_accuracy": 0.15370625630021095, "num_tokens": 25029900.0, "step": 13570 }, { "entropy": 5.775524997711182, "epoch": 1.1404746901911362, "grad_norm": 2.375, "learning_rate": 0.0004875138181398906, "loss": 5.516, "mean_token_accuracy": 0.16178728863596917, "num_tokens": 25039428.0, "step": 13575 }, { "entropy": 5.739251804351807, "epoch": 1.1408947700063012, "grad_norm": 1.8671875, "learning_rate": 0.000487503978978792, "loss": 5.5084, "mean_token_accuracy": 0.1567676842212677, "num_tokens": 25049145.0, "step": 13580 }, { "entropy": 5.7199629783630375, "epoch": 1.141314849821466, "grad_norm": 1.53125, "learning_rate": 0.00048749413605329487, "loss": 5.5387, "mean_token_accuracy": 0.15968952625989913, "num_tokens": 25058772.0, "step": 13585 }, { "entropy": 5.715544176101685, "epoch": 1.141734929636631, "grad_norm": 1.4765625, "learning_rate": 0.00048748428936357346, "loss": 5.4386, "mean_token_accuracy": 0.1636001095175743, "num_tokens": 25067249.0, "step": 13590 }, { "entropy": 5.647507381439209, "epoch": 1.1421550094517958, "grad_norm": 1.5859375, "learning_rate": 0.0004874744389098024, "loss": 5.405, "mean_token_accuracy": 0.1577477991580963, "num_tokens": 25076893.0, "step": 13595 }, { "entropy": 5.634746408462524, "epoch": 1.1425750892669608, "grad_norm": 1.609375, "learning_rate": 0.0004874645846921559, "loss": 5.4148, "mean_token_accuracy": 0.16532657518982888, "num_tokens": 25086238.0, "step": 13600 }, { "entropy": 5.669492626190186, "epoch": 1.1429951690821256, "grad_norm": 2.171875, "learning_rate": 0.00048745472671080884, "loss": 5.4414, "mean_token_accuracy": 0.1582840844988823, "num_tokens": 25095334.0, "step": 13605 }, { "entropy": 5.686340093612671, "epoch": 1.1434152488972904, "grad_norm": 2.046875, "learning_rate": 0.00048744486496593565, "loss": 5.4259, "mean_token_accuracy": 0.1654140532016754, "num_tokens": 25104136.0, "step": 13610 }, { "entropy": 5.6616381168365475, "epoch": 1.1438353287124554, "grad_norm": 1.6328125, "learning_rate": 0.000487434999457711, "loss": 5.4213, "mean_token_accuracy": 0.17221303135156632, "num_tokens": 25112629.0, "step": 13615 }, { "entropy": 5.7070547580719, "epoch": 1.1442554085276202, "grad_norm": 1.6953125, "learning_rate": 0.0004874251301863098, "loss": 5.4605, "mean_token_accuracy": 0.1610724672675133, "num_tokens": 25121014.0, "step": 13620 }, { "entropy": 5.658392524719238, "epoch": 1.1446754883427852, "grad_norm": 4.03125, "learning_rate": 0.00048741525715190675, "loss": 5.4949, "mean_token_accuracy": 0.1595884680747986, "num_tokens": 25130097.0, "step": 13625 }, { "entropy": 5.7246985912323, "epoch": 1.14509556815795, "grad_norm": 1.7265625, "learning_rate": 0.0004874053803546769, "loss": 5.5002, "mean_token_accuracy": 0.16293734163045884, "num_tokens": 25139065.0, "step": 13630 }, { "entropy": 5.713643646240234, "epoch": 1.1455156479731148, "grad_norm": 1.984375, "learning_rate": 0.000487395499794795, "loss": 5.4796, "mean_token_accuracy": 0.1665970802307129, "num_tokens": 25148852.0, "step": 13635 }, { "entropy": 5.620668411254883, "epoch": 1.1459357277882798, "grad_norm": 1.8046875, "learning_rate": 0.0004873856154724362, "loss": 5.3741, "mean_token_accuracy": 0.17443220168352128, "num_tokens": 25157580.0, "step": 13640 }, { "entropy": 5.670327091217041, "epoch": 1.1463558076034446, "grad_norm": 2.1875, "learning_rate": 0.0004873757273877756, "loss": 5.4831, "mean_token_accuracy": 0.1579154871404171, "num_tokens": 25166243.0, "step": 13645 }, { "entropy": 5.720182752609253, "epoch": 1.1467758874186096, "grad_norm": 1.796875, "learning_rate": 0.00048736583554098836, "loss": 5.49, "mean_token_accuracy": 0.16273559033870696, "num_tokens": 25174674.0, "step": 13650 }, { "entropy": 5.649949932098389, "epoch": 1.1471959672337744, "grad_norm": 1.7265625, "learning_rate": 0.00048735593993224973, "loss": 5.4028, "mean_token_accuracy": 0.1695830523967743, "num_tokens": 25183892.0, "step": 13655 }, { "entropy": 5.677073192596436, "epoch": 1.1476160470489394, "grad_norm": 1.6328125, "learning_rate": 0.00048734604056173495, "loss": 5.4464, "mean_token_accuracy": 0.1643107756972313, "num_tokens": 25192731.0, "step": 13660 }, { "entropy": 5.715389537811279, "epoch": 1.1480361268641042, "grad_norm": 2.828125, "learning_rate": 0.00048733613742961933, "loss": 5.5484, "mean_token_accuracy": 0.16420064717531205, "num_tokens": 25201280.0, "step": 13665 }, { "entropy": 5.67736177444458, "epoch": 1.1484562066792692, "grad_norm": 2.5625, "learning_rate": 0.00048732623053607846, "loss": 5.4255, "mean_token_accuracy": 0.1610700160264969, "num_tokens": 25209929.0, "step": 13670 }, { "entropy": 5.672457456588745, "epoch": 1.148876286494434, "grad_norm": 1.578125, "learning_rate": 0.0004873163198812877, "loss": 5.3544, "mean_token_accuracy": 0.16796983331441878, "num_tokens": 25218583.0, "step": 13675 }, { "entropy": 5.768982076644898, "epoch": 1.1492963663095987, "grad_norm": 2.421875, "learning_rate": 0.0004873064054654227, "loss": 5.5805, "mean_token_accuracy": 0.15605029240250587, "num_tokens": 25228949.0, "step": 13680 }, { "entropy": 5.741779899597168, "epoch": 1.1497164461247638, "grad_norm": 1.8984375, "learning_rate": 0.00048729648728865904, "loss": 5.4092, "mean_token_accuracy": 0.17617493420839309, "num_tokens": 25238603.0, "step": 13685 }, { "entropy": 5.68451452255249, "epoch": 1.1501365259399285, "grad_norm": 1.671875, "learning_rate": 0.00048728656535117237, "loss": 5.5239, "mean_token_accuracy": 0.15241808593273162, "num_tokens": 25248265.0, "step": 13690 }, { "entropy": 5.671699285507202, "epoch": 1.1505566057550936, "grad_norm": 2.046875, "learning_rate": 0.0004872766396531386, "loss": 5.5062, "mean_token_accuracy": 0.16589785665273665, "num_tokens": 25258195.0, "step": 13695 }, { "entropy": 5.758512020111084, "epoch": 1.1509766855702583, "grad_norm": 2.28125, "learning_rate": 0.00048726671019473335, "loss": 5.4622, "mean_token_accuracy": 0.16697300374507903, "num_tokens": 25267886.0, "step": 13700 }, { "entropy": 5.720304870605469, "epoch": 1.1513967653854231, "grad_norm": 1.703125, "learning_rate": 0.00048725677697613267, "loss": 5.5039, "mean_token_accuracy": 0.16215680837631224, "num_tokens": 25277304.0, "step": 13705 }, { "entropy": 5.701442766189575, "epoch": 1.1518168452005881, "grad_norm": 1.984375, "learning_rate": 0.0004872468399975125, "loss": 5.5047, "mean_token_accuracy": 0.15424503684043883, "num_tokens": 25286771.0, "step": 13710 }, { "entropy": 5.780902290344239, "epoch": 1.152236925015753, "grad_norm": 1.90625, "learning_rate": 0.00048723689925904884, "loss": 5.5656, "mean_token_accuracy": 0.15876710936427116, "num_tokens": 25296018.0, "step": 13715 }, { "entropy": 5.713040781021118, "epoch": 1.152657004830918, "grad_norm": 1.609375, "learning_rate": 0.0004872269547609179, "loss": 5.5103, "mean_token_accuracy": 0.1646754786372185, "num_tokens": 25305737.0, "step": 13720 }, { "entropy": 5.65469765663147, "epoch": 1.1530770846460827, "grad_norm": 1.6328125, "learning_rate": 0.0004872170065032956, "loss": 5.3432, "mean_token_accuracy": 0.1650144189596176, "num_tokens": 25314625.0, "step": 13725 }, { "entropy": 5.688196754455566, "epoch": 1.1534971644612477, "grad_norm": 1.6953125, "learning_rate": 0.0004872070544863584, "loss": 5.4849, "mean_token_accuracy": 0.15882542431354524, "num_tokens": 25323453.0, "step": 13730 }, { "entropy": 5.685961675643921, "epoch": 1.1539172442764125, "grad_norm": 1.75, "learning_rate": 0.0004871970987102824, "loss": 5.4906, "mean_token_accuracy": 0.166608627140522, "num_tokens": 25333236.0, "step": 13735 }, { "entropy": 5.751754331588745, "epoch": 1.1543373240915775, "grad_norm": 2.171875, "learning_rate": 0.0004871871391752442, "loss": 5.3968, "mean_token_accuracy": 0.16037501096725465, "num_tokens": 25341993.0, "step": 13740 }, { "entropy": 5.743503475189209, "epoch": 1.1547574039067423, "grad_norm": 2.21875, "learning_rate": 0.00048717717588141993, "loss": 5.4382, "mean_token_accuracy": 0.16419214904308319, "num_tokens": 25350695.0, "step": 13745 }, { "entropy": 5.695055913925171, "epoch": 1.155177483721907, "grad_norm": 1.859375, "learning_rate": 0.0004871672088289863, "loss": 5.4726, "mean_token_accuracy": 0.1616984099149704, "num_tokens": 25359044.0, "step": 13750 }, { "entropy": 5.670234966278076, "epoch": 1.155597563537072, "grad_norm": 1.78125, "learning_rate": 0.00048715723801811986, "loss": 5.4911, "mean_token_accuracy": 0.16160673201084136, "num_tokens": 25367959.0, "step": 13755 }, { "entropy": 5.719758939743042, "epoch": 1.156017643352237, "grad_norm": 1.796875, "learning_rate": 0.00048714726344899716, "loss": 5.51, "mean_token_accuracy": 0.16625330299139024, "num_tokens": 25376968.0, "step": 13760 }, { "entropy": 5.6630126953125, "epoch": 1.156437723167402, "grad_norm": 1.484375, "learning_rate": 0.0004871372851217949, "loss": 5.3763, "mean_token_accuracy": 0.16886330991983414, "num_tokens": 25385381.0, "step": 13765 }, { "entropy": 5.691815996170044, "epoch": 1.1568578029825667, "grad_norm": 1.71875, "learning_rate": 0.0004871273030366899, "loss": 5.4938, "mean_token_accuracy": 0.15851637423038484, "num_tokens": 25394647.0, "step": 13770 }, { "entropy": 5.671438503265381, "epoch": 1.1572778827977315, "grad_norm": 1.7890625, "learning_rate": 0.0004871173171938589, "loss": 5.4387, "mean_token_accuracy": 0.1735491305589676, "num_tokens": 25403973.0, "step": 13775 }, { "entropy": 5.642987537384033, "epoch": 1.1576979626128965, "grad_norm": 1.8828125, "learning_rate": 0.0004871073275934789, "loss": 5.4258, "mean_token_accuracy": 0.1666042447090149, "num_tokens": 25412319.0, "step": 13780 }, { "entropy": 5.623088264465332, "epoch": 1.1581180424280613, "grad_norm": 1.6484375, "learning_rate": 0.00048709733423572685, "loss": 5.4618, "mean_token_accuracy": 0.16146773099899292, "num_tokens": 25420558.0, "step": 13785 }, { "entropy": 5.629000854492188, "epoch": 1.1585381222432263, "grad_norm": 1.65625, "learning_rate": 0.00048708733712077973, "loss": 5.4071, "mean_token_accuracy": 0.16649366915225983, "num_tokens": 25429258.0, "step": 13790 }, { "entropy": 5.719772052764893, "epoch": 1.158958202058391, "grad_norm": 1.65625, "learning_rate": 0.0004870773362488146, "loss": 5.3748, "mean_token_accuracy": 0.1697326421737671, "num_tokens": 25438005.0, "step": 13795 }, { "entropy": 5.681618309020996, "epoch": 1.159378281873556, "grad_norm": 2.03125, "learning_rate": 0.0004870673316200087, "loss": 5.4003, "mean_token_accuracy": 0.16728533059358597, "num_tokens": 25447120.0, "step": 13800 }, { "entropy": 5.646628332138062, "epoch": 1.1597983616887209, "grad_norm": 1.4375, "learning_rate": 0.0004870573232345392, "loss": 5.3916, "mean_token_accuracy": 0.16811733990907668, "num_tokens": 25456216.0, "step": 13805 }, { "entropy": 5.839818906784058, "epoch": 1.1602184415038856, "grad_norm": 1.5703125, "learning_rate": 0.0004870473110925834, "loss": 5.6768, "mean_token_accuracy": 0.15327301174402236, "num_tokens": 25466456.0, "step": 13810 }, { "entropy": 5.657715559005737, "epoch": 1.1606385213190507, "grad_norm": 1.59375, "learning_rate": 0.0004870372951943187, "loss": 5.3212, "mean_token_accuracy": 0.1731086015701294, "num_tokens": 25475217.0, "step": 13815 }, { "entropy": 5.764273929595947, "epoch": 1.1610586011342154, "grad_norm": 1.7890625, "learning_rate": 0.00048702727553992243, "loss": 5.6252, "mean_token_accuracy": 0.15146582424640656, "num_tokens": 25484617.0, "step": 13820 }, { "entropy": 5.661474609375, "epoch": 1.1614786809493804, "grad_norm": 2.59375, "learning_rate": 0.00048701725212957223, "loss": 5.4056, "mean_token_accuracy": 0.17106336653232573, "num_tokens": 25493936.0, "step": 13825 }, { "entropy": 5.615126895904541, "epoch": 1.1618987607645452, "grad_norm": 1.5703125, "learning_rate": 0.0004870072249634455, "loss": 5.3846, "mean_token_accuracy": 0.16981288492679597, "num_tokens": 25502306.0, "step": 13830 }, { "entropy": 5.586185503005981, "epoch": 1.1623188405797102, "grad_norm": 1.5, "learning_rate": 0.00048699719404172006, "loss": 5.4546, "mean_token_accuracy": 0.1651104733347893, "num_tokens": 25511247.0, "step": 13835 }, { "entropy": 5.713759469985962, "epoch": 1.162738920394875, "grad_norm": 1.609375, "learning_rate": 0.00048698715936457344, "loss": 5.5012, "mean_token_accuracy": 0.15939352810382842, "num_tokens": 25520482.0, "step": 13840 }, { "entropy": 5.726053237915039, "epoch": 1.1631590002100398, "grad_norm": 1.5, "learning_rate": 0.00048697712093218336, "loss": 5.3974, "mean_token_accuracy": 0.1690056636929512, "num_tokens": 25529854.0, "step": 13845 }, { "entropy": 5.624899101257324, "epoch": 1.1635790800252048, "grad_norm": 2.328125, "learning_rate": 0.0004869670787447279, "loss": 5.3395, "mean_token_accuracy": 0.16676997542381286, "num_tokens": 25538251.0, "step": 13850 }, { "entropy": 5.618051338195801, "epoch": 1.1639991598403696, "grad_norm": 1.6640625, "learning_rate": 0.0004869570328023846, "loss": 5.4133, "mean_token_accuracy": 0.16560969799757003, "num_tokens": 25546889.0, "step": 13855 }, { "entropy": 5.655919551849365, "epoch": 1.1644192396555346, "grad_norm": 2.21875, "learning_rate": 0.00048694698310533177, "loss": 5.48, "mean_token_accuracy": 0.16459716558456422, "num_tokens": 25557040.0, "step": 13860 }, { "entropy": 5.7215770244598385, "epoch": 1.1648393194706994, "grad_norm": 1.9765625, "learning_rate": 0.0004869369296537472, "loss": 5.6387, "mean_token_accuracy": 0.1548250749707222, "num_tokens": 25565798.0, "step": 13865 }, { "entropy": 5.826737976074218, "epoch": 1.1652593992858642, "grad_norm": 2.515625, "learning_rate": 0.0004869268724478091, "loss": 5.4626, "mean_token_accuracy": 0.1656502142548561, "num_tokens": 25575039.0, "step": 13870 }, { "entropy": 5.779808759689331, "epoch": 1.1656794791010292, "grad_norm": 1.7421875, "learning_rate": 0.00048691681148769545, "loss": 5.4698, "mean_token_accuracy": 0.16209751814603807, "num_tokens": 25584635.0, "step": 13875 }, { "entropy": 5.579784250259399, "epoch": 1.166099558916194, "grad_norm": 1.609375, "learning_rate": 0.0004869067467735847, "loss": 5.4154, "mean_token_accuracy": 0.16798000484704972, "num_tokens": 25593736.0, "step": 13880 }, { "entropy": 5.613956546783447, "epoch": 1.166519638731359, "grad_norm": 2.90625, "learning_rate": 0.0004868966783056551, "loss": 5.3718, "mean_token_accuracy": 0.17804049104452133, "num_tokens": 25602685.0, "step": 13885 }, { "entropy": 5.671496915817261, "epoch": 1.1669397185465238, "grad_norm": 1.765625, "learning_rate": 0.00048688660608408484, "loss": 5.4521, "mean_token_accuracy": 0.16123623102903367, "num_tokens": 25610690.0, "step": 13890 }, { "entropy": 5.615883159637451, "epoch": 1.1673597983616888, "grad_norm": 1.5, "learning_rate": 0.00048687653010905254, "loss": 5.3419, "mean_token_accuracy": 0.16897291988134383, "num_tokens": 25619805.0, "step": 13895 }, { "entropy": 5.767966794967651, "epoch": 1.1677798781768536, "grad_norm": 1.59375, "learning_rate": 0.00048686645038073664, "loss": 5.5659, "mean_token_accuracy": 0.15139710083603858, "num_tokens": 25629447.0, "step": 13900 }, { "entropy": 5.700986623764038, "epoch": 1.1681999579920186, "grad_norm": 2.03125, "learning_rate": 0.00048685636689931554, "loss": 5.4057, "mean_token_accuracy": 0.16528156250715256, "num_tokens": 25638619.0, "step": 13905 }, { "entropy": 5.720313978195191, "epoch": 1.1686200378071834, "grad_norm": 1.4453125, "learning_rate": 0.00048684627966496803, "loss": 5.4855, "mean_token_accuracy": 0.16764382421970367, "num_tokens": 25648255.0, "step": 13910 }, { "entropy": 5.695196580886841, "epoch": 1.1690401176223482, "grad_norm": 1.796875, "learning_rate": 0.00048683618867787284, "loss": 5.494, "mean_token_accuracy": 0.15946254581212999, "num_tokens": 25657881.0, "step": 13915 }, { "entropy": 5.7503081321716305, "epoch": 1.1694601974375132, "grad_norm": 1.703125, "learning_rate": 0.0004868260939382086, "loss": 5.545, "mean_token_accuracy": 0.16139545887708664, "num_tokens": 25666773.0, "step": 13920 }, { "entropy": 5.727688646316528, "epoch": 1.169880277252678, "grad_norm": 1.5078125, "learning_rate": 0.0004868159954461542, "loss": 5.4278, "mean_token_accuracy": 0.16332051604986192, "num_tokens": 25675152.0, "step": 13925 }, { "entropy": 5.8233521461486815, "epoch": 1.170300357067843, "grad_norm": 1.890625, "learning_rate": 0.00048680589320188847, "loss": 5.563, "mean_token_accuracy": 0.15545087233185767, "num_tokens": 25684962.0, "step": 13930 }, { "entropy": 5.67341160774231, "epoch": 1.1707204368830078, "grad_norm": 1.546875, "learning_rate": 0.0004867957872055904, "loss": 5.4358, "mean_token_accuracy": 0.166546930372715, "num_tokens": 25693782.0, "step": 13935 }, { "entropy": 5.649929618835449, "epoch": 1.1711405166981725, "grad_norm": 2.046875, "learning_rate": 0.00048678567745743905, "loss": 5.4121, "mean_token_accuracy": 0.16831570118665695, "num_tokens": 25703081.0, "step": 13940 }, { "entropy": 5.6604838371276855, "epoch": 1.1715605965133375, "grad_norm": 1.8125, "learning_rate": 0.0004867755639576135, "loss": 5.4141, "mean_token_accuracy": 0.17139442414045333, "num_tokens": 25711628.0, "step": 13945 }, { "entropy": 5.6378460884094235, "epoch": 1.1719806763285023, "grad_norm": 2.234375, "learning_rate": 0.0004867654467062928, "loss": 5.4509, "mean_token_accuracy": 0.16958941370248795, "num_tokens": 25720676.0, "step": 13950 }, { "entropy": 5.623021745681763, "epoch": 1.1724007561436673, "grad_norm": 1.9453125, "learning_rate": 0.00048675532570365633, "loss": 5.418, "mean_token_accuracy": 0.16936941295862198, "num_tokens": 25729920.0, "step": 13955 }, { "entropy": 5.644413042068481, "epoch": 1.1728208359588321, "grad_norm": 1.765625, "learning_rate": 0.00048674520094988327, "loss": 5.4047, "mean_token_accuracy": 0.1689576655626297, "num_tokens": 25739745.0, "step": 13960 }, { "entropy": 5.673465824127197, "epoch": 1.1732409157739971, "grad_norm": 1.7109375, "learning_rate": 0.00048673507244515303, "loss": 5.421, "mean_token_accuracy": 0.16571830958127975, "num_tokens": 25748636.0, "step": 13965 }, { "entropy": 5.774284315109253, "epoch": 1.173660995589162, "grad_norm": 1.7109375, "learning_rate": 0.000486724940189645, "loss": 5.5623, "mean_token_accuracy": 0.15929994434118272, "num_tokens": 25758393.0, "step": 13970 }, { "entropy": 5.73808479309082, "epoch": 1.174081075404327, "grad_norm": 1.46875, "learning_rate": 0.0004867148041835386, "loss": 5.5378, "mean_token_accuracy": 0.15335596948862076, "num_tokens": 25768520.0, "step": 13975 }, { "entropy": 5.613088512420655, "epoch": 1.1745011552194917, "grad_norm": 2.03125, "learning_rate": 0.0004867046644270136, "loss": 5.3398, "mean_token_accuracy": 0.17122806012630462, "num_tokens": 25777168.0, "step": 13980 }, { "entropy": 5.7978432178497314, "epoch": 1.1749212350346565, "grad_norm": 1.734375, "learning_rate": 0.0004866945209202494, "loss": 5.6517, "mean_token_accuracy": 0.14711768478155135, "num_tokens": 25787042.0, "step": 13985 }, { "entropy": 5.7582615375518795, "epoch": 1.1753413148498215, "grad_norm": 1.4765625, "learning_rate": 0.0004866843736634258, "loss": 5.5287, "mean_token_accuracy": 0.16342772543430328, "num_tokens": 25796784.0, "step": 13990 }, { "entropy": 5.7990720748901365, "epoch": 1.1757613946649863, "grad_norm": 1.8046875, "learning_rate": 0.0004866742226567225, "loss": 5.561, "mean_token_accuracy": 0.1599314257502556, "num_tokens": 25806285.0, "step": 13995 }, { "entropy": 5.723859405517578, "epoch": 1.1761814744801513, "grad_norm": 2.265625, "learning_rate": 0.00048666406790031936, "loss": 5.4036, "mean_token_accuracy": 0.16107962131500245, "num_tokens": 25814889.0, "step": 14000 }, { "entropy": 5.662409734725952, "epoch": 1.176601554295316, "grad_norm": 1.953125, "learning_rate": 0.0004866539093943962, "loss": 5.4512, "mean_token_accuracy": 0.16538093835115433, "num_tokens": 25824551.0, "step": 14005 }, { "entropy": 5.746952390670776, "epoch": 1.1770216341104809, "grad_norm": 1.734375, "learning_rate": 0.00048664374713913304, "loss": 5.517, "mean_token_accuracy": 0.15985865890979767, "num_tokens": 25834482.0, "step": 14010 }, { "entropy": 5.755951976776123, "epoch": 1.177441713925646, "grad_norm": 1.5625, "learning_rate": 0.0004866335811347099, "loss": 5.5288, "mean_token_accuracy": 0.1602414257824421, "num_tokens": 25843274.0, "step": 14015 }, { "entropy": 5.80555944442749, "epoch": 1.1778617937408107, "grad_norm": 1.7890625, "learning_rate": 0.00048662341138130683, "loss": 5.521, "mean_token_accuracy": 0.15339552462100983, "num_tokens": 25852482.0, "step": 14020 }, { "entropy": 5.730782413482666, "epoch": 1.1782818735559757, "grad_norm": 1.7421875, "learning_rate": 0.00048661323787910405, "loss": 5.4969, "mean_token_accuracy": 0.1554713472723961, "num_tokens": 25862657.0, "step": 14025 }, { "entropy": 5.663182163238526, "epoch": 1.1787019533711405, "grad_norm": 1.6484375, "learning_rate": 0.0004866030606282817, "loss": 5.4568, "mean_token_accuracy": 0.16776310056447982, "num_tokens": 25871492.0, "step": 14030 }, { "entropy": 5.734621810913086, "epoch": 1.1791220331863055, "grad_norm": 2.046875, "learning_rate": 0.00048659287962902006, "loss": 5.4536, "mean_token_accuracy": 0.1627289742231369, "num_tokens": 25880979.0, "step": 14035 }, { "entropy": 5.732399988174438, "epoch": 1.1795421130014703, "grad_norm": 2.25, "learning_rate": 0.00048658269488149945, "loss": 5.4554, "mean_token_accuracy": 0.16046008914709092, "num_tokens": 25891060.0, "step": 14040 }, { "entropy": 5.822850942611694, "epoch": 1.1799621928166353, "grad_norm": 2.359375, "learning_rate": 0.0004865725063859005, "loss": 5.5659, "mean_token_accuracy": 0.16293970942497255, "num_tokens": 25900421.0, "step": 14045 }, { "entropy": 5.73273401260376, "epoch": 1.1803822726318, "grad_norm": 1.6953125, "learning_rate": 0.00048656231414240345, "loss": 5.458, "mean_token_accuracy": 0.15972733795642852, "num_tokens": 25909614.0, "step": 14050 }, { "entropy": 5.668324518203735, "epoch": 1.1808023524469649, "grad_norm": 1.90625, "learning_rate": 0.000486552118151189, "loss": 5.4895, "mean_token_accuracy": 0.15764440298080445, "num_tokens": 25919324.0, "step": 14055 }, { "entropy": 5.670943117141723, "epoch": 1.1812224322621299, "grad_norm": 2.0625, "learning_rate": 0.00048654191841243763, "loss": 5.4993, "mean_token_accuracy": 0.1652704119682312, "num_tokens": 25928818.0, "step": 14060 }, { "entropy": 5.75603985786438, "epoch": 1.1816425120772946, "grad_norm": 2.25, "learning_rate": 0.0004865317149263301, "loss": 5.5482, "mean_token_accuracy": 0.16319168210029603, "num_tokens": 25938148.0, "step": 14065 }, { "entropy": 5.6569633960723875, "epoch": 1.1820625918924597, "grad_norm": 1.703125, "learning_rate": 0.0004865215076930473, "loss": 5.4529, "mean_token_accuracy": 0.16367049515247345, "num_tokens": 25947210.0, "step": 14070 }, { "entropy": 5.660248327255249, "epoch": 1.1824826717076244, "grad_norm": 2.4375, "learning_rate": 0.0004865112967127697, "loss": 5.4428, "mean_token_accuracy": 0.16496210247278215, "num_tokens": 25955949.0, "step": 14075 }, { "entropy": 5.648013925552368, "epoch": 1.1829027515227892, "grad_norm": 1.515625, "learning_rate": 0.0004865010819856786, "loss": 5.3959, "mean_token_accuracy": 0.16307084411382675, "num_tokens": 25964193.0, "step": 14080 }, { "entropy": 5.673745965957641, "epoch": 1.1833228313379542, "grad_norm": 1.5703125, "learning_rate": 0.0004864908635119546, "loss": 5.4612, "mean_token_accuracy": 0.1630059838294983, "num_tokens": 25973141.0, "step": 14085 }, { "entropy": 5.725007057189941, "epoch": 1.183742911153119, "grad_norm": 2.046875, "learning_rate": 0.0004864806412917788, "loss": 5.5398, "mean_token_accuracy": 0.158825521171093, "num_tokens": 25982650.0, "step": 14090 }, { "entropy": 5.773545980453491, "epoch": 1.184162990968284, "grad_norm": 1.8125, "learning_rate": 0.0004864704153253325, "loss": 5.5371, "mean_token_accuracy": 0.1549429714679718, "num_tokens": 25992096.0, "step": 14095 }, { "entropy": 5.772162914276123, "epoch": 1.1845830707834488, "grad_norm": 3.703125, "learning_rate": 0.00048646018561279665, "loss": 5.5104, "mean_token_accuracy": 0.16160587966442108, "num_tokens": 26002063.0, "step": 14100 }, { "entropy": 5.648436164855957, "epoch": 1.1850031505986138, "grad_norm": 1.9609375, "learning_rate": 0.00048644995215435245, "loss": 5.3414, "mean_token_accuracy": 0.1703270673751831, "num_tokens": 26010716.0, "step": 14105 }, { "entropy": 5.677743911743164, "epoch": 1.1854232304137786, "grad_norm": 1.9609375, "learning_rate": 0.0004864397149501812, "loss": 5.4265, "mean_token_accuracy": 0.16840701997280122, "num_tokens": 26019136.0, "step": 14110 }, { "entropy": 5.679789972305298, "epoch": 1.1858433102289434, "grad_norm": 1.6484375, "learning_rate": 0.00048642947400046434, "loss": 5.4571, "mean_token_accuracy": 0.17166182398796082, "num_tokens": 26028029.0, "step": 14115 }, { "entropy": 5.77405161857605, "epoch": 1.1862633900441084, "grad_norm": 1.65625, "learning_rate": 0.00048641922930538325, "loss": 5.6101, "mean_token_accuracy": 0.15164516270160674, "num_tokens": 26038025.0, "step": 14120 }, { "entropy": 5.7644017219543455, "epoch": 1.1866834698592732, "grad_norm": 1.5703125, "learning_rate": 0.0004864089808651193, "loss": 5.5754, "mean_token_accuracy": 0.14774202257394792, "num_tokens": 26048427.0, "step": 14125 }, { "entropy": 5.729209041595459, "epoch": 1.1871035496744382, "grad_norm": 1.7265625, "learning_rate": 0.0004863987286798541, "loss": 5.3801, "mean_token_accuracy": 0.16284161061048508, "num_tokens": 26057682.0, "step": 14130 }, { "entropy": 5.64456000328064, "epoch": 1.187523629489603, "grad_norm": 1.671875, "learning_rate": 0.0004863884727497693, "loss": 5.4509, "mean_token_accuracy": 0.1594451993703842, "num_tokens": 26066562.0, "step": 14135 }, { "entropy": 5.6487713813781735, "epoch": 1.187943709304768, "grad_norm": 1.796875, "learning_rate": 0.0004863782130750466, "loss": 5.3779, "mean_token_accuracy": 0.16446612328290938, "num_tokens": 26075633.0, "step": 14140 }, { "entropy": 5.723405551910401, "epoch": 1.1883637891199328, "grad_norm": 1.953125, "learning_rate": 0.00048636794965586764, "loss": 5.5428, "mean_token_accuracy": 0.1594787582755089, "num_tokens": 26085160.0, "step": 14145 }, { "entropy": 5.711528730392456, "epoch": 1.1887838689350976, "grad_norm": 1.84375, "learning_rate": 0.00048635768249241434, "loss": 5.4197, "mean_token_accuracy": 0.16347247660160064, "num_tokens": 26094157.0, "step": 14150 }, { "entropy": 5.7905010223388675, "epoch": 1.1892039487502626, "grad_norm": 1.671875, "learning_rate": 0.0004863474115848685, "loss": 5.5487, "mean_token_accuracy": 0.16446382999420167, "num_tokens": 26104459.0, "step": 14155 }, { "entropy": 5.677060556411743, "epoch": 1.1896240285654274, "grad_norm": 1.71875, "learning_rate": 0.00048633713693341214, "loss": 5.4867, "mean_token_accuracy": 0.16067123413085938, "num_tokens": 26114468.0, "step": 14160 }, { "entropy": 5.663212585449219, "epoch": 1.1900441083805924, "grad_norm": 1.484375, "learning_rate": 0.00048632685853822714, "loss": 5.4469, "mean_token_accuracy": 0.1624838277697563, "num_tokens": 26123408.0, "step": 14165 }, { "entropy": 5.629336786270142, "epoch": 1.1904641881957572, "grad_norm": 1.71875, "learning_rate": 0.0004863165763994957, "loss": 5.4641, "mean_token_accuracy": 0.15543654710054397, "num_tokens": 26132692.0, "step": 14170 }, { "entropy": 5.753988409042359, "epoch": 1.190884268010922, "grad_norm": 2.28125, "learning_rate": 0.0004863062905173999, "loss": 5.6279, "mean_token_accuracy": 0.15481553226709366, "num_tokens": 26142259.0, "step": 14175 }, { "entropy": 5.779358720779419, "epoch": 1.191304347826087, "grad_norm": 1.7109375, "learning_rate": 0.000486296000892122, "loss": 5.4794, "mean_token_accuracy": 0.16091232895851135, "num_tokens": 26151782.0, "step": 14180 }, { "entropy": 5.650760316848755, "epoch": 1.1917244276412517, "grad_norm": 1.859375, "learning_rate": 0.00048628570752384424, "loss": 5.3234, "mean_token_accuracy": 0.16556781977415086, "num_tokens": 26160449.0, "step": 14185 }, { "entropy": 5.700650358200074, "epoch": 1.1921445074564168, "grad_norm": 2.015625, "learning_rate": 0.00048627541041274897, "loss": 5.5649, "mean_token_accuracy": 0.1567431628704071, "num_tokens": 26169764.0, "step": 14190 }, { "entropy": 5.703988265991211, "epoch": 1.1925645872715815, "grad_norm": 1.8125, "learning_rate": 0.00048626510955901854, "loss": 5.4088, "mean_token_accuracy": 0.15994867235422133, "num_tokens": 26178759.0, "step": 14195 }, { "entropy": 5.739314889907837, "epoch": 1.1929846670867466, "grad_norm": 2.421875, "learning_rate": 0.0004862548049628356, "loss": 5.5509, "mean_token_accuracy": 0.1646982505917549, "num_tokens": 26187904.0, "step": 14200 }, { "entropy": 5.734980583190918, "epoch": 1.1934047469019113, "grad_norm": 2.015625, "learning_rate": 0.0004862444966243824, "loss": 5.4643, "mean_token_accuracy": 0.1669871285557747, "num_tokens": 26196563.0, "step": 14205 }, { "entropy": 5.778663492202758, "epoch": 1.1938248267170763, "grad_norm": 1.7109375, "learning_rate": 0.0004862341845438419, "loss": 5.4847, "mean_token_accuracy": 0.16169409304857255, "num_tokens": 26206573.0, "step": 14210 }, { "entropy": 5.661051893234253, "epoch": 1.1942449065322411, "grad_norm": 1.6015625, "learning_rate": 0.00048622386872139645, "loss": 5.3909, "mean_token_accuracy": 0.16438209414482116, "num_tokens": 26215308.0, "step": 14215 }, { "entropy": 5.587487888336182, "epoch": 1.194664986347406, "grad_norm": 2.0, "learning_rate": 0.000486213549157229, "loss": 5.4567, "mean_token_accuracy": 0.1640901729464531, "num_tokens": 26224379.0, "step": 14220 }, { "entropy": 5.664547252655029, "epoch": 1.195085066162571, "grad_norm": 1.78125, "learning_rate": 0.0004862032258515222, "loss": 5.4358, "mean_token_accuracy": 0.1679796889424324, "num_tokens": 26233620.0, "step": 14225 }, { "entropy": 5.725254678726197, "epoch": 1.1955051459777357, "grad_norm": 1.5859375, "learning_rate": 0.0004861928988044592, "loss": 5.5138, "mean_token_accuracy": 0.15623046904802323, "num_tokens": 26242556.0, "step": 14230 }, { "entropy": 5.718895196914673, "epoch": 1.1959252257929007, "grad_norm": 2.53125, "learning_rate": 0.0004861825680162226, "loss": 5.4946, "mean_token_accuracy": 0.16485830694437026, "num_tokens": 26251561.0, "step": 14235 }, { "entropy": 5.664663934707642, "epoch": 1.1963453056080655, "grad_norm": 1.890625, "learning_rate": 0.00048617223348699546, "loss": 5.4329, "mean_token_accuracy": 0.16026019304990768, "num_tokens": 26261115.0, "step": 14240 }, { "entropy": 5.770184707641602, "epoch": 1.1967653854232303, "grad_norm": 2.75, "learning_rate": 0.0004861618952169611, "loss": 5.591, "mean_token_accuracy": 0.1603381484746933, "num_tokens": 26271165.0, "step": 14245 }, { "entropy": 5.695276260375977, "epoch": 1.1971854652383953, "grad_norm": 1.734375, "learning_rate": 0.0004861515532063025, "loss": 5.5429, "mean_token_accuracy": 0.16051559895277023, "num_tokens": 26280822.0, "step": 14250 }, { "entropy": 5.69549150466919, "epoch": 1.19760554505356, "grad_norm": 1.6171875, "learning_rate": 0.00048614120745520275, "loss": 5.4191, "mean_token_accuracy": 0.16700200736522675, "num_tokens": 26288747.0, "step": 14255 }, { "entropy": 5.7050079822540285, "epoch": 1.198025624868725, "grad_norm": 1.5703125, "learning_rate": 0.00048613085796384524, "loss": 5.4945, "mean_token_accuracy": 0.15817514955997466, "num_tokens": 26298387.0, "step": 14260 }, { "entropy": 5.639023733139038, "epoch": 1.19844570468389, "grad_norm": 1.7578125, "learning_rate": 0.00048612050473241335, "loss": 5.3966, "mean_token_accuracy": 0.16590498983860016, "num_tokens": 26307016.0, "step": 14265 }, { "entropy": 5.690613889694214, "epoch": 1.198865784499055, "grad_norm": 2.40625, "learning_rate": 0.0004861101477610905, "loss": 5.5035, "mean_token_accuracy": 0.16300584375858307, "num_tokens": 26316296.0, "step": 14270 }, { "entropy": 5.692527532577515, "epoch": 1.1992858643142197, "grad_norm": 1.875, "learning_rate": 0.00048609978705006, "loss": 5.4837, "mean_token_accuracy": 0.1594039648771286, "num_tokens": 26325525.0, "step": 14275 }, { "entropy": 5.666857576370239, "epoch": 1.1997059441293847, "grad_norm": 2.109375, "learning_rate": 0.0004860894225995055, "loss": 5.377, "mean_token_accuracy": 0.16849509179592131, "num_tokens": 26334195.0, "step": 14280 }, { "entropy": 5.684696054458618, "epoch": 1.2001260239445495, "grad_norm": 1.890625, "learning_rate": 0.00048607905440961054, "loss": 5.512, "mean_token_accuracy": 0.16250620037317276, "num_tokens": 26343933.0, "step": 14285 }, { "entropy": 5.738911294937134, "epoch": 1.2005461037597143, "grad_norm": 1.5078125, "learning_rate": 0.00048606868248055887, "loss": 5.4441, "mean_token_accuracy": 0.16386907249689103, "num_tokens": 26353455.0, "step": 14290 }, { "entropy": 5.790994453430176, "epoch": 1.2009661835748793, "grad_norm": 1.65625, "learning_rate": 0.0004860583068125341, "loss": 5.458, "mean_token_accuracy": 0.16346363723278046, "num_tokens": 26362662.0, "step": 14295 }, { "entropy": 5.692120361328125, "epoch": 1.201386263390044, "grad_norm": 1.7109375, "learning_rate": 0.0004860479274057202, "loss": 5.4509, "mean_token_accuracy": 0.1605956733226776, "num_tokens": 26371536.0, "step": 14300 }, { "entropy": 5.720314931869507, "epoch": 1.201806343205209, "grad_norm": 1.46875, "learning_rate": 0.00048603754426030087, "loss": 5.5496, "mean_token_accuracy": 0.1566978722810745, "num_tokens": 26381925.0, "step": 14305 }, { "entropy": 5.662918376922607, "epoch": 1.2022264230203739, "grad_norm": 1.625, "learning_rate": 0.00048602715737646016, "loss": 5.4158, "mean_token_accuracy": 0.16778195053339004, "num_tokens": 26391111.0, "step": 14310 }, { "entropy": 5.84964919090271, "epoch": 1.2026465028355386, "grad_norm": 1.8671875, "learning_rate": 0.00048601676675438197, "loss": 5.5865, "mean_token_accuracy": 0.1477577805519104, "num_tokens": 26401667.0, "step": 14315 }, { "entropy": 5.686867713928223, "epoch": 1.2030665826507037, "grad_norm": 2.21875, "learning_rate": 0.00048600637239425045, "loss": 5.3949, "mean_token_accuracy": 0.1717774584889412, "num_tokens": 26411261.0, "step": 14320 }, { "entropy": 5.671998596191406, "epoch": 1.2034866624658684, "grad_norm": 1.6796875, "learning_rate": 0.00048599597429624966, "loss": 5.5392, "mean_token_accuracy": 0.15727996453642845, "num_tokens": 26419808.0, "step": 14325 }, { "entropy": 5.687448644638062, "epoch": 1.2039067422810334, "grad_norm": 1.734375, "learning_rate": 0.00048598557246056385, "loss": 5.458, "mean_token_accuracy": 0.16296297758817674, "num_tokens": 26429160.0, "step": 14330 }, { "entropy": 5.683249378204346, "epoch": 1.2043268220961982, "grad_norm": 1.5546875, "learning_rate": 0.00048597516688737727, "loss": 5.4074, "mean_token_accuracy": 0.16682589650154114, "num_tokens": 26437675.0, "step": 14335 }, { "entropy": 5.6975466251373295, "epoch": 1.2047469019113632, "grad_norm": 1.6640625, "learning_rate": 0.00048596475757687425, "loss": 5.4681, "mean_token_accuracy": 0.16042584478855132, "num_tokens": 26446317.0, "step": 14340 }, { "entropy": 5.708725118637085, "epoch": 1.205166981726528, "grad_norm": 1.859375, "learning_rate": 0.00048595434452923915, "loss": 5.5139, "mean_token_accuracy": 0.16216087639331817, "num_tokens": 26456183.0, "step": 14345 }, { "entropy": 5.6849305629730225, "epoch": 1.205587061541693, "grad_norm": 2.15625, "learning_rate": 0.00048594392774465656, "loss": 5.4568, "mean_token_accuracy": 0.15838514566421508, "num_tokens": 26466324.0, "step": 14350 }, { "entropy": 5.690262508392334, "epoch": 1.2060071413568578, "grad_norm": 1.8203125, "learning_rate": 0.00048593350722331074, "loss": 5.4705, "mean_token_accuracy": 0.1616607829928398, "num_tokens": 26475560.0, "step": 14355 }, { "entropy": 5.686220169067383, "epoch": 1.2064272211720226, "grad_norm": 1.7421875, "learning_rate": 0.00048592308296538654, "loss": 5.4449, "mean_token_accuracy": 0.16128322407603263, "num_tokens": 26484955.0, "step": 14360 }, { "entropy": 5.690824508666992, "epoch": 1.2068473009871876, "grad_norm": 1.609375, "learning_rate": 0.0004859126549710686, "loss": 5.4025, "mean_token_accuracy": 0.17448743879795076, "num_tokens": 26494306.0, "step": 14365 }, { "entropy": 5.605901575088501, "epoch": 1.2072673808023524, "grad_norm": 1.40625, "learning_rate": 0.00048590222324054153, "loss": 5.4058, "mean_token_accuracy": 0.16911747306585312, "num_tokens": 26503871.0, "step": 14370 }, { "entropy": 5.741916131973267, "epoch": 1.2076874606175174, "grad_norm": 2.515625, "learning_rate": 0.0004858917877739901, "loss": 5.5106, "mean_token_accuracy": 0.16466034948825836, "num_tokens": 26511929.0, "step": 14375 }, { "entropy": 5.765912389755249, "epoch": 1.2081075404326822, "grad_norm": 1.609375, "learning_rate": 0.0004858813485715994, "loss": 5.5129, "mean_token_accuracy": 0.15352574586868287, "num_tokens": 26520469.0, "step": 14380 }, { "entropy": 5.6565714359283445, "epoch": 1.208527620247847, "grad_norm": 1.53125, "learning_rate": 0.0004858709056335541, "loss": 5.4803, "mean_token_accuracy": 0.16136947721242906, "num_tokens": 26530102.0, "step": 14385 }, { "entropy": 5.6539154052734375, "epoch": 1.208947700063012, "grad_norm": 1.71875, "learning_rate": 0.00048586045896003926, "loss": 5.4784, "mean_token_accuracy": 0.15783216953277587, "num_tokens": 26538705.0, "step": 14390 }, { "entropy": 5.786140489578247, "epoch": 1.2093677798781768, "grad_norm": 1.875, "learning_rate": 0.0004858500085512401, "loss": 5.5837, "mean_token_accuracy": 0.15772880017757415, "num_tokens": 26548315.0, "step": 14395 }, { "entropy": 5.7165955066680905, "epoch": 1.2097878596933418, "grad_norm": 1.828125, "learning_rate": 0.00048583955440734144, "loss": 5.4101, "mean_token_accuracy": 0.1629326745867729, "num_tokens": 26556412.0, "step": 14400 }, { "entropy": 5.70180230140686, "epoch": 1.2102079395085066, "grad_norm": 1.6328125, "learning_rate": 0.00048582909652852873, "loss": 5.5744, "mean_token_accuracy": 0.16115047186613082, "num_tokens": 26566146.0, "step": 14405 }, { "entropy": 5.715734386444092, "epoch": 1.2106280193236716, "grad_norm": 1.703125, "learning_rate": 0.0004858186349149871, "loss": 5.4691, "mean_token_accuracy": 0.16265431568026542, "num_tokens": 26576019.0, "step": 14410 }, { "entropy": 5.612696838378906, "epoch": 1.2110480991388364, "grad_norm": 2.265625, "learning_rate": 0.000485808169566902, "loss": 5.3309, "mean_token_accuracy": 0.1696453645825386, "num_tokens": 26585461.0, "step": 14415 }, { "entropy": 5.62654824256897, "epoch": 1.2114681789540014, "grad_norm": 1.65625, "learning_rate": 0.00048579770048445863, "loss": 5.3726, "mean_token_accuracy": 0.18201425969600676, "num_tokens": 26594021.0, "step": 14420 }, { "entropy": 5.753058910369873, "epoch": 1.2118882587691662, "grad_norm": 1.734375, "learning_rate": 0.00048578722766784253, "loss": 5.5086, "mean_token_accuracy": 0.16204283237457276, "num_tokens": 26602712.0, "step": 14425 }, { "entropy": 5.593479490280151, "epoch": 1.212308338584331, "grad_norm": 1.8046875, "learning_rate": 0.00048577675111723925, "loss": 5.2025, "mean_token_accuracy": 0.18278668075799942, "num_tokens": 26610970.0, "step": 14430 }, { "entropy": 5.615044832229614, "epoch": 1.212728418399496, "grad_norm": 2.03125, "learning_rate": 0.00048576627083283435, "loss": 5.4954, "mean_token_accuracy": 0.16862737089395524, "num_tokens": 26619840.0, "step": 14435 }, { "entropy": 5.663373374938965, "epoch": 1.2131484982146608, "grad_norm": 1.7578125, "learning_rate": 0.0004857557868148136, "loss": 5.4002, "mean_token_accuracy": 0.16440703421831132, "num_tokens": 26629271.0, "step": 14440 }, { "entropy": 5.672978448867798, "epoch": 1.2135685780298258, "grad_norm": 1.734375, "learning_rate": 0.0004857452990633625, "loss": 5.4333, "mean_token_accuracy": 0.16087207645177842, "num_tokens": 26638610.0, "step": 14445 }, { "entropy": 5.792498302459717, "epoch": 1.2139886578449905, "grad_norm": 1.5703125, "learning_rate": 0.00048573480757866695, "loss": 5.5919, "mean_token_accuracy": 0.15683359503746033, "num_tokens": 26648504.0, "step": 14450 }, { "entropy": 5.720464372634888, "epoch": 1.2144087376601553, "grad_norm": 1.84375, "learning_rate": 0.00048572431236091284, "loss": 5.4654, "mean_token_accuracy": 0.16139672845602035, "num_tokens": 26658084.0, "step": 14455 }, { "entropy": 5.712548398971558, "epoch": 1.2148288174753203, "grad_norm": 1.8125, "learning_rate": 0.00048571381341028604, "loss": 5.5517, "mean_token_accuracy": 0.16073913276195526, "num_tokens": 26666933.0, "step": 14460 }, { "entropy": 5.752342224121094, "epoch": 1.2152488972904851, "grad_norm": 1.6953125, "learning_rate": 0.0004857033107269725, "loss": 5.4182, "mean_token_accuracy": 0.16725114732980728, "num_tokens": 26675049.0, "step": 14465 }, { "entropy": 5.664717102050782, "epoch": 1.2156689771056501, "grad_norm": 1.546875, "learning_rate": 0.00048569280431115823, "loss": 5.4942, "mean_token_accuracy": 0.16280431896448136, "num_tokens": 26684223.0, "step": 14470 }, { "entropy": 5.666110849380493, "epoch": 1.216089056920815, "grad_norm": 1.8125, "learning_rate": 0.0004856822941630296, "loss": 5.4388, "mean_token_accuracy": 0.15747048407793046, "num_tokens": 26693605.0, "step": 14475 }, { "entropy": 5.7499290943145756, "epoch": 1.2165091367359797, "grad_norm": 1.953125, "learning_rate": 0.00048567178028277255, "loss": 5.5114, "mean_token_accuracy": 0.16667446196079255, "num_tokens": 26702829.0, "step": 14480 }, { "entropy": 5.765132427215576, "epoch": 1.2169292165511447, "grad_norm": 1.5546875, "learning_rate": 0.0004856612626705733, "loss": 5.5496, "mean_token_accuracy": 0.15713531970977784, "num_tokens": 26712466.0, "step": 14485 }, { "entropy": 5.745383930206299, "epoch": 1.2173492963663095, "grad_norm": 1.8359375, "learning_rate": 0.0004856507413266183, "loss": 5.4247, "mean_token_accuracy": 0.16737874001264572, "num_tokens": 26721730.0, "step": 14490 }, { "entropy": 5.637966871261597, "epoch": 1.2177693761814745, "grad_norm": 1.6953125, "learning_rate": 0.000485640216251094, "loss": 5.5088, "mean_token_accuracy": 0.16009110063314438, "num_tokens": 26731017.0, "step": 14495 }, { "entropy": 5.674624824523926, "epoch": 1.2181894559966393, "grad_norm": 2.09375, "learning_rate": 0.00048562968744418665, "loss": 5.4761, "mean_token_accuracy": 0.16008124649524688, "num_tokens": 26739588.0, "step": 14500 }, { "entropy": 5.764046764373779, "epoch": 1.2186095358118043, "grad_norm": 1.9140625, "learning_rate": 0.0004856191549060828, "loss": 5.6018, "mean_token_accuracy": 0.15619692504405974, "num_tokens": 26748889.0, "step": 14505 }, { "entropy": 5.754044675827027, "epoch": 1.219029615626969, "grad_norm": 1.546875, "learning_rate": 0.00048560861863696913, "loss": 5.5297, "mean_token_accuracy": 0.15980444252490997, "num_tokens": 26757979.0, "step": 14510 }, { "entropy": 5.707068204879761, "epoch": 1.219449695442134, "grad_norm": 1.5625, "learning_rate": 0.0004855980786370322, "loss": 5.4485, "mean_token_accuracy": 0.16127097010612487, "num_tokens": 26767225.0, "step": 14515 }, { "entropy": 5.6698870182037355, "epoch": 1.219869775257299, "grad_norm": 1.4375, "learning_rate": 0.0004855875349064588, "loss": 5.3966, "mean_token_accuracy": 0.16548994332551956, "num_tokens": 26776289.0, "step": 14520 }, { "entropy": 5.744715166091919, "epoch": 1.2202898550724637, "grad_norm": 1.625, "learning_rate": 0.0004855769874454356, "loss": 5.5192, "mean_token_accuracy": 0.16024302393198014, "num_tokens": 26785631.0, "step": 14525 }, { "entropy": 5.691872644424438, "epoch": 1.2207099348876287, "grad_norm": 1.640625, "learning_rate": 0.0004855664362541495, "loss": 5.5232, "mean_token_accuracy": 0.16038859486579896, "num_tokens": 26795285.0, "step": 14530 }, { "entropy": 5.651829099655151, "epoch": 1.2211300147027935, "grad_norm": 1.5078125, "learning_rate": 0.00048555588133278744, "loss": 5.4307, "mean_token_accuracy": 0.16211945861577987, "num_tokens": 26804584.0, "step": 14535 }, { "entropy": 5.604468536376953, "epoch": 1.2215500945179585, "grad_norm": 1.515625, "learning_rate": 0.0004855453226815363, "loss": 5.3061, "mean_token_accuracy": 0.17006382644176482, "num_tokens": 26814354.0, "step": 14540 }, { "entropy": 5.626403427124023, "epoch": 1.2219701743331233, "grad_norm": 2.0625, "learning_rate": 0.00048553476030058326, "loss": 5.3466, "mean_token_accuracy": 0.17612583935260773, "num_tokens": 26824274.0, "step": 14545 }, { "entropy": 5.616889381408692, "epoch": 1.222390254148288, "grad_norm": 1.4921875, "learning_rate": 0.00048552419419011536, "loss": 5.4738, "mean_token_accuracy": 0.16051012128591538, "num_tokens": 26833155.0, "step": 14550 }, { "entropy": 5.667688083648682, "epoch": 1.222810333963453, "grad_norm": 1.4609375, "learning_rate": 0.0004855136243503196, "loss": 5.3997, "mean_token_accuracy": 0.1646553486585617, "num_tokens": 26842545.0, "step": 14555 }, { "entropy": 5.739150142669677, "epoch": 1.2232304137786179, "grad_norm": 1.7109375, "learning_rate": 0.00048550305078138363, "loss": 5.481, "mean_token_accuracy": 0.16481468081474304, "num_tokens": 26851772.0, "step": 14560 }, { "entropy": 5.648625612258911, "epoch": 1.2236504935937829, "grad_norm": 1.6953125, "learning_rate": 0.00048549247348349435, "loss": 5.3863, "mean_token_accuracy": 0.16550036519765854, "num_tokens": 26860884.0, "step": 14565 }, { "entropy": 5.679945564270019, "epoch": 1.2240705734089476, "grad_norm": 2.140625, "learning_rate": 0.00048548189245683934, "loss": 5.5126, "mean_token_accuracy": 0.1663243889808655, "num_tokens": 26869435.0, "step": 14570 }, { "entropy": 5.681559896469116, "epoch": 1.2244906532241127, "grad_norm": 1.4296875, "learning_rate": 0.00048547130770160596, "loss": 5.4131, "mean_token_accuracy": 0.16150881946086884, "num_tokens": 26878852.0, "step": 14575 }, { "entropy": 5.70316162109375, "epoch": 1.2249107330392774, "grad_norm": 1.5390625, "learning_rate": 0.0004854607192179817, "loss": 5.3864, "mean_token_accuracy": 0.1695043832063675, "num_tokens": 26887532.0, "step": 14580 }, { "entropy": 5.844434452056885, "epoch": 1.2253308128544425, "grad_norm": 1.5546875, "learning_rate": 0.0004854501270061543, "loss": 5.6029, "mean_token_accuracy": 0.15792314410209657, "num_tokens": 26897459.0, "step": 14585 }, { "entropy": 5.618150424957276, "epoch": 1.2257508926696072, "grad_norm": 1.5703125, "learning_rate": 0.00048543953106631115, "loss": 5.3795, "mean_token_accuracy": 0.16793021261692048, "num_tokens": 26907156.0, "step": 14590 }, { "entropy": 5.732923221588135, "epoch": 1.226170972484772, "grad_norm": 1.5703125, "learning_rate": 0.0004854289313986401, "loss": 5.4648, "mean_token_accuracy": 0.16741324663162233, "num_tokens": 26915764.0, "step": 14595 }, { "entropy": 5.644811153411865, "epoch": 1.226591052299937, "grad_norm": 1.7109375, "learning_rate": 0.0004854183280033289, "loss": 5.3429, "mean_token_accuracy": 0.16403224915266038, "num_tokens": 26924166.0, "step": 14600 }, { "entropy": 5.6976734638214115, "epoch": 1.2270111321151018, "grad_norm": 1.578125, "learning_rate": 0.0004854077208805654, "loss": 5.5704, "mean_token_accuracy": 0.1540565922856331, "num_tokens": 26933546.0, "step": 14605 }, { "entropy": 5.7353489875793455, "epoch": 1.2274312119302668, "grad_norm": 1.4375, "learning_rate": 0.0004853971100305374, "loss": 5.4901, "mean_token_accuracy": 0.1645752012729645, "num_tokens": 26943213.0, "step": 14610 }, { "entropy": 5.752119350433349, "epoch": 1.2278512917454316, "grad_norm": 1.4296875, "learning_rate": 0.000485386495453433, "loss": 5.4702, "mean_token_accuracy": 0.16524574309587478, "num_tokens": 26952968.0, "step": 14615 }, { "entropy": 5.690602731704712, "epoch": 1.2282713715605964, "grad_norm": 1.640625, "learning_rate": 0.00048537587714944007, "loss": 5.431, "mean_token_accuracy": 0.16387941986322402, "num_tokens": 26962230.0, "step": 14620 }, { "entropy": 5.637970733642578, "epoch": 1.2286914513757614, "grad_norm": 1.703125, "learning_rate": 0.0004853652551187469, "loss": 5.5035, "mean_token_accuracy": 0.16774664968252181, "num_tokens": 26970985.0, "step": 14625 }, { "entropy": 5.707252836227417, "epoch": 1.2291115311909262, "grad_norm": 1.6328125, "learning_rate": 0.00048535462936154147, "loss": 5.5344, "mean_token_accuracy": 0.16012766510248183, "num_tokens": 26981138.0, "step": 14630 }, { "entropy": 5.622266340255737, "epoch": 1.2295316110060912, "grad_norm": 1.671875, "learning_rate": 0.0004853439998780122, "loss": 5.3687, "mean_token_accuracy": 0.17002979367971421, "num_tokens": 26990158.0, "step": 14635 }, { "entropy": 5.6507940769195555, "epoch": 1.229951690821256, "grad_norm": 1.65625, "learning_rate": 0.0004853333666683472, "loss": 5.5224, "mean_token_accuracy": 0.15614334493875504, "num_tokens": 26998889.0, "step": 14640 }, { "entropy": 5.708015727996826, "epoch": 1.230371770636421, "grad_norm": 2.203125, "learning_rate": 0.00048532272973273496, "loss": 5.4656, "mean_token_accuracy": 0.16113510280847548, "num_tokens": 27008912.0, "step": 14645 }, { "entropy": 5.671196317672729, "epoch": 1.2307918504515858, "grad_norm": 1.6171875, "learning_rate": 0.00048531208907136384, "loss": 5.3541, "mean_token_accuracy": 0.17473920732736586, "num_tokens": 27017573.0, "step": 14650 }, { "entropy": 5.658402824401856, "epoch": 1.2312119302667508, "grad_norm": 1.484375, "learning_rate": 0.00048530144468442236, "loss": 5.4297, "mean_token_accuracy": 0.1590592809021473, "num_tokens": 27027205.0, "step": 14655 }, { "entropy": 5.66589732170105, "epoch": 1.2316320100819156, "grad_norm": 1.7890625, "learning_rate": 0.00048529079657209906, "loss": 5.3827, "mean_token_accuracy": 0.16773709654808044, "num_tokens": 27035882.0, "step": 14660 }, { "entropy": 5.628454732894897, "epoch": 1.2320520898970804, "grad_norm": 1.5390625, "learning_rate": 0.0004852801447345826, "loss": 5.4555, "mean_token_accuracy": 0.17012043595314025, "num_tokens": 27044761.0, "step": 14665 }, { "entropy": 5.6688700199127195, "epoch": 1.2324721697122454, "grad_norm": 1.46875, "learning_rate": 0.0004852694891720617, "loss": 5.4815, "mean_token_accuracy": 0.16467399448156356, "num_tokens": 27054149.0, "step": 14670 }, { "entropy": 5.725511741638184, "epoch": 1.2328922495274102, "grad_norm": 1.4375, "learning_rate": 0.000485258829884725, "loss": 5.524, "mean_token_accuracy": 0.1634502664208412, "num_tokens": 27063145.0, "step": 14675 }, { "entropy": 5.7596677303314205, "epoch": 1.2333123293425752, "grad_norm": 1.546875, "learning_rate": 0.0004852481668727614, "loss": 5.4697, "mean_token_accuracy": 0.16408599615097047, "num_tokens": 27072378.0, "step": 14680 }, { "entropy": 5.588124799728393, "epoch": 1.23373240915774, "grad_norm": 1.703125, "learning_rate": 0.00048523750013635986, "loss": 5.354, "mean_token_accuracy": 0.16549673229455947, "num_tokens": 27082241.0, "step": 14685 }, { "entropy": 5.605792379379272, "epoch": 1.2341524889729047, "grad_norm": 1.734375, "learning_rate": 0.0004852268296757092, "loss": 5.3762, "mean_token_accuracy": 0.16784797310829164, "num_tokens": 27091488.0, "step": 14690 }, { "entropy": 5.743075704574585, "epoch": 1.2345725687880698, "grad_norm": 1.7421875, "learning_rate": 0.0004852161554909985, "loss": 5.4272, "mean_token_accuracy": 0.16824524402618407, "num_tokens": 27100378.0, "step": 14695 }, { "entropy": 5.69188551902771, "epoch": 1.2349926486032345, "grad_norm": 1.5703125, "learning_rate": 0.00048520547758241686, "loss": 5.4522, "mean_token_accuracy": 0.16235414147377014, "num_tokens": 27110341.0, "step": 14700 }, { "entropy": 5.656498527526855, "epoch": 1.2354127284183996, "grad_norm": 1.375, "learning_rate": 0.00048519479595015343, "loss": 5.3965, "mean_token_accuracy": 0.1622692197561264, "num_tokens": 27119381.0, "step": 14705 }, { "entropy": 5.605996942520141, "epoch": 1.2358328082335643, "grad_norm": 1.6015625, "learning_rate": 0.00048518411059439746, "loss": 5.4951, "mean_token_accuracy": 0.1566877394914627, "num_tokens": 27129167.0, "step": 14710 }, { "entropy": 5.697007560729981, "epoch": 1.2362528880487293, "grad_norm": 1.5703125, "learning_rate": 0.00048517342151533813, "loss": 5.5005, "mean_token_accuracy": 0.1557912290096283, "num_tokens": 27138479.0, "step": 14715 }, { "entropy": 5.697368383407593, "epoch": 1.2366729678638941, "grad_norm": 1.40625, "learning_rate": 0.0004851627287131649, "loss": 5.3838, "mean_token_accuracy": 0.16886205822229386, "num_tokens": 27147197.0, "step": 14720 }, { "entropy": 5.643680572509766, "epoch": 1.2370930476790591, "grad_norm": 1.71875, "learning_rate": 0.0004851520321880672, "loss": 5.4126, "mean_token_accuracy": 0.1719201013445854, "num_tokens": 27155854.0, "step": 14725 }, { "entropy": 5.657077169418335, "epoch": 1.237513127494224, "grad_norm": 1.75, "learning_rate": 0.0004851413319402344, "loss": 5.3862, "mean_token_accuracy": 0.1578731968998909, "num_tokens": 27165069.0, "step": 14730 }, { "entropy": 5.684050750732422, "epoch": 1.2379332073093887, "grad_norm": 1.859375, "learning_rate": 0.0004851306279698561, "loss": 5.4352, "mean_token_accuracy": 0.16021962463855743, "num_tokens": 27174070.0, "step": 14735 }, { "entropy": 5.788384103775025, "epoch": 1.2383532871245537, "grad_norm": 1.7421875, "learning_rate": 0.0004851199202771219, "loss": 5.5038, "mean_token_accuracy": 0.1639639750123024, "num_tokens": 27182903.0, "step": 14740 }, { "entropy": 5.693592119216919, "epoch": 1.2387733669397185, "grad_norm": 1.6796875, "learning_rate": 0.0004851092088622216, "loss": 5.4264, "mean_token_accuracy": 0.17083500623703002, "num_tokens": 27192747.0, "step": 14745 }, { "entropy": 5.670225000381469, "epoch": 1.2391934467548835, "grad_norm": 1.5078125, "learning_rate": 0.0004850984937253448, "loss": 5.4402, "mean_token_accuracy": 0.1658121481537819, "num_tokens": 27201657.0, "step": 14750 }, { "entropy": 5.693979692459107, "epoch": 1.2396135265700483, "grad_norm": 1.6328125, "learning_rate": 0.0004850877748666814, "loss": 5.4621, "mean_token_accuracy": 0.16480949372053147, "num_tokens": 27211794.0, "step": 14755 }, { "entropy": 5.638466024398804, "epoch": 1.240033606385213, "grad_norm": 1.7265625, "learning_rate": 0.00048507705228642117, "loss": 5.4174, "mean_token_accuracy": 0.1595284804701805, "num_tokens": 27221852.0, "step": 14760 }, { "entropy": 5.654482078552246, "epoch": 1.240453686200378, "grad_norm": 1.65625, "learning_rate": 0.0004850663259847542, "loss": 5.4612, "mean_token_accuracy": 0.158142551779747, "num_tokens": 27231558.0, "step": 14765 }, { "entropy": 5.628722333908081, "epoch": 1.240873766015543, "grad_norm": 1.8671875, "learning_rate": 0.00048505559596187037, "loss": 5.451, "mean_token_accuracy": 0.16363227218389512, "num_tokens": 27241053.0, "step": 14770 }, { "entropy": 5.614446783065796, "epoch": 1.241293845830708, "grad_norm": 2.0, "learning_rate": 0.0004850448622179599, "loss": 5.3357, "mean_token_accuracy": 0.1671755015850067, "num_tokens": 27249770.0, "step": 14775 }, { "entropy": 5.800767087936402, "epoch": 1.2417139256458727, "grad_norm": 2.390625, "learning_rate": 0.0004850341247532128, "loss": 5.5805, "mean_token_accuracy": 0.15848884508013725, "num_tokens": 27258883.0, "step": 14780 }, { "entropy": 5.751977014541626, "epoch": 1.2421340054610377, "grad_norm": 1.75, "learning_rate": 0.0004850233835678194, "loss": 5.4846, "mean_token_accuracy": 0.1624804139137268, "num_tokens": 27268056.0, "step": 14785 }, { "entropy": 5.669937515258789, "epoch": 1.2425540852762025, "grad_norm": 2.21875, "learning_rate": 0.0004850126386619699, "loss": 5.3487, "mean_token_accuracy": 0.17517259567975998, "num_tokens": 27276965.0, "step": 14790 }, { "entropy": 5.600133562088013, "epoch": 1.2429741650913673, "grad_norm": 1.6171875, "learning_rate": 0.0004850018900358545, "loss": 5.4149, "mean_token_accuracy": 0.16797211319208144, "num_tokens": 27286173.0, "step": 14795 }, { "entropy": 5.646801853179932, "epoch": 1.2433942449065323, "grad_norm": 1.6640625, "learning_rate": 0.00048499113768966386, "loss": 5.4173, "mean_token_accuracy": 0.16762335151433944, "num_tokens": 27294863.0, "step": 14800 }, { "entropy": 5.730639934539795, "epoch": 1.243814324721697, "grad_norm": 1.625, "learning_rate": 0.0004849803816235884, "loss": 5.4551, "mean_token_accuracy": 0.16181258857250214, "num_tokens": 27304427.0, "step": 14805 }, { "entropy": 5.7499453067779545, "epoch": 1.244234404536862, "grad_norm": 1.609375, "learning_rate": 0.0004849696218378185, "loss": 5.53, "mean_token_accuracy": 0.16161169856786728, "num_tokens": 27313716.0, "step": 14810 }, { "entropy": 5.7411253452301025, "epoch": 1.2446544843520269, "grad_norm": 1.484375, "learning_rate": 0.0004849588583325449, "loss": 5.4179, "mean_token_accuracy": 0.17681172788143157, "num_tokens": 27322342.0, "step": 14815 }, { "entropy": 5.742122983932495, "epoch": 1.2450745641671919, "grad_norm": 1.53125, "learning_rate": 0.0004849480911079583, "loss": 5.4983, "mean_token_accuracy": 0.15292923152446747, "num_tokens": 27331892.0, "step": 14820 }, { "entropy": 5.687739038467408, "epoch": 1.2454946439823567, "grad_norm": 1.3984375, "learning_rate": 0.0004849373201642493, "loss": 5.4674, "mean_token_accuracy": 0.15925178527832032, "num_tokens": 27340428.0, "step": 14825 }, { "entropy": 5.6958386421203615, "epoch": 1.2459147237975214, "grad_norm": 1.640625, "learning_rate": 0.0004849265455016088, "loss": 5.4664, "mean_token_accuracy": 0.16365174651145936, "num_tokens": 27349224.0, "step": 14830 }, { "entropy": 5.661598014831543, "epoch": 1.2463348036126864, "grad_norm": 1.5625, "learning_rate": 0.0004849157671202277, "loss": 5.4434, "mean_token_accuracy": 0.16567779928445817, "num_tokens": 27357480.0, "step": 14835 }, { "entropy": 5.658696794509888, "epoch": 1.2467548834278512, "grad_norm": 1.5390625, "learning_rate": 0.0004849049850202968, "loss": 5.3717, "mean_token_accuracy": 0.17218401432037353, "num_tokens": 27366732.0, "step": 14840 }, { "entropy": 5.671054315567017, "epoch": 1.2471749632430162, "grad_norm": 1.5234375, "learning_rate": 0.0004848941992020072, "loss": 5.4774, "mean_token_accuracy": 0.15841912627220153, "num_tokens": 27375834.0, "step": 14845 }, { "entropy": 5.730887794494629, "epoch": 1.247595043058181, "grad_norm": 1.5546875, "learning_rate": 0.0004848834096655499, "loss": 5.4563, "mean_token_accuracy": 0.16432572156190872, "num_tokens": 27385311.0, "step": 14850 }, { "entropy": 5.700474452972412, "epoch": 1.2480151228733458, "grad_norm": 1.4921875, "learning_rate": 0.00048487261641111607, "loss": 5.5133, "mean_token_accuracy": 0.16188574731349945, "num_tokens": 27394587.0, "step": 14855 }, { "entropy": 5.581315422058106, "epoch": 1.2484352026885108, "grad_norm": 1.3203125, "learning_rate": 0.000484861819438897, "loss": 5.3722, "mean_token_accuracy": 0.1629566103219986, "num_tokens": 27403316.0, "step": 14860 }, { "entropy": 5.674688768386841, "epoch": 1.2488552825036756, "grad_norm": 1.46875, "learning_rate": 0.0004848510187490838, "loss": 5.4211, "mean_token_accuracy": 0.16881508529186248, "num_tokens": 27412709.0, "step": 14865 }, { "entropy": 5.717575883865356, "epoch": 1.2492753623188406, "grad_norm": 1.65625, "learning_rate": 0.0004848402143418679, "loss": 5.4867, "mean_token_accuracy": 0.16073511987924577, "num_tokens": 27422004.0, "step": 14870 }, { "entropy": 5.667223167419434, "epoch": 1.2496954421340054, "grad_norm": 1.78125, "learning_rate": 0.00048482940621744053, "loss": 5.5146, "mean_token_accuracy": 0.16103297472000122, "num_tokens": 27431931.0, "step": 14875 }, { "entropy": 5.64241132736206, "epoch": 1.2501155219491704, "grad_norm": 1.2890625, "learning_rate": 0.0004848185943759934, "loss": 5.3291, "mean_token_accuracy": 0.17295840233564377, "num_tokens": 27441527.0, "step": 14880 }, { "entropy": 5.751472759246826, "epoch": 1.2505356017643352, "grad_norm": 1.6796875, "learning_rate": 0.00048480777881771786, "loss": 5.488, "mean_token_accuracy": 0.16338546127080916, "num_tokens": 27449964.0, "step": 14885 }, { "entropy": 5.653960943222046, "epoch": 1.2509556815795002, "grad_norm": 2.0625, "learning_rate": 0.0004847969595428056, "loss": 5.4769, "mean_token_accuracy": 0.16023507416248323, "num_tokens": 27459044.0, "step": 14890 }, { "entropy": 5.632353162765503, "epoch": 1.251375761394665, "grad_norm": 2.5625, "learning_rate": 0.00048478613655144817, "loss": 5.4677, "mean_token_accuracy": 0.16684045344591142, "num_tokens": 27467644.0, "step": 14895 }, { "entropy": 5.754183292388916, "epoch": 1.2517958412098298, "grad_norm": 2.1875, "learning_rate": 0.0004847753098438374, "loss": 5.4969, "mean_token_accuracy": 0.15503143072128295, "num_tokens": 27476899.0, "step": 14900 }, { "entropy": 5.713054418563843, "epoch": 1.2522159210249948, "grad_norm": 1.5625, "learning_rate": 0.000484764479420165, "loss": 5.3986, "mean_token_accuracy": 0.16840293928980826, "num_tokens": 27485167.0, "step": 14905 }, { "entropy": 5.67601432800293, "epoch": 1.2526360008401596, "grad_norm": 1.78125, "learning_rate": 0.00048475364528062287, "loss": 5.4366, "mean_token_accuracy": 0.15893664807081223, "num_tokens": 27493986.0, "step": 14910 }, { "entropy": 5.717255640029907, "epoch": 1.2530560806553246, "grad_norm": 1.734375, "learning_rate": 0.0004847428074254029, "loss": 5.481, "mean_token_accuracy": 0.1676044538617134, "num_tokens": 27503896.0, "step": 14915 }, { "entropy": 5.700136041641235, "epoch": 1.2534761604704894, "grad_norm": 1.78125, "learning_rate": 0.00048473196585469713, "loss": 5.4409, "mean_token_accuracy": 0.16624458730220795, "num_tokens": 27513485.0, "step": 14920 }, { "entropy": 5.725602149963379, "epoch": 1.2538962402856542, "grad_norm": 1.765625, "learning_rate": 0.00048472112056869763, "loss": 5.5032, "mean_token_accuracy": 0.15849509388208388, "num_tokens": 27523164.0, "step": 14925 }, { "entropy": 5.7331983089447025, "epoch": 1.2543163201008192, "grad_norm": 1.609375, "learning_rate": 0.0004847102715675964, "loss": 5.4388, "mean_token_accuracy": 0.16513479351997376, "num_tokens": 27531387.0, "step": 14930 }, { "entropy": 5.6596925258636475, "epoch": 1.254736399915984, "grad_norm": 1.703125, "learning_rate": 0.0004846994188515857, "loss": 5.4488, "mean_token_accuracy": 0.16895988285541536, "num_tokens": 27541754.0, "step": 14935 }, { "entropy": 5.79337100982666, "epoch": 1.255156479731149, "grad_norm": 1.65625, "learning_rate": 0.0004846885624208578, "loss": 5.5214, "mean_token_accuracy": 0.158653724193573, "num_tokens": 27551458.0, "step": 14940 }, { "entropy": 5.685010766983032, "epoch": 1.2555765595463138, "grad_norm": 1.890625, "learning_rate": 0.000484677702275605, "loss": 5.4378, "mean_token_accuracy": 0.16842745393514633, "num_tokens": 27560797.0, "step": 14945 }, { "entropy": 5.695211362838745, "epoch": 1.2559966393614788, "grad_norm": 1.546875, "learning_rate": 0.00048466683841601963, "loss": 5.4206, "mean_token_accuracy": 0.16701247841119765, "num_tokens": 27570166.0, "step": 14950 }, { "entropy": 5.662879896163941, "epoch": 1.2564167191766435, "grad_norm": 1.5, "learning_rate": 0.00048465597084229416, "loss": 5.3411, "mean_token_accuracy": 0.16752343326807023, "num_tokens": 27579411.0, "step": 14955 }, { "entropy": 5.737317419052124, "epoch": 1.2568367989918086, "grad_norm": 1.4296875, "learning_rate": 0.0004846450995546212, "loss": 5.5894, "mean_token_accuracy": 0.15929221510887145, "num_tokens": 27589124.0, "step": 14960 }, { "entropy": 5.76739387512207, "epoch": 1.2572568788069733, "grad_norm": 1.71875, "learning_rate": 0.0004846342245531932, "loss": 5.5526, "mean_token_accuracy": 0.15253591239452363, "num_tokens": 27598664.0, "step": 14965 }, { "entropy": 5.792992496490479, "epoch": 1.2576769586221381, "grad_norm": 1.546875, "learning_rate": 0.0004846233458382029, "loss": 5.4779, "mean_token_accuracy": 0.16482626497745514, "num_tokens": 27607189.0, "step": 14970 }, { "entropy": 5.758588409423828, "epoch": 1.2580970384373031, "grad_norm": 1.796875, "learning_rate": 0.00048461246340984293, "loss": 5.5099, "mean_token_accuracy": 0.16399455666542054, "num_tokens": 27616415.0, "step": 14975 }, { "entropy": 5.67619571685791, "epoch": 1.258517118252468, "grad_norm": 1.46875, "learning_rate": 0.0004846015772683061, "loss": 5.4745, "mean_token_accuracy": 0.1670221731066704, "num_tokens": 27624492.0, "step": 14980 }, { "entropy": 5.610988140106201, "epoch": 1.258937198067633, "grad_norm": 1.7578125, "learning_rate": 0.00048459068741378526, "loss": 5.3731, "mean_token_accuracy": 0.16672062426805495, "num_tokens": 27634243.0, "step": 14985 }, { "entropy": 5.695155811309815, "epoch": 1.2593572778827977, "grad_norm": 1.6953125, "learning_rate": 0.0004845797938464734, "loss": 5.4803, "mean_token_accuracy": 0.16463592499494553, "num_tokens": 27642887.0, "step": 14990 }, { "entropy": 5.7585619449615475, "epoch": 1.2597773576979625, "grad_norm": 1.609375, "learning_rate": 0.0004845688965665633, "loss": 5.4946, "mean_token_accuracy": 0.1642697721719742, "num_tokens": 27652524.0, "step": 14995 }, { "entropy": 5.68261866569519, "epoch": 1.2601974375131275, "grad_norm": 1.5234375, "learning_rate": 0.00048455799557424814, "loss": 5.3471, "mean_token_accuracy": 0.17591068595647813, "num_tokens": 27661306.0, "step": 15000 }, { "epoch": 1.2601974375131275, "eval_entropy": 5.542287499050695, "eval_loss": 5.52593994140625, "eval_mean_token_accuracy": 0.16979930738796262, "eval_num_tokens": 27661306.0, "eval_runtime": 27.4053, "eval_samples_per_second": 1363.46, "eval_steps_per_second": 170.442, "step": 15000 }, { "entropy": 5.719019222259521, "epoch": 1.2606175173282923, "grad_norm": 1.6015625, "learning_rate": 0.0004845470908697209, "loss": 5.5345, "mean_token_accuracy": 0.1672997236251831, "num_tokens": 27671728.0, "step": 15005 }, { "entropy": 5.660177707672119, "epoch": 1.2610375971434573, "grad_norm": 1.421875, "learning_rate": 0.000484536182453175, "loss": 5.3345, "mean_token_accuracy": 0.16970676183700562, "num_tokens": 27680740.0, "step": 15010 }, { "entropy": 5.690030097961426, "epoch": 1.261457676958622, "grad_norm": 1.6640625, "learning_rate": 0.0004845252703248035, "loss": 5.4072, "mean_token_accuracy": 0.16504298150539398, "num_tokens": 27689865.0, "step": 15015 }, { "entropy": 5.6956014156341555, "epoch": 1.2618777567737869, "grad_norm": 1.4765625, "learning_rate": 0.0004845143544847997, "loss": 5.4473, "mean_token_accuracy": 0.1682340383529663, "num_tokens": 27700366.0, "step": 15020 }, { "entropy": 5.698393678665161, "epoch": 1.262297836588952, "grad_norm": 1.5390625, "learning_rate": 0.00048450343493335697, "loss": 5.3561, "mean_token_accuracy": 0.17051917016506196, "num_tokens": 27708893.0, "step": 15025 }, { "entropy": 5.611342048645019, "epoch": 1.262717916404117, "grad_norm": 1.4609375, "learning_rate": 0.0004844925116706688, "loss": 5.3771, "mean_token_accuracy": 0.16306255012750626, "num_tokens": 27717494.0, "step": 15030 }, { "entropy": 5.57361912727356, "epoch": 1.2631379962192817, "grad_norm": 2.328125, "learning_rate": 0.00048448158469692866, "loss": 5.3038, "mean_token_accuracy": 0.18097079247236253, "num_tokens": 27726487.0, "step": 15035 }, { "entropy": 5.786226844787597, "epoch": 1.2635580760344465, "grad_norm": 1.9375, "learning_rate": 0.0004844706540123301, "loss": 5.5463, "mean_token_accuracy": 0.15970377177000045, "num_tokens": 27736602.0, "step": 15040 }, { "entropy": 5.89350733757019, "epoch": 1.2639781558496115, "grad_norm": 1.65625, "learning_rate": 0.00048445971961706675, "loss": 5.5419, "mean_token_accuracy": 0.15724890679121017, "num_tokens": 27746322.0, "step": 15045 }, { "entropy": 5.636753940582276, "epoch": 1.2643982356647763, "grad_norm": 1.7109375, "learning_rate": 0.0004844487815113323, "loss": 5.3895, "mean_token_accuracy": 0.1694614127278328, "num_tokens": 27754941.0, "step": 15050 }, { "entropy": 5.603873300552368, "epoch": 1.2648183154799413, "grad_norm": 1.890625, "learning_rate": 0.0004844378396953206, "loss": 5.4706, "mean_token_accuracy": 0.16238831877708435, "num_tokens": 27763941.0, "step": 15055 }, { "entropy": 5.733206653594971, "epoch": 1.265238395295106, "grad_norm": 1.5703125, "learning_rate": 0.00048442689416922536, "loss": 5.4854, "mean_token_accuracy": 0.16823527961969376, "num_tokens": 27773087.0, "step": 15060 }, { "entropy": 5.640398788452148, "epoch": 1.2656584751102709, "grad_norm": 1.6796875, "learning_rate": 0.00048441594493324057, "loss": 5.3039, "mean_token_accuracy": 0.17487951517105102, "num_tokens": 27782648.0, "step": 15065 }, { "entropy": 5.66456823348999, "epoch": 1.2660785549254359, "grad_norm": 1.59375, "learning_rate": 0.00048440499198756015, "loss": 5.5098, "mean_token_accuracy": 0.16223005801439286, "num_tokens": 27791567.0, "step": 15070 }, { "entropy": 5.695383977890015, "epoch": 1.2664986347406006, "grad_norm": 1.53125, "learning_rate": 0.00048439403533237816, "loss": 5.499, "mean_token_accuracy": 0.1588960826396942, "num_tokens": 27801397.0, "step": 15075 }, { "entropy": 5.790954875946045, "epoch": 1.2669187145557657, "grad_norm": 1.765625, "learning_rate": 0.0004843830749678886, "loss": 5.5147, "mean_token_accuracy": 0.16107721030712127, "num_tokens": 27810831.0, "step": 15080 }, { "entropy": 5.717430448532104, "epoch": 1.2673387943709304, "grad_norm": 2.390625, "learning_rate": 0.0004843721108942856, "loss": 5.4237, "mean_token_accuracy": 0.16757311969995498, "num_tokens": 27819591.0, "step": 15085 }, { "entropy": 5.6086828231811525, "epoch": 1.2677588741860952, "grad_norm": 1.8046875, "learning_rate": 0.0004843611431117636, "loss": 5.4138, "mean_token_accuracy": 0.1716834545135498, "num_tokens": 27828614.0, "step": 15090 }, { "entropy": 5.673300123214721, "epoch": 1.2681789540012602, "grad_norm": 1.75, "learning_rate": 0.0004843501716205167, "loss": 5.4511, "mean_token_accuracy": 0.165350541472435, "num_tokens": 27837549.0, "step": 15095 }, { "entropy": 5.737055730819702, "epoch": 1.2685990338164252, "grad_norm": 1.6328125, "learning_rate": 0.0004843391964207393, "loss": 5.4743, "mean_token_accuracy": 0.15991066843271257, "num_tokens": 27846678.0, "step": 15100 }, { "entropy": 5.789986085891724, "epoch": 1.26901911363159, "grad_norm": 1.65625, "learning_rate": 0.0004843282175126258, "loss": 5.4962, "mean_token_accuracy": 0.1644158586859703, "num_tokens": 27855734.0, "step": 15105 }, { "entropy": 5.703271150588989, "epoch": 1.2694391934467548, "grad_norm": 2.328125, "learning_rate": 0.00048431723489637086, "loss": 5.4225, "mean_token_accuracy": 0.16743371933698653, "num_tokens": 27865111.0, "step": 15110 }, { "entropy": 5.7195985317230225, "epoch": 1.2698592732619198, "grad_norm": 2.140625, "learning_rate": 0.00048430624857216876, "loss": 5.4393, "mean_token_accuracy": 0.1662244826555252, "num_tokens": 27874495.0, "step": 15115 }, { "entropy": 5.6339555263519285, "epoch": 1.2702793530770846, "grad_norm": 1.703125, "learning_rate": 0.0004842952585402143, "loss": 5.4758, "mean_token_accuracy": 0.16450706571340562, "num_tokens": 27884531.0, "step": 15120 }, { "entropy": 5.596436595916748, "epoch": 1.2706994328922496, "grad_norm": 2.21875, "learning_rate": 0.000484284264800702, "loss": 5.3613, "mean_token_accuracy": 0.17341870963573455, "num_tokens": 27893463.0, "step": 15125 }, { "entropy": 5.757380199432373, "epoch": 1.2711195127074144, "grad_norm": 1.9453125, "learning_rate": 0.00048427326735382687, "loss": 5.4724, "mean_token_accuracy": 0.16172740906476973, "num_tokens": 27903015.0, "step": 15130 }, { "entropy": 5.742963027954102, "epoch": 1.2715395925225792, "grad_norm": 9.8125, "learning_rate": 0.0004842622661997834, "loss": 5.4552, "mean_token_accuracy": 0.16410297602415086, "num_tokens": 27912207.0, "step": 15135 }, { "entropy": 5.6874500751495365, "epoch": 1.2719596723377442, "grad_norm": 1.7578125, "learning_rate": 0.0004842512613387668, "loss": 5.4679, "mean_token_accuracy": 0.1574219599366188, "num_tokens": 27921566.0, "step": 15140 }, { "entropy": 5.663531732559204, "epoch": 1.272379752152909, "grad_norm": 1.4765625, "learning_rate": 0.0004842402527709718, "loss": 5.4061, "mean_token_accuracy": 0.16983576118946075, "num_tokens": 27930633.0, "step": 15145 }, { "entropy": 5.78377251625061, "epoch": 1.272799831968074, "grad_norm": 1.75, "learning_rate": 0.0004842292404965934, "loss": 5.5197, "mean_token_accuracy": 0.1595507562160492, "num_tokens": 27939887.0, "step": 15150 }, { "entropy": 5.767408180236816, "epoch": 1.2732199117832388, "grad_norm": 1.71875, "learning_rate": 0.0004842182245158268, "loss": 5.5257, "mean_token_accuracy": 0.16959029585123062, "num_tokens": 27949090.0, "step": 15155 }, { "entropy": 5.610546350479126, "epoch": 1.2736399915984036, "grad_norm": 1.5859375, "learning_rate": 0.00048420720482886715, "loss": 5.3312, "mean_token_accuracy": 0.1733013227581978, "num_tokens": 27958141.0, "step": 15160 }, { "entropy": 5.63969464302063, "epoch": 1.2740600714135686, "grad_norm": 1.59375, "learning_rate": 0.0004841961814359095, "loss": 5.4047, "mean_token_accuracy": 0.16643078476190568, "num_tokens": 27967780.0, "step": 15165 }, { "entropy": 5.69786319732666, "epoch": 1.2744801512287336, "grad_norm": 1.90625, "learning_rate": 0.00048418515433714917, "loss": 5.489, "mean_token_accuracy": 0.16522752195596696, "num_tokens": 27976243.0, "step": 15170 }, { "entropy": 5.6997581958770756, "epoch": 1.2749002310438984, "grad_norm": 1.859375, "learning_rate": 0.0004841741235327817, "loss": 5.3579, "mean_token_accuracy": 0.17067465782165528, "num_tokens": 27985874.0, "step": 15175 }, { "entropy": 5.806114244461059, "epoch": 1.2753203108590632, "grad_norm": 1.75, "learning_rate": 0.00048416308902300215, "loss": 5.5921, "mean_token_accuracy": 0.15702388137578965, "num_tokens": 27995111.0, "step": 15180 }, { "entropy": 5.689389657974243, "epoch": 1.2757403906742282, "grad_norm": 1.828125, "learning_rate": 0.0004841520508080063, "loss": 5.4127, "mean_token_accuracy": 0.1689732179045677, "num_tokens": 28003948.0, "step": 15185 }, { "entropy": 5.6548957347869875, "epoch": 1.276160470489393, "grad_norm": 2.171875, "learning_rate": 0.00048414100888798957, "loss": 5.4174, "mean_token_accuracy": 0.16478729695081712, "num_tokens": 28012941.0, "step": 15190 }, { "entropy": 5.601344108581543, "epoch": 1.276580550304558, "grad_norm": 3.359375, "learning_rate": 0.0004841299632631475, "loss": 5.41, "mean_token_accuracy": 0.1636947825551033, "num_tokens": 28022195.0, "step": 15195 }, { "entropy": 5.65929913520813, "epoch": 1.2770006301197228, "grad_norm": 1.65625, "learning_rate": 0.0004841189139336759, "loss": 5.3589, "mean_token_accuracy": 0.16983367949724198, "num_tokens": 28031446.0, "step": 15200 }, { "entropy": 5.688397693634033, "epoch": 1.2774207099348875, "grad_norm": 1.7734375, "learning_rate": 0.0004841078608997703, "loss": 5.3801, "mean_token_accuracy": 0.17025842219591142, "num_tokens": 28040906.0, "step": 15205 }, { "entropy": 5.676456069946289, "epoch": 1.2778407897500526, "grad_norm": 1.3984375, "learning_rate": 0.0004840968041616267, "loss": 5.3894, "mean_token_accuracy": 0.1704905390739441, "num_tokens": 28049848.0, "step": 15210 }, { "entropy": 5.67938723564148, "epoch": 1.2782608695652173, "grad_norm": 1.484375, "learning_rate": 0.00048408574371944094, "loss": 5.3732, "mean_token_accuracy": 0.16771376579999925, "num_tokens": 28058276.0, "step": 15215 }, { "entropy": 5.688129663467407, "epoch": 1.2786809493803823, "grad_norm": 1.3515625, "learning_rate": 0.0004840746795734088, "loss": 5.5029, "mean_token_accuracy": 0.1592990979552269, "num_tokens": 28068185.0, "step": 15220 }, { "entropy": 5.77323579788208, "epoch": 1.2791010291955471, "grad_norm": 1.65625, "learning_rate": 0.0004840636117237264, "loss": 5.5346, "mean_token_accuracy": 0.16309675723314285, "num_tokens": 28077532.0, "step": 15225 }, { "entropy": 5.695499229431152, "epoch": 1.279521109010712, "grad_norm": 1.9296875, "learning_rate": 0.0004840525401705897, "loss": 5.3962, "mean_token_accuracy": 0.16487024575471879, "num_tokens": 28087593.0, "step": 15230 }, { "entropy": 5.651865243911743, "epoch": 1.279941188825877, "grad_norm": 2.671875, "learning_rate": 0.00048404146491419503, "loss": 5.3617, "mean_token_accuracy": 0.17026301175355912, "num_tokens": 28096256.0, "step": 15235 }, { "entropy": 5.682730484008789, "epoch": 1.2803612686410417, "grad_norm": 3.03125, "learning_rate": 0.00048403038595473837, "loss": 5.3999, "mean_token_accuracy": 0.1683255612850189, "num_tokens": 28105048.0, "step": 15240 }, { "entropy": 5.698611879348755, "epoch": 1.2807813484562067, "grad_norm": 1.875, "learning_rate": 0.000484019303292416, "loss": 5.4677, "mean_token_accuracy": 0.15729653239250183, "num_tokens": 28114330.0, "step": 15245 }, { "entropy": 5.666230535507202, "epoch": 1.2812014282713715, "grad_norm": 1.59375, "learning_rate": 0.00048400821692742434, "loss": 5.3826, "mean_token_accuracy": 0.17221412509679795, "num_tokens": 28123147.0, "step": 15250 }, { "entropy": 5.731086874008179, "epoch": 1.2816215080865365, "grad_norm": 2.03125, "learning_rate": 0.00048399712685995983, "loss": 5.519, "mean_token_accuracy": 0.16596773117780686, "num_tokens": 28132477.0, "step": 15255 }, { "entropy": 5.683180570602417, "epoch": 1.2820415879017013, "grad_norm": 1.4375, "learning_rate": 0.00048398603309021877, "loss": 5.5007, "mean_token_accuracy": 0.16307283490896224, "num_tokens": 28141350.0, "step": 15260 }, { "entropy": 5.718101358413696, "epoch": 1.2824616677168663, "grad_norm": 1.71875, "learning_rate": 0.0004839749356183978, "loss": 5.4452, "mean_token_accuracy": 0.16625609248876572, "num_tokens": 28149522.0, "step": 15265 }, { "entropy": 5.71740870475769, "epoch": 1.282881747532031, "grad_norm": 1.84375, "learning_rate": 0.0004839638344446933, "loss": 5.5484, "mean_token_accuracy": 0.16156259179115295, "num_tokens": 28159646.0, "step": 15270 }, { "entropy": 5.810041522979736, "epoch": 1.283301827347196, "grad_norm": 1.875, "learning_rate": 0.0004839527295693023, "loss": 5.4631, "mean_token_accuracy": 0.1712553933262825, "num_tokens": 28168408.0, "step": 15275 }, { "entropy": 5.740299415588379, "epoch": 1.283721907162361, "grad_norm": 2.484375, "learning_rate": 0.0004839416209924211, "loss": 5.4659, "mean_token_accuracy": 0.16082556098699569, "num_tokens": 28177744.0, "step": 15280 }, { "entropy": 5.74624080657959, "epoch": 1.2841419869775257, "grad_norm": 1.8515625, "learning_rate": 0.00048393050871424676, "loss": 5.5276, "mean_token_accuracy": 0.16067055016756057, "num_tokens": 28186811.0, "step": 15285 }, { "entropy": 5.6819815158844, "epoch": 1.2845620667926907, "grad_norm": 2.140625, "learning_rate": 0.000483919392734976, "loss": 5.5012, "mean_token_accuracy": 0.15652224719524382, "num_tokens": 28197052.0, "step": 15290 }, { "entropy": 5.707629013061523, "epoch": 1.2849821466078555, "grad_norm": 1.9296875, "learning_rate": 0.0004839082730548058, "loss": 5.3546, "mean_token_accuracy": 0.1764655143022537, "num_tokens": 28206000.0, "step": 15295 }, { "entropy": 5.692590618133545, "epoch": 1.2854022264230203, "grad_norm": 1.734375, "learning_rate": 0.0004838971496739331, "loss": 5.3416, "mean_token_accuracy": 0.16673224717378615, "num_tokens": 28214679.0, "step": 15300 }, { "entropy": 5.616611909866333, "epoch": 1.2858223062381853, "grad_norm": 1.7421875, "learning_rate": 0.000483886022592555, "loss": 5.4572, "mean_token_accuracy": 0.16383219435811042, "num_tokens": 28223890.0, "step": 15305 }, { "entropy": 5.671573495864868, "epoch": 1.28624238605335, "grad_norm": 1.59375, "learning_rate": 0.0004838748918108685, "loss": 5.3889, "mean_token_accuracy": 0.16743310987949372, "num_tokens": 28232422.0, "step": 15310 }, { "entropy": 5.661684656143189, "epoch": 1.286662465868515, "grad_norm": 2.53125, "learning_rate": 0.00048386375732907083, "loss": 5.4321, "mean_token_accuracy": 0.1664291650056839, "num_tokens": 28242079.0, "step": 15315 }, { "entropy": 5.772406101226807, "epoch": 1.2870825456836799, "grad_norm": 1.6640625, "learning_rate": 0.00048385261914735936, "loss": 5.626, "mean_token_accuracy": 0.1569541186094284, "num_tokens": 28252510.0, "step": 15320 }, { "entropy": 5.816063642501831, "epoch": 1.2875026254988446, "grad_norm": 2.828125, "learning_rate": 0.00048384147726593125, "loss": 5.5211, "mean_token_accuracy": 0.1613934814929962, "num_tokens": 28261348.0, "step": 15325 }, { "entropy": 5.7399543762207035, "epoch": 1.2879227053140097, "grad_norm": 1.765625, "learning_rate": 0.0004838303316849839, "loss": 5.4373, "mean_token_accuracy": 0.15664124339818955, "num_tokens": 28270739.0, "step": 15330 }, { "entropy": 5.7096014499664305, "epoch": 1.2883427851291747, "grad_norm": 1.5625, "learning_rate": 0.00048381918240471473, "loss": 5.4913, "mean_token_accuracy": 0.15497729554772377, "num_tokens": 28279370.0, "step": 15335 }, { "entropy": 5.726278638839721, "epoch": 1.2887628649443394, "grad_norm": 1.90625, "learning_rate": 0.00048380802942532124, "loss": 5.411, "mean_token_accuracy": 0.1654820501804352, "num_tokens": 28287955.0, "step": 15340 }, { "entropy": 5.604457712173462, "epoch": 1.2891829447595042, "grad_norm": 1.765625, "learning_rate": 0.00048379687274700107, "loss": 5.3613, "mean_token_accuracy": 0.17298102527856826, "num_tokens": 28296832.0, "step": 15345 }, { "entropy": 5.598322010040283, "epoch": 1.2896030245746692, "grad_norm": 1.4375, "learning_rate": 0.00048378571236995185, "loss": 5.3944, "mean_token_accuracy": 0.166165030002594, "num_tokens": 28305778.0, "step": 15350 }, { "entropy": 5.761275959014893, "epoch": 1.290023104389834, "grad_norm": 2.28125, "learning_rate": 0.00048377454829437124, "loss": 5.4484, "mean_token_accuracy": 0.1619205430150032, "num_tokens": 28314615.0, "step": 15355 }, { "entropy": 5.827945566177368, "epoch": 1.290443184204999, "grad_norm": 1.6953125, "learning_rate": 0.0004837633805204569, "loss": 5.5111, "mean_token_accuracy": 0.16340176910161971, "num_tokens": 28324478.0, "step": 15360 }, { "entropy": 5.753641033172608, "epoch": 1.2908632640201638, "grad_norm": 1.8203125, "learning_rate": 0.0004837522090484069, "loss": 5.4739, "mean_token_accuracy": 0.16428422480821608, "num_tokens": 28333532.0, "step": 15365 }, { "entropy": 5.720655488967895, "epoch": 1.2912833438353286, "grad_norm": 1.9375, "learning_rate": 0.00048374103387841894, "loss": 5.4456, "mean_token_accuracy": 0.15933494865894318, "num_tokens": 28343723.0, "step": 15370 }, { "entropy": 5.728183746337891, "epoch": 1.2917034236504936, "grad_norm": 1.7421875, "learning_rate": 0.00048372985501069106, "loss": 5.4241, "mean_token_accuracy": 0.1650676444172859, "num_tokens": 28351992.0, "step": 15375 }, { "entropy": 5.65154390335083, "epoch": 1.2921235034656584, "grad_norm": 1.75, "learning_rate": 0.0004837186724454213, "loss": 5.4075, "mean_token_accuracy": 0.16652555614709855, "num_tokens": 28361141.0, "step": 15380 }, { "entropy": 5.664861392974854, "epoch": 1.2925435832808234, "grad_norm": 2.109375, "learning_rate": 0.0004837074861828077, "loss": 5.3951, "mean_token_accuracy": 0.16747472435235977, "num_tokens": 28370339.0, "step": 15385 }, { "entropy": 5.725724220275879, "epoch": 1.2929636630959882, "grad_norm": 1.8984375, "learning_rate": 0.0004836962962230485, "loss": 5.5142, "mean_token_accuracy": 0.16315443962812423, "num_tokens": 28379242.0, "step": 15390 }, { "entropy": 5.659032392501831, "epoch": 1.293383742911153, "grad_norm": 1.828125, "learning_rate": 0.0004836851025663418, "loss": 5.4054, "mean_token_accuracy": 0.1692844420671463, "num_tokens": 28388864.0, "step": 15395 }, { "entropy": 5.7302182674407955, "epoch": 1.293803822726318, "grad_norm": 2.0625, "learning_rate": 0.000483673905212886, "loss": 5.5045, "mean_token_accuracy": 0.16604892164468765, "num_tokens": 28398000.0, "step": 15400 }, { "entropy": 5.645801734924317, "epoch": 1.294223902541483, "grad_norm": 1.8359375, "learning_rate": 0.0004836627041628794, "loss": 5.4445, "mean_token_accuracy": 0.1687624305486679, "num_tokens": 28407652.0, "step": 15405 }, { "entropy": 5.7521144390106205, "epoch": 1.2946439823566478, "grad_norm": 2.125, "learning_rate": 0.0004836514994165205, "loss": 5.4993, "mean_token_accuracy": 0.16120134592056273, "num_tokens": 28417694.0, "step": 15410 }, { "entropy": 5.694954919815063, "epoch": 1.2950640621718126, "grad_norm": 1.75, "learning_rate": 0.00048364029097400777, "loss": 5.442, "mean_token_accuracy": 0.16629258692264556, "num_tokens": 28426928.0, "step": 15415 }, { "entropy": 5.664297342300415, "epoch": 1.2954841419869776, "grad_norm": 1.609375, "learning_rate": 0.00048362907883553956, "loss": 5.4714, "mean_token_accuracy": 0.15762439966201783, "num_tokens": 28436176.0, "step": 15420 }, { "entropy": 5.728027105331421, "epoch": 1.2959042218021424, "grad_norm": 2.046875, "learning_rate": 0.00048361786300131477, "loss": 5.5363, "mean_token_accuracy": 0.15678158700466155, "num_tokens": 28445277.0, "step": 15425 }, { "entropy": 5.784550476074219, "epoch": 1.2963243016173074, "grad_norm": 1.671875, "learning_rate": 0.0004836066434715319, "loss": 5.4399, "mean_token_accuracy": 0.16050161719322203, "num_tokens": 28453959.0, "step": 15430 }, { "entropy": 5.718553638458252, "epoch": 1.2967443814324722, "grad_norm": 1.921875, "learning_rate": 0.0004835954202463898, "loss": 5.5243, "mean_token_accuracy": 0.16090073585510253, "num_tokens": 28463780.0, "step": 15435 }, { "entropy": 5.64632830619812, "epoch": 1.297164461247637, "grad_norm": 2.109375, "learning_rate": 0.0004835841933260872, "loss": 5.3784, "mean_token_accuracy": 0.16484325826168061, "num_tokens": 28473299.0, "step": 15440 }, { "entropy": 5.666690301895142, "epoch": 1.297584541062802, "grad_norm": 2.53125, "learning_rate": 0.00048357296271082305, "loss": 5.4216, "mean_token_accuracy": 0.16306840777397155, "num_tokens": 28481859.0, "step": 15445 }, { "entropy": 5.80743989944458, "epoch": 1.2980046208779668, "grad_norm": 1.7578125, "learning_rate": 0.00048356172840079625, "loss": 5.4795, "mean_token_accuracy": 0.16350326538085938, "num_tokens": 28491034.0, "step": 15450 }, { "entropy": 5.697645139694214, "epoch": 1.2984247006931318, "grad_norm": 1.5390625, "learning_rate": 0.0004835504903962058, "loss": 5.3839, "mean_token_accuracy": 0.16248102933168412, "num_tokens": 28499829.0, "step": 15455 }, { "entropy": 5.623191022872925, "epoch": 1.2988447805082965, "grad_norm": 1.5234375, "learning_rate": 0.00048353924869725084, "loss": 5.3937, "mean_token_accuracy": 0.1705133929848671, "num_tokens": 28508188.0, "step": 15460 }, { "entropy": 5.609925365447998, "epoch": 1.2992648603234613, "grad_norm": 1.625, "learning_rate": 0.0004835280033041305, "loss": 5.2948, "mean_token_accuracy": 0.16951121538877487, "num_tokens": 28516509.0, "step": 15465 }, { "entropy": 5.652699041366577, "epoch": 1.2996849401386263, "grad_norm": 1.6875, "learning_rate": 0.0004835167542170439, "loss": 5.5169, "mean_token_accuracy": 0.16390926837921144, "num_tokens": 28526457.0, "step": 15470 }, { "entropy": 5.70890064239502, "epoch": 1.3001050199537914, "grad_norm": 1.90625, "learning_rate": 0.0004835055014361904, "loss": 5.461, "mean_token_accuracy": 0.16140211522579193, "num_tokens": 28536149.0, "step": 15475 }, { "entropy": 5.776080131530762, "epoch": 1.3005250997689561, "grad_norm": 1.6796875, "learning_rate": 0.00048349424496176924, "loss": 5.5146, "mean_token_accuracy": 0.16204932928085328, "num_tokens": 28545486.0, "step": 15480 }, { "entropy": 5.693456315994263, "epoch": 1.300945179584121, "grad_norm": 1.6640625, "learning_rate": 0.00048348298479397996, "loss": 5.4013, "mean_token_accuracy": 0.1665617987513542, "num_tokens": 28554555.0, "step": 15485 }, { "entropy": 5.563140153884888, "epoch": 1.301365259399286, "grad_norm": 1.65625, "learning_rate": 0.00048347172093302196, "loss": 5.4174, "mean_token_accuracy": 0.17032357305288315, "num_tokens": 28563387.0, "step": 15490 }, { "entropy": 5.654443550109863, "epoch": 1.3017853392144507, "grad_norm": 2.5625, "learning_rate": 0.00048346045337909475, "loss": 5.4198, "mean_token_accuracy": 0.16440292894840242, "num_tokens": 28573437.0, "step": 15495 }, { "entropy": 5.641400241851807, "epoch": 1.3022054190296157, "grad_norm": 1.9453125, "learning_rate": 0.000483449182132398, "loss": 5.3656, "mean_token_accuracy": 0.17342451214790344, "num_tokens": 28583362.0, "step": 15500 }, { "entropy": 5.808328342437744, "epoch": 1.3026254988447805, "grad_norm": 2.359375, "learning_rate": 0.00048343790719313124, "loss": 5.553, "mean_token_accuracy": 0.15858516097068787, "num_tokens": 28593201.0, "step": 15505 }, { "entropy": 5.6986161231994625, "epoch": 1.3030455786599453, "grad_norm": 1.875, "learning_rate": 0.00048342662856149427, "loss": 5.452, "mean_token_accuracy": 0.15802465230226517, "num_tokens": 28602486.0, "step": 15510 }, { "entropy": 5.641084289550781, "epoch": 1.3034656584751103, "grad_norm": 1.5703125, "learning_rate": 0.000483415346237687, "loss": 5.4635, "mean_token_accuracy": 0.163986237347126, "num_tokens": 28611643.0, "step": 15515 }, { "entropy": 5.741965579986572, "epoch": 1.303885738290275, "grad_norm": 1.796875, "learning_rate": 0.0004834040602219091, "loss": 5.511, "mean_token_accuracy": 0.16517338454723357, "num_tokens": 28620545.0, "step": 15520 }, { "entropy": 5.687145090103149, "epoch": 1.30430581810544, "grad_norm": 1.6796875, "learning_rate": 0.00048339277051436067, "loss": 5.4423, "mean_token_accuracy": 0.16573746055364608, "num_tokens": 28630024.0, "step": 15525 }, { "entropy": 5.800404119491577, "epoch": 1.304725897920605, "grad_norm": 2.078125, "learning_rate": 0.0004833814771152415, "loss": 5.4982, "mean_token_accuracy": 0.1673808366060257, "num_tokens": 28638995.0, "step": 15530 }, { "entropy": 5.6978675365448, "epoch": 1.3051459777357697, "grad_norm": 1.90625, "learning_rate": 0.00048337018002475184, "loss": 5.4483, "mean_token_accuracy": 0.1675184115767479, "num_tokens": 28647833.0, "step": 15535 }, { "entropy": 5.632976531982422, "epoch": 1.3055660575509347, "grad_norm": 1.8671875, "learning_rate": 0.0004833588792430917, "loss": 5.3562, "mean_token_accuracy": 0.16957587152719497, "num_tokens": 28657441.0, "step": 15540 }, { "entropy": 5.710914278030396, "epoch": 1.3059861373660997, "grad_norm": 2.015625, "learning_rate": 0.0004833475747704614, "loss": 5.4746, "mean_token_accuracy": 0.16293687522411346, "num_tokens": 28666666.0, "step": 15545 }, { "entropy": 5.711972379684449, "epoch": 1.3064062171812645, "grad_norm": 1.8984375, "learning_rate": 0.000483336266607061, "loss": 5.4684, "mean_token_accuracy": 0.16195246577262878, "num_tokens": 28676770.0, "step": 15550 }, { "entropy": 5.71502652168274, "epoch": 1.3068262969964293, "grad_norm": 1.59375, "learning_rate": 0.00048332495475309097, "loss": 5.3882, "mean_token_accuracy": 0.16904159635305405, "num_tokens": 28685610.0, "step": 15555 }, { "entropy": 5.733300971984863, "epoch": 1.3072463768115943, "grad_norm": 1.7890625, "learning_rate": 0.00048331363920875155, "loss": 5.4835, "mean_token_accuracy": 0.1614070475101471, "num_tokens": 28695082.0, "step": 15560 }, { "entropy": 5.6674620628356935, "epoch": 1.307666456626759, "grad_norm": 1.796875, "learning_rate": 0.00048330231997424335, "loss": 5.3919, "mean_token_accuracy": 0.1674228772521019, "num_tokens": 28704006.0, "step": 15565 }, { "entropy": 5.664810228347778, "epoch": 1.308086536441924, "grad_norm": 1.5625, "learning_rate": 0.0004832909970497668, "loss": 5.4412, "mean_token_accuracy": 0.16440101712942123, "num_tokens": 28713665.0, "step": 15570 }, { "entropy": 5.687829685211182, "epoch": 1.3085066162570889, "grad_norm": 2.03125, "learning_rate": 0.00048327967043552245, "loss": 5.3995, "mean_token_accuracy": 0.16435023695230483, "num_tokens": 28722920.0, "step": 15575 }, { "entropy": 5.689635181427002, "epoch": 1.3089266960722536, "grad_norm": 1.5234375, "learning_rate": 0.00048326834013171107, "loss": 5.348, "mean_token_accuracy": 0.1712331637740135, "num_tokens": 28731689.0, "step": 15580 }, { "entropy": 5.734625387191772, "epoch": 1.3093467758874187, "grad_norm": 1.796875, "learning_rate": 0.0004832570061385332, "loss": 5.4711, "mean_token_accuracy": 0.17253154814243316, "num_tokens": 28741308.0, "step": 15585 }, { "entropy": 5.603468322753907, "epoch": 1.3097668557025834, "grad_norm": 1.9921875, "learning_rate": 0.0004832456684561898, "loss": 5.4311, "mean_token_accuracy": 0.16657552123069763, "num_tokens": 28750190.0, "step": 15590 }, { "entropy": 5.622490262985229, "epoch": 1.3101869355177485, "grad_norm": 1.8046875, "learning_rate": 0.0004832343270848815, "loss": 5.5019, "mean_token_accuracy": 0.16145084649324418, "num_tokens": 28759588.0, "step": 15595 }, { "entropy": 5.707578086853028, "epoch": 1.3106070153329132, "grad_norm": 2.25, "learning_rate": 0.00048322298202480935, "loss": 5.5023, "mean_token_accuracy": 0.162407810986042, "num_tokens": 28768800.0, "step": 15600 }, { "entropy": 5.782344579696655, "epoch": 1.311027095148078, "grad_norm": 2.296875, "learning_rate": 0.00048321163327617433, "loss": 5.4337, "mean_token_accuracy": 0.16309218406677245, "num_tokens": 28778108.0, "step": 15605 }, { "entropy": 5.753531789779663, "epoch": 1.311447174963243, "grad_norm": 1.875, "learning_rate": 0.0004832002808391775, "loss": 5.428, "mean_token_accuracy": 0.16352954655885696, "num_tokens": 28787202.0, "step": 15610 }, { "entropy": 5.679688262939453, "epoch": 1.3118672547784078, "grad_norm": 2.421875, "learning_rate": 0.0004831889247140198, "loss": 5.4529, "mean_token_accuracy": 0.16261952072381974, "num_tokens": 28797482.0, "step": 15615 }, { "entropy": 5.587442255020141, "epoch": 1.3122873345935728, "grad_norm": 2.234375, "learning_rate": 0.00048317756490090253, "loss": 5.3885, "mean_token_accuracy": 0.16671659797430038, "num_tokens": 28805872.0, "step": 15620 }, { "entropy": 5.645391368865967, "epoch": 1.3127074144087376, "grad_norm": 1.8515625, "learning_rate": 0.00048316620140002685, "loss": 5.5111, "mean_token_accuracy": 0.15997616499662398, "num_tokens": 28814836.0, "step": 15625 }, { "entropy": 5.78643798828125, "epoch": 1.3131274942239024, "grad_norm": 2.15625, "learning_rate": 0.0004831548342115942, "loss": 5.4727, "mean_token_accuracy": 0.1584260269999504, "num_tokens": 28824727.0, "step": 15630 }, { "entropy": 5.820865345001221, "epoch": 1.3135475740390674, "grad_norm": 1.7890625, "learning_rate": 0.00048314346333580576, "loss": 5.5875, "mean_token_accuracy": 0.1578096106648445, "num_tokens": 28833848.0, "step": 15635 }, { "entropy": 5.667257070541382, "epoch": 1.3139676538542324, "grad_norm": 1.890625, "learning_rate": 0.0004831320887728631, "loss": 5.3397, "mean_token_accuracy": 0.16978776156902314, "num_tokens": 28842198.0, "step": 15640 }, { "entropy": 5.667364835739136, "epoch": 1.3143877336693972, "grad_norm": 1.7109375, "learning_rate": 0.0004831207105229676, "loss": 5.4355, "mean_token_accuracy": 0.16604958921670915, "num_tokens": 28851804.0, "step": 15645 }, { "entropy": 5.605535078048706, "epoch": 1.314807813484562, "grad_norm": 2.03125, "learning_rate": 0.00048310932858632087, "loss": 5.3583, "mean_token_accuracy": 0.16956010460853577, "num_tokens": 28860181.0, "step": 15650 }, { "entropy": 5.634918832778931, "epoch": 1.315227893299727, "grad_norm": 1.7265625, "learning_rate": 0.00048309794296312467, "loss": 5.4172, "mean_token_accuracy": 0.17280941605567932, "num_tokens": 28869945.0, "step": 15655 }, { "entropy": 5.699268817901611, "epoch": 1.3156479731148918, "grad_norm": 1.90625, "learning_rate": 0.00048308655365358053, "loss": 5.4639, "mean_token_accuracy": 0.1694648638367653, "num_tokens": 28880343.0, "step": 15660 }, { "entropy": 5.794540119171143, "epoch": 1.3160680529300568, "grad_norm": 1.84375, "learning_rate": 0.00048307516065789017, "loss": 5.5316, "mean_token_accuracy": 0.15753707140684128, "num_tokens": 28889441.0, "step": 15665 }, { "entropy": 5.740979290008545, "epoch": 1.3164881327452216, "grad_norm": 1.5546875, "learning_rate": 0.00048306376397625546, "loss": 5.4851, "mean_token_accuracy": 0.15848094820976258, "num_tokens": 28898154.0, "step": 15670 }, { "entropy": 5.736214065551758, "epoch": 1.3169082125603864, "grad_norm": 2.28125, "learning_rate": 0.00048305236360887834, "loss": 5.4881, "mean_token_accuracy": 0.16091601997613908, "num_tokens": 28908359.0, "step": 15675 }, { "entropy": 5.694441890716552, "epoch": 1.3173282923755514, "grad_norm": 1.484375, "learning_rate": 0.00048304095955596074, "loss": 5.4821, "mean_token_accuracy": 0.16323225647211076, "num_tokens": 28918416.0, "step": 15680 }, { "entropy": 5.743959140777588, "epoch": 1.3177483721907162, "grad_norm": 1.7109375, "learning_rate": 0.0004830295518177047, "loss": 5.3966, "mean_token_accuracy": 0.17162241786718369, "num_tokens": 28927412.0, "step": 15685 }, { "entropy": 5.679540491104126, "epoch": 1.3181684520058812, "grad_norm": 1.9921875, "learning_rate": 0.00048301814039431227, "loss": 5.4299, "mean_token_accuracy": 0.1644519239664078, "num_tokens": 28936106.0, "step": 15690 }, { "entropy": 5.6732524871826175, "epoch": 1.318588531821046, "grad_norm": 2.0, "learning_rate": 0.00048300672528598553, "loss": 5.4675, "mean_token_accuracy": 0.16666047424077987, "num_tokens": 28945197.0, "step": 15695 }, { "entropy": 5.782284116744995, "epoch": 1.3190086116362107, "grad_norm": 2.5, "learning_rate": 0.0004829953064929268, "loss": 5.5033, "mean_token_accuracy": 0.15363497659564018, "num_tokens": 28954278.0, "step": 15700 }, { "entropy": 5.822621250152588, "epoch": 1.3194286914513758, "grad_norm": 1.8671875, "learning_rate": 0.0004829838840153383, "loss": 5.55, "mean_token_accuracy": 0.16536147743463517, "num_tokens": 28963101.0, "step": 15705 }, { "entropy": 5.619999361038208, "epoch": 1.3198487712665408, "grad_norm": 1.6015625, "learning_rate": 0.0004829724578534224, "loss": 5.4466, "mean_token_accuracy": 0.16242460757493973, "num_tokens": 28972063.0, "step": 15710 }, { "entropy": 5.685150098800659, "epoch": 1.3202688510817056, "grad_norm": 1.65625, "learning_rate": 0.00048296102800738153, "loss": 5.4051, "mean_token_accuracy": 0.1662852793931961, "num_tokens": 28981617.0, "step": 15715 }, { "entropy": 5.745265245437622, "epoch": 1.3206889308968703, "grad_norm": 1.7421875, "learning_rate": 0.00048294959447741807, "loss": 5.3931, "mean_token_accuracy": 0.16527727246284485, "num_tokens": 28989442.0, "step": 15720 }, { "entropy": 5.664169025421143, "epoch": 1.3211090107120353, "grad_norm": 1.7109375, "learning_rate": 0.00048293815726373467, "loss": 5.404, "mean_token_accuracy": 0.17082785815000534, "num_tokens": 28999104.0, "step": 15725 }, { "entropy": 5.650988054275513, "epoch": 1.3215290905272001, "grad_norm": 1.703125, "learning_rate": 0.00048292671636653386, "loss": 5.4456, "mean_token_accuracy": 0.16266124546527863, "num_tokens": 29008645.0, "step": 15730 }, { "entropy": 5.707271909713745, "epoch": 1.3219491703423651, "grad_norm": 1.8046875, "learning_rate": 0.0004829152717860184, "loss": 5.4324, "mean_token_accuracy": 0.16636938005685806, "num_tokens": 29018655.0, "step": 15735 }, { "entropy": 5.7679918766021725, "epoch": 1.32236925015753, "grad_norm": 1.703125, "learning_rate": 0.00048290382352239087, "loss": 5.4385, "mean_token_accuracy": 0.1688806027173996, "num_tokens": 29027109.0, "step": 15740 }, { "entropy": 5.653837728500366, "epoch": 1.3227893299726947, "grad_norm": 1.6484375, "learning_rate": 0.00048289237157585424, "loss": 5.2712, "mean_token_accuracy": 0.17749694585800171, "num_tokens": 29035535.0, "step": 15745 }, { "entropy": 5.622943782806397, "epoch": 1.3232094097878597, "grad_norm": 1.6953125, "learning_rate": 0.0004828809159466112, "loss": 5.4429, "mean_token_accuracy": 0.1581158846616745, "num_tokens": 29044723.0, "step": 15750 }, { "entropy": 5.718198776245117, "epoch": 1.3236294896030245, "grad_norm": 2.4375, "learning_rate": 0.0004828694566348648, "loss": 5.5804, "mean_token_accuracy": 0.1552947849035263, "num_tokens": 29053636.0, "step": 15755 }, { "entropy": 5.790498828887939, "epoch": 1.3240495694181895, "grad_norm": 1.609375, "learning_rate": 0.00048285799364081806, "loss": 5.4813, "mean_token_accuracy": 0.16202333718538284, "num_tokens": 29062940.0, "step": 15760 }, { "entropy": 5.721147918701172, "epoch": 1.3244696492333543, "grad_norm": 1.5703125, "learning_rate": 0.00048284652696467404, "loss": 5.4026, "mean_token_accuracy": 0.1688874751329422, "num_tokens": 29072159.0, "step": 15765 }, { "entropy": 5.75450963973999, "epoch": 1.324889729048519, "grad_norm": 1.84375, "learning_rate": 0.00048283505660663575, "loss": 5.4791, "mean_token_accuracy": 0.16751828640699387, "num_tokens": 29081544.0, "step": 15770 }, { "entropy": 5.638855648040772, "epoch": 1.325309808863684, "grad_norm": 1.84375, "learning_rate": 0.0004828235825669064, "loss": 5.4346, "mean_token_accuracy": 0.16318106204271315, "num_tokens": 29090710.0, "step": 15775 }, { "entropy": 5.688300275802613, "epoch": 1.325729888678849, "grad_norm": 1.609375, "learning_rate": 0.00048281210484568937, "loss": 5.4415, "mean_token_accuracy": 0.16632406264543534, "num_tokens": 29098988.0, "step": 15780 }, { "entropy": 5.665548658370971, "epoch": 1.326149968494014, "grad_norm": 1.6796875, "learning_rate": 0.00048280062344318794, "loss": 5.4862, "mean_token_accuracy": 0.15649251490831376, "num_tokens": 29108926.0, "step": 15785 }, { "entropy": 5.740646505355835, "epoch": 1.3265700483091787, "grad_norm": 1.9140625, "learning_rate": 0.0004827891383596054, "loss": 5.4148, "mean_token_accuracy": 0.1614031285047531, "num_tokens": 29118065.0, "step": 15790 }, { "entropy": 5.7241943359375, "epoch": 1.3269901281243437, "grad_norm": 1.703125, "learning_rate": 0.00048277764959514524, "loss": 5.3762, "mean_token_accuracy": 0.1652180477976799, "num_tokens": 29127030.0, "step": 15795 }, { "entropy": 5.748840999603272, "epoch": 1.3274102079395085, "grad_norm": 1.640625, "learning_rate": 0.0004827661571500111, "loss": 5.5058, "mean_token_accuracy": 0.16153218150138854, "num_tokens": 29137200.0, "step": 15800 }, { "entropy": 5.741848373413086, "epoch": 1.3278302877546735, "grad_norm": 1.5625, "learning_rate": 0.00048275466102440644, "loss": 5.4825, "mean_token_accuracy": 0.16485897302627564, "num_tokens": 29147029.0, "step": 15805 }, { "entropy": 5.631581258773804, "epoch": 1.3282503675698383, "grad_norm": 1.8046875, "learning_rate": 0.00048274316121853494, "loss": 5.3711, "mean_token_accuracy": 0.1663237363100052, "num_tokens": 29155675.0, "step": 15810 }, { "entropy": 5.749010705947876, "epoch": 1.328670447385003, "grad_norm": 1.9765625, "learning_rate": 0.00048273165773260023, "loss": 5.4356, "mean_token_accuracy": 0.1655052199959755, "num_tokens": 29164730.0, "step": 15815 }, { "entropy": 5.701095962524414, "epoch": 1.329090527200168, "grad_norm": 1.8125, "learning_rate": 0.0004827201505668063, "loss": 5.4364, "mean_token_accuracy": 0.1656198024749756, "num_tokens": 29173074.0, "step": 15820 }, { "entropy": 5.7562737464904785, "epoch": 1.3295106070153329, "grad_norm": 1.5078125, "learning_rate": 0.0004827086397213568, "loss": 5.5478, "mean_token_accuracy": 0.16311392933130264, "num_tokens": 29182175.0, "step": 15825 }, { "entropy": 5.8277308464050295, "epoch": 1.3299306868304979, "grad_norm": 1.703125, "learning_rate": 0.0004826971251964557, "loss": 5.7415, "mean_token_accuracy": 0.1557246647775173, "num_tokens": 29192910.0, "step": 15830 }, { "entropy": 5.710680675506592, "epoch": 1.3303507666456627, "grad_norm": 1.5234375, "learning_rate": 0.000482685606992307, "loss": 5.387, "mean_token_accuracy": 0.169048510491848, "num_tokens": 29201969.0, "step": 15835 }, { "entropy": 5.7679280757904055, "epoch": 1.3307708464608274, "grad_norm": 1.4140625, "learning_rate": 0.00048267408510911463, "loss": 5.5448, "mean_token_accuracy": 0.16056760400533676, "num_tokens": 29210475.0, "step": 15840 }, { "entropy": 5.648775243759156, "epoch": 1.3311909262759924, "grad_norm": 1.65625, "learning_rate": 0.0004826625595470829, "loss": 5.4135, "mean_token_accuracy": 0.16637052744626998, "num_tokens": 29222586.0, "step": 15845 }, { "entropy": 5.65334529876709, "epoch": 1.3316110060911575, "grad_norm": 1.9140625, "learning_rate": 0.00048265103030641575, "loss": 5.4624, "mean_token_accuracy": 0.161483795940876, "num_tokens": 29231503.0, "step": 15850 }, { "entropy": 5.648448801040649, "epoch": 1.3320310859063222, "grad_norm": 1.578125, "learning_rate": 0.0004826394973873176, "loss": 5.4313, "mean_token_accuracy": 0.1599622756242752, "num_tokens": 29241534.0, "step": 15855 }, { "entropy": 5.723581027984619, "epoch": 1.332451165721487, "grad_norm": 1.859375, "learning_rate": 0.00048262796078999266, "loss": 5.4497, "mean_token_accuracy": 0.16642218083143234, "num_tokens": 29250381.0, "step": 15860 }, { "entropy": 5.72342619895935, "epoch": 1.332871245536652, "grad_norm": 1.828125, "learning_rate": 0.0004826164205146453, "loss": 5.5105, "mean_token_accuracy": 0.15796453654766082, "num_tokens": 29259205.0, "step": 15865 }, { "entropy": 5.566991662979126, "epoch": 1.3332913253518168, "grad_norm": 1.7734375, "learning_rate": 0.00048260487656147995, "loss": 5.411, "mean_token_accuracy": 0.16878511905670165, "num_tokens": 29267723.0, "step": 15870 }, { "entropy": 5.663629627227783, "epoch": 1.3337114051669818, "grad_norm": 1.53125, "learning_rate": 0.00048259332893070106, "loss": 5.4105, "mean_token_accuracy": 0.16867344379425048, "num_tokens": 29277102.0, "step": 15875 }, { "entropy": 5.685384702682495, "epoch": 1.3341314849821466, "grad_norm": 1.5234375, "learning_rate": 0.0004825817776225133, "loss": 5.3994, "mean_token_accuracy": 0.16746718436479568, "num_tokens": 29286484.0, "step": 15880 }, { "entropy": 5.673250675201416, "epoch": 1.3345515647973114, "grad_norm": 1.5625, "learning_rate": 0.00048257022263712123, "loss": 5.4876, "mean_token_accuracy": 0.17098401337862015, "num_tokens": 29296528.0, "step": 15885 }, { "entropy": 5.589338874816894, "epoch": 1.3349716446124764, "grad_norm": 1.734375, "learning_rate": 0.00048255866397472954, "loss": 5.3384, "mean_token_accuracy": 0.17186694368720054, "num_tokens": 29305283.0, "step": 15890 }, { "entropy": 5.703013134002686, "epoch": 1.3353917244276412, "grad_norm": 1.8203125, "learning_rate": 0.000482547101635543, "loss": 5.3432, "mean_token_accuracy": 0.1679681733250618, "num_tokens": 29315088.0, "step": 15895 }, { "entropy": 5.703509330749512, "epoch": 1.3358118042428062, "grad_norm": 1.4296875, "learning_rate": 0.00048253553561976645, "loss": 5.3596, "mean_token_accuracy": 0.16931547373533248, "num_tokens": 29323793.0, "step": 15900 }, { "entropy": 5.644972229003907, "epoch": 1.336231884057971, "grad_norm": 1.6484375, "learning_rate": 0.0004825239659276047, "loss": 5.4415, "mean_token_accuracy": 0.1619830012321472, "num_tokens": 29334015.0, "step": 15905 }, { "entropy": 5.736817216873169, "epoch": 1.3366519638731358, "grad_norm": 1.78125, "learning_rate": 0.0004825123925592628, "loss": 5.5419, "mean_token_accuracy": 0.15840226113796235, "num_tokens": 29343221.0, "step": 15910 }, { "entropy": 5.711045169830323, "epoch": 1.3370720436883008, "grad_norm": 1.65625, "learning_rate": 0.00048250081551494574, "loss": 5.3858, "mean_token_accuracy": 0.16694712340831758, "num_tokens": 29352261.0, "step": 15915 }, { "entropy": 5.677080345153809, "epoch": 1.3374921235034656, "grad_norm": 1.703125, "learning_rate": 0.0004824892347948586, "loss": 5.4929, "mean_token_accuracy": 0.16138059496879578, "num_tokens": 29362138.0, "step": 15920 }, { "entropy": 5.6527352809906, "epoch": 1.3379122033186306, "grad_norm": 1.4765625, "learning_rate": 0.0004824776503992064, "loss": 5.3898, "mean_token_accuracy": 0.1713466763496399, "num_tokens": 29371234.0, "step": 15925 }, { "entropy": 5.635444116592407, "epoch": 1.3383322831337954, "grad_norm": 1.7265625, "learning_rate": 0.0004824660623281945, "loss": 5.4473, "mean_token_accuracy": 0.16970054805278778, "num_tokens": 29380371.0, "step": 15930 }, { "entropy": 5.773072195053101, "epoch": 1.3387523629489604, "grad_norm": 1.6015625, "learning_rate": 0.00048245447058202815, "loss": 5.5592, "mean_token_accuracy": 0.1614100843667984, "num_tokens": 29389230.0, "step": 15935 }, { "entropy": 5.7593803882598875, "epoch": 1.3391724427641252, "grad_norm": 2.203125, "learning_rate": 0.0004824428751609126, "loss": 5.4466, "mean_token_accuracy": 0.16970301866531373, "num_tokens": 29398753.0, "step": 15940 }, { "entropy": 5.7071356773376465, "epoch": 1.3395925225792902, "grad_norm": 1.8828125, "learning_rate": 0.00048243127606505343, "loss": 5.4092, "mean_token_accuracy": 0.16827066540718078, "num_tokens": 29407487.0, "step": 15945 }, { "entropy": 5.572418594360352, "epoch": 1.340012602394455, "grad_norm": 1.4765625, "learning_rate": 0.000482419673294656, "loss": 5.4018, "mean_token_accuracy": 0.16651310175657272, "num_tokens": 29416140.0, "step": 15950 }, { "entropy": 5.64957218170166, "epoch": 1.3404326822096198, "grad_norm": 1.5390625, "learning_rate": 0.0004824080668499259, "loss": 5.4397, "mean_token_accuracy": 0.1690505862236023, "num_tokens": 29424763.0, "step": 15955 }, { "entropy": 5.800030183792114, "epoch": 1.3408527620247848, "grad_norm": 1.546875, "learning_rate": 0.00048239645673106855, "loss": 5.4385, "mean_token_accuracy": 0.16088547855615615, "num_tokens": 29434589.0, "step": 15960 }, { "entropy": 5.71432843208313, "epoch": 1.3412728418399495, "grad_norm": 1.7109375, "learning_rate": 0.00048238484293828995, "loss": 5.4479, "mean_token_accuracy": 0.16145109385252, "num_tokens": 29443549.0, "step": 15965 }, { "entropy": 5.6876280307769775, "epoch": 1.3416929216551146, "grad_norm": 2.046875, "learning_rate": 0.0004823732254717955, "loss": 5.4565, "mean_token_accuracy": 0.16495574414730071, "num_tokens": 29452457.0, "step": 15970 }, { "entropy": 5.612264728546142, "epoch": 1.3421130014702793, "grad_norm": 1.390625, "learning_rate": 0.0004823616043317912, "loss": 5.4241, "mean_token_accuracy": 0.16470324099063874, "num_tokens": 29461238.0, "step": 15975 }, { "entropy": 5.720156478881836, "epoch": 1.3425330812854441, "grad_norm": 1.7578125, "learning_rate": 0.00048234997951848284, "loss": 5.4857, "mean_token_accuracy": 0.15919755399227142, "num_tokens": 29471170.0, "step": 15980 }, { "entropy": 5.813641786575317, "epoch": 1.3429531611006091, "grad_norm": 2.0625, "learning_rate": 0.0004823383510320764, "loss": 5.5245, "mean_token_accuracy": 0.155257136374712, "num_tokens": 29481017.0, "step": 15985 }, { "entropy": 5.799026155471802, "epoch": 1.343373240915774, "grad_norm": 1.90625, "learning_rate": 0.00048232671887277786, "loss": 5.457, "mean_token_accuracy": 0.1612869530916214, "num_tokens": 29489809.0, "step": 15990 }, { "entropy": 5.6579841613769535, "epoch": 1.343793320730939, "grad_norm": 1.65625, "learning_rate": 0.00048231508304079313, "loss": 5.4711, "mean_token_accuracy": 0.16473791301250457, "num_tokens": 29499499.0, "step": 15995 }, { "entropy": 5.745253086090088, "epoch": 1.3442134005461037, "grad_norm": 1.6328125, "learning_rate": 0.00048230344353632855, "loss": 5.4375, "mean_token_accuracy": 0.16314539089798927, "num_tokens": 29508526.0, "step": 16000 }, { "entropy": 5.741327238082886, "epoch": 1.3446334803612685, "grad_norm": 1.3671875, "learning_rate": 0.0004822918003595902, "loss": 5.3692, "mean_token_accuracy": 0.1664547398686409, "num_tokens": 29517516.0, "step": 16005 }, { "entropy": 5.649990653991699, "epoch": 1.3450535601764335, "grad_norm": 1.5, "learning_rate": 0.0004822801535107843, "loss": 5.4562, "mean_token_accuracy": 0.16211575120687485, "num_tokens": 29526949.0, "step": 16010 }, { "entropy": 5.62546067237854, "epoch": 1.3454736399915985, "grad_norm": 1.546875, "learning_rate": 0.0004822685029901173, "loss": 5.3694, "mean_token_accuracy": 0.16785492449998857, "num_tokens": 29536696.0, "step": 16015 }, { "entropy": 5.697886323928833, "epoch": 1.3458937198067633, "grad_norm": 1.6015625, "learning_rate": 0.0004822568487977954, "loss": 5.4598, "mean_token_accuracy": 0.1707649677991867, "num_tokens": 29545672.0, "step": 16020 }, { "entropy": 5.72620997428894, "epoch": 1.346313799621928, "grad_norm": 1.5390625, "learning_rate": 0.00048224519093402517, "loss": 5.4987, "mean_token_accuracy": 0.16094502359628676, "num_tokens": 29554888.0, "step": 16025 }, { "entropy": 5.706309843063354, "epoch": 1.346733879437093, "grad_norm": 1.7734375, "learning_rate": 0.00048223352939901317, "loss": 5.4213, "mean_token_accuracy": 0.1683374136686325, "num_tokens": 29564798.0, "step": 16030 }, { "entropy": 5.692904901504517, "epoch": 1.347153959252258, "grad_norm": 1.5078125, "learning_rate": 0.0004822218641929658, "loss": 5.4523, "mean_token_accuracy": 0.16932614743709565, "num_tokens": 29574802.0, "step": 16035 }, { "entropy": 5.79500937461853, "epoch": 1.347574039067423, "grad_norm": 1.8984375, "learning_rate": 0.0004822101953160899, "loss": 5.4429, "mean_token_accuracy": 0.16303310692310333, "num_tokens": 29583056.0, "step": 16040 }, { "entropy": 5.704788446426392, "epoch": 1.3479941188825877, "grad_norm": 1.4375, "learning_rate": 0.000482198522768592, "loss": 5.4188, "mean_token_accuracy": 0.1648677781224251, "num_tokens": 29591935.0, "step": 16045 }, { "entropy": 5.581204128265381, "epoch": 1.3484141986977525, "grad_norm": 1.7578125, "learning_rate": 0.00048218684655067907, "loss": 5.3587, "mean_token_accuracy": 0.16874558329582215, "num_tokens": 29600812.0, "step": 16050 }, { "entropy": 5.74789342880249, "epoch": 1.3488342785129175, "grad_norm": 1.6484375, "learning_rate": 0.0004821751666625577, "loss": 5.4803, "mean_token_accuracy": 0.16880127936601638, "num_tokens": 29610735.0, "step": 16055 }, { "entropy": 5.74139404296875, "epoch": 1.3492543583280823, "grad_norm": 1.9375, "learning_rate": 0.00048216348310443506, "loss": 5.4079, "mean_token_accuracy": 0.1595388814806938, "num_tokens": 29620295.0, "step": 16060 }, { "entropy": 5.614044618606568, "epoch": 1.3496744381432473, "grad_norm": 1.5390625, "learning_rate": 0.00048215179587651795, "loss": 5.278, "mean_token_accuracy": 0.17504663914442062, "num_tokens": 29628214.0, "step": 16065 }, { "entropy": 5.613619422912597, "epoch": 1.350094517958412, "grad_norm": 2.390625, "learning_rate": 0.0004821401049790134, "loss": 5.407, "mean_token_accuracy": 0.17384071946144103, "num_tokens": 29636598.0, "step": 16070 }, { "entropy": 5.722601461410522, "epoch": 1.3505145977735769, "grad_norm": 1.796875, "learning_rate": 0.0004821284104121286, "loss": 5.3986, "mean_token_accuracy": 0.16711462736129762, "num_tokens": 29646052.0, "step": 16075 }, { "entropy": 5.650021648406982, "epoch": 1.3509346775887419, "grad_norm": 1.6796875, "learning_rate": 0.00048211671217607066, "loss": 5.4292, "mean_token_accuracy": 0.1578374594449997, "num_tokens": 29655310.0, "step": 16080 }, { "entropy": 5.695374917984009, "epoch": 1.3513547574039069, "grad_norm": 1.578125, "learning_rate": 0.0004821050102710468, "loss": 5.4328, "mean_token_accuracy": 0.16689082086086274, "num_tokens": 29664020.0, "step": 16085 }, { "entropy": 5.67445330619812, "epoch": 1.3517748372190717, "grad_norm": 1.6875, "learning_rate": 0.00048209330469726433, "loss": 5.4928, "mean_token_accuracy": 0.16078488826751708, "num_tokens": 29672416.0, "step": 16090 }, { "entropy": 5.699030542373658, "epoch": 1.3521949170342364, "grad_norm": 1.5234375, "learning_rate": 0.00048208159545493057, "loss": 5.386, "mean_token_accuracy": 0.17253393828868865, "num_tokens": 29681148.0, "step": 16095 }, { "entropy": 5.662472581863403, "epoch": 1.3526149968494015, "grad_norm": 1.5, "learning_rate": 0.0004820698825442531, "loss": 5.356, "mean_token_accuracy": 0.16811198592185975, "num_tokens": 29689089.0, "step": 16100 }, { "entropy": 5.661127424240112, "epoch": 1.3530350766645662, "grad_norm": 2.03125, "learning_rate": 0.00048205816596543914, "loss": 5.4761, "mean_token_accuracy": 0.1623773142695427, "num_tokens": 29697704.0, "step": 16105 }, { "entropy": 5.721720790863037, "epoch": 1.3534551564797312, "grad_norm": 1.75, "learning_rate": 0.00048204644571869646, "loss": 5.4838, "mean_token_accuracy": 0.1618230536580086, "num_tokens": 29706966.0, "step": 16110 }, { "entropy": 5.689847612380982, "epoch": 1.353875236294896, "grad_norm": 1.3984375, "learning_rate": 0.0004820347218042326, "loss": 5.3846, "mean_token_accuracy": 0.1613849386572838, "num_tokens": 29715817.0, "step": 16115 }, { "entropy": 5.680365753173828, "epoch": 1.3542953161100608, "grad_norm": 1.5234375, "learning_rate": 0.0004820229942222553, "loss": 5.4815, "mean_token_accuracy": 0.16351019442081452, "num_tokens": 29725500.0, "step": 16120 }, { "entropy": 5.6516200542449955, "epoch": 1.3547153959252258, "grad_norm": 1.5390625, "learning_rate": 0.00048201126297297214, "loss": 5.4144, "mean_token_accuracy": 0.1723678767681122, "num_tokens": 29734774.0, "step": 16125 }, { "entropy": 5.713293790817261, "epoch": 1.3551354757403906, "grad_norm": 1.75, "learning_rate": 0.0004819995280565911, "loss": 5.3916, "mean_token_accuracy": 0.16618053019046783, "num_tokens": 29744667.0, "step": 16130 }, { "entropy": 5.790366268157959, "epoch": 1.3555555555555556, "grad_norm": 1.4921875, "learning_rate": 0.00048198778947332, "loss": 5.4858, "mean_token_accuracy": 0.16581830829381944, "num_tokens": 29753644.0, "step": 16135 }, { "entropy": 5.781135702133179, "epoch": 1.3559756353707204, "grad_norm": 1.828125, "learning_rate": 0.0004819760472233668, "loss": 5.4401, "mean_token_accuracy": 0.17537587881088257, "num_tokens": 29762977.0, "step": 16140 }, { "entropy": 5.652209234237671, "epoch": 1.3563957151858852, "grad_norm": 1.5078125, "learning_rate": 0.00048196430130693956, "loss": 5.417, "mean_token_accuracy": 0.1675757497549057, "num_tokens": 29772221.0, "step": 16145 }, { "entropy": 5.621037292480469, "epoch": 1.3568157950010502, "grad_norm": 1.765625, "learning_rate": 0.00048195255172424627, "loss": 5.3946, "mean_token_accuracy": 0.17199670076370238, "num_tokens": 29781240.0, "step": 16150 }, { "entropy": 5.7102892875671385, "epoch": 1.3572358748162152, "grad_norm": 1.4375, "learning_rate": 0.00048194079847549507, "loss": 5.3836, "mean_token_accuracy": 0.1677268549799919, "num_tokens": 29790330.0, "step": 16155 }, { "entropy": 5.742030811309815, "epoch": 1.35765595463138, "grad_norm": 1.5390625, "learning_rate": 0.0004819290415608942, "loss": 5.5299, "mean_token_accuracy": 0.15959240794181823, "num_tokens": 29800945.0, "step": 16160 }, { "entropy": 5.749591875076294, "epoch": 1.3580760344465448, "grad_norm": 1.3515625, "learning_rate": 0.0004819172809806519, "loss": 5.5563, "mean_token_accuracy": 0.16161940693855287, "num_tokens": 29810391.0, "step": 16165 }, { "entropy": 5.724706315994263, "epoch": 1.3584961142617098, "grad_norm": 1.671875, "learning_rate": 0.00048190551673497645, "loss": 5.4101, "mean_token_accuracy": 0.16489760130643843, "num_tokens": 29819511.0, "step": 16170 }, { "entropy": 5.671798896789551, "epoch": 1.3589161940768746, "grad_norm": 3.046875, "learning_rate": 0.0004818937488240764, "loss": 5.4587, "mean_token_accuracy": 0.16651098430156708, "num_tokens": 29828313.0, "step": 16175 }, { "entropy": 5.613863277435303, "epoch": 1.3593362738920396, "grad_norm": 1.6796875, "learning_rate": 0.00048188197724816014, "loss": 5.3552, "mean_token_accuracy": 0.17119555920362473, "num_tokens": 29837940.0, "step": 16180 }, { "entropy": 5.6810376167297365, "epoch": 1.3597563537072044, "grad_norm": 1.8828125, "learning_rate": 0.00048187020200743613, "loss": 5.3383, "mean_token_accuracy": 0.17339792847633362, "num_tokens": 29846799.0, "step": 16185 }, { "entropy": 5.665157318115234, "epoch": 1.3601764335223692, "grad_norm": 1.8046875, "learning_rate": 0.000481858423102113, "loss": 5.4742, "mean_token_accuracy": 0.16402493715286254, "num_tokens": 29856263.0, "step": 16190 }, { "entropy": 5.644852066040039, "epoch": 1.3605965133375342, "grad_norm": 1.8046875, "learning_rate": 0.0004818466405323994, "loss": 5.4008, "mean_token_accuracy": 0.16702589765191078, "num_tokens": 29864335.0, "step": 16195 }, { "entropy": 5.780227518081665, "epoch": 1.361016593152699, "grad_norm": 2.765625, "learning_rate": 0.00048183485429850417, "loss": 5.4571, "mean_token_accuracy": 0.16093909740447998, "num_tokens": 29873466.0, "step": 16200 }, { "entropy": 5.650618982315064, "epoch": 1.361436672967864, "grad_norm": 1.5859375, "learning_rate": 0.0004818230644006359, "loss": 5.4313, "mean_token_accuracy": 0.1745832309126854, "num_tokens": 29883051.0, "step": 16205 }, { "entropy": 5.6727265357971195, "epoch": 1.3618567527830288, "grad_norm": 1.8359375, "learning_rate": 0.0004818112708390036, "loss": 5.3724, "mean_token_accuracy": 0.16966692954301835, "num_tokens": 29891823.0, "step": 16210 }, { "entropy": 5.6647271633148195, "epoch": 1.3622768325981935, "grad_norm": 1.9921875, "learning_rate": 0.0004817994736138162, "loss": 5.3974, "mean_token_accuracy": 0.16659445315599442, "num_tokens": 29900735.0, "step": 16215 }, { "entropy": 5.723177146911621, "epoch": 1.3626969124133586, "grad_norm": 2.046875, "learning_rate": 0.0004817876727252824, "loss": 5.4645, "mean_token_accuracy": 0.16937078535556793, "num_tokens": 29910345.0, "step": 16220 }, { "entropy": 5.680374002456665, "epoch": 1.3631169922285233, "grad_norm": 1.84375, "learning_rate": 0.00048177586817361166, "loss": 5.4253, "mean_token_accuracy": 0.16509459167718887, "num_tokens": 29919650.0, "step": 16225 }, { "entropy": 5.744551753997802, "epoch": 1.3635370720436883, "grad_norm": 1.5, "learning_rate": 0.0004817640599590128, "loss": 5.4634, "mean_token_accuracy": 0.16363565474748612, "num_tokens": 29928851.0, "step": 16230 }, { "entropy": 5.795070457458496, "epoch": 1.3639571518588531, "grad_norm": 2.5625, "learning_rate": 0.00048175224808169506, "loss": 5.5652, "mean_token_accuracy": 0.1574440762400627, "num_tokens": 29939146.0, "step": 16235 }, { "entropy": 5.739347171783447, "epoch": 1.3643772316740181, "grad_norm": 1.578125, "learning_rate": 0.00048174043254186775, "loss": 5.3954, "mean_token_accuracy": 0.16345300823450087, "num_tokens": 29947556.0, "step": 16240 }, { "entropy": 5.723556280136108, "epoch": 1.364797311489183, "grad_norm": 1.3828125, "learning_rate": 0.0004817286133397401, "loss": 5.4954, "mean_token_accuracy": 0.1634947583079338, "num_tokens": 29957319.0, "step": 16245 }, { "entropy": 5.711846876144409, "epoch": 1.365217391304348, "grad_norm": 1.53125, "learning_rate": 0.0004817167904755216, "loss": 5.4681, "mean_token_accuracy": 0.16776756644248964, "num_tokens": 29966697.0, "step": 16250 }, { "entropy": 5.69892258644104, "epoch": 1.3656374711195127, "grad_norm": 1.53125, "learning_rate": 0.00048170496394942154, "loss": 5.4705, "mean_token_accuracy": 0.16467532590031625, "num_tokens": 29975103.0, "step": 16255 }, { "entropy": 5.626475429534912, "epoch": 1.3660575509346775, "grad_norm": 2.0, "learning_rate": 0.00048169313376164943, "loss": 5.3783, "mean_token_accuracy": 0.1634665012359619, "num_tokens": 29984865.0, "step": 16260 }, { "entropy": 5.687254858016968, "epoch": 1.3664776307498425, "grad_norm": 2.6875, "learning_rate": 0.00048168129991241497, "loss": 5.3935, "mean_token_accuracy": 0.16465528607368468, "num_tokens": 29994376.0, "step": 16265 }, { "entropy": 5.81418023109436, "epoch": 1.3668977105650073, "grad_norm": 1.6328125, "learning_rate": 0.0004816694624019277, "loss": 5.6269, "mean_token_accuracy": 0.1598551630973816, "num_tokens": 30004846.0, "step": 16270 }, { "entropy": 5.721722793579102, "epoch": 1.3673177903801723, "grad_norm": 1.609375, "learning_rate": 0.00048165762123039723, "loss": 5.4061, "mean_token_accuracy": 0.16762069165706633, "num_tokens": 30014083.0, "step": 16275 }, { "entropy": 5.668401479721069, "epoch": 1.367737870195337, "grad_norm": 1.484375, "learning_rate": 0.00048164577639803354, "loss": 5.4075, "mean_token_accuracy": 0.16811236888170242, "num_tokens": 30023606.0, "step": 16280 }, { "entropy": 5.625358724594117, "epoch": 1.3681579500105019, "grad_norm": 1.640625, "learning_rate": 0.0004816339279050463, "loss": 5.3889, "mean_token_accuracy": 0.1599855825304985, "num_tokens": 30033657.0, "step": 16285 }, { "entropy": 5.6841939926147464, "epoch": 1.368578029825667, "grad_norm": 1.6953125, "learning_rate": 0.00048162207575164537, "loss": 5.4454, "mean_token_accuracy": 0.16324448585510254, "num_tokens": 30043230.0, "step": 16290 }, { "entropy": 5.704262971878052, "epoch": 1.3689981096408317, "grad_norm": 1.7421875, "learning_rate": 0.00048161021993804075, "loss": 5.4687, "mean_token_accuracy": 0.16441552191972733, "num_tokens": 30054457.0, "step": 16295 }, { "entropy": 5.6318847179412845, "epoch": 1.3694181894559967, "grad_norm": 1.4140625, "learning_rate": 0.00048159836046444255, "loss": 5.3108, "mean_token_accuracy": 0.17175357937812805, "num_tokens": 30062912.0, "step": 16300 }, { "entropy": 5.697698926925659, "epoch": 1.3698382692711615, "grad_norm": 1.5390625, "learning_rate": 0.0004815864973310607, "loss": 5.4661, "mean_token_accuracy": 0.16420117467641832, "num_tokens": 30071340.0, "step": 16305 }, { "entropy": 5.774897241592408, "epoch": 1.3702583490863263, "grad_norm": 1.8125, "learning_rate": 0.00048157463053810553, "loss": 5.5472, "mean_token_accuracy": 0.15643561482429505, "num_tokens": 30080334.0, "step": 16310 }, { "entropy": 5.682491111755371, "epoch": 1.3706784289014913, "grad_norm": 1.4609375, "learning_rate": 0.00048156276008578706, "loss": 5.3925, "mean_token_accuracy": 0.16573573052883148, "num_tokens": 30089391.0, "step": 16315 }, { "entropy": 5.652284622192383, "epoch": 1.3710985087166563, "grad_norm": 2.21875, "learning_rate": 0.0004815508859743157, "loss": 5.3808, "mean_token_accuracy": 0.1688121259212494, "num_tokens": 30099027.0, "step": 16320 }, { "entropy": 5.625274896621704, "epoch": 1.371518588531821, "grad_norm": 1.515625, "learning_rate": 0.0004815390082039017, "loss": 5.3788, "mean_token_accuracy": 0.16874595433473588, "num_tokens": 30108088.0, "step": 16325 }, { "entropy": 5.650168752670288, "epoch": 1.3719386683469859, "grad_norm": 1.9921875, "learning_rate": 0.00048152712677475556, "loss": 5.3689, "mean_token_accuracy": 0.16458612233400344, "num_tokens": 30117768.0, "step": 16330 }, { "entropy": 5.756920528411865, "epoch": 1.3723587481621509, "grad_norm": 1.6484375, "learning_rate": 0.00048151524168708773, "loss": 5.4856, "mean_token_accuracy": 0.1635723114013672, "num_tokens": 30126364.0, "step": 16335 }, { "entropy": 5.663647317886353, "epoch": 1.3727788279773157, "grad_norm": 3.171875, "learning_rate": 0.00048150335294110867, "loss": 5.4301, "mean_token_accuracy": 0.1666969671845436, "num_tokens": 30135365.0, "step": 16340 }, { "entropy": 5.731143760681152, "epoch": 1.3731989077924807, "grad_norm": 1.859375, "learning_rate": 0.00048149146053702915, "loss": 5.5047, "mean_token_accuracy": 0.17594754695892334, "num_tokens": 30145542.0, "step": 16345 }, { "entropy": 5.734094142913818, "epoch": 1.3736189876076454, "grad_norm": 2.21875, "learning_rate": 0.0004814795644750597, "loss": 5.5201, "mean_token_accuracy": 0.15887483209371567, "num_tokens": 30154100.0, "step": 16350 }, { "entropy": 5.701383399963379, "epoch": 1.3740390674228102, "grad_norm": 1.4609375, "learning_rate": 0.00048146766475541105, "loss": 5.3993, "mean_token_accuracy": 0.16724410504102707, "num_tokens": 30162647.0, "step": 16355 }, { "entropy": 5.855766916275025, "epoch": 1.3744591472379752, "grad_norm": 2.109375, "learning_rate": 0.00048145576137829406, "loss": 5.5619, "mean_token_accuracy": 0.1569045066833496, "num_tokens": 30172518.0, "step": 16360 }, { "entropy": 5.693779468536377, "epoch": 1.37487922705314, "grad_norm": 1.7421875, "learning_rate": 0.0004814438543439195, "loss": 5.4842, "mean_token_accuracy": 0.166504430770874, "num_tokens": 30183124.0, "step": 16365 }, { "entropy": 5.750344085693359, "epoch": 1.375299306868305, "grad_norm": 1.5234375, "learning_rate": 0.0004814319436524984, "loss": 5.4196, "mean_token_accuracy": 0.16698621958494186, "num_tokens": 30191861.0, "step": 16370 }, { "entropy": 5.5947545051574705, "epoch": 1.3757193866834698, "grad_norm": 1.75, "learning_rate": 0.00048142002930424174, "loss": 5.3228, "mean_token_accuracy": 0.16853681355714797, "num_tokens": 30200308.0, "step": 16375 }, { "entropy": 5.743504285812378, "epoch": 1.3761394664986346, "grad_norm": 1.875, "learning_rate": 0.0004814081112993605, "loss": 5.442, "mean_token_accuracy": 0.17036024779081343, "num_tokens": 30209380.0, "step": 16380 }, { "entropy": 5.772786664962768, "epoch": 1.3765595463137996, "grad_norm": 1.5390625, "learning_rate": 0.0004813961896380659, "loss": 5.5344, "mean_token_accuracy": 0.16031552404165267, "num_tokens": 30218549.0, "step": 16385 }, { "entropy": 5.6893415451049805, "epoch": 1.3769796261289646, "grad_norm": 2.1875, "learning_rate": 0.0004813842643205691, "loss": 5.4677, "mean_token_accuracy": 0.1622385114431381, "num_tokens": 30228119.0, "step": 16390 }, { "entropy": 5.672909450531006, "epoch": 1.3773997059441294, "grad_norm": 1.7890625, "learning_rate": 0.0004813723353470813, "loss": 5.4366, "mean_token_accuracy": 0.15988910496234893, "num_tokens": 30236765.0, "step": 16395 }, { "entropy": 5.758907604217529, "epoch": 1.3778197857592942, "grad_norm": 1.953125, "learning_rate": 0.0004813604027178139, "loss": 5.3763, "mean_token_accuracy": 0.16447694152593612, "num_tokens": 30246089.0, "step": 16400 }, { "entropy": 5.692288017272949, "epoch": 1.3782398655744592, "grad_norm": 1.671875, "learning_rate": 0.00048134846643297817, "loss": 5.4961, "mean_token_accuracy": 0.16211422756314278, "num_tokens": 30255806.0, "step": 16405 }, { "entropy": 5.754509162902832, "epoch": 1.378659945389624, "grad_norm": 1.609375, "learning_rate": 0.0004813365264927856, "loss": 5.5533, "mean_token_accuracy": 0.1538752794265747, "num_tokens": 30267112.0, "step": 16410 }, { "entropy": 5.724986410140991, "epoch": 1.379080025204789, "grad_norm": 1.71875, "learning_rate": 0.0004813245828974477, "loss": 5.4113, "mean_token_accuracy": 0.1641213044524193, "num_tokens": 30276168.0, "step": 16415 }, { "entropy": 5.690103244781494, "epoch": 1.3795001050199538, "grad_norm": 1.5703125, "learning_rate": 0.0004813126356471761, "loss": 5.4506, "mean_token_accuracy": 0.16688449084758758, "num_tokens": 30285723.0, "step": 16420 }, { "entropy": 5.785612440109253, "epoch": 1.3799201848351186, "grad_norm": 1.703125, "learning_rate": 0.0004813006847421824, "loss": 5.4945, "mean_token_accuracy": 0.16515985280275344, "num_tokens": 30294790.0, "step": 16425 }, { "entropy": 5.722445869445801, "epoch": 1.3803402646502836, "grad_norm": 1.7421875, "learning_rate": 0.0004812887301826783, "loss": 5.4235, "mean_token_accuracy": 0.16739188879728317, "num_tokens": 30303439.0, "step": 16430 }, { "entropy": 5.640029811859131, "epoch": 1.3807603444654484, "grad_norm": 1.9140625, "learning_rate": 0.0004812767719688755, "loss": 5.3987, "mean_token_accuracy": 0.162314510345459, "num_tokens": 30312493.0, "step": 16435 }, { "entropy": 5.695783567428589, "epoch": 1.3811804242806134, "grad_norm": 1.34375, "learning_rate": 0.0004812648101009859, "loss": 5.4447, "mean_token_accuracy": 0.1699496790766716, "num_tokens": 30321637.0, "step": 16440 }, { "entropy": 5.824322462081909, "epoch": 1.3816005040957782, "grad_norm": 2.171875, "learning_rate": 0.0004812528445792215, "loss": 5.5741, "mean_token_accuracy": 0.1524802938103676, "num_tokens": 30330730.0, "step": 16445 }, { "entropy": 5.700669240951538, "epoch": 1.382020583910943, "grad_norm": 1.7734375, "learning_rate": 0.00048124087540379407, "loss": 5.4013, "mean_token_accuracy": 0.16979680806398392, "num_tokens": 30339568.0, "step": 16450 }, { "entropy": 5.680627346038818, "epoch": 1.382440663726108, "grad_norm": 1.5546875, "learning_rate": 0.00048122890257491573, "loss": 5.444, "mean_token_accuracy": 0.1615915670990944, "num_tokens": 30349225.0, "step": 16455 }, { "entropy": 5.717861557006836, "epoch": 1.382860743541273, "grad_norm": 1.5625, "learning_rate": 0.00048121692609279866, "loss": 5.4418, "mean_token_accuracy": 0.1737132966518402, "num_tokens": 30358804.0, "step": 16460 }, { "entropy": 5.745238399505615, "epoch": 1.3832808233564378, "grad_norm": 1.7421875, "learning_rate": 0.0004812049459576549, "loss": 5.5181, "mean_token_accuracy": 0.167852421104908, "num_tokens": 30368490.0, "step": 16465 }, { "entropy": 5.783865261077881, "epoch": 1.3837009031716025, "grad_norm": 1.5390625, "learning_rate": 0.0004811929621696966, "loss": 5.4073, "mean_token_accuracy": 0.16754318177700042, "num_tokens": 30377117.0, "step": 16470 }, { "entropy": 5.615077972412109, "epoch": 1.3841209829867676, "grad_norm": 1.4765625, "learning_rate": 0.00048118097472913627, "loss": 5.295, "mean_token_accuracy": 0.17376861870288848, "num_tokens": 30385151.0, "step": 16475 }, { "entropy": 5.568413877487183, "epoch": 1.3845410628019323, "grad_norm": 1.484375, "learning_rate": 0.0004811689836361861, "loss": 5.348, "mean_token_accuracy": 0.16653158515691757, "num_tokens": 30394837.0, "step": 16480 }, { "entropy": 5.679996347427368, "epoch": 1.3849611426170974, "grad_norm": 1.71875, "learning_rate": 0.0004811569888910585, "loss": 5.422, "mean_token_accuracy": 0.16941581070423126, "num_tokens": 30403507.0, "step": 16485 }, { "entropy": 5.644715404510498, "epoch": 1.3853812224322621, "grad_norm": 1.7890625, "learning_rate": 0.0004811449904939661, "loss": 5.4117, "mean_token_accuracy": 0.16634142994880677, "num_tokens": 30412941.0, "step": 16490 }, { "entropy": 5.688458490371704, "epoch": 1.385801302247427, "grad_norm": 1.4765625, "learning_rate": 0.00048113298844512127, "loss": 5.3812, "mean_token_accuracy": 0.17350736260414124, "num_tokens": 30421823.0, "step": 16495 }, { "entropy": 5.663712358474731, "epoch": 1.386221382062592, "grad_norm": 1.5234375, "learning_rate": 0.0004811209827447367, "loss": 5.4873, "mean_token_accuracy": 0.1585498943924904, "num_tokens": 30431901.0, "step": 16500 }, { "entropy": 5.632642030715942, "epoch": 1.3866414618777567, "grad_norm": 1.6015625, "learning_rate": 0.00048110897339302504, "loss": 5.4315, "mean_token_accuracy": 0.16275101751089097, "num_tokens": 30442037.0, "step": 16505 }, { "entropy": 5.718168163299561, "epoch": 1.3870615416929217, "grad_norm": 1.59375, "learning_rate": 0.00048109696039019915, "loss": 5.3902, "mean_token_accuracy": 0.1704296126961708, "num_tokens": 30451189.0, "step": 16510 }, { "entropy": 5.751668882369995, "epoch": 1.3874816215080865, "grad_norm": 1.5234375, "learning_rate": 0.0004810849437364716, "loss": 5.4663, "mean_token_accuracy": 0.16614769995212555, "num_tokens": 30460214.0, "step": 16515 }, { "entropy": 5.718422794342041, "epoch": 1.3879017013232513, "grad_norm": 1.4609375, "learning_rate": 0.00048107292343205546, "loss": 5.4882, "mean_token_accuracy": 0.1601525142788887, "num_tokens": 30469936.0, "step": 16520 }, { "entropy": 5.689886426925659, "epoch": 1.3883217811384163, "grad_norm": 1.6484375, "learning_rate": 0.0004810608994771636, "loss": 5.4283, "mean_token_accuracy": 0.16565386056900025, "num_tokens": 30479282.0, "step": 16525 }, { "entropy": 5.748596954345703, "epoch": 1.388741860953581, "grad_norm": 1.5234375, "learning_rate": 0.000481048871872009, "loss": 5.4586, "mean_token_accuracy": 0.16205482929944992, "num_tokens": 30487839.0, "step": 16530 }, { "entropy": 5.734499311447143, "epoch": 1.389161940768746, "grad_norm": 1.5546875, "learning_rate": 0.00048103684061680463, "loss": 5.5037, "mean_token_accuracy": 0.16305503845214844, "num_tokens": 30497327.0, "step": 16535 }, { "entropy": 5.670412492752075, "epoch": 1.389582020583911, "grad_norm": 1.453125, "learning_rate": 0.00048102480571176384, "loss": 5.4037, "mean_token_accuracy": 0.1694550558924675, "num_tokens": 30506996.0, "step": 16540 }, { "entropy": 5.673905563354492, "epoch": 1.390002100399076, "grad_norm": 1.78125, "learning_rate": 0.0004810127671570997, "loss": 5.3351, "mean_token_accuracy": 0.17729466110467912, "num_tokens": 30515627.0, "step": 16545 }, { "entropy": 5.730953550338745, "epoch": 1.3904221802142407, "grad_norm": 1.546875, "learning_rate": 0.00048100072495302544, "loss": 5.4797, "mean_token_accuracy": 0.16208681911230088, "num_tokens": 30525858.0, "step": 16550 }, { "entropy": 5.621087074279785, "epoch": 1.3908422600294057, "grad_norm": 1.9609375, "learning_rate": 0.0004809886790997544, "loss": 5.3797, "mean_token_accuracy": 0.1725637599825859, "num_tokens": 30536331.0, "step": 16555 }, { "entropy": 5.680699825286865, "epoch": 1.3912623398445705, "grad_norm": 1.765625, "learning_rate": 0.0004809766295975, "loss": 5.4237, "mean_token_accuracy": 0.16701553165912628, "num_tokens": 30545329.0, "step": 16560 }, { "entropy": 5.67092752456665, "epoch": 1.3916824196597353, "grad_norm": 1.828125, "learning_rate": 0.0004809645764464757, "loss": 5.3724, "mean_token_accuracy": 0.17025694251060486, "num_tokens": 30554357.0, "step": 16565 }, { "entropy": 5.728373718261719, "epoch": 1.3921024994749003, "grad_norm": 1.5625, "learning_rate": 0.00048095251964689494, "loss": 5.5604, "mean_token_accuracy": 0.16157087236642836, "num_tokens": 30563548.0, "step": 16570 }, { "entropy": 5.7081492900848385, "epoch": 1.392522579290065, "grad_norm": 1.484375, "learning_rate": 0.00048094045919897134, "loss": 5.4307, "mean_token_accuracy": 0.16958432644605637, "num_tokens": 30572844.0, "step": 16575 }, { "entropy": 5.658297061920166, "epoch": 1.39294265910523, "grad_norm": 1.4296875, "learning_rate": 0.0004809283951029185, "loss": 5.3522, "mean_token_accuracy": 0.17243621349334717, "num_tokens": 30580930.0, "step": 16580 }, { "entropy": 5.699292230606079, "epoch": 1.3933627389203949, "grad_norm": 1.796875, "learning_rate": 0.0004809163273589503, "loss": 5.3531, "mean_token_accuracy": 0.1716527074575424, "num_tokens": 30589917.0, "step": 16585 }, { "entropy": 5.645009279251099, "epoch": 1.3937828187355596, "grad_norm": 1.5390625, "learning_rate": 0.00048090425596728035, "loss": 5.4546, "mean_token_accuracy": 0.16196119636297227, "num_tokens": 30599282.0, "step": 16590 }, { "entropy": 5.66185154914856, "epoch": 1.3942028985507247, "grad_norm": 1.625, "learning_rate": 0.00048089218092812254, "loss": 5.4357, "mean_token_accuracy": 0.16347795724868774, "num_tokens": 30608244.0, "step": 16595 }, { "entropy": 5.751768589019775, "epoch": 1.3946229783658894, "grad_norm": 1.84375, "learning_rate": 0.00048088010224169064, "loss": 5.5588, "mean_token_accuracy": 0.16680994927883147, "num_tokens": 30617340.0, "step": 16600 }, { "entropy": 5.784567546844483, "epoch": 1.3950430581810545, "grad_norm": 1.6484375, "learning_rate": 0.00048086801990819886, "loss": 5.4828, "mean_token_accuracy": 0.16346753984689713, "num_tokens": 30626244.0, "step": 16605 }, { "entropy": 5.667201566696167, "epoch": 1.3954631379962192, "grad_norm": 1.8359375, "learning_rate": 0.00048085593392786113, "loss": 5.4677, "mean_token_accuracy": 0.1689893737435341, "num_tokens": 30635279.0, "step": 16610 }, { "entropy": 5.747064113616943, "epoch": 1.395883217811384, "grad_norm": 1.671875, "learning_rate": 0.0004808438443008915, "loss": 5.5995, "mean_token_accuracy": 0.15962631851434708, "num_tokens": 30645790.0, "step": 16615 }, { "entropy": 5.690942096710205, "epoch": 1.396303297626549, "grad_norm": 4.6875, "learning_rate": 0.0004808317510275041, "loss": 5.45, "mean_token_accuracy": 0.16256778538227082, "num_tokens": 30654497.0, "step": 16620 }, { "entropy": 5.765830707550049, "epoch": 1.396723377441714, "grad_norm": 1.625, "learning_rate": 0.0004808196541079133, "loss": 5.5093, "mean_token_accuracy": 0.16061384826898575, "num_tokens": 30663760.0, "step": 16625 }, { "entropy": 5.737986993789673, "epoch": 1.3971434572568788, "grad_norm": 1.515625, "learning_rate": 0.00048080755354233326, "loss": 5.5036, "mean_token_accuracy": 0.17019174993038177, "num_tokens": 30674263.0, "step": 16630 }, { "entropy": 5.708775997161865, "epoch": 1.3975635370720436, "grad_norm": 1.5234375, "learning_rate": 0.0004807954493309784, "loss": 5.3802, "mean_token_accuracy": 0.16836380660533906, "num_tokens": 30683501.0, "step": 16635 }, { "entropy": 5.653238725662232, "epoch": 1.3979836168872086, "grad_norm": 1.734375, "learning_rate": 0.00048078334147406314, "loss": 5.3704, "mean_token_accuracy": 0.17907529175281525, "num_tokens": 30691917.0, "step": 16640 }, { "entropy": 5.636937618255615, "epoch": 1.3984036967023734, "grad_norm": 1.6328125, "learning_rate": 0.00048077122997180197, "loss": 5.4514, "mean_token_accuracy": 0.1658071830868721, "num_tokens": 30701753.0, "step": 16645 }, { "entropy": 5.557118940353393, "epoch": 1.3988237765175384, "grad_norm": 3.28125, "learning_rate": 0.0004807591148244093, "loss": 5.4191, "mean_token_accuracy": 0.16260174959897994, "num_tokens": 30710878.0, "step": 16650 }, { "entropy": 5.618271827697754, "epoch": 1.3992438563327032, "grad_norm": 1.46875, "learning_rate": 0.0004807469960321, "loss": 5.3137, "mean_token_accuracy": 0.17308908998966216, "num_tokens": 30719372.0, "step": 16655 }, { "entropy": 5.683672761917114, "epoch": 1.399663936147868, "grad_norm": 1.6328125, "learning_rate": 0.00048073487359508854, "loss": 5.4876, "mean_token_accuracy": 0.15821529626846315, "num_tokens": 30728529.0, "step": 16660 }, { "entropy": 5.769331645965576, "epoch": 1.400084015963033, "grad_norm": 2.234375, "learning_rate": 0.00048072274751358976, "loss": 5.4266, "mean_token_accuracy": 0.16961006075143814, "num_tokens": 30737704.0, "step": 16665 }, { "entropy": 5.672802448272705, "epoch": 1.4005040957781978, "grad_norm": 1.9140625, "learning_rate": 0.00048071061778781843, "loss": 5.4031, "mean_token_accuracy": 0.16336706131696702, "num_tokens": 30747836.0, "step": 16670 }, { "entropy": 5.595252180099488, "epoch": 1.4009241755933628, "grad_norm": 1.828125, "learning_rate": 0.0004806984844179894, "loss": 5.4637, "mean_token_accuracy": 0.16031693965196608, "num_tokens": 30757881.0, "step": 16675 }, { "entropy": 5.706535530090332, "epoch": 1.4013442554085276, "grad_norm": 1.5234375, "learning_rate": 0.00048068634740431774, "loss": 5.4726, "mean_token_accuracy": 0.1561596304178238, "num_tokens": 30767592.0, "step": 16680 }, { "entropy": 5.703032445907593, "epoch": 1.4017643352236924, "grad_norm": 1.9140625, "learning_rate": 0.0004806742067470182, "loss": 5.435, "mean_token_accuracy": 0.16835850328207017, "num_tokens": 30776633.0, "step": 16685 }, { "entropy": 5.748832893371582, "epoch": 1.4021844150388574, "grad_norm": 1.46875, "learning_rate": 0.00048066206244630613, "loss": 5.3957, "mean_token_accuracy": 0.1625844269990921, "num_tokens": 30785195.0, "step": 16690 }, { "entropy": 5.596337413787841, "epoch": 1.4026044948540224, "grad_norm": 1.4296875, "learning_rate": 0.00048064991450239643, "loss": 5.3959, "mean_token_accuracy": 0.16495241075754166, "num_tokens": 30794397.0, "step": 16695 }, { "entropy": 5.76853609085083, "epoch": 1.4030245746691872, "grad_norm": 1.75, "learning_rate": 0.00048063776291550444, "loss": 5.5523, "mean_token_accuracy": 0.1575335018336773, "num_tokens": 30803312.0, "step": 16700 }, { "entropy": 5.758233070373535, "epoch": 1.403444654484352, "grad_norm": 1.75, "learning_rate": 0.00048062560768584537, "loss": 5.4565, "mean_token_accuracy": 0.17063064128160477, "num_tokens": 30812519.0, "step": 16705 }, { "entropy": 5.646391153335571, "epoch": 1.403864734299517, "grad_norm": 1.515625, "learning_rate": 0.00048061344881363444, "loss": 5.4061, "mean_token_accuracy": 0.17314539104700089, "num_tokens": 30821558.0, "step": 16710 }, { "entropy": 5.68760871887207, "epoch": 1.4042848141146818, "grad_norm": 1.5078125, "learning_rate": 0.0004806012862990873, "loss": 5.4262, "mean_token_accuracy": 0.16372249722480775, "num_tokens": 30831521.0, "step": 16715 }, { "entropy": 5.68061900138855, "epoch": 1.4047048939298468, "grad_norm": 1.3671875, "learning_rate": 0.00048058912014241914, "loss": 5.4044, "mean_token_accuracy": 0.16505313515663148, "num_tokens": 30841191.0, "step": 16720 }, { "entropy": 5.709570789337159, "epoch": 1.4051249737450116, "grad_norm": 1.703125, "learning_rate": 0.0004805769503438456, "loss": 5.5102, "mean_token_accuracy": 0.1652674689888954, "num_tokens": 30850556.0, "step": 16725 }, { "entropy": 5.701706600189209, "epoch": 1.4055450535601763, "grad_norm": 1.796875, "learning_rate": 0.00048056477690358227, "loss": 5.4131, "mean_token_accuracy": 0.1686984494328499, "num_tokens": 30859410.0, "step": 16730 }, { "entropy": 5.7712499618530275, "epoch": 1.4059651333753413, "grad_norm": 1.546875, "learning_rate": 0.0004805525998218447, "loss": 5.4582, "mean_token_accuracy": 0.16039325296878815, "num_tokens": 30868048.0, "step": 16735 }, { "entropy": 5.7124098777771, "epoch": 1.4063852131905061, "grad_norm": 1.515625, "learning_rate": 0.00048054041909884873, "loss": 5.4697, "mean_token_accuracy": 0.16726680248975753, "num_tokens": 30876785.0, "step": 16740 }, { "entropy": 5.764161920547485, "epoch": 1.4068052930056711, "grad_norm": 1.4375, "learning_rate": 0.00048052823473481007, "loss": 5.5345, "mean_token_accuracy": 0.16368919163942336, "num_tokens": 30886158.0, "step": 16745 }, { "entropy": 5.709494638442993, "epoch": 1.407225372820836, "grad_norm": 1.4140625, "learning_rate": 0.00048051604672994446, "loss": 5.3873, "mean_token_accuracy": 0.1646023690700531, "num_tokens": 30895283.0, "step": 16750 }, { "entropy": 5.696353149414063, "epoch": 1.4076454526360007, "grad_norm": 1.59375, "learning_rate": 0.00048050385508446804, "loss": 5.4284, "mean_token_accuracy": 0.16812965720891954, "num_tokens": 30905514.0, "step": 16755 }, { "entropy": 5.664879083633423, "epoch": 1.4080655324511657, "grad_norm": 1.4921875, "learning_rate": 0.00048049165979859655, "loss": 5.331, "mean_token_accuracy": 0.18449335247278215, "num_tokens": 30914794.0, "step": 16760 }, { "entropy": 5.610575008392334, "epoch": 1.4084856122663307, "grad_norm": 1.4609375, "learning_rate": 0.00048047946087254615, "loss": 5.3627, "mean_token_accuracy": 0.16559927463531493, "num_tokens": 30923823.0, "step": 16765 }, { "entropy": 5.6542726993560795, "epoch": 1.4089056920814955, "grad_norm": 1.7421875, "learning_rate": 0.00048046725830653295, "loss": 5.4819, "mean_token_accuracy": 0.16385638117790222, "num_tokens": 30932738.0, "step": 16770 }, { "entropy": 5.704129838943482, "epoch": 1.4093257718966603, "grad_norm": 1.4609375, "learning_rate": 0.00048045505210077304, "loss": 5.4767, "mean_token_accuracy": 0.15995497554540633, "num_tokens": 30942302.0, "step": 16775 }, { "entropy": 5.705305194854736, "epoch": 1.4097458517118253, "grad_norm": 1.765625, "learning_rate": 0.0004804428422554826, "loss": 5.3999, "mean_token_accuracy": 0.16517668217420578, "num_tokens": 30951662.0, "step": 16780 }, { "entropy": 5.643369197845459, "epoch": 1.41016593152699, "grad_norm": 2.484375, "learning_rate": 0.0004804306287708782, "loss": 5.4139, "mean_token_accuracy": 0.1685831978917122, "num_tokens": 30960475.0, "step": 16785 }, { "entropy": 5.592676210403442, "epoch": 1.410586011342155, "grad_norm": 1.5625, "learning_rate": 0.00048041841164717574, "loss": 5.2528, "mean_token_accuracy": 0.1767956107854843, "num_tokens": 30969075.0, "step": 16790 }, { "entropy": 5.635186338424683, "epoch": 1.41100609115732, "grad_norm": 1.6484375, "learning_rate": 0.0004804061908845921, "loss": 5.3445, "mean_token_accuracy": 0.17429747730493544, "num_tokens": 30978030.0, "step": 16795 }, { "entropy": 5.63826003074646, "epoch": 1.4114261709724847, "grad_norm": 1.875, "learning_rate": 0.00048039396648334346, "loss": 5.322, "mean_token_accuracy": 0.16926524937152862, "num_tokens": 30985639.0, "step": 16800 }, { "entropy": 5.685590744018555, "epoch": 1.4118462507876497, "grad_norm": 1.625, "learning_rate": 0.0004803817384436465, "loss": 5.4499, "mean_token_accuracy": 0.16543682664632797, "num_tokens": 30994811.0, "step": 16805 }, { "entropy": 5.71953272819519, "epoch": 1.4122663306028145, "grad_norm": 1.578125, "learning_rate": 0.0004803695067657178, "loss": 5.428, "mean_token_accuracy": 0.16598510146141052, "num_tokens": 31003813.0, "step": 16810 }, { "entropy": 5.641027021408081, "epoch": 1.4126864104179795, "grad_norm": 1.65625, "learning_rate": 0.000480357271449774, "loss": 5.3693, "mean_token_accuracy": 0.1740890622138977, "num_tokens": 31012488.0, "step": 16815 }, { "entropy": 5.6430689811706545, "epoch": 1.4131064902331443, "grad_norm": 1.9375, "learning_rate": 0.0004803450324960318, "loss": 5.3921, "mean_token_accuracy": 0.16979921013116836, "num_tokens": 31021089.0, "step": 16820 }, { "entropy": 5.653257369995117, "epoch": 1.413526570048309, "grad_norm": 2.484375, "learning_rate": 0.00048033278990470825, "loss": 5.4096, "mean_token_accuracy": 0.16547489091753959, "num_tokens": 31029903.0, "step": 16825 }, { "entropy": 5.63095440864563, "epoch": 1.413946649863474, "grad_norm": 1.5, "learning_rate": 0.00048032054367601996, "loss": 5.421, "mean_token_accuracy": 0.1633308783173561, "num_tokens": 31039207.0, "step": 16830 }, { "entropy": 5.651738262176513, "epoch": 1.414366729678639, "grad_norm": 2.078125, "learning_rate": 0.00048030829381018396, "loss": 5.4428, "mean_token_accuracy": 0.16122666299343108, "num_tokens": 31048190.0, "step": 16835 }, { "entropy": 5.708361196517944, "epoch": 1.4147868094938039, "grad_norm": 2.0, "learning_rate": 0.0004802960403074173, "loss": 5.5316, "mean_token_accuracy": 0.16462094187736512, "num_tokens": 31058769.0, "step": 16840 }, { "entropy": 5.701053285598755, "epoch": 1.4152068893089687, "grad_norm": 1.546875, "learning_rate": 0.00048028378316793705, "loss": 5.4687, "mean_token_accuracy": 0.16018210723996162, "num_tokens": 31066830.0, "step": 16845 }, { "entropy": 5.762956762313843, "epoch": 1.4156269691241337, "grad_norm": 1.515625, "learning_rate": 0.0004802715223919602, "loss": 5.5172, "mean_token_accuracy": 0.16773394793272017, "num_tokens": 31077205.0, "step": 16850 }, { "entropy": 5.7432409763336185, "epoch": 1.4160470489392984, "grad_norm": 1.4140625, "learning_rate": 0.00048025925797970403, "loss": 5.4479, "mean_token_accuracy": 0.17057251334190368, "num_tokens": 31087327.0, "step": 16855 }, { "entropy": 5.639508008956909, "epoch": 1.4164671287544635, "grad_norm": 1.5859375, "learning_rate": 0.00048024698993138587, "loss": 5.3833, "mean_token_accuracy": 0.16887278407812117, "num_tokens": 31096501.0, "step": 16860 }, { "entropy": 5.735842370986939, "epoch": 1.4168872085696282, "grad_norm": 1.5703125, "learning_rate": 0.00048023471824722294, "loss": 5.5523, "mean_token_accuracy": 0.1566422998905182, "num_tokens": 31105949.0, "step": 16865 }, { "entropy": 5.765266227722168, "epoch": 1.417307288384793, "grad_norm": 1.8203125, "learning_rate": 0.00048022244292743256, "loss": 5.4616, "mean_token_accuracy": 0.1579113557934761, "num_tokens": 31115482.0, "step": 16870 }, { "entropy": 5.7321278095245365, "epoch": 1.417727368199958, "grad_norm": 1.8046875, "learning_rate": 0.00048021016397223234, "loss": 5.407, "mean_token_accuracy": 0.16931116878986358, "num_tokens": 31124758.0, "step": 16875 }, { "entropy": 5.654321622848511, "epoch": 1.4181474480151228, "grad_norm": 1.4453125, "learning_rate": 0.00048019788138183977, "loss": 5.2972, "mean_token_accuracy": 0.17919143736362458, "num_tokens": 31134114.0, "step": 16880 }, { "entropy": 5.586613321304322, "epoch": 1.4185675278302878, "grad_norm": 1.6796875, "learning_rate": 0.00048018559515647244, "loss": 5.3523, "mean_token_accuracy": 0.17073431313037873, "num_tokens": 31142667.0, "step": 16885 }, { "entropy": 5.671979999542236, "epoch": 1.4189876076454526, "grad_norm": 1.421875, "learning_rate": 0.00048017330529634785, "loss": 5.4433, "mean_token_accuracy": 0.1582137778401375, "num_tokens": 31152105.0, "step": 16890 }, { "entropy": 5.698092317581176, "epoch": 1.4194076874606174, "grad_norm": 1.546875, "learning_rate": 0.00048016101180168376, "loss": 5.4208, "mean_token_accuracy": 0.1703786239027977, "num_tokens": 31160277.0, "step": 16895 }, { "entropy": 5.8327394962310795, "epoch": 1.4198277672757824, "grad_norm": 1.4296875, "learning_rate": 0.00048014871467269804, "loss": 5.6275, "mean_token_accuracy": 0.15696136504411698, "num_tokens": 31170677.0, "step": 16900 }, { "entropy": 5.710501289367675, "epoch": 1.4202478470909472, "grad_norm": 2.4375, "learning_rate": 0.00048013641390960856, "loss": 5.413, "mean_token_accuracy": 0.16240498870611192, "num_tokens": 31179298.0, "step": 16905 }, { "entropy": 5.650837802886963, "epoch": 1.4206679269061122, "grad_norm": 1.3984375, "learning_rate": 0.0004801241095126331, "loss": 5.4281, "mean_token_accuracy": 0.16397203356027604, "num_tokens": 31188547.0, "step": 16910 }, { "entropy": 5.683053827285766, "epoch": 1.421088006721277, "grad_norm": 1.671875, "learning_rate": 0.0004801118014819896, "loss": 5.429, "mean_token_accuracy": 0.16916512846946716, "num_tokens": 31197680.0, "step": 16915 }, { "entropy": 5.670131063461303, "epoch": 1.421508086536442, "grad_norm": 1.6875, "learning_rate": 0.0004800994898178962, "loss": 5.3795, "mean_token_accuracy": 0.17118050009012223, "num_tokens": 31206351.0, "step": 16920 }, { "entropy": 5.689674186706543, "epoch": 1.4219281663516068, "grad_norm": 1.609375, "learning_rate": 0.0004800871745205708, "loss": 5.5787, "mean_token_accuracy": 0.15732864812016487, "num_tokens": 31216478.0, "step": 16925 }, { "entropy": 5.787313032150268, "epoch": 1.4223482461667718, "grad_norm": 1.3125, "learning_rate": 0.00048007485559023195, "loss": 5.5266, "mean_token_accuracy": 0.15895494371652602, "num_tokens": 31225920.0, "step": 16930 }, { "entropy": 5.704079055786133, "epoch": 1.4227683259819366, "grad_norm": 1.5625, "learning_rate": 0.0004800625330270975, "loss": 5.4163, "mean_token_accuracy": 0.1649041622877121, "num_tokens": 31235061.0, "step": 16935 }, { "entropy": 5.615435409545898, "epoch": 1.4231884057971014, "grad_norm": 1.4296875, "learning_rate": 0.0004800502068313859, "loss": 5.3819, "mean_token_accuracy": 0.17181412726640702, "num_tokens": 31243448.0, "step": 16940 }, { "entropy": 5.699060869216919, "epoch": 1.4236084856122664, "grad_norm": 1.4453125, "learning_rate": 0.0004800378770033154, "loss": 5.4936, "mean_token_accuracy": 0.16858059167861938, "num_tokens": 31252569.0, "step": 16945 }, { "entropy": 5.703611755371094, "epoch": 1.4240285654274312, "grad_norm": 1.7890625, "learning_rate": 0.0004800255435431046, "loss": 5.3883, "mean_token_accuracy": 0.17073103338479995, "num_tokens": 31261905.0, "step": 16950 }, { "entropy": 5.615508317947388, "epoch": 1.4244486452425962, "grad_norm": 2.03125, "learning_rate": 0.00048001320645097177, "loss": 5.361, "mean_token_accuracy": 0.1737958535552025, "num_tokens": 31271203.0, "step": 16955 }, { "entropy": 5.630927085876465, "epoch": 1.424868725057761, "grad_norm": 1.59375, "learning_rate": 0.00048000086572713566, "loss": 5.354, "mean_token_accuracy": 0.17280679643154145, "num_tokens": 31279812.0, "step": 16960 }, { "entropy": 5.674156904220581, "epoch": 1.4252888048729258, "grad_norm": 1.4921875, "learning_rate": 0.0004799885213718147, "loss": 5.4149, "mean_token_accuracy": 0.16382081657648087, "num_tokens": 31289615.0, "step": 16965 }, { "entropy": 5.658738088607788, "epoch": 1.4257088846880908, "grad_norm": 1.4921875, "learning_rate": 0.00047997617338522763, "loss": 5.3518, "mean_token_accuracy": 0.17239830791950225, "num_tokens": 31298947.0, "step": 16970 }, { "entropy": 5.650487899780273, "epoch": 1.4261289645032555, "grad_norm": 1.5546875, "learning_rate": 0.00047996382176759324, "loss": 5.33, "mean_token_accuracy": 0.17185672670602797, "num_tokens": 31307465.0, "step": 16975 }, { "entropy": 5.605889129638672, "epoch": 1.4265490443184206, "grad_norm": 1.515625, "learning_rate": 0.0004799514665191303, "loss": 5.4702, "mean_token_accuracy": 0.16345242261886597, "num_tokens": 31317682.0, "step": 16980 }, { "entropy": 5.726818227767945, "epoch": 1.4269691241335853, "grad_norm": 1.9296875, "learning_rate": 0.0004799391076400576, "loss": 5.4472, "mean_token_accuracy": 0.16512487083673477, "num_tokens": 31326113.0, "step": 16985 }, { "entropy": 5.791937685012817, "epoch": 1.4273892039487501, "grad_norm": 1.7109375, "learning_rate": 0.00047992674513059415, "loss": 5.4919, "mean_token_accuracy": 0.16668398678302765, "num_tokens": 31335263.0, "step": 16990 }, { "entropy": 5.6625199794769285, "epoch": 1.4278092837639151, "grad_norm": 1.765625, "learning_rate": 0.00047991437899095896, "loss": 5.4298, "mean_token_accuracy": 0.1710612565279007, "num_tokens": 31344503.0, "step": 16995 }, { "entropy": 5.647831153869629, "epoch": 1.4282293635790801, "grad_norm": 1.8203125, "learning_rate": 0.00047990200922137105, "loss": 5.4908, "mean_token_accuracy": 0.16613128632307053, "num_tokens": 31354530.0, "step": 17000 }, { "entropy": 5.668387365341187, "epoch": 1.428649443394245, "grad_norm": 1.25, "learning_rate": 0.0004798896358220496, "loss": 5.3034, "mean_token_accuracy": 0.1711835592985153, "num_tokens": 31362761.0, "step": 17005 }, { "entropy": 5.680157566070557, "epoch": 1.4290695232094097, "grad_norm": 1.71875, "learning_rate": 0.0004798772587932137, "loss": 5.3365, "mean_token_accuracy": 0.16386652886867523, "num_tokens": 31372933.0, "step": 17010 }, { "entropy": 5.753627347946167, "epoch": 1.4294896030245747, "grad_norm": 1.546875, "learning_rate": 0.0004798648781350826, "loss": 5.5313, "mean_token_accuracy": 0.16360146701335906, "num_tokens": 31382651.0, "step": 17015 }, { "entropy": 5.657275533676147, "epoch": 1.4299096828397395, "grad_norm": 1.453125, "learning_rate": 0.0004798524938478758, "loss": 5.4663, "mean_token_accuracy": 0.16007311642169952, "num_tokens": 31392272.0, "step": 17020 }, { "entropy": 5.65654354095459, "epoch": 1.4303297626549045, "grad_norm": 1.515625, "learning_rate": 0.0004798401059318124, "loss": 5.3702, "mean_token_accuracy": 0.1685507357120514, "num_tokens": 31400684.0, "step": 17025 }, { "entropy": 5.652564525604248, "epoch": 1.4307498424700693, "grad_norm": 1.515625, "learning_rate": 0.0004798277143871122, "loss": 5.3624, "mean_token_accuracy": 0.17421618700027466, "num_tokens": 31409082.0, "step": 17030 }, { "entropy": 5.608336639404297, "epoch": 1.431169922285234, "grad_norm": 1.484375, "learning_rate": 0.0004798153192139944, "loss": 5.3376, "mean_token_accuracy": 0.1730009838938713, "num_tokens": 31417415.0, "step": 17035 }, { "entropy": 5.675747871398926, "epoch": 1.431590002100399, "grad_norm": 1.8671875, "learning_rate": 0.0004798029204126786, "loss": 5.5005, "mean_token_accuracy": 0.1690568134188652, "num_tokens": 31427510.0, "step": 17040 }, { "entropy": 5.636645269393921, "epoch": 1.432010081915564, "grad_norm": 2.234375, "learning_rate": 0.0004797905179833847, "loss": 5.3358, "mean_token_accuracy": 0.17016119211912156, "num_tokens": 31436187.0, "step": 17045 }, { "entropy": 5.665362167358398, "epoch": 1.432430161730729, "grad_norm": 1.640625, "learning_rate": 0.0004797781119263321, "loss": 5.3552, "mean_token_accuracy": 0.16701350957155228, "num_tokens": 31445179.0, "step": 17050 }, { "entropy": 5.708790063858032, "epoch": 1.4328502415458937, "grad_norm": 2.25, "learning_rate": 0.0004797657022417408, "loss": 5.4449, "mean_token_accuracy": 0.16478859335184098, "num_tokens": 31454434.0, "step": 17055 }, { "entropy": 5.681474256515503, "epoch": 1.4332703213610585, "grad_norm": 1.6015625, "learning_rate": 0.00047975328892983045, "loss": 5.4336, "mean_token_accuracy": 0.16830161362886428, "num_tokens": 31464202.0, "step": 17060 }, { "entropy": 5.605258941650391, "epoch": 1.4336904011762235, "grad_norm": 1.71875, "learning_rate": 0.00047974087199082095, "loss": 5.3277, "mean_token_accuracy": 0.17215612679719924, "num_tokens": 31473158.0, "step": 17065 }, { "entropy": 5.659666633605957, "epoch": 1.4341104809913885, "grad_norm": 1.9453125, "learning_rate": 0.00047972845142493244, "loss": 5.3615, "mean_token_accuracy": 0.16211307048797607, "num_tokens": 31482643.0, "step": 17070 }, { "entropy": 5.625508260726929, "epoch": 1.4345305608065533, "grad_norm": 1.625, "learning_rate": 0.0004797160272323848, "loss": 5.4164, "mean_token_accuracy": 0.1696289971470833, "num_tokens": 31492080.0, "step": 17075 }, { "entropy": 5.67778902053833, "epoch": 1.434950640621718, "grad_norm": 1.71875, "learning_rate": 0.00047970359941339815, "loss": 5.393, "mean_token_accuracy": 0.16916269809007645, "num_tokens": 31501990.0, "step": 17080 }, { "entropy": 5.686505317687988, "epoch": 1.435370720436883, "grad_norm": 1.875, "learning_rate": 0.0004796911679681926, "loss": 5.4451, "mean_token_accuracy": 0.16082692742347718, "num_tokens": 31510548.0, "step": 17085 }, { "entropy": 5.671147012710572, "epoch": 1.4357908002520479, "grad_norm": 1.5390625, "learning_rate": 0.00047967873289698847, "loss": 5.4048, "mean_token_accuracy": 0.16617012917995452, "num_tokens": 31518695.0, "step": 17090 }, { "entropy": 5.754871845245361, "epoch": 1.4362108800672129, "grad_norm": 1.8125, "learning_rate": 0.00047966629420000595, "loss": 5.5615, "mean_token_accuracy": 0.16368394792079927, "num_tokens": 31528021.0, "step": 17095 }, { "entropy": 5.7568220615386965, "epoch": 1.4366309598823777, "grad_norm": 1.6796875, "learning_rate": 0.0004796538518774654, "loss": 5.5284, "mean_token_accuracy": 0.16016919240355493, "num_tokens": 31537786.0, "step": 17100 }, { "entropy": 5.689480447769165, "epoch": 1.4370510396975424, "grad_norm": 1.7421875, "learning_rate": 0.00047964140592958725, "loss": 5.4719, "mean_token_accuracy": 0.16369976103305817, "num_tokens": 31548006.0, "step": 17105 }, { "entropy": 5.710929727554321, "epoch": 1.4374711195127075, "grad_norm": 2.0625, "learning_rate": 0.000479628956356592, "loss": 5.4102, "mean_token_accuracy": 0.16462980061769486, "num_tokens": 31557042.0, "step": 17110 }, { "entropy": 5.743411254882813, "epoch": 1.4378911993278722, "grad_norm": 1.453125, "learning_rate": 0.0004796165031587001, "loss": 5.4294, "mean_token_accuracy": 0.16347581148147583, "num_tokens": 31566661.0, "step": 17115 }, { "entropy": 5.729912614822387, "epoch": 1.4383112791430372, "grad_norm": 2.171875, "learning_rate": 0.0004796040463361323, "loss": 5.3991, "mean_token_accuracy": 0.17761249095201492, "num_tokens": 31575724.0, "step": 17120 }, { "entropy": 5.694164514541626, "epoch": 1.438731358958202, "grad_norm": 1.7265625, "learning_rate": 0.0004795915858891091, "loss": 5.4881, "mean_token_accuracy": 0.17017182260751723, "num_tokens": 31585068.0, "step": 17125 }, { "entropy": 5.751265192031861, "epoch": 1.4391514387733668, "grad_norm": 2.296875, "learning_rate": 0.0004795791218178514, "loss": 5.5202, "mean_token_accuracy": 0.1640462413430214, "num_tokens": 31594629.0, "step": 17130 }, { "entropy": 5.655387258529663, "epoch": 1.4395715185885318, "grad_norm": 1.4609375, "learning_rate": 0.00047956665412257984, "loss": 5.3913, "mean_token_accuracy": 0.16778073012828826, "num_tokens": 31603469.0, "step": 17135 }, { "entropy": 5.647593832015991, "epoch": 1.4399915984036968, "grad_norm": 2.078125, "learning_rate": 0.00047955418280351526, "loss": 5.3461, "mean_token_accuracy": 0.17495327293872834, "num_tokens": 31611674.0, "step": 17140 }, { "entropy": 5.800208330154419, "epoch": 1.4404116782188616, "grad_norm": 1.6796875, "learning_rate": 0.0004795417078608788, "loss": 5.622, "mean_token_accuracy": 0.1545601725578308, "num_tokens": 31621863.0, "step": 17145 }, { "entropy": 5.788693571090699, "epoch": 1.4408317580340264, "grad_norm": 1.78125, "learning_rate": 0.00047952922929489126, "loss": 5.4521, "mean_token_accuracy": 0.1642246201634407, "num_tokens": 31630968.0, "step": 17150 }, { "entropy": 5.64285249710083, "epoch": 1.4412518378491914, "grad_norm": 2.0, "learning_rate": 0.00047951674710577366, "loss": 5.4419, "mean_token_accuracy": 0.16613068878650666, "num_tokens": 31640643.0, "step": 17155 }, { "entropy": 5.565954065322876, "epoch": 1.4416719176643562, "grad_norm": 1.46875, "learning_rate": 0.00047950426129374723, "loss": 5.3347, "mean_token_accuracy": 0.1745448738336563, "num_tokens": 31648941.0, "step": 17160 }, { "entropy": 5.700513076782227, "epoch": 1.4420919974795212, "grad_norm": 1.4921875, "learning_rate": 0.00047949177185903314, "loss": 5.4437, "mean_token_accuracy": 0.1697974219918251, "num_tokens": 31658019.0, "step": 17165 }, { "entropy": 5.769097185134887, "epoch": 1.442512077294686, "grad_norm": 1.6484375, "learning_rate": 0.0004794792788018526, "loss": 5.5065, "mean_token_accuracy": 0.15758488774299623, "num_tokens": 31668050.0, "step": 17170 }, { "entropy": 5.686607456207275, "epoch": 1.4429321571098508, "grad_norm": 1.671875, "learning_rate": 0.000479466782122427, "loss": 5.3551, "mean_token_accuracy": 0.16535573899745942, "num_tokens": 31676727.0, "step": 17175 }, { "entropy": 5.683789539337158, "epoch": 1.4433522369250158, "grad_norm": 1.7109375, "learning_rate": 0.00047945428182097756, "loss": 5.4525, "mean_token_accuracy": 0.1617741197347641, "num_tokens": 31686205.0, "step": 17180 }, { "entropy": 5.693828535079956, "epoch": 1.4437723167401806, "grad_norm": 1.4296875, "learning_rate": 0.00047944177789772583, "loss": 5.4559, "mean_token_accuracy": 0.16552175134420394, "num_tokens": 31695521.0, "step": 17185 }, { "entropy": 5.767838621139527, "epoch": 1.4441923965553456, "grad_norm": 1.7578125, "learning_rate": 0.0004794292703528932, "loss": 5.5186, "mean_token_accuracy": 0.15298188775777816, "num_tokens": 31706606.0, "step": 17190 }, { "entropy": 5.7720374584198, "epoch": 1.4446124763705104, "grad_norm": 1.546875, "learning_rate": 0.00047941675918670133, "loss": 5.5934, "mean_token_accuracy": 0.15864021703600883, "num_tokens": 31716881.0, "step": 17195 }, { "entropy": 5.724113607406617, "epoch": 1.4450325561856752, "grad_norm": 1.609375, "learning_rate": 0.0004794042443993719, "loss": 5.3791, "mean_token_accuracy": 0.16267655789852142, "num_tokens": 31725878.0, "step": 17200 }, { "entropy": 5.657223463058472, "epoch": 1.4454526360008402, "grad_norm": 1.46875, "learning_rate": 0.0004793917259911265, "loss": 5.4676, "mean_token_accuracy": 0.16497932225465775, "num_tokens": 31735033.0, "step": 17205 }, { "entropy": 5.5900531768798825, "epoch": 1.445872715816005, "grad_norm": 1.40625, "learning_rate": 0.0004793792039621869, "loss": 5.4147, "mean_token_accuracy": 0.171473328769207, "num_tokens": 31744887.0, "step": 17210 }, { "entropy": 5.733187532424926, "epoch": 1.44629279563117, "grad_norm": 1.3515625, "learning_rate": 0.00047936667831277504, "loss": 5.4767, "mean_token_accuracy": 0.15901170670986176, "num_tokens": 31754137.0, "step": 17215 }, { "entropy": 5.690765762329102, "epoch": 1.4467128754463348, "grad_norm": 1.4375, "learning_rate": 0.0004793541490431126, "loss": 5.2947, "mean_token_accuracy": 0.17318409383296968, "num_tokens": 31763394.0, "step": 17220 }, { "entropy": 5.671582269668579, "epoch": 1.4471329552614998, "grad_norm": 1.421875, "learning_rate": 0.0004793416161534216, "loss": 5.4326, "mean_token_accuracy": 0.1657954916357994, "num_tokens": 31771905.0, "step": 17225 }, { "entropy": 5.5294126033782955, "epoch": 1.4475530350766646, "grad_norm": 2.640625, "learning_rate": 0.00047932907964392423, "loss": 5.2655, "mean_token_accuracy": 0.1774240866303444, "num_tokens": 31780788.0, "step": 17230 }, { "entropy": 5.70396466255188, "epoch": 1.4479731148918296, "grad_norm": 2.203125, "learning_rate": 0.00047931653951484234, "loss": 5.4452, "mean_token_accuracy": 0.16516012102365493, "num_tokens": 31790198.0, "step": 17235 }, { "entropy": 5.712733507156372, "epoch": 1.4483931947069943, "grad_norm": 1.8828125, "learning_rate": 0.00047930399576639815, "loss": 5.4324, "mean_token_accuracy": 0.16861406937241555, "num_tokens": 31799396.0, "step": 17240 }, { "entropy": 5.621314477920532, "epoch": 1.4488132745221591, "grad_norm": 1.4140625, "learning_rate": 0.00047929144839881386, "loss": 5.2884, "mean_token_accuracy": 0.18074664771556853, "num_tokens": 31807680.0, "step": 17245 }, { "entropy": 5.722569370269776, "epoch": 1.4492333543373241, "grad_norm": 1.3515625, "learning_rate": 0.00047927889741231186, "loss": 5.4295, "mean_token_accuracy": 0.16408731043338776, "num_tokens": 31817406.0, "step": 17250 }, { "entropy": 5.660385704040527, "epoch": 1.449653434152489, "grad_norm": 1.3828125, "learning_rate": 0.00047926634280711435, "loss": 5.4135, "mean_token_accuracy": 0.16933335810899736, "num_tokens": 31826518.0, "step": 17255 }, { "entropy": 5.703531122207641, "epoch": 1.450073513967654, "grad_norm": 1.3125, "learning_rate": 0.0004792537845834437, "loss": 5.4947, "mean_token_accuracy": 0.15975457429885864, "num_tokens": 31835538.0, "step": 17260 }, { "entropy": 5.664773654937744, "epoch": 1.4504935937828187, "grad_norm": 1.6015625, "learning_rate": 0.0004792412227415224, "loss": 5.3497, "mean_token_accuracy": 0.17190734297037125, "num_tokens": 31844899.0, "step": 17265 }, { "entropy": 5.627852296829223, "epoch": 1.4509136735979835, "grad_norm": 1.453125, "learning_rate": 0.00047922865728157314, "loss": 5.3981, "mean_token_accuracy": 0.1743706777691841, "num_tokens": 31854322.0, "step": 17270 }, { "entropy": 5.6161435604095455, "epoch": 1.4513337534131485, "grad_norm": 1.5234375, "learning_rate": 0.0004792160882038183, "loss": 5.3679, "mean_token_accuracy": 0.16462661772966386, "num_tokens": 31863657.0, "step": 17275 }, { "entropy": 5.655448341369629, "epoch": 1.4517538332283133, "grad_norm": 1.5625, "learning_rate": 0.0004792035155084806, "loss": 5.3615, "mean_token_accuracy": 0.1683821603655815, "num_tokens": 31873468.0, "step": 17280 }, { "entropy": 5.637265586853028, "epoch": 1.4521739130434783, "grad_norm": 1.421875, "learning_rate": 0.00047919093919578283, "loss": 5.4728, "mean_token_accuracy": 0.16719345450401307, "num_tokens": 31882391.0, "step": 17285 }, { "entropy": 5.6774333953857425, "epoch": 1.452593992858643, "grad_norm": 1.7734375, "learning_rate": 0.0004791783592659476, "loss": 5.4566, "mean_token_accuracy": 0.16625383794307708, "num_tokens": 31891370.0, "step": 17290 }, { "entropy": 5.641020917892456, "epoch": 1.4530140726738079, "grad_norm": 1.3359375, "learning_rate": 0.000479165775719198, "loss": 5.3919, "mean_token_accuracy": 0.169977006316185, "num_tokens": 31900688.0, "step": 17295 }, { "entropy": 5.628441858291626, "epoch": 1.453434152488973, "grad_norm": 1.8359375, "learning_rate": 0.00047915318855575674, "loss": 5.4264, "mean_token_accuracy": 0.1753471314907074, "num_tokens": 31909359.0, "step": 17300 }, { "entropy": 5.650968837738037, "epoch": 1.453854232304138, "grad_norm": 1.640625, "learning_rate": 0.00047914059777584686, "loss": 5.3947, "mean_token_accuracy": 0.16623954772949218, "num_tokens": 31918529.0, "step": 17305 }, { "entropy": 5.679246520996093, "epoch": 1.4542743121193027, "grad_norm": 1.609375, "learning_rate": 0.00047912800337969144, "loss": 5.4662, "mean_token_accuracy": 0.16294726431369783, "num_tokens": 31928310.0, "step": 17310 }, { "entropy": 5.64129490852356, "epoch": 1.4546943919344675, "grad_norm": 1.4375, "learning_rate": 0.00047911540536751355, "loss": 5.3744, "mean_token_accuracy": 0.17034156024456024, "num_tokens": 31937077.0, "step": 17315 }, { "entropy": 5.695573711395264, "epoch": 1.4551144717496325, "grad_norm": 1.4765625, "learning_rate": 0.0004791028037395363, "loss": 5.4298, "mean_token_accuracy": 0.16439317166805267, "num_tokens": 31946023.0, "step": 17320 }, { "entropy": 5.581758499145508, "epoch": 1.4555345515647973, "grad_norm": 1.515625, "learning_rate": 0.00047909019849598305, "loss": 5.2733, "mean_token_accuracy": 0.17995132952928544, "num_tokens": 31954741.0, "step": 17325 }, { "entropy": 5.651013660430908, "epoch": 1.4559546313799623, "grad_norm": 1.75, "learning_rate": 0.00047907758963707696, "loss": 5.3939, "mean_token_accuracy": 0.167492838203907, "num_tokens": 31963516.0, "step": 17330 }, { "entropy": 5.683594417572022, "epoch": 1.456374711195127, "grad_norm": 1.59375, "learning_rate": 0.00047906497716304153, "loss": 5.4132, "mean_token_accuracy": 0.17192533612251282, "num_tokens": 31971917.0, "step": 17335 }, { "entropy": 5.674582862854004, "epoch": 1.4567947910102919, "grad_norm": 1.5703125, "learning_rate": 0.0004790523610741001, "loss": 5.4584, "mean_token_accuracy": 0.16307643949985504, "num_tokens": 31980718.0, "step": 17340 }, { "entropy": 5.716789674758911, "epoch": 1.4572148708254569, "grad_norm": 1.53125, "learning_rate": 0.00047903974137047614, "loss": 5.4001, "mean_token_accuracy": 0.16782204508781434, "num_tokens": 31988664.0, "step": 17345 }, { "entropy": 5.757473373413086, "epoch": 1.4576349506406217, "grad_norm": 1.640625, "learning_rate": 0.00047902711805239325, "loss": 5.4791, "mean_token_accuracy": 0.1642825037240982, "num_tokens": 31998415.0, "step": 17350 }, { "entropy": 5.7503297328948975, "epoch": 1.4580550304557867, "grad_norm": 1.421875, "learning_rate": 0.00047901449112007494, "loss": 5.4908, "mean_token_accuracy": 0.16542867422103882, "num_tokens": 32007915.0, "step": 17355 }, { "entropy": 5.642038631439209, "epoch": 1.4584751102709514, "grad_norm": 1.4921875, "learning_rate": 0.00047900186057374514, "loss": 5.4186, "mean_token_accuracy": 0.16974506080150603, "num_tokens": 32016582.0, "step": 17360 }, { "entropy": 5.568690633773803, "epoch": 1.4588951900861162, "grad_norm": 1.75, "learning_rate": 0.00047898922641362724, "loss": 5.4113, "mean_token_accuracy": 0.16496356278657914, "num_tokens": 32026008.0, "step": 17365 }, { "entropy": 5.723394393920898, "epoch": 1.4593152699012812, "grad_norm": 1.8671875, "learning_rate": 0.0004789765886399453, "loss": 5.4592, "mean_token_accuracy": 0.16515185236930846, "num_tokens": 32034554.0, "step": 17370 }, { "entropy": 5.817387819290161, "epoch": 1.4597353497164463, "grad_norm": 1.71875, "learning_rate": 0.00047896394725292313, "loss": 5.4701, "mean_token_accuracy": 0.17239008098840714, "num_tokens": 32044003.0, "step": 17375 }, { "entropy": 5.650395154953003, "epoch": 1.460155429531611, "grad_norm": 1.8203125, "learning_rate": 0.00047895130225278473, "loss": 5.4281, "mean_token_accuracy": 0.1707577034831047, "num_tokens": 32053753.0, "step": 17380 }, { "entropy": 5.639893341064453, "epoch": 1.4605755093467758, "grad_norm": 1.3359375, "learning_rate": 0.0004789386536397539, "loss": 5.4314, "mean_token_accuracy": 0.1669726625084877, "num_tokens": 32062459.0, "step": 17385 }, { "entropy": 5.7756260395050045, "epoch": 1.4609955891619408, "grad_norm": 1.765625, "learning_rate": 0.0004789260014140549, "loss": 5.5241, "mean_token_accuracy": 0.1664410337805748, "num_tokens": 32072544.0, "step": 17390 }, { "entropy": 5.75843915939331, "epoch": 1.4614156689771056, "grad_norm": 1.4765625, "learning_rate": 0.00047891334557591177, "loss": 5.4623, "mean_token_accuracy": 0.1596985414624214, "num_tokens": 32082015.0, "step": 17395 }, { "entropy": 5.644048738479614, "epoch": 1.4618357487922706, "grad_norm": 1.6015625, "learning_rate": 0.0004789006861255488, "loss": 5.3924, "mean_token_accuracy": 0.1662799596786499, "num_tokens": 32091622.0, "step": 17400 }, { "entropy": 5.709836626052857, "epoch": 1.4622558286074354, "grad_norm": 1.84375, "learning_rate": 0.0004788880230631901, "loss": 5.5673, "mean_token_accuracy": 0.15625317990779877, "num_tokens": 32102716.0, "step": 17405 }, { "entropy": 5.7003098487854, "epoch": 1.4626759084226002, "grad_norm": 1.3203125, "learning_rate": 0.00047887535638906005, "loss": 5.3208, "mean_token_accuracy": 0.1776137113571167, "num_tokens": 32111051.0, "step": 17410 }, { "entropy": 5.586649465560913, "epoch": 1.4630959882377652, "grad_norm": 1.6171875, "learning_rate": 0.000478862686103383, "loss": 5.3372, "mean_token_accuracy": 0.17761677205562593, "num_tokens": 32119781.0, "step": 17415 }, { "entropy": 5.712557697296143, "epoch": 1.46351606805293, "grad_norm": 1.6015625, "learning_rate": 0.00047885001220638354, "loss": 5.435, "mean_token_accuracy": 0.16851735562086106, "num_tokens": 32128849.0, "step": 17420 }, { "entropy": 5.7341227531433105, "epoch": 1.463936147868095, "grad_norm": 1.6328125, "learning_rate": 0.00047883733469828604, "loss": 5.4624, "mean_token_accuracy": 0.1703486517071724, "num_tokens": 32138046.0, "step": 17425 }, { "entropy": 5.8417564868927006, "epoch": 1.4643562276832598, "grad_norm": 1.5, "learning_rate": 0.00047882465357931516, "loss": 5.5281, "mean_token_accuracy": 0.161974436044693, "num_tokens": 32147994.0, "step": 17430 }, { "entropy": 5.779322147369385, "epoch": 1.4647763074984246, "grad_norm": 1.4453125, "learning_rate": 0.0004788119688496954, "loss": 5.4589, "mean_token_accuracy": 0.16861263811588287, "num_tokens": 32156835.0, "step": 17435 }, { "entropy": 5.6862884044647215, "epoch": 1.4651963873135896, "grad_norm": 1.4375, "learning_rate": 0.0004787992805096516, "loss": 5.3936, "mean_token_accuracy": 0.17358130365610122, "num_tokens": 32166751.0, "step": 17440 }, { "entropy": 5.713692283630371, "epoch": 1.4656164671287546, "grad_norm": 1.7890625, "learning_rate": 0.00047878658855940855, "loss": 5.5068, "mean_token_accuracy": 0.16271049082279204, "num_tokens": 32175705.0, "step": 17445 }, { "entropy": 5.826437711715698, "epoch": 1.4660365469439194, "grad_norm": 1.546875, "learning_rate": 0.0004787738929991909, "loss": 5.5591, "mean_token_accuracy": 0.15781314745545388, "num_tokens": 32185404.0, "step": 17450 }, { "entropy": 5.72130651473999, "epoch": 1.4664566267590842, "grad_norm": 1.53125, "learning_rate": 0.00047876119382922374, "loss": 5.4299, "mean_token_accuracy": 0.16798323690891265, "num_tokens": 32194054.0, "step": 17455 }, { "entropy": 5.730863285064697, "epoch": 1.4668767065742492, "grad_norm": 1.8125, "learning_rate": 0.00047874849104973194, "loss": 5.4984, "mean_token_accuracy": 0.15487258285284042, "num_tokens": 32204080.0, "step": 17460 }, { "entropy": 5.704109954833984, "epoch": 1.467296786389414, "grad_norm": 1.40625, "learning_rate": 0.00047873578466094054, "loss": 5.4125, "mean_token_accuracy": 0.161499485373497, "num_tokens": 32213279.0, "step": 17465 }, { "entropy": 5.664938116073609, "epoch": 1.467716866204579, "grad_norm": 1.90625, "learning_rate": 0.0004787230746630746, "loss": 5.4104, "mean_token_accuracy": 0.17155456244945527, "num_tokens": 32221668.0, "step": 17470 }, { "entropy": 5.695741128921509, "epoch": 1.4681369460197438, "grad_norm": 1.7578125, "learning_rate": 0.0004787103610563593, "loss": 5.3415, "mean_token_accuracy": 0.17343094050884247, "num_tokens": 32229683.0, "step": 17475 }, { "entropy": 5.691019868850708, "epoch": 1.4685570258349085, "grad_norm": 1.5546875, "learning_rate": 0.00047869764384101993, "loss": 5.4058, "mean_token_accuracy": 0.16649516075849533, "num_tokens": 32238948.0, "step": 17480 }, { "entropy": 5.6545178413391115, "epoch": 1.4689771056500736, "grad_norm": 1.46875, "learning_rate": 0.00047868492301728164, "loss": 5.4404, "mean_token_accuracy": 0.16138940006494523, "num_tokens": 32248079.0, "step": 17485 }, { "entropy": 5.605484294891357, "epoch": 1.4693971854652383, "grad_norm": 1.9140625, "learning_rate": 0.00047867219858536975, "loss": 5.2716, "mean_token_accuracy": 0.1824018180370331, "num_tokens": 32256413.0, "step": 17490 }, { "entropy": 5.6888096809387205, "epoch": 1.4698172652804034, "grad_norm": 1.359375, "learning_rate": 0.0004786594705455098, "loss": 5.4408, "mean_token_accuracy": 0.16207701563835145, "num_tokens": 32265954.0, "step": 17495 }, { "entropy": 5.676724147796631, "epoch": 1.4702373450955681, "grad_norm": 1.3984375, "learning_rate": 0.0004786467388979272, "loss": 5.349, "mean_token_accuracy": 0.171977636218071, "num_tokens": 32273817.0, "step": 17500 }, { "entropy": 5.605041551589966, "epoch": 1.470657424910733, "grad_norm": 1.3984375, "learning_rate": 0.00047863400364284744, "loss": 5.4111, "mean_token_accuracy": 0.1661633461713791, "num_tokens": 32283025.0, "step": 17505 }, { "entropy": 5.665054225921631, "epoch": 1.471077504725898, "grad_norm": 2.140625, "learning_rate": 0.00047862126478049623, "loss": 5.3882, "mean_token_accuracy": 0.16659335941076278, "num_tokens": 32292321.0, "step": 17510 }, { "entropy": 5.784007930755616, "epoch": 1.4714975845410627, "grad_norm": 1.578125, "learning_rate": 0.00047860852231109915, "loss": 5.4876, "mean_token_accuracy": 0.15348291248083115, "num_tokens": 32302203.0, "step": 17515 }, { "entropy": 5.56384539604187, "epoch": 1.4719176643562277, "grad_norm": 1.3828125, "learning_rate": 0.0004785957762348819, "loss": 5.3156, "mean_token_accuracy": 0.16967657655477525, "num_tokens": 32310893.0, "step": 17520 }, { "entropy": 5.559794855117798, "epoch": 1.4723377441713925, "grad_norm": 1.515625, "learning_rate": 0.0004785830265520703, "loss": 5.3744, "mean_token_accuracy": 0.16862395852804185, "num_tokens": 32320320.0, "step": 17525 }, { "entropy": 5.607880735397339, "epoch": 1.4727578239865575, "grad_norm": 1.578125, "learning_rate": 0.00047857027326289023, "loss": 5.2844, "mean_token_accuracy": 0.17600037455558776, "num_tokens": 32329196.0, "step": 17530 }, { "entropy": 5.6827874183654785, "epoch": 1.4731779038017223, "grad_norm": 1.515625, "learning_rate": 0.00047855751636756763, "loss": 5.4258, "mean_token_accuracy": 0.16296974420547486, "num_tokens": 32338529.0, "step": 17535 }, { "entropy": 5.707752323150634, "epoch": 1.4735979836168873, "grad_norm": 1.40625, "learning_rate": 0.0004785447558663284, "loss": 5.418, "mean_token_accuracy": 0.1722614958882332, "num_tokens": 32347114.0, "step": 17540 }, { "entropy": 5.75955114364624, "epoch": 1.474018063432052, "grad_norm": 1.3984375, "learning_rate": 0.00047853199175939865, "loss": 5.6021, "mean_token_accuracy": 0.1608388304710388, "num_tokens": 32356765.0, "step": 17545 }, { "entropy": 5.7798620700836185, "epoch": 1.474438143247217, "grad_norm": 1.5859375, "learning_rate": 0.0004785192240470045, "loss": 5.5294, "mean_token_accuracy": 0.16074298024177552, "num_tokens": 32366175.0, "step": 17550 }, { "entropy": 5.649854564666748, "epoch": 1.474858223062382, "grad_norm": 1.2734375, "learning_rate": 0.000478506452729372, "loss": 5.315, "mean_token_accuracy": 0.1758470743894577, "num_tokens": 32375063.0, "step": 17555 }, { "entropy": 5.6665150165557865, "epoch": 1.4752783028775467, "grad_norm": 1.4296875, "learning_rate": 0.00047849367780672755, "loss": 5.4086, "mean_token_accuracy": 0.1674113929271698, "num_tokens": 32384596.0, "step": 17560 }, { "entropy": 5.636862468719483, "epoch": 1.4756983826927117, "grad_norm": 1.4140625, "learning_rate": 0.0004784808992792974, "loss": 5.3593, "mean_token_accuracy": 0.168624584376812, "num_tokens": 32393489.0, "step": 17565 }, { "entropy": 5.677070379257202, "epoch": 1.4761184625078765, "grad_norm": 1.34375, "learning_rate": 0.0004784681171473079, "loss": 5.3487, "mean_token_accuracy": 0.1728109061717987, "num_tokens": 32402192.0, "step": 17570 }, { "entropy": 5.739632654190063, "epoch": 1.4765385423230413, "grad_norm": 1.71875, "learning_rate": 0.00047845533141098543, "loss": 5.4413, "mean_token_accuracy": 0.15874089151620865, "num_tokens": 32411317.0, "step": 17575 }, { "entropy": 5.708612537384033, "epoch": 1.4769586221382063, "grad_norm": 1.4453125, "learning_rate": 0.0004784425420705565, "loss": 5.499, "mean_token_accuracy": 0.1618265450000763, "num_tokens": 32420308.0, "step": 17580 }, { "entropy": 5.618194961547852, "epoch": 1.477378701953371, "grad_norm": 1.265625, "learning_rate": 0.0004784297491262477, "loss": 5.4258, "mean_token_accuracy": 0.16643496304750444, "num_tokens": 32429532.0, "step": 17585 }, { "entropy": 5.682935762405395, "epoch": 1.477798781768536, "grad_norm": 1.640625, "learning_rate": 0.0004784169525782858, "loss": 5.4164, "mean_token_accuracy": 0.16577064841985703, "num_tokens": 32439382.0, "step": 17590 }, { "entropy": 5.7163759708404545, "epoch": 1.4782188615837009, "grad_norm": 1.625, "learning_rate": 0.0004784041524268971, "loss": 5.4034, "mean_token_accuracy": 0.17389584332704544, "num_tokens": 32447893.0, "step": 17595 }, { "entropy": 5.629817867279053, "epoch": 1.4786389413988656, "grad_norm": 1.875, "learning_rate": 0.00047839134867230874, "loss": 5.4084, "mean_token_accuracy": 0.1654166266322136, "num_tokens": 32457770.0, "step": 17600 }, { "entropy": 5.729843044281006, "epoch": 1.4790590212140307, "grad_norm": 1.5625, "learning_rate": 0.00047837854131474726, "loss": 5.5139, "mean_token_accuracy": 0.16561387926340104, "num_tokens": 32467247.0, "step": 17605 }, { "entropy": 5.7485791683197025, "epoch": 1.4794791010291957, "grad_norm": 1.28125, "learning_rate": 0.00047836573035443976, "loss": 5.4893, "mean_token_accuracy": 0.16393031179904938, "num_tokens": 32477453.0, "step": 17610 }, { "entropy": 5.762020063400269, "epoch": 1.4798991808443605, "grad_norm": 1.5234375, "learning_rate": 0.00047835291579161293, "loss": 5.4549, "mean_token_accuracy": 0.17096612453460694, "num_tokens": 32486278.0, "step": 17615 }, { "entropy": 5.62855486869812, "epoch": 1.4803192606595252, "grad_norm": 1.90625, "learning_rate": 0.0004783400976264941, "loss": 5.3828, "mean_token_accuracy": 0.17316290289163588, "num_tokens": 32495523.0, "step": 17620 }, { "entropy": 5.669747161865234, "epoch": 1.4807393404746902, "grad_norm": 1.6484375, "learning_rate": 0.00047832727585930997, "loss": 5.419, "mean_token_accuracy": 0.16708965897560119, "num_tokens": 32504952.0, "step": 17625 }, { "entropy": 5.667424058914184, "epoch": 1.481159420289855, "grad_norm": 1.3828125, "learning_rate": 0.0004783144504902879, "loss": 5.3972, "mean_token_accuracy": 0.16518824696540832, "num_tokens": 32515620.0, "step": 17630 }, { "entropy": 5.632094812393189, "epoch": 1.48157950010502, "grad_norm": 1.359375, "learning_rate": 0.000478301621519655, "loss": 5.3601, "mean_token_accuracy": 0.17287708073854446, "num_tokens": 32524549.0, "step": 17635 }, { "entropy": 5.663208436965943, "epoch": 1.4819995799201848, "grad_norm": 1.359375, "learning_rate": 0.0004782887889476386, "loss": 5.2658, "mean_token_accuracy": 0.17909094095230102, "num_tokens": 32533043.0, "step": 17640 }, { "entropy": 5.639974546432495, "epoch": 1.4824196597353496, "grad_norm": 2.0625, "learning_rate": 0.000478275952774466, "loss": 5.3707, "mean_token_accuracy": 0.1682120993733406, "num_tokens": 32541679.0, "step": 17645 }, { "entropy": 5.659855556488037, "epoch": 1.4828397395505146, "grad_norm": 1.46875, "learning_rate": 0.0004782631130003646, "loss": 5.4875, "mean_token_accuracy": 0.17222274392843245, "num_tokens": 32550922.0, "step": 17650 }, { "entropy": 5.73547625541687, "epoch": 1.4832598193656794, "grad_norm": 1.296875, "learning_rate": 0.0004782502696255617, "loss": 5.4881, "mean_token_accuracy": 0.16443574875593187, "num_tokens": 32560063.0, "step": 17655 }, { "entropy": 5.655674934387207, "epoch": 1.4836798991808444, "grad_norm": 2.109375, "learning_rate": 0.00047823742265028495, "loss": 5.3575, "mean_token_accuracy": 0.16813185214996337, "num_tokens": 32569476.0, "step": 17660 }, { "entropy": 5.677836179733276, "epoch": 1.4840999789960092, "grad_norm": 2.890625, "learning_rate": 0.000478224572074762, "loss": 5.4225, "mean_token_accuracy": 0.17570533752441406, "num_tokens": 32578552.0, "step": 17665 }, { "entropy": 5.661578607559204, "epoch": 1.484520058811174, "grad_norm": 1.2578125, "learning_rate": 0.0004782117178992203, "loss": 5.4238, "mean_token_accuracy": 0.16717635840177536, "num_tokens": 32589074.0, "step": 17670 }, { "entropy": 5.676818895339966, "epoch": 1.484940138626339, "grad_norm": 1.546875, "learning_rate": 0.0004781988601238878, "loss": 5.4446, "mean_token_accuracy": 0.16712375432252885, "num_tokens": 32599288.0, "step": 17675 }, { "entropy": 5.795759963989258, "epoch": 1.485360218441504, "grad_norm": 1.4296875, "learning_rate": 0.000478185998748992, "loss": 5.4935, "mean_token_accuracy": 0.16263023763895035, "num_tokens": 32609430.0, "step": 17680 }, { "entropy": 5.638738298416138, "epoch": 1.4857802982566688, "grad_norm": 1.3984375, "learning_rate": 0.00047817313377476083, "loss": 5.3467, "mean_token_accuracy": 0.16966764032840728, "num_tokens": 32617763.0, "step": 17685 }, { "entropy": 5.5954235076904295, "epoch": 1.4862003780718336, "grad_norm": 1.671875, "learning_rate": 0.00047816026520142234, "loss": 5.4342, "mean_token_accuracy": 0.16032783836126327, "num_tokens": 32627465.0, "step": 17690 }, { "entropy": 5.728960990905762, "epoch": 1.4866204578869986, "grad_norm": 1.53125, "learning_rate": 0.0004781473930292043, "loss": 5.3391, "mean_token_accuracy": 0.17672401666641235, "num_tokens": 32635984.0, "step": 17695 }, { "entropy": 5.587149381637573, "epoch": 1.4870405377021634, "grad_norm": 1.328125, "learning_rate": 0.0004781345172583348, "loss": 5.2784, "mean_token_accuracy": 0.17341048419475555, "num_tokens": 32644346.0, "step": 17700 }, { "entropy": 5.616852807998657, "epoch": 1.4874606175173284, "grad_norm": 1.5546875, "learning_rate": 0.00047812163788904196, "loss": 5.4103, "mean_token_accuracy": 0.16415098160505295, "num_tokens": 32654118.0, "step": 17705 }, { "entropy": 5.749323081970215, "epoch": 1.4878806973324932, "grad_norm": 1.375, "learning_rate": 0.00047810875492155386, "loss": 5.4415, "mean_token_accuracy": 0.16800331622362136, "num_tokens": 32664258.0, "step": 17710 }, { "entropy": 5.688397216796875, "epoch": 1.488300777147658, "grad_norm": 1.359375, "learning_rate": 0.0004780958683560987, "loss": 5.4765, "mean_token_accuracy": 0.16039148345589638, "num_tokens": 32673672.0, "step": 17715 }, { "entropy": 5.712081003189087, "epoch": 1.488720856962823, "grad_norm": 1.578125, "learning_rate": 0.0004780829781929049, "loss": 5.4578, "mean_token_accuracy": 0.15657913982868193, "num_tokens": 32682901.0, "step": 17720 }, { "entropy": 5.735140562057495, "epoch": 1.4891409367779878, "grad_norm": 1.4765625, "learning_rate": 0.0004780700844322007, "loss": 5.4014, "mean_token_accuracy": 0.17273005843162537, "num_tokens": 32691384.0, "step": 17725 }, { "entropy": 5.635052490234375, "epoch": 1.4895610165931528, "grad_norm": 1.890625, "learning_rate": 0.00047805718707421446, "loss": 5.4357, "mean_token_accuracy": 0.16961687952280044, "num_tokens": 32700758.0, "step": 17730 }, { "entropy": 5.759167098999024, "epoch": 1.4899810964083176, "grad_norm": 2.4375, "learning_rate": 0.00047804428611917475, "loss": 5.5407, "mean_token_accuracy": 0.16442745178937912, "num_tokens": 32709676.0, "step": 17735 }, { "entropy": 5.7738946914672855, "epoch": 1.4904011762234823, "grad_norm": 1.3671875, "learning_rate": 0.00047803138156731, "loss": 5.4367, "mean_token_accuracy": 0.1609507068991661, "num_tokens": 32718102.0, "step": 17740 }, { "entropy": 5.749574279785156, "epoch": 1.4908212560386473, "grad_norm": 1.4921875, "learning_rate": 0.00047801847341884897, "loss": 5.4238, "mean_token_accuracy": 0.16728150397539138, "num_tokens": 32727356.0, "step": 17745 }, { "entropy": 5.610603475570679, "epoch": 1.4912413358538124, "grad_norm": 1.5234375, "learning_rate": 0.0004780055616740202, "loss": 5.4164, "mean_token_accuracy": 0.16602010279893875, "num_tokens": 32736605.0, "step": 17750 }, { "entropy": 5.626084041595459, "epoch": 1.4916614156689771, "grad_norm": 1.375, "learning_rate": 0.0004779926463330524, "loss": 5.3607, "mean_token_accuracy": 0.17045399099588393, "num_tokens": 32745573.0, "step": 17755 }, { "entropy": 5.6878427028656, "epoch": 1.492081495484142, "grad_norm": 1.5859375, "learning_rate": 0.0004779797273961744, "loss": 5.414, "mean_token_accuracy": 0.17474236190319062, "num_tokens": 32755695.0, "step": 17760 }, { "entropy": 5.6625172138214115, "epoch": 1.492501575299307, "grad_norm": 1.4296875, "learning_rate": 0.0004779668048636151, "loss": 5.3292, "mean_token_accuracy": 0.1730514347553253, "num_tokens": 32763570.0, "step": 17765 }, { "entropy": 5.612107133865356, "epoch": 1.4929216551144717, "grad_norm": 1.40625, "learning_rate": 0.00047795387873560336, "loss": 5.4331, "mean_token_accuracy": 0.1678207114338875, "num_tokens": 32772006.0, "step": 17770 }, { "entropy": 5.7148637771606445, "epoch": 1.4933417349296367, "grad_norm": 1.515625, "learning_rate": 0.0004779409490123681, "loss": 5.3881, "mean_token_accuracy": 0.16234676241874696, "num_tokens": 32781080.0, "step": 17775 }, { "entropy": 5.635086727142334, "epoch": 1.4937618147448015, "grad_norm": 1.40625, "learning_rate": 0.0004779280156941384, "loss": 5.3503, "mean_token_accuracy": 0.16645084470510482, "num_tokens": 32789880.0, "step": 17780 }, { "entropy": 5.69928207397461, "epoch": 1.4941818945599663, "grad_norm": 1.4375, "learning_rate": 0.00047791507878114354, "loss": 5.3909, "mean_token_accuracy": 0.16705690920352936, "num_tokens": 32799222.0, "step": 17785 }, { "entropy": 5.626346826553345, "epoch": 1.4946019743751313, "grad_norm": 1.375, "learning_rate": 0.0004779021382736124, "loss": 5.387, "mean_token_accuracy": 0.16727182418107986, "num_tokens": 32808945.0, "step": 17790 }, { "entropy": 5.611076211929321, "epoch": 1.495022054190296, "grad_norm": 1.4296875, "learning_rate": 0.0004778891941717745, "loss": 5.3118, "mean_token_accuracy": 0.18029792606830597, "num_tokens": 32818386.0, "step": 17795 }, { "entropy": 5.5952142715454105, "epoch": 1.495442134005461, "grad_norm": 1.5546875, "learning_rate": 0.0004778762464758589, "loss": 5.3771, "mean_token_accuracy": 0.16038608253002168, "num_tokens": 32828364.0, "step": 17800 }, { "entropy": 5.779965257644653, "epoch": 1.495862213820626, "grad_norm": 1.8359375, "learning_rate": 0.00047786329518609505, "loss": 5.5137, "mean_token_accuracy": 0.16410740464925766, "num_tokens": 32837399.0, "step": 17805 }, { "entropy": 5.671717548370362, "epoch": 1.4962822936357907, "grad_norm": 1.3671875, "learning_rate": 0.00047785034030271243, "loss": 5.3413, "mean_token_accuracy": 0.1711513638496399, "num_tokens": 32846111.0, "step": 17810 }, { "entropy": 5.6222676753997805, "epoch": 1.4967023734509557, "grad_norm": 1.4375, "learning_rate": 0.0004778373818259404, "loss": 5.2429, "mean_token_accuracy": 0.1814047634601593, "num_tokens": 32855839.0, "step": 17815 }, { "entropy": 5.71916937828064, "epoch": 1.4971224532661207, "grad_norm": 1.625, "learning_rate": 0.00047782441975600866, "loss": 5.5456, "mean_token_accuracy": 0.16741917729377748, "num_tokens": 32865946.0, "step": 17820 }, { "entropy": 5.748912906646728, "epoch": 1.4975425330812855, "grad_norm": 1.53125, "learning_rate": 0.0004778114540931468, "loss": 5.5114, "mean_token_accuracy": 0.16409117877483367, "num_tokens": 32875310.0, "step": 17825 }, { "entropy": 5.702952241897583, "epoch": 1.4979626128964503, "grad_norm": 2.515625, "learning_rate": 0.00047779848483758445, "loss": 5.4483, "mean_token_accuracy": 0.16831188052892684, "num_tokens": 32885315.0, "step": 17830 }, { "entropy": 5.684667110443115, "epoch": 1.4983826927116153, "grad_norm": 1.5, "learning_rate": 0.00047778551198955133, "loss": 5.4043, "mean_token_accuracy": 0.1707111567258835, "num_tokens": 32894055.0, "step": 17835 }, { "entropy": 5.64805235862732, "epoch": 1.49880277252678, "grad_norm": 1.46875, "learning_rate": 0.0004777725355492773, "loss": 5.4056, "mean_token_accuracy": 0.17348893135786056, "num_tokens": 32903030.0, "step": 17840 }, { "entropy": 5.665900611877442, "epoch": 1.499222852341945, "grad_norm": 1.484375, "learning_rate": 0.0004777595555169922, "loss": 5.3429, "mean_token_accuracy": 0.17314210832118987, "num_tokens": 32911562.0, "step": 17845 }, { "entropy": 5.706243324279785, "epoch": 1.4996429321571099, "grad_norm": 1.4453125, "learning_rate": 0.000477746571892926, "loss": 5.464, "mean_token_accuracy": 0.16257281601428986, "num_tokens": 32920376.0, "step": 17850 }, { "entropy": 5.663986158370972, "epoch": 1.5000630119722747, "grad_norm": 1.546875, "learning_rate": 0.0004777335846773087, "loss": 5.3903, "mean_token_accuracy": 0.16299790441989898, "num_tokens": 32929374.0, "step": 17855 }, { "entropy": 5.528833436965942, "epoch": 1.5004830917874397, "grad_norm": 1.625, "learning_rate": 0.00047772059387037025, "loss": 5.345, "mean_token_accuracy": 0.16556637734174728, "num_tokens": 32938695.0, "step": 17860 }, { "entropy": 5.671306324005127, "epoch": 1.5009031716026044, "grad_norm": 1.515625, "learning_rate": 0.0004777075994723409, "loss": 5.4045, "mean_token_accuracy": 0.1704086810350418, "num_tokens": 32947725.0, "step": 17865 }, { "entropy": 5.726226949691773, "epoch": 1.5013232514177695, "grad_norm": 1.453125, "learning_rate": 0.00047769460148345085, "loss": 5.4181, "mean_token_accuracy": 0.16411009281873704, "num_tokens": 32957017.0, "step": 17870 }, { "entropy": 5.675952672958374, "epoch": 1.5017433312329342, "grad_norm": 1.40625, "learning_rate": 0.0004776815999039303, "loss": 5.3935, "mean_token_accuracy": 0.1685171753168106, "num_tokens": 32965944.0, "step": 17875 }, { "entropy": 5.637391996383667, "epoch": 1.502163411048099, "grad_norm": 1.6640625, "learning_rate": 0.0004776685947340096, "loss": 5.3918, "mean_token_accuracy": 0.17094200998544692, "num_tokens": 32975368.0, "step": 17880 }, { "entropy": 5.685165643692017, "epoch": 1.502583490863264, "grad_norm": 1.59375, "learning_rate": 0.0004776555859739191, "loss": 5.4559, "mean_token_accuracy": 0.16454171389341354, "num_tokens": 32984603.0, "step": 17885 }, { "entropy": 5.6984397888183596, "epoch": 1.503003570678429, "grad_norm": 1.8671875, "learning_rate": 0.00047764257362388913, "loss": 5.4249, "mean_token_accuracy": 0.16488805860280992, "num_tokens": 32993621.0, "step": 17890 }, { "entropy": 5.642865991592407, "epoch": 1.5034236504935938, "grad_norm": 1.65625, "learning_rate": 0.0004776295576841504, "loss": 5.4058, "mean_token_accuracy": 0.1731736972928047, "num_tokens": 33002637.0, "step": 17895 }, { "entropy": 5.664972877502441, "epoch": 1.5038437303087586, "grad_norm": 1.40625, "learning_rate": 0.00047761653815493337, "loss": 5.3564, "mean_token_accuracy": 0.17393183410167695, "num_tokens": 33011964.0, "step": 17900 }, { "entropy": 5.658042669296265, "epoch": 1.5042638101239234, "grad_norm": 1.78125, "learning_rate": 0.00047760351503646877, "loss": 5.4165, "mean_token_accuracy": 0.16770535558462143, "num_tokens": 33020626.0, "step": 17905 }, { "entropy": 5.70390887260437, "epoch": 1.5046838899390884, "grad_norm": 1.7421875, "learning_rate": 0.0004775904883289871, "loss": 5.369, "mean_token_accuracy": 0.1692973181605339, "num_tokens": 33029212.0, "step": 17910 }, { "entropy": 5.6756768226623535, "epoch": 1.5051039697542534, "grad_norm": 1.625, "learning_rate": 0.00047757745803271936, "loss": 5.4381, "mean_token_accuracy": 0.16383266746997832, "num_tokens": 33038893.0, "step": 17915 }, { "entropy": 5.661106920242309, "epoch": 1.5055240495694182, "grad_norm": 1.4921875, "learning_rate": 0.0004775644241478962, "loss": 5.4223, "mean_token_accuracy": 0.16328874826431275, "num_tokens": 33048058.0, "step": 17920 }, { "entropy": 5.62230749130249, "epoch": 1.505944129384583, "grad_norm": 2.34375, "learning_rate": 0.00047755138667474864, "loss": 5.3164, "mean_token_accuracy": 0.1771548643708229, "num_tokens": 33057106.0, "step": 17925 }, { "entropy": 5.60415210723877, "epoch": 1.506364209199748, "grad_norm": 1.6171875, "learning_rate": 0.0004775383456135075, "loss": 5.4777, "mean_token_accuracy": 0.16880970150232316, "num_tokens": 33066400.0, "step": 17930 }, { "entropy": 5.663134336471558, "epoch": 1.5067842890149128, "grad_norm": 1.5859375, "learning_rate": 0.0004775253009644038, "loss": 5.3276, "mean_token_accuracy": 0.17642468810081482, "num_tokens": 33075357.0, "step": 17935 }, { "entropy": 5.7705831050872805, "epoch": 1.5072043688300778, "grad_norm": 1.4609375, "learning_rate": 0.00047751225272766885, "loss": 5.4278, "mean_token_accuracy": 0.1641027197241783, "num_tokens": 33085707.0, "step": 17940 }, { "entropy": 5.800422859191895, "epoch": 1.5076244486452426, "grad_norm": 1.6796875, "learning_rate": 0.0004774992009035335, "loss": 5.5494, "mean_token_accuracy": 0.16157107502222062, "num_tokens": 33095825.0, "step": 17945 }, { "entropy": 5.597539043426513, "epoch": 1.5080445284604074, "grad_norm": 1.4921875, "learning_rate": 0.0004774861454922291, "loss": 5.3414, "mean_token_accuracy": 0.174434395134449, "num_tokens": 33105130.0, "step": 17950 }, { "entropy": 5.596598339080811, "epoch": 1.5084646082755724, "grad_norm": 2.078125, "learning_rate": 0.0004774730864939869, "loss": 5.378, "mean_token_accuracy": 0.16594540178775788, "num_tokens": 33113226.0, "step": 17955 }, { "entropy": 5.715326309204102, "epoch": 1.5088846880907374, "grad_norm": 1.3125, "learning_rate": 0.00047746002390903824, "loss": 5.3872, "mean_token_accuracy": 0.1708257630467415, "num_tokens": 33120824.0, "step": 17960 }, { "entropy": 5.746819305419922, "epoch": 1.5093047679059022, "grad_norm": 2.078125, "learning_rate": 0.0004774469577376145, "loss": 5.3633, "mean_token_accuracy": 0.17433841079473494, "num_tokens": 33129503.0, "step": 17965 }, { "entropy": 5.552629375457764, "epoch": 1.509724847721067, "grad_norm": 1.3125, "learning_rate": 0.00047743388797994715, "loss": 5.2681, "mean_token_accuracy": 0.17450862377882004, "num_tokens": 33138838.0, "step": 17970 }, { "entropy": 5.621928453445435, "epoch": 1.5101449275362318, "grad_norm": 1.4375, "learning_rate": 0.00047742081463626767, "loss": 5.3923, "mean_token_accuracy": 0.16948231309652328, "num_tokens": 33148142.0, "step": 17975 }, { "entropy": 5.645056867599488, "epoch": 1.5105650073513968, "grad_norm": 1.4609375, "learning_rate": 0.0004774077377068078, "loss": 5.3853, "mean_token_accuracy": 0.16999683529138565, "num_tokens": 33156750.0, "step": 17980 }, { "entropy": 5.755242204666137, "epoch": 1.5109850871665618, "grad_norm": 1.40625, "learning_rate": 0.000477394657191799, "loss": 5.5408, "mean_token_accuracy": 0.15939399749040603, "num_tokens": 33166511.0, "step": 17985 }, { "entropy": 5.70735993385315, "epoch": 1.5114051669817266, "grad_norm": 1.5625, "learning_rate": 0.00047738157309147307, "loss": 5.4727, "mean_token_accuracy": 0.16851068288087845, "num_tokens": 33175812.0, "step": 17990 }, { "entropy": 5.578419828414917, "epoch": 1.5118252467968913, "grad_norm": 1.6171875, "learning_rate": 0.00047736848540606174, "loss": 5.3388, "mean_token_accuracy": 0.16674845963716506, "num_tokens": 33185201.0, "step": 17995 }, { "entropy": 5.634521389007569, "epoch": 1.5122453266120561, "grad_norm": 1.640625, "learning_rate": 0.000477355394135797, "loss": 5.3332, "mean_token_accuracy": 0.17178126722574233, "num_tokens": 33195151.0, "step": 18000 }, { "epoch": 1.5122453266120561, "eval_entropy": 5.504568783942394, "eval_loss": 5.480621814727783, "eval_mean_token_accuracy": 0.17380510120579043, "eval_num_tokens": 33195151.0, "eval_runtime": 27.2739, "eval_samples_per_second": 1370.028, "eval_steps_per_second": 171.263, "step": 18000 }, { "entropy": 5.7297890186309814, "epoch": 1.5126654064272211, "grad_norm": 1.5390625, "learning_rate": 0.0004773422992809106, "loss": 5.3859, "mean_token_accuracy": 0.16926338374614716, "num_tokens": 33204800.0, "step": 18005 }, { "entropy": 5.695334625244141, "epoch": 1.5130854862423861, "grad_norm": 1.609375, "learning_rate": 0.0004773292008416346, "loss": 5.4322, "mean_token_accuracy": 0.1651061251759529, "num_tokens": 33214529.0, "step": 18010 }, { "entropy": 5.6870293617248535, "epoch": 1.513505566057551, "grad_norm": 1.3984375, "learning_rate": 0.00047731609881820095, "loss": 5.4368, "mean_token_accuracy": 0.16418869495391847, "num_tokens": 33224522.0, "step": 18015 }, { "entropy": 5.750136613845825, "epoch": 1.5139256458727157, "grad_norm": 1.484375, "learning_rate": 0.00047730299321084173, "loss": 5.4425, "mean_token_accuracy": 0.16809688359498978, "num_tokens": 33233220.0, "step": 18020 }, { "entropy": 5.716884803771973, "epoch": 1.5143457256878807, "grad_norm": 1.421875, "learning_rate": 0.00047728988401978916, "loss": 5.3468, "mean_token_accuracy": 0.173400317132473, "num_tokens": 33242277.0, "step": 18025 }, { "entropy": 5.7281084060668945, "epoch": 1.5147658055030457, "grad_norm": 1.546875, "learning_rate": 0.0004772767712452756, "loss": 5.4088, "mean_token_accuracy": 0.17954297214746476, "num_tokens": 33251113.0, "step": 18030 }, { "entropy": 5.60842080116272, "epoch": 1.5151858853182105, "grad_norm": 2.0, "learning_rate": 0.00047726365488753305, "loss": 5.548, "mean_token_accuracy": 0.15993862450122834, "num_tokens": 33261055.0, "step": 18035 }, { "entropy": 5.685538625717163, "epoch": 1.5156059651333753, "grad_norm": 1.78125, "learning_rate": 0.00047725053494679403, "loss": 5.5104, "mean_token_accuracy": 0.16750353425741196, "num_tokens": 33270981.0, "step": 18040 }, { "entropy": 5.811197137832641, "epoch": 1.51602604494854, "grad_norm": 1.65625, "learning_rate": 0.00047723741142329104, "loss": 5.4511, "mean_token_accuracy": 0.16344697326421737, "num_tokens": 33279516.0, "step": 18045 }, { "entropy": 5.623986768722534, "epoch": 1.516446124763705, "grad_norm": 1.578125, "learning_rate": 0.00047722428431725637, "loss": 5.372, "mean_token_accuracy": 0.17835780680179597, "num_tokens": 33288300.0, "step": 18050 }, { "entropy": 5.646885824203491, "epoch": 1.5168662045788701, "grad_norm": 1.71875, "learning_rate": 0.0004772111536289226, "loss": 5.4115, "mean_token_accuracy": 0.1641728550195694, "num_tokens": 33299059.0, "step": 18055 }, { "entropy": 5.689133930206299, "epoch": 1.517286284394035, "grad_norm": 1.46875, "learning_rate": 0.00047719801935852235, "loss": 5.468, "mean_token_accuracy": 0.16429835706949233, "num_tokens": 33308879.0, "step": 18060 }, { "entropy": 5.763861560821534, "epoch": 1.5177063642091997, "grad_norm": 1.7109375, "learning_rate": 0.0004771848815062883, "loss": 5.5568, "mean_token_accuracy": 0.1608145996928215, "num_tokens": 33318615.0, "step": 18065 }, { "entropy": 5.809006929397583, "epoch": 1.5181264440243645, "grad_norm": 1.359375, "learning_rate": 0.0004771717400724532, "loss": 5.5845, "mean_token_accuracy": 0.15996418967843057, "num_tokens": 33328748.0, "step": 18070 }, { "entropy": 5.765374803543091, "epoch": 1.5185465238395295, "grad_norm": 1.4140625, "learning_rate": 0.0004771585950572499, "loss": 5.3919, "mean_token_accuracy": 0.16406020075082778, "num_tokens": 33338350.0, "step": 18075 }, { "entropy": 5.623263883590698, "epoch": 1.5189666036546945, "grad_norm": 1.4765625, "learning_rate": 0.0004771454464609111, "loss": 5.4011, "mean_token_accuracy": 0.16918568760156633, "num_tokens": 33348202.0, "step": 18080 }, { "entropy": 5.613306331634521, "epoch": 1.5193866834698593, "grad_norm": 1.546875, "learning_rate": 0.0004771322942836699, "loss": 5.3967, "mean_token_accuracy": 0.16765800267457961, "num_tokens": 33356996.0, "step": 18085 }, { "entropy": 5.791823196411133, "epoch": 1.519806763285024, "grad_norm": 1.7890625, "learning_rate": 0.0004771191385257592, "loss": 5.5247, "mean_token_accuracy": 0.16046885251998902, "num_tokens": 33366173.0, "step": 18090 }, { "entropy": 5.713813591003418, "epoch": 1.520226843100189, "grad_norm": 1.4765625, "learning_rate": 0.0004771059791874119, "loss": 5.4365, "mean_token_accuracy": 0.15948131680488586, "num_tokens": 33375921.0, "step": 18095 }, { "entropy": 5.6319067001342775, "epoch": 1.520646922915354, "grad_norm": 2.9375, "learning_rate": 0.0004770928162688613, "loss": 5.4232, "mean_token_accuracy": 0.16363133490085602, "num_tokens": 33385538.0, "step": 18100 }, { "entropy": 5.633490324020386, "epoch": 1.5210670027305189, "grad_norm": 1.8984375, "learning_rate": 0.00047707964977034055, "loss": 5.3274, "mean_token_accuracy": 0.18080521374940872, "num_tokens": 33393728.0, "step": 18105 }, { "entropy": 5.776975011825561, "epoch": 1.5214870825456837, "grad_norm": 1.671875, "learning_rate": 0.0004770664796920828, "loss": 5.4259, "mean_token_accuracy": 0.1658819019794464, "num_tokens": 33402540.0, "step": 18110 }, { "entropy": 5.648982095718384, "epoch": 1.5219071623608484, "grad_norm": 1.7109375, "learning_rate": 0.0004770533060343215, "loss": 5.3993, "mean_token_accuracy": 0.1668563425540924, "num_tokens": 33411706.0, "step": 18115 }, { "entropy": 5.619913053512573, "epoch": 1.5223272421760135, "grad_norm": 1.5546875, "learning_rate": 0.0004770401287972899, "loss": 5.346, "mean_token_accuracy": 0.17197668105363845, "num_tokens": 33420604.0, "step": 18120 }, { "entropy": 5.612928819656372, "epoch": 1.5227473219911785, "grad_norm": 1.5546875, "learning_rate": 0.00047702694798122143, "loss": 5.3312, "mean_token_accuracy": 0.18267546892166137, "num_tokens": 33429558.0, "step": 18125 }, { "entropy": 5.845659017562866, "epoch": 1.5231674018063432, "grad_norm": 1.640625, "learning_rate": 0.00047701376358634957, "loss": 5.5331, "mean_token_accuracy": 0.16271810382604598, "num_tokens": 33439620.0, "step": 18130 }, { "entropy": 5.746625709533691, "epoch": 1.523587481621508, "grad_norm": 1.6875, "learning_rate": 0.00047700057561290797, "loss": 5.4849, "mean_token_accuracy": 0.1619314581155777, "num_tokens": 33449067.0, "step": 18135 }, { "entropy": 5.6104577541351315, "epoch": 1.5240075614366728, "grad_norm": 1.6484375, "learning_rate": 0.0004769873840611302, "loss": 5.388, "mean_token_accuracy": 0.17093031108379364, "num_tokens": 33458089.0, "step": 18140 }, { "entropy": 5.674795293807984, "epoch": 1.5244276412518378, "grad_norm": 1.390625, "learning_rate": 0.0004769741889312499, "loss": 5.4976, "mean_token_accuracy": 0.1689228668808937, "num_tokens": 33466883.0, "step": 18145 }, { "entropy": 5.725237464904785, "epoch": 1.5248477210670028, "grad_norm": 1.375, "learning_rate": 0.00047696099022350087, "loss": 5.5247, "mean_token_accuracy": 0.15924528241157532, "num_tokens": 33476649.0, "step": 18150 }, { "entropy": 5.798870325088501, "epoch": 1.5252678008821676, "grad_norm": 1.609375, "learning_rate": 0.00047694778793811685, "loss": 5.4913, "mean_token_accuracy": 0.16371531635522843, "num_tokens": 33486274.0, "step": 18155 }, { "entropy": 5.731025695800781, "epoch": 1.5256878806973324, "grad_norm": 1.609375, "learning_rate": 0.00047693458207533177, "loss": 5.3745, "mean_token_accuracy": 0.1666399672627449, "num_tokens": 33494950.0, "step": 18160 }, { "entropy": 5.659780883789063, "epoch": 1.5261079605124974, "grad_norm": 1.46875, "learning_rate": 0.0004769213726353795, "loss": 5.3996, "mean_token_accuracy": 0.1708945393562317, "num_tokens": 33503545.0, "step": 18165 }, { "entropy": 5.648102521896362, "epoch": 1.5265280403276622, "grad_norm": 1.5625, "learning_rate": 0.00047690815961849416, "loss": 5.4462, "mean_token_accuracy": 0.1661043107509613, "num_tokens": 33512871.0, "step": 18170 }, { "entropy": 5.623683214187622, "epoch": 1.5269481201428272, "grad_norm": 1.6484375, "learning_rate": 0.0004768949430249097, "loss": 5.3626, "mean_token_accuracy": 0.16892132312059402, "num_tokens": 33521933.0, "step": 18175 }, { "entropy": 5.672886848449707, "epoch": 1.527368199957992, "grad_norm": 1.6484375, "learning_rate": 0.0004768817228548603, "loss": 5.3511, "mean_token_accuracy": 0.1706907257437706, "num_tokens": 33531370.0, "step": 18180 }, { "entropy": 5.755971002578735, "epoch": 1.5277882797731568, "grad_norm": 1.8203125, "learning_rate": 0.0004768684991085802, "loss": 5.4365, "mean_token_accuracy": 0.16248024702072145, "num_tokens": 33540310.0, "step": 18185 }, { "entropy": 5.687887954711914, "epoch": 1.5282083595883218, "grad_norm": 1.5859375, "learning_rate": 0.00047685527178630347, "loss": 5.4598, "mean_token_accuracy": 0.16537974774837494, "num_tokens": 33549943.0, "step": 18190 }, { "entropy": 5.752259922027588, "epoch": 1.5286284394034868, "grad_norm": 1.96875, "learning_rate": 0.0004768420408882646, "loss": 5.5298, "mean_token_accuracy": 0.16441700905561446, "num_tokens": 33560167.0, "step": 18195 }, { "entropy": 5.757403898239136, "epoch": 1.5290485192186516, "grad_norm": 1.59375, "learning_rate": 0.00047682880641469787, "loss": 5.4111, "mean_token_accuracy": 0.16261017471551895, "num_tokens": 33569604.0, "step": 18200 }, { "entropy": 5.701638650894165, "epoch": 1.5294685990338164, "grad_norm": 1.4140625, "learning_rate": 0.0004768155683658378, "loss": 5.3972, "mean_token_accuracy": 0.168385748565197, "num_tokens": 33578400.0, "step": 18205 }, { "entropy": 5.596540117263794, "epoch": 1.5298886788489812, "grad_norm": 1.515625, "learning_rate": 0.0004768023267419188, "loss": 5.3728, "mean_token_accuracy": 0.16698229908943177, "num_tokens": 33587527.0, "step": 18210 }, { "entropy": 5.585406541824341, "epoch": 1.5303087586641462, "grad_norm": 1.5078125, "learning_rate": 0.0004767890815431756, "loss": 5.31, "mean_token_accuracy": 0.1722709432244301, "num_tokens": 33596026.0, "step": 18215 }, { "entropy": 5.698364782333374, "epoch": 1.5307288384793112, "grad_norm": 1.375, "learning_rate": 0.00047677583276984264, "loss": 5.3995, "mean_token_accuracy": 0.16997897922992705, "num_tokens": 33605906.0, "step": 18220 }, { "entropy": 5.687321901321411, "epoch": 1.531148918294476, "grad_norm": 1.6328125, "learning_rate": 0.0004767625804221548, "loss": 5.36, "mean_token_accuracy": 0.17047615945339203, "num_tokens": 33615758.0, "step": 18225 }, { "entropy": 5.662997770309448, "epoch": 1.5315689981096408, "grad_norm": 1.6328125, "learning_rate": 0.0004767493245003466, "loss": 5.4245, "mean_token_accuracy": 0.18040256053209305, "num_tokens": 33625486.0, "step": 18230 }, { "entropy": 5.663189315795899, "epoch": 1.5319890779248058, "grad_norm": 1.7421875, "learning_rate": 0.00047673606500465315, "loss": 5.3718, "mean_token_accuracy": 0.17638310939073562, "num_tokens": 33633954.0, "step": 18235 }, { "entropy": 5.633836793899536, "epoch": 1.5324091577399706, "grad_norm": 1.671875, "learning_rate": 0.000476722801935309, "loss": 5.4511, "mean_token_accuracy": 0.166046205163002, "num_tokens": 33642478.0, "step": 18240 }, { "entropy": 5.6103380680084225, "epoch": 1.5328292375551356, "grad_norm": 1.5078125, "learning_rate": 0.0004767095352925495, "loss": 5.3701, "mean_token_accuracy": 0.1702152296900749, "num_tokens": 33650785.0, "step": 18245 }, { "entropy": 5.659248542785645, "epoch": 1.5332493173703003, "grad_norm": 1.6171875, "learning_rate": 0.0004766962650766093, "loss": 5.3337, "mean_token_accuracy": 0.17309417128562926, "num_tokens": 33659677.0, "step": 18250 }, { "entropy": 5.716655015945435, "epoch": 1.5336693971854651, "grad_norm": 1.8125, "learning_rate": 0.00047668299128772365, "loss": 5.5052, "mean_token_accuracy": 0.1620546281337738, "num_tokens": 33669493.0, "step": 18255 }, { "entropy": 5.766137742996216, "epoch": 1.5340894770006301, "grad_norm": 1.4765625, "learning_rate": 0.0004766697139261277, "loss": 5.4809, "mean_token_accuracy": 0.1693834885954857, "num_tokens": 33678446.0, "step": 18260 }, { "entropy": 5.688551139831543, "epoch": 1.5345095568157952, "grad_norm": 1.4453125, "learning_rate": 0.0004766564329920566, "loss": 5.3417, "mean_token_accuracy": 0.17938026487827302, "num_tokens": 33687647.0, "step": 18265 }, { "entropy": 5.66825041770935, "epoch": 1.53492963663096, "grad_norm": 1.7421875, "learning_rate": 0.0004766431484857456, "loss": 5.4354, "mean_token_accuracy": 0.1683764412999153, "num_tokens": 33697395.0, "step": 18270 }, { "entropy": 5.6449426174163815, "epoch": 1.5353497164461247, "grad_norm": 1.4375, "learning_rate": 0.00047662986040743004, "loss": 5.4179, "mean_token_accuracy": 0.1762421429157257, "num_tokens": 33706779.0, "step": 18275 }, { "entropy": 5.6319070816040036, "epoch": 1.5357697962612895, "grad_norm": 1.8046875, "learning_rate": 0.0004766165687573454, "loss": 5.399, "mean_token_accuracy": 0.16638792753219606, "num_tokens": 33714828.0, "step": 18280 }, { "entropy": 5.7225525856018065, "epoch": 1.5361898760764545, "grad_norm": 1.875, "learning_rate": 0.000476603273535727, "loss": 5.4058, "mean_token_accuracy": 0.16816843450069427, "num_tokens": 33724730.0, "step": 18285 }, { "entropy": 5.7629804611206055, "epoch": 1.5366099558916195, "grad_norm": 1.6171875, "learning_rate": 0.0004765899747428104, "loss": 5.4813, "mean_token_accuracy": 0.16490163505077363, "num_tokens": 33734374.0, "step": 18290 }, { "entropy": 5.7630139827728275, "epoch": 1.5370300357067843, "grad_norm": 1.6875, "learning_rate": 0.00047657667237883125, "loss": 5.4618, "mean_token_accuracy": 0.17239924520254135, "num_tokens": 33743395.0, "step": 18295 }, { "entropy": 5.72203483581543, "epoch": 1.537450115521949, "grad_norm": 1.8203125, "learning_rate": 0.00047656336644402513, "loss": 5.5038, "mean_token_accuracy": 0.1658702626824379, "num_tokens": 33752526.0, "step": 18300 }, { "entropy": 5.73434624671936, "epoch": 1.5378701953371139, "grad_norm": 1.734375, "learning_rate": 0.0004765500569386278, "loss": 5.4341, "mean_token_accuracy": 0.17372529208660126, "num_tokens": 33761310.0, "step": 18305 }, { "entropy": 5.630677986145019, "epoch": 1.538290275152279, "grad_norm": 1.765625, "learning_rate": 0.000476536743862875, "loss": 5.3564, "mean_token_accuracy": 0.17069067656993867, "num_tokens": 33770870.0, "step": 18310 }, { "entropy": 5.587197399139404, "epoch": 1.538710354967444, "grad_norm": 1.4765625, "learning_rate": 0.00047652342721700246, "loss": 5.3123, "mean_token_accuracy": 0.16748333871364593, "num_tokens": 33779648.0, "step": 18315 }, { "entropy": 5.689319229125976, "epoch": 1.5391304347826087, "grad_norm": 1.4453125, "learning_rate": 0.0004765101070012462, "loss": 5.5059, "mean_token_accuracy": 0.1615031287074089, "num_tokens": 33789172.0, "step": 18320 }, { "entropy": 5.810400390625, "epoch": 1.5395505145977735, "grad_norm": 1.6484375, "learning_rate": 0.00047649678321584214, "loss": 5.4895, "mean_token_accuracy": 0.15798811763525009, "num_tokens": 33798069.0, "step": 18325 }, { "entropy": 5.732732534408569, "epoch": 1.5399705944129385, "grad_norm": 1.453125, "learning_rate": 0.00047648345586102643, "loss": 5.4397, "mean_token_accuracy": 0.16982662975788115, "num_tokens": 33806214.0, "step": 18330 }, { "entropy": 5.712227535247803, "epoch": 1.5403906742281035, "grad_norm": 1.8984375, "learning_rate": 0.000476470124937035, "loss": 5.4266, "mean_token_accuracy": 0.17047962546348572, "num_tokens": 33815365.0, "step": 18335 }, { "entropy": 5.728869104385376, "epoch": 1.5408107540432683, "grad_norm": 1.421875, "learning_rate": 0.000476456790444104, "loss": 5.3487, "mean_token_accuracy": 0.17773585617542267, "num_tokens": 33825204.0, "step": 18340 }, { "entropy": 5.687373256683349, "epoch": 1.541230833858433, "grad_norm": 2.34375, "learning_rate": 0.0004764434523824697, "loss": 5.4619, "mean_token_accuracy": 0.1697180077433586, "num_tokens": 33834439.0, "step": 18345 }, { "entropy": 5.622870349884034, "epoch": 1.5416509136735979, "grad_norm": 1.703125, "learning_rate": 0.00047643011075236845, "loss": 5.4381, "mean_token_accuracy": 0.1638789251446724, "num_tokens": 33843959.0, "step": 18350 }, { "entropy": 5.776487016677857, "epoch": 1.5420709934887629, "grad_norm": 1.4921875, "learning_rate": 0.00047641676555403646, "loss": 5.4804, "mean_token_accuracy": 0.15986314862966539, "num_tokens": 33853234.0, "step": 18355 }, { "entropy": 5.695157814025879, "epoch": 1.5424910733039279, "grad_norm": 1.578125, "learning_rate": 0.0004764034167877102, "loss": 5.3797, "mean_token_accuracy": 0.16742191165685655, "num_tokens": 33861755.0, "step": 18360 }, { "entropy": 5.719500398635864, "epoch": 1.5429111531190927, "grad_norm": 1.828125, "learning_rate": 0.00047639006445362607, "loss": 5.4946, "mean_token_accuracy": 0.16939375996589662, "num_tokens": 33870956.0, "step": 18365 }, { "entropy": 5.639527320861816, "epoch": 1.5433312329342574, "grad_norm": 1.5390625, "learning_rate": 0.0004763767085520207, "loss": 5.3368, "mean_token_accuracy": 0.17298437505960465, "num_tokens": 33880568.0, "step": 18370 }, { "entropy": 5.727531051635742, "epoch": 1.5437513127494222, "grad_norm": 2.078125, "learning_rate": 0.0004763633490831306, "loss": 5.5471, "mean_token_accuracy": 0.15493866950273513, "num_tokens": 33890145.0, "step": 18375 }, { "entropy": 5.6116053581237795, "epoch": 1.5441713925645872, "grad_norm": 1.78125, "learning_rate": 0.0004763499860471925, "loss": 5.3965, "mean_token_accuracy": 0.16893347650766372, "num_tokens": 33899155.0, "step": 18380 }, { "entropy": 5.6794798374176025, "epoch": 1.5445914723797523, "grad_norm": 1.59375, "learning_rate": 0.000476336619444443, "loss": 5.4366, "mean_token_accuracy": 0.16216899007558822, "num_tokens": 33909410.0, "step": 18385 }, { "entropy": 5.643740177154541, "epoch": 1.545011552194917, "grad_norm": 1.4609375, "learning_rate": 0.000476323249275119, "loss": 5.3071, "mean_token_accuracy": 0.17813037484884262, "num_tokens": 33918451.0, "step": 18390 }, { "entropy": 5.5850482940673825, "epoch": 1.5454316320100818, "grad_norm": 1.6484375, "learning_rate": 0.0004763098755394573, "loss": 5.3449, "mean_token_accuracy": 0.17247679233551025, "num_tokens": 33928317.0, "step": 18395 }, { "entropy": 5.704434871673584, "epoch": 1.5458517118252468, "grad_norm": 1.6171875, "learning_rate": 0.0004762964982376949, "loss": 5.5166, "mean_token_accuracy": 0.16591467410326005, "num_tokens": 33938010.0, "step": 18400 }, { "entropy": 5.716954278945923, "epoch": 1.5462717916404118, "grad_norm": 1.5234375, "learning_rate": 0.00047628311737006856, "loss": 5.3336, "mean_token_accuracy": 0.1735645353794098, "num_tokens": 33946964.0, "step": 18405 }, { "entropy": 5.686046504974366, "epoch": 1.5466918714555766, "grad_norm": 1.359375, "learning_rate": 0.00047626973293681555, "loss": 5.349, "mean_token_accuracy": 0.16914291232824324, "num_tokens": 33956026.0, "step": 18410 }, { "entropy": 5.612794685363769, "epoch": 1.5471119512707414, "grad_norm": 1.4375, "learning_rate": 0.0004762563449381728, "loss": 5.3924, "mean_token_accuracy": 0.16146431416273116, "num_tokens": 33965787.0, "step": 18415 }, { "entropy": 5.663423538208008, "epoch": 1.5475320310859062, "grad_norm": 1.796875, "learning_rate": 0.00047624295337437753, "loss": 5.4273, "mean_token_accuracy": 0.1688649833202362, "num_tokens": 33974178.0, "step": 18420 }, { "entropy": 5.628804731369018, "epoch": 1.5479521109010712, "grad_norm": 1.5625, "learning_rate": 0.0004762295582456669, "loss": 5.2858, "mean_token_accuracy": 0.17369863390922546, "num_tokens": 33983652.0, "step": 18425 }, { "entropy": 5.696892833709716, "epoch": 1.5483721907162362, "grad_norm": 1.4609375, "learning_rate": 0.00047621615955227835, "loss": 5.3687, "mean_token_accuracy": 0.1774067535996437, "num_tokens": 33991938.0, "step": 18430 }, { "entropy": 5.6132800579071045, "epoch": 1.548792270531401, "grad_norm": 1.453125, "learning_rate": 0.0004762027572944491, "loss": 5.3544, "mean_token_accuracy": 0.16801770478487016, "num_tokens": 33999918.0, "step": 18435 }, { "entropy": 5.5902656555175785, "epoch": 1.5492123503465658, "grad_norm": 1.5234375, "learning_rate": 0.00047618935147241667, "loss": 5.3731, "mean_token_accuracy": 0.17459045797586442, "num_tokens": 34008416.0, "step": 18440 }, { "entropy": 5.701586627960205, "epoch": 1.5496324301617306, "grad_norm": 3.453125, "learning_rate": 0.0004761759420864184, "loss": 5.4532, "mean_token_accuracy": 0.16581283658742904, "num_tokens": 34017616.0, "step": 18445 }, { "entropy": 5.712861061096191, "epoch": 1.5500525099768956, "grad_norm": 1.7265625, "learning_rate": 0.000476162529136692, "loss": 5.3818, "mean_token_accuracy": 0.17086593359708785, "num_tokens": 34026064.0, "step": 18450 }, { "entropy": 5.564694118499756, "epoch": 1.5504725897920606, "grad_norm": 1.4375, "learning_rate": 0.0004761491126234749, "loss": 5.2959, "mean_token_accuracy": 0.1739438533782959, "num_tokens": 34035378.0, "step": 18455 }, { "entropy": 5.6146468162536625, "epoch": 1.5508926696072254, "grad_norm": 1.6875, "learning_rate": 0.0004761356925470049, "loss": 5.3503, "mean_token_accuracy": 0.1704146921634674, "num_tokens": 34044600.0, "step": 18460 }, { "entropy": 5.710069417953491, "epoch": 1.5513127494223902, "grad_norm": 1.5859375, "learning_rate": 0.00047612226890751956, "loss": 5.4336, "mean_token_accuracy": 0.16696672439575194, "num_tokens": 34054680.0, "step": 18465 }, { "entropy": 5.65276689529419, "epoch": 1.5517328292375552, "grad_norm": 1.375, "learning_rate": 0.00047610884170525697, "loss": 5.3498, "mean_token_accuracy": 0.1752360135316849, "num_tokens": 34063034.0, "step": 18470 }, { "entropy": 5.627860975265503, "epoch": 1.55215290905272, "grad_norm": 1.6875, "learning_rate": 0.0004760954109404547, "loss": 5.351, "mean_token_accuracy": 0.17447586208581925, "num_tokens": 34072122.0, "step": 18475 }, { "entropy": 5.674824905395508, "epoch": 1.552572988867885, "grad_norm": 2.890625, "learning_rate": 0.0004760819766133508, "loss": 5.3586, "mean_token_accuracy": 0.16940293908119203, "num_tokens": 34081493.0, "step": 18480 }, { "entropy": 5.668784093856812, "epoch": 1.5529930686830498, "grad_norm": 1.703125, "learning_rate": 0.00047606853872418317, "loss": 5.4445, "mean_token_accuracy": 0.16179682463407516, "num_tokens": 34090872.0, "step": 18485 }, { "entropy": 5.624145078659057, "epoch": 1.5534131484982145, "grad_norm": 1.4609375, "learning_rate": 0.0004760550972731899, "loss": 5.3052, "mean_token_accuracy": 0.1742589369416237, "num_tokens": 34100729.0, "step": 18490 }, { "entropy": 5.540934467315674, "epoch": 1.5538332283133796, "grad_norm": 1.4296875, "learning_rate": 0.0004760416522606092, "loss": 5.2939, "mean_token_accuracy": 0.1751124456524849, "num_tokens": 34109492.0, "step": 18495 }, { "entropy": 5.580523681640625, "epoch": 1.5542533081285446, "grad_norm": 1.7265625, "learning_rate": 0.0004760282036866791, "loss": 5.4, "mean_token_accuracy": 0.17484914511442184, "num_tokens": 34119529.0, "step": 18500 }, { "entropy": 5.76246075630188, "epoch": 1.5546733879437094, "grad_norm": 1.65625, "learning_rate": 0.0004760147515516379, "loss": 5.4362, "mean_token_accuracy": 0.1649763211607933, "num_tokens": 34128261.0, "step": 18505 }, { "entropy": 5.6341499328613285, "epoch": 1.5550934677588741, "grad_norm": 1.3515625, "learning_rate": 0.00047600129585572386, "loss": 5.4324, "mean_token_accuracy": 0.17126839607954025, "num_tokens": 34136916.0, "step": 18510 }, { "entropy": 5.713113260269165, "epoch": 1.555513547574039, "grad_norm": 1.6171875, "learning_rate": 0.0004759878365991754, "loss": 5.3471, "mean_token_accuracy": 0.17166002988815307, "num_tokens": 34146400.0, "step": 18515 }, { "entropy": 5.674141216278076, "epoch": 1.555933627389204, "grad_norm": 2.25, "learning_rate": 0.0004759743737822309, "loss": 5.3993, "mean_token_accuracy": 0.1691730111837387, "num_tokens": 34155611.0, "step": 18520 }, { "entropy": 5.65713849067688, "epoch": 1.556353707204369, "grad_norm": 1.984375, "learning_rate": 0.00047596090740512884, "loss": 5.428, "mean_token_accuracy": 0.1695108011364937, "num_tokens": 34165301.0, "step": 18525 }, { "entropy": 5.70047779083252, "epoch": 1.5567737870195337, "grad_norm": 2.03125, "learning_rate": 0.00047594743746810786, "loss": 5.4018, "mean_token_accuracy": 0.16435787677764893, "num_tokens": 34174655.0, "step": 18530 }, { "entropy": 5.802553367614746, "epoch": 1.5571938668346985, "grad_norm": 1.5625, "learning_rate": 0.00047593396397140644, "loss": 5.5507, "mean_token_accuracy": 0.1595836400985718, "num_tokens": 34184293.0, "step": 18535 }, { "entropy": 5.7214781761169435, "epoch": 1.5576139466498635, "grad_norm": 1.5625, "learning_rate": 0.0004759204869152632, "loss": 5.4373, "mean_token_accuracy": 0.16149042397737504, "num_tokens": 34193025.0, "step": 18540 }, { "entropy": 5.620850515365601, "epoch": 1.5580340264650283, "grad_norm": 1.4140625, "learning_rate": 0.0004759070062999171, "loss": 5.3478, "mean_token_accuracy": 0.1678580015897751, "num_tokens": 34201082.0, "step": 18545 }, { "entropy": 5.739461946487427, "epoch": 1.5584541062801933, "grad_norm": 1.5859375, "learning_rate": 0.0004758935221256069, "loss": 5.4907, "mean_token_accuracy": 0.16538347899913788, "num_tokens": 34211210.0, "step": 18550 }, { "entropy": 5.702043962478638, "epoch": 1.558874186095358, "grad_norm": 1.5078125, "learning_rate": 0.00047588003439257134, "loss": 5.4279, "mean_token_accuracy": 0.1693740040063858, "num_tokens": 34220309.0, "step": 18555 }, { "entropy": 5.728823947906494, "epoch": 1.559294265910523, "grad_norm": 1.890625, "learning_rate": 0.00047586654310104946, "loss": 5.4202, "mean_token_accuracy": 0.1592714488506317, "num_tokens": 34229532.0, "step": 18560 }, { "entropy": 5.792129182815552, "epoch": 1.559714345725688, "grad_norm": 1.515625, "learning_rate": 0.0004758530482512801, "loss": 5.6455, "mean_token_accuracy": 0.15465014576911926, "num_tokens": 34239543.0, "step": 18565 }, { "entropy": 5.7673375606536865, "epoch": 1.560134425540853, "grad_norm": 1.3828125, "learning_rate": 0.0004758395498435024, "loss": 5.4486, "mean_token_accuracy": 0.16822385787963867, "num_tokens": 34248654.0, "step": 18570 }, { "entropy": 5.71659140586853, "epoch": 1.5605545053560177, "grad_norm": 1.734375, "learning_rate": 0.00047582604787795555, "loss": 5.4313, "mean_token_accuracy": 0.16151682287454605, "num_tokens": 34258757.0, "step": 18575 }, { "entropy": 5.668481111526489, "epoch": 1.5609745851711825, "grad_norm": 2.578125, "learning_rate": 0.0004758125423548787, "loss": 5.4308, "mean_token_accuracy": 0.1640526682138443, "num_tokens": 34268253.0, "step": 18580 }, { "entropy": 5.759385299682617, "epoch": 1.5613946649863473, "grad_norm": 1.4375, "learning_rate": 0.00047579903327451097, "loss": 5.4909, "mean_token_accuracy": 0.1663891091942787, "num_tokens": 34277361.0, "step": 18585 }, { "entropy": 5.640477037429809, "epoch": 1.5618147448015123, "grad_norm": 1.7265625, "learning_rate": 0.0004757855206370919, "loss": 5.3618, "mean_token_accuracy": 0.16783252209424973, "num_tokens": 34285923.0, "step": 18590 }, { "entropy": 5.600503778457641, "epoch": 1.5622348246166773, "grad_norm": 1.4609375, "learning_rate": 0.00047577200444286064, "loss": 5.3768, "mean_token_accuracy": 0.1716615855693817, "num_tokens": 34296300.0, "step": 18595 }, { "entropy": 5.766132545471192, "epoch": 1.562654904431842, "grad_norm": 1.703125, "learning_rate": 0.0004757584846920567, "loss": 5.4101, "mean_token_accuracy": 0.16635561734437943, "num_tokens": 34305757.0, "step": 18600 }, { "entropy": 5.677987813949585, "epoch": 1.5630749842470069, "grad_norm": 1.7265625, "learning_rate": 0.0004757449613849196, "loss": 5.4464, "mean_token_accuracy": 0.16000643074512483, "num_tokens": 34314714.0, "step": 18605 }, { "entropy": 5.718979597091675, "epoch": 1.5634950640621716, "grad_norm": 1.5, "learning_rate": 0.00047573143452168883, "loss": 5.473, "mean_token_accuracy": 0.16973401680588723, "num_tokens": 34323501.0, "step": 18610 }, { "entropy": 5.73273868560791, "epoch": 1.5639151438773367, "grad_norm": 1.703125, "learning_rate": 0.00047571790410260405, "loss": 5.4017, "mean_token_accuracy": 0.17346812933683395, "num_tokens": 34331752.0, "step": 18615 }, { "entropy": 5.717556381225586, "epoch": 1.5643352236925017, "grad_norm": 1.9140625, "learning_rate": 0.000475704370127905, "loss": 5.4609, "mean_token_accuracy": 0.16100564748048782, "num_tokens": 34341479.0, "step": 18620 }, { "entropy": 5.6721264839172365, "epoch": 1.5647553035076665, "grad_norm": 1.3828125, "learning_rate": 0.0004756908325978314, "loss": 5.4556, "mean_token_accuracy": 0.1629202827811241, "num_tokens": 34350991.0, "step": 18625 }, { "entropy": 5.697770977020264, "epoch": 1.5651753833228312, "grad_norm": 1.71875, "learning_rate": 0.00047567729151262305, "loss": 5.3765, "mean_token_accuracy": 0.16833187639713287, "num_tokens": 34360089.0, "step": 18630 }, { "entropy": 5.693409872055054, "epoch": 1.5655954631379962, "grad_norm": 1.5703125, "learning_rate": 0.0004756637468725198, "loss": 5.3417, "mean_token_accuracy": 0.17019174247980118, "num_tokens": 34370352.0, "step": 18635 }, { "entropy": 5.6412163257598875, "epoch": 1.5660155429531613, "grad_norm": 1.5, "learning_rate": 0.0004756501986777616, "loss": 5.3334, "mean_token_accuracy": 0.1646198183298111, "num_tokens": 34378958.0, "step": 18640 }, { "entropy": 5.577014398574829, "epoch": 1.566435622768326, "grad_norm": 1.6171875, "learning_rate": 0.00047563664692858843, "loss": 5.3075, "mean_token_accuracy": 0.17557633221149443, "num_tokens": 34387723.0, "step": 18645 }, { "entropy": 5.667887926101685, "epoch": 1.5668557025834908, "grad_norm": 1.5859375, "learning_rate": 0.0004756230916252404, "loss": 5.4322, "mean_token_accuracy": 0.17246091961860657, "num_tokens": 34397089.0, "step": 18650 }, { "entropy": 5.754067516326904, "epoch": 1.5672757823986556, "grad_norm": 1.640625, "learning_rate": 0.00047560953276795756, "loss": 5.4493, "mean_token_accuracy": 0.1675298720598221, "num_tokens": 34406278.0, "step": 18655 }, { "entropy": 5.7338409423828125, "epoch": 1.5676958622138206, "grad_norm": 4.75, "learning_rate": 0.00047559597035698014, "loss": 5.4153, "mean_token_accuracy": 0.16818469762802124, "num_tokens": 34415404.0, "step": 18660 }, { "entropy": 5.689050960540771, "epoch": 1.5681159420289856, "grad_norm": 1.78125, "learning_rate": 0.0004755824043925485, "loss": 5.4658, "mean_token_accuracy": 0.17355379313230515, "num_tokens": 34425036.0, "step": 18665 }, { "entropy": 5.6759899139404295, "epoch": 1.5685360218441504, "grad_norm": 2.0625, "learning_rate": 0.0004755688348749027, "loss": 5.3721, "mean_token_accuracy": 0.16852474361658096, "num_tokens": 34434246.0, "step": 18670 }, { "entropy": 5.6307172775268555, "epoch": 1.5689561016593152, "grad_norm": 1.875, "learning_rate": 0.0004755552618042834, "loss": 5.3735, "mean_token_accuracy": 0.1715213656425476, "num_tokens": 34444189.0, "step": 18675 }, { "entropy": 5.694113779067993, "epoch": 1.56937618147448, "grad_norm": 1.3984375, "learning_rate": 0.0004755416851809308, "loss": 5.3705, "mean_token_accuracy": 0.17202963531017304, "num_tokens": 34453727.0, "step": 18680 }, { "entropy": 5.555972719192505, "epoch": 1.569796261289645, "grad_norm": 1.5703125, "learning_rate": 0.0004755281050050856, "loss": 5.3687, "mean_token_accuracy": 0.16777419596910476, "num_tokens": 34462835.0, "step": 18685 }, { "entropy": 5.644486761093139, "epoch": 1.57021634110481, "grad_norm": 1.4296875, "learning_rate": 0.0004755145212769882, "loss": 5.4169, "mean_token_accuracy": 0.16981538236141205, "num_tokens": 34471642.0, "step": 18690 }, { "entropy": 5.722853660583496, "epoch": 1.5706364209199748, "grad_norm": 1.390625, "learning_rate": 0.00047550093399687936, "loss": 5.3804, "mean_token_accuracy": 0.16804203689098357, "num_tokens": 34480468.0, "step": 18695 }, { "entropy": 5.763798809051513, "epoch": 1.5710565007351396, "grad_norm": 1.40625, "learning_rate": 0.0004754873431649997, "loss": 5.4243, "mean_token_accuracy": 0.16598083227872848, "num_tokens": 34490299.0, "step": 18700 }, { "entropy": 5.668231630325318, "epoch": 1.5714765805503046, "grad_norm": 1.4453125, "learning_rate": 0.00047547374878159003, "loss": 5.4338, "mean_token_accuracy": 0.1664573848247528, "num_tokens": 34498831.0, "step": 18705 }, { "entropy": 5.675115299224854, "epoch": 1.5718966603654696, "grad_norm": 1.4453125, "learning_rate": 0.0004754601508468911, "loss": 5.4249, "mean_token_accuracy": 0.16958544850349427, "num_tokens": 34508048.0, "step": 18710 }, { "entropy": 5.676489400863647, "epoch": 1.5723167401806344, "grad_norm": 1.5703125, "learning_rate": 0.0004754465493611438, "loss": 5.5091, "mean_token_accuracy": 0.16318027675151825, "num_tokens": 34517070.0, "step": 18715 }, { "entropy": 5.6339551448822025, "epoch": 1.5727368199957992, "grad_norm": 1.84375, "learning_rate": 0.00047543294432458904, "loss": 5.2937, "mean_token_accuracy": 0.1759590983390808, "num_tokens": 34525934.0, "step": 18720 }, { "entropy": 5.744489860534668, "epoch": 1.573156899810964, "grad_norm": 1.4296875, "learning_rate": 0.000475419335737468, "loss": 5.5149, "mean_token_accuracy": 0.16737214624881744, "num_tokens": 34534222.0, "step": 18725 }, { "entropy": 5.750201940536499, "epoch": 1.573576979626129, "grad_norm": 2.8125, "learning_rate": 0.00047540572360002157, "loss": 5.4944, "mean_token_accuracy": 0.16553839445114135, "num_tokens": 34543291.0, "step": 18730 }, { "entropy": 5.757966184616089, "epoch": 1.573997059441294, "grad_norm": 1.578125, "learning_rate": 0.00047539210791249095, "loss": 5.363, "mean_token_accuracy": 0.17250452637672425, "num_tokens": 34552383.0, "step": 18735 }, { "entropy": 5.687971353530884, "epoch": 1.5744171392564588, "grad_norm": 1.5078125, "learning_rate": 0.0004753784886751173, "loss": 5.3368, "mean_token_accuracy": 0.1798310786485672, "num_tokens": 34560311.0, "step": 18740 }, { "entropy": 5.587876176834106, "epoch": 1.5748372190716236, "grad_norm": 1.59375, "learning_rate": 0.0004753648658881419, "loss": 5.3912, "mean_token_accuracy": 0.17629951983690262, "num_tokens": 34569903.0, "step": 18745 }, { "entropy": 5.642320442199707, "epoch": 1.5752572988867883, "grad_norm": 1.4140625, "learning_rate": 0.00047535123955180607, "loss": 5.4037, "mean_token_accuracy": 0.16801706254482268, "num_tokens": 34579735.0, "step": 18750 }, { "entropy": 5.7766814708709715, "epoch": 1.5756773787019533, "grad_norm": 1.40625, "learning_rate": 0.0004753376096663512, "loss": 5.4316, "mean_token_accuracy": 0.16776171922683716, "num_tokens": 34589105.0, "step": 18755 }, { "entropy": 5.625161170959473, "epoch": 1.5760974585171184, "grad_norm": 1.6015625, "learning_rate": 0.00047532397623201877, "loss": 5.3705, "mean_token_accuracy": 0.17527176439762115, "num_tokens": 34597883.0, "step": 18760 }, { "entropy": 5.675880050659179, "epoch": 1.5765175383322831, "grad_norm": 1.453125, "learning_rate": 0.00047531033924905024, "loss": 5.3506, "mean_token_accuracy": 0.17240157425403596, "num_tokens": 34606666.0, "step": 18765 }, { "entropy": 5.746255779266358, "epoch": 1.576937618147448, "grad_norm": 1.3984375, "learning_rate": 0.0004752966987176873, "loss": 5.4819, "mean_token_accuracy": 0.16786147505044938, "num_tokens": 34616547.0, "step": 18770 }, { "entropy": 5.713323879241943, "epoch": 1.577357697962613, "grad_norm": 1.5703125, "learning_rate": 0.0004752830546381713, "loss": 5.4497, "mean_token_accuracy": 0.16839058697223663, "num_tokens": 34625679.0, "step": 18775 }, { "entropy": 5.610950660705567, "epoch": 1.5777777777777777, "grad_norm": 1.7734375, "learning_rate": 0.0004752694070107442, "loss": 5.3817, "mean_token_accuracy": 0.1739755392074585, "num_tokens": 34635633.0, "step": 18780 }, { "entropy": 5.7086036682128904, "epoch": 1.5781978575929427, "grad_norm": 1.5, "learning_rate": 0.0004752557558356476, "loss": 5.4156, "mean_token_accuracy": 0.17332434356212617, "num_tokens": 34645206.0, "step": 18785 }, { "entropy": 5.656038665771485, "epoch": 1.5786179374081075, "grad_norm": 1.4765625, "learning_rate": 0.0004752421011131234, "loss": 5.4342, "mean_token_accuracy": 0.16186445355415344, "num_tokens": 34653884.0, "step": 18790 }, { "entropy": 5.606300926208496, "epoch": 1.5790380172232723, "grad_norm": 1.5390625, "learning_rate": 0.00047522844284341364, "loss": 5.2898, "mean_token_accuracy": 0.17618423253297805, "num_tokens": 34662170.0, "step": 18795 }, { "entropy": 5.66893949508667, "epoch": 1.5794580970384373, "grad_norm": 1.6875, "learning_rate": 0.0004752147810267601, "loss": 5.4433, "mean_token_accuracy": 0.16510264128446578, "num_tokens": 34672548.0, "step": 18800 }, { "entropy": 5.760573959350586, "epoch": 1.5798781768536023, "grad_norm": 1.453125, "learning_rate": 0.00047520111566340465, "loss": 5.4323, "mean_token_accuracy": 0.1679047629237175, "num_tokens": 34680972.0, "step": 18805 }, { "entropy": 5.643776369094849, "epoch": 1.580298256668767, "grad_norm": 1.7265625, "learning_rate": 0.00047518744675358965, "loss": 5.3027, "mean_token_accuracy": 0.17184915244579316, "num_tokens": 34689589.0, "step": 18810 }, { "entropy": 5.626055669784546, "epoch": 1.580718336483932, "grad_norm": 1.453125, "learning_rate": 0.0004751737742975571, "loss": 5.338, "mean_token_accuracy": 0.17203721702098845, "num_tokens": 34698747.0, "step": 18815 }, { "entropy": 5.675599765777588, "epoch": 1.5811384162990967, "grad_norm": 1.3828125, "learning_rate": 0.00047516009829554913, "loss": 5.4003, "mean_token_accuracy": 0.16775297075510026, "num_tokens": 34707502.0, "step": 18820 }, { "entropy": 5.608147096633911, "epoch": 1.5815584961142617, "grad_norm": 1.5234375, "learning_rate": 0.00047514641874780815, "loss": 5.3289, "mean_token_accuracy": 0.17193017303943633, "num_tokens": 34715879.0, "step": 18825 }, { "entropy": 5.649180126190186, "epoch": 1.5819785759294267, "grad_norm": 1.6171875, "learning_rate": 0.00047513273565457644, "loss": 5.5108, "mean_token_accuracy": 0.16794274374842644, "num_tokens": 34726090.0, "step": 18830 }, { "entropy": 5.7852592945098875, "epoch": 1.5823986557445915, "grad_norm": 1.6875, "learning_rate": 0.0004751190490160964, "loss": 5.4755, "mean_token_accuracy": 0.16427757740020751, "num_tokens": 34736014.0, "step": 18835 }, { "entropy": 5.749915409088135, "epoch": 1.5828187355597563, "grad_norm": 1.4375, "learning_rate": 0.00047510535883261035, "loss": 5.415, "mean_token_accuracy": 0.16692599207162856, "num_tokens": 34745648.0, "step": 18840 }, { "entropy": 5.649198770523071, "epoch": 1.5832388153749213, "grad_norm": 2.375, "learning_rate": 0.000475091665104361, "loss": 5.3967, "mean_token_accuracy": 0.17258985787630082, "num_tokens": 34753908.0, "step": 18845 }, { "entropy": 5.645108318328857, "epoch": 1.583658895190086, "grad_norm": 1.703125, "learning_rate": 0.0004750779678315908, "loss": 5.2509, "mean_token_accuracy": 0.17971468716859818, "num_tokens": 34762303.0, "step": 18850 }, { "entropy": 5.632398986816407, "epoch": 1.584078975005251, "grad_norm": 1.5703125, "learning_rate": 0.0004750642670145424, "loss": 5.4294, "mean_token_accuracy": 0.16685875207185746, "num_tokens": 34771463.0, "step": 18855 }, { "entropy": 5.779457092285156, "epoch": 1.5844990548204159, "grad_norm": 1.484375, "learning_rate": 0.0004750505626534585, "loss": 5.5146, "mean_token_accuracy": 0.16541918367147446, "num_tokens": 34780704.0, "step": 18860 }, { "entropy": 5.615437173843384, "epoch": 1.5849191346355807, "grad_norm": 1.5625, "learning_rate": 0.00047503685474858194, "loss": 5.3305, "mean_token_accuracy": 0.1751614198088646, "num_tokens": 34790262.0, "step": 18865 }, { "entropy": 5.691679000854492, "epoch": 1.5853392144507457, "grad_norm": 1.3828125, "learning_rate": 0.0004750231433001555, "loss": 5.3665, "mean_token_accuracy": 0.1725798651576042, "num_tokens": 34799450.0, "step": 18870 }, { "entropy": 5.732432460784912, "epoch": 1.5857592942659107, "grad_norm": 1.3984375, "learning_rate": 0.0004750094283084221, "loss": 5.4141, "mean_token_accuracy": 0.16283925771713256, "num_tokens": 34808220.0, "step": 18875 }, { "entropy": 5.716584873199463, "epoch": 1.5861793740810755, "grad_norm": 1.671875, "learning_rate": 0.00047499570977362467, "loss": 5.4334, "mean_token_accuracy": 0.16313114315271376, "num_tokens": 34817846.0, "step": 18880 }, { "entropy": 5.688366794586182, "epoch": 1.5865994538962402, "grad_norm": 1.90625, "learning_rate": 0.00047498198769600617, "loss": 5.4526, "mean_token_accuracy": 0.16976003497838973, "num_tokens": 34826962.0, "step": 18885 }, { "entropy": 5.637577390670776, "epoch": 1.587019533711405, "grad_norm": 1.5390625, "learning_rate": 0.0004749682620758097, "loss": 5.3876, "mean_token_accuracy": 0.1662908226251602, "num_tokens": 34837170.0, "step": 18890 }, { "entropy": 5.624025487899781, "epoch": 1.58743961352657, "grad_norm": 1.765625, "learning_rate": 0.00047495453291327854, "loss": 5.3856, "mean_token_accuracy": 0.17156262695789337, "num_tokens": 34845336.0, "step": 18895 }, { "entropy": 5.641190814971924, "epoch": 1.587859693341735, "grad_norm": 1.421875, "learning_rate": 0.00047494080020865577, "loss": 5.3634, "mean_token_accuracy": 0.17117148637771606, "num_tokens": 34854613.0, "step": 18900 }, { "entropy": 5.714927101135254, "epoch": 1.5882797731568998, "grad_norm": 1.6171875, "learning_rate": 0.0004749270639621846, "loss": 5.4495, "mean_token_accuracy": 0.16892678290605545, "num_tokens": 34864254.0, "step": 18905 }, { "entropy": 5.7336501121521, "epoch": 1.5886998529720646, "grad_norm": 1.4375, "learning_rate": 0.0004749133241741085, "loss": 5.4825, "mean_token_accuracy": 0.1654273435473442, "num_tokens": 34874380.0, "step": 18910 }, { "entropy": 5.716697835922242, "epoch": 1.5891199327872296, "grad_norm": 1.75, "learning_rate": 0.0004748995808446708, "loss": 5.4443, "mean_token_accuracy": 0.16573767066001893, "num_tokens": 34883688.0, "step": 18915 }, { "entropy": 5.658730459213257, "epoch": 1.5895400126023944, "grad_norm": 1.4765625, "learning_rate": 0.00047488583397411495, "loss": 5.3102, "mean_token_accuracy": 0.17554232925176622, "num_tokens": 34892831.0, "step": 18920 }, { "entropy": 5.709734773635864, "epoch": 1.5899600924175594, "grad_norm": 1.4296875, "learning_rate": 0.00047487208356268454, "loss": 5.4004, "mean_token_accuracy": 0.17941274642944335, "num_tokens": 34901517.0, "step": 18925 }, { "entropy": 5.688491916656494, "epoch": 1.5903801722327242, "grad_norm": 1.46875, "learning_rate": 0.00047485832961062296, "loss": 5.4002, "mean_token_accuracy": 0.17023382037878038, "num_tokens": 34910765.0, "step": 18930 }, { "entropy": 5.723994779586792, "epoch": 1.590800252047889, "grad_norm": 1.5390625, "learning_rate": 0.00047484457211817405, "loss": 5.4441, "mean_token_accuracy": 0.16038562953472138, "num_tokens": 34919799.0, "step": 18935 }, { "entropy": 5.630226898193359, "epoch": 1.591220331863054, "grad_norm": 1.4921875, "learning_rate": 0.00047483081108558143, "loss": 5.3115, "mean_token_accuracy": 0.17336263954639436, "num_tokens": 34928199.0, "step": 18940 }, { "entropy": 5.682058715820313, "epoch": 1.591640411678219, "grad_norm": 1.59375, "learning_rate": 0.000474817046513089, "loss": 5.4412, "mean_token_accuracy": 0.16989699453115464, "num_tokens": 34937751.0, "step": 18945 }, { "entropy": 5.724739217758179, "epoch": 1.5920604914933838, "grad_norm": 1.4609375, "learning_rate": 0.0004748032784009403, "loss": 5.3858, "mean_token_accuracy": 0.17437688410282134, "num_tokens": 34946052.0, "step": 18950 }, { "entropy": 5.651232576370239, "epoch": 1.5924805713085486, "grad_norm": 1.3984375, "learning_rate": 0.0004747895067493796, "loss": 5.3793, "mean_token_accuracy": 0.1674926221370697, "num_tokens": 34954932.0, "step": 18955 }, { "entropy": 5.675562763214112, "epoch": 1.5929006511237134, "grad_norm": 1.53125, "learning_rate": 0.0004747757315586505, "loss": 5.3688, "mean_token_accuracy": 0.17305743098258972, "num_tokens": 34963581.0, "step": 18960 }, { "entropy": 5.539657783508301, "epoch": 1.5933207309388784, "grad_norm": 1.5, "learning_rate": 0.00047476195282899727, "loss": 5.1861, "mean_token_accuracy": 0.18181020617485047, "num_tokens": 34972844.0, "step": 18965 }, { "entropy": 5.623536205291748, "epoch": 1.5937408107540434, "grad_norm": 1.65625, "learning_rate": 0.00047474817056066383, "loss": 5.396, "mean_token_accuracy": 0.176412869989872, "num_tokens": 34981998.0, "step": 18970 }, { "entropy": 5.595731449127197, "epoch": 1.5941608905692082, "grad_norm": 1.265625, "learning_rate": 0.00047473438475389453, "loss": 5.3263, "mean_token_accuracy": 0.17470391392707824, "num_tokens": 34990552.0, "step": 18975 }, { "entropy": 5.687963628768921, "epoch": 1.594580970384373, "grad_norm": 1.8671875, "learning_rate": 0.0004747205954089333, "loss": 5.3401, "mean_token_accuracy": 0.17572322934865953, "num_tokens": 35000259.0, "step": 18980 }, { "entropy": 5.716721391677856, "epoch": 1.5950010501995378, "grad_norm": 1.640625, "learning_rate": 0.0004747068025260247, "loss": 5.4253, "mean_token_accuracy": 0.16249436065554618, "num_tokens": 35009592.0, "step": 18985 }, { "entropy": 5.700528287887574, "epoch": 1.5954211300147028, "grad_norm": 1.5859375, "learning_rate": 0.0004746930061054129, "loss": 5.4772, "mean_token_accuracy": 0.15898309648036957, "num_tokens": 35019356.0, "step": 18990 }, { "entropy": 5.6559325695037845, "epoch": 1.5958412098298678, "grad_norm": 1.546875, "learning_rate": 0.00047467920614734224, "loss": 5.3952, "mean_token_accuracy": 0.17310373932123185, "num_tokens": 35028764.0, "step": 18995 }, { "entropy": 5.6906005859375, "epoch": 1.5962612896450326, "grad_norm": 1.296875, "learning_rate": 0.0004746654026520573, "loss": 5.4045, "mean_token_accuracy": 0.16763416677713394, "num_tokens": 35037903.0, "step": 19000 }, { "entropy": 5.642781209945679, "epoch": 1.5966813694601973, "grad_norm": 1.3984375, "learning_rate": 0.0004746515956198026, "loss": 5.3038, "mean_token_accuracy": 0.17678880393505098, "num_tokens": 35046326.0, "step": 19005 }, { "entropy": 5.741660451889038, "epoch": 1.5971014492753624, "grad_norm": 1.8359375, "learning_rate": 0.00047463778505082266, "loss": 5.5384, "mean_token_accuracy": 0.16487176418304444, "num_tokens": 35055551.0, "step": 19010 }, { "entropy": 5.641852474212646, "epoch": 1.5975215290905274, "grad_norm": 1.5234375, "learning_rate": 0.0004746239709453621, "loss": 5.3079, "mean_token_accuracy": 0.18089368045330048, "num_tokens": 35065595.0, "step": 19015 }, { "entropy": 5.710475492477417, "epoch": 1.5979416089056921, "grad_norm": 1.40625, "learning_rate": 0.0004746101533036658, "loss": 5.4167, "mean_token_accuracy": 0.16984072029590608, "num_tokens": 35075097.0, "step": 19020 }, { "entropy": 5.825159311294556, "epoch": 1.598361688720857, "grad_norm": 1.765625, "learning_rate": 0.00047459633212597834, "loss": 5.5007, "mean_token_accuracy": 0.16182542145252227, "num_tokens": 35084092.0, "step": 19025 }, { "entropy": 5.685335683822632, "epoch": 1.5987817685360217, "grad_norm": 1.515625, "learning_rate": 0.0004745825074125447, "loss": 5.3897, "mean_token_accuracy": 0.16710626929998398, "num_tokens": 35093007.0, "step": 19030 }, { "entropy": 5.754900789260864, "epoch": 1.5992018483511867, "grad_norm": 1.40625, "learning_rate": 0.0004745686791636097, "loss": 5.4559, "mean_token_accuracy": 0.16395678967237473, "num_tokens": 35103094.0, "step": 19035 }, { "entropy": 5.639309453964233, "epoch": 1.5996219281663517, "grad_norm": 1.4140625, "learning_rate": 0.00047455484737941823, "loss": 5.3045, "mean_token_accuracy": 0.17383471876382828, "num_tokens": 35112561.0, "step": 19040 }, { "entropy": 5.610976266860962, "epoch": 1.6000420079815165, "grad_norm": 1.3828125, "learning_rate": 0.0004745410120602155, "loss": 5.3837, "mean_token_accuracy": 0.16612301766872406, "num_tokens": 35121718.0, "step": 19045 }, { "entropy": 5.7062891006469725, "epoch": 1.6004620877966813, "grad_norm": 1.40625, "learning_rate": 0.00047452717320624647, "loss": 5.344, "mean_token_accuracy": 0.18142815828323364, "num_tokens": 35130073.0, "step": 19050 }, { "entropy": 5.670109796524048, "epoch": 1.600882167611846, "grad_norm": 1.7421875, "learning_rate": 0.0004745133308177562, "loss": 5.3913, "mean_token_accuracy": 0.16597676426172256, "num_tokens": 35138876.0, "step": 19055 }, { "entropy": 5.669570541381836, "epoch": 1.601302247427011, "grad_norm": 1.5703125, "learning_rate": 0.00047449948489499007, "loss": 5.381, "mean_token_accuracy": 0.1685373529791832, "num_tokens": 35147750.0, "step": 19060 }, { "entropy": 5.678817701339722, "epoch": 1.6017223272421761, "grad_norm": 1.6328125, "learning_rate": 0.00047448563543819335, "loss": 5.4017, "mean_token_accuracy": 0.17186661213636398, "num_tokens": 35156955.0, "step": 19065 }, { "entropy": 5.661539745330811, "epoch": 1.602142407057341, "grad_norm": 1.859375, "learning_rate": 0.0004744717824476112, "loss": 5.4264, "mean_token_accuracy": 0.16969927847385408, "num_tokens": 35166542.0, "step": 19070 }, { "entropy": 5.707697916030884, "epoch": 1.6025624868725057, "grad_norm": 1.7578125, "learning_rate": 0.00047445792592348926, "loss": 5.3853, "mean_token_accuracy": 0.16943657100200654, "num_tokens": 35175258.0, "step": 19075 }, { "entropy": 5.701454114913941, "epoch": 1.6029825666876707, "grad_norm": 1.875, "learning_rate": 0.0004744440658660729, "loss": 5.3865, "mean_token_accuracy": 0.16605425924062728, "num_tokens": 35184970.0, "step": 19080 }, { "entropy": 5.687052440643311, "epoch": 1.6034026465028357, "grad_norm": 1.53125, "learning_rate": 0.0004744302022756075, "loss": 5.3784, "mean_token_accuracy": 0.16496190279722214, "num_tokens": 35193948.0, "step": 19085 }, { "entropy": 5.577232599258423, "epoch": 1.6038227263180005, "grad_norm": 1.59375, "learning_rate": 0.00047441633515233874, "loss": 5.3198, "mean_token_accuracy": 0.17375623136758805, "num_tokens": 35203792.0, "step": 19090 }, { "entropy": 5.670841121673584, "epoch": 1.6042428061331653, "grad_norm": 1.515625, "learning_rate": 0.0004744024644965123, "loss": 5.4944, "mean_token_accuracy": 0.16568351536989212, "num_tokens": 35212684.0, "step": 19095 }, { "entropy": 5.673999786376953, "epoch": 1.60466288594833, "grad_norm": 1.4453125, "learning_rate": 0.00047438859030837397, "loss": 5.2946, "mean_token_accuracy": 0.17858032286167144, "num_tokens": 35220830.0, "step": 19100 }, { "entropy": 5.708344316482544, "epoch": 1.605082965763495, "grad_norm": 1.4140625, "learning_rate": 0.00047437471258816936, "loss": 5.3833, "mean_token_accuracy": 0.16468634456396103, "num_tokens": 35230171.0, "step": 19105 }, { "entropy": 5.619188070297241, "epoch": 1.60550304557866, "grad_norm": 1.640625, "learning_rate": 0.00047436083133614446, "loss": 5.3073, "mean_token_accuracy": 0.17591052502393723, "num_tokens": 35239022.0, "step": 19110 }, { "entropy": 5.629873466491699, "epoch": 1.6059231253938249, "grad_norm": 1.3046875, "learning_rate": 0.00047434694655254495, "loss": 5.3297, "mean_token_accuracy": 0.16770700961351395, "num_tokens": 35247564.0, "step": 19115 }, { "entropy": 5.6350812911987305, "epoch": 1.6063432052089897, "grad_norm": 1.4140625, "learning_rate": 0.000474333058237617, "loss": 5.3529, "mean_token_accuracy": 0.16446800380945206, "num_tokens": 35256175.0, "step": 19120 }, { "entropy": 5.780952882766724, "epoch": 1.6067632850241544, "grad_norm": 1.5625, "learning_rate": 0.00047431916639160656, "loss": 5.5043, "mean_token_accuracy": 0.1661346063017845, "num_tokens": 35265278.0, "step": 19125 }, { "entropy": 5.603296756744385, "epoch": 1.6071833648393195, "grad_norm": 1.5078125, "learning_rate": 0.0004743052710147598, "loss": 5.2283, "mean_token_accuracy": 0.1780938133597374, "num_tokens": 35274715.0, "step": 19130 }, { "entropy": 5.574432277679444, "epoch": 1.6076034446544845, "grad_norm": 1.5859375, "learning_rate": 0.00047429137210732266, "loss": 5.3431, "mean_token_accuracy": 0.1689825624227524, "num_tokens": 35285450.0, "step": 19135 }, { "entropy": 5.659537506103516, "epoch": 1.6080235244696492, "grad_norm": 1.3515625, "learning_rate": 0.0004742774696695415, "loss": 5.3553, "mean_token_accuracy": 0.1621303752064705, "num_tokens": 35294531.0, "step": 19140 }, { "entropy": 5.693420028686523, "epoch": 1.608443604284814, "grad_norm": 1.421875, "learning_rate": 0.00047426356370166266, "loss": 5.4104, "mean_token_accuracy": 0.16336591690778732, "num_tokens": 35303749.0, "step": 19145 }, { "entropy": 5.59863772392273, "epoch": 1.608863684099979, "grad_norm": 1.390625, "learning_rate": 0.0004742496542039324, "loss": 5.3695, "mean_token_accuracy": 0.16599306017160415, "num_tokens": 35312994.0, "step": 19150 }, { "entropy": 5.656160926818847, "epoch": 1.6092837639151438, "grad_norm": 1.453125, "learning_rate": 0.00047423574117659703, "loss": 5.3488, "mean_token_accuracy": 0.1693723350763321, "num_tokens": 35322533.0, "step": 19155 }, { "entropy": 5.681179428100586, "epoch": 1.6097038437303088, "grad_norm": 2.453125, "learning_rate": 0.00047422182461990316, "loss": 5.3872, "mean_token_accuracy": 0.1734430029988289, "num_tokens": 35331872.0, "step": 19160 }, { "entropy": 5.643349313735962, "epoch": 1.6101239235454736, "grad_norm": 1.7890625, "learning_rate": 0.00047420790453409724, "loss": 5.4206, "mean_token_accuracy": 0.16745028495788575, "num_tokens": 35341517.0, "step": 19165 }, { "entropy": 5.632366943359375, "epoch": 1.6105440033606384, "grad_norm": 1.4453125, "learning_rate": 0.0004741939809194258, "loss": 5.3309, "mean_token_accuracy": 0.176885287463665, "num_tokens": 35350291.0, "step": 19170 }, { "entropy": 5.727736234664917, "epoch": 1.6109640831758034, "grad_norm": 1.4453125, "learning_rate": 0.00047418005377613566, "loss": 5.499, "mean_token_accuracy": 0.1620399162173271, "num_tokens": 35360711.0, "step": 19175 }, { "entropy": 5.703640460968018, "epoch": 1.6113841629909684, "grad_norm": 1.46875, "learning_rate": 0.0004741661231044733, "loss": 5.3995, "mean_token_accuracy": 0.1704120382666588, "num_tokens": 35370069.0, "step": 19180 }, { "entropy": 5.749680423736573, "epoch": 1.6118042428061332, "grad_norm": 1.9375, "learning_rate": 0.00047415218890468577, "loss": 5.3856, "mean_token_accuracy": 0.18042093962430955, "num_tokens": 35380389.0, "step": 19185 }, { "entropy": 5.660278797149658, "epoch": 1.612224322621298, "grad_norm": 1.4296875, "learning_rate": 0.0004741382511770197, "loss": 5.3838, "mean_token_accuracy": 0.17036385387182235, "num_tokens": 35389420.0, "step": 19190 }, { "entropy": 5.662668371200562, "epoch": 1.6126444024364628, "grad_norm": 1.7265625, "learning_rate": 0.00047412430992172205, "loss": 5.4823, "mean_token_accuracy": 0.15827725529670716, "num_tokens": 35399418.0, "step": 19195 }, { "entropy": 5.634368419647217, "epoch": 1.6130644822516278, "grad_norm": 1.7890625, "learning_rate": 0.00047411036513903974, "loss": 5.3616, "mean_token_accuracy": 0.17389402389526368, "num_tokens": 35408717.0, "step": 19200 }, { "entropy": 5.6884690284729, "epoch": 1.6134845620667928, "grad_norm": 1.625, "learning_rate": 0.00047409641682921987, "loss": 5.3188, "mean_token_accuracy": 0.18027044236660003, "num_tokens": 35417118.0, "step": 19205 }, { "entropy": 5.686248636245727, "epoch": 1.6139046418819576, "grad_norm": 1.703125, "learning_rate": 0.0004740824649925096, "loss": 5.4141, "mean_token_accuracy": 0.1654793232679367, "num_tokens": 35425526.0, "step": 19210 }, { "entropy": 5.595103168487549, "epoch": 1.6143247216971224, "grad_norm": 1.5234375, "learning_rate": 0.0004740685096291559, "loss": 5.4122, "mean_token_accuracy": 0.16647179573774337, "num_tokens": 35434932.0, "step": 19215 }, { "entropy": 5.725376129150391, "epoch": 1.6147448015122874, "grad_norm": 1.671875, "learning_rate": 0.00047405455073940597, "loss": 5.4364, "mean_token_accuracy": 0.16955055445432662, "num_tokens": 35443909.0, "step": 19220 }, { "entropy": 5.752730035781861, "epoch": 1.6151648813274522, "grad_norm": 1.46875, "learning_rate": 0.0004740405883235072, "loss": 5.4143, "mean_token_accuracy": 0.17224101722240448, "num_tokens": 35454082.0, "step": 19225 }, { "entropy": 5.780597686767578, "epoch": 1.6155849611426172, "grad_norm": 1.3125, "learning_rate": 0.00047402662238170694, "loss": 5.4702, "mean_token_accuracy": 0.16434868276119233, "num_tokens": 35464547.0, "step": 19230 }, { "entropy": 5.657827091217041, "epoch": 1.616005040957782, "grad_norm": 1.2890625, "learning_rate": 0.0004740126529142526, "loss": 5.3376, "mean_token_accuracy": 0.17347298115491866, "num_tokens": 35473310.0, "step": 19235 }, { "entropy": 5.602123212814331, "epoch": 1.6164251207729468, "grad_norm": 1.875, "learning_rate": 0.0004739986799213915, "loss": 5.4651, "mean_token_accuracy": 0.1707776516675949, "num_tokens": 35483502.0, "step": 19240 }, { "entropy": 5.694213247299194, "epoch": 1.6168452005881118, "grad_norm": 1.484375, "learning_rate": 0.0004739847034033713, "loss": 5.4299, "mean_token_accuracy": 0.16592200696468354, "num_tokens": 35493063.0, "step": 19245 }, { "entropy": 5.674246883392334, "epoch": 1.6172652804032768, "grad_norm": 1.6640625, "learning_rate": 0.00047397072336043957, "loss": 5.3847, "mean_token_accuracy": 0.1654440939426422, "num_tokens": 35501829.0, "step": 19250 }, { "entropy": 5.7208233833312985, "epoch": 1.6176853602184416, "grad_norm": 1.3046875, "learning_rate": 0.00047395673979284383, "loss": 5.4025, "mean_token_accuracy": 0.16252227872610092, "num_tokens": 35510411.0, "step": 19255 }, { "entropy": 5.695710945129394, "epoch": 1.6181054400336063, "grad_norm": 1.5546875, "learning_rate": 0.000473942752700832, "loss": 5.4071, "mean_token_accuracy": 0.168272402882576, "num_tokens": 35519571.0, "step": 19260 }, { "entropy": 5.633262681961059, "epoch": 1.6185255198487711, "grad_norm": 1.7109375, "learning_rate": 0.00047392876208465166, "loss": 5.3537, "mean_token_accuracy": 0.1690814658999443, "num_tokens": 35527306.0, "step": 19265 }, { "entropy": 5.6343008518219, "epoch": 1.6189455996639361, "grad_norm": 1.390625, "learning_rate": 0.0004739147679445508, "loss": 5.3577, "mean_token_accuracy": 0.1658302888274193, "num_tokens": 35536126.0, "step": 19270 }, { "entropy": 5.644708919525146, "epoch": 1.6193656794791011, "grad_norm": 1.6015625, "learning_rate": 0.0004739007702807773, "loss": 5.4217, "mean_token_accuracy": 0.16885081082582473, "num_tokens": 35545593.0, "step": 19275 }, { "entropy": 5.625165557861328, "epoch": 1.619785759294266, "grad_norm": 1.5625, "learning_rate": 0.00047388676909357894, "loss": 5.3437, "mean_token_accuracy": 0.1670317158102989, "num_tokens": 35554780.0, "step": 19280 }, { "entropy": 5.674202489852905, "epoch": 1.6202058391094307, "grad_norm": 1.4140625, "learning_rate": 0.00047387276438320394, "loss": 5.3462, "mean_token_accuracy": 0.17734202444553376, "num_tokens": 35562982.0, "step": 19285 }, { "entropy": 5.682125806808472, "epoch": 1.6206259189245955, "grad_norm": 1.4921875, "learning_rate": 0.0004738587561499003, "loss": 5.464, "mean_token_accuracy": 0.16998654305934907, "num_tokens": 35571528.0, "step": 19290 }, { "entropy": 5.594412136077881, "epoch": 1.6210459987397605, "grad_norm": 1.4140625, "learning_rate": 0.00047384474439391615, "loss": 5.2968, "mean_token_accuracy": 0.17942917197942734, "num_tokens": 35580386.0, "step": 19295 }, { "entropy": 5.609464263916015, "epoch": 1.6214660785549255, "grad_norm": 1.421875, "learning_rate": 0.0004738307291154998, "loss": 5.269, "mean_token_accuracy": 0.16951826214790344, "num_tokens": 35589456.0, "step": 19300 }, { "entropy": 5.646043395996093, "epoch": 1.6218861583700903, "grad_norm": 1.53125, "learning_rate": 0.0004738167103148995, "loss": 5.3687, "mean_token_accuracy": 0.17219835072755812, "num_tokens": 35598116.0, "step": 19305 }, { "entropy": 5.676636123657227, "epoch": 1.622306238185255, "grad_norm": 1.5625, "learning_rate": 0.00047380268799236355, "loss": 5.3641, "mean_token_accuracy": 0.16999810189008713, "num_tokens": 35606481.0, "step": 19310 }, { "entropy": 5.633781385421753, "epoch": 1.62272631800042, "grad_norm": 1.40625, "learning_rate": 0.00047378866214814024, "loss": 5.3475, "mean_token_accuracy": 0.16768400371074677, "num_tokens": 35615517.0, "step": 19315 }, { "entropy": 5.662630224227906, "epoch": 1.6231463978155851, "grad_norm": 1.4296875, "learning_rate": 0.00047377463278247827, "loss": 5.4018, "mean_token_accuracy": 0.1614094376564026, "num_tokens": 35625100.0, "step": 19320 }, { "entropy": 5.698197555541992, "epoch": 1.62356647763075, "grad_norm": 1.6328125, "learning_rate": 0.000473760599895626, "loss": 5.3197, "mean_token_accuracy": 0.16777887046337128, "num_tokens": 35634572.0, "step": 19325 }, { "entropy": 5.671027040481567, "epoch": 1.6239865574459147, "grad_norm": 1.4375, "learning_rate": 0.000473746563487832, "loss": 5.3585, "mean_token_accuracy": 0.1732994943857193, "num_tokens": 35643883.0, "step": 19330 }, { "entropy": 5.641132640838623, "epoch": 1.6244066372610795, "grad_norm": 1.4375, "learning_rate": 0.00047373252355934506, "loss": 5.4252, "mean_token_accuracy": 0.16886914223432542, "num_tokens": 35652527.0, "step": 19335 }, { "entropy": 5.691527080535889, "epoch": 1.6248267170762445, "grad_norm": 1.4765625, "learning_rate": 0.00047371848011041375, "loss": 5.4632, "mean_token_accuracy": 0.16798330396413802, "num_tokens": 35662436.0, "step": 19340 }, { "entropy": 5.699794816970825, "epoch": 1.6252467968914095, "grad_norm": 1.3125, "learning_rate": 0.00047370443314128687, "loss": 5.3483, "mean_token_accuracy": 0.17165588736534118, "num_tokens": 35672302.0, "step": 19345 }, { "entropy": 5.659704780578613, "epoch": 1.6256668767065743, "grad_norm": 1.34375, "learning_rate": 0.0004736903826522132, "loss": 5.4101, "mean_token_accuracy": 0.16816721260547637, "num_tokens": 35680852.0, "step": 19350 }, { "entropy": 5.6761833190917965, "epoch": 1.626086956521739, "grad_norm": 1.5234375, "learning_rate": 0.0004736763286434419, "loss": 5.3811, "mean_token_accuracy": 0.17145880460739135, "num_tokens": 35690159.0, "step": 19355 }, { "entropy": 5.622335624694824, "epoch": 1.6265070363369039, "grad_norm": 1.4921875, "learning_rate": 0.0004736622711152216, "loss": 5.3144, "mean_token_accuracy": 0.17438797056674957, "num_tokens": 35699165.0, "step": 19360 }, { "entropy": 5.680206346511841, "epoch": 1.6269271161520689, "grad_norm": 1.5859375, "learning_rate": 0.0004736482100678015, "loss": 5.379, "mean_token_accuracy": 0.17168426364660264, "num_tokens": 35708910.0, "step": 19365 }, { "entropy": 5.680268287658691, "epoch": 1.6273471959672339, "grad_norm": 1.34375, "learning_rate": 0.00047363414550143063, "loss": 5.4539, "mean_token_accuracy": 0.16627233028411864, "num_tokens": 35718218.0, "step": 19370 }, { "entropy": 5.661238050460815, "epoch": 1.6277672757823987, "grad_norm": 1.484375, "learning_rate": 0.00047362007741635816, "loss": 5.3692, "mean_token_accuracy": 0.17138148248195648, "num_tokens": 35727076.0, "step": 19375 }, { "entropy": 5.655786752700806, "epoch": 1.6281873555975634, "grad_norm": 1.53125, "learning_rate": 0.0004736060058128333, "loss": 5.4598, "mean_token_accuracy": 0.1673205927014351, "num_tokens": 35736316.0, "step": 19380 }, { "entropy": 5.689300918579102, "epoch": 1.6286074354127285, "grad_norm": 1.4296875, "learning_rate": 0.00047359193069110533, "loss": 5.4293, "mean_token_accuracy": 0.17298100590705873, "num_tokens": 35745747.0, "step": 19385 }, { "entropy": 5.791736125946045, "epoch": 1.6290275152278935, "grad_norm": 1.46875, "learning_rate": 0.00047357785205142354, "loss": 5.3922, "mean_token_accuracy": 0.17255930006504058, "num_tokens": 35754825.0, "step": 19390 }, { "entropy": 5.630894136428833, "epoch": 1.6294475950430583, "grad_norm": 1.4453125, "learning_rate": 0.0004735637698940374, "loss": 5.3536, "mean_token_accuracy": 0.17112387716770172, "num_tokens": 35764504.0, "step": 19395 }, { "entropy": 5.721408700942993, "epoch": 1.629867674858223, "grad_norm": 1.828125, "learning_rate": 0.0004735496842191963, "loss": 5.4416, "mean_token_accuracy": 0.17230593860149385, "num_tokens": 35774195.0, "step": 19400 }, { "entropy": 5.609949207305908, "epoch": 1.6302877546733878, "grad_norm": 1.359375, "learning_rate": 0.00047353559502714976, "loss": 5.3104, "mean_token_accuracy": 0.1747656896710396, "num_tokens": 35783721.0, "step": 19405 }, { "entropy": 5.641864967346192, "epoch": 1.6307078344885528, "grad_norm": 1.6796875, "learning_rate": 0.0004735215023181474, "loss": 5.3991, "mean_token_accuracy": 0.16826074570417404, "num_tokens": 35792821.0, "step": 19410 }, { "entropy": 5.676604318618774, "epoch": 1.6311279143037178, "grad_norm": 1.390625, "learning_rate": 0.00047350740609243883, "loss": 5.4285, "mean_token_accuracy": 0.1649575188755989, "num_tokens": 35802746.0, "step": 19415 }, { "entropy": 5.721334552764892, "epoch": 1.6315479941188826, "grad_norm": 1.5625, "learning_rate": 0.0004734933063502738, "loss": 5.421, "mean_token_accuracy": 0.17509810924530028, "num_tokens": 35811196.0, "step": 19420 }, { "entropy": 5.818255996704101, "epoch": 1.6319680739340474, "grad_norm": 1.5859375, "learning_rate": 0.00047347920309190203, "loss": 5.4471, "mean_token_accuracy": 0.16493862569332124, "num_tokens": 35820787.0, "step": 19425 }, { "entropy": 5.703247213363648, "epoch": 1.6323881537492122, "grad_norm": 1.578125, "learning_rate": 0.0004734650963175734, "loss": 5.4246, "mean_token_accuracy": 0.16639426350593567, "num_tokens": 35831247.0, "step": 19430 }, { "entropy": 5.648799848556519, "epoch": 1.6328082335643772, "grad_norm": 1.515625, "learning_rate": 0.00047345098602753777, "loss": 5.4563, "mean_token_accuracy": 0.16505984961986542, "num_tokens": 35840759.0, "step": 19435 }, { "entropy": 5.622422122955323, "epoch": 1.6332283133795422, "grad_norm": 1.5390625, "learning_rate": 0.0004734368722220451, "loss": 5.41, "mean_token_accuracy": 0.16521313637495041, "num_tokens": 35850137.0, "step": 19440 }, { "entropy": 5.628439140319824, "epoch": 1.633648393194707, "grad_norm": 1.421875, "learning_rate": 0.0004734227549013455, "loss": 5.2396, "mean_token_accuracy": 0.1794390082359314, "num_tokens": 35858412.0, "step": 19445 }, { "entropy": 5.655402612686157, "epoch": 1.6340684730098718, "grad_norm": 1.6875, "learning_rate": 0.0004734086340656889, "loss": 5.3312, "mean_token_accuracy": 0.1723542883992195, "num_tokens": 35868202.0, "step": 19450 }, { "entropy": 5.646328258514404, "epoch": 1.6344885528250368, "grad_norm": 1.4375, "learning_rate": 0.0004733945097153255, "loss": 5.4003, "mean_token_accuracy": 0.17372321784496308, "num_tokens": 35877237.0, "step": 19455 }, { "entropy": 5.608543586730957, "epoch": 1.6349086326402016, "grad_norm": 1.28125, "learning_rate": 0.0004733803818505055, "loss": 5.2715, "mean_token_accuracy": 0.1802636206150055, "num_tokens": 35887016.0, "step": 19460 }, { "entropy": 5.677346563339233, "epoch": 1.6353287124553666, "grad_norm": 1.265625, "learning_rate": 0.00047336625047147924, "loss": 5.3485, "mean_token_accuracy": 0.17663054317235946, "num_tokens": 35896393.0, "step": 19465 }, { "entropy": 5.643209791183471, "epoch": 1.6357487922705314, "grad_norm": 1.375, "learning_rate": 0.00047335211557849693, "loss": 5.3902, "mean_token_accuracy": 0.16930769830942155, "num_tokens": 35905237.0, "step": 19470 }, { "entropy": 5.671267795562744, "epoch": 1.6361688720856962, "grad_norm": 1.5078125, "learning_rate": 0.0004733379771718092, "loss": 5.4229, "mean_token_accuracy": 0.17023178488016127, "num_tokens": 35914352.0, "step": 19475 }, { "entropy": 5.692772483825683, "epoch": 1.6365889519008612, "grad_norm": 1.71875, "learning_rate": 0.0004733238352516661, "loss": 5.4805, "mean_token_accuracy": 0.16938166916370392, "num_tokens": 35923785.0, "step": 19480 }, { "entropy": 5.761615133285522, "epoch": 1.6370090317160262, "grad_norm": 1.4453125, "learning_rate": 0.00047330968981831856, "loss": 5.3858, "mean_token_accuracy": 0.16777340024709703, "num_tokens": 35932495.0, "step": 19485 }, { "entropy": 5.69402379989624, "epoch": 1.637429111531191, "grad_norm": 1.625, "learning_rate": 0.00047329554087201687, "loss": 5.351, "mean_token_accuracy": 0.17982448786497116, "num_tokens": 35941745.0, "step": 19490 }, { "entropy": 5.660278224945069, "epoch": 1.6378491913463558, "grad_norm": 1.6484375, "learning_rate": 0.00047328138841301186, "loss": 5.4418, "mean_token_accuracy": 0.16807905286550523, "num_tokens": 35950281.0, "step": 19495 }, { "entropy": 5.653802061080933, "epoch": 1.6382692711615205, "grad_norm": 1.3671875, "learning_rate": 0.0004732672324415541, "loss": 5.372, "mean_token_accuracy": 0.1754430741071701, "num_tokens": 35959531.0, "step": 19500 }, { "entropy": 5.73360242843628, "epoch": 1.6386893509766856, "grad_norm": 1.625, "learning_rate": 0.0004732530729578945, "loss": 5.4361, "mean_token_accuracy": 0.17509964853525162, "num_tokens": 35969462.0, "step": 19505 }, { "entropy": 5.659942388534546, "epoch": 1.6391094307918506, "grad_norm": 1.40625, "learning_rate": 0.0004732389099622837, "loss": 5.411, "mean_token_accuracy": 0.16947837471961974, "num_tokens": 35978022.0, "step": 19510 }, { "entropy": 5.7105179786682125, "epoch": 1.6395295106070154, "grad_norm": 1.453125, "learning_rate": 0.00047322474345497267, "loss": 5.4246, "mean_token_accuracy": 0.16419751197099686, "num_tokens": 35988193.0, "step": 19515 }, { "entropy": 5.762126207351685, "epoch": 1.6399495904221801, "grad_norm": 1.421875, "learning_rate": 0.00047321057343621247, "loss": 5.4216, "mean_token_accuracy": 0.16807464212179185, "num_tokens": 35997404.0, "step": 19520 }, { "entropy": 5.6027778625488285, "epoch": 1.6403696702373451, "grad_norm": 1.3359375, "learning_rate": 0.00047319639990625395, "loss": 5.3067, "mean_token_accuracy": 0.1780134305357933, "num_tokens": 36005356.0, "step": 19525 }, { "entropy": 5.74001407623291, "epoch": 1.64078975005251, "grad_norm": 1.8125, "learning_rate": 0.00047318222286534824, "loss": 5.58, "mean_token_accuracy": 0.16051921397447586, "num_tokens": 36015305.0, "step": 19530 }, { "entropy": 5.77122483253479, "epoch": 1.641209829867675, "grad_norm": 1.5625, "learning_rate": 0.00047316804231374663, "loss": 5.4209, "mean_token_accuracy": 0.1640459731221199, "num_tokens": 36024278.0, "step": 19535 }, { "entropy": 5.6274620532989506, "epoch": 1.6416299096828397, "grad_norm": 1.40625, "learning_rate": 0.0004731538582517001, "loss": 5.2479, "mean_token_accuracy": 0.17768406867980957, "num_tokens": 36032870.0, "step": 19540 }, { "entropy": 5.569975423812866, "epoch": 1.6420499894980045, "grad_norm": 1.59375, "learning_rate": 0.00047313967067945996, "loss": 5.2931, "mean_token_accuracy": 0.17938766926527022, "num_tokens": 36041725.0, "step": 19545 }, { "entropy": 5.649091005325317, "epoch": 1.6424700693131695, "grad_norm": 1.59375, "learning_rate": 0.0004731254795972777, "loss": 5.423, "mean_token_accuracy": 0.16832873672246934, "num_tokens": 36050929.0, "step": 19550 }, { "entropy": 5.709831714630127, "epoch": 1.6428901491283345, "grad_norm": 1.421875, "learning_rate": 0.0004731112850054045, "loss": 5.4119, "mean_token_accuracy": 0.16599251627922057, "num_tokens": 36060059.0, "step": 19555 }, { "entropy": 5.649776840209961, "epoch": 1.6433102289434993, "grad_norm": 1.7734375, "learning_rate": 0.0004730970869040919, "loss": 5.3525, "mean_token_accuracy": 0.18190265446901321, "num_tokens": 36069445.0, "step": 19560 }, { "entropy": 5.696929168701172, "epoch": 1.643730308758664, "grad_norm": 1.8125, "learning_rate": 0.00047308288529359147, "loss": 5.4943, "mean_token_accuracy": 0.16712310314178466, "num_tokens": 36079129.0, "step": 19565 }, { "entropy": 5.7188207626342775, "epoch": 1.644150388573829, "grad_norm": 1.375, "learning_rate": 0.0004730686801741547, "loss": 5.3679, "mean_token_accuracy": 0.17080006003379822, "num_tokens": 36088320.0, "step": 19570 }, { "entropy": 5.674493503570557, "epoch": 1.644570468388994, "grad_norm": 1.3671875, "learning_rate": 0.0004730544715460332, "loss": 5.4237, "mean_token_accuracy": 0.17072638422250747, "num_tokens": 36097728.0, "step": 19575 }, { "entropy": 5.724712228775024, "epoch": 1.644990548204159, "grad_norm": 1.703125, "learning_rate": 0.00047304025940947875, "loss": 5.4189, "mean_token_accuracy": 0.1723160296678543, "num_tokens": 36106566.0, "step": 19580 }, { "entropy": 5.699596214294433, "epoch": 1.6454106280193237, "grad_norm": 1.671875, "learning_rate": 0.00047302604376474306, "loss": 5.3691, "mean_token_accuracy": 0.16786410212516784, "num_tokens": 36115475.0, "step": 19585 }, { "entropy": 5.62215142250061, "epoch": 1.6458307078344885, "grad_norm": 1.3671875, "learning_rate": 0.00047301182461207807, "loss": 5.4812, "mean_token_accuracy": 0.17268287092447282, "num_tokens": 36124404.0, "step": 19590 }, { "entropy": 5.670156955718994, "epoch": 1.6462507876496533, "grad_norm": 1.59375, "learning_rate": 0.00047299760195173554, "loss": 5.3278, "mean_token_accuracy": 0.1758397027850151, "num_tokens": 36132987.0, "step": 19595 }, { "entropy": 5.701921844482422, "epoch": 1.6466708674648183, "grad_norm": 1.2421875, "learning_rate": 0.0004729833757839673, "loss": 5.4756, "mean_token_accuracy": 0.17378847897052765, "num_tokens": 36142163.0, "step": 19600 }, { "entropy": 5.707473468780518, "epoch": 1.6470909472799833, "grad_norm": 1.640625, "learning_rate": 0.00047296914610902565, "loss": 5.4488, "mean_token_accuracy": 0.16369751691818238, "num_tokens": 36152561.0, "step": 19605 }, { "entropy": 5.710807847976684, "epoch": 1.647511027095148, "grad_norm": 1.609375, "learning_rate": 0.00047295491292716245, "loss": 5.363, "mean_token_accuracy": 0.16720346361398697, "num_tokens": 36161877.0, "step": 19610 }, { "entropy": 5.670904731750488, "epoch": 1.6479311069103129, "grad_norm": 1.5390625, "learning_rate": 0.00047294067623862996, "loss": 5.3954, "mean_token_accuracy": 0.164234559237957, "num_tokens": 36171523.0, "step": 19615 }, { "entropy": 5.612199401855468, "epoch": 1.6483511867254779, "grad_norm": 1.6015625, "learning_rate": 0.00047292643604368025, "loss": 5.3371, "mean_token_accuracy": 0.1748445972800255, "num_tokens": 36180339.0, "step": 19620 }, { "entropy": 5.7124796390533445, "epoch": 1.6487712665406429, "grad_norm": 1.5703125, "learning_rate": 0.0004729121923425657, "loss": 5.4309, "mean_token_accuracy": 0.1659110963344574, "num_tokens": 36191584.0, "step": 19625 }, { "entropy": 5.788698005676269, "epoch": 1.6491913463558077, "grad_norm": 1.5, "learning_rate": 0.0004728979451355385, "loss": 5.4677, "mean_token_accuracy": 0.16967541128396987, "num_tokens": 36200738.0, "step": 19630 }, { "entropy": 5.621402883529663, "epoch": 1.6496114261709725, "grad_norm": 1.25, "learning_rate": 0.00047288369442285115, "loss": 5.2805, "mean_token_accuracy": 0.18398987352848054, "num_tokens": 36209394.0, "step": 19635 }, { "entropy": 5.628550434112549, "epoch": 1.6500315059861372, "grad_norm": 1.7578125, "learning_rate": 0.00047286944020475606, "loss": 5.4013, "mean_token_accuracy": 0.17032790631055833, "num_tokens": 36218268.0, "step": 19640 }, { "entropy": 5.638523435592651, "epoch": 1.6504515858013022, "grad_norm": 1.4765625, "learning_rate": 0.0004728551824815057, "loss": 5.3451, "mean_token_accuracy": 0.17313553392887115, "num_tokens": 36226974.0, "step": 19645 }, { "entropy": 5.580386114120484, "epoch": 1.6508716656164673, "grad_norm": 1.390625, "learning_rate": 0.00047284092125335277, "loss": 5.3191, "mean_token_accuracy": 0.1764894738793373, "num_tokens": 36235892.0, "step": 19650 }, { "entropy": 5.583609628677368, "epoch": 1.651291745431632, "grad_norm": 1.3984375, "learning_rate": 0.0004728266565205497, "loss": 5.3286, "mean_token_accuracy": 0.17261691987514496, "num_tokens": 36244750.0, "step": 19655 }, { "entropy": 5.665705299377441, "epoch": 1.6517118252467968, "grad_norm": 1.4140625, "learning_rate": 0.00047281238828334924, "loss": 5.3737, "mean_token_accuracy": 0.17210416346788407, "num_tokens": 36254902.0, "step": 19660 }, { "entropy": 5.684027051925659, "epoch": 1.6521319050619616, "grad_norm": 1.359375, "learning_rate": 0.0004727981165420042, "loss": 5.4264, "mean_token_accuracy": 0.16854705959558486, "num_tokens": 36265546.0, "step": 19665 }, { "entropy": 5.635334634780884, "epoch": 1.6525519848771266, "grad_norm": 1.359375, "learning_rate": 0.0004727838412967674, "loss": 5.3356, "mean_token_accuracy": 0.1739551231265068, "num_tokens": 36273978.0, "step": 19670 }, { "entropy": 5.694224214553833, "epoch": 1.6529720646922916, "grad_norm": 1.4296875, "learning_rate": 0.0004727695625478917, "loss": 5.3725, "mean_token_accuracy": 0.16794622987508773, "num_tokens": 36283117.0, "step": 19675 }, { "entropy": 5.7062092304229735, "epoch": 1.6533921445074564, "grad_norm": 1.6328125, "learning_rate": 0.00047275528029562996, "loss": 5.37, "mean_token_accuracy": 0.16468877643346785, "num_tokens": 36293031.0, "step": 19680 }, { "entropy": 5.597905492782592, "epoch": 1.6538122243226212, "grad_norm": 1.546875, "learning_rate": 0.00047274099454023535, "loss": 5.3618, "mean_token_accuracy": 0.1748396039009094, "num_tokens": 36302080.0, "step": 19685 }, { "entropy": 5.6517712593078615, "epoch": 1.6542323041377862, "grad_norm": 1.5546875, "learning_rate": 0.00047272670528196084, "loss": 5.389, "mean_token_accuracy": 0.1675845429301262, "num_tokens": 36311077.0, "step": 19690 }, { "entropy": 5.637048244476318, "epoch": 1.6546523839529512, "grad_norm": 1.5546875, "learning_rate": 0.0004727124125210595, "loss": 5.3213, "mean_token_accuracy": 0.1745500758290291, "num_tokens": 36320300.0, "step": 19695 }, { "entropy": 5.641404485702514, "epoch": 1.655072463768116, "grad_norm": 1.28125, "learning_rate": 0.00047269811625778456, "loss": 5.3872, "mean_token_accuracy": 0.17139033675193788, "num_tokens": 36330184.0, "step": 19700 }, { "entropy": 5.538795757293701, "epoch": 1.6554925435832808, "grad_norm": 1.1953125, "learning_rate": 0.0004726838164923893, "loss": 5.3895, "mean_token_accuracy": 0.16786455661058425, "num_tokens": 36339526.0, "step": 19705 }, { "entropy": 5.6508077621459964, "epoch": 1.6559126233984456, "grad_norm": 1.828125, "learning_rate": 0.00047266951322512716, "loss": 5.3813, "mean_token_accuracy": 0.1695254623889923, "num_tokens": 36348849.0, "step": 19710 }, { "entropy": 5.727986001968384, "epoch": 1.6563327032136106, "grad_norm": 1.3828125, "learning_rate": 0.00047265520645625123, "loss": 5.3911, "mean_token_accuracy": 0.1646333172917366, "num_tokens": 36358924.0, "step": 19715 }, { "entropy": 5.7471997261047365, "epoch": 1.6567527830287756, "grad_norm": 1.859375, "learning_rate": 0.00047264089618601513, "loss": 5.422, "mean_token_accuracy": 0.17060866355895996, "num_tokens": 36367130.0, "step": 19720 }, { "entropy": 5.6405291080474855, "epoch": 1.6571728628439404, "grad_norm": 1.40625, "learning_rate": 0.0004726265824146724, "loss": 5.3726, "mean_token_accuracy": 0.16610245555639266, "num_tokens": 36376575.0, "step": 19725 }, { "entropy": 5.561537742614746, "epoch": 1.6575929426591052, "grad_norm": 1.4921875, "learning_rate": 0.0004726122651424764, "loss": 5.3, "mean_token_accuracy": 0.1740986868739128, "num_tokens": 36385010.0, "step": 19730 }, { "entropy": 5.581302356719971, "epoch": 1.65801302247427, "grad_norm": 1.3515625, "learning_rate": 0.000472597944369681, "loss": 5.1033, "mean_token_accuracy": 0.18641779869794844, "num_tokens": 36393574.0, "step": 19735 }, { "entropy": 5.634199094772339, "epoch": 1.658433102289435, "grad_norm": 1.2421875, "learning_rate": 0.00047258362009653965, "loss": 5.3236, "mean_token_accuracy": 0.17412642389535904, "num_tokens": 36401992.0, "step": 19740 }, { "entropy": 5.673167896270752, "epoch": 1.6588531821046, "grad_norm": 1.2890625, "learning_rate": 0.00047256929232330624, "loss": 5.463, "mean_token_accuracy": 0.160048608481884, "num_tokens": 36411712.0, "step": 19745 }, { "entropy": 5.579254055023194, "epoch": 1.6592732619197648, "grad_norm": 1.21875, "learning_rate": 0.0004725549610502346, "loss": 5.2837, "mean_token_accuracy": 0.17299832701683043, "num_tokens": 36420240.0, "step": 19750 }, { "entropy": 5.632542705535888, "epoch": 1.6596933417349296, "grad_norm": 1.3984375, "learning_rate": 0.00047254062627757854, "loss": 5.4063, "mean_token_accuracy": 0.17789214998483657, "num_tokens": 36430068.0, "step": 19755 }, { "entropy": 5.690257835388183, "epoch": 1.6601134215500946, "grad_norm": 1.6875, "learning_rate": 0.000472526288005592, "loss": 5.4355, "mean_token_accuracy": 0.16823179572820662, "num_tokens": 36439808.0, "step": 19760 }, { "entropy": 5.611015462875367, "epoch": 1.6605335013652593, "grad_norm": 1.2109375, "learning_rate": 0.000472511946234529, "loss": 5.3956, "mean_token_accuracy": 0.17020961195230483, "num_tokens": 36449609.0, "step": 19765 }, { "entropy": 5.7615532875061035, "epoch": 1.6609535811804244, "grad_norm": 1.3046875, "learning_rate": 0.0004724976009646435, "loss": 5.3424, "mean_token_accuracy": 0.17360990196466447, "num_tokens": 36457700.0, "step": 19770 }, { "entropy": 5.669061231613159, "epoch": 1.6613736609955891, "grad_norm": 2.0625, "learning_rate": 0.0004724832521961897, "loss": 5.4023, "mean_token_accuracy": 0.17211264073848725, "num_tokens": 36466881.0, "step": 19775 }, { "entropy": 5.711100006103516, "epoch": 1.661793740810754, "grad_norm": 1.34375, "learning_rate": 0.00047246889992942187, "loss": 5.495, "mean_token_accuracy": 0.16188012808561325, "num_tokens": 36475433.0, "step": 19780 }, { "entropy": 5.68057951927185, "epoch": 1.662213820625919, "grad_norm": 1.3359375, "learning_rate": 0.0004724545441645941, "loss": 5.4116, "mean_token_accuracy": 0.16782844066619873, "num_tokens": 36484232.0, "step": 19785 }, { "entropy": 5.754859256744385, "epoch": 1.662633900441084, "grad_norm": 1.421875, "learning_rate": 0.0004724401849019608, "loss": 5.5269, "mean_token_accuracy": 0.1602175533771515, "num_tokens": 36493588.0, "step": 19790 }, { "entropy": 5.669810009002686, "epoch": 1.6630539802562487, "grad_norm": 1.5546875, "learning_rate": 0.00047242582214177616, "loss": 5.3045, "mean_token_accuracy": 0.1697609916329384, "num_tokens": 36502289.0, "step": 19795 }, { "entropy": 5.724186754226684, "epoch": 1.6634740600714135, "grad_norm": 1.3359375, "learning_rate": 0.00047241145588429483, "loss": 5.4492, "mean_token_accuracy": 0.1644959807395935, "num_tokens": 36511978.0, "step": 19800 }, { "entropy": 5.680912446975708, "epoch": 1.6638941398865783, "grad_norm": 1.234375, "learning_rate": 0.0004723970861297712, "loss": 5.4175, "mean_token_accuracy": 0.17128399163484573, "num_tokens": 36520378.0, "step": 19805 }, { "entropy": 5.655539083480835, "epoch": 1.6643142197017433, "grad_norm": 1.3125, "learning_rate": 0.0004723827128784599, "loss": 5.4029, "mean_token_accuracy": 0.16915369629859925, "num_tokens": 36529965.0, "step": 19810 }, { "entropy": 5.836799001693725, "epoch": 1.6647342995169083, "grad_norm": 1.265625, "learning_rate": 0.00047236833613061534, "loss": 5.4194, "mean_token_accuracy": 0.16969371736049652, "num_tokens": 36539394.0, "step": 19815 }, { "entropy": 5.667885828018188, "epoch": 1.665154379332073, "grad_norm": 1.328125, "learning_rate": 0.0004723539558864925, "loss": 5.4697, "mean_token_accuracy": 0.17036117166280745, "num_tokens": 36548608.0, "step": 19820 }, { "entropy": 5.670717477798462, "epoch": 1.665574459147238, "grad_norm": 1.359375, "learning_rate": 0.0004723395721463459, "loss": 5.3393, "mean_token_accuracy": 0.1704514279961586, "num_tokens": 36557736.0, "step": 19825 }, { "entropy": 5.6675090312957765, "epoch": 1.665994538962403, "grad_norm": 1.6796875, "learning_rate": 0.0004723251849104303, "loss": 5.3703, "mean_token_accuracy": 0.16267035156488419, "num_tokens": 36566745.0, "step": 19830 }, { "entropy": 5.575802850723266, "epoch": 1.6664146187775677, "grad_norm": 1.1953125, "learning_rate": 0.00047231079417900076, "loss": 5.3086, "mean_token_accuracy": 0.1693269893527031, "num_tokens": 36575956.0, "step": 19835 }, { "entropy": 5.638355350494384, "epoch": 1.6668346985927327, "grad_norm": 1.4140625, "learning_rate": 0.000472296399952312, "loss": 5.3651, "mean_token_accuracy": 0.17209307253360748, "num_tokens": 36584673.0, "step": 19840 }, { "entropy": 5.703708839416504, "epoch": 1.6672547784078975, "grad_norm": 1.3203125, "learning_rate": 0.0004722820022306192, "loss": 5.422, "mean_token_accuracy": 0.17276596128940583, "num_tokens": 36593758.0, "step": 19845 }, { "entropy": 5.591260862350464, "epoch": 1.6676748582230623, "grad_norm": 1.484375, "learning_rate": 0.0004722676010141773, "loss": 5.2767, "mean_token_accuracy": 0.16923788189888, "num_tokens": 36603722.0, "step": 19850 }, { "entropy": 5.6357824325561525, "epoch": 1.6680949380382273, "grad_norm": 1.359375, "learning_rate": 0.00047225319630324136, "loss": 5.3335, "mean_token_accuracy": 0.17396993786096573, "num_tokens": 36612478.0, "step": 19855 }, { "entropy": 5.656694173812866, "epoch": 1.6685150178533923, "grad_norm": 1.546875, "learning_rate": 0.0004722387880980667, "loss": 5.535, "mean_token_accuracy": 0.16138018071651458, "num_tokens": 36622399.0, "step": 19860 }, { "entropy": 5.709191513061524, "epoch": 1.668935097668557, "grad_norm": 1.3984375, "learning_rate": 0.00047222437639890844, "loss": 5.3687, "mean_token_accuracy": 0.17041545510292053, "num_tokens": 36631798.0, "step": 19865 }, { "entropy": 5.570785617828369, "epoch": 1.6693551774837219, "grad_norm": 1.9296875, "learning_rate": 0.00047220996120602197, "loss": 5.3879, "mean_token_accuracy": 0.1724646970629692, "num_tokens": 36640405.0, "step": 19870 }, { "entropy": 5.717275476455688, "epoch": 1.6697752572988867, "grad_norm": 1.375, "learning_rate": 0.00047219554251966246, "loss": 5.5201, "mean_token_accuracy": 0.1616477571427822, "num_tokens": 36650209.0, "step": 19875 }, { "entropy": 5.780755186080933, "epoch": 1.6701953371140517, "grad_norm": 1.390625, "learning_rate": 0.0004721811203400855, "loss": 5.4614, "mean_token_accuracy": 0.16350688189268112, "num_tokens": 36660248.0, "step": 19880 }, { "entropy": 5.65017991065979, "epoch": 1.6706154169292167, "grad_norm": 1.453125, "learning_rate": 0.00047216669466754657, "loss": 5.3575, "mean_token_accuracy": 0.17064955681562424, "num_tokens": 36669938.0, "step": 19885 }, { "entropy": 5.548468828201294, "epoch": 1.6710354967443815, "grad_norm": 1.2265625, "learning_rate": 0.0004721522655023012, "loss": 5.375, "mean_token_accuracy": 0.17481788247823715, "num_tokens": 36679903.0, "step": 19890 }, { "entropy": 5.753811597824097, "epoch": 1.6714555765595462, "grad_norm": 1.5, "learning_rate": 0.0004721378328446049, "loss": 5.4404, "mean_token_accuracy": 0.17175495326519014, "num_tokens": 36688424.0, "step": 19895 }, { "entropy": 5.7923060894012455, "epoch": 1.6718756563747112, "grad_norm": 1.296875, "learning_rate": 0.0004721233966947134, "loss": 5.5084, "mean_token_accuracy": 0.16471525579690932, "num_tokens": 36698715.0, "step": 19900 }, { "entropy": 5.536203193664551, "epoch": 1.672295736189876, "grad_norm": 1.3125, "learning_rate": 0.00047210895705288237, "loss": 5.3675, "mean_token_accuracy": 0.18398713916540146, "num_tokens": 36708456.0, "step": 19905 }, { "entropy": 5.648340320587158, "epoch": 1.672715816005041, "grad_norm": 1.34375, "learning_rate": 0.0004720945139193678, "loss": 5.3991, "mean_token_accuracy": 0.1710827425122261, "num_tokens": 36717596.0, "step": 19910 }, { "entropy": 5.751449108123779, "epoch": 1.6731358958202058, "grad_norm": 1.5703125, "learning_rate": 0.0004720800672944253, "loss": 5.483, "mean_token_accuracy": 0.16295073330402374, "num_tokens": 36727092.0, "step": 19915 }, { "entropy": 5.658854913711548, "epoch": 1.6735559756353706, "grad_norm": 1.5703125, "learning_rate": 0.0004720656171783109, "loss": 5.2087, "mean_token_accuracy": 0.18139244765043258, "num_tokens": 36735910.0, "step": 19920 }, { "entropy": 5.5950675964355465, "epoch": 1.6739760554505356, "grad_norm": 1.265625, "learning_rate": 0.0004720511635712806, "loss": 5.3288, "mean_token_accuracy": 0.17601545453071593, "num_tokens": 36745237.0, "step": 19925 }, { "entropy": 5.65685772895813, "epoch": 1.6743961352657006, "grad_norm": 1.5546875, "learning_rate": 0.00047203670647359035, "loss": 5.466, "mean_token_accuracy": 0.16994206011295318, "num_tokens": 36753603.0, "step": 19930 }, { "entropy": 5.777238512039185, "epoch": 1.6748162150808654, "grad_norm": 1.53125, "learning_rate": 0.0004720222458854964, "loss": 5.4552, "mean_token_accuracy": 0.16490527987480164, "num_tokens": 36763010.0, "step": 19935 }, { "entropy": 5.711502265930176, "epoch": 1.6752362948960302, "grad_norm": 1.3515625, "learning_rate": 0.00047200778180725477, "loss": 5.384, "mean_token_accuracy": 0.17073629200458526, "num_tokens": 36772156.0, "step": 19940 }, { "entropy": 5.617578077316284, "epoch": 1.675656374711195, "grad_norm": 1.296875, "learning_rate": 0.00047199331423912174, "loss": 5.2788, "mean_token_accuracy": 0.175381575524807, "num_tokens": 36781386.0, "step": 19945 }, { "entropy": 5.661901426315308, "epoch": 1.67607645452636, "grad_norm": 1.3125, "learning_rate": 0.0004719788431813536, "loss": 5.4426, "mean_token_accuracy": 0.1666231006383896, "num_tokens": 36790754.0, "step": 19950 }, { "entropy": 5.7037333965301515, "epoch": 1.676496534341525, "grad_norm": 1.3125, "learning_rate": 0.0004719643686342066, "loss": 5.411, "mean_token_accuracy": 0.1669971838593483, "num_tokens": 36799623.0, "step": 19955 }, { "entropy": 5.560089445114135, "epoch": 1.6769166141566898, "grad_norm": 1.4609375, "learning_rate": 0.0004719498905979373, "loss": 5.2094, "mean_token_accuracy": 0.18330834209918975, "num_tokens": 36808662.0, "step": 19960 }, { "entropy": 5.662607908248901, "epoch": 1.6773366939718546, "grad_norm": 1.328125, "learning_rate": 0.0004719354090728021, "loss": 5.3575, "mean_token_accuracy": 0.1706179365515709, "num_tokens": 36817730.0, "step": 19965 }, { "entropy": 5.664571619033813, "epoch": 1.6777567737870194, "grad_norm": 1.328125, "learning_rate": 0.00047192092405905743, "loss": 5.3373, "mean_token_accuracy": 0.1712536782026291, "num_tokens": 36827203.0, "step": 19970 }, { "entropy": 5.685041427612305, "epoch": 1.6781768536021844, "grad_norm": 1.453125, "learning_rate": 0.0004719064355569601, "loss": 5.5026, "mean_token_accuracy": 0.1671118676662445, "num_tokens": 36836145.0, "step": 19975 }, { "entropy": 5.684621858596802, "epoch": 1.6785969334173494, "grad_norm": 1.5703125, "learning_rate": 0.00047189194356676666, "loss": 5.4991, "mean_token_accuracy": 0.16675533056259156, "num_tokens": 36845609.0, "step": 19980 }, { "entropy": 5.687040328979492, "epoch": 1.6790170132325142, "grad_norm": 1.4765625, "learning_rate": 0.00047187744808873386, "loss": 5.5006, "mean_token_accuracy": 0.16970218122005462, "num_tokens": 36855367.0, "step": 19985 }, { "entropy": 5.6927672863006595, "epoch": 1.679437093047679, "grad_norm": 2.484375, "learning_rate": 0.00047186294912311835, "loss": 5.4542, "mean_token_accuracy": 0.16267849504947662, "num_tokens": 36864808.0, "step": 19990 }, { "entropy": 5.654680156707764, "epoch": 1.679857172862844, "grad_norm": 1.4453125, "learning_rate": 0.00047184844667017705, "loss": 5.3155, "mean_token_accuracy": 0.1753552258014679, "num_tokens": 36873651.0, "step": 19995 }, { "entropy": 5.636945676803589, "epoch": 1.680277252678009, "grad_norm": 1.375, "learning_rate": 0.00047183394073016695, "loss": 5.4605, "mean_token_accuracy": 0.1641372784972191, "num_tokens": 36883227.0, "step": 20000 }, { "entropy": 5.602861928939819, "epoch": 1.6806973324931738, "grad_norm": 1.328125, "learning_rate": 0.00047181943130334493, "loss": 5.2416, "mean_token_accuracy": 0.1794225737452507, "num_tokens": 36891628.0, "step": 20005 }, { "entropy": 5.613242959976196, "epoch": 1.6811174123083386, "grad_norm": 1.375, "learning_rate": 0.000471804918389968, "loss": 5.3644, "mean_token_accuracy": 0.16712576299905776, "num_tokens": 36901819.0, "step": 20010 }, { "entropy": 5.6311595916748045, "epoch": 1.6815374921235033, "grad_norm": 1.84375, "learning_rate": 0.0004717904019902933, "loss": 5.4003, "mean_token_accuracy": 0.17059791535139085, "num_tokens": 36911206.0, "step": 20015 }, { "entropy": 5.639288139343262, "epoch": 1.6819575719386683, "grad_norm": 1.5234375, "learning_rate": 0.000471775882104578, "loss": 5.3459, "mean_token_accuracy": 0.17099965065717698, "num_tokens": 36920830.0, "step": 20020 }, { "entropy": 5.543249130249023, "epoch": 1.6823776517538334, "grad_norm": 1.6796875, "learning_rate": 0.00047176135873307917, "loss": 5.2633, "mean_token_accuracy": 0.17037912011146544, "num_tokens": 36929702.0, "step": 20025 }, { "entropy": 5.723000860214233, "epoch": 1.6827977315689981, "grad_norm": 1.5625, "learning_rate": 0.0004717468318760543, "loss": 5.4725, "mean_token_accuracy": 0.16794218271970748, "num_tokens": 36938423.0, "step": 20030 }, { "entropy": 5.695086097717285, "epoch": 1.683217811384163, "grad_norm": 1.3515625, "learning_rate": 0.00047173230153376057, "loss": 5.3934, "mean_token_accuracy": 0.16773395538330077, "num_tokens": 36947198.0, "step": 20035 }, { "entropy": 5.658504676818848, "epoch": 1.6836378911993277, "grad_norm": 1.4453125, "learning_rate": 0.0004717177677064554, "loss": 5.3724, "mean_token_accuracy": 0.17398134768009185, "num_tokens": 36955636.0, "step": 20040 }, { "entropy": 5.590145826339722, "epoch": 1.6840579710144927, "grad_norm": 1.5234375, "learning_rate": 0.00047170323039439634, "loss": 5.3286, "mean_token_accuracy": 0.17025046944618225, "num_tokens": 36964463.0, "step": 20045 }, { "entropy": 5.70718035697937, "epoch": 1.6844780508296577, "grad_norm": 1.40625, "learning_rate": 0.0004716886895978408, "loss": 5.4353, "mean_token_accuracy": 0.1722966879606247, "num_tokens": 36974043.0, "step": 20050 }, { "entropy": 5.650777006149292, "epoch": 1.6848981306448225, "grad_norm": 2.0625, "learning_rate": 0.00047167414531704637, "loss": 5.3406, "mean_token_accuracy": 0.17258572578430176, "num_tokens": 36983856.0, "step": 20055 }, { "entropy": 5.618655967712402, "epoch": 1.6853182104599873, "grad_norm": 1.28125, "learning_rate": 0.00047165959755227077, "loss": 5.3678, "mean_token_accuracy": 0.17598632574081421, "num_tokens": 36992664.0, "step": 20060 }, { "entropy": 5.616749095916748, "epoch": 1.6857382902751523, "grad_norm": 1.3203125, "learning_rate": 0.00047164504630377166, "loss": 5.4167, "mean_token_accuracy": 0.17754430770874025, "num_tokens": 37001826.0, "step": 20065 }, { "entropy": 5.744650173187256, "epoch": 1.6861583700903173, "grad_norm": 1.3984375, "learning_rate": 0.00047163049157180676, "loss": 5.4431, "mean_token_accuracy": 0.16668420433998107, "num_tokens": 37010821.0, "step": 20070 }, { "entropy": 5.711326599121094, "epoch": 1.6865784499054821, "grad_norm": 1.34375, "learning_rate": 0.000471615933356634, "loss": 5.5325, "mean_token_accuracy": 0.15652497559785844, "num_tokens": 37021293.0, "step": 20075 }, { "entropy": 5.657518434524536, "epoch": 1.686998529720647, "grad_norm": 1.453125, "learning_rate": 0.0004716013716585112, "loss": 5.3066, "mean_token_accuracy": 0.17586107850074767, "num_tokens": 37031063.0, "step": 20080 }, { "entropy": 5.612600946426392, "epoch": 1.6874186095358117, "grad_norm": 1.4375, "learning_rate": 0.0004715868064776964, "loss": 5.3682, "mean_token_accuracy": 0.17950290441513062, "num_tokens": 37040879.0, "step": 20085 }, { "entropy": 5.600485229492188, "epoch": 1.6878386893509767, "grad_norm": 1.3203125, "learning_rate": 0.0004715722378144474, "loss": 5.2522, "mean_token_accuracy": 0.17968133985996246, "num_tokens": 37049452.0, "step": 20090 }, { "entropy": 5.524720573425293, "epoch": 1.6882587691661417, "grad_norm": 1.5859375, "learning_rate": 0.0004715576656690225, "loss": 5.2317, "mean_token_accuracy": 0.17775061279535292, "num_tokens": 37058010.0, "step": 20095 }, { "entropy": 5.663621473312378, "epoch": 1.6886788489813065, "grad_norm": 1.3828125, "learning_rate": 0.00047154309004167984, "loss": 5.4581, "mean_token_accuracy": 0.1619523733854294, "num_tokens": 37067580.0, "step": 20100 }, { "entropy": 5.626581048965454, "epoch": 1.6890989287964713, "grad_norm": 1.3671875, "learning_rate": 0.00047152851093267744, "loss": 5.3434, "mean_token_accuracy": 0.17342365384101868, "num_tokens": 37076584.0, "step": 20105 }, { "entropy": 5.6316078186035154, "epoch": 1.689519008611636, "grad_norm": 1.4375, "learning_rate": 0.0004715139283422737, "loss": 5.3632, "mean_token_accuracy": 0.16921617537736894, "num_tokens": 37086330.0, "step": 20110 }, { "entropy": 5.696176671981812, "epoch": 1.689939088426801, "grad_norm": 1.5078125, "learning_rate": 0.000471499342270727, "loss": 5.4194, "mean_token_accuracy": 0.16318628638982774, "num_tokens": 37096323.0, "step": 20115 }, { "entropy": 5.619508266448975, "epoch": 1.690359168241966, "grad_norm": 1.4921875, "learning_rate": 0.00047148475271829556, "loss": 5.3484, "mean_token_accuracy": 0.1682300463318825, "num_tokens": 37106281.0, "step": 20120 }, { "entropy": 5.5695782661437985, "epoch": 1.6907792480571309, "grad_norm": 1.6796875, "learning_rate": 0.0004714701596852379, "loss": 5.3293, "mean_token_accuracy": 0.1787579908967018, "num_tokens": 37116002.0, "step": 20125 }, { "entropy": 5.626379442214966, "epoch": 1.6911993278722957, "grad_norm": 1.3671875, "learning_rate": 0.0004714555631718125, "loss": 5.4004, "mean_token_accuracy": 0.17309577763080597, "num_tokens": 37125125.0, "step": 20130 }, { "entropy": 5.601164245605469, "epoch": 1.6916194076874607, "grad_norm": 1.546875, "learning_rate": 0.000471440963178278, "loss": 5.2532, "mean_token_accuracy": 0.18026716858148575, "num_tokens": 37134358.0, "step": 20135 }, { "entropy": 5.707911014556885, "epoch": 1.6920394875026254, "grad_norm": 1.4375, "learning_rate": 0.00047142635970489293, "loss": 5.4198, "mean_token_accuracy": 0.16907861083745956, "num_tokens": 37143732.0, "step": 20140 }, { "entropy": 5.634232664108277, "epoch": 1.6924595673177905, "grad_norm": 1.328125, "learning_rate": 0.0004714117527519161, "loss": 5.3242, "mean_token_accuracy": 0.17292115837335587, "num_tokens": 37153809.0, "step": 20145 }, { "entropy": 5.632751035690307, "epoch": 1.6928796471329552, "grad_norm": 1.3671875, "learning_rate": 0.00047139714231960616, "loss": 5.3578, "mean_token_accuracy": 0.16431571841239928, "num_tokens": 37163272.0, "step": 20150 }, { "entropy": 5.591974878311158, "epoch": 1.69329972694812, "grad_norm": 1.6171875, "learning_rate": 0.000471382528408222, "loss": 5.2814, "mean_token_accuracy": 0.17594963163137436, "num_tokens": 37172323.0, "step": 20155 }, { "entropy": 5.712676620483398, "epoch": 1.693719806763285, "grad_norm": 1.3828125, "learning_rate": 0.0004713679110180225, "loss": 5.4905, "mean_token_accuracy": 0.16899462938308715, "num_tokens": 37181262.0, "step": 20160 }, { "entropy": 5.6906595706939695, "epoch": 1.69413988657845, "grad_norm": 1.4453125, "learning_rate": 0.0004713532901492666, "loss": 5.4426, "mean_token_accuracy": 0.17433411180973052, "num_tokens": 37189576.0, "step": 20165 }, { "entropy": 5.718492841720581, "epoch": 1.6945599663936148, "grad_norm": 1.390625, "learning_rate": 0.0004713386658022132, "loss": 5.4397, "mean_token_accuracy": 0.16342198550701142, "num_tokens": 37199502.0, "step": 20170 }, { "entropy": 5.702072095870972, "epoch": 1.6949800462087796, "grad_norm": 1.2734375, "learning_rate": 0.0004713240379771214, "loss": 5.3477, "mean_token_accuracy": 0.16661544740200043, "num_tokens": 37209028.0, "step": 20175 }, { "entropy": 5.704798460006714, "epoch": 1.6954001260239444, "grad_norm": 1.3203125, "learning_rate": 0.0004713094066742505, "loss": 5.4943, "mean_token_accuracy": 0.16960543841123582, "num_tokens": 37218087.0, "step": 20180 }, { "entropy": 5.675905656814575, "epoch": 1.6958202058391094, "grad_norm": 1.359375, "learning_rate": 0.00047129477189385946, "loss": 5.4475, "mean_token_accuracy": 0.16832420825958253, "num_tokens": 37227345.0, "step": 20185 }, { "entropy": 5.73945164680481, "epoch": 1.6962402856542744, "grad_norm": 1.3359375, "learning_rate": 0.0004712801336362076, "loss": 5.3736, "mean_token_accuracy": 0.16931984573602676, "num_tokens": 37236011.0, "step": 20190 }, { "entropy": 5.620118522644043, "epoch": 1.6966603654694392, "grad_norm": 1.3125, "learning_rate": 0.0004712654919015543, "loss": 5.3576, "mean_token_accuracy": 0.17278312891721725, "num_tokens": 37244613.0, "step": 20195 }, { "entropy": 5.61281909942627, "epoch": 1.697080445284604, "grad_norm": 1.59375, "learning_rate": 0.0004712508466901588, "loss": 5.3743, "mean_token_accuracy": 0.1720852240920067, "num_tokens": 37253768.0, "step": 20200 }, { "entropy": 5.713197374343872, "epoch": 1.697500525099769, "grad_norm": 1.3671875, "learning_rate": 0.00047123619800228057, "loss": 5.4486, "mean_token_accuracy": 0.1585058517754078, "num_tokens": 37263230.0, "step": 20205 }, { "entropy": 5.66543140411377, "epoch": 1.6979206049149338, "grad_norm": 1.421875, "learning_rate": 0.0004712215458381792, "loss": 5.344, "mean_token_accuracy": 0.1704501375555992, "num_tokens": 37272752.0, "step": 20210 }, { "entropy": 5.69918270111084, "epoch": 1.6983406847300988, "grad_norm": 1.375, "learning_rate": 0.0004712068901981142, "loss": 5.3909, "mean_token_accuracy": 0.17387653589248658, "num_tokens": 37281465.0, "step": 20215 }, { "entropy": 5.658880043029785, "epoch": 1.6987607645452636, "grad_norm": 1.296875, "learning_rate": 0.0004711922310823452, "loss": 5.3859, "mean_token_accuracy": 0.16725497990846633, "num_tokens": 37290408.0, "step": 20220 }, { "entropy": 5.644626569747925, "epoch": 1.6991808443604284, "grad_norm": 1.3671875, "learning_rate": 0.0004711775684911318, "loss": 5.3498, "mean_token_accuracy": 0.1716018721461296, "num_tokens": 37298890.0, "step": 20225 }, { "entropy": 5.60590615272522, "epoch": 1.6996009241755934, "grad_norm": 1.4453125, "learning_rate": 0.00047116290242473375, "loss": 5.3494, "mean_token_accuracy": 0.16820138245820998, "num_tokens": 37307720.0, "step": 20230 }, { "entropy": 5.641182088851929, "epoch": 1.7000210039907584, "grad_norm": 1.703125, "learning_rate": 0.000471148232883411, "loss": 5.3946, "mean_token_accuracy": 0.16923058927059173, "num_tokens": 37317145.0, "step": 20235 }, { "entropy": 5.6542257308959964, "epoch": 1.7004410838059232, "grad_norm": 1.203125, "learning_rate": 0.00047113355986742325, "loss": 5.329, "mean_token_accuracy": 0.17771051228046417, "num_tokens": 37326579.0, "step": 20240 }, { "entropy": 5.677807474136353, "epoch": 1.700861163621088, "grad_norm": 1.6484375, "learning_rate": 0.00047111888337703046, "loss": 5.4174, "mean_token_accuracy": 0.17049338668584824, "num_tokens": 37336065.0, "step": 20245 }, { "entropy": 5.56732497215271, "epoch": 1.7012812434362528, "grad_norm": 1.3515625, "learning_rate": 0.0004711042034124926, "loss": 5.2807, "mean_token_accuracy": 0.17862701117992402, "num_tokens": 37345297.0, "step": 20250 }, { "entropy": 5.668249082565308, "epoch": 1.7017013232514178, "grad_norm": 1.4296875, "learning_rate": 0.0004710895199740698, "loss": 5.42, "mean_token_accuracy": 0.16874612122774124, "num_tokens": 37354942.0, "step": 20255 }, { "entropy": 5.729604482650757, "epoch": 1.7021214030665828, "grad_norm": 2.03125, "learning_rate": 0.0004710748330620222, "loss": 5.3187, "mean_token_accuracy": 0.17622058391571044, "num_tokens": 37364068.0, "step": 20260 }, { "entropy": 5.6129645824432375, "epoch": 1.7025414828817476, "grad_norm": 1.34375, "learning_rate": 0.0004710601426766098, "loss": 5.4302, "mean_token_accuracy": 0.16786112040281295, "num_tokens": 37373256.0, "step": 20265 }, { "entropy": 5.576197624206543, "epoch": 1.7029615626969123, "grad_norm": 1.8203125, "learning_rate": 0.00047104544881809295, "loss": 5.2813, "mean_token_accuracy": 0.17993906289339065, "num_tokens": 37382098.0, "step": 20270 }, { "entropy": 5.573770999908447, "epoch": 1.7033816425120771, "grad_norm": 1.390625, "learning_rate": 0.0004710307514867319, "loss": 5.2724, "mean_token_accuracy": 0.17502158433198928, "num_tokens": 37390844.0, "step": 20275 }, { "entropy": 5.67983660697937, "epoch": 1.7038017223272421, "grad_norm": 1.296875, "learning_rate": 0.0004710160506827871, "loss": 5.3478, "mean_token_accuracy": 0.16562999337911605, "num_tokens": 37399617.0, "step": 20280 }, { "entropy": 5.7143641948699955, "epoch": 1.7042218021424071, "grad_norm": 1.609375, "learning_rate": 0.0004710013464065189, "loss": 5.4787, "mean_token_accuracy": 0.16709637641906738, "num_tokens": 37409368.0, "step": 20285 }, { "entropy": 5.6330140113830565, "epoch": 1.704641881957572, "grad_norm": 1.9609375, "learning_rate": 0.0004709866386581877, "loss": 5.2808, "mean_token_accuracy": 0.1773850664496422, "num_tokens": 37418026.0, "step": 20290 }, { "entropy": 5.621044492721557, "epoch": 1.7050619617727367, "grad_norm": 1.7734375, "learning_rate": 0.00047097192743805413, "loss": 5.3021, "mean_token_accuracy": 0.1740890622138977, "num_tokens": 37426850.0, "step": 20295 }, { "entropy": 5.63762059211731, "epoch": 1.7054820415879017, "grad_norm": 1.453125, "learning_rate": 0.0004709572127463788, "loss": 5.3505, "mean_token_accuracy": 0.1763610526919365, "num_tokens": 37436631.0, "step": 20300 }, { "entropy": 5.673188161849976, "epoch": 1.7059021214030667, "grad_norm": 1.3828125, "learning_rate": 0.0004709424945834223, "loss": 5.3697, "mean_token_accuracy": 0.1696738511323929, "num_tokens": 37445619.0, "step": 20305 }, { "entropy": 5.609205055236816, "epoch": 1.7063222012182315, "grad_norm": 1.46875, "learning_rate": 0.00047092777294944544, "loss": 5.3223, "mean_token_accuracy": 0.17436521351337433, "num_tokens": 37454205.0, "step": 20310 }, { "entropy": 5.672186851501465, "epoch": 1.7067422810333963, "grad_norm": 1.9375, "learning_rate": 0.000470913047844709, "loss": 5.4272, "mean_token_accuracy": 0.17115625292062758, "num_tokens": 37463301.0, "step": 20315 }, { "entropy": 5.664550542831421, "epoch": 1.707162360848561, "grad_norm": 1.5, "learning_rate": 0.00047089831926947374, "loss": 5.4153, "mean_token_accuracy": 0.1740603879094124, "num_tokens": 37471937.0, "step": 20320 }, { "entropy": 5.715552902221679, "epoch": 1.707582440663726, "grad_norm": 2.015625, "learning_rate": 0.0004708835872240007, "loss": 5.378, "mean_token_accuracy": 0.17142789512872697, "num_tokens": 37480779.0, "step": 20325 }, { "entropy": 5.700094079971313, "epoch": 1.7080025204788911, "grad_norm": 1.6328125, "learning_rate": 0.00047086885170855074, "loss": 5.4218, "mean_token_accuracy": 0.16403729021549224, "num_tokens": 37491053.0, "step": 20330 }, { "entropy": 5.68527364730835, "epoch": 1.708422600294056, "grad_norm": 1.390625, "learning_rate": 0.000470854112723385, "loss": 5.3663, "mean_token_accuracy": 0.17164998948574067, "num_tokens": 37499091.0, "step": 20335 }, { "entropy": 5.639491558074951, "epoch": 1.7088426801092207, "grad_norm": 1.421875, "learning_rate": 0.0004708393702687644, "loss": 5.4264, "mean_token_accuracy": 0.1666134625673294, "num_tokens": 37507882.0, "step": 20340 }, { "entropy": 5.662171506881714, "epoch": 1.7092627599243855, "grad_norm": 1.7265625, "learning_rate": 0.00047082462434495015, "loss": 5.3894, "mean_token_accuracy": 0.17504524290561677, "num_tokens": 37517048.0, "step": 20345 }, { "entropy": 5.745312738418579, "epoch": 1.7096828397395505, "grad_norm": 1.3359375, "learning_rate": 0.0004708098749522036, "loss": 5.4333, "mean_token_accuracy": 0.16021379381418227, "num_tokens": 37526355.0, "step": 20350 }, { "entropy": 5.697979307174682, "epoch": 1.7101029195547155, "grad_norm": 1.515625, "learning_rate": 0.0004707951220907859, "loss": 5.4629, "mean_token_accuracy": 0.1664559945464134, "num_tokens": 37535746.0, "step": 20355 }, { "entropy": 5.711132001876831, "epoch": 1.7105229993698803, "grad_norm": 1.453125, "learning_rate": 0.0004707803657609585, "loss": 5.4243, "mean_token_accuracy": 0.16239014863967896, "num_tokens": 37546479.0, "step": 20360 }, { "entropy": 5.745557022094727, "epoch": 1.710943079185045, "grad_norm": 1.390625, "learning_rate": 0.00047076560596298275, "loss": 5.4748, "mean_token_accuracy": 0.1672067642211914, "num_tokens": 37556805.0, "step": 20365 }, { "entropy": 5.74319806098938, "epoch": 1.71136315900021, "grad_norm": 1.3515625, "learning_rate": 0.00047075084269712, "loss": 5.4602, "mean_token_accuracy": 0.1733308419585228, "num_tokens": 37564748.0, "step": 20370 }, { "entropy": 5.585902261734009, "epoch": 1.711783238815375, "grad_norm": 1.5859375, "learning_rate": 0.0004707360759636319, "loss": 5.2641, "mean_token_accuracy": 0.1821661874651909, "num_tokens": 37574674.0, "step": 20375 }, { "entropy": 5.665689754486084, "epoch": 1.7122033186305399, "grad_norm": 1.4296875, "learning_rate": 0.00047072130576278, "loss": 5.3839, "mean_token_accuracy": 0.17048663049936294, "num_tokens": 37584459.0, "step": 20380 }, { "entropy": 5.67416934967041, "epoch": 1.7126233984457047, "grad_norm": 1.3203125, "learning_rate": 0.0004707065320948259, "loss": 5.4119, "mean_token_accuracy": 0.17284180521965026, "num_tokens": 37593570.0, "step": 20385 }, { "entropy": 5.661170578002929, "epoch": 1.7130434782608694, "grad_norm": 1.2734375, "learning_rate": 0.00047069175496003147, "loss": 5.4147, "mean_token_accuracy": 0.16955641210079192, "num_tokens": 37603032.0, "step": 20390 }, { "entropy": 5.6446874141693115, "epoch": 1.7134635580760345, "grad_norm": 1.4765625, "learning_rate": 0.0004706769743586583, "loss": 5.3464, "mean_token_accuracy": 0.1723109945654869, "num_tokens": 37612404.0, "step": 20395 }, { "entropy": 5.636924123764038, "epoch": 1.7138836378911995, "grad_norm": 1.5, "learning_rate": 0.00047066219029096837, "loss": 5.3658, "mean_token_accuracy": 0.1704767942428589, "num_tokens": 37621933.0, "step": 20400 }, { "entropy": 5.732251310348511, "epoch": 1.7143037177063642, "grad_norm": 1.359375, "learning_rate": 0.0004706474027572234, "loss": 5.3965, "mean_token_accuracy": 0.17179838567972183, "num_tokens": 37632078.0, "step": 20405 }, { "entropy": 5.536679124832153, "epoch": 1.714723797521529, "grad_norm": 1.3359375, "learning_rate": 0.00047063261175768543, "loss": 5.3053, "mean_token_accuracy": 0.17315014004707335, "num_tokens": 37641665.0, "step": 20410 }, { "entropy": 5.708039617538452, "epoch": 1.7151438773366938, "grad_norm": 1.4765625, "learning_rate": 0.00047061781729261656, "loss": 5.3721, "mean_token_accuracy": 0.1656670242547989, "num_tokens": 37650751.0, "step": 20415 }, { "entropy": 5.628295135498047, "epoch": 1.7155639571518588, "grad_norm": 1.546875, "learning_rate": 0.00047060301936227865, "loss": 5.3617, "mean_token_accuracy": 0.17506831139326096, "num_tokens": 37659165.0, "step": 20420 }, { "entropy": 5.646256732940674, "epoch": 1.7159840369670238, "grad_norm": 1.4140625, "learning_rate": 0.0004705882179669341, "loss": 5.3544, "mean_token_accuracy": 0.17101034224033357, "num_tokens": 37668057.0, "step": 20425 }, { "entropy": 5.685383367538452, "epoch": 1.7164041167821886, "grad_norm": 1.359375, "learning_rate": 0.0004705734131068449, "loss": 5.348, "mean_token_accuracy": 0.16941767185926437, "num_tokens": 37677674.0, "step": 20430 }, { "entropy": 5.602097034454346, "epoch": 1.7168241965973534, "grad_norm": 1.3203125, "learning_rate": 0.0004705586047822734, "loss": 5.3825, "mean_token_accuracy": 0.17536012828350067, "num_tokens": 37687009.0, "step": 20435 }, { "entropy": 5.629279613494873, "epoch": 1.7172442764125184, "grad_norm": 1.453125, "learning_rate": 0.00047054379299348194, "loss": 5.2569, "mean_token_accuracy": 0.17184604406356813, "num_tokens": 37696723.0, "step": 20440 }, { "entropy": 5.608349704742432, "epoch": 1.7176643562276832, "grad_norm": 1.7578125, "learning_rate": 0.00047052897774073295, "loss": 5.3778, "mean_token_accuracy": 0.17021260857582093, "num_tokens": 37706560.0, "step": 20445 }, { "entropy": 5.6732789993286135, "epoch": 1.7180844360428482, "grad_norm": 1.484375, "learning_rate": 0.00047051415902428875, "loss": 5.3945, "mean_token_accuracy": 0.1690693438053131, "num_tokens": 37715176.0, "step": 20450 }, { "entropy": 5.639693117141723, "epoch": 1.718504515858013, "grad_norm": 1.359375, "learning_rate": 0.0004704993368444119, "loss": 5.3816, "mean_token_accuracy": 0.16992994248867035, "num_tokens": 37723956.0, "step": 20455 }, { "entropy": 5.684892559051514, "epoch": 1.7189245956731778, "grad_norm": 1.6796875, "learning_rate": 0.0004704845112013649, "loss": 5.3845, "mean_token_accuracy": 0.17116268277168273, "num_tokens": 37733236.0, "step": 20460 }, { "entropy": 5.705689287185669, "epoch": 1.7193446754883428, "grad_norm": 1.2578125, "learning_rate": 0.0004704696820954105, "loss": 5.441, "mean_token_accuracy": 0.16739957481622697, "num_tokens": 37742626.0, "step": 20465 }, { "entropy": 5.627845668792725, "epoch": 1.7197647553035078, "grad_norm": 1.3828125, "learning_rate": 0.0004704548495268113, "loss": 5.3161, "mean_token_accuracy": 0.1832030311226845, "num_tokens": 37751854.0, "step": 20470 }, { "entropy": 5.622863864898681, "epoch": 1.7201848351186726, "grad_norm": 1.421875, "learning_rate": 0.00047044001349583, "loss": 5.3535, "mean_token_accuracy": 0.16801756620407104, "num_tokens": 37760993.0, "step": 20475 }, { "entropy": 5.647179698944091, "epoch": 1.7206049149338374, "grad_norm": 1.3203125, "learning_rate": 0.00047042517400272966, "loss": 5.4368, "mean_token_accuracy": 0.17291858792304993, "num_tokens": 37771714.0, "step": 20480 }, { "entropy": 5.68183217048645, "epoch": 1.7210249947490022, "grad_norm": 1.3671875, "learning_rate": 0.0004704103310477729, "loss": 5.3595, "mean_token_accuracy": 0.17705655097961426, "num_tokens": 37780653.0, "step": 20485 }, { "entropy": 5.679630851745605, "epoch": 1.7214450745641672, "grad_norm": 1.234375, "learning_rate": 0.0004703954846312228, "loss": 5.4293, "mean_token_accuracy": 0.1708232581615448, "num_tokens": 37790450.0, "step": 20490 }, { "entropy": 5.649170446395874, "epoch": 1.7218651543793322, "grad_norm": 1.3828125, "learning_rate": 0.0004703806347533423, "loss": 5.4189, "mean_token_accuracy": 0.16525555849075318, "num_tokens": 37800450.0, "step": 20495 }, { "entropy": 5.679272603988648, "epoch": 1.722285234194497, "grad_norm": 1.4296875, "learning_rate": 0.0004703657814143945, "loss": 5.4314, "mean_token_accuracy": 0.16290275305509566, "num_tokens": 37809261.0, "step": 20500 }, { "entropy": 5.642959403991699, "epoch": 1.7227053140096618, "grad_norm": 1.296875, "learning_rate": 0.0004703509246146424, "loss": 5.2126, "mean_token_accuracy": 0.17658686637878418, "num_tokens": 37818244.0, "step": 20505 }, { "entropy": 5.628348398208618, "epoch": 1.7231253938248268, "grad_norm": 1.40625, "learning_rate": 0.0004703360643543493, "loss": 5.3409, "mean_token_accuracy": 0.17722394019365312, "num_tokens": 37828555.0, "step": 20510 }, { "entropy": 5.563862991333008, "epoch": 1.7235454736399916, "grad_norm": 1.28125, "learning_rate": 0.00047032120063377836, "loss": 5.3109, "mean_token_accuracy": 0.17544028162956238, "num_tokens": 37837840.0, "step": 20515 }, { "entropy": 5.6779731750488285, "epoch": 1.7239655534551566, "grad_norm": 1.640625, "learning_rate": 0.00047030633345319293, "loss": 5.3616, "mean_token_accuracy": 0.16884265542030336, "num_tokens": 37846910.0, "step": 20520 }, { "entropy": 5.50715069770813, "epoch": 1.7243856332703213, "grad_norm": 1.46875, "learning_rate": 0.00047029146281285647, "loss": 5.2011, "mean_token_accuracy": 0.1903439998626709, "num_tokens": 37855642.0, "step": 20525 }, { "entropy": 5.667270755767822, "epoch": 1.7248057130854861, "grad_norm": 1.3828125, "learning_rate": 0.0004702765887130322, "loss": 5.3833, "mean_token_accuracy": 0.17016415446996688, "num_tokens": 37864439.0, "step": 20530 }, { "entropy": 5.749380970001221, "epoch": 1.7252257929006511, "grad_norm": 1.3671875, "learning_rate": 0.00047026171115398377, "loss": 5.4523, "mean_token_accuracy": 0.1656784437596798, "num_tokens": 37873801.0, "step": 20535 }, { "entropy": 5.5730626583099365, "epoch": 1.7256458727158162, "grad_norm": 2.90625, "learning_rate": 0.0004702468301359746, "loss": 5.3311, "mean_token_accuracy": 0.17896921038627625, "num_tokens": 37883915.0, "step": 20540 }, { "entropy": 5.662897348403931, "epoch": 1.726065952530981, "grad_norm": 1.8984375, "learning_rate": 0.0004702319456592684, "loss": 5.4436, "mean_token_accuracy": 0.16838170140981673, "num_tokens": 37894083.0, "step": 20545 }, { "entropy": 5.6904213428497314, "epoch": 1.7264860323461457, "grad_norm": 1.4296875, "learning_rate": 0.00047021705772412885, "loss": 5.4377, "mean_token_accuracy": 0.16888969093561174, "num_tokens": 37902264.0, "step": 20550 }, { "entropy": 5.643442630767822, "epoch": 1.7269061121613105, "grad_norm": 1.34375, "learning_rate": 0.00047020216633081964, "loss": 5.341, "mean_token_accuracy": 0.17365592420101167, "num_tokens": 37911071.0, "step": 20555 }, { "entropy": 5.628277730941773, "epoch": 1.7273261919764755, "grad_norm": 1.78125, "learning_rate": 0.00047018727147960453, "loss": 5.4221, "mean_token_accuracy": 0.16485991030931474, "num_tokens": 37920048.0, "step": 20560 }, { "entropy": 5.698462057113647, "epoch": 1.7277462717916405, "grad_norm": 1.4375, "learning_rate": 0.00047017237317074743, "loss": 5.3894, "mean_token_accuracy": 0.17410711497068404, "num_tokens": 37928877.0, "step": 20565 }, { "entropy": 5.709046506881714, "epoch": 1.7281663516068053, "grad_norm": 1.671875, "learning_rate": 0.0004701574714045123, "loss": 5.4051, "mean_token_accuracy": 0.16448906511068345, "num_tokens": 37937860.0, "step": 20570 }, { "entropy": 5.6509918689727785, "epoch": 1.72858643142197, "grad_norm": 1.2578125, "learning_rate": 0.00047014256618116304, "loss": 5.4058, "mean_token_accuracy": 0.1658877193927765, "num_tokens": 37947588.0, "step": 20575 }, { "entropy": 5.638443422317505, "epoch": 1.729006511237135, "grad_norm": 1.6875, "learning_rate": 0.00047012765750096365, "loss": 5.3205, "mean_token_accuracy": 0.1806677833199501, "num_tokens": 37957598.0, "step": 20580 }, { "entropy": 5.641557359695435, "epoch": 1.7294265910523, "grad_norm": 1.3671875, "learning_rate": 0.00047011274536417827, "loss": 5.3013, "mean_token_accuracy": 0.17997593879699708, "num_tokens": 37965294.0, "step": 20585 }, { "entropy": 5.590178346633911, "epoch": 1.729846670867465, "grad_norm": 1.4140625, "learning_rate": 0.00047009782977107113, "loss": 5.3207, "mean_token_accuracy": 0.1827242076396942, "num_tokens": 37973977.0, "step": 20590 }, { "entropy": 5.771245050430298, "epoch": 1.7302667506826297, "grad_norm": 1.5546875, "learning_rate": 0.00047008291072190634, "loss": 5.4798, "mean_token_accuracy": 0.1616050750017166, "num_tokens": 37984492.0, "step": 20595 }, { "entropy": 5.745875120162964, "epoch": 1.7306868304977945, "grad_norm": 1.4140625, "learning_rate": 0.0004700679882169482, "loss": 5.3922, "mean_token_accuracy": 0.17045068442821504, "num_tokens": 37994404.0, "step": 20600 }, { "entropy": 5.543208265304566, "epoch": 1.7311069103129595, "grad_norm": 1.8828125, "learning_rate": 0.0004700530622564613, "loss": 5.3057, "mean_token_accuracy": 0.18024921864271165, "num_tokens": 38002659.0, "step": 20605 }, { "entropy": 5.619626903533936, "epoch": 1.7315269901281245, "grad_norm": 1.515625, "learning_rate": 0.0004700381328407096, "loss": 5.2932, "mean_token_accuracy": 0.17747585326433182, "num_tokens": 38012290.0, "step": 20610 }, { "entropy": 5.699101209640503, "epoch": 1.7319470699432893, "grad_norm": 1.734375, "learning_rate": 0.0004700231999699579, "loss": 5.4263, "mean_token_accuracy": 0.16802889853715897, "num_tokens": 38022163.0, "step": 20615 }, { "entropy": 5.6378819942474365, "epoch": 1.732367149758454, "grad_norm": 1.5234375, "learning_rate": 0.0004700082636444706, "loss": 5.3703, "mean_token_accuracy": 0.16259206235408782, "num_tokens": 38031051.0, "step": 20620 }, { "entropy": 5.6816980838775635, "epoch": 1.7327872295736189, "grad_norm": 1.640625, "learning_rate": 0.00046999332386451245, "loss": 5.4231, "mean_token_accuracy": 0.16787817180156708, "num_tokens": 38040474.0, "step": 20625 }, { "entropy": 5.6875709056854244, "epoch": 1.7332073093887839, "grad_norm": 1.3203125, "learning_rate": 0.00046997838063034784, "loss": 5.3934, "mean_token_accuracy": 0.1709348142147064, "num_tokens": 38049620.0, "step": 20630 }, { "entropy": 5.58522481918335, "epoch": 1.7336273892039489, "grad_norm": 1.5, "learning_rate": 0.00046996343394224173, "loss": 5.3489, "mean_token_accuracy": 0.17310173362493514, "num_tokens": 38059866.0, "step": 20635 }, { "entropy": 5.6092894077301025, "epoch": 1.7340474690191137, "grad_norm": 1.4296875, "learning_rate": 0.00046994848380045866, "loss": 5.3227, "mean_token_accuracy": 0.16931509375572204, "num_tokens": 38068948.0, "step": 20640 }, { "entropy": 5.730233001708984, "epoch": 1.7344675488342784, "grad_norm": 1.4140625, "learning_rate": 0.00046993353020526366, "loss": 5.5148, "mean_token_accuracy": 0.17122802436351775, "num_tokens": 38079239.0, "step": 20645 }, { "entropy": 5.666778707504273, "epoch": 1.7348876286494432, "grad_norm": 1.8515625, "learning_rate": 0.0004699185731569215, "loss": 5.3772, "mean_token_accuracy": 0.17148027569055557, "num_tokens": 38087999.0, "step": 20650 }, { "entropy": 5.669428873062134, "epoch": 1.7353077084646082, "grad_norm": 1.234375, "learning_rate": 0.0004699036126556972, "loss": 5.3704, "mean_token_accuracy": 0.17108169794082642, "num_tokens": 38096586.0, "step": 20655 }, { "entropy": 5.577715730667114, "epoch": 1.7357277882797733, "grad_norm": 1.71875, "learning_rate": 0.0004698886487018558, "loss": 5.3346, "mean_token_accuracy": 0.1717136487364769, "num_tokens": 38104766.0, "step": 20660 }, { "entropy": 5.62361216545105, "epoch": 1.736147868094938, "grad_norm": 1.296875, "learning_rate": 0.0004698736812956623, "loss": 5.3684, "mean_token_accuracy": 0.17202869206666946, "num_tokens": 38113574.0, "step": 20665 }, { "entropy": 5.6415934562683105, "epoch": 1.7365679479101028, "grad_norm": 1.53125, "learning_rate": 0.0004698587104373819, "loss": 5.3325, "mean_token_accuracy": 0.16672066748142242, "num_tokens": 38122513.0, "step": 20670 }, { "entropy": 5.526204442977905, "epoch": 1.7369880277252678, "grad_norm": 1.328125, "learning_rate": 0.00046984373612727975, "loss": 5.3066, "mean_token_accuracy": 0.16567323356866837, "num_tokens": 38131105.0, "step": 20675 }, { "entropy": 5.6237061500549315, "epoch": 1.7374081075404328, "grad_norm": 1.75, "learning_rate": 0.00046982875836562116, "loss": 5.3868, "mean_token_accuracy": 0.16423740088939667, "num_tokens": 38140106.0, "step": 20680 }, { "entropy": 5.661822700500489, "epoch": 1.7378281873555976, "grad_norm": 1.4921875, "learning_rate": 0.00046981377715267145, "loss": 5.3491, "mean_token_accuracy": 0.17514974921941756, "num_tokens": 38149215.0, "step": 20685 }, { "entropy": 5.637057638168335, "epoch": 1.7382482671707624, "grad_norm": 1.6796875, "learning_rate": 0.000469798792488696, "loss": 5.2793, "mean_token_accuracy": 0.17926838994026184, "num_tokens": 38157591.0, "step": 20690 }, { "entropy": 5.580015373229981, "epoch": 1.7386683469859272, "grad_norm": 1.53125, "learning_rate": 0.0004697838043739602, "loss": 5.4022, "mean_token_accuracy": 0.16713829338550568, "num_tokens": 38167673.0, "step": 20695 }, { "entropy": 5.708221006393432, "epoch": 1.7390884268010922, "grad_norm": 1.8515625, "learning_rate": 0.00046976881280872974, "loss": 5.368, "mean_token_accuracy": 0.1714918613433838, "num_tokens": 38177586.0, "step": 20700 }, { "entropy": 5.71192569732666, "epoch": 1.7395085066162572, "grad_norm": 1.3046875, "learning_rate": 0.0004697538177932699, "loss": 5.3698, "mean_token_accuracy": 0.16908372268080712, "num_tokens": 38187020.0, "step": 20705 }, { "entropy": 5.527950620651245, "epoch": 1.739928586431422, "grad_norm": 1.2734375, "learning_rate": 0.0004697388193278465, "loss": 5.1499, "mean_token_accuracy": 0.1834670916199684, "num_tokens": 38195705.0, "step": 20710 }, { "entropy": 5.576827144622802, "epoch": 1.7403486662465868, "grad_norm": 1.5625, "learning_rate": 0.0004697238174127252, "loss": 5.2747, "mean_token_accuracy": 0.1754479631781578, "num_tokens": 38204726.0, "step": 20715 }, { "entropy": 5.591728734970093, "epoch": 1.7407687460617516, "grad_norm": 1.34375, "learning_rate": 0.0004697088120481717, "loss": 5.3875, "mean_token_accuracy": 0.16983902752399443, "num_tokens": 38214376.0, "step": 20720 }, { "entropy": 5.610480928421021, "epoch": 1.7411888258769166, "grad_norm": 1.3203125, "learning_rate": 0.0004696938032344519, "loss": 5.298, "mean_token_accuracy": 0.17367589026689528, "num_tokens": 38223631.0, "step": 20725 }, { "entropy": 5.627554512023925, "epoch": 1.7416089056920816, "grad_norm": 1.2734375, "learning_rate": 0.0004696787909718317, "loss": 5.3183, "mean_token_accuracy": 0.18182352632284166, "num_tokens": 38233519.0, "step": 20730 }, { "entropy": 5.636379337310791, "epoch": 1.7420289855072464, "grad_norm": 2.046875, "learning_rate": 0.00046966377526057686, "loss": 5.2841, "mean_token_accuracy": 0.1782074749469757, "num_tokens": 38242340.0, "step": 20735 }, { "entropy": 5.582876539230346, "epoch": 1.7424490653224112, "grad_norm": 1.4375, "learning_rate": 0.0004696487561009535, "loss": 5.2942, "mean_token_accuracy": 0.17328224033117295, "num_tokens": 38251194.0, "step": 20740 }, { "entropy": 5.637811088562012, "epoch": 1.7428691451375762, "grad_norm": 1.3125, "learning_rate": 0.0004696337334932277, "loss": 5.3531, "mean_token_accuracy": 0.17145794332027436, "num_tokens": 38259938.0, "step": 20745 }, { "entropy": 5.654774141311646, "epoch": 1.743289224952741, "grad_norm": 1.53125, "learning_rate": 0.00046961870743766546, "loss": 5.3472, "mean_token_accuracy": 0.17386607378721236, "num_tokens": 38268073.0, "step": 20750 }, { "entropy": 5.666212892532348, "epoch": 1.743709304767906, "grad_norm": 2.125, "learning_rate": 0.00046960367793453313, "loss": 5.4556, "mean_token_accuracy": 0.16973720118403435, "num_tokens": 38277667.0, "step": 20755 }, { "entropy": 5.710540676116944, "epoch": 1.7441293845830708, "grad_norm": 1.453125, "learning_rate": 0.00046958864498409673, "loss": 5.4055, "mean_token_accuracy": 0.17234568446874618, "num_tokens": 38287142.0, "step": 20760 }, { "entropy": 5.692324304580689, "epoch": 1.7445494643982355, "grad_norm": 1.5546875, "learning_rate": 0.00046957360858662276, "loss": 5.3783, "mean_token_accuracy": 0.17638524919748305, "num_tokens": 38296199.0, "step": 20765 }, { "entropy": 5.645661878585815, "epoch": 1.7449695442134006, "grad_norm": 1.5234375, "learning_rate": 0.0004695585687423775, "loss": 5.3891, "mean_token_accuracy": 0.17083698213100434, "num_tokens": 38305412.0, "step": 20770 }, { "entropy": 5.592067527770996, "epoch": 1.7453896240285656, "grad_norm": 1.2890625, "learning_rate": 0.0004695435254516273, "loss": 5.3152, "mean_token_accuracy": 0.18210890293121337, "num_tokens": 38313898.0, "step": 20775 }, { "entropy": 5.671021890640259, "epoch": 1.7458097038437304, "grad_norm": 1.4375, "learning_rate": 0.0004695284787146388, "loss": 5.4325, "mean_token_accuracy": 0.1672999680042267, "num_tokens": 38322835.0, "step": 20780 }, { "entropy": 5.610225439071655, "epoch": 1.7462297836588951, "grad_norm": 1.3125, "learning_rate": 0.0004695134285316784, "loss": 5.2361, "mean_token_accuracy": 0.18298912942409515, "num_tokens": 38331448.0, "step": 20785 }, { "entropy": 5.620502758026123, "epoch": 1.74664986347406, "grad_norm": 1.453125, "learning_rate": 0.00046949837490301293, "loss": 5.3828, "mean_token_accuracy": 0.16921331137418746, "num_tokens": 38340837.0, "step": 20790 }, { "entropy": 5.626954984664917, "epoch": 1.747069943289225, "grad_norm": 1.453125, "learning_rate": 0.0004694833178289088, "loss": 5.3406, "mean_token_accuracy": 0.1766287937760353, "num_tokens": 38349363.0, "step": 20795 }, { "entropy": 5.631927633285523, "epoch": 1.74749002310439, "grad_norm": 1.3828125, "learning_rate": 0.0004694682573096328, "loss": 5.376, "mean_token_accuracy": 0.17368592023849488, "num_tokens": 38358017.0, "step": 20800 }, { "entropy": 5.6352317333221436, "epoch": 1.7479101029195547, "grad_norm": 1.5, "learning_rate": 0.00046945319334545184, "loss": 5.3588, "mean_token_accuracy": 0.17234770804643632, "num_tokens": 38367256.0, "step": 20805 }, { "entropy": 5.618623685836792, "epoch": 1.7483301827347195, "grad_norm": 1.78125, "learning_rate": 0.0004694381259366327, "loss": 5.3577, "mean_token_accuracy": 0.17468070536851882, "num_tokens": 38376169.0, "step": 20810 }, { "entropy": 5.641800165176392, "epoch": 1.7487502625498845, "grad_norm": 1.3515625, "learning_rate": 0.00046942305508344216, "loss": 5.3273, "mean_token_accuracy": 0.17379536628723144, "num_tokens": 38385379.0, "step": 20815 }, { "entropy": 5.693554830551148, "epoch": 1.7491703423650493, "grad_norm": 1.484375, "learning_rate": 0.0004694079807861473, "loss": 5.4342, "mean_token_accuracy": 0.1681118994951248, "num_tokens": 38395217.0, "step": 20820 }, { "entropy": 5.636894845962525, "epoch": 1.7495904221802143, "grad_norm": 1.359375, "learning_rate": 0.0004693929030450153, "loss": 5.3247, "mean_token_accuracy": 0.17704234570264815, "num_tokens": 38404347.0, "step": 20825 }, { "entropy": 5.6810730457305905, "epoch": 1.750010501995379, "grad_norm": 1.6328125, "learning_rate": 0.00046937782186031303, "loss": 5.3081, "mean_token_accuracy": 0.1747249722480774, "num_tokens": 38413394.0, "step": 20830 }, { "entropy": 5.676941013336181, "epoch": 1.750430581810544, "grad_norm": 1.453125, "learning_rate": 0.0004693627372323078, "loss": 5.3446, "mean_token_accuracy": 0.17214433401823043, "num_tokens": 38422043.0, "step": 20835 }, { "entropy": 5.753418397903443, "epoch": 1.750850661625709, "grad_norm": 1.4140625, "learning_rate": 0.0004693476491612667, "loss": 5.5131, "mean_token_accuracy": 0.1660939335823059, "num_tokens": 38430792.0, "step": 20840 }, { "entropy": 5.56128044128418, "epoch": 1.751270741440874, "grad_norm": 1.609375, "learning_rate": 0.0004693325576474571, "loss": 5.299, "mean_token_accuracy": 0.17610928416252136, "num_tokens": 38439105.0, "step": 20845 }, { "entropy": 5.644918298721313, "epoch": 1.7516908212560387, "grad_norm": 1.4609375, "learning_rate": 0.0004693174626911463, "loss": 5.3261, "mean_token_accuracy": 0.1766454264521599, "num_tokens": 38447944.0, "step": 20850 }, { "entropy": 5.628182983398437, "epoch": 1.7521109010712035, "grad_norm": 1.8125, "learning_rate": 0.00046930236429260173, "loss": 5.3694, "mean_token_accuracy": 0.16761911809444427, "num_tokens": 38457206.0, "step": 20855 }, { "entropy": 5.685393190383911, "epoch": 1.7525309808863683, "grad_norm": 1.53125, "learning_rate": 0.0004692872624520908, "loss": 5.446, "mean_token_accuracy": 0.16450470089912414, "num_tokens": 38467085.0, "step": 20860 }, { "entropy": 5.687595844268799, "epoch": 1.7529510607015333, "grad_norm": 1.6171875, "learning_rate": 0.000469272157169881, "loss": 5.2827, "mean_token_accuracy": 0.17452918142080306, "num_tokens": 38475970.0, "step": 20865 }, { "entropy": 5.637504386901855, "epoch": 1.7533711405166983, "grad_norm": 1.734375, "learning_rate": 0.0004692570484462401, "loss": 5.4291, "mean_token_accuracy": 0.17007501125335694, "num_tokens": 38484579.0, "step": 20870 }, { "entropy": 5.683751344680786, "epoch": 1.753791220331863, "grad_norm": 1.3828125, "learning_rate": 0.00046924193628143554, "loss": 5.4706, "mean_token_accuracy": 0.16491821259260178, "num_tokens": 38495107.0, "step": 20875 }, { "entropy": 5.733100080490113, "epoch": 1.7542113001470279, "grad_norm": 2.125, "learning_rate": 0.00046922682067573516, "loss": 5.455, "mean_token_accuracy": 0.1720812901854515, "num_tokens": 38505731.0, "step": 20880 }, { "entropy": 5.629334449768066, "epoch": 1.7546313799621929, "grad_norm": 1.859375, "learning_rate": 0.00046921170162940657, "loss": 5.3422, "mean_token_accuracy": 0.1781423345208168, "num_tokens": 38514483.0, "step": 20885 }, { "entropy": 5.6288830757141115, "epoch": 1.7550514597773577, "grad_norm": 1.3984375, "learning_rate": 0.00046919657914271774, "loss": 5.2621, "mean_token_accuracy": 0.18058374375104905, "num_tokens": 38522953.0, "step": 20890 }, { "entropy": 5.567493963241577, "epoch": 1.7554715395925227, "grad_norm": 3.0, "learning_rate": 0.0004691814532159365, "loss": 5.2562, "mean_token_accuracy": 0.18670934140682222, "num_tokens": 38531891.0, "step": 20895 }, { "entropy": 5.650929737091064, "epoch": 1.7558916194076875, "grad_norm": 1.4140625, "learning_rate": 0.0004691663238493308, "loss": 5.431, "mean_token_accuracy": 0.1708792820572853, "num_tokens": 38541609.0, "step": 20900 }, { "entropy": 5.714797496795654, "epoch": 1.7563116992228522, "grad_norm": 1.3671875, "learning_rate": 0.0004691511910431686, "loss": 5.4352, "mean_token_accuracy": 0.17311373427510263, "num_tokens": 38550348.0, "step": 20905 }, { "entropy": 5.609110689163208, "epoch": 1.7567317790380172, "grad_norm": 1.6796875, "learning_rate": 0.0004691360547977181, "loss": 5.2661, "mean_token_accuracy": 0.17832353860139846, "num_tokens": 38559493.0, "step": 20910 }, { "entropy": 5.621959161758423, "epoch": 1.7571518588531823, "grad_norm": 1.359375, "learning_rate": 0.0004691209151132474, "loss": 5.3231, "mean_token_accuracy": 0.1581482857465744, "num_tokens": 38567888.0, "step": 20915 }, { "entropy": 5.6945287704467775, "epoch": 1.757571938668347, "grad_norm": 1.390625, "learning_rate": 0.0004691057719900246, "loss": 5.3927, "mean_token_accuracy": 0.17266636341810226, "num_tokens": 38577216.0, "step": 20920 }, { "entropy": 5.6431300163269045, "epoch": 1.7579920184835118, "grad_norm": 1.6484375, "learning_rate": 0.00046909062542831794, "loss": 5.34, "mean_token_accuracy": 0.17715939432382583, "num_tokens": 38586258.0, "step": 20925 }, { "entropy": 5.642459106445313, "epoch": 1.7584120982986766, "grad_norm": 1.296875, "learning_rate": 0.0004690754754283959, "loss": 5.2895, "mean_token_accuracy": 0.17726175487041473, "num_tokens": 38594900.0, "step": 20930 }, { "entropy": 5.594657468795776, "epoch": 1.7588321781138416, "grad_norm": 1.2578125, "learning_rate": 0.0004690603219905266, "loss": 5.3709, "mean_token_accuracy": 0.171932390332222, "num_tokens": 38603980.0, "step": 20935 }, { "entropy": 5.678670597076416, "epoch": 1.7592522579290066, "grad_norm": 1.6640625, "learning_rate": 0.00046904516511497873, "loss": 5.4647, "mean_token_accuracy": 0.16452773064374923, "num_tokens": 38613888.0, "step": 20940 }, { "entropy": 5.754366111755371, "epoch": 1.7596723377441714, "grad_norm": 1.3828125, "learning_rate": 0.00046903000480202065, "loss": 5.3917, "mean_token_accuracy": 0.1681995779275894, "num_tokens": 38623969.0, "step": 20945 }, { "entropy": 5.62518458366394, "epoch": 1.7600924175593362, "grad_norm": 1.4765625, "learning_rate": 0.00046901484105192094, "loss": 5.3453, "mean_token_accuracy": 0.17296512126922609, "num_tokens": 38633387.0, "step": 20950 }, { "entropy": 5.6430340766906735, "epoch": 1.760512497374501, "grad_norm": 1.265625, "learning_rate": 0.00046899967386494816, "loss": 5.4, "mean_token_accuracy": 0.16604579389095306, "num_tokens": 38642481.0, "step": 20955 }, { "entropy": 5.687196922302246, "epoch": 1.760932577189666, "grad_norm": 1.3828125, "learning_rate": 0.0004689845032413712, "loss": 5.3981, "mean_token_accuracy": 0.1664348542690277, "num_tokens": 38652345.0, "step": 20960 }, { "entropy": 5.732553148269654, "epoch": 1.761352657004831, "grad_norm": 1.421875, "learning_rate": 0.0004689693291814586, "loss": 5.4189, "mean_token_accuracy": 0.16699230074882507, "num_tokens": 38661529.0, "step": 20965 }, { "entropy": 5.602785253524781, "epoch": 1.7617727368199958, "grad_norm": 1.390625, "learning_rate": 0.0004689541516854791, "loss": 5.3202, "mean_token_accuracy": 0.17832910716533662, "num_tokens": 38670191.0, "step": 20970 }, { "entropy": 5.621751117706299, "epoch": 1.7621928166351606, "grad_norm": 1.3515625, "learning_rate": 0.0004689389707537018, "loss": 5.4132, "mean_token_accuracy": 0.16694632470607756, "num_tokens": 38679089.0, "step": 20975 }, { "entropy": 5.660399055480957, "epoch": 1.7626128964503256, "grad_norm": 1.40625, "learning_rate": 0.00046892378638639545, "loss": 5.3529, "mean_token_accuracy": 0.1763218879699707, "num_tokens": 38688821.0, "step": 20980 }, { "entropy": 5.709231901168823, "epoch": 1.7630329762654906, "grad_norm": 1.3359375, "learning_rate": 0.00046890859858382913, "loss": 5.4325, "mean_token_accuracy": 0.16355552822351455, "num_tokens": 38698232.0, "step": 20985 }, { "entropy": 5.778678321838379, "epoch": 1.7634530560806554, "grad_norm": 1.484375, "learning_rate": 0.0004688934073462718, "loss": 5.5, "mean_token_accuracy": 0.15904544815421104, "num_tokens": 38708090.0, "step": 20990 }, { "entropy": 5.694181299209594, "epoch": 1.7638731358958202, "grad_norm": 1.4296875, "learning_rate": 0.00046887821267399256, "loss": 5.4005, "mean_token_accuracy": 0.17715791165828704, "num_tokens": 38717370.0, "step": 20995 }, { "entropy": 5.667404508590698, "epoch": 1.764293215710985, "grad_norm": 1.921875, "learning_rate": 0.0004688630145672607, "loss": 5.3688, "mean_token_accuracy": 0.17490747272968293, "num_tokens": 38726758.0, "step": 21000 }, { "epoch": 1.764293215710985, "eval_entropy": 5.447259841907512, "eval_loss": 5.429024696350098, "eval_mean_token_accuracy": 0.17760649738501136, "eval_num_tokens": 38726758.0, "eval_runtime": 27.2768, "eval_samples_per_second": 1369.882, "eval_steps_per_second": 171.244, "step": 21000 }, { "entropy": 5.620334959030151, "epoch": 1.76471329552615, "grad_norm": 1.40625, "learning_rate": 0.0004688478130263453, "loss": 5.3613, "mean_token_accuracy": 0.1682727813720703, "num_tokens": 38736180.0, "step": 21005 }, { "entropy": 5.655771541595459, "epoch": 1.765133375341315, "grad_norm": 1.5, "learning_rate": 0.0004688326080515157, "loss": 5.3121, "mean_token_accuracy": 0.17605517357587813, "num_tokens": 38744529.0, "step": 21010 }, { "entropy": 5.517810726165772, "epoch": 1.7655534551564798, "grad_norm": 1.421875, "learning_rate": 0.00046881739964304127, "loss": 5.2272, "mean_token_accuracy": 0.18033822625875473, "num_tokens": 38753434.0, "step": 21015 }, { "entropy": 5.597821426391602, "epoch": 1.7659735349716446, "grad_norm": 1.421875, "learning_rate": 0.00046880218780119136, "loss": 5.3471, "mean_token_accuracy": 0.17827894389629365, "num_tokens": 38762021.0, "step": 21020 }, { "entropy": 5.703983736038208, "epoch": 1.7663936147868093, "grad_norm": 1.3359375, "learning_rate": 0.0004687869725262356, "loss": 5.4687, "mean_token_accuracy": 0.1671355977654457, "num_tokens": 38771373.0, "step": 21025 }, { "entropy": 5.684408521652221, "epoch": 1.7668136946019743, "grad_norm": 1.2734375, "learning_rate": 0.0004687717538184433, "loss": 5.427, "mean_token_accuracy": 0.1724289759993553, "num_tokens": 38780388.0, "step": 21030 }, { "entropy": 5.61069803237915, "epoch": 1.7672337744171394, "grad_norm": 1.5625, "learning_rate": 0.00046875653167808423, "loss": 5.26, "mean_token_accuracy": 0.1805383160710335, "num_tokens": 38789285.0, "step": 21035 }, { "entropy": 5.5840356826782225, "epoch": 1.7676538542323041, "grad_norm": 1.703125, "learning_rate": 0.00046874130610542796, "loss": 5.3548, "mean_token_accuracy": 0.17195963561534883, "num_tokens": 38799321.0, "step": 21040 }, { "entropy": 5.69988784790039, "epoch": 1.768073934047469, "grad_norm": 1.3359375, "learning_rate": 0.0004687260771007442, "loss": 5.3414, "mean_token_accuracy": 0.16847853660583495, "num_tokens": 38808515.0, "step": 21045 }, { "entropy": 5.594510459899903, "epoch": 1.768494013862634, "grad_norm": 1.515625, "learning_rate": 0.0004687108446643027, "loss": 5.3048, "mean_token_accuracy": 0.17457255125045776, "num_tokens": 38817634.0, "step": 21050 }, { "entropy": 5.740410614013672, "epoch": 1.7689140936777987, "grad_norm": 1.453125, "learning_rate": 0.0004686956087963734, "loss": 5.5311, "mean_token_accuracy": 0.1675797998905182, "num_tokens": 38826766.0, "step": 21055 }, { "entropy": 5.64251217842102, "epoch": 1.7693341734929637, "grad_norm": 1.453125, "learning_rate": 0.0004686803694972261, "loss": 5.2846, "mean_token_accuracy": 0.17146496325731278, "num_tokens": 38835942.0, "step": 21060 }, { "entropy": 5.637530994415283, "epoch": 1.7697542533081285, "grad_norm": 1.4765625, "learning_rate": 0.00046866512676713075, "loss": 5.381, "mean_token_accuracy": 0.16483051627874373, "num_tokens": 38845691.0, "step": 21065 }, { "entropy": 5.6026856899261475, "epoch": 1.7701743331232933, "grad_norm": 1.5078125, "learning_rate": 0.00046864988060635744, "loss": 5.3686, "mean_token_accuracy": 0.16854404360055925, "num_tokens": 38855737.0, "step": 21070 }, { "entropy": 5.634773826599121, "epoch": 1.7705944129384583, "grad_norm": 1.4453125, "learning_rate": 0.0004686346310151762, "loss": 5.3817, "mean_token_accuracy": 0.17483728677034377, "num_tokens": 38864887.0, "step": 21075 }, { "entropy": 5.710461950302124, "epoch": 1.7710144927536233, "grad_norm": 1.3515625, "learning_rate": 0.00046861937799385717, "loss": 5.3603, "mean_token_accuracy": 0.1777254104614258, "num_tokens": 38873924.0, "step": 21080 }, { "entropy": 5.648996734619141, "epoch": 1.7714345725687881, "grad_norm": 1.59375, "learning_rate": 0.0004686041215426706, "loss": 5.4071, "mean_token_accuracy": 0.1716112896800041, "num_tokens": 38883447.0, "step": 21085 }, { "entropy": 5.647649192810059, "epoch": 1.771854652383953, "grad_norm": 1.5078125, "learning_rate": 0.0004685888616618867, "loss": 5.393, "mean_token_accuracy": 0.17345526367425917, "num_tokens": 38892389.0, "step": 21090 }, { "entropy": 5.688521671295166, "epoch": 1.7722747321991177, "grad_norm": 1.328125, "learning_rate": 0.00046857359835177575, "loss": 5.4408, "mean_token_accuracy": 0.16651444435119628, "num_tokens": 38901574.0, "step": 21095 }, { "entropy": 5.710891914367676, "epoch": 1.7726948120142827, "grad_norm": 1.3984375, "learning_rate": 0.00046855833161260825, "loss": 5.4205, "mean_token_accuracy": 0.1721094399690628, "num_tokens": 38910070.0, "step": 21100 }, { "entropy": 5.6420543670654295, "epoch": 1.7731148918294477, "grad_norm": 1.3671875, "learning_rate": 0.0004685430614446545, "loss": 5.3168, "mean_token_accuracy": 0.17222830057144164, "num_tokens": 38919868.0, "step": 21105 }, { "entropy": 5.66776967048645, "epoch": 1.7735349716446125, "grad_norm": 1.453125, "learning_rate": 0.0004685277878481852, "loss": 5.3784, "mean_token_accuracy": 0.16582091450691222, "num_tokens": 38928840.0, "step": 21110 }, { "entropy": 5.662716102600098, "epoch": 1.7739550514597773, "grad_norm": 1.28125, "learning_rate": 0.00046851251082347063, "loss": 5.455, "mean_token_accuracy": 0.166241654753685, "num_tokens": 38938112.0, "step": 21115 }, { "entropy": 5.7086883068084715, "epoch": 1.7743751312749423, "grad_norm": 1.828125, "learning_rate": 0.0004684972303707816, "loss": 5.3755, "mean_token_accuracy": 0.1721594288945198, "num_tokens": 38947463.0, "step": 21120 }, { "entropy": 5.71903281211853, "epoch": 1.774795211090107, "grad_norm": 1.421875, "learning_rate": 0.0004684819464903888, "loss": 5.5394, "mean_token_accuracy": 0.16309396475553511, "num_tokens": 38957221.0, "step": 21125 }, { "entropy": 5.59781403541565, "epoch": 1.775215290905272, "grad_norm": 1.375, "learning_rate": 0.000468466659182563, "loss": 5.2735, "mean_token_accuracy": 0.1779392898082733, "num_tokens": 38966656.0, "step": 21130 }, { "entropy": 5.600368213653565, "epoch": 1.7756353707204369, "grad_norm": 1.484375, "learning_rate": 0.0004684513684475749, "loss": 5.28, "mean_token_accuracy": 0.17913274914026261, "num_tokens": 38975281.0, "step": 21135 }, { "entropy": 5.711512088775635, "epoch": 1.7760554505356017, "grad_norm": 1.328125, "learning_rate": 0.00046843607428569546, "loss": 5.4295, "mean_token_accuracy": 0.17240019291639327, "num_tokens": 38985147.0, "step": 21140 }, { "entropy": 5.63455982208252, "epoch": 1.7764755303507667, "grad_norm": 1.59375, "learning_rate": 0.00046842077669719554, "loss": 5.2079, "mean_token_accuracy": 0.1831870675086975, "num_tokens": 38994104.0, "step": 21145 }, { "entropy": 5.631388187408447, "epoch": 1.7768956101659317, "grad_norm": 1.3125, "learning_rate": 0.00046840547568234613, "loss": 5.4063, "mean_token_accuracy": 0.1688321650028229, "num_tokens": 39003983.0, "step": 21150 }, { "entropy": 5.6240592956542965, "epoch": 1.7773156899810965, "grad_norm": 1.3359375, "learning_rate": 0.00046839017124141835, "loss": 5.3136, "mean_token_accuracy": 0.17558915317058563, "num_tokens": 39012636.0, "step": 21155 }, { "entropy": 5.648619031906128, "epoch": 1.7777357697962612, "grad_norm": 1.546875, "learning_rate": 0.00046837486337468335, "loss": 5.4367, "mean_token_accuracy": 0.16739535629749297, "num_tokens": 39022173.0, "step": 21160 }, { "entropy": 5.746690368652343, "epoch": 1.778155849611426, "grad_norm": 1.3359375, "learning_rate": 0.000468359552082412, "loss": 5.4438, "mean_token_accuracy": 0.16601206958293915, "num_tokens": 39032651.0, "step": 21165 }, { "entropy": 5.650369501113891, "epoch": 1.778575929426591, "grad_norm": 1.4296875, "learning_rate": 0.0004683442373648759, "loss": 5.3624, "mean_token_accuracy": 0.16653727144002914, "num_tokens": 39041543.0, "step": 21170 }, { "entropy": 5.610308504104614, "epoch": 1.778996009241756, "grad_norm": 1.4921875, "learning_rate": 0.0004683289192223462, "loss": 5.336, "mean_token_accuracy": 0.17248573154211044, "num_tokens": 39050467.0, "step": 21175 }, { "entropy": 5.684257221221924, "epoch": 1.7794160890569208, "grad_norm": 1.6953125, "learning_rate": 0.00046831359765509424, "loss": 5.3996, "mean_token_accuracy": 0.16482697874307634, "num_tokens": 39059224.0, "step": 21180 }, { "entropy": 5.65388126373291, "epoch": 1.7798361688720856, "grad_norm": 1.3515625, "learning_rate": 0.00046829827266339134, "loss": 5.4226, "mean_token_accuracy": 0.17068626284599303, "num_tokens": 39067884.0, "step": 21185 }, { "entropy": 5.682791662216187, "epoch": 1.7802562486872506, "grad_norm": 1.5390625, "learning_rate": 0.00046828294424750916, "loss": 5.3776, "mean_token_accuracy": 0.1663289338350296, "num_tokens": 39076774.0, "step": 21190 }, { "entropy": 5.6760657787322994, "epoch": 1.7806763285024154, "grad_norm": 1.3671875, "learning_rate": 0.0004682676124077192, "loss": 5.312, "mean_token_accuracy": 0.17417764961719512, "num_tokens": 39086021.0, "step": 21195 }, { "entropy": 5.687941169738769, "epoch": 1.7810964083175804, "grad_norm": 1.28125, "learning_rate": 0.00046825227714429287, "loss": 5.3043, "mean_token_accuracy": 0.17446549832820893, "num_tokens": 39095682.0, "step": 21200 }, { "entropy": 5.591732406616211, "epoch": 1.7815164881327452, "grad_norm": 1.21875, "learning_rate": 0.00046823693845750205, "loss": 5.3381, "mean_token_accuracy": 0.17597149461507797, "num_tokens": 39104904.0, "step": 21205 }, { "entropy": 5.669663190841675, "epoch": 1.78193656794791, "grad_norm": 1.5859375, "learning_rate": 0.00046822159634761837, "loss": 5.4867, "mean_token_accuracy": 0.16566276848316192, "num_tokens": 39113128.0, "step": 21210 }, { "entropy": 5.5798241138458256, "epoch": 1.782356647763075, "grad_norm": 1.4609375, "learning_rate": 0.0004682062508149136, "loss": 5.3373, "mean_token_accuracy": 0.17040848433971406, "num_tokens": 39122503.0, "step": 21215 }, { "entropy": 5.65609712600708, "epoch": 1.78277672757824, "grad_norm": 1.3515625, "learning_rate": 0.0004681909018596595, "loss": 5.3367, "mean_token_accuracy": 0.17275859266519547, "num_tokens": 39132020.0, "step": 21220 }, { "entropy": 5.674847936630249, "epoch": 1.7831968073934048, "grad_norm": 1.4453125, "learning_rate": 0.00046817554948212813, "loss": 5.3719, "mean_token_accuracy": 0.17056983560323716, "num_tokens": 39141542.0, "step": 21225 }, { "entropy": 5.6901304721832275, "epoch": 1.7836168872085696, "grad_norm": 1.578125, "learning_rate": 0.00046816019368259136, "loss": 5.3959, "mean_token_accuracy": 0.1733367383480072, "num_tokens": 39151573.0, "step": 21230 }, { "entropy": 5.597964191436768, "epoch": 1.7840369670237344, "grad_norm": 1.5546875, "learning_rate": 0.0004681448344613212, "loss": 5.3772, "mean_token_accuracy": 0.18744425475597382, "num_tokens": 39160023.0, "step": 21235 }, { "entropy": 5.585873651504516, "epoch": 1.7844570468388994, "grad_norm": 1.4609375, "learning_rate": 0.00046812947181858986, "loss": 5.3522, "mean_token_accuracy": 0.17375268936157226, "num_tokens": 39169335.0, "step": 21240 }, { "entropy": 5.7382103443145756, "epoch": 1.7848771266540644, "grad_norm": 1.3125, "learning_rate": 0.0004681141057546693, "loss": 5.4522, "mean_token_accuracy": 0.1610276386141777, "num_tokens": 39177953.0, "step": 21245 }, { "entropy": 5.6841898441314695, "epoch": 1.7852972064692292, "grad_norm": 1.640625, "learning_rate": 0.00046809873626983174, "loss": 5.3873, "mean_token_accuracy": 0.16958157420158387, "num_tokens": 39188984.0, "step": 21250 }, { "entropy": 5.650718355178833, "epoch": 1.785717286284394, "grad_norm": 2.015625, "learning_rate": 0.00046808336336434946, "loss": 5.354, "mean_token_accuracy": 0.1693144455552101, "num_tokens": 39198033.0, "step": 21255 }, { "entropy": 5.640344429016113, "epoch": 1.7861373660995588, "grad_norm": 1.34375, "learning_rate": 0.00046806798703849495, "loss": 5.3114, "mean_token_accuracy": 0.17812950164079666, "num_tokens": 39207429.0, "step": 21260 }, { "entropy": 5.668394279479981, "epoch": 1.7865574459147238, "grad_norm": 1.4375, "learning_rate": 0.0004680526072925404, "loss": 5.3638, "mean_token_accuracy": 0.17503189891576768, "num_tokens": 39216484.0, "step": 21265 }, { "entropy": 5.751850509643555, "epoch": 1.7869775257298888, "grad_norm": 1.3671875, "learning_rate": 0.00046803722412675836, "loss": 5.4421, "mean_token_accuracy": 0.16722988039255143, "num_tokens": 39226385.0, "step": 21270 }, { "entropy": 5.659515428543091, "epoch": 1.7873976055450536, "grad_norm": 1.6953125, "learning_rate": 0.00046802183754142125, "loss": 5.355, "mean_token_accuracy": 0.17532113194465637, "num_tokens": 39235424.0, "step": 21275 }, { "entropy": 5.615523481369019, "epoch": 1.7878176853602183, "grad_norm": 1.40625, "learning_rate": 0.0004680064475368017, "loss": 5.334, "mean_token_accuracy": 0.17136083245277406, "num_tokens": 39244109.0, "step": 21280 }, { "entropy": 5.638778781890869, "epoch": 1.7882377651753834, "grad_norm": 1.640625, "learning_rate": 0.00046799105411317234, "loss": 5.3672, "mean_token_accuracy": 0.18030614107847215, "num_tokens": 39253685.0, "step": 21285 }, { "entropy": 5.636345529556275, "epoch": 1.7886578449905484, "grad_norm": 1.4375, "learning_rate": 0.00046797565727080585, "loss": 5.3363, "mean_token_accuracy": 0.1694641187787056, "num_tokens": 39262743.0, "step": 21290 }, { "entropy": 5.595978879928589, "epoch": 1.7890779248057131, "grad_norm": 1.3203125, "learning_rate": 0.00046796025700997484, "loss": 5.2617, "mean_token_accuracy": 0.1859144985675812, "num_tokens": 39270962.0, "step": 21295 }, { "entropy": 5.629796028137207, "epoch": 1.789498004620878, "grad_norm": 1.3828125, "learning_rate": 0.0004679448533309523, "loss": 5.357, "mean_token_accuracy": 0.1806061625480652, "num_tokens": 39279994.0, "step": 21300 }, { "entropy": 5.648918485641479, "epoch": 1.7899180844360427, "grad_norm": 1.375, "learning_rate": 0.00046792944623401107, "loss": 5.3957, "mean_token_accuracy": 0.17086594551801682, "num_tokens": 39289481.0, "step": 21305 }, { "entropy": 5.726909589767456, "epoch": 1.7903381642512077, "grad_norm": 1.3828125, "learning_rate": 0.00046791403571942405, "loss": 5.4798, "mean_token_accuracy": 0.16001774370670319, "num_tokens": 39298383.0, "step": 21310 }, { "entropy": 5.628692245483398, "epoch": 1.7907582440663727, "grad_norm": 1.3828125, "learning_rate": 0.0004678986217874642, "loss": 5.3709, "mean_token_accuracy": 0.17079650610685349, "num_tokens": 39307809.0, "step": 21315 }, { "entropy": 5.601756286621094, "epoch": 1.7911783238815375, "grad_norm": 1.375, "learning_rate": 0.00046788320443840457, "loss": 5.2556, "mean_token_accuracy": 0.18573263436555862, "num_tokens": 39316332.0, "step": 21320 }, { "entropy": 5.617982006072998, "epoch": 1.7915984036967023, "grad_norm": 1.8359375, "learning_rate": 0.00046786778367251833, "loss": 5.292, "mean_token_accuracy": 0.17370064407587052, "num_tokens": 39325672.0, "step": 21325 }, { "entropy": 5.591927242279053, "epoch": 1.792018483511867, "grad_norm": 1.359375, "learning_rate": 0.00046785235949007854, "loss": 5.3672, "mean_token_accuracy": 0.1754762977361679, "num_tokens": 39334478.0, "step": 21330 }, { "entropy": 5.4915365219116214, "epoch": 1.792438563327032, "grad_norm": 1.2578125, "learning_rate": 0.00046783693189135863, "loss": 5.2474, "mean_token_accuracy": 0.17552462220191956, "num_tokens": 39343573.0, "step": 21335 }, { "entropy": 5.642029523849487, "epoch": 1.7928586431421971, "grad_norm": 1.2734375, "learning_rate": 0.00046782150087663167, "loss": 5.3067, "mean_token_accuracy": 0.18337966054677962, "num_tokens": 39351956.0, "step": 21340 }, { "entropy": 5.699854373931885, "epoch": 1.793278722957362, "grad_norm": 1.3046875, "learning_rate": 0.0004678060664461711, "loss": 5.4656, "mean_token_accuracy": 0.16409681141376495, "num_tokens": 39361911.0, "step": 21345 }, { "entropy": 5.689766883850098, "epoch": 1.7936988027725267, "grad_norm": 1.3359375, "learning_rate": 0.0004677906286002504, "loss": 5.3918, "mean_token_accuracy": 0.1700123593211174, "num_tokens": 39370916.0, "step": 21350 }, { "entropy": 5.681654787063598, "epoch": 1.7941188825876917, "grad_norm": 1.265625, "learning_rate": 0.0004677751873391429, "loss": 5.4125, "mean_token_accuracy": 0.16728848665952684, "num_tokens": 39380662.0, "step": 21355 }, { "entropy": 5.632915210723877, "epoch": 1.7945389624028567, "grad_norm": 1.3203125, "learning_rate": 0.00046775974266312234, "loss": 5.3231, "mean_token_accuracy": 0.18093785941600798, "num_tokens": 39389644.0, "step": 21360 }, { "entropy": 5.627950620651245, "epoch": 1.7949590422180215, "grad_norm": 1.546875, "learning_rate": 0.00046774429457246215, "loss": 5.317, "mean_token_accuracy": 0.1713301122188568, "num_tokens": 39398662.0, "step": 21365 }, { "entropy": 5.665298891067505, "epoch": 1.7953791220331863, "grad_norm": 1.5, "learning_rate": 0.000467728843067436, "loss": 5.4089, "mean_token_accuracy": 0.17280863374471664, "num_tokens": 39408064.0, "step": 21370 }, { "entropy": 5.68812518119812, "epoch": 1.795799201848351, "grad_norm": 1.53125, "learning_rate": 0.0004677133881483177, "loss": 5.4316, "mean_token_accuracy": 0.16077583879232407, "num_tokens": 39418991.0, "step": 21375 }, { "entropy": 5.6052967548370365, "epoch": 1.796219281663516, "grad_norm": 1.7421875, "learning_rate": 0.0004676979298153809, "loss": 5.2948, "mean_token_accuracy": 0.17317767292261124, "num_tokens": 39428707.0, "step": 21380 }, { "entropy": 5.718463802337647, "epoch": 1.796639361478681, "grad_norm": 1.546875, "learning_rate": 0.0004676824680688996, "loss": 5.4489, "mean_token_accuracy": 0.17518044412136077, "num_tokens": 39437173.0, "step": 21385 }, { "entropy": 5.70597095489502, "epoch": 1.7970594412938459, "grad_norm": 1.28125, "learning_rate": 0.00046766700290914743, "loss": 5.3734, "mean_token_accuracy": 0.16496011465787888, "num_tokens": 39446336.0, "step": 21390 }, { "entropy": 5.687495326995849, "epoch": 1.7974795211090107, "grad_norm": 1.296875, "learning_rate": 0.00046765153433639856, "loss": 5.5444, "mean_token_accuracy": 0.16359457075595857, "num_tokens": 39456129.0, "step": 21395 }, { "entropy": 5.662699794769287, "epoch": 1.7978996009241754, "grad_norm": 1.328125, "learning_rate": 0.00046763606235092705, "loss": 5.3918, "mean_token_accuracy": 0.173219533264637, "num_tokens": 39465386.0, "step": 21400 }, { "entropy": 5.6809672832489015, "epoch": 1.7983196807393405, "grad_norm": 1.21875, "learning_rate": 0.0004676205869530068, "loss": 5.4419, "mean_token_accuracy": 0.17025604397058486, "num_tokens": 39475085.0, "step": 21405 }, { "entropy": 5.678685855865479, "epoch": 1.7987397605545055, "grad_norm": 1.671875, "learning_rate": 0.00046760510814291206, "loss": 5.4574, "mean_token_accuracy": 0.16565362811088563, "num_tokens": 39484500.0, "step": 21410 }, { "entropy": 5.675810527801514, "epoch": 1.7991598403696702, "grad_norm": 1.2578125, "learning_rate": 0.000467589625920917, "loss": 5.3463, "mean_token_accuracy": 0.17084126621484758, "num_tokens": 39494049.0, "step": 21415 }, { "entropy": 5.617605352401734, "epoch": 1.799579920184835, "grad_norm": 1.34375, "learning_rate": 0.000467574140287296, "loss": 5.3515, "mean_token_accuracy": 0.17164410948753356, "num_tokens": 39502874.0, "step": 21420 }, { "entropy": 5.603321266174317, "epoch": 1.8, "grad_norm": 1.4453125, "learning_rate": 0.0004675586512423231, "loss": 5.3848, "mean_token_accuracy": 0.16502818018198012, "num_tokens": 39512371.0, "step": 21425 }, { "entropy": 5.67237401008606, "epoch": 1.8004200798151648, "grad_norm": 1.28125, "learning_rate": 0.000467543158786273, "loss": 5.4078, "mean_token_accuracy": 0.1716100186109543, "num_tokens": 39521477.0, "step": 21430 }, { "entropy": 5.677791595458984, "epoch": 1.8008401596303298, "grad_norm": 1.375, "learning_rate": 0.00046752766291941985, "loss": 5.418, "mean_token_accuracy": 0.1607919916510582, "num_tokens": 39530072.0, "step": 21435 }, { "entropy": 5.66340913772583, "epoch": 1.8012602394454946, "grad_norm": 1.59375, "learning_rate": 0.0004675121636420383, "loss": 5.3903, "mean_token_accuracy": 0.16702970415353774, "num_tokens": 39540762.0, "step": 21440 }, { "entropy": 5.696033906936646, "epoch": 1.8016803192606594, "grad_norm": 1.4375, "learning_rate": 0.000467496660954403, "loss": 5.4174, "mean_token_accuracy": 0.16265884339809417, "num_tokens": 39549699.0, "step": 21445 }, { "entropy": 5.677773523330688, "epoch": 1.8021003990758244, "grad_norm": 1.4375, "learning_rate": 0.00046748115485678837, "loss": 5.4414, "mean_token_accuracy": 0.1688990116119385, "num_tokens": 39558725.0, "step": 21450 }, { "entropy": 5.60676121711731, "epoch": 1.8025204788909894, "grad_norm": 1.703125, "learning_rate": 0.00046746564534946926, "loss": 5.2994, "mean_token_accuracy": 0.17619529366493225, "num_tokens": 39567357.0, "step": 21455 }, { "entropy": 5.635344457626343, "epoch": 1.8029405587061542, "grad_norm": 1.609375, "learning_rate": 0.0004674501324327203, "loss": 5.2869, "mean_token_accuracy": 0.17789805233478545, "num_tokens": 39576147.0, "step": 21460 }, { "entropy": 5.669049167633057, "epoch": 1.803360638521319, "grad_norm": 1.3515625, "learning_rate": 0.00046743461610681636, "loss": 5.4369, "mean_token_accuracy": 0.17405525892972945, "num_tokens": 39584963.0, "step": 21465 }, { "entropy": 5.590832757949829, "epoch": 1.8037807183364838, "grad_norm": 1.6640625, "learning_rate": 0.0004674190963720323, "loss": 5.2983, "mean_token_accuracy": 0.17730980813503266, "num_tokens": 39594420.0, "step": 21470 }, { "entropy": 5.597025918960571, "epoch": 1.8042007981516488, "grad_norm": 1.34375, "learning_rate": 0.000467403573228643, "loss": 5.394, "mean_token_accuracy": 0.16461124569177626, "num_tokens": 39603276.0, "step": 21475 }, { "entropy": 5.600082731246948, "epoch": 1.8046208779668138, "grad_norm": 1.7421875, "learning_rate": 0.0004673880466769235, "loss": 5.4545, "mean_token_accuracy": 0.16378810703754426, "num_tokens": 39613161.0, "step": 21480 }, { "entropy": 5.631666612625122, "epoch": 1.8050409577819786, "grad_norm": 1.6796875, "learning_rate": 0.00046737251671714886, "loss": 5.3009, "mean_token_accuracy": 0.17678849697113036, "num_tokens": 39621889.0, "step": 21485 }, { "entropy": 5.751594495773316, "epoch": 1.8054610375971434, "grad_norm": 1.3515625, "learning_rate": 0.00046735698334959407, "loss": 5.4888, "mean_token_accuracy": 0.17027620673179628, "num_tokens": 39632009.0, "step": 21490 }, { "entropy": 5.7419802188873295, "epoch": 1.8058811174123084, "grad_norm": 1.5234375, "learning_rate": 0.00046734144657453443, "loss": 5.3736, "mean_token_accuracy": 0.17260289043188096, "num_tokens": 39640639.0, "step": 21495 }, { "entropy": 5.6267815113067625, "epoch": 1.8063011972274732, "grad_norm": 1.4296875, "learning_rate": 0.00046732590639224505, "loss": 5.394, "mean_token_accuracy": 0.17710949927568437, "num_tokens": 39649837.0, "step": 21500 }, { "entropy": 5.635099458694458, "epoch": 1.8067212770426382, "grad_norm": 1.375, "learning_rate": 0.00046731036280300126, "loss": 5.4226, "mean_token_accuracy": 0.17313309758901596, "num_tokens": 39659890.0, "step": 21505 }, { "entropy": 5.6682960987091064, "epoch": 1.807141356857803, "grad_norm": 1.484375, "learning_rate": 0.00046729481580707846, "loss": 5.3342, "mean_token_accuracy": 0.17116763591766357, "num_tokens": 39669550.0, "step": 21510 }, { "entropy": 5.6409660339355465, "epoch": 1.8075614366729678, "grad_norm": 1.5390625, "learning_rate": 0.00046727926540475207, "loss": 5.3313, "mean_token_accuracy": 0.16743680387735366, "num_tokens": 39678471.0, "step": 21515 }, { "entropy": 5.514544820785522, "epoch": 1.8079815164881328, "grad_norm": 1.34375, "learning_rate": 0.0004672637115962974, "loss": 5.2649, "mean_token_accuracy": 0.17956244349479675, "num_tokens": 39686600.0, "step": 21520 }, { "entropy": 5.597471857070923, "epoch": 1.8084015963032978, "grad_norm": 1.34375, "learning_rate": 0.00046724815438199007, "loss": 5.3991, "mean_token_accuracy": 0.1686672165989876, "num_tokens": 39696848.0, "step": 21525 }, { "entropy": 5.61803035736084, "epoch": 1.8088216761184626, "grad_norm": 1.46875, "learning_rate": 0.00046723259376210577, "loss": 5.335, "mean_token_accuracy": 0.17923670560121535, "num_tokens": 39706051.0, "step": 21530 }, { "entropy": 5.691323709487915, "epoch": 1.8092417559336273, "grad_norm": 1.3046875, "learning_rate": 0.00046721702973692, "loss": 5.3996, "mean_token_accuracy": 0.16498573273420333, "num_tokens": 39716035.0, "step": 21535 }, { "entropy": 5.6498912334442135, "epoch": 1.8096618357487921, "grad_norm": 1.890625, "learning_rate": 0.00046720146230670853, "loss": 5.3763, "mean_token_accuracy": 0.16898033916950225, "num_tokens": 39725717.0, "step": 21540 }, { "entropy": 5.623174715042114, "epoch": 1.8100819155639571, "grad_norm": 1.34375, "learning_rate": 0.0004671858914717471, "loss": 5.3948, "mean_token_accuracy": 0.16846336126327516, "num_tokens": 39734543.0, "step": 21545 }, { "entropy": 5.647709131240845, "epoch": 1.8105019953791222, "grad_norm": 1.4140625, "learning_rate": 0.00046717031723231164, "loss": 5.4131, "mean_token_accuracy": 0.17280775755643846, "num_tokens": 39744503.0, "step": 21550 }, { "entropy": 5.638943243026733, "epoch": 1.810922075194287, "grad_norm": 1.5390625, "learning_rate": 0.0004671547395886779, "loss": 5.3921, "mean_token_accuracy": 0.16712662130594252, "num_tokens": 39753484.0, "step": 21555 }, { "entropy": 5.610015249252319, "epoch": 1.8113421550094517, "grad_norm": 1.5625, "learning_rate": 0.0004671391585411219, "loss": 5.3029, "mean_token_accuracy": 0.1781844601035118, "num_tokens": 39762673.0, "step": 21560 }, { "entropy": 5.645753812789917, "epoch": 1.8117622348246165, "grad_norm": 1.2109375, "learning_rate": 0.00046712357408991965, "loss": 5.4587, "mean_token_accuracy": 0.16241314709186555, "num_tokens": 39773138.0, "step": 21565 }, { "entropy": 5.722913217544556, "epoch": 1.8121823146397815, "grad_norm": 1.3515625, "learning_rate": 0.0004671079862353472, "loss": 5.4498, "mean_token_accuracy": 0.168387970328331, "num_tokens": 39782282.0, "step": 21570 }, { "entropy": 5.623279857635498, "epoch": 1.8126023944549465, "grad_norm": 1.296875, "learning_rate": 0.00046709239497768067, "loss": 5.3519, "mean_token_accuracy": 0.1776757076382637, "num_tokens": 39792035.0, "step": 21575 }, { "entropy": 5.724744987487793, "epoch": 1.8130224742701113, "grad_norm": 1.2734375, "learning_rate": 0.00046707680031719633, "loss": 5.4498, "mean_token_accuracy": 0.16576552540063857, "num_tokens": 39801696.0, "step": 21580 }, { "entropy": 5.743741226196289, "epoch": 1.813442554085276, "grad_norm": 1.2578125, "learning_rate": 0.0004670612022541705, "loss": 5.4751, "mean_token_accuracy": 0.16882607191801072, "num_tokens": 39811449.0, "step": 21585 }, { "entropy": 5.6892838954925535, "epoch": 1.8138626339004411, "grad_norm": 1.2578125, "learning_rate": 0.0004670456007888792, "loss": 5.4313, "mean_token_accuracy": 0.16952537894248962, "num_tokens": 39820339.0, "step": 21590 }, { "entropy": 5.61853666305542, "epoch": 1.8142827137156061, "grad_norm": 1.4921875, "learning_rate": 0.0004670299959215989, "loss": 5.3599, "mean_token_accuracy": 0.17586547285318374, "num_tokens": 39829861.0, "step": 21595 }, { "entropy": 5.645865345001221, "epoch": 1.814702793530771, "grad_norm": 1.296875, "learning_rate": 0.0004670143876526062, "loss": 5.3182, "mean_token_accuracy": 0.17726973295211793, "num_tokens": 39838568.0, "step": 21600 }, { "entropy": 5.625951051712036, "epoch": 1.8151228733459357, "grad_norm": 1.3671875, "learning_rate": 0.00046699877598217754, "loss": 5.3338, "mean_token_accuracy": 0.1771585986018181, "num_tokens": 39847705.0, "step": 21605 }, { "entropy": 5.651676988601684, "epoch": 1.8155429531611005, "grad_norm": 1.34375, "learning_rate": 0.00046698316091058946, "loss": 5.4239, "mean_token_accuracy": 0.16565542817115783, "num_tokens": 39856700.0, "step": 21610 }, { "entropy": 5.672735357284546, "epoch": 1.8159630329762655, "grad_norm": 1.578125, "learning_rate": 0.00046696754243811845, "loss": 5.3138, "mean_token_accuracy": 0.17889431715011597, "num_tokens": 39865647.0, "step": 21615 }, { "entropy": 5.7311060428619385, "epoch": 1.8163831127914305, "grad_norm": 1.6796875, "learning_rate": 0.0004669519205650413, "loss": 5.4334, "mean_token_accuracy": 0.1666131630539894, "num_tokens": 39874705.0, "step": 21620 }, { "entropy": 5.6093682765960695, "epoch": 1.8168031926065953, "grad_norm": 1.3984375, "learning_rate": 0.00046693629529163467, "loss": 5.2633, "mean_token_accuracy": 0.17741246819496154, "num_tokens": 39883795.0, "step": 21625 }, { "entropy": 5.648662471771241, "epoch": 1.81722327242176, "grad_norm": 1.5390625, "learning_rate": 0.0004669206666181755, "loss": 5.3502, "mean_token_accuracy": 0.17730291932821274, "num_tokens": 39893165.0, "step": 21630 }, { "entropy": 5.58931975364685, "epoch": 1.8176433522369249, "grad_norm": 1.8046875, "learning_rate": 0.0004669050345449404, "loss": 5.3901, "mean_token_accuracy": 0.17008297443389891, "num_tokens": 39902241.0, "step": 21635 }, { "entropy": 5.598491477966308, "epoch": 1.8180634320520899, "grad_norm": 1.6484375, "learning_rate": 0.0004668893990722066, "loss": 5.3486, "mean_token_accuracy": 0.1675383910536766, "num_tokens": 39911211.0, "step": 21640 }, { "entropy": 5.629481792449951, "epoch": 1.8184835118672549, "grad_norm": 1.390625, "learning_rate": 0.0004668737602002508, "loss": 5.3409, "mean_token_accuracy": 0.17022158205509186, "num_tokens": 39920192.0, "step": 21645 }, { "entropy": 5.676052808761597, "epoch": 1.8189035916824197, "grad_norm": 1.34375, "learning_rate": 0.00046685811792935016, "loss": 5.3769, "mean_token_accuracy": 0.1712314024567604, "num_tokens": 39929169.0, "step": 21650 }, { "entropy": 5.674107933044434, "epoch": 1.8193236714975844, "grad_norm": 1.34375, "learning_rate": 0.00046684247225978176, "loss": 5.393, "mean_token_accuracy": 0.1656157284975052, "num_tokens": 39939333.0, "step": 21655 }, { "entropy": 5.604327821731568, "epoch": 1.8197437513127495, "grad_norm": 1.3515625, "learning_rate": 0.00046682682319182275, "loss": 5.3847, "mean_token_accuracy": 0.17021397948265077, "num_tokens": 39948042.0, "step": 21660 }, { "entropy": 5.622416305541992, "epoch": 1.8201638311279145, "grad_norm": 1.2890625, "learning_rate": 0.00046681117072575035, "loss": 5.3134, "mean_token_accuracy": 0.1767050787806511, "num_tokens": 39956847.0, "step": 21665 }, { "entropy": 5.790766382217408, "epoch": 1.8205839109430793, "grad_norm": 2.0, "learning_rate": 0.0004667955148618418, "loss": 5.5804, "mean_token_accuracy": 0.15933856070041658, "num_tokens": 39966598.0, "step": 21670 }, { "entropy": 5.597654056549072, "epoch": 1.821003990758244, "grad_norm": 1.4765625, "learning_rate": 0.0004667798556003745, "loss": 5.2301, "mean_token_accuracy": 0.1689037188887596, "num_tokens": 39975236.0, "step": 21675 }, { "entropy": 5.608970832824707, "epoch": 1.8214240705734088, "grad_norm": 1.59375, "learning_rate": 0.0004667641929416258, "loss": 5.3879, "mean_token_accuracy": 0.17176640927791595, "num_tokens": 39984582.0, "step": 21680 }, { "entropy": 5.623990154266357, "epoch": 1.8218441503885738, "grad_norm": 1.15625, "learning_rate": 0.0004667485268858731, "loss": 5.3783, "mean_token_accuracy": 0.17252393662929535, "num_tokens": 39993122.0, "step": 21685 }, { "entropy": 5.652155160903931, "epoch": 1.8222642302037388, "grad_norm": 1.5546875, "learning_rate": 0.00046673285743339406, "loss": 5.3438, "mean_token_accuracy": 0.1751272648572922, "num_tokens": 40002974.0, "step": 21690 }, { "entropy": 5.646127367019654, "epoch": 1.8226843100189036, "grad_norm": 1.265625, "learning_rate": 0.00046671718458446616, "loss": 5.3852, "mean_token_accuracy": 0.17070560306310653, "num_tokens": 40011790.0, "step": 21695 }, { "entropy": 5.713921976089478, "epoch": 1.8231043898340684, "grad_norm": 1.1875, "learning_rate": 0.0004667015083393671, "loss": 5.3966, "mean_token_accuracy": 0.17200501561164855, "num_tokens": 40021327.0, "step": 21700 }, { "entropy": 5.636666631698608, "epoch": 1.8235244696492332, "grad_norm": 1.28125, "learning_rate": 0.0004666858286983744, "loss": 5.3929, "mean_token_accuracy": 0.17091110199689866, "num_tokens": 40030471.0, "step": 21705 }, { "entropy": 5.674646282196045, "epoch": 1.8239445494643982, "grad_norm": 1.296875, "learning_rate": 0.0004666701456617661, "loss": 5.3948, "mean_token_accuracy": 0.1720045655965805, "num_tokens": 40039305.0, "step": 21710 }, { "entropy": 5.640139770507813, "epoch": 1.8243646292795632, "grad_norm": 1.3671875, "learning_rate": 0.00046665445922981975, "loss": 5.3103, "mean_token_accuracy": 0.17814622223377227, "num_tokens": 40047389.0, "step": 21715 }, { "entropy": 5.697626256942749, "epoch": 1.824784709094728, "grad_norm": 1.65625, "learning_rate": 0.0004666387694028134, "loss": 5.3839, "mean_token_accuracy": 0.17282926440238952, "num_tokens": 40057640.0, "step": 21720 }, { "entropy": 5.588255500793457, "epoch": 1.8252047889098928, "grad_norm": 1.2421875, "learning_rate": 0.0004666230761810249, "loss": 5.3463, "mean_token_accuracy": 0.1746961608529091, "num_tokens": 40066770.0, "step": 21725 }, { "entropy": 5.5960245609283445, "epoch": 1.8256248687250578, "grad_norm": 1.2578125, "learning_rate": 0.0004666073795647323, "loss": 5.3288, "mean_token_accuracy": 0.17138356268405913, "num_tokens": 40075902.0, "step": 21730 }, { "entropy": 5.595876836776734, "epoch": 1.8260449485402226, "grad_norm": 1.3203125, "learning_rate": 0.00046659167955421366, "loss": 5.367, "mean_token_accuracy": 0.16742293983697892, "num_tokens": 40084945.0, "step": 21735 }, { "entropy": 5.574261808395386, "epoch": 1.8264650283553876, "grad_norm": 1.3125, "learning_rate": 0.000466575976149747, "loss": 5.2742, "mean_token_accuracy": 0.1774066463112831, "num_tokens": 40095104.0, "step": 21740 }, { "entropy": 5.708527374267578, "epoch": 1.8268851081705524, "grad_norm": 1.5, "learning_rate": 0.0004665602693516106, "loss": 5.4188, "mean_token_accuracy": 0.17146946042776107, "num_tokens": 40105329.0, "step": 21745 }, { "entropy": 5.589442586898803, "epoch": 1.8273051879857172, "grad_norm": 1.921875, "learning_rate": 0.0004665445591600827, "loss": 5.2376, "mean_token_accuracy": 0.18168216943740845, "num_tokens": 40114555.0, "step": 21750 }, { "entropy": 5.624778461456299, "epoch": 1.8277252678008822, "grad_norm": 1.2421875, "learning_rate": 0.0004665288455754415, "loss": 5.2822, "mean_token_accuracy": 0.18185721337795258, "num_tokens": 40123314.0, "step": 21755 }, { "entropy": 5.613637542724609, "epoch": 1.8281453476160472, "grad_norm": 1.3671875, "learning_rate": 0.0004665131285979655, "loss": 5.3253, "mean_token_accuracy": 0.17175379693508147, "num_tokens": 40132483.0, "step": 21760 }, { "entropy": 5.63437066078186, "epoch": 1.828565427431212, "grad_norm": 1.5859375, "learning_rate": 0.00046649740822793303, "loss": 5.355, "mean_token_accuracy": 0.17184757441282272, "num_tokens": 40141800.0, "step": 21765 }, { "entropy": 5.688283252716064, "epoch": 1.8289855072463768, "grad_norm": 1.6640625, "learning_rate": 0.0004664816844656225, "loss": 5.3415, "mean_token_accuracy": 0.18133000284433365, "num_tokens": 40149892.0, "step": 21770 }, { "entropy": 5.625762462615967, "epoch": 1.8294055870615415, "grad_norm": 1.4921875, "learning_rate": 0.00046646595731131263, "loss": 5.3221, "mean_token_accuracy": 0.17174559831619263, "num_tokens": 40159376.0, "step": 21775 }, { "entropy": 5.614609718322754, "epoch": 1.8298256668767066, "grad_norm": 1.5625, "learning_rate": 0.0004664502267652819, "loss": 5.2827, "mean_token_accuracy": 0.17843341827392578, "num_tokens": 40168497.0, "step": 21780 }, { "entropy": 5.674923658370972, "epoch": 1.8302457466918716, "grad_norm": 1.3203125, "learning_rate": 0.00046643449282780894, "loss": 5.3782, "mean_token_accuracy": 0.16659992337226867, "num_tokens": 40177432.0, "step": 21785 }, { "entropy": 5.654786205291748, "epoch": 1.8306658265070364, "grad_norm": 1.25, "learning_rate": 0.0004664187554991725, "loss": 5.2698, "mean_token_accuracy": 0.17321840226650237, "num_tokens": 40186582.0, "step": 21790 }, { "entropy": 5.637910079956055, "epoch": 1.8310859063222011, "grad_norm": 1.65625, "learning_rate": 0.0004664030147796514, "loss": 5.3276, "mean_token_accuracy": 0.17326397448778152, "num_tokens": 40196094.0, "step": 21795 }, { "entropy": 5.5886390686035154, "epoch": 1.8315059861373661, "grad_norm": 1.4375, "learning_rate": 0.0004663872706695244, "loss": 5.3779, "mean_token_accuracy": 0.17434979230165482, "num_tokens": 40205239.0, "step": 21800 }, { "entropy": 5.641726875305176, "epoch": 1.831926065952531, "grad_norm": 1.265625, "learning_rate": 0.0004663715231690706, "loss": 5.4406, "mean_token_accuracy": 0.1751034140586853, "num_tokens": 40213908.0, "step": 21805 }, { "entropy": 5.709264898300171, "epoch": 1.832346145767696, "grad_norm": 1.34375, "learning_rate": 0.00046635577227856873, "loss": 5.4025, "mean_token_accuracy": 0.17268626689910888, "num_tokens": 40223370.0, "step": 21810 }, { "entropy": 5.7190502166748045, "epoch": 1.8327662255828607, "grad_norm": 1.328125, "learning_rate": 0.0004663400179982978, "loss": 5.487, "mean_token_accuracy": 0.1673346996307373, "num_tokens": 40233934.0, "step": 21815 }, { "entropy": 5.70527868270874, "epoch": 1.8331863053980255, "grad_norm": 1.8203125, "learning_rate": 0.00046632426032853705, "loss": 5.366, "mean_token_accuracy": 0.16772017180919646, "num_tokens": 40244335.0, "step": 21820 }, { "entropy": 5.570014429092407, "epoch": 1.8336063852131905, "grad_norm": 1.3359375, "learning_rate": 0.00046630849926956555, "loss": 5.3147, "mean_token_accuracy": 0.1714258924126625, "num_tokens": 40254354.0, "step": 21825 }, { "entropy": 5.6582074642181395, "epoch": 1.8340264650283555, "grad_norm": 1.328125, "learning_rate": 0.00046629273482166244, "loss": 5.3156, "mean_token_accuracy": 0.17588206827640535, "num_tokens": 40262748.0, "step": 21830 }, { "entropy": 5.697437191009522, "epoch": 1.8344465448435203, "grad_norm": 1.421875, "learning_rate": 0.00046627696698510706, "loss": 5.4048, "mean_token_accuracy": 0.17420812398195268, "num_tokens": 40271818.0, "step": 21835 }, { "entropy": 5.632059001922608, "epoch": 1.834866624658685, "grad_norm": 1.3359375, "learning_rate": 0.0004662611957601788, "loss": 5.4213, "mean_token_accuracy": 0.16834606230258942, "num_tokens": 40280552.0, "step": 21840 }, { "entropy": 5.690567255020142, "epoch": 1.83528670447385, "grad_norm": 1.921875, "learning_rate": 0.00046624542114715687, "loss": 5.3115, "mean_token_accuracy": 0.1798562154173851, "num_tokens": 40289368.0, "step": 21845 }, { "entropy": 5.7882728099823, "epoch": 1.835706784289015, "grad_norm": 1.3046875, "learning_rate": 0.0004662296431463208, "loss": 5.5121, "mean_token_accuracy": 0.1584714248776436, "num_tokens": 40298884.0, "step": 21850 }, { "entropy": 5.69461989402771, "epoch": 1.83612686410418, "grad_norm": 1.2109375, "learning_rate": 0.00046621386175795, "loss": 5.4196, "mean_token_accuracy": 0.16526482701301576, "num_tokens": 40307886.0, "step": 21855 }, { "entropy": 5.614387512207031, "epoch": 1.8365469439193447, "grad_norm": 1.5, "learning_rate": 0.00046619807698232413, "loss": 5.3323, "mean_token_accuracy": 0.16994198113679887, "num_tokens": 40317688.0, "step": 21860 }, { "entropy": 5.691216659545899, "epoch": 1.8369670237345095, "grad_norm": 1.484375, "learning_rate": 0.0004661822888197228, "loss": 5.391, "mean_token_accuracy": 0.1661013074219227, "num_tokens": 40327630.0, "step": 21865 }, { "entropy": 5.651998567581177, "epoch": 1.8373871035496743, "grad_norm": 1.671875, "learning_rate": 0.00046616649727042564, "loss": 5.3661, "mean_token_accuracy": 0.17099616825580596, "num_tokens": 40336613.0, "step": 21870 }, { "entropy": 5.646777057647705, "epoch": 1.8378071833648393, "grad_norm": 1.4296875, "learning_rate": 0.00046615070233471244, "loss": 5.4562, "mean_token_accuracy": 0.1672051414847374, "num_tokens": 40346582.0, "step": 21875 }, { "entropy": 5.751259517669678, "epoch": 1.8382272631800043, "grad_norm": 1.2109375, "learning_rate": 0.00046613490401286304, "loss": 5.4752, "mean_token_accuracy": 0.1641298934817314, "num_tokens": 40355960.0, "step": 21880 }, { "entropy": 5.773221445083618, "epoch": 1.838647342995169, "grad_norm": 1.25, "learning_rate": 0.00046611910230515716, "loss": 5.3246, "mean_token_accuracy": 0.17324539572000502, "num_tokens": 40366043.0, "step": 21885 }, { "entropy": 5.6297935962677, "epoch": 1.8390674228103339, "grad_norm": 1.328125, "learning_rate": 0.0004661032972118748, "loss": 5.3792, "mean_token_accuracy": 0.1739889457821846, "num_tokens": 40374919.0, "step": 21890 }, { "entropy": 5.586809396743774, "epoch": 1.8394875026254989, "grad_norm": 1.2578125, "learning_rate": 0.00046608748873329587, "loss": 5.3512, "mean_token_accuracy": 0.17698893696069717, "num_tokens": 40383415.0, "step": 21895 }, { "entropy": 5.741325616836548, "epoch": 1.8399075824406639, "grad_norm": 1.234375, "learning_rate": 0.0004660716768697005, "loss": 5.3999, "mean_token_accuracy": 0.16888994574546815, "num_tokens": 40392252.0, "step": 21900 }, { "entropy": 5.614504766464234, "epoch": 1.8403276622558287, "grad_norm": 1.375, "learning_rate": 0.0004660558616213689, "loss": 5.2419, "mean_token_accuracy": 0.1856852650642395, "num_tokens": 40400717.0, "step": 21905 }, { "entropy": 5.57713942527771, "epoch": 1.8407477420709935, "grad_norm": 1.21875, "learning_rate": 0.00046604004298858093, "loss": 5.2895, "mean_token_accuracy": 0.18077120929956436, "num_tokens": 40409236.0, "step": 21910 }, { "entropy": 5.577246189117432, "epoch": 1.8411678218861582, "grad_norm": 1.2890625, "learning_rate": 0.0004660242209716171, "loss": 5.3133, "mean_token_accuracy": 0.17522037625312806, "num_tokens": 40419073.0, "step": 21915 }, { "entropy": 5.727301597595215, "epoch": 1.8415879017013232, "grad_norm": 1.34375, "learning_rate": 0.0004660083955707575, "loss": 5.4115, "mean_token_accuracy": 0.1722704529762268, "num_tokens": 40428427.0, "step": 21920 }, { "entropy": 5.678405237197876, "epoch": 1.8420079815164883, "grad_norm": 1.265625, "learning_rate": 0.0004659925667862825, "loss": 5.3801, "mean_token_accuracy": 0.17540597915649414, "num_tokens": 40437350.0, "step": 21925 }, { "entropy": 5.644293546676636, "epoch": 1.842428061331653, "grad_norm": 1.3359375, "learning_rate": 0.0004659767346184725, "loss": 5.3908, "mean_token_accuracy": 0.17217467427253724, "num_tokens": 40446059.0, "step": 21930 }, { "entropy": 5.648314523696899, "epoch": 1.8428481411468178, "grad_norm": 1.1484375, "learning_rate": 0.00046596089906760803, "loss": 5.3842, "mean_token_accuracy": 0.17176232039928435, "num_tokens": 40454959.0, "step": 21935 }, { "entropy": 5.662789678573608, "epoch": 1.8432682209619826, "grad_norm": 1.40625, "learning_rate": 0.0004659450601339696, "loss": 5.3968, "mean_token_accuracy": 0.17346233129501343, "num_tokens": 40464202.0, "step": 21940 }, { "entropy": 5.638479089736938, "epoch": 1.8436883007771476, "grad_norm": 1.453125, "learning_rate": 0.0004659292178178377, "loss": 5.3427, "mean_token_accuracy": 0.1746865801513195, "num_tokens": 40473331.0, "step": 21945 }, { "entropy": 5.65547399520874, "epoch": 1.8441083805923126, "grad_norm": 1.625, "learning_rate": 0.000465913372119493, "loss": 5.3089, "mean_token_accuracy": 0.17193447351455687, "num_tokens": 40482098.0, "step": 21950 }, { "entropy": 5.674790859222412, "epoch": 1.8445284604074774, "grad_norm": 1.2890625, "learning_rate": 0.0004658975230392162, "loss": 5.3536, "mean_token_accuracy": 0.178443942964077, "num_tokens": 40491134.0, "step": 21955 }, { "entropy": 5.706801652908325, "epoch": 1.8449485402226422, "grad_norm": 1.3203125, "learning_rate": 0.0004658816705772882, "loss": 5.4789, "mean_token_accuracy": 0.16973639875650406, "num_tokens": 40501488.0, "step": 21960 }, { "entropy": 5.581787538528443, "epoch": 1.8453686200378072, "grad_norm": 1.2421875, "learning_rate": 0.0004658658147339896, "loss": 5.2266, "mean_token_accuracy": 0.18129412233829498, "num_tokens": 40510506.0, "step": 21965 }, { "entropy": 5.672901391983032, "epoch": 1.8457886998529722, "grad_norm": 1.328125, "learning_rate": 0.00046584995550960146, "loss": 5.3732, "mean_token_accuracy": 0.17865750938653946, "num_tokens": 40520222.0, "step": 21970 }, { "entropy": 5.584681797027588, "epoch": 1.846208779668137, "grad_norm": 1.2265625, "learning_rate": 0.00046583409290440453, "loss": 5.2809, "mean_token_accuracy": 0.17908318787813188, "num_tokens": 40528824.0, "step": 21975 }, { "entropy": 5.5373616218566895, "epoch": 1.8466288594833018, "grad_norm": 1.3359375, "learning_rate": 0.0004658182269186799, "loss": 5.3659, "mean_token_accuracy": 0.16913065910339356, "num_tokens": 40538144.0, "step": 21980 }, { "entropy": 5.64822678565979, "epoch": 1.8470489392984666, "grad_norm": 1.2578125, "learning_rate": 0.0004658023575527087, "loss": 5.4214, "mean_token_accuracy": 0.17093602418899537, "num_tokens": 40547457.0, "step": 21985 }, { "entropy": 5.724355268478393, "epoch": 1.8474690191136316, "grad_norm": 1.3046875, "learning_rate": 0.000465786484806772, "loss": 5.2834, "mean_token_accuracy": 0.1817471370100975, "num_tokens": 40556005.0, "step": 21990 }, { "entropy": 5.489060354232788, "epoch": 1.8478890989287966, "grad_norm": 1.265625, "learning_rate": 0.00046577060868115095, "loss": 5.2522, "mean_token_accuracy": 0.17731622010469436, "num_tokens": 40565018.0, "step": 21995 }, { "entropy": 5.5660477638244625, "epoch": 1.8483091787439614, "grad_norm": 1.2890625, "learning_rate": 0.0004657547291761268, "loss": 5.328, "mean_token_accuracy": 0.16542342603206633, "num_tokens": 40574931.0, "step": 22000 }, { "entropy": 5.662542819976807, "epoch": 1.8487292585591262, "grad_norm": 1.328125, "learning_rate": 0.00046573884629198077, "loss": 5.3064, "mean_token_accuracy": 0.17560895532369614, "num_tokens": 40584496.0, "step": 22005 }, { "entropy": 5.712861347198486, "epoch": 1.849149338374291, "grad_norm": 1.3984375, "learning_rate": 0.0004657229600289944, "loss": 5.4127, "mean_token_accuracy": 0.16572435200214386, "num_tokens": 40594363.0, "step": 22010 }, { "entropy": 5.641199684143066, "epoch": 1.849569418189456, "grad_norm": 1.3125, "learning_rate": 0.0004657070703874489, "loss": 5.4345, "mean_token_accuracy": 0.1685244232416153, "num_tokens": 40603001.0, "step": 22015 }, { "entropy": 5.603077220916748, "epoch": 1.849989498004621, "grad_norm": 1.2109375, "learning_rate": 0.00046569117736762597, "loss": 5.3624, "mean_token_accuracy": 0.1757808193564415, "num_tokens": 40612660.0, "step": 22020 }, { "entropy": 5.599392080307007, "epoch": 1.8504095778197858, "grad_norm": 1.328125, "learning_rate": 0.00046567528096980686, "loss": 5.2727, "mean_token_accuracy": 0.17757227271795273, "num_tokens": 40622209.0, "step": 22025 }, { "entropy": 5.689568948745728, "epoch": 1.8508296576349506, "grad_norm": 1.5859375, "learning_rate": 0.00046565938119427346, "loss": 5.3844, "mean_token_accuracy": 0.16621674001216888, "num_tokens": 40632011.0, "step": 22030 }, { "entropy": 5.615991640090942, "epoch": 1.8512497374501156, "grad_norm": 1.265625, "learning_rate": 0.0004656434780413073, "loss": 5.2703, "mean_token_accuracy": 0.1767064481973648, "num_tokens": 40641201.0, "step": 22035 }, { "entropy": 5.595028305053711, "epoch": 1.8516698172652803, "grad_norm": 1.4375, "learning_rate": 0.00046562757151119, "loss": 5.3252, "mean_token_accuracy": 0.17203227579593658, "num_tokens": 40650752.0, "step": 22040 }, { "entropy": 5.58217225074768, "epoch": 1.8520898970804454, "grad_norm": 1.234375, "learning_rate": 0.0004656116616042035, "loss": 5.322, "mean_token_accuracy": 0.17230453789234162, "num_tokens": 40659975.0, "step": 22045 }, { "entropy": 5.567688846588135, "epoch": 1.8525099768956101, "grad_norm": 1.1796875, "learning_rate": 0.00046559574832062955, "loss": 5.3465, "mean_token_accuracy": 0.17881006896495819, "num_tokens": 40668944.0, "step": 22050 }, { "entropy": 5.735271883010864, "epoch": 1.852930056710775, "grad_norm": 1.421875, "learning_rate": 0.00046557983166075, "loss": 5.4974, "mean_token_accuracy": 0.1705012798309326, "num_tokens": 40678333.0, "step": 22055 }, { "entropy": 5.564062070846558, "epoch": 1.85335013652594, "grad_norm": 2.25, "learning_rate": 0.00046556391162484696, "loss": 5.2249, "mean_token_accuracy": 0.17906277775764465, "num_tokens": 40687781.0, "step": 22060 }, { "entropy": 5.588773012161255, "epoch": 1.853770216341105, "grad_norm": 1.375, "learning_rate": 0.0004655479882132023, "loss": 5.4058, "mean_token_accuracy": 0.1747704863548279, "num_tokens": 40697637.0, "step": 22065 }, { "entropy": 5.582477378845215, "epoch": 1.8541902961562697, "grad_norm": 1.34375, "learning_rate": 0.0004655320614260982, "loss": 5.2545, "mean_token_accuracy": 0.17724298536777497, "num_tokens": 40707097.0, "step": 22070 }, { "entropy": 5.7074973583221436, "epoch": 1.8546103759714345, "grad_norm": 1.6953125, "learning_rate": 0.00046551613126381673, "loss": 5.4568, "mean_token_accuracy": 0.16699569970369338, "num_tokens": 40716821.0, "step": 22075 }, { "entropy": 5.651368951797485, "epoch": 1.8550304557865993, "grad_norm": 1.3125, "learning_rate": 0.0004655001977266401, "loss": 5.3392, "mean_token_accuracy": 0.17080808728933333, "num_tokens": 40726731.0, "step": 22080 }, { "entropy": 5.683496332168579, "epoch": 1.8554505356017643, "grad_norm": 1.2265625, "learning_rate": 0.00046548426081485046, "loss": 5.3118, "mean_token_accuracy": 0.1734781190752983, "num_tokens": 40736935.0, "step": 22085 }, { "entropy": 5.674536466598511, "epoch": 1.8558706154169293, "grad_norm": 1.390625, "learning_rate": 0.00046546832052873026, "loss": 5.4569, "mean_token_accuracy": 0.16797720938920974, "num_tokens": 40746643.0, "step": 22090 }, { "entropy": 5.773360300064087, "epoch": 1.8562906952320941, "grad_norm": 1.2578125, "learning_rate": 0.00046545237686856195, "loss": 5.5021, "mean_token_accuracy": 0.16252224892377853, "num_tokens": 40755713.0, "step": 22095 }, { "entropy": 5.745291662216187, "epoch": 1.856710775047259, "grad_norm": 1.453125, "learning_rate": 0.00046543642983462775, "loss": 5.4755, "mean_token_accuracy": 0.17116216123104094, "num_tokens": 40764878.0, "step": 22100 }, { "entropy": 5.70435905456543, "epoch": 1.857130854862424, "grad_norm": 1.2421875, "learning_rate": 0.00046542047942721025, "loss": 5.3763, "mean_token_accuracy": 0.17294495701789855, "num_tokens": 40774101.0, "step": 22105 }, { "entropy": 5.662673377990723, "epoch": 1.8575509346775887, "grad_norm": 1.34375, "learning_rate": 0.000465404525646592, "loss": 5.3546, "mean_token_accuracy": 0.1730918511748314, "num_tokens": 40783126.0, "step": 22110 }, { "entropy": 5.558742523193359, "epoch": 1.8579710144927537, "grad_norm": 1.3203125, "learning_rate": 0.0004653885684930557, "loss": 5.3408, "mean_token_accuracy": 0.17543695122003555, "num_tokens": 40792508.0, "step": 22115 }, { "entropy": 5.638851690292358, "epoch": 1.8583910943079185, "grad_norm": 1.2421875, "learning_rate": 0.0004653726079668839, "loss": 5.3736, "mean_token_accuracy": 0.1744435727596283, "num_tokens": 40802252.0, "step": 22120 }, { "entropy": 5.594110679626465, "epoch": 1.8588111741230833, "grad_norm": 1.359375, "learning_rate": 0.0004653566440683594, "loss": 5.2069, "mean_token_accuracy": 0.18530822545289993, "num_tokens": 40811041.0, "step": 22125 }, { "entropy": 5.588617467880249, "epoch": 1.8592312539382483, "grad_norm": 1.34375, "learning_rate": 0.000465340676797765, "loss": 5.2472, "mean_token_accuracy": 0.1726512759923935, "num_tokens": 40819976.0, "step": 22130 }, { "entropy": 5.55437970161438, "epoch": 1.8596513337534133, "grad_norm": 1.3671875, "learning_rate": 0.00046532470615538344, "loss": 5.3, "mean_token_accuracy": 0.17627189308404922, "num_tokens": 40828544.0, "step": 22135 }, { "entropy": 5.619093751907348, "epoch": 1.860071413568578, "grad_norm": 1.296875, "learning_rate": 0.00046530873214149776, "loss": 5.4031, "mean_token_accuracy": 0.17888118773698808, "num_tokens": 40838386.0, "step": 22140 }, { "entropy": 5.716306734085083, "epoch": 1.8604914933837429, "grad_norm": 1.3046875, "learning_rate": 0.0004652927547563908, "loss": 5.3773, "mean_token_accuracy": 0.16736137866973877, "num_tokens": 40847047.0, "step": 22145 }, { "entropy": 5.63306770324707, "epoch": 1.8609115731989077, "grad_norm": 1.5234375, "learning_rate": 0.0004652767740003458, "loss": 5.3992, "mean_token_accuracy": 0.17356206327676774, "num_tokens": 40856653.0, "step": 22150 }, { "entropy": 5.715381336212158, "epoch": 1.8613316530140727, "grad_norm": 1.3828125, "learning_rate": 0.00046526078987364566, "loss": 5.4628, "mean_token_accuracy": 0.16745153367519378, "num_tokens": 40865176.0, "step": 22155 }, { "entropy": 5.713296556472779, "epoch": 1.8617517328292377, "grad_norm": 1.3203125, "learning_rate": 0.0004652448023765736, "loss": 5.5087, "mean_token_accuracy": 0.17654276341199876, "num_tokens": 40874084.0, "step": 22160 }, { "entropy": 5.7095225811004635, "epoch": 1.8621718126444025, "grad_norm": 1.4921875, "learning_rate": 0.0004652288115094129, "loss": 5.3929, "mean_token_accuracy": 0.1741943970322609, "num_tokens": 40883704.0, "step": 22165 }, { "entropy": 5.669970941543579, "epoch": 1.8625918924595672, "grad_norm": 1.2890625, "learning_rate": 0.0004652128172724466, "loss": 5.4364, "mean_token_accuracy": 0.16834318935871123, "num_tokens": 40893232.0, "step": 22170 }, { "entropy": 5.623793125152588, "epoch": 1.8630119722747323, "grad_norm": 1.328125, "learning_rate": 0.00046519681966595834, "loss": 5.3053, "mean_token_accuracy": 0.18128742128610612, "num_tokens": 40902242.0, "step": 22175 }, { "entropy": 5.603348350524902, "epoch": 1.863432052089897, "grad_norm": 1.359375, "learning_rate": 0.0004651808186902313, "loss": 5.3319, "mean_token_accuracy": 0.17357497066259384, "num_tokens": 40912349.0, "step": 22180 }, { "entropy": 5.63823561668396, "epoch": 1.863852131905062, "grad_norm": 1.5234375, "learning_rate": 0.000465164814345549, "loss": 5.3295, "mean_token_accuracy": 0.18075911849737167, "num_tokens": 40922206.0, "step": 22185 }, { "entropy": 5.695439434051513, "epoch": 1.8642722117202268, "grad_norm": 1.3984375, "learning_rate": 0.00046514880663219493, "loss": 5.3078, "mean_token_accuracy": 0.17492891997098922, "num_tokens": 40931145.0, "step": 22190 }, { "entropy": 5.581724739074707, "epoch": 1.8646922915353916, "grad_norm": 1.28125, "learning_rate": 0.0004651327955504526, "loss": 5.227, "mean_token_accuracy": 0.18555289059877395, "num_tokens": 40939917.0, "step": 22195 }, { "entropy": 5.589807987213135, "epoch": 1.8651123713505566, "grad_norm": 1.296875, "learning_rate": 0.0004651167811006058, "loss": 5.2953, "mean_token_accuracy": 0.17969516515731812, "num_tokens": 40947972.0, "step": 22200 }, { "entropy": 5.4864636898040775, "epoch": 1.8655324511657216, "grad_norm": 1.3828125, "learning_rate": 0.000465100763282938, "loss": 5.2555, "mean_token_accuracy": 0.1773657724261284, "num_tokens": 40956999.0, "step": 22205 }, { "entropy": 5.5771567821502686, "epoch": 1.8659525309808864, "grad_norm": 1.2421875, "learning_rate": 0.0004650847420977332, "loss": 5.2257, "mean_token_accuracy": 0.18388084918260575, "num_tokens": 40965917.0, "step": 22210 }, { "entropy": 5.6036945343017575, "epoch": 1.8663726107960512, "grad_norm": 1.2421875, "learning_rate": 0.00046506871754527495, "loss": 5.3267, "mean_token_accuracy": 0.17044262886047362, "num_tokens": 40976545.0, "step": 22215 }, { "entropy": 5.667624187469483, "epoch": 1.866792690611216, "grad_norm": 1.2578125, "learning_rate": 0.00046505268962584735, "loss": 5.3608, "mean_token_accuracy": 0.1779804602265358, "num_tokens": 40985890.0, "step": 22220 }, { "entropy": 5.649150419235229, "epoch": 1.867212770426381, "grad_norm": 1.4921875, "learning_rate": 0.0004650366583397342, "loss": 5.3532, "mean_token_accuracy": 0.1736404314637184, "num_tokens": 40995255.0, "step": 22225 }, { "entropy": 5.658413934707641, "epoch": 1.867632850241546, "grad_norm": 1.421875, "learning_rate": 0.0004650206236872194, "loss": 5.4099, "mean_token_accuracy": 0.1688782885670662, "num_tokens": 41004419.0, "step": 22230 }, { "entropy": 5.551290559768677, "epoch": 1.8680529300567108, "grad_norm": 3.0625, "learning_rate": 0.0004650045856685872, "loss": 5.1684, "mean_token_accuracy": 0.19379522502422333, "num_tokens": 41013179.0, "step": 22235 }, { "entropy": 5.574545621871948, "epoch": 1.8684730098718756, "grad_norm": 1.3515625, "learning_rate": 0.00046498854428412157, "loss": 5.2586, "mean_token_accuracy": 0.17610279172658921, "num_tokens": 41022307.0, "step": 22240 }, { "entropy": 5.584465265274048, "epoch": 1.8688930896870404, "grad_norm": 1.1953125, "learning_rate": 0.00046497249953410675, "loss": 5.4041, "mean_token_accuracy": 0.1725468173623085, "num_tokens": 41032331.0, "step": 22245 }, { "entropy": 5.673091411590576, "epoch": 1.8693131695022054, "grad_norm": 1.328125, "learning_rate": 0.0004649564514188269, "loss": 5.4283, "mean_token_accuracy": 0.17023940831422807, "num_tokens": 41041895.0, "step": 22250 }, { "entropy": 5.588696575164795, "epoch": 1.8697332493173704, "grad_norm": 1.296875, "learning_rate": 0.0004649403999385662, "loss": 5.2562, "mean_token_accuracy": 0.17987769544124604, "num_tokens": 41051643.0, "step": 22255 }, { "entropy": 5.619039726257324, "epoch": 1.8701533291325352, "grad_norm": 1.3203125, "learning_rate": 0.0004649243450936092, "loss": 5.2732, "mean_token_accuracy": 0.17971949875354767, "num_tokens": 41060478.0, "step": 22260 }, { "entropy": 5.559855890274048, "epoch": 1.8705734089477, "grad_norm": 1.21875, "learning_rate": 0.0004649082868842403, "loss": 5.342, "mean_token_accuracy": 0.17189270853996277, "num_tokens": 41069389.0, "step": 22265 }, { "entropy": 5.50737624168396, "epoch": 1.870993488762865, "grad_norm": 1.40625, "learning_rate": 0.00046489222531074376, "loss": 5.2905, "mean_token_accuracy": 0.1808660864830017, "num_tokens": 41078529.0, "step": 22270 }, { "entropy": 5.698597478866577, "epoch": 1.87141356857803, "grad_norm": 1.7890625, "learning_rate": 0.00046487616037340436, "loss": 5.3898, "mean_token_accuracy": 0.17067276537418366, "num_tokens": 41087593.0, "step": 22275 }, { "entropy": 5.727426338195801, "epoch": 1.8718336483931948, "grad_norm": 1.1953125, "learning_rate": 0.0004648600920725065, "loss": 5.3667, "mean_token_accuracy": 0.1694189727306366, "num_tokens": 41098317.0, "step": 22280 }, { "entropy": 5.639452648162842, "epoch": 1.8722537282083596, "grad_norm": 1.3359375, "learning_rate": 0.00046484402040833486, "loss": 5.3661, "mean_token_accuracy": 0.1721063882112503, "num_tokens": 41108659.0, "step": 22285 }, { "entropy": 5.701750898361206, "epoch": 1.8726738080235243, "grad_norm": 1.171875, "learning_rate": 0.00046482794538117413, "loss": 5.4564, "mean_token_accuracy": 0.17068351805210114, "num_tokens": 41117504.0, "step": 22290 }, { "entropy": 5.677620840072632, "epoch": 1.8730938878386894, "grad_norm": 1.2109375, "learning_rate": 0.00046481186699130913, "loss": 5.3154, "mean_token_accuracy": 0.1742495611310005, "num_tokens": 41126249.0, "step": 22295 }, { "entropy": 5.505521583557129, "epoch": 1.8735139676538544, "grad_norm": 1.3359375, "learning_rate": 0.0004647957852390247, "loss": 5.2023, "mean_token_accuracy": 0.17846413403749467, "num_tokens": 41134956.0, "step": 22300 }, { "entropy": 5.619848108291626, "epoch": 1.8739340474690191, "grad_norm": 1.59375, "learning_rate": 0.00046477970012460555, "loss": 5.3039, "mean_token_accuracy": 0.17693169862031938, "num_tokens": 41144340.0, "step": 22305 }, { "entropy": 5.574439334869385, "epoch": 1.874354127284184, "grad_norm": 1.3046875, "learning_rate": 0.0004647636116483367, "loss": 5.3211, "mean_token_accuracy": 0.17290742844343185, "num_tokens": 41152937.0, "step": 22310 }, { "entropy": 5.687628555297851, "epoch": 1.8747742070993487, "grad_norm": 1.4296875, "learning_rate": 0.00046474751981050334, "loss": 5.4899, "mean_token_accuracy": 0.16893676668405533, "num_tokens": 41162361.0, "step": 22315 }, { "entropy": 5.744011449813843, "epoch": 1.8751942869145137, "grad_norm": 1.1640625, "learning_rate": 0.00046473142461139034, "loss": 5.4738, "mean_token_accuracy": 0.16351682245731353, "num_tokens": 41171979.0, "step": 22320 }, { "entropy": 5.596357345581055, "epoch": 1.8756143667296787, "grad_norm": 1.296875, "learning_rate": 0.0004647153260512828, "loss": 5.3187, "mean_token_accuracy": 0.17824737578630448, "num_tokens": 41182145.0, "step": 22325 }, { "entropy": 5.619386386871338, "epoch": 1.8760344465448435, "grad_norm": 1.3359375, "learning_rate": 0.0004646992241304659, "loss": 5.3443, "mean_token_accuracy": 0.17120126634836197, "num_tokens": 41191522.0, "step": 22330 }, { "entropy": 5.681074142456055, "epoch": 1.8764545263600083, "grad_norm": 1.3359375, "learning_rate": 0.000464683118849225, "loss": 5.4375, "mean_token_accuracy": 0.16661341339349747, "num_tokens": 41201052.0, "step": 22335 }, { "entropy": 5.59029483795166, "epoch": 1.8768746061751733, "grad_norm": 1.3359375, "learning_rate": 0.0004646670102078453, "loss": 5.3007, "mean_token_accuracy": 0.17739052772521974, "num_tokens": 41210211.0, "step": 22340 }, { "entropy": 5.63611478805542, "epoch": 1.8772946859903383, "grad_norm": 1.265625, "learning_rate": 0.0004646508982066122, "loss": 5.4579, "mean_token_accuracy": 0.16615783870220185, "num_tokens": 41219778.0, "step": 22345 }, { "entropy": 5.676466846466065, "epoch": 1.8777147658055031, "grad_norm": 1.390625, "learning_rate": 0.00046463478284581114, "loss": 5.4143, "mean_token_accuracy": 0.17920258045196533, "num_tokens": 41229550.0, "step": 22350 }, { "entropy": 5.6495026588439945, "epoch": 1.878134845620668, "grad_norm": 1.3828125, "learning_rate": 0.0004646186641257275, "loss": 5.2768, "mean_token_accuracy": 0.18095650821924208, "num_tokens": 41238130.0, "step": 22355 }, { "entropy": 5.608019781112671, "epoch": 1.8785549254358327, "grad_norm": 1.3046875, "learning_rate": 0.0004646025420466468, "loss": 5.2893, "mean_token_accuracy": 0.17400604337453843, "num_tokens": 41247324.0, "step": 22360 }, { "entropy": 5.572899055480957, "epoch": 1.8789750052509977, "grad_norm": 1.3125, "learning_rate": 0.00046458641660885474, "loss": 5.3558, "mean_token_accuracy": 0.17406723499298096, "num_tokens": 41256131.0, "step": 22365 }, { "entropy": 5.612897682189941, "epoch": 1.8793950850661627, "grad_norm": 1.234375, "learning_rate": 0.00046457028781263693, "loss": 5.3927, "mean_token_accuracy": 0.16953571438789367, "num_tokens": 41265225.0, "step": 22370 }, { "entropy": 5.715818929672241, "epoch": 1.8798151648813275, "grad_norm": 1.2890625, "learning_rate": 0.00046455415565827907, "loss": 5.3887, "mean_token_accuracy": 0.1686493307352066, "num_tokens": 41274023.0, "step": 22375 }, { "entropy": 5.70667200088501, "epoch": 1.8802352446964923, "grad_norm": 1.375, "learning_rate": 0.000464538020146067, "loss": 5.4151, "mean_token_accuracy": 0.16874268501996995, "num_tokens": 41283030.0, "step": 22380 }, { "entropy": 5.729498672485351, "epoch": 1.880655324511657, "grad_norm": 1.265625, "learning_rate": 0.0004645218812762864, "loss": 5.4709, "mean_token_accuracy": 0.1674060821533203, "num_tokens": 41292654.0, "step": 22385 }, { "entropy": 5.639959239959717, "epoch": 1.881075404326822, "grad_norm": 1.4609375, "learning_rate": 0.0004645057390492234, "loss": 5.2063, "mean_token_accuracy": 0.18706902414560317, "num_tokens": 41301838.0, "step": 22390 }, { "entropy": 5.597166204452515, "epoch": 1.881495484141987, "grad_norm": 1.265625, "learning_rate": 0.0004644895934651638, "loss": 5.3138, "mean_token_accuracy": 0.17899076640605927, "num_tokens": 41311104.0, "step": 22395 }, { "entropy": 5.704216480255127, "epoch": 1.8819155639571519, "grad_norm": 1.3203125, "learning_rate": 0.00046447344452439356, "loss": 5.4198, "mean_token_accuracy": 0.1649742141366005, "num_tokens": 41320213.0, "step": 22400 }, { "entropy": 5.6293620586395265, "epoch": 1.8823356437723167, "grad_norm": 1.2109375, "learning_rate": 0.0004644572922271988, "loss": 5.2972, "mean_token_accuracy": 0.17442207932472228, "num_tokens": 41330027.0, "step": 22405 }, { "entropy": 5.657583141326905, "epoch": 1.8827557235874817, "grad_norm": 1.28125, "learning_rate": 0.00046444113657386567, "loss": 5.4056, "mean_token_accuracy": 0.16551758348941803, "num_tokens": 41339481.0, "step": 22410 }, { "entropy": 5.704918766021729, "epoch": 1.8831758034026465, "grad_norm": 1.390625, "learning_rate": 0.00046442497756468037, "loss": 5.4275, "mean_token_accuracy": 0.16912316530942917, "num_tokens": 41348679.0, "step": 22415 }, { "entropy": 5.632734298706055, "epoch": 1.8835958832178115, "grad_norm": 1.3828125, "learning_rate": 0.00046440881519992924, "loss": 5.2812, "mean_token_accuracy": 0.1797910824418068, "num_tokens": 41358736.0, "step": 22420 }, { "entropy": 5.636936283111572, "epoch": 1.8840159630329762, "grad_norm": 1.2890625, "learning_rate": 0.0004643926494798983, "loss": 5.43, "mean_token_accuracy": 0.16520747989416124, "num_tokens": 41368284.0, "step": 22425 }, { "entropy": 5.653887033462524, "epoch": 1.884436042848141, "grad_norm": 1.2421875, "learning_rate": 0.00046437648040487426, "loss": 5.3153, "mean_token_accuracy": 0.1689191997051239, "num_tokens": 41377789.0, "step": 22430 }, { "entropy": 5.650376510620117, "epoch": 1.884856122663306, "grad_norm": 1.421875, "learning_rate": 0.00046436030797514325, "loss": 5.3333, "mean_token_accuracy": 0.17308636307716369, "num_tokens": 41386909.0, "step": 22435 }, { "entropy": 5.675967454910278, "epoch": 1.885276202478471, "grad_norm": 1.25, "learning_rate": 0.0004643441321909919, "loss": 5.3553, "mean_token_accuracy": 0.17917974442243575, "num_tokens": 41396693.0, "step": 22440 }, { "entropy": 5.677432060241699, "epoch": 1.8856962822936358, "grad_norm": 1.3203125, "learning_rate": 0.00046432795305270674, "loss": 5.4418, "mean_token_accuracy": 0.16401530504226686, "num_tokens": 41407193.0, "step": 22445 }, { "entropy": 5.679871845245361, "epoch": 1.8861163621088006, "grad_norm": 1.40625, "learning_rate": 0.00046431177056057446, "loss": 5.394, "mean_token_accuracy": 0.17553680688142775, "num_tokens": 41416567.0, "step": 22450 }, { "entropy": 5.5496378421783445, "epoch": 1.8865364419239654, "grad_norm": 1.2109375, "learning_rate": 0.00046429558471488164, "loss": 5.2553, "mean_token_accuracy": 0.1786068633198738, "num_tokens": 41425328.0, "step": 22455 }, { "entropy": 5.659866619110107, "epoch": 1.8869565217391304, "grad_norm": 1.28125, "learning_rate": 0.000464279395515915, "loss": 5.4151, "mean_token_accuracy": 0.1715554863214493, "num_tokens": 41435229.0, "step": 22460 }, { "entropy": 5.62566819190979, "epoch": 1.8873766015542954, "grad_norm": 1.515625, "learning_rate": 0.00046426320296396136, "loss": 5.3374, "mean_token_accuracy": 0.17149607092142105, "num_tokens": 41445471.0, "step": 22465 }, { "entropy": 5.568106746673584, "epoch": 1.8877966813694602, "grad_norm": 1.34375, "learning_rate": 0.00046424700705930745, "loss": 5.247, "mean_token_accuracy": 0.18762259185314178, "num_tokens": 41454654.0, "step": 22470 }, { "entropy": 5.585866975784302, "epoch": 1.888216761184625, "grad_norm": 1.234375, "learning_rate": 0.0004642308078022403, "loss": 5.274, "mean_token_accuracy": 0.17488398551940917, "num_tokens": 41463341.0, "step": 22475 }, { "entropy": 5.630618524551392, "epoch": 1.88863684099979, "grad_norm": 1.3125, "learning_rate": 0.00046421460519304684, "loss": 5.3228, "mean_token_accuracy": 0.17458095848560334, "num_tokens": 41472677.0, "step": 22480 }, { "entropy": 5.686393451690674, "epoch": 1.8890569208149548, "grad_norm": 1.25, "learning_rate": 0.000464198399232014, "loss": 5.4609, "mean_token_accuracy": 0.16629096865653992, "num_tokens": 41482867.0, "step": 22485 }, { "entropy": 5.745574474334717, "epoch": 1.8894770006301198, "grad_norm": 1.25, "learning_rate": 0.0004641821899194291, "loss": 5.4098, "mean_token_accuracy": 0.16831042617559433, "num_tokens": 41493432.0, "step": 22490 }, { "entropy": 5.7386678695678714, "epoch": 1.8898970804452846, "grad_norm": 1.28125, "learning_rate": 0.00046416597725557903, "loss": 5.4352, "mean_token_accuracy": 0.16686583310365677, "num_tokens": 41503807.0, "step": 22495 }, { "entropy": 5.621123218536377, "epoch": 1.8903171602604494, "grad_norm": 1.421875, "learning_rate": 0.000464149761240751, "loss": 5.3121, "mean_token_accuracy": 0.18151101171970369, "num_tokens": 41512524.0, "step": 22500 }, { "entropy": 5.663379192352295, "epoch": 1.8907372400756144, "grad_norm": 1.6328125, "learning_rate": 0.00046413354187523244, "loss": 5.4507, "mean_token_accuracy": 0.17097427397966386, "num_tokens": 41521915.0, "step": 22505 }, { "entropy": 5.619107627868653, "epoch": 1.8911573198907794, "grad_norm": 1.2265625, "learning_rate": 0.0004641173191593105, "loss": 5.3515, "mean_token_accuracy": 0.17258709371089936, "num_tokens": 41530293.0, "step": 22510 }, { "entropy": 5.65953893661499, "epoch": 1.8915773997059442, "grad_norm": 1.265625, "learning_rate": 0.00046410109309327275, "loss": 5.4098, "mean_token_accuracy": 0.17016493827104567, "num_tokens": 41538660.0, "step": 22515 }, { "entropy": 5.635321474075317, "epoch": 1.891997479521109, "grad_norm": 1.3046875, "learning_rate": 0.00046408486367740647, "loss": 5.3747, "mean_token_accuracy": 0.17979306429624559, "num_tokens": 41547952.0, "step": 22520 }, { "entropy": 5.630529260635376, "epoch": 1.8924175593362738, "grad_norm": 1.15625, "learning_rate": 0.0004640686309119992, "loss": 5.3026, "mean_token_accuracy": 0.18238568902015687, "num_tokens": 41557093.0, "step": 22525 }, { "entropy": 5.610331726074219, "epoch": 1.8928376391514388, "grad_norm": 1.3203125, "learning_rate": 0.00046405239479733844, "loss": 5.3316, "mean_token_accuracy": 0.1757591873407364, "num_tokens": 41565836.0, "step": 22530 }, { "entropy": 5.570929670333863, "epoch": 1.8932577189666038, "grad_norm": 1.21875, "learning_rate": 0.0004640361553337119, "loss": 5.3758, "mean_token_accuracy": 0.18229353278875352, "num_tokens": 41575365.0, "step": 22535 }, { "entropy": 5.628244113922119, "epoch": 1.8936777987817686, "grad_norm": 1.359375, "learning_rate": 0.00046401991252140715, "loss": 5.3339, "mean_token_accuracy": 0.17711923271417618, "num_tokens": 41583690.0, "step": 22540 }, { "entropy": 5.720776605606079, "epoch": 1.8940978785969333, "grad_norm": 2.09375, "learning_rate": 0.000464003666360712, "loss": 5.3243, "mean_token_accuracy": 0.17557096034288405, "num_tokens": 41593536.0, "step": 22545 }, { "entropy": 5.611479806900024, "epoch": 1.8945179584120981, "grad_norm": 1.453125, "learning_rate": 0.0004639874168519143, "loss": 5.3045, "mean_token_accuracy": 0.17414466589689254, "num_tokens": 41602543.0, "step": 22550 }, { "entropy": 5.584366273880005, "epoch": 1.8949380382272631, "grad_norm": 1.28125, "learning_rate": 0.0004639711639953017, "loss": 5.3845, "mean_token_accuracy": 0.17162299007177353, "num_tokens": 41611634.0, "step": 22555 }, { "entropy": 5.548893404006958, "epoch": 1.8953581180424282, "grad_norm": 1.375, "learning_rate": 0.0004639549077911623, "loss": 5.354, "mean_token_accuracy": 0.16889655143022536, "num_tokens": 41621400.0, "step": 22560 }, { "entropy": 5.694586753845215, "epoch": 1.895778197857593, "grad_norm": 1.3359375, "learning_rate": 0.00046393864823978406, "loss": 5.3317, "mean_token_accuracy": 0.17640386521816254, "num_tokens": 41631070.0, "step": 22565 }, { "entropy": 5.727872610092163, "epoch": 1.8961982776727577, "grad_norm": 1.828125, "learning_rate": 0.0004639223853414549, "loss": 5.4155, "mean_token_accuracy": 0.17031230032444, "num_tokens": 41641442.0, "step": 22570 }, { "entropy": 5.665301179885864, "epoch": 1.8966183574879227, "grad_norm": 1.5703125, "learning_rate": 0.000463906119096463, "loss": 5.4178, "mean_token_accuracy": 0.1735200360417366, "num_tokens": 41651616.0, "step": 22575 }, { "entropy": 5.676836919784546, "epoch": 1.8970384373030877, "grad_norm": 1.34375, "learning_rate": 0.0004638898495050963, "loss": 5.3417, "mean_token_accuracy": 0.17591539174318313, "num_tokens": 41660704.0, "step": 22580 }, { "entropy": 5.612502813339233, "epoch": 1.8974585171182525, "grad_norm": 1.5625, "learning_rate": 0.0004638735765676434, "loss": 5.393, "mean_token_accuracy": 0.16929904073476792, "num_tokens": 41669824.0, "step": 22585 }, { "entropy": 5.6595179557800295, "epoch": 1.8978785969334173, "grad_norm": 1.2734375, "learning_rate": 0.0004638573002843922, "loss": 5.3122, "mean_token_accuracy": 0.1842661365866661, "num_tokens": 41680082.0, "step": 22590 }, { "entropy": 5.603770017623901, "epoch": 1.898298676748582, "grad_norm": 1.6015625, "learning_rate": 0.0004638410206556312, "loss": 5.2665, "mean_token_accuracy": 0.17830771952867508, "num_tokens": 41689282.0, "step": 22595 }, { "entropy": 5.625360727310181, "epoch": 1.8987187565637471, "grad_norm": 1.28125, "learning_rate": 0.0004638247376816489, "loss": 5.404, "mean_token_accuracy": 0.1719541594386101, "num_tokens": 41699059.0, "step": 22600 }, { "entropy": 5.763462495803833, "epoch": 1.8991388363789121, "grad_norm": 1.53125, "learning_rate": 0.0004638084513627335, "loss": 5.4994, "mean_token_accuracy": 0.1679268956184387, "num_tokens": 41708674.0, "step": 22605 }, { "entropy": 5.718596315383911, "epoch": 1.899558916194077, "grad_norm": 1.421875, "learning_rate": 0.00046379216169917356, "loss": 5.4022, "mean_token_accuracy": 0.16962596029043198, "num_tokens": 41718418.0, "step": 22610 }, { "entropy": 5.609939289093018, "epoch": 1.8999789960092417, "grad_norm": 1.2265625, "learning_rate": 0.0004637758686912577, "loss": 5.4069, "mean_token_accuracy": 0.16938821971416473, "num_tokens": 41728229.0, "step": 22615 }, { "entropy": 5.617797803878784, "epoch": 1.9003990758244065, "grad_norm": 1.265625, "learning_rate": 0.00046375957233927456, "loss": 5.365, "mean_token_accuracy": 0.17396558225154876, "num_tokens": 41737074.0, "step": 22620 }, { "entropy": 5.647723913192749, "epoch": 1.9008191556395715, "grad_norm": 1.3515625, "learning_rate": 0.00046374327264351277, "loss": 5.2549, "mean_token_accuracy": 0.17883535474538803, "num_tokens": 41745823.0, "step": 22625 }, { "entropy": 5.568923711776733, "epoch": 1.9012392354547365, "grad_norm": 1.1953125, "learning_rate": 0.00046372696960426116, "loss": 5.3503, "mean_token_accuracy": 0.18322131186723709, "num_tokens": 41754591.0, "step": 22630 }, { "entropy": 5.663699960708618, "epoch": 1.9016593152699013, "grad_norm": 1.1796875, "learning_rate": 0.00046371066322180846, "loss": 5.3477, "mean_token_accuracy": 0.1712099567055702, "num_tokens": 41763585.0, "step": 22635 }, { "entropy": 5.690198802947998, "epoch": 1.902079395085066, "grad_norm": 1.2109375, "learning_rate": 0.00046369435349644344, "loss": 5.3829, "mean_token_accuracy": 0.17371902912855147, "num_tokens": 41772712.0, "step": 22640 }, { "entropy": 5.65671181678772, "epoch": 1.902499474900231, "grad_norm": 1.484375, "learning_rate": 0.00046367804042845515, "loss": 5.2653, "mean_token_accuracy": 0.18572683036327362, "num_tokens": 41781516.0, "step": 22645 }, { "entropy": 5.615236091613769, "epoch": 1.902919554715396, "grad_norm": 1.203125, "learning_rate": 0.00046366172401813253, "loss": 5.3415, "mean_token_accuracy": 0.17305969446897507, "num_tokens": 41790731.0, "step": 22650 }, { "entropy": 5.680331754684448, "epoch": 1.9033396345305609, "grad_norm": 1.1875, "learning_rate": 0.0004636454042657647, "loss": 5.368, "mean_token_accuracy": 0.17193576842546462, "num_tokens": 41799654.0, "step": 22655 }, { "entropy": 5.559491348266602, "epoch": 1.9037597143457257, "grad_norm": 1.265625, "learning_rate": 0.00046362908117164055, "loss": 5.2569, "mean_token_accuracy": 0.1793026253581047, "num_tokens": 41809408.0, "step": 22660 }, { "entropy": 5.644749689102173, "epoch": 1.9041797941608904, "grad_norm": 1.1171875, "learning_rate": 0.0004636127547360494, "loss": 5.4225, "mean_token_accuracy": 0.16808681786060334, "num_tokens": 41818868.0, "step": 22665 }, { "entropy": 5.665660381317139, "epoch": 1.9045998739760555, "grad_norm": 1.203125, "learning_rate": 0.0004635964249592804, "loss": 5.3304, "mean_token_accuracy": 0.1782209351658821, "num_tokens": 41827156.0, "step": 22670 }, { "entropy": 5.692537307739258, "epoch": 1.9050199537912205, "grad_norm": 1.421875, "learning_rate": 0.0004635800918416229, "loss": 5.4402, "mean_token_accuracy": 0.16368364691734313, "num_tokens": 41837025.0, "step": 22675 }, { "entropy": 5.745527839660644, "epoch": 1.9054400336063853, "grad_norm": 1.265625, "learning_rate": 0.00046356375538336616, "loss": 5.3507, "mean_token_accuracy": 0.17565076798200607, "num_tokens": 41846196.0, "step": 22680 }, { "entropy": 5.5358936309814455, "epoch": 1.90586011342155, "grad_norm": 1.40625, "learning_rate": 0.00046354741558479956, "loss": 5.3266, "mean_token_accuracy": 0.16661422401666642, "num_tokens": 41855030.0, "step": 22685 }, { "entropy": 5.6051513671875, "epoch": 1.9062801932367148, "grad_norm": 1.375, "learning_rate": 0.0004635310724462126, "loss": 5.2093, "mean_token_accuracy": 0.17713478952646255, "num_tokens": 41863740.0, "step": 22690 }, { "entropy": 5.604347562789917, "epoch": 1.9067002730518798, "grad_norm": 1.390625, "learning_rate": 0.0004635147259678948, "loss": 5.3446, "mean_token_accuracy": 0.17247212529182435, "num_tokens": 41873376.0, "step": 22695 }, { "entropy": 5.698956775665283, "epoch": 1.9071203528670448, "grad_norm": 1.3671875, "learning_rate": 0.00046349837615013563, "loss": 5.4611, "mean_token_accuracy": 0.16106533110141755, "num_tokens": 41882491.0, "step": 22700 }, { "entropy": 5.6646346092224125, "epoch": 1.9075404326822096, "grad_norm": 1.2578125, "learning_rate": 0.0004634820229932248, "loss": 5.3108, "mean_token_accuracy": 0.17672717124223708, "num_tokens": 41891357.0, "step": 22705 }, { "entropy": 5.618336296081543, "epoch": 1.9079605124973744, "grad_norm": 1.46875, "learning_rate": 0.00046346566649745205, "loss": 5.3758, "mean_token_accuracy": 0.17323821932077407, "num_tokens": 41899874.0, "step": 22710 }, { "entropy": 5.639418315887451, "epoch": 1.9083805923125394, "grad_norm": 1.515625, "learning_rate": 0.000463449306663107, "loss": 5.3765, "mean_token_accuracy": 0.17575515508651735, "num_tokens": 41909673.0, "step": 22715 }, { "entropy": 5.713147306442261, "epoch": 1.9088006721277042, "grad_norm": 1.578125, "learning_rate": 0.0004634329434904796, "loss": 5.4385, "mean_token_accuracy": 0.16925620883703232, "num_tokens": 41919126.0, "step": 22720 }, { "entropy": 5.597732830047607, "epoch": 1.9092207519428692, "grad_norm": 1.3046875, "learning_rate": 0.0004634165769798596, "loss": 5.2688, "mean_token_accuracy": 0.17853163182735443, "num_tokens": 41927751.0, "step": 22725 }, { "entropy": 5.636762285232544, "epoch": 1.909640831758034, "grad_norm": 1.2421875, "learning_rate": 0.0004634002071315369, "loss": 5.3374, "mean_token_accuracy": 0.1791643977165222, "num_tokens": 41937290.0, "step": 22730 }, { "entropy": 5.636068058013916, "epoch": 1.9100609115731988, "grad_norm": 1.3515625, "learning_rate": 0.00046338383394580157, "loss": 5.2968, "mean_token_accuracy": 0.18056693077087402, "num_tokens": 41947186.0, "step": 22735 }, { "entropy": 5.5747472763061525, "epoch": 1.9104809913883638, "grad_norm": 1.28125, "learning_rate": 0.00046336745742294366, "loss": 5.306, "mean_token_accuracy": 0.16971218585968018, "num_tokens": 41956197.0, "step": 22740 }, { "entropy": 5.6662568092346195, "epoch": 1.9109010712035288, "grad_norm": 1.2890625, "learning_rate": 0.00046335107756325316, "loss": 5.2903, "mean_token_accuracy": 0.17477723807096482, "num_tokens": 41965881.0, "step": 22745 }, { "entropy": 5.65323395729065, "epoch": 1.9113211510186936, "grad_norm": 1.3828125, "learning_rate": 0.0004633346943670204, "loss": 5.3643, "mean_token_accuracy": 0.16437687277793883, "num_tokens": 41975031.0, "step": 22750 }, { "entropy": 5.643260192871094, "epoch": 1.9117412308338584, "grad_norm": 1.453125, "learning_rate": 0.0004633183078345355, "loss": 5.3197, "mean_token_accuracy": 0.17544271051883698, "num_tokens": 41984187.0, "step": 22755 }, { "entropy": 5.675690174102783, "epoch": 1.9121613106490232, "grad_norm": 1.6328125, "learning_rate": 0.00046330191796608867, "loss": 5.4277, "mean_token_accuracy": 0.17010141164064407, "num_tokens": 41993185.0, "step": 22760 }, { "entropy": 5.658887100219727, "epoch": 1.9125813904641882, "grad_norm": 1.390625, "learning_rate": 0.0004632855247619704, "loss": 5.3799, "mean_token_accuracy": 0.17760641872882843, "num_tokens": 42002521.0, "step": 22765 }, { "entropy": 5.694884634017944, "epoch": 1.9130014702793532, "grad_norm": 2.234375, "learning_rate": 0.000463269128222471, "loss": 5.4865, "mean_token_accuracy": 0.1686519965529442, "num_tokens": 42011444.0, "step": 22770 }, { "entropy": 5.666096448898315, "epoch": 1.913421550094518, "grad_norm": 1.359375, "learning_rate": 0.0004632527283478809, "loss": 5.3742, "mean_token_accuracy": 0.16956177055835725, "num_tokens": 42020916.0, "step": 22775 }, { "entropy": 5.701362133026123, "epoch": 1.9138416299096828, "grad_norm": 1.28125, "learning_rate": 0.00046323632513849063, "loss": 5.3941, "mean_token_accuracy": 0.17448805570602416, "num_tokens": 42029467.0, "step": 22780 }, { "entropy": 5.498393583297729, "epoch": 1.9142617097248478, "grad_norm": 1.2578125, "learning_rate": 0.0004632199185945908, "loss": 5.1487, "mean_token_accuracy": 0.18760445863008499, "num_tokens": 42037435.0, "step": 22785 }, { "entropy": 5.676510238647461, "epoch": 1.9146817895400126, "grad_norm": 1.3515625, "learning_rate": 0.0004632035087164721, "loss": 5.4044, "mean_token_accuracy": 0.1724133387207985, "num_tokens": 42046943.0, "step": 22790 }, { "entropy": 5.623393821716308, "epoch": 1.9151018693551776, "grad_norm": 1.1640625, "learning_rate": 0.0004631870955044251, "loss": 5.2769, "mean_token_accuracy": 0.17786265760660172, "num_tokens": 42055804.0, "step": 22795 }, { "entropy": 5.609493112564087, "epoch": 1.9155219491703424, "grad_norm": 1.7109375, "learning_rate": 0.00046317067895874063, "loss": 5.2838, "mean_token_accuracy": 0.18263700753450393, "num_tokens": 42064655.0, "step": 22800 }, { "entropy": 5.665407276153564, "epoch": 1.9159420289855071, "grad_norm": 1.140625, "learning_rate": 0.00046315425907970947, "loss": 5.322, "mean_token_accuracy": 0.17597611397504806, "num_tokens": 42073663.0, "step": 22805 }, { "entropy": 5.667187929153442, "epoch": 1.9163621088006721, "grad_norm": 1.375, "learning_rate": 0.0004631378358676225, "loss": 5.4126, "mean_token_accuracy": 0.1755566418170929, "num_tokens": 42083931.0, "step": 22810 }, { "entropy": 5.715243768692017, "epoch": 1.9167821886158372, "grad_norm": 1.25, "learning_rate": 0.0004631214093227706, "loss": 5.381, "mean_token_accuracy": 0.16978215724229812, "num_tokens": 42093782.0, "step": 22815 }, { "entropy": 5.629796504974365, "epoch": 1.917202268431002, "grad_norm": 1.3203125, "learning_rate": 0.0004631049794454448, "loss": 5.3331, "mean_token_accuracy": 0.17728287428617479, "num_tokens": 42103392.0, "step": 22820 }, { "entropy": 5.656073045730591, "epoch": 1.9176223482461667, "grad_norm": 1.296875, "learning_rate": 0.0004630885462359362, "loss": 5.2929, "mean_token_accuracy": 0.1793659135699272, "num_tokens": 42112051.0, "step": 22825 }, { "entropy": 5.5138551712036135, "epoch": 1.9180424280613315, "grad_norm": 1.2421875, "learning_rate": 0.0004630721096945358, "loss": 5.321, "mean_token_accuracy": 0.1804273918271065, "num_tokens": 42120156.0, "step": 22830 }, { "entropy": 5.682647180557251, "epoch": 1.9184625078764965, "grad_norm": 1.296875, "learning_rate": 0.0004630556698215349, "loss": 5.4097, "mean_token_accuracy": 0.17985475659370423, "num_tokens": 42129564.0, "step": 22835 }, { "entropy": 5.691924667358398, "epoch": 1.9188825876916615, "grad_norm": 1.2265625, "learning_rate": 0.00046303922661722466, "loss": 5.4662, "mean_token_accuracy": 0.16802815347909927, "num_tokens": 42138144.0, "step": 22840 }, { "entropy": 5.572285509109497, "epoch": 1.9193026675068263, "grad_norm": 1.9609375, "learning_rate": 0.00046302278008189627, "loss": 5.2914, "mean_token_accuracy": 0.1704635813832283, "num_tokens": 42147701.0, "step": 22845 }, { "entropy": 5.540525960922241, "epoch": 1.919722747321991, "grad_norm": 1.203125, "learning_rate": 0.0004630063302158412, "loss": 5.2657, "mean_token_accuracy": 0.1806807652115822, "num_tokens": 42156772.0, "step": 22850 }, { "entropy": 5.553380632400513, "epoch": 1.920142827137156, "grad_norm": 1.2890625, "learning_rate": 0.00046298987701935066, "loss": 5.2087, "mean_token_accuracy": 0.18418700397014617, "num_tokens": 42165227.0, "step": 22855 }, { "entropy": 5.556873607635498, "epoch": 1.920562906952321, "grad_norm": 1.1328125, "learning_rate": 0.0004629734204927164, "loss": 5.2462, "mean_token_accuracy": 0.1809609055519104, "num_tokens": 42174800.0, "step": 22860 }, { "entropy": 5.615631151199341, "epoch": 1.920982986767486, "grad_norm": 1.3125, "learning_rate": 0.0004629569606362298, "loss": 5.3416, "mean_token_accuracy": 0.17381157577037812, "num_tokens": 42184301.0, "step": 22865 }, { "entropy": 5.657670736312866, "epoch": 1.9214030665826507, "grad_norm": 1.203125, "learning_rate": 0.0004629404974501823, "loss": 5.3347, "mean_token_accuracy": 0.17408420890569687, "num_tokens": 42193266.0, "step": 22870 }, { "entropy": 5.550420045852661, "epoch": 1.9218231463978155, "grad_norm": 1.296875, "learning_rate": 0.0004629240309348658, "loss": 5.2736, "mean_token_accuracy": 0.1723278731107712, "num_tokens": 42202051.0, "step": 22875 }, { "entropy": 5.5658341407775875, "epoch": 1.9222432262129805, "grad_norm": 1.28125, "learning_rate": 0.0004629075610905717, "loss": 5.1858, "mean_token_accuracy": 0.18649692982435226, "num_tokens": 42210716.0, "step": 22880 }, { "entropy": 5.521829605102539, "epoch": 1.9226633060281455, "grad_norm": 1.1796875, "learning_rate": 0.000462891087917592, "loss": 5.255, "mean_token_accuracy": 0.1794131278991699, "num_tokens": 42219930.0, "step": 22885 }, { "entropy": 5.61706805229187, "epoch": 1.9230833858433103, "grad_norm": 1.375, "learning_rate": 0.00046287461141621844, "loss": 5.301, "mean_token_accuracy": 0.185006545484066, "num_tokens": 42228864.0, "step": 22890 }, { "entropy": 5.622384357452392, "epoch": 1.923503465658475, "grad_norm": 1.328125, "learning_rate": 0.0004628581315867429, "loss": 5.3738, "mean_token_accuracy": 0.17600143253803252, "num_tokens": 42238030.0, "step": 22895 }, { "entropy": 5.667676210403442, "epoch": 1.9239235454736399, "grad_norm": 1.3203125, "learning_rate": 0.00046284164842945723, "loss": 5.3524, "mean_token_accuracy": 0.17182712703943254, "num_tokens": 42247818.0, "step": 22900 }, { "entropy": 5.661724472045899, "epoch": 1.9243436252888049, "grad_norm": 1.296875, "learning_rate": 0.0004628251619446536, "loss": 5.3038, "mean_token_accuracy": 0.17410755008459092, "num_tokens": 42256772.0, "step": 22905 }, { "entropy": 5.578900766372681, "epoch": 1.9247637051039699, "grad_norm": 1.2265625, "learning_rate": 0.00046280867213262385, "loss": 5.3696, "mean_token_accuracy": 0.16716319620609282, "num_tokens": 42265620.0, "step": 22910 }, { "entropy": 5.678067827224732, "epoch": 1.9251837849191347, "grad_norm": 1.234375, "learning_rate": 0.0004627921789936602, "loss": 5.426, "mean_token_accuracy": 0.16603572368621827, "num_tokens": 42274998.0, "step": 22915 }, { "entropy": 5.700376176834107, "epoch": 1.9256038647342995, "grad_norm": 1.28125, "learning_rate": 0.00046277568252805476, "loss": 5.3521, "mean_token_accuracy": 0.17515442967414857, "num_tokens": 42284849.0, "step": 22920 }, { "entropy": 5.584618091583252, "epoch": 1.9260239445494642, "grad_norm": 1.28125, "learning_rate": 0.0004627591827360998, "loss": 5.3409, "mean_token_accuracy": 0.17606656402349471, "num_tokens": 42294133.0, "step": 22925 }, { "entropy": 5.622400760650635, "epoch": 1.9264440243646292, "grad_norm": 1.265625, "learning_rate": 0.0004627426796180876, "loss": 5.3253, "mean_token_accuracy": 0.18122074604034424, "num_tokens": 42302765.0, "step": 22930 }, { "entropy": 5.661354064941406, "epoch": 1.9268641041797943, "grad_norm": 1.265625, "learning_rate": 0.00046272617317431056, "loss": 5.278, "mean_token_accuracy": 0.17460388243198394, "num_tokens": 42311829.0, "step": 22935 }, { "entropy": 5.653006887435913, "epoch": 1.927284183994959, "grad_norm": 1.25, "learning_rate": 0.00046270966340506087, "loss": 5.4127, "mean_token_accuracy": 0.1800052508711815, "num_tokens": 42321294.0, "step": 22940 }, { "entropy": 5.646188735961914, "epoch": 1.9277042638101238, "grad_norm": 1.296875, "learning_rate": 0.00046269315031063137, "loss": 5.2272, "mean_token_accuracy": 0.179823400080204, "num_tokens": 42329379.0, "step": 22945 }, { "entropy": 5.635144662857056, "epoch": 1.9281243436252888, "grad_norm": 1.2890625, "learning_rate": 0.00046267663389131425, "loss": 5.4211, "mean_token_accuracy": 0.16577421128749847, "num_tokens": 42339867.0, "step": 22950 }, { "entropy": 5.672863578796386, "epoch": 1.9285444234404538, "grad_norm": 1.7421875, "learning_rate": 0.00046266011414740213, "loss": 5.4266, "mean_token_accuracy": 0.17056871354579925, "num_tokens": 42350174.0, "step": 22955 }, { "entropy": 5.623044729232788, "epoch": 1.9289645032556186, "grad_norm": 1.296875, "learning_rate": 0.0004626435910791878, "loss": 5.284, "mean_token_accuracy": 0.1775414004921913, "num_tokens": 42359214.0, "step": 22960 }, { "entropy": 5.5688153266906735, "epoch": 1.9293845830707834, "grad_norm": 1.2109375, "learning_rate": 0.00046262706468696386, "loss": 5.3633, "mean_token_accuracy": 0.17115409225225447, "num_tokens": 42367965.0, "step": 22965 }, { "entropy": 5.6344867706298825, "epoch": 1.9298046628859482, "grad_norm": 1.296875, "learning_rate": 0.0004626105349710231, "loss": 5.3841, "mean_token_accuracy": 0.16720222681760788, "num_tokens": 42377233.0, "step": 22970 }, { "entropy": 5.789755868911743, "epoch": 1.9302247427011132, "grad_norm": 1.1953125, "learning_rate": 0.0004625940019316584, "loss": 5.3816, "mean_token_accuracy": 0.17151414901018142, "num_tokens": 42386060.0, "step": 22975 }, { "entropy": 5.65874752998352, "epoch": 1.9306448225162782, "grad_norm": 1.34375, "learning_rate": 0.00046257746556916236, "loss": 5.3775, "mean_token_accuracy": 0.18202279657125472, "num_tokens": 42395659.0, "step": 22980 }, { "entropy": 5.696528530120849, "epoch": 1.931064902331443, "grad_norm": 1.5625, "learning_rate": 0.00046256092588382825, "loss": 5.3834, "mean_token_accuracy": 0.1711360841989517, "num_tokens": 42403531.0, "step": 22985 }, { "entropy": 5.649896192550659, "epoch": 1.9314849821466078, "grad_norm": 1.3828125, "learning_rate": 0.00046254438287594884, "loss": 5.3348, "mean_token_accuracy": 0.17835707813501359, "num_tokens": 42412364.0, "step": 22990 }, { "entropy": 5.599561738967895, "epoch": 1.9319050619617726, "grad_norm": 1.21875, "learning_rate": 0.00046252783654581733, "loss": 5.3225, "mean_token_accuracy": 0.17222917228937148, "num_tokens": 42422276.0, "step": 22995 }, { "entropy": 5.6603082656860355, "epoch": 1.9323251417769376, "grad_norm": 1.234375, "learning_rate": 0.0004625112868937267, "loss": 5.3528, "mean_token_accuracy": 0.1746899351477623, "num_tokens": 42430853.0, "step": 23000 }, { "entropy": 5.572019052505493, "epoch": 1.9327452215921026, "grad_norm": 1.125, "learning_rate": 0.0004624947339199702, "loss": 5.2428, "mean_token_accuracy": 0.17491891533136367, "num_tokens": 42439034.0, "step": 23005 }, { "entropy": 5.665308284759521, "epoch": 1.9331653014072674, "grad_norm": 1.171875, "learning_rate": 0.000462478177624841, "loss": 5.4216, "mean_token_accuracy": 0.1706569865345955, "num_tokens": 42448494.0, "step": 23010 }, { "entropy": 5.692689990997314, "epoch": 1.9335853812224322, "grad_norm": 1.234375, "learning_rate": 0.00046246161800863244, "loss": 5.3149, "mean_token_accuracy": 0.17972690612077713, "num_tokens": 42457188.0, "step": 23015 }, { "entropy": 5.6557807445526125, "epoch": 1.9340054610375972, "grad_norm": 1.40625, "learning_rate": 0.0004624450550716379, "loss": 5.407, "mean_token_accuracy": 0.16998066902160644, "num_tokens": 42466321.0, "step": 23020 }, { "entropy": 5.624531412124634, "epoch": 1.934425540852762, "grad_norm": 1.53125, "learning_rate": 0.0004624284888141507, "loss": 5.3419, "mean_token_accuracy": 0.17627126276493071, "num_tokens": 42475879.0, "step": 23025 }, { "entropy": 5.653626155853272, "epoch": 1.934845620667927, "grad_norm": 1.296875, "learning_rate": 0.0004624119192364643, "loss": 5.4957, "mean_token_accuracy": 0.1685581237077713, "num_tokens": 42484988.0, "step": 23030 }, { "entropy": 5.609811210632325, "epoch": 1.9352657004830918, "grad_norm": 1.3359375, "learning_rate": 0.00046239534633887223, "loss": 5.2922, "mean_token_accuracy": 0.1745161935687065, "num_tokens": 42493764.0, "step": 23035 }, { "entropy": 5.771266174316406, "epoch": 1.9356857802982566, "grad_norm": 1.2109375, "learning_rate": 0.0004623787701216682, "loss": 5.5004, "mean_token_accuracy": 0.1753440722823143, "num_tokens": 42503312.0, "step": 23040 }, { "entropy": 5.6357780456542965, "epoch": 1.9361058601134216, "grad_norm": 1.09375, "learning_rate": 0.00046236219058514566, "loss": 5.352, "mean_token_accuracy": 0.1730501800775528, "num_tokens": 42512303.0, "step": 23045 }, { "entropy": 5.542242479324341, "epoch": 1.9365259399285866, "grad_norm": 1.1796875, "learning_rate": 0.0004623456077295984, "loss": 5.2403, "mean_token_accuracy": 0.18613847196102143, "num_tokens": 42520928.0, "step": 23050 }, { "entropy": 5.61994481086731, "epoch": 1.9369460197437514, "grad_norm": 1.140625, "learning_rate": 0.0004623290215553201, "loss": 5.2828, "mean_token_accuracy": 0.18155443370342256, "num_tokens": 42529945.0, "step": 23055 }, { "entropy": 5.622451591491699, "epoch": 1.9373660995589161, "grad_norm": 1.359375, "learning_rate": 0.0004623124320626048, "loss": 5.3357, "mean_token_accuracy": 0.1775738701224327, "num_tokens": 42539078.0, "step": 23060 }, { "entropy": 5.602380084991455, "epoch": 1.937786179374081, "grad_norm": 1.2734375, "learning_rate": 0.0004622958392517461, "loss": 5.291, "mean_token_accuracy": 0.17909268736839296, "num_tokens": 42547842.0, "step": 23065 }, { "entropy": 5.6292308330535885, "epoch": 1.938206259189246, "grad_norm": 1.2265625, "learning_rate": 0.0004622792431230381, "loss": 5.2295, "mean_token_accuracy": 0.18587420433759688, "num_tokens": 42556574.0, "step": 23070 }, { "entropy": 5.657032442092896, "epoch": 1.938626339004411, "grad_norm": 1.296875, "learning_rate": 0.00046226264367677476, "loss": 5.3364, "mean_token_accuracy": 0.1658307746052742, "num_tokens": 42565906.0, "step": 23075 }, { "entropy": 5.610013246536255, "epoch": 1.9390464188195757, "grad_norm": 1.2265625, "learning_rate": 0.0004622460409132501, "loss": 5.3061, "mean_token_accuracy": 0.17991530746221543, "num_tokens": 42574929.0, "step": 23080 }, { "entropy": 5.6608904838562015, "epoch": 1.9394664986347405, "grad_norm": 1.484375, "learning_rate": 0.0004622294348327582, "loss": 5.3509, "mean_token_accuracy": 0.17006094008684158, "num_tokens": 42585185.0, "step": 23085 }, { "entropy": 5.600542974472046, "epoch": 1.9398865784499055, "grad_norm": 1.28125, "learning_rate": 0.00046221282543559334, "loss": 5.3681, "mean_token_accuracy": 0.17075276374816895, "num_tokens": 42594272.0, "step": 23090 }, { "entropy": 5.5656678676605225, "epoch": 1.9403066582650703, "grad_norm": 1.3203125, "learning_rate": 0.00046219621272204967, "loss": 5.2697, "mean_token_accuracy": 0.17831842303276063, "num_tokens": 42603410.0, "step": 23095 }, { "entropy": 5.666840028762818, "epoch": 1.9407267380802353, "grad_norm": 1.2734375, "learning_rate": 0.00046217959669242145, "loss": 5.479, "mean_token_accuracy": 0.16195986643433571, "num_tokens": 42613879.0, "step": 23100 }, { "entropy": 5.662532567977905, "epoch": 1.9411468178954001, "grad_norm": 1.3671875, "learning_rate": 0.000462162977347003, "loss": 5.2801, "mean_token_accuracy": 0.17427633106708526, "num_tokens": 42623323.0, "step": 23105 }, { "entropy": 5.668767642974854, "epoch": 1.941566897710565, "grad_norm": 1.234375, "learning_rate": 0.00046214635468608885, "loss": 5.3365, "mean_token_accuracy": 0.17507773339748384, "num_tokens": 42632365.0, "step": 23110 }, { "entropy": 5.6521703720092775, "epoch": 1.94198697752573, "grad_norm": 1.2734375, "learning_rate": 0.00046212972870997336, "loss": 5.3073, "mean_token_accuracy": 0.17932529896497726, "num_tokens": 42641872.0, "step": 23115 }, { "entropy": 5.679893827438354, "epoch": 1.942407057340895, "grad_norm": 1.234375, "learning_rate": 0.0004621130994189511, "loss": 5.3758, "mean_token_accuracy": 0.17578433007001876, "num_tokens": 42652031.0, "step": 23120 }, { "entropy": 5.534250640869141, "epoch": 1.9428271371560597, "grad_norm": 1.3515625, "learning_rate": 0.0004620964668133166, "loss": 5.3045, "mean_token_accuracy": 0.17120088189840316, "num_tokens": 42661040.0, "step": 23125 }, { "entropy": 5.6397205829620365, "epoch": 1.9432472169712245, "grad_norm": 1.2265625, "learning_rate": 0.0004620798308933646, "loss": 5.3392, "mean_token_accuracy": 0.17351713329553603, "num_tokens": 42670559.0, "step": 23130 }, { "entropy": 5.674306726455688, "epoch": 1.9436672967863893, "grad_norm": 1.4296875, "learning_rate": 0.0004620631916593897, "loss": 5.3482, "mean_token_accuracy": 0.17234041541814804, "num_tokens": 42679883.0, "step": 23135 }, { "entropy": 5.741105794906616, "epoch": 1.9440873766015543, "grad_norm": 1.078125, "learning_rate": 0.0004620465491116867, "loss": 5.4648, "mean_token_accuracy": 0.15939529240131378, "num_tokens": 42689746.0, "step": 23140 }, { "entropy": 5.731553983688355, "epoch": 1.9445074564167193, "grad_norm": 1.1171875, "learning_rate": 0.00046202990325055034, "loss": 5.3838, "mean_token_accuracy": 0.17033789455890655, "num_tokens": 42699685.0, "step": 23145 }, { "entropy": 5.571749210357666, "epoch": 1.944927536231884, "grad_norm": 1.1484375, "learning_rate": 0.0004620132540762756, "loss": 5.2458, "mean_token_accuracy": 0.1736294910311699, "num_tokens": 42708873.0, "step": 23150 }, { "entropy": 5.5605018615722654, "epoch": 1.9453476160470489, "grad_norm": 1.2734375, "learning_rate": 0.00046199660158915734, "loss": 5.3166, "mean_token_accuracy": 0.1672689750790596, "num_tokens": 42717807.0, "step": 23155 }, { "entropy": 5.597097682952881, "epoch": 1.9457676958622139, "grad_norm": 1.265625, "learning_rate": 0.00046197994578949056, "loss": 5.3768, "mean_token_accuracy": 0.17283654361963272, "num_tokens": 42726674.0, "step": 23160 }, { "entropy": 5.670198249816894, "epoch": 1.9461877756773787, "grad_norm": 1.4765625, "learning_rate": 0.0004619632866775704, "loss": 5.4328, "mean_token_accuracy": 0.1715935230255127, "num_tokens": 42735621.0, "step": 23165 }, { "entropy": 5.605064630508423, "epoch": 1.9466078554925437, "grad_norm": 1.1796875, "learning_rate": 0.0004619466242536918, "loss": 5.3183, "mean_token_accuracy": 0.17671644389629365, "num_tokens": 42744945.0, "step": 23170 }, { "entropy": 5.675964307785034, "epoch": 1.9470279353077085, "grad_norm": 1.484375, "learning_rate": 0.0004619299585181501, "loss": 5.4318, "mean_token_accuracy": 0.17112542688846588, "num_tokens": 42754906.0, "step": 23175 }, { "entropy": 5.677554368972778, "epoch": 1.9474480151228732, "grad_norm": 1.28125, "learning_rate": 0.00046191328947124027, "loss": 5.3332, "mean_token_accuracy": 0.17521820366382598, "num_tokens": 42764673.0, "step": 23180 }, { "entropy": 5.564341068267822, "epoch": 1.9478680949380383, "grad_norm": 1.25, "learning_rate": 0.00046189661711325784, "loss": 5.3217, "mean_token_accuracy": 0.18367141485214233, "num_tokens": 42774528.0, "step": 23185 }, { "entropy": 5.669634675979614, "epoch": 1.9482881747532033, "grad_norm": 1.2578125, "learning_rate": 0.00046187994144449815, "loss": 5.2309, "mean_token_accuracy": 0.1801608145236969, "num_tokens": 42783813.0, "step": 23190 }, { "entropy": 5.586480951309204, "epoch": 1.948708254568368, "grad_norm": 1.2890625, "learning_rate": 0.0004618632624652565, "loss": 5.3154, "mean_token_accuracy": 0.17071151435375215, "num_tokens": 42793483.0, "step": 23195 }, { "entropy": 5.59461088180542, "epoch": 1.9491283343835328, "grad_norm": 1.2265625, "learning_rate": 0.0004618465801758283, "loss": 5.3859, "mean_token_accuracy": 0.1717785432934761, "num_tokens": 42803177.0, "step": 23200 }, { "entropy": 5.673942232131958, "epoch": 1.9495484141986976, "grad_norm": 1.21875, "learning_rate": 0.00046182989457650925, "loss": 5.3849, "mean_token_accuracy": 0.17533280104398727, "num_tokens": 42812395.0, "step": 23205 }, { "entropy": 5.617794990539551, "epoch": 1.9499684940138626, "grad_norm": 1.3046875, "learning_rate": 0.00046181320566759476, "loss": 5.3511, "mean_token_accuracy": 0.17385358661413192, "num_tokens": 42821495.0, "step": 23210 }, { "entropy": 5.608628606796264, "epoch": 1.9503885738290276, "grad_norm": 1.3046875, "learning_rate": 0.00046179651344938055, "loss": 5.3336, "mean_token_accuracy": 0.17260808795690535, "num_tokens": 42832219.0, "step": 23215 }, { "entropy": 5.632011890411377, "epoch": 1.9508086536441924, "grad_norm": 1.3671875, "learning_rate": 0.00046177981792216234, "loss": 5.2745, "mean_token_accuracy": 0.1757341668009758, "num_tokens": 42841368.0, "step": 23220 }, { "entropy": 5.603061056137085, "epoch": 1.9512287334593572, "grad_norm": 1.3359375, "learning_rate": 0.00046176311908623574, "loss": 5.3093, "mean_token_accuracy": 0.1824861243367195, "num_tokens": 42850512.0, "step": 23225 }, { "entropy": 5.6234039783477785, "epoch": 1.951648813274522, "grad_norm": 1.265625, "learning_rate": 0.0004617464169418967, "loss": 5.3568, "mean_token_accuracy": 0.1736053630709648, "num_tokens": 42860749.0, "step": 23230 }, { "entropy": 5.616316413879394, "epoch": 1.952068893089687, "grad_norm": 1.203125, "learning_rate": 0.00046172971148944106, "loss": 5.3083, "mean_token_accuracy": 0.17737708240747452, "num_tokens": 42869880.0, "step": 23235 }, { "entropy": 5.60381588935852, "epoch": 1.952488972904852, "grad_norm": 1.2421875, "learning_rate": 0.00046171300272916465, "loss": 5.2901, "mean_token_accuracy": 0.18085473626852036, "num_tokens": 42879001.0, "step": 23240 }, { "entropy": 5.551793575286865, "epoch": 1.9529090527200168, "grad_norm": 1.3828125, "learning_rate": 0.00046169629066136357, "loss": 5.2287, "mean_token_accuracy": 0.1819872483611107, "num_tokens": 42888036.0, "step": 23245 }, { "entropy": 5.664321565628052, "epoch": 1.9533291325351816, "grad_norm": 1.2265625, "learning_rate": 0.00046167957528633387, "loss": 5.3401, "mean_token_accuracy": 0.1780684620141983, "num_tokens": 42897460.0, "step": 23250 }, { "entropy": 5.6305899143219, "epoch": 1.9537492123503466, "grad_norm": 1.3203125, "learning_rate": 0.00046166285660437164, "loss": 5.3538, "mean_token_accuracy": 0.1773480087518692, "num_tokens": 42907010.0, "step": 23255 }, { "entropy": 5.664665699005127, "epoch": 1.9541692921655116, "grad_norm": 1.3046875, "learning_rate": 0.000461646134615773, "loss": 5.2976, "mean_token_accuracy": 0.17132930904626847, "num_tokens": 42915684.0, "step": 23260 }, { "entropy": 5.595171546936035, "epoch": 1.9545893719806764, "grad_norm": 1.265625, "learning_rate": 0.00046162940932083414, "loss": 5.3159, "mean_token_accuracy": 0.17843813300132752, "num_tokens": 42924903.0, "step": 23265 }, { "entropy": 5.591875410079956, "epoch": 1.9550094517958412, "grad_norm": 1.3359375, "learning_rate": 0.00046161268071985144, "loss": 5.3182, "mean_token_accuracy": 0.17087887227535248, "num_tokens": 42935234.0, "step": 23270 }, { "entropy": 5.50767765045166, "epoch": 1.955429531611006, "grad_norm": 1.7890625, "learning_rate": 0.0004615959488131212, "loss": 5.2438, "mean_token_accuracy": 0.18054774403572083, "num_tokens": 42944093.0, "step": 23275 }, { "entropy": 5.6021500587463375, "epoch": 1.955849611426171, "grad_norm": 1.390625, "learning_rate": 0.0004615792136009398, "loss": 5.2662, "mean_token_accuracy": 0.17670947611331939, "num_tokens": 42953504.0, "step": 23280 }, { "entropy": 5.602096080780029, "epoch": 1.956269691241336, "grad_norm": 1.375, "learning_rate": 0.00046156247508360375, "loss": 5.3159, "mean_token_accuracy": 0.1776598408818245, "num_tokens": 42962205.0, "step": 23285 }, { "entropy": 5.548053550720215, "epoch": 1.9566897710565008, "grad_norm": 1.28125, "learning_rate": 0.0004615457332614095, "loss": 5.2466, "mean_token_accuracy": 0.17867524921894073, "num_tokens": 42971240.0, "step": 23290 }, { "entropy": 5.659411191940308, "epoch": 1.9571098508716656, "grad_norm": 1.21875, "learning_rate": 0.00046152898813465353, "loss": 5.4036, "mean_token_accuracy": 0.16589925736188887, "num_tokens": 42981573.0, "step": 23295 }, { "entropy": 5.64855694770813, "epoch": 1.9575299306868303, "grad_norm": 1.6484375, "learning_rate": 0.0004615122397036327, "loss": 5.327, "mean_token_accuracy": 0.17262526452541352, "num_tokens": 42991383.0, "step": 23300 }, { "entropy": 5.609686803817749, "epoch": 1.9579500105019954, "grad_norm": 1.234375, "learning_rate": 0.00046149548796864355, "loss": 5.2754, "mean_token_accuracy": 0.1759060487151146, "num_tokens": 43000029.0, "step": 23305 }, { "entropy": 5.634216022491455, "epoch": 1.9583700903171604, "grad_norm": 1.3828125, "learning_rate": 0.00046147873292998285, "loss": 5.3476, "mean_token_accuracy": 0.17457685023546218, "num_tokens": 43008880.0, "step": 23310 }, { "entropy": 5.561314058303833, "epoch": 1.9587901701323251, "grad_norm": 1.3125, "learning_rate": 0.0004614619745879475, "loss": 5.3153, "mean_token_accuracy": 0.1781879886984825, "num_tokens": 43017417.0, "step": 23315 }, { "entropy": 5.620518207550049, "epoch": 1.95921024994749, "grad_norm": 1.6484375, "learning_rate": 0.0004614452129428342, "loss": 5.2382, "mean_token_accuracy": 0.18082706928253173, "num_tokens": 43025738.0, "step": 23320 }, { "entropy": 5.715609693527222, "epoch": 1.959630329762655, "grad_norm": 1.6171875, "learning_rate": 0.0004614284479949399, "loss": 5.3641, "mean_token_accuracy": 0.17538043707609177, "num_tokens": 43035485.0, "step": 23325 }, { "entropy": 5.712373542785644, "epoch": 1.96005040957782, "grad_norm": 1.3515625, "learning_rate": 0.0004614116797445617, "loss": 5.3889, "mean_token_accuracy": 0.1784473180770874, "num_tokens": 43044627.0, "step": 23330 }, { "entropy": 5.570833015441894, "epoch": 1.9604704893929847, "grad_norm": 1.34375, "learning_rate": 0.00046139490819199666, "loss": 5.2968, "mean_token_accuracy": 0.1788347989320755, "num_tokens": 43053790.0, "step": 23335 }, { "entropy": 5.616850519180298, "epoch": 1.9608905692081495, "grad_norm": 1.265625, "learning_rate": 0.0004613781333375417, "loss": 5.2878, "mean_token_accuracy": 0.18670900613069535, "num_tokens": 43063511.0, "step": 23340 }, { "entropy": 5.548789978027344, "epoch": 1.9613106490233143, "grad_norm": 1.359375, "learning_rate": 0.0004613613551814941, "loss": 5.2263, "mean_token_accuracy": 0.18141030222177507, "num_tokens": 43072349.0, "step": 23345 }, { "entropy": 5.640681552886963, "epoch": 1.9617307288384793, "grad_norm": 1.484375, "learning_rate": 0.0004613445737241511, "loss": 5.3351, "mean_token_accuracy": 0.17484953999519348, "num_tokens": 43081552.0, "step": 23350 }, { "entropy": 5.684726333618164, "epoch": 1.9621508086536443, "grad_norm": 1.265625, "learning_rate": 0.00046132778896581, "loss": 5.3775, "mean_token_accuracy": 0.17865779995918274, "num_tokens": 43092321.0, "step": 23355 }, { "entropy": 5.672852087020874, "epoch": 1.9625708884688091, "grad_norm": 1.2265625, "learning_rate": 0.0004613110009067679, "loss": 5.3385, "mean_token_accuracy": 0.17483696341514587, "num_tokens": 43102326.0, "step": 23360 }, { "entropy": 5.655863475799561, "epoch": 1.962990968283974, "grad_norm": 1.2421875, "learning_rate": 0.00046129420954732237, "loss": 5.3726, "mean_token_accuracy": 0.17350683659315108, "num_tokens": 43110895.0, "step": 23365 }, { "entropy": 5.571282768249512, "epoch": 1.9634110480991387, "grad_norm": 1.1953125, "learning_rate": 0.0004612774148877709, "loss": 5.2236, "mean_token_accuracy": 0.1840864822268486, "num_tokens": 43119948.0, "step": 23370 }, { "entropy": 5.671322822570801, "epoch": 1.9638311279143037, "grad_norm": 1.390625, "learning_rate": 0.000461260616928411, "loss": 5.4221, "mean_token_accuracy": 0.17291183322668074, "num_tokens": 43129876.0, "step": 23375 }, { "entropy": 5.6836082458496096, "epoch": 1.9642512077294687, "grad_norm": 1.5390625, "learning_rate": 0.00046124381566954006, "loss": 5.3752, "mean_token_accuracy": 0.1769299626350403, "num_tokens": 43138831.0, "step": 23380 }, { "entropy": 5.644669532775879, "epoch": 1.9646712875446335, "grad_norm": 1.3515625, "learning_rate": 0.00046122701111145587, "loss": 5.3462, "mean_token_accuracy": 0.16992448419332504, "num_tokens": 43147338.0, "step": 23385 }, { "entropy": 5.6005443096160885, "epoch": 1.9650913673597983, "grad_norm": 1.234375, "learning_rate": 0.0004612102032544561, "loss": 5.2866, "mean_token_accuracy": 0.1766381561756134, "num_tokens": 43158587.0, "step": 23390 }, { "entropy": 5.610977602005005, "epoch": 1.9655114471749633, "grad_norm": 1.296875, "learning_rate": 0.00046119339209883846, "loss": 5.2766, "mean_token_accuracy": 0.18496377915143966, "num_tokens": 43167610.0, "step": 23395 }, { "entropy": 5.537552261352539, "epoch": 1.965931526990128, "grad_norm": 1.3984375, "learning_rate": 0.0004611765776449007, "loss": 5.2482, "mean_token_accuracy": 0.17576922178268434, "num_tokens": 43176374.0, "step": 23400 }, { "entropy": 5.630776309967041, "epoch": 1.966351606805293, "grad_norm": 1.4375, "learning_rate": 0.00046115975989294083, "loss": 5.4188, "mean_token_accuracy": 0.16968157142400742, "num_tokens": 43187038.0, "step": 23405 }, { "entropy": 5.745281171798706, "epoch": 1.9667716866204579, "grad_norm": 1.2890625, "learning_rate": 0.0004611429388432566, "loss": 5.4078, "mean_token_accuracy": 0.17005863785743713, "num_tokens": 43197868.0, "step": 23410 }, { "entropy": 5.659871816635132, "epoch": 1.9671917664356227, "grad_norm": 1.265625, "learning_rate": 0.00046112611449614603, "loss": 5.3696, "mean_token_accuracy": 0.16665552854537963, "num_tokens": 43207675.0, "step": 23415 }, { "entropy": 5.648601293563843, "epoch": 1.9676118462507877, "grad_norm": 1.328125, "learning_rate": 0.0004611092868519072, "loss": 5.3676, "mean_token_accuracy": 0.17277957051992415, "num_tokens": 43217154.0, "step": 23420 }, { "entropy": 5.631614065170288, "epoch": 1.9680319260659527, "grad_norm": 1.3671875, "learning_rate": 0.0004610924559108383, "loss": 5.3662, "mean_token_accuracy": 0.17904918119311333, "num_tokens": 43226912.0, "step": 23425 }, { "entropy": 5.663963079452515, "epoch": 1.9684520058811175, "grad_norm": 1.3046875, "learning_rate": 0.0004610756216732372, "loss": 5.3729, "mean_token_accuracy": 0.17254897505044936, "num_tokens": 43236711.0, "step": 23430 }, { "entropy": 5.701264095306397, "epoch": 1.9688720856962822, "grad_norm": 1.5859375, "learning_rate": 0.00046105878413940237, "loss": 5.349, "mean_token_accuracy": 0.18224181234836578, "num_tokens": 43247005.0, "step": 23435 }, { "entropy": 5.405902004241943, "epoch": 1.969292165511447, "grad_norm": 1.2109375, "learning_rate": 0.000461041943309632, "loss": 5.1978, "mean_token_accuracy": 0.18523926436901092, "num_tokens": 43255868.0, "step": 23440 }, { "entropy": 5.579332637786865, "epoch": 1.969712245326612, "grad_norm": 1.2890625, "learning_rate": 0.0004610250991842244, "loss": 5.3133, "mean_token_accuracy": 0.17708868831396102, "num_tokens": 43265727.0, "step": 23445 }, { "entropy": 5.68139796257019, "epoch": 1.970132325141777, "grad_norm": 1.28125, "learning_rate": 0.00046100825176347796, "loss": 5.3433, "mean_token_accuracy": 0.17815263122320174, "num_tokens": 43274530.0, "step": 23450 }, { "entropy": 5.513430643081665, "epoch": 1.9705524049569418, "grad_norm": 1.203125, "learning_rate": 0.000460991401047691, "loss": 5.2518, "mean_token_accuracy": 0.17275855988264083, "num_tokens": 43285130.0, "step": 23455 }, { "entropy": 5.609006071090699, "epoch": 1.9709724847721066, "grad_norm": 1.234375, "learning_rate": 0.0004609745470371622, "loss": 5.3268, "mean_token_accuracy": 0.17718621343374252, "num_tokens": 43293574.0, "step": 23460 }, { "entropy": 5.5410703182220455, "epoch": 1.9713925645872716, "grad_norm": 1.2109375, "learning_rate": 0.0004609576897321902, "loss": 5.1567, "mean_token_accuracy": 0.18253391236066818, "num_tokens": 43301989.0, "step": 23465 }, { "entropy": 5.649783420562744, "epoch": 1.9718126444024364, "grad_norm": 1.21875, "learning_rate": 0.00046094082913307336, "loss": 5.358, "mean_token_accuracy": 0.17399438023567199, "num_tokens": 43310934.0, "step": 23470 }, { "entropy": 5.576969957351684, "epoch": 1.9722327242176014, "grad_norm": 1.234375, "learning_rate": 0.0004609239652401104, "loss": 5.2712, "mean_token_accuracy": 0.17430226355791092, "num_tokens": 43320703.0, "step": 23475 }, { "entropy": 5.605929708480835, "epoch": 1.9726528040327662, "grad_norm": 1.3125, "learning_rate": 0.00046090709805360027, "loss": 5.2428, "mean_token_accuracy": 0.1821880042552948, "num_tokens": 43329444.0, "step": 23480 }, { "entropy": 5.6519848823547365, "epoch": 1.973072883847931, "grad_norm": 1.2421875, "learning_rate": 0.0004608902275738416, "loss": 5.3677, "mean_token_accuracy": 0.18188654333353044, "num_tokens": 43337853.0, "step": 23485 }, { "entropy": 5.696072387695312, "epoch": 1.973492963663096, "grad_norm": 1.25, "learning_rate": 0.0004608733538011333, "loss": 5.4032, "mean_token_accuracy": 0.16969217211008072, "num_tokens": 43347901.0, "step": 23490 }, { "entropy": 5.601595401763916, "epoch": 1.973913043478261, "grad_norm": 1.15625, "learning_rate": 0.0004608564767357741, "loss": 5.2628, "mean_token_accuracy": 0.17602366507053374, "num_tokens": 43357358.0, "step": 23495 }, { "entropy": 5.590727233886719, "epoch": 1.9743331232934258, "grad_norm": 1.3046875, "learning_rate": 0.0004608395963780632, "loss": 5.3724, "mean_token_accuracy": 0.17241780906915666, "num_tokens": 43366749.0, "step": 23500 }, { "entropy": 5.586251354217529, "epoch": 1.9747532031085906, "grad_norm": 1.234375, "learning_rate": 0.0004608227127282996, "loss": 5.3251, "mean_token_accuracy": 0.17821072190999984, "num_tokens": 43375243.0, "step": 23505 }, { "entropy": 5.666031312942505, "epoch": 1.9751732829237554, "grad_norm": 1.3671875, "learning_rate": 0.0004608058257867823, "loss": 5.2887, "mean_token_accuracy": 0.18276388347148895, "num_tokens": 43383470.0, "step": 23510 }, { "entropy": 5.6554535865783695, "epoch": 1.9755933627389204, "grad_norm": 1.3203125, "learning_rate": 0.0004607889355538105, "loss": 5.4184, "mean_token_accuracy": 0.17027001827955246, "num_tokens": 43393527.0, "step": 23515 }, { "entropy": 5.604500722885132, "epoch": 1.9760134425540854, "grad_norm": 1.3203125, "learning_rate": 0.00046077204202968325, "loss": 5.2812, "mean_token_accuracy": 0.17676235735416412, "num_tokens": 43402390.0, "step": 23520 }, { "entropy": 5.573892974853516, "epoch": 1.9764335223692502, "grad_norm": 1.484375, "learning_rate": 0.00046075514521470005, "loss": 5.2718, "mean_token_accuracy": 0.17329889982938768, "num_tokens": 43411479.0, "step": 23525 }, { "entropy": 5.554893112182617, "epoch": 1.976853602184415, "grad_norm": 1.1875, "learning_rate": 0.00046073824510916005, "loss": 5.2121, "mean_token_accuracy": 0.17935867458581925, "num_tokens": 43420402.0, "step": 23530 }, { "entropy": 5.622291040420532, "epoch": 1.9772736819995798, "grad_norm": 1.3515625, "learning_rate": 0.00046072134171336267, "loss": 5.3531, "mean_token_accuracy": 0.16644867211580278, "num_tokens": 43429011.0, "step": 23535 }, { "entropy": 5.605536222457886, "epoch": 1.9776937618147448, "grad_norm": 1.25, "learning_rate": 0.0004607044350276074, "loss": 5.2344, "mean_token_accuracy": 0.17794644683599473, "num_tokens": 43438548.0, "step": 23540 }, { "entropy": 5.648398113250733, "epoch": 1.9781138416299098, "grad_norm": 1.234375, "learning_rate": 0.00046068752505219366, "loss": 5.3322, "mean_token_accuracy": 0.17605538964271544, "num_tokens": 43448332.0, "step": 23545 }, { "entropy": 5.653730773925782, "epoch": 1.9785339214450746, "grad_norm": 1.2734375, "learning_rate": 0.000460670611787421, "loss": 5.4006, "mean_token_accuracy": 0.17038846909999847, "num_tokens": 43457726.0, "step": 23550 }, { "entropy": 5.605834054946899, "epoch": 1.9789540012602393, "grad_norm": 1.234375, "learning_rate": 0.0004606536952335891, "loss": 5.3285, "mean_token_accuracy": 0.17592835873365403, "num_tokens": 43466617.0, "step": 23555 }, { "entropy": 5.556175947189331, "epoch": 1.9793740810754044, "grad_norm": 1.1953125, "learning_rate": 0.00046063677539099756, "loss": 5.3061, "mean_token_accuracy": 0.1715977743268013, "num_tokens": 43476044.0, "step": 23560 }, { "entropy": 5.579178810119629, "epoch": 1.9797941608905694, "grad_norm": 1.3125, "learning_rate": 0.00046061985225994616, "loss": 5.2886, "mean_token_accuracy": 0.17626330852508545, "num_tokens": 43485488.0, "step": 23565 }, { "entropy": 5.6485466957092285, "epoch": 1.9802142407057342, "grad_norm": 1.21875, "learning_rate": 0.00046060292584073465, "loss": 5.3135, "mean_token_accuracy": 0.17889403253793718, "num_tokens": 43494423.0, "step": 23570 }, { "entropy": 5.590170574188233, "epoch": 1.980634320520899, "grad_norm": 1.1953125, "learning_rate": 0.00046058599613366287, "loss": 5.1856, "mean_token_accuracy": 0.1918771132826805, "num_tokens": 43502874.0, "step": 23575 }, { "entropy": 5.672195100784302, "epoch": 1.9810544003360637, "grad_norm": 1.2734375, "learning_rate": 0.0004605690631390308, "loss": 5.4446, "mean_token_accuracy": 0.16917974948883058, "num_tokens": 43512222.0, "step": 23580 }, { "entropy": 5.618256378173828, "epoch": 1.9814744801512287, "grad_norm": 1.6171875, "learning_rate": 0.0004605521268571382, "loss": 5.3509, "mean_token_accuracy": 0.17687894701957702, "num_tokens": 43521577.0, "step": 23585 }, { "entropy": 5.707334375381469, "epoch": 1.9818945599663937, "grad_norm": 1.296875, "learning_rate": 0.00046053518728828534, "loss": 5.3422, "mean_token_accuracy": 0.17523998022079468, "num_tokens": 43529763.0, "step": 23590 }, { "entropy": 5.680575227737426, "epoch": 1.9823146397815585, "grad_norm": 1.4140625, "learning_rate": 0.0004605182444327721, "loss": 5.3651, "mean_token_accuracy": 0.16793065816164016, "num_tokens": 43538663.0, "step": 23595 }, { "entropy": 5.517296218872071, "epoch": 1.9827347195967233, "grad_norm": 1.25, "learning_rate": 0.0004605012982908987, "loss": 5.1665, "mean_token_accuracy": 0.18656257838010787, "num_tokens": 43547302.0, "step": 23600 }, { "entropy": 5.56163215637207, "epoch": 1.983154799411888, "grad_norm": 1.3125, "learning_rate": 0.00046048434886296536, "loss": 5.3806, "mean_token_accuracy": 0.16650519967079164, "num_tokens": 43557222.0, "step": 23605 }, { "entropy": 5.647756576538086, "epoch": 1.9835748792270531, "grad_norm": 1.2109375, "learning_rate": 0.0004604673961492722, "loss": 5.2736, "mean_token_accuracy": 0.18116023987531663, "num_tokens": 43566210.0, "step": 23610 }, { "entropy": 5.562030267715454, "epoch": 1.9839949590422181, "grad_norm": 1.25, "learning_rate": 0.00046045044015011975, "loss": 5.2476, "mean_token_accuracy": 0.1800748810172081, "num_tokens": 43576275.0, "step": 23615 }, { "entropy": 5.579679298400879, "epoch": 1.984415038857383, "grad_norm": 1.265625, "learning_rate": 0.0004604334808658081, "loss": 5.3723, "mean_token_accuracy": 0.17557549476623535, "num_tokens": 43585480.0, "step": 23620 }, { "entropy": 5.653260231018066, "epoch": 1.9848351186725477, "grad_norm": 1.53125, "learning_rate": 0.00046041651829663787, "loss": 5.3961, "mean_token_accuracy": 0.17293741554021835, "num_tokens": 43593911.0, "step": 23625 }, { "entropy": 5.6270537853240965, "epoch": 1.9852551984877127, "grad_norm": 1.375, "learning_rate": 0.00046039955244290957, "loss": 5.3168, "mean_token_accuracy": 0.17904412150382995, "num_tokens": 43604029.0, "step": 23630 }, { "entropy": 5.683732986450195, "epoch": 1.9856752783028777, "grad_norm": 1.78125, "learning_rate": 0.00046038258330492363, "loss": 5.3514, "mean_token_accuracy": 0.17994878441095352, "num_tokens": 43613248.0, "step": 23635 }, { "entropy": 5.647359848022461, "epoch": 1.9860953581180425, "grad_norm": 1.390625, "learning_rate": 0.0004603656108829806, "loss": 5.3049, "mean_token_accuracy": 0.17984101325273513, "num_tokens": 43623232.0, "step": 23640 }, { "entropy": 5.658777713775635, "epoch": 1.9865154379332073, "grad_norm": 1.7109375, "learning_rate": 0.00046034863517738136, "loss": 5.3651, "mean_token_accuracy": 0.16325145363807678, "num_tokens": 43632999.0, "step": 23645 }, { "entropy": 5.650898551940918, "epoch": 1.986935517748372, "grad_norm": 1.390625, "learning_rate": 0.00046033165618842637, "loss": 5.3269, "mean_token_accuracy": 0.17500171065330505, "num_tokens": 43641492.0, "step": 23650 }, { "entropy": 5.711059141159057, "epoch": 1.987355597563537, "grad_norm": 1.2890625, "learning_rate": 0.00046031467391641657, "loss": 5.314, "mean_token_accuracy": 0.1773490861058235, "num_tokens": 43650999.0, "step": 23655 }, { "entropy": 5.643770027160644, "epoch": 1.987775677378702, "grad_norm": 1.203125, "learning_rate": 0.0004602976883616527, "loss": 5.3811, "mean_token_accuracy": 0.16796135902404785, "num_tokens": 43660777.0, "step": 23660 }, { "entropy": 5.592342329025269, "epoch": 1.9881957571938669, "grad_norm": 1.375, "learning_rate": 0.00046028069952443575, "loss": 5.3036, "mean_token_accuracy": 0.17716382443904877, "num_tokens": 43670404.0, "step": 23665 }, { "entropy": 5.582193326950073, "epoch": 1.9886158370090317, "grad_norm": 1.3203125, "learning_rate": 0.00046026370740506663, "loss": 5.2388, "mean_token_accuracy": 0.1850288465619087, "num_tokens": 43679183.0, "step": 23670 }, { "entropy": 5.575860261917114, "epoch": 1.9890359168241964, "grad_norm": 1.421875, "learning_rate": 0.0004602467120038463, "loss": 5.26, "mean_token_accuracy": 0.17996072322130202, "num_tokens": 43688080.0, "step": 23675 }, { "entropy": 5.633952903747558, "epoch": 1.9894559966393615, "grad_norm": 1.2734375, "learning_rate": 0.00046022971332107586, "loss": 5.2255, "mean_token_accuracy": 0.18307080417871474, "num_tokens": 43697271.0, "step": 23680 }, { "entropy": 5.565424203872681, "epoch": 1.9898760764545265, "grad_norm": 1.3203125, "learning_rate": 0.00046021271135705637, "loss": 5.2542, "mean_token_accuracy": 0.183968748152256, "num_tokens": 43705541.0, "step": 23685 }, { "entropy": 5.609846735000611, "epoch": 1.9902961562696913, "grad_norm": 1.1875, "learning_rate": 0.0004601957061120891, "loss": 5.3808, "mean_token_accuracy": 0.17398780435323716, "num_tokens": 43713701.0, "step": 23690 }, { "entropy": 5.566676950454712, "epoch": 1.990716236084856, "grad_norm": 1.21875, "learning_rate": 0.0004601786975864753, "loss": 5.3329, "mean_token_accuracy": 0.18383182138204573, "num_tokens": 43723050.0, "step": 23695 }, { "entropy": 5.576835489273071, "epoch": 1.991136315900021, "grad_norm": 1.6875, "learning_rate": 0.0004601616857805162, "loss": 5.3136, "mean_token_accuracy": 0.180113722383976, "num_tokens": 43733029.0, "step": 23700 }, { "entropy": 5.5873369693756105, "epoch": 1.9915563957151858, "grad_norm": 1.75, "learning_rate": 0.0004601446706945132, "loss": 5.2822, "mean_token_accuracy": 0.1761482909321785, "num_tokens": 43741818.0, "step": 23705 }, { "entropy": 5.640828418731689, "epoch": 1.9919764755303508, "grad_norm": 1.2890625, "learning_rate": 0.00046012765232876767, "loss": 5.3156, "mean_token_accuracy": 0.17892836183309554, "num_tokens": 43750755.0, "step": 23710 }, { "entropy": 5.56833963394165, "epoch": 1.9923965553455156, "grad_norm": 1.3359375, "learning_rate": 0.0004601106306835811, "loss": 5.2021, "mean_token_accuracy": 0.18445106595754623, "num_tokens": 43759135.0, "step": 23715 }, { "entropy": 5.560920858383179, "epoch": 1.9928166351606804, "grad_norm": 1.2421875, "learning_rate": 0.0004600936057592551, "loss": 5.1672, "mean_token_accuracy": 0.1868069976568222, "num_tokens": 43767629.0, "step": 23720 }, { "entropy": 5.553515100479126, "epoch": 1.9932367149758454, "grad_norm": 1.234375, "learning_rate": 0.00046007657755609113, "loss": 5.354, "mean_token_accuracy": 0.1743677958846092, "num_tokens": 43776561.0, "step": 23725 }, { "entropy": 5.648859310150146, "epoch": 1.9936567947910104, "grad_norm": 1.2578125, "learning_rate": 0.0004600595460743908, "loss": 5.4235, "mean_token_accuracy": 0.16585010588169097, "num_tokens": 43786569.0, "step": 23730 }, { "entropy": 5.628311204910278, "epoch": 1.9940768746061752, "grad_norm": 1.3203125, "learning_rate": 0.000460042511314456, "loss": 5.3687, "mean_token_accuracy": 0.16907652020454406, "num_tokens": 43795621.0, "step": 23735 }, { "entropy": 5.744281530380249, "epoch": 1.99449695442134, "grad_norm": 1.546875, "learning_rate": 0.00046002547327658847, "loss": 5.3597, "mean_token_accuracy": 0.1761852040886879, "num_tokens": 43804728.0, "step": 23740 }, { "entropy": 5.586940860748291, "epoch": 1.9949170342365048, "grad_norm": 1.5703125, "learning_rate": 0.0004600084319610898, "loss": 5.2577, "mean_token_accuracy": 0.18133593946695328, "num_tokens": 43813495.0, "step": 23745 }, { "entropy": 5.504205417633057, "epoch": 1.9953371140516698, "grad_norm": 1.265625, "learning_rate": 0.0004599913873682621, "loss": 5.2068, "mean_token_accuracy": 0.1786206528544426, "num_tokens": 43823791.0, "step": 23750 }, { "entropy": 5.565213632583618, "epoch": 1.9957571938668348, "grad_norm": 1.265625, "learning_rate": 0.00045997433949840724, "loss": 5.2772, "mean_token_accuracy": 0.18052580058574677, "num_tokens": 43833904.0, "step": 23755 }, { "entropy": 5.663149499893189, "epoch": 1.9961772736819996, "grad_norm": 1.359375, "learning_rate": 0.00045995728835182716, "loss": 5.364, "mean_token_accuracy": 0.1738879531621933, "num_tokens": 43843430.0, "step": 23760 }, { "entropy": 5.670080518722534, "epoch": 1.9965973534971644, "grad_norm": 1.25, "learning_rate": 0.00045994023392882395, "loss": 5.3107, "mean_token_accuracy": 0.1848461866378784, "num_tokens": 43851405.0, "step": 23765 }, { "entropy": 5.584572267532349, "epoch": 1.9970174333123294, "grad_norm": 1.34375, "learning_rate": 0.00045992317622969977, "loss": 5.3923, "mean_token_accuracy": 0.17312257885932922, "num_tokens": 43860034.0, "step": 23770 }, { "entropy": 5.558753299713135, "epoch": 1.9974375131274942, "grad_norm": 1.265625, "learning_rate": 0.00045990611525475675, "loss": 5.3231, "mean_token_accuracy": 0.17416706085205078, "num_tokens": 43869371.0, "step": 23775 }, { "entropy": 5.634297561645508, "epoch": 1.9978575929426592, "grad_norm": 1.296875, "learning_rate": 0.0004598890510042971, "loss": 5.3685, "mean_token_accuracy": 0.1768188074231148, "num_tokens": 43878462.0, "step": 23780 }, { "entropy": 5.6588939189910885, "epoch": 1.998277672757824, "grad_norm": 1.296875, "learning_rate": 0.000459871983478623, "loss": 5.2981, "mean_token_accuracy": 0.17682368606328963, "num_tokens": 43887435.0, "step": 23785 }, { "entropy": 5.586805820465088, "epoch": 1.9986977525729888, "grad_norm": 1.234375, "learning_rate": 0.00045985491267803703, "loss": 5.3459, "mean_token_accuracy": 0.1741949737071991, "num_tokens": 43896720.0, "step": 23790 }, { "entropy": 5.552562236785889, "epoch": 1.9991178323881538, "grad_norm": 1.3359375, "learning_rate": 0.00045983783860284146, "loss": 5.3472, "mean_token_accuracy": 0.1720125764608383, "num_tokens": 43906403.0, "step": 23795 }, { "entropy": 5.672985076904297, "epoch": 1.9995379122033188, "grad_norm": 1.359375, "learning_rate": 0.00045982076125333874, "loss": 5.3871, "mean_token_accuracy": 0.16746917366981506, "num_tokens": 43915059.0, "step": 23800 }, { "entropy": 5.748750972747803, "epoch": 1.9999579920184836, "grad_norm": 1.6640625, "learning_rate": 0.00045980368062983147, "loss": 5.4214, "mean_token_accuracy": 0.17349109947681426, "num_tokens": 43925598.0, "step": 23805 }, { "entropy": 5.655678378211127, "epoch": 2.000336063852132, "grad_norm": 1.4140625, "learning_rate": 0.0004597865967326221, "loss": 5.2086, "mean_token_accuracy": 0.18223923444747925, "num_tokens": 43934471.0, "step": 23810 }, { "entropy": 5.576621007919312, "epoch": 2.0007561436672967, "grad_norm": 1.421875, "learning_rate": 0.00045976950956201325, "loss": 5.3083, "mean_token_accuracy": 0.17662405222654343, "num_tokens": 43944451.0, "step": 23815 }, { "entropy": 5.652852296829224, "epoch": 2.0011762234824615, "grad_norm": 1.734375, "learning_rate": 0.0004597524191183078, "loss": 5.2708, "mean_token_accuracy": 0.18469424694776534, "num_tokens": 43953892.0, "step": 23820 }, { "entropy": 5.648382472991943, "epoch": 2.0015963032976267, "grad_norm": 1.1796875, "learning_rate": 0.0004597353254018082, "loss": 5.3562, "mean_token_accuracy": 0.17324745506048203, "num_tokens": 43963155.0, "step": 23825 }, { "entropy": 5.600753879547119, "epoch": 2.0020163831127915, "grad_norm": 1.359375, "learning_rate": 0.0004597182284128177, "loss": 5.2033, "mean_token_accuracy": 0.18439362943172455, "num_tokens": 43972468.0, "step": 23830 }, { "entropy": 5.721098184585571, "epoch": 2.0024364629279563, "grad_norm": 1.21875, "learning_rate": 0.0004597011281516387, "loss": 5.4567, "mean_token_accuracy": 0.16708213537931443, "num_tokens": 43982709.0, "step": 23835 }, { "entropy": 5.576871728897094, "epoch": 2.002856542743121, "grad_norm": 1.1953125, "learning_rate": 0.00045968402461857435, "loss": 5.2333, "mean_token_accuracy": 0.18440057784318925, "num_tokens": 43992607.0, "step": 23840 }, { "entropy": 5.603885555267334, "epoch": 2.003276622558286, "grad_norm": 1.2890625, "learning_rate": 0.00045966691781392763, "loss": 5.1848, "mean_token_accuracy": 0.18089883625507355, "num_tokens": 44001265.0, "step": 23845 }, { "entropy": 5.653714561462403, "epoch": 2.003696702373451, "grad_norm": 1.2890625, "learning_rate": 0.00045964980773800156, "loss": 5.4064, "mean_token_accuracy": 0.1741128757596016, "num_tokens": 44010440.0, "step": 23850 }, { "entropy": 5.640526151657104, "epoch": 2.004116782188616, "grad_norm": 1.359375, "learning_rate": 0.0004596326943910993, "loss": 5.2281, "mean_token_accuracy": 0.17560428082942964, "num_tokens": 44020237.0, "step": 23855 }, { "entropy": 5.618943929672241, "epoch": 2.0045368620037807, "grad_norm": 1.203125, "learning_rate": 0.00045961557777352376, "loss": 5.3358, "mean_token_accuracy": 0.17574749439954757, "num_tokens": 44028976.0, "step": 23860 }, { "entropy": 5.63735933303833, "epoch": 2.0049569418189455, "grad_norm": 1.453125, "learning_rate": 0.00045959845788557844, "loss": 5.2992, "mean_token_accuracy": 0.17881839573383332, "num_tokens": 44038186.0, "step": 23865 }, { "entropy": 5.616828918457031, "epoch": 2.0053770216341107, "grad_norm": 1.8125, "learning_rate": 0.0004595813347275665, "loss": 5.2725, "mean_token_accuracy": 0.17441747933626175, "num_tokens": 44047780.0, "step": 23870 }, { "entropy": 5.60105562210083, "epoch": 2.0057971014492755, "grad_norm": 1.34375, "learning_rate": 0.0004595642082997912, "loss": 5.211, "mean_token_accuracy": 0.18210149556398392, "num_tokens": 44056678.0, "step": 23875 }, { "entropy": 5.614001226425171, "epoch": 2.0062171812644403, "grad_norm": 1.2734375, "learning_rate": 0.000459547078602556, "loss": 5.2657, "mean_token_accuracy": 0.17585084587335587, "num_tokens": 44066428.0, "step": 23880 }, { "entropy": 5.570787191390991, "epoch": 2.006637261079605, "grad_norm": 1.3046875, "learning_rate": 0.00045952994563616434, "loss": 5.2614, "mean_token_accuracy": 0.1772843211889267, "num_tokens": 44075285.0, "step": 23885 }, { "entropy": 5.624676752090454, "epoch": 2.00705734089477, "grad_norm": 1.2578125, "learning_rate": 0.0004595128094009197, "loss": 5.2494, "mean_token_accuracy": 0.1796739473938942, "num_tokens": 44084333.0, "step": 23890 }, { "entropy": 5.634045553207398, "epoch": 2.007477420709935, "grad_norm": 1.5546875, "learning_rate": 0.0004594956698971256, "loss": 5.2697, "mean_token_accuracy": 0.17147087454795837, "num_tokens": 44093504.0, "step": 23895 }, { "entropy": 5.668183422088623, "epoch": 2.0078975005251, "grad_norm": 1.1875, "learning_rate": 0.0004594785271250858, "loss": 5.2788, "mean_token_accuracy": 0.17484120875597, "num_tokens": 44102887.0, "step": 23900 }, { "entropy": 5.561066436767578, "epoch": 2.0083175803402646, "grad_norm": 1.4921875, "learning_rate": 0.0004594613810851039, "loss": 5.2687, "mean_token_accuracy": 0.1750637874007225, "num_tokens": 44113074.0, "step": 23905 }, { "entropy": 5.519744539260865, "epoch": 2.0087376601554294, "grad_norm": 1.3984375, "learning_rate": 0.00045944423177748353, "loss": 5.2696, "mean_token_accuracy": 0.18134041875600815, "num_tokens": 44122557.0, "step": 23910 }, { "entropy": 5.658271312713623, "epoch": 2.009157739970594, "grad_norm": 1.2578125, "learning_rate": 0.00045942707920252864, "loss": 5.2783, "mean_token_accuracy": 0.17392106503248214, "num_tokens": 44130198.0, "step": 23915 }, { "entropy": 5.630684757232666, "epoch": 2.0095778197857594, "grad_norm": 1.4921875, "learning_rate": 0.000459409923360543, "loss": 5.2459, "mean_token_accuracy": 0.18495004624128342, "num_tokens": 44139267.0, "step": 23920 }, { "entropy": 5.597193384170533, "epoch": 2.0099978996009242, "grad_norm": 1.375, "learning_rate": 0.0004593927642518305, "loss": 5.3217, "mean_token_accuracy": 0.17152093052864076, "num_tokens": 44149620.0, "step": 23925 }, { "entropy": 5.581966161727905, "epoch": 2.010417979416089, "grad_norm": 1.234375, "learning_rate": 0.0004593756018766951, "loss": 5.1661, "mean_token_accuracy": 0.18001709878444672, "num_tokens": 44158678.0, "step": 23930 }, { "entropy": 5.503856134414673, "epoch": 2.010838059231254, "grad_norm": 1.3046875, "learning_rate": 0.00045935843623544093, "loss": 5.1473, "mean_token_accuracy": 0.18169627338647842, "num_tokens": 44167376.0, "step": 23935 }, { "entropy": 5.6009259700775145, "epoch": 2.011258139046419, "grad_norm": 1.4140625, "learning_rate": 0.0004593412673283719, "loss": 5.275, "mean_token_accuracy": 0.17766901403665541, "num_tokens": 44176001.0, "step": 23940 }, { "entropy": 5.688672161102295, "epoch": 2.011678218861584, "grad_norm": 1.609375, "learning_rate": 0.00045932409515579226, "loss": 5.3321, "mean_token_accuracy": 0.17283178567886354, "num_tokens": 44185132.0, "step": 23945 }, { "entropy": 5.570486927032471, "epoch": 2.0120982986767486, "grad_norm": 1.4609375, "learning_rate": 0.00045930691971800627, "loss": 5.2738, "mean_token_accuracy": 0.1786741316318512, "num_tokens": 44193256.0, "step": 23950 }, { "entropy": 5.657260227203369, "epoch": 2.0125183784919134, "grad_norm": 1.5859375, "learning_rate": 0.00045928974101531805, "loss": 5.37, "mean_token_accuracy": 0.17304892987012863, "num_tokens": 44202884.0, "step": 23955 }, { "entropy": 5.7106156826019285, "epoch": 2.012938458307078, "grad_norm": 1.2890625, "learning_rate": 0.0004592725590480319, "loss": 5.3492, "mean_token_accuracy": 0.16924804002046584, "num_tokens": 44212826.0, "step": 23960 }, { "entropy": 5.6548271656036375, "epoch": 2.0133585381222434, "grad_norm": 1.2890625, "learning_rate": 0.0004592553738164524, "loss": 5.3199, "mean_token_accuracy": 0.16807449012994766, "num_tokens": 44222369.0, "step": 23965 }, { "entropy": 5.565683746337891, "epoch": 2.013778617937408, "grad_norm": 1.3984375, "learning_rate": 0.0004592381853208837, "loss": 5.2165, "mean_token_accuracy": 0.17430078536272048, "num_tokens": 44230964.0, "step": 23970 }, { "entropy": 5.604999732971192, "epoch": 2.014198697752573, "grad_norm": 1.3359375, "learning_rate": 0.0004592209935616304, "loss": 5.289, "mean_token_accuracy": 0.17769130319356918, "num_tokens": 44240199.0, "step": 23975 }, { "entropy": 5.645999479293823, "epoch": 2.0146187775677378, "grad_norm": 1.265625, "learning_rate": 0.0004592037985389971, "loss": 5.2669, "mean_token_accuracy": 0.18346799314022064, "num_tokens": 44249857.0, "step": 23980 }, { "entropy": 5.532536315917969, "epoch": 2.0150388573829026, "grad_norm": 1.234375, "learning_rate": 0.0004591866002532885, "loss": 5.2317, "mean_token_accuracy": 0.17959018796682358, "num_tokens": 44258364.0, "step": 23985 }, { "entropy": 5.497239446640014, "epoch": 2.015458937198068, "grad_norm": 1.3515625, "learning_rate": 0.00045916939870480896, "loss": 5.1629, "mean_token_accuracy": 0.18248820006847383, "num_tokens": 44267473.0, "step": 23990 }, { "entropy": 5.64896559715271, "epoch": 2.0158790170132326, "grad_norm": 1.2890625, "learning_rate": 0.00045915219389386336, "loss": 5.2054, "mean_token_accuracy": 0.1814291298389435, "num_tokens": 44276665.0, "step": 23995 }, { "entropy": 5.596774005889893, "epoch": 2.0162990968283974, "grad_norm": 1.3515625, "learning_rate": 0.0004591349858207565, "loss": 5.2614, "mean_token_accuracy": 0.1755758687853813, "num_tokens": 44285928.0, "step": 24000 }, { "epoch": 2.0162990968283974, "eval_entropy": 5.367871912509312, "eval_loss": 5.36544132232666, "eval_mean_token_accuracy": 0.18292493719046923, "eval_num_tokens": 44285928.0, "eval_runtime": 27.3301, "eval_samples_per_second": 1367.212, "eval_steps_per_second": 170.911, "step": 24000 } ], "logging_steps": 5, "max_steps": 119020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.4819220631552e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }